Add point-in-time S&P 500 backtest to expose survivorship bias
The existing framework fetches today's S&P 500 constituents from Wikipedia
and applies that list to the entire 10-year price history — classic
survivorship bias. Stocks that went bankrupt or were removed for poor
performance are absent, while today's winners (which may have been minor
names 10 years ago) are implicitly selected. This materially inflates
reported strategy returns.
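For concreteness, the biased pattern being replaced looks roughly like this
(a minimal sketch, not the framework's exact code):

    import pandas as pd
    import yfinance as yf

    # Membership as of *today*, from the first table on the Wikipedia page.
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    todays_members = pd.read_html(url)[0]["Symbol"].tolist()

    # Ten years of prices for a universe selected by surviving until today:
    # delisted losers are absent, so any backtest on `prices` is tilted upward.
    prices = yf.download(todays_members, period="10y")["Close"]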
New pipeline:
- universe_history.py reconstructs per-ticker membership intervals by
walking Wikipedia's "Selected changes" table backward from today.
- research/fetch_historical.py downloads prices for all 848 tickers
  that were ever members (Yahoo returns ~675 of them; the ~170 fully
  delisted names are unavailable, so some survivorship bias remains).
- research/pit_backtest.py masks prices to NaN outside membership
  windows so strategies naturally cannot select non-members (sketched
  after this list).
- research/strategies_plus.py adds RecoveryMomentumPlus (generalized
Recovery+Momentum with configurable weighting / blend / regime hook)
and an EnsembleStrategy.
- research/optimize.py runs five experiments: bias drift, hyperparameter
sweep (2016-2022 train / 2023-2026 test), SPY MA regime filter,
weighting schemes, and an uncorrelated-config ensemble.
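The masking step is the crux, so here is a minimal sketch of the idea
(names are illustrative; the real pit_backtest.py may differ). It assumes
`intervals` maps each ticker to a list of (start, end) membership windows
produced by universe_history.py, with end=None meaning "still a member":

    import pandas as pd

    def apply_membership_mask(prices: pd.DataFrame, intervals: dict) -> pd.DataFrame:
        """NaN-out each ticker's prices outside its S&P 500 membership windows."""
        masked = prices.copy()
        for ticker in masked.columns:
            windows = intervals.get(ticker)
            if not windows:  # e.g., the SPY benchmark column: leave untouched
                continue
            member = pd.Series(False, index=masked.index)
            for start, end in windows:
                end = end or masked.index[-1]  # open window: member through today
                member |= (masked.index >= pd.Timestamp(start)) & (masked.index <= pd.Timestamp(end))
            masked.loc[~member, ticker] = float("nan")
        return masked

A strategy that ranks on trailing returns then simply sees NaN for
non-members on each rebalance date, which is the point-in-time constraint.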
Headline finding: the biased backtest reports 40.9% CAGR for
recovery_mom_top10 over 2016-2026; the point-in-time version reports
22.4% (vs 14.0% SPY buy-and-hold). True edge is ~8pp CAGR, not ~27pp.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
research/fetch_historical.py (new file, 105 lines)
@@ -0,0 +1,105 @@
"""
Fetch price history for all tickers that were ever S&P 500 members — including
delisted ones — and save to data/us_pit.csv. This is the foundation for a
survivorship-bias-free backtest.

NOTE: Yahoo Finance no longer serves price data for many fully-delisted tickers
(bankruptcies, old mergers). Those are silently skipped. The result is still
a major improvement over "today's S&P 500 extrapolated 10 years back", but it
is NOT a perfect point-in-time dataset — only a dataset where the universe
mask is correct at each date. A subset of worst-outcome tickers (e.g., ABK,
ACAS) will be missing entirely. This caveat is documented in the run summary.
"""
import os
from datetime import datetime, timedelta

import pandas as pd
import yfinance as yf

import universe_history as uh

DATA_DIR = "data"
OUT_PATH = os.path.join(DATA_DIR, "us_pit.csv")
YEARS = 10
BATCH_SIZE = 50

def fetch_all_historical(force: bool = False) -> pd.DataFrame:
    os.makedirs(DATA_DIR, exist_ok=True)
    intervals = uh.load_sp500_history()
    tickers = uh.all_tickers_ever(intervals) + ["SPY"]
    tickers = sorted(set(tickers))

    existing = None
    if os.path.exists(OUT_PATH) and not force:
        existing = pd.read_csv(OUT_PATH, index_col=0, parse_dates=True)
        missing = [t for t in tickers if t not in existing.columns]
        if not missing:
            # All tickers already present: just append the latest dates.
            last_date = existing.index[-1]
            if (datetime.now() - last_date.to_pydatetime()).days < 2:
                print(f"--- us_pit.csv already up to date: {existing.shape} ---")
                return existing
            tickers = list(existing.columns)
            start = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")
            print(f"--- Appending new dates from {start} for {len(tickers)} tickers ---")
            new = _download_batched(tickers, start=start)
            if new is not None and not new.empty:
                combined = pd.concat([existing, new]).sort_index()
                combined = combined[~combined.index.duplicated(keep="last")]
                combined.to_csv(OUT_PATH)
                print(f"--- Saved {combined.shape} to {OUT_PATH} ---")
                return combined
            return existing
        else:
            print(f"--- Have {existing.shape[1]} cols; need {len(missing)} more ---")
            tickers = missing

    start = (datetime.now() - timedelta(days=365 * YEARS)).strftime("%Y-%m-%d")
    new = _download_batched(tickers, start=start)

    if existing is not None and new is not None and not new.empty:
        # Merge the newly fetched columns into the existing frame, date-aligned.
        combined = pd.concat([existing, new.reindex(existing.index)], axis=1)
        # Add any rows from `new` whose dates are not in `existing`.
        new_only_idx = new.index.difference(existing.index)
        if len(new_only_idx) > 0:
            combined_new = new.loc[new_only_idx].reindex(columns=combined.columns)
            combined = pd.concat([combined, combined_new]).sort_index()
    else:
        combined = new

    if combined is None:
        # Every batch failed: don't clobber the CSV with nothing.
        print("--- No data downloaded; nothing to save ---")
        return existing if existing is not None else pd.DataFrame()

    combined.to_csv(OUT_PATH)
    print(f"--- Saved {combined.shape} to {OUT_PATH} ---")
    return combined

def _download_batched(tickers: list[str], start: str) -> pd.DataFrame | None:
    frames = []
    n = len(tickers)
    for i in range(0, n, BATCH_SIZE):
        batch = tickers[i:i + BATCH_SIZE]
        print(f"  [{i}/{n}] fetching {len(batch)} tickers...", flush=True)
        try:
            raw = yf.download(batch, start=start, auto_adjust=True,
                              progress=False, threads=True)
            if raw.empty:
                continue
            # Multi-ticker downloads come back with MultiIndex columns
            # (field, ticker); a single-ticker download has flat columns.
            if isinstance(raw.columns, pd.MultiIndex):
                close = raw["Close"]
            else:
                close = raw[["Close"]].rename(columns={"Close": batch[0]})
            close = close.dropna(axis=1, how="all")
            if not close.empty:
                frames.append(close)
        except Exception as e:
            print(f"  batch failed: {e}")
    if not frames:
        return None
    result = pd.concat(frames, axis=1).sort_index()
    # Drop duplicate columns (a ticker can surface in more than one batch).
    result = result.loc[:, ~result.columns.duplicated()]
    return result

if __name__ == "__main__":
    fetch_all_historical()