Add point-in-time S&P 500 backtest to expose survivorship bias
The existing framework fetches today's S&P 500 constituents from Wikipedia
and applies that list to the entire 10-year price history — classic
survivorship bias. Stocks that went bankrupt or were removed for poor
performance are absent, while today's winners (which may have been minor
names 10 years ago) are implicitly selected. This materially inflates
reported strategy returns.
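For concreteness, the biased pattern being replaced looks roughly like this
(a minimal sketch, not the framework's exact code):

    import pandas as pd
    import yfinance as yf

    # Membership as of *today*, from the first table on the Wikipedia page.
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    todays_members = pd.read_html(url)[0]["Symbol"].tolist()

    # Ten years of prices for a universe selected by surviving until today:
    # delisted losers are absent, so any backtest on `prices` is tilted upward.
    prices = yf.download(todays_members, period="10y")["Close"]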
New pipeline:
- universe_history.py reconstructs per-ticker membership intervals by
walking Wikipedia's "Selected changes" table backward from today.
- research/fetch_historical.py downloads prices for all 848 tickers
  that were ever members (Yahoo returns ~675 of them; the ~170 fully
  delisted names are unavailable, so some survivorship bias remains).
- research/pit_backtest.py masks prices to NaN outside membership
  windows so strategies naturally cannot select non-members (sketched
  after this list).
- research/strategies_plus.py adds RecoveryMomentumPlus (generalized
Recovery+Momentum with configurable weighting / blend / regime hook)
and an EnsembleStrategy.
- research/optimize.py runs five experiments: bias drift, hyperparameter
sweep (2016-2022 train / 2023-2026 test), SPY MA regime filter,
weighting schemes, and an uncorrelated-config ensemble.
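The masking step is the crux, so here is a minimal sketch of the idea
(names are illustrative; the real pit_backtest.py may differ). It assumes
`intervals` maps each ticker to a list of (start, end) membership windows
produced by universe_history.py, with end=None meaning "still a member":

    import pandas as pd

    def apply_membership_mask(prices: pd.DataFrame, intervals: dict) -> pd.DataFrame:
        """NaN-out each ticker's prices outside its S&P 500 membership windows."""
        masked = prices.copy()
        for ticker in masked.columns:
            windows = intervals.get(ticker)
            if not windows:  # e.g., the SPY benchmark column: leave untouched
                continue
            member = pd.Series(False, index=masked.index)
            for start, end in windows:
                end = end or masked.index[-1]  # open window: member through today
                member |= (masked.index >= pd.Timestamp(start)) & (masked.index <= pd.Timestamp(end))
            masked.loc[~member, ticker] = float("nan")
        return masked

A strategy that ranks on trailing returns then simply sees NaN for
non-members on each rebalance date, which is the point-in-time constraint.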
Headline finding: the biased backtest reports 40.9% CAGR for
recovery_mom_top10 over 2016-2026; the point-in-time version reports
22.4% (vs 14.0% SPY buy-and-hold). True edge is ~8pp CAGR, not ~27pp.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
research/fetch_historical.py (new file, 105 lines)
@@ -0,0 +1,105 @@
"""
Fetch price history for all tickers that were ever S&P 500 members — including
delisted ones — and save to data/us_pit.csv. This is the foundation for a
survivorship-bias-free backtest.

NOTE: Yahoo Finance no longer serves price data for many fully-delisted tickers
(bankruptcies, old mergers). Those are silently skipped. The result is still
a major improvement over "today's S&P 500 extrapolated 10 years back", but it
is NOT a perfect point-in-time dataset — only a dataset where the universe
mask is correct at each date. A subset of worst-outcome tickers (e.g., ABK,
ACAS) will be missing entirely. This caveat is documented in the run summary.
"""
import os
from datetime import datetime, timedelta

import pandas as pd
import yfinance as yf

import universe_history as uh

DATA_DIR = "data"
OUT_PATH = os.path.join(DATA_DIR, "us_pit.csv")
YEARS = 10
BATCH_SIZE = 50

def fetch_all_historical(force: bool = False) -> pd.DataFrame:
    os.makedirs(DATA_DIR, exist_ok=True)
    intervals = uh.load_sp500_history()
    tickers = uh.all_tickers_ever(intervals) + ["SPY"]
    tickers = sorted(set(tickers))

    existing = None
    if os.path.exists(OUT_PATH) and not force:
        existing = pd.read_csv(OUT_PATH, index_col=0, parse_dates=True)
        missing = [t for t in tickers if t not in existing.columns]
        if not missing:
            # All tickers already present: just append the latest dates.
            last_date = existing.index[-1]
            if (datetime.now() - last_date.to_pydatetime()).days < 2:
                print(f"--- us_pit.csv already up to date: {existing.shape} ---")
                return existing
            tickers = list(existing.columns)
            start = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")
            print(f"--- Appending new dates from {start} for {len(tickers)} tickers ---")
            new = _download_batched(tickers, start=start)
            if new is not None and not new.empty:
                combined = pd.concat([existing, new]).sort_index()
                combined = combined[~combined.index.duplicated(keep="last")]
                combined.to_csv(OUT_PATH)
                print(f"--- Saved {combined.shape} to {OUT_PATH} ---")
                return combined
            return existing
        else:
            print(f"--- Have {existing.shape[1]} cols; need {len(missing)} more ---")
            tickers = missing

    start = (datetime.now() - timedelta(days=365 * YEARS)).strftime("%Y-%m-%d")
    new = _download_batched(tickers, start=start)

    if existing is not None and new is not None and not new.empty:
        # Merge the newly fetched columns into the existing frame, date-aligned.
        combined = pd.concat([existing, new.reindex(existing.index)], axis=1)
        # Add any rows from `new` whose dates are not in `existing`.
        new_only_idx = new.index.difference(existing.index)
        if len(new_only_idx) > 0:
            combined_new = new.loc[new_only_idx].reindex(columns=combined.columns)
            combined = pd.concat([combined, combined_new]).sort_index()
    else:
        combined = new

    if combined is None:
        # Every batch failed: don't clobber the CSV with nothing.
        print("--- No data downloaded; nothing to save ---")
        return existing if existing is not None else pd.DataFrame()

    combined.to_csv(OUT_PATH)
    print(f"--- Saved {combined.shape} to {OUT_PATH} ---")
    return combined

def _download_batched(tickers: list[str], start: str) -> pd.DataFrame | None:
    frames = []
    n = len(tickers)
    for i in range(0, n, BATCH_SIZE):
        batch = tickers[i:i + BATCH_SIZE]
        print(f"  [{i}/{n}] fetching {len(batch)} tickers...", flush=True)
        try:
            raw = yf.download(batch, start=start, auto_adjust=True,
                              progress=False, threads=True)
            if raw.empty:
                continue
            # Multi-ticker downloads come back with MultiIndex columns
            # (field, ticker); a single-ticker download has flat columns.
            if isinstance(raw.columns, pd.MultiIndex):
                close = raw["Close"]
            else:
                close = raw[["Close"]].rename(columns={"Close": batch[0]})
            close = close.dropna(axis=1, how="all")
            if not close.empty:
                frames.append(close)
        except Exception as e:
            print(f"  batch failed: {e}")
    if not frames:
        return None
    result = pd.concat(frames, axis=1).sort_index()
    # Drop duplicate columns (a ticker can surface in more than one batch).
    result = result.loc[:, ~result.columns.duplicated()]
    return result

if __name__ == "__main__":
    fetch_all_historical()