Add point-in-time S&P 500 backtest to expose survivorship bias
The existing framework fetches today's S&P 500 constituents from Wikipedia
and applies that list to the entire 10-year price history — classic
survivorship bias. Stocks that went bankrupt or were removed for poor
performance are absent, while today's winners (which may have been minor
names 10 years ago) are implicitly selected. This materially inflates
reported strategy returns.
New pipeline:
- universe_history.py reconstructs per-ticker membership intervals by
walking Wikipedia's "Selected changes" table backward from today.
- research/fetch_historical.py downloads prices for all 848 tickers that
were ever members (Yahoo returns ~675 of them; ~170 fully delisted names
are unavailable, leaving a residual partial bias).
- research/pit_backtest.py masks prices to NaN outside membership
windows so strategies naturally cannot select non-members.
- research/strategies_plus.py adds RecoveryMomentumPlus (generalized
Recovery+Momentum with configurable weighting / blend / regime hook)
and an EnsembleStrategy.
- research/optimize.py runs five experiments: bias drift, hyperparameter
sweep (2016-2022 train / 2023-2026 test), SPY MA regime filter,
weighting schemes, and an uncorrelated-config ensemble.
Headline finding: the biased backtest reports 40.9% CAGR for
recovery_mom_top10 over 2016-2026; the point-in-time version reports
22.4% (vs 14.0% for SPY buy-and-hold). The true edge over SPY is ~8pp of
CAGR, not ~27pp.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
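
For orientation, the new pieces chain together roughly as in the sketch
below. This is a condensed, hypothetical driver (research/optimize.py is
the real one); it only calls functions that appear in this commit and
assumes it runs from the repo root.

import research.fetch_historical as fh
import research.pit_backtest as pit
from research.strategies_plus import RecoveryMomentumPlus

fh.fetch_all_historical()       # one-time: writes data/us_pit.csv
raw = pit.load_pit_prices()     # union price matrix, delisted names included
masked = pit.pit_universe(raw)  # NaN outside each ticker's membership window
tickers = [c for c in masked.columns if c != "SPY"]
equity = pit.backtest(RecoveryMomentumPlus(top_n=10), masked[tickers])
print(pit.fmt_row(pit.summarize(equity, name="recovery_mom_top10 (PIT)")))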
Files changed:

data/sp500_history.json (new file; diff suppressed: lines too long)
research/__init__.py (new file, empty)

research/fetch_historical.py (new file, 105 lines)
@@ -0,0 +1,105 @@
"""
Fetch price history for all tickers that were ever S&P 500 members — including
delisted ones — and save to data/us_pit.csv. This is the foundation for a
survivorship-bias-free backtest.

NOTE: Yahoo Finance no longer serves price data for many fully-delisted tickers
(bankruptcies, old mergers). Those are silently skipped. The result is still
a major improvement over "today's S&P 500 extrapolated 10 years back", but it
is NOT a perfect point-in-time dataset — only a dataset where the universe
mask is correct at each date. A subset of worst-outcome tickers (e.g., ABK,
ACAS) will be missing entirely. This caveat is documented in the run summary.
"""

import os
from datetime import datetime, timedelta

import pandas as pd
import yfinance as yf

import universe_history as uh

DATA_DIR = "data"
OUT_PATH = os.path.join(DATA_DIR, "us_pit.csv")
YEARS = 10
BATCH_SIZE = 50


def fetch_all_historical(force: bool = False) -> pd.DataFrame:
    os.makedirs(DATA_DIR, exist_ok=True)
    intervals = uh.load_sp500_history()
    tickers = uh.all_tickers_ever(intervals) + ["SPY"]
    tickers = sorted(set(tickers))

    existing = None
    if os.path.exists(OUT_PATH) and not force:
        existing = pd.read_csv(OUT_PATH, index_col=0, parse_dates=True)
        missing = [t for t in tickers if t not in existing.columns]
        if not missing:
            # Just append latest dates
            last_date = existing.index[-1]
            if (datetime.now() - last_date.to_pydatetime()).days < 2:
                print(f"--- us_pit.csv already up to date: {existing.shape} ---")
                return existing
            tickers = list(existing.columns)
            start = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")
            print(f"--- Appending new dates from {start} for {len(tickers)} tickers ---")
            new = _download_batched(tickers, start=start)
            if new is not None and not new.empty:
                combined = pd.concat([existing, new]).sort_index()
                combined = combined[~combined.index.duplicated(keep="last")]
                combined.to_csv(OUT_PATH)
                print(f"--- Saved {combined.shape} to {OUT_PATH} ---")
                return combined
            return existing
        else:
            print(f"--- Have {existing.shape[1]} cols; need {len(missing)} more ---")
            tickers = missing

    start = (datetime.now() - timedelta(days=365 * YEARS)).strftime("%Y-%m-%d")
    new = _download_batched(tickers, start=start)

    if existing is not None and new is not None and not new.empty:
        combined = pd.concat([existing, new.reindex(existing.index)], axis=1)
        # Add any new rows from `new` not in existing
        new_only_idx = new.index.difference(existing.index)
        if len(new_only_idx) > 0:
            combined_new = new.loc[new_only_idx].reindex(columns=combined.columns)
            combined = pd.concat([combined, combined_new]).sort_index()
    else:
        combined = new

    if combined is None or combined.empty:
        # _download_batched returns None when every batch fails; don't write
        # an empty us_pit.csv in that case.
        raise RuntimeError("No price data downloaded; nothing to save.")

    combined.to_csv(OUT_PATH)
    print(f"--- Saved {combined.shape} to {OUT_PATH} ---")
    return combined


def _download_batched(tickers: list[str], start: str) -> pd.DataFrame | None:
    frames = []
    n = len(tickers)
    for i in range(0, n, BATCH_SIZE):
        batch = tickers[i:i + BATCH_SIZE]
        print(f"  [{i}/{n}] fetching {len(batch)} tickers...", flush=True)
        try:
            raw = yf.download(batch, start=start, auto_adjust=True,
                              progress=False, threads=True)
            if raw.empty:
                continue
            if isinstance(raw.columns, pd.MultiIndex):
                close = raw["Close"]
            else:
                close = raw[["Close"]].rename(columns={"Close": batch[0]})
            close = close.dropna(axis=1, how="all")
            if not close.empty:
                frames.append(close)
        except Exception as e:
            print(f"  batch failed: {e}")
    if not frames:
        return None
    result = pd.concat(frames, axis=1).sort_index()
    result = result.loc[:, ~result.columns.duplicated()]
    return result


if __name__ == "__main__":
    fetch_all_historical()
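
Since the docstring above flags the missing-delisted caveat, the residual
gap can be enumerated directly. A small sketch, assuming data/us_pit.csv
already exists and the script runs from the repo root:

import pandas as pd
import universe_history as uh

prices = pd.read_csv("data/us_pit.csv", index_col=0, parse_dates=True)
ever = set(uh.all_tickers_ever())
fetched = set(prices.columns) - {"SPY"}
missing = sorted(ever - fetched)  # ever-members Yahoo no longer serves
print(f"{len(missing)} of {len(ever)} ever-member tickers unavailable")
print(missing[:20])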
research/optimize.py (new file, 299 lines)
@@ -0,0 +1,299 @@
"""
End-to-end optimization study for the US recovery+momentum strategy family,
run on a point-in-time (survivorship-bias-mitigated) S&P 500 universe.

Experiments:
E1 — Baseline drift: biased vs point-in-time universe, current top10 params.
E2 — Hyperparameter sweep with 2016-2022 train / 2023-2026 test split.
E3 — SPY MA200 regime filter (compare base vs filtered).
E4 — Weighting schemes: equal vs inverse-vol vs rank.
E5 — Ensemble of top-3 uncorrelated configs.

Usage: uv run python -m research.optimize
"""

import os

import numpy as np
import pandas as pd

import data_manager
import research.pit_backtest as pit
from research.strategies_plus import (EnsembleStrategy, RecoveryMomentumPlus,
                                      spy_ma200_filter)
from strategies.recovery_momentum import RecoveryMomentumStrategy

DATA_DIR = "data"
BIASED_CSV = os.path.join(DATA_DIR, "us.csv")


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def slice_period(df: pd.DataFrame, start: str | None, end: str | None) -> pd.DataFrame:
    out = df
    if start:
        out = out[out.index >= start]
    if end:
        out = out[out.index <= end]
    return out


def run_strategy(strategy, prices, benchmark=None, regime_filter=None,
                 fixed_fee: float = 0.0) -> pd.Series:
    return pit.backtest(
        strategy=strategy, prices=prices, initial_capital=10_000,
        transaction_cost=0.001, fixed_fee=fixed_fee,
        benchmark=benchmark, regime_filter=regime_filter,
    )


# ---------------------------------------------------------------------------
# Experiment 1: bias drift
# ---------------------------------------------------------------------------

def exp1_bias_drift(pit_prices_masked: pd.DataFrame) -> pd.DataFrame:
    print("\n" + "=" * 90)
    print("E1 — Biased universe vs Point-in-time universe (recovery_mom_top10)")
    print("=" * 90)
    rows = []

    # Biased: current 503 tickers extrapolated backward
    biased = pd.read_csv(BIASED_CSV, index_col=0, parse_dates=True)
    # Use same date range as PIT for a fair comparison
    common_start = max(biased.index[0], pit_prices_masked.index[0])
    common_end = min(biased.index[-1], pit_prices_masked.index[-1])
    biased_window = slice_period(biased, str(common_start.date()), str(common_end.date()))
    pit_window = slice_period(pit_prices_masked, str(common_start.date()), str(common_end.date()))

    # Drop non-ticker columns (SPY is in PIT but not in the masked tickers)
    biased_tickers = [c for c in biased_window.columns if c != "SPY"]
    pit_tickers = [c for c in pit_window.columns if c != "SPY"]

    # Use RecoveryMomentumPlus with identical defaults to recovery_mom_top10.
    # The original strategy uses na_option="bottom" which misranks NaN-masked
    # data (non-members appear "top"); the Plus variant uses na_option="keep".
    strat = RecoveryMomentumPlus(top_n=10)  # defaults match RecoveryMomentumStrategy
    eq_biased = run_strategy(strat, biased_window[biased_tickers])
    eq_pit = run_strategy(RecoveryMomentumPlus(top_n=10), pit_window[pit_tickers])

    rows.append(pit.summarize(eq_biased, name="recovery_mom_top10 (BIASED)"))
    rows.append(pit.summarize(eq_pit, name="recovery_mom_top10 (POINT-IN-TIME)"))
    # Benchmark: SPY buy-and-hold in same window
    if "SPY" in biased_window.columns:
        spy_bh = (biased_window["SPY"] / biased_window["SPY"].iloc[0]) * 10_000
        rows.append(pit.summarize(spy_bh, name="SPY buy-and-hold"))

    for r in rows:
        print(pit.fmt_row(r))
    return pd.DataFrame(rows)


# ---------------------------------------------------------------------------
# Experiment 2: hyperparameter sweep with train/test split
# ---------------------------------------------------------------------------

def exp2_sweep(pit_masked: pd.DataFrame) -> pd.DataFrame:
    print("\n" + "=" * 90)
    print("E2 — Hyperparameter sweep (train: 2016-2022, test: 2023-2026)")
    print("=" * 90)
    tickers = [c for c in pit_masked.columns if c != "SPY"]
    prices = pit_masked[tickers]

    train = slice_period(prices, "2016-04-01", "2022-12-31")
    test = slice_period(prices, "2023-01-01", None)

    grid = []
    for top_n in [5, 8, 10, 15]:
        for rec_win in [42, 63, 126]:
            for rec_w in [0.3, 0.5, 0.7]:
                for rebal in [10, 21]:
                    grid.append(dict(top_n=top_n, recovery_window=rec_win,
                                     rec_weight=rec_w, rebal_freq=rebal))

    results = []
    for i, cfg in enumerate(grid):
        strat_train = RecoveryMomentumPlus(**cfg)
        eq_tr = run_strategy(strat_train, train)
        sum_tr = pit.summarize(eq_tr, name="train")

        strat_test = RecoveryMomentumPlus(**cfg)
        eq_te = run_strategy(strat_test, test)
        sum_te = pit.summarize(eq_te, name="test")

        results.append({
            **cfg,
            "train_CAGR": sum_tr["CAGR"],
            "train_Sharpe": sum_tr["Sharpe"],
            "train_MaxDD": sum_tr["MaxDD"],
            "test_CAGR": sum_te["CAGR"],
            "test_Sharpe": sum_te["Sharpe"],
            "test_MaxDD": sum_te["MaxDD"],
            "test_Calmar": sum_te["Calmar"],
        })
        if (i + 1) % 10 == 0 or i == len(grid) - 1:
            print(f"  ... {i+1}/{len(grid)} configs evaluated")

    df = pd.DataFrame(results)
    df = df.sort_values("test_Sharpe", ascending=False)

    # Print top 10 by TEST Sharpe, then top 10 by TRAIN Sharpe to see overfit gap
    disp_cols = ["top_n", "recovery_window", "rec_weight", "rebal_freq",
                 "train_Sharpe", "test_Sharpe", "train_CAGR", "test_CAGR",
                 "test_MaxDD", "test_Calmar"]
    fmt = {"train_Sharpe": "{:.2f}".format, "test_Sharpe": "{:.2f}".format,
           "train_CAGR": "{:.1%}".format, "test_CAGR": "{:.1%}".format,
           "test_MaxDD": "{:.1%}".format, "test_Calmar": "{:.2f}".format}

    print("\n  --- Top 10 by TEST Sharpe (out-of-sample, 2023-2026) ---")
    print(df.head(10)[disp_cols].to_string(index=False, formatters=fmt))

    print("\n  --- Top 10 by TRAIN Sharpe (for comparison / overfit check) ---")
    df_tr = df.sort_values("train_Sharpe", ascending=False)
    print(df_tr.head(10)[disp_cols].to_string(index=False, formatters=fmt))

    return df


# ---------------------------------------------------------------------------
# Experiment 3: regime filter
# ---------------------------------------------------------------------------

def exp3_regime(pit_masked: pd.DataFrame) -> pd.DataFrame:
    print("\n" + "=" * 90)
    print("E3 — SPY MA200 regime filter (out-of-sample 2023-2026)")
    print("=" * 90)
    tickers = [c for c in pit_masked.columns if c != "SPY"]
    # Compute MA from FULL history so the filter is warmed up before 2023.
    spy_full = pit_masked["SPY"].dropna() if "SPY" in pit_masked.columns else None
    filt_full_200 = spy_ma200_filter(spy_full, ma_window=200) if spy_full is not None else None
    filt_full_150 = spy_ma200_filter(spy_full, ma_window=150) if spy_full is not None else None

    test = slice_period(pit_masked, "2023-01-01", None)
    prices = test[tickers]
    filt = filt_full_200.reindex(test.index).fillna(False).astype(bool) if filt_full_200 is not None else None
    filt_150 = filt_full_150.reindex(test.index).fillna(False).astype(bool) if filt_full_150 is not None else None
    rows = []
    base = RecoveryMomentumPlus(top_n=10)
    rows.append(pit.summarize(run_strategy(base, prices), name="top10 (no filter)"))
    rows.append(pit.summarize(run_strategy(RecoveryMomentumPlus(top_n=10), prices,
                                           regime_filter=filt),
                              name="top10 + SPY>MA200 filter"))
    rows.append(pit.summarize(run_strategy(RecoveryMomentumPlus(top_n=10), prices,
                                           regime_filter=filt_150),
                              name="top10 + SPY>MA150 filter"))

    for r in rows:
        print(pit.fmt_row(r))
    return pd.DataFrame(rows)


# ---------------------------------------------------------------------------
# Experiment 4: weighting schemes
# ---------------------------------------------------------------------------

def exp4_weighting(pit_masked: pd.DataFrame) -> pd.DataFrame:
    print("\n" + "=" * 90)
    print("E4 — Weighting schemes (out-of-sample 2023-2026, top_n=10)")
    print("=" * 90)
    tickers = [c for c in pit_masked.columns if c != "SPY"]
    test = slice_period(pit_masked[tickers], "2023-01-01", None)

    rows = []
    for w in ["equal", "inv_vol", "rank"]:
        strat = RecoveryMomentumPlus(top_n=10, weighting=w)
        eq = run_strategy(strat, test)
        rows.append(pit.summarize(eq, name=f"top10 weighting={w}"))
    for r in rows:
        print(pit.fmt_row(r))
    return pd.DataFrame(rows)


# ---------------------------------------------------------------------------
# Experiment 5: ensemble
# ---------------------------------------------------------------------------

def exp5_ensemble(pit_masked: pd.DataFrame, sweep_df: pd.DataFrame) -> pd.DataFrame:
    print("\n" + "=" * 90)
    print("E5 — Ensemble of 3 uncorrelated top configs (out-of-sample 2023-2026)")
    print("=" * 90)
    tickers = [c for c in pit_masked.columns if c != "SPY"]
    test = slice_period(pit_masked[tickers], "2023-01-01", None)

    # Pick top-20 by test_Sharpe, then greedily keep picks whose equity curves
    # correlate < 0.9 with already-kept picks.
    top20 = sweep_df.sort_values("test_Sharpe", ascending=False).head(20)
    curves = []
    components = []
    for _, row in top20.iterrows():
        cfg = dict(top_n=int(row["top_n"]),
                   recovery_window=int(row["recovery_window"]),
                   rec_weight=float(row["rec_weight"]),
                   rebal_freq=int(row["rebal_freq"]))
        strat = RecoveryMomentumPlus(**cfg)
        eq = run_strategy(strat, test)
        if any(eq.pct_change().corr(c.pct_change()) > 0.9 for c in curves):
            continue
        curves.append(eq)
        components.append((RecoveryMomentumPlus(**cfg), 1.0))
        if len(components) >= 3:
            break

    print(f"  Selected {len(components)} uncorrelated configs for ensemble:")
    for strat, _ in components:
        print(f"    top_n={strat.top_n}, rec_win={strat.recovery_window}, "
              f"rec_w={strat.rec_weight}, rebal={strat.rebal_freq}")

    ens = EnsembleStrategy(components)
    eq_ens = run_strategy(ens, test)

    rows = [
        pit.summarize(curves[i], name=f"  component {i+1}") for i in range(len(curves))
    ]
    rows.append(pit.summarize(eq_ens, name="ENSEMBLE (equal-weight)"))
    # Also ensemble + regime filter (compute MA from full history)
    if "SPY" in pit_masked.columns:
        spy_full = pit_masked["SPY"].dropna()
        filt = spy_ma200_filter(spy_full).reindex(test.index).fillna(False).astype(bool)
        eq_ens_reg = run_strategy(EnsembleStrategy(components), test, regime_filter=filt)
        rows.append(pit.summarize(eq_ens_reg, name="ENSEMBLE + SPY>MA200 filter"))

    for r in rows:
        print(pit.fmt_row(r))
    return pd.DataFrame(rows)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    print("Loading point-in-time price data...")
    raw = pit.load_pit_prices()
    print(f"  Raw (union) shape: {raw.shape}, {raw.index[0].date()} → {raw.index[-1].date()}")

    masked = pit.pit_universe(raw)
    # Sanity: how many ticker-days are masked out?
    total = masked.size
    valid = masked.notna().sum().sum()
    print(f"  Point-in-time valid ticker-days: {valid:,} / {total:,} ({valid/total*100:.1f}%)")
    daily_universe = masked.notna().sum(axis=1)
    print(f"  Universe size per day: min={daily_universe.min()}, median={int(daily_universe.median())}, max={daily_universe.max()}")

    e1 = exp1_bias_drift(masked)
    sweep = exp2_sweep(masked)
    e3 = exp3_regime(masked)
    e4 = exp4_weighting(masked)
    e5 = exp5_ensemble(masked, sweep)

    # Save sweep for inspection
    out = os.path.join(DATA_DIR, "research_sweep.csv")
    sweep.to_csv(out, index=False)
    print(f"\n  Full sweep saved to {out}")


if __name__ == "__main__":
    main()
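
E2 persists the full grid to data/research_sweep.csv, so the train/test
overfit gap can be examined offline. A sketch under that assumption; the
sharpe_gap column and the Spearman rank check are illustrative additions,
not part of the commit:

import pandas as pd

sweep = pd.read_csv("data/research_sweep.csv")
sweep["sharpe_gap"] = sweep["train_Sharpe"] - sweep["test_Sharpe"]
# If train and test Sharpe barely correlate, the sweep's winners are noise.
rho = sweep["train_Sharpe"].corr(sweep["test_Sharpe"], method="spearman")
print(f"Spearman(train_Sharpe, test_Sharpe) = {rho:.2f}")
print(sweep.nlargest(5, "sharpe_gap")[
    ["top_n", "recovery_window", "rec_weight", "train_Sharpe", "test_Sharpe"]])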
research/pit_backtest.py (new file, 125 lines)
@@ -0,0 +1,125 @@
"""
Point-in-time backtest runner.

Key idea: mask price data to NaN outside S&P 500 membership windows before
passing to the strategy. The strategy's signal computations then naturally
exclude non-members — no refactoring of strategies required.

Caveat: a stock joining the index has no signal for ~252 days after joining
(rolling windows need non-NaN warm-up). This is conservative but unbiased.
"""

import os

import numpy as np
import pandas as pd

import metrics
import universe_history as uh

DATA_DIR = "data"
PIT_CSV = os.path.join(DATA_DIR, "us_pit.csv")


# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------

def load_pit_prices() -> pd.DataFrame:
    """Load the full historical S&P 500 price matrix (delisted included)."""
    if not os.path.exists(PIT_CSV):
        raise FileNotFoundError(
            f"{PIT_CSV} not found. Run `uv run python -m research.fetch_historical` first."
        )
    df = pd.read_csv(PIT_CSV, index_col=0, parse_dates=True)
    return df.sort_index()


def pit_universe(prices: pd.DataFrame) -> pd.DataFrame:
    """Return prices masked to S&P 500 membership at each date (NaN outside)."""
    intervals = uh.load_sp500_history()
    return uh.mask_prices(prices, intervals)


# ---------------------------------------------------------------------------
# Backtest engine (mirrors main.backtest but accepts masked prices)
# ---------------------------------------------------------------------------

def backtest(
    strategy,
    prices: pd.DataFrame,
    initial_capital: float = 10_000,
    transaction_cost: float = 0.001,
    fixed_fee: float = 0.0,
    benchmark: pd.Series | None = None,
    regime_filter: pd.Series | None = None,
) -> pd.Series:
    """
    Vectorized backtest with optional regime filter.

    `regime_filter`: boolean series aligned to prices.index. True → be in the
    market (use strategy weights). False → go to cash. When None, always invested.
    """
    weights = strategy.generate_signals(prices)
    weights = weights.reindex(prices.index).fillna(0.0)

    if regime_filter is not None:
        rf = regime_filter.reindex(prices.index).fillna(False).astype(float)
        weights = weights.mul(rf, axis=0)

    daily_returns = prices.pct_change().fillna(0.0)
    portfolio_returns = (daily_returns * weights).sum(axis=1)

    turnover = weights.diff().abs().sum(axis=1).fillna(0.0)
    portfolio_returns -= turnover * transaction_cost

    if fixed_fee > 0:
        weight_changes = weights.diff().fillna(0.0)
        n_trades = (weight_changes.abs() > 1e-8).sum(axis=1)
        equity_running = (1 + portfolio_returns).cumprod() * initial_capital
        fee_impact = (n_trades * fixed_fee) / equity_running.shift(1).fillna(initial_capital)
        portfolio_returns -= fee_impact

    equity = (1 + portfolio_returns).cumprod() * initial_capital
    return equity


# ---------------------------------------------------------------------------
# Metrics helper
# ---------------------------------------------------------------------------

def summarize(equity: pd.Series, name: str = "") -> dict:
    """Return a dict of key performance metrics (no printing)."""
    eq = equity.dropna()
    if len(eq) < 2:
        return {"name": name, "error": "insufficient data"}
    daily = eq.pct_change().dropna()
    total_return = eq.iloc[-1] / eq.iloc[0] - 1
    years = (eq.index[-1] - eq.index[0]).days / 365.25
    cagr = (eq.iloc[-1] / eq.iloc[0]) ** (1 / years) - 1 if years > 0 else 0.0
    vol = daily.std() * np.sqrt(252)
    sharpe = (daily.mean() * 252) / vol if vol > 0 else 0.0
    downside = daily[daily < 0].std() * np.sqrt(252)
    sortino = (daily.mean() * 252) / downside if downside > 0 else 0.0
    dd = (eq / eq.cummax() - 1).min()
    calmar = cagr / abs(dd) if dd < 0 else 0.0
    return {
        "name": name,
        "CAGR": cagr,
        "Sharpe": sharpe,
        "Sortino": sortino,
        "MaxDD": dd,
        "Calmar": calmar,
        "TotalRet": total_return,
        "Vol": vol,
    }


def fmt_row(r: dict) -> str:
    return (f"  {r['name']:<38s} "
            f"CAGR={r['CAGR']*100:>6.1f}% "
            f"Sharpe={r['Sharpe']:>5.2f} "
            f"Sortino={r['Sortino']:>5.2f} "
            f"MaxDD={r['MaxDD']*100:>6.1f}% "
            f"Calmar={r['Calmar']:>5.2f} "
            f"Total={r['TotalRet']*100:>7.1f}%")
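
The engine is duck-typed: anything with generate_signals(prices) -> weights
DataFrame works. A toy sanity check on synthetic data (the FiftyFifty class
and the synthetic series are hypothetical, purely to exercise the cost and
compounding mechanics; assumes the repo root is on the import path):

import numpy as np
import pandas as pd
import research.pit_backtest as pit

idx = pd.bdate_range("2024-01-01", periods=252)
rng = np.random.default_rng(0)
prices = pd.DataFrame(
    100 * np.exp(np.cumsum(rng.normal(0.0003, 0.01, (252, 2)), axis=0)),
    index=idx, columns=["AAA", "BBB"])

class FiftyFifty:
    # Constant 50/50 allocation; turnover (hence cost) is ~zero after day 1.
    def generate_signals(self, data):
        return pd.DataFrame(0.5, index=data.index, columns=data.columns)

eq = pit.backtest(FiftyFifty(), prices, transaction_cost=0.001)
print(pit.fmt_row(pit.summarize(eq, name="50/50 toy")))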
research/strategies_plus.py (new file, 150 lines)
@@ -0,0 +1,150 @@
"""
Optimization variants of RecoveryMomentumStrategy.

Four dimensions explored:
1. Hyperparameters (top_n, recovery_window, mom_lookback, rebal_freq, weights)
2. Regime filter: zero-out weights when SPY < MA200
3. Weighting scheme: equal / inverse-vol / rank-weighted
4. Ensemble: weighted blend of multiple strategies

All strategies follow the same Strategy protocol (generate_signals → weights DF).
"""

import numpy as np
import pandas as pd

from strategies.base import Strategy


# ---------------------------------------------------------------------------
# Generalized Recovery+Momentum strategy
# ---------------------------------------------------------------------------

class RecoveryMomentumPlus(Strategy):
    """
    Recovery + momentum composite with configurable blend, weighting, and
    regime filter hooks.

    Parameters
    ----------
    recovery_window : int
        Lookback for the recovery factor (price / rolling min - 1).
    mom_lookback : int
        Long-horizon momentum window total length.
    mom_skip : int
        Short-term reversal skip for momentum.
    rebal_freq : int
        Trading-day rebalance interval.
    top_n : int
        Number of stocks selected each rebalance.
    rec_weight : float in [0, 1]
        Weight of recovery factor in composite rank blend (mom_weight = 1 - rec_weight).
    weighting : {"equal", "inv_vol", "rank"}
        Portfolio weighting scheme for the selected top_n.
    vol_window : int
        Volatility lookback when weighting="inv_vol".
    """

    def __init__(self,
                 recovery_window: int = 63,
                 mom_lookback: int = 252,
                 mom_skip: int = 21,
                 rebal_freq: int = 21,
                 top_n: int = 10,
                 rec_weight: float = 0.5,
                 weighting: str = "equal",
                 vol_window: int = 60):
        if weighting not in ("equal", "inv_vol", "rank"):
            raise ValueError(f"weighting must be equal|inv_vol|rank, got {weighting!r}")
        self.recovery_window = recovery_window
        self.mom_lookback = mom_lookback
        self.mom_skip = mom_skip
        self.rebal_freq = rebal_freq
        self.top_n = top_n
        self.rec_weight = rec_weight
        self.weighting = weighting
        self.vol_window = vol_window

    def generate_signals(self, data: pd.DataFrame) -> pd.DataFrame:
        # Factors
        recovery = data / data.rolling(self.recovery_window).min() - 1
        momentum = data.shift(self.mom_skip).pct_change(self.mom_lookback - self.mom_skip)

        rec_rank = recovery.rank(axis=1, pct=True, na_option="keep")
        mom_rank = momentum.rank(axis=1, pct=True, na_option="keep")
        composite = self.rec_weight * rec_rank + (1 - self.rec_weight) * mom_rank

        # Top-N selection
        rank = composite.rank(axis=1, ascending=False, na_option="bottom")
        n_valid = composite.notna().sum(axis=1)
        enough = n_valid >= self.top_n
        top_mask = (rank <= self.top_n) & enough.values.reshape(-1, 1)

        # Weighting within top-N
        if self.weighting == "equal":
            raw = top_mask.astype(float)
        elif self.weighting == "rank":
            # Higher composite → higher weight within top-N
            ranked_score = composite.where(top_mask, 0.0)
            raw = ranked_score
        elif self.weighting == "inv_vol":
            # Use inverse realized-volatility as weights within top-N
            rets = data.pct_change()
            vol = rets.rolling(self.vol_window).std()
            inv_vol = 1.0 / vol.replace(0, np.nan)
            raw = inv_vol.where(top_mask, 0.0).fillna(0.0)

        row_sums = raw.sum(axis=1).replace(0, np.nan)
        signals = raw.div(row_sums, axis=0).fillna(0.0)

        # Rebalance
        warmup = max(self.mom_lookback, self.recovery_window, self.vol_window)
        rebal_mask = pd.Series(False, index=data.index)
        rebal_indices = list(range(warmup, len(data), self.rebal_freq))
        rebal_mask.iloc[rebal_indices] = True
        signals[~rebal_mask] = np.nan
        signals = signals.ffill().fillna(0.0)
        signals.iloc[:warmup] = 0.0

        return signals.shift(1).fillna(0.0)


# ---------------------------------------------------------------------------
# Ensemble
# ---------------------------------------------------------------------------

class EnsembleStrategy(Strategy):
    """
    Weighted blend of several sub-strategies. Each sub-strategy produces a
    weight matrix; we linearly combine them. The result still sums to (at
    most) 1 per row since each sub-strategy does.
    """

    def __init__(self, components: list[tuple[Strategy, float]]):
        total = sum(w for _, w in components)
        self.components = [(s, w / total) for s, w in components]

    def generate_signals(self, data: pd.DataFrame) -> pd.DataFrame:
        out = None
        for strat, w in self.components:
            sig = strat.generate_signals(data).mul(w)
            if out is None:
                out = sig
            else:
                # Align columns (should be identical since same data passed)
                out = out.add(sig, fill_value=0.0)
        return out


# ---------------------------------------------------------------------------
# Regime filter helper
# ---------------------------------------------------------------------------

def spy_ma200_filter(spy: pd.Series, ma_window: int = 200) -> pd.Series:
    """
    Boolean Series: True when SPY close > SPY MA(ma_window), shifted by 1 to
    avoid lookahead. Use as `regime_filter=...` in pit_backtest.backtest().
    """
    ma = spy.rolling(ma_window, min_periods=ma_window).mean()
    signal = (spy > ma).fillna(False)
    return signal.shift(1).fillna(False)
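
The na_option detail called out in the commit message is easy to reproduce:
under NaN masking, pct ranks with na_option="bottom" hand non-members the
largest percentile, so they would look like top picks, while "keep" drops
them out of the composite. A minimal repro:

import numpy as np
import pandas as pd

row = pd.DataFrame([[0.10, 0.05, np.nan]], columns=["AAA", "BBB", "XXX"])
print(row.rank(axis=1, pct=True, na_option="bottom"))  # XXX -> 1.0, "best"
print(row.rank(axis=1, pct=True, na_option="keep"))    # XXX -> NaN, excluded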
universe_history.py (new file, 230 lines)
@@ -0,0 +1,230 @@
"""
Point-in-time index membership reconstruction — fixes survivorship bias.

Approach: Wikipedia's "Selected changes to the list of S&P 500 components"
table lists every add/remove event (394 rows back to 1976, as of 2026). We
start from today's membership and walk the change log *backward*:
  - An 'Added' ticker on date D was NOT a member before D.
  - A 'Removed' ticker on date D WAS a member before D.
Applied iteratively, this yields the set of members on any historical date.

The membership info is cached in data/sp500_history.json so Wikipedia is hit
at most once per day. The cache stores per-ticker membership intervals:
    { "ticker": [[start, end_or_null], ...] }
where dates are YYYY-MM-DD strings.
"""

import io
import json
import os
import urllib.request
from datetime import date, datetime

import pandas as pd

CACHE_DIR = "data"
_HEADERS = {"User-Agent": "Mozilla/5.0 (quant-backtest)"}


# ---------------------------------------------------------------------------
# Fetch + parse Wikipedia
# ---------------------------------------------------------------------------

def _fetch_sp500_tables() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return (current_list, changes_log) from the S&P 500 Wikipedia page."""
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    req = urllib.request.Request(url, headers=_HEADERS)
    with urllib.request.urlopen(req) as resp:
        html = resp.read().decode("utf-8")
    tables = pd.read_html(io.StringIO(html))
    current = tables[0]
    changes = tables[1]
    changes.columns = [
        "_".join(c).strip() if isinstance(c, tuple) else c
        for c in changes.columns
    ]
    changes.columns = [
        c.replace("Effective Date_Effective Date", "Date") for c in changes.columns
    ]
    return current, changes


def _normalize_ticker(t: str) -> str:
    """Yahoo Finance ticker format: BRK.B → BRK-B."""
    return str(t).replace(".", "-").strip()


# ---------------------------------------------------------------------------
# Membership reconstruction
# ---------------------------------------------------------------------------

def build_sp500_history() -> dict[str, list[list[str | None]]]:
    """
    Reconstruct per-ticker membership intervals.

    Returns
    -------
    dict: ticker -> list of [start_date, end_date_or_None] pairs.
        end_date=None means the ticker is still a member as of today.
        Dates are YYYY-MM-DD strings.

    Algorithm: start from today's set of members, walk the change log from
    newest to oldest. For each event on date D:
      - The 'Added' ticker: its current (open) interval starts on D.
        Close it: [..., D] — it was NOT a member before D.
      - The 'Removed' ticker: it was a member up to D (exclusive).
        Open a new interval ending on D (start unknown for now; will be
        closed by an earlier event or left open-start).

    After the walk, any ticker still "open" (never closed backward) has an
    interval reaching back before the earliest logged change.
    """
    current, changes = _fetch_sp500_tables()

    current_tickers = {_normalize_ticker(s) for s in current["Symbol"].tolist()}

    # Parse change log
    changes["dt"] = pd.to_datetime(changes["Date"], errors="coerce")
    changes = changes.dropna(subset=["dt"]).sort_values("dt", ascending=False)

    # For each ticker, collect intervals [start, end].
    # We track a "current open interval" per ticker during the backward walk.
    # intervals[ticker] = list of [start, end] completed intervals (oldest-first).
    # open_start[ticker] = start date of the currently open (most-recent) interval.
    intervals: dict[str, list[list[str | None]]] = {}
    open_end: dict[str, str | None] = {}  # end of currently-open interval

    # Initialize: today's members have an open interval ending = None (still in)
    for t in current_tickers:
        open_end[t] = None  # still a member today
        intervals[t] = []

    # Track the start date of each open interval as we walk backward.
    # For a member today, the interval started at the last "Added" event in the
    # changes log, OR before the log begins if never added.
    # We'll close the interval when we hit the "Added" event going backward.
    open_start: dict[str, str | None] = {t: None for t in current_tickers}

    for _, row in changes.iterrows():
        d = row["dt"].strftime("%Y-%m-%d")
        added = row.get("Added_Ticker")
        removed = row.get("Removed_Ticker")

        if pd.notna(added):
            a = _normalize_ticker(added)
            # This ticker was added on d → its open interval starts on d.
            if a in open_end:
                open_start[a] = d
                # Finalize the current open interval
                intervals[a].append([d, open_end[a]])
                # Pop: no further open interval backward in time for this ticker
                # (unless 'Removed' opens a new older one below)
                del open_end[a]

        if pd.notna(removed):
            r = _normalize_ticker(removed)
            # This ticker was removed on d → it WAS a member before d.
            # Open a new interval ending on d (start unknown yet).
            if r not in open_end:
                intervals.setdefault(r, [])
                open_end[r] = d  # end of the new older interval

    # Any ticker still with an open interval → start predates the log.
    # Use the oldest logged date as a conservative "unknown earlier" marker: None.
    for t, end in open_end.items():
        intervals.setdefault(t, []).append([None, end])

    # Sort intervals per ticker oldest→newest
    for t, ivs in intervals.items():
        ivs.sort(key=lambda iv: (iv[0] or "0000-00-00"))

    return intervals


# ---------------------------------------------------------------------------
# Cache I/O
# ---------------------------------------------------------------------------

def _cache_path() -> str:
    return os.path.join(CACHE_DIR, "sp500_history.json")


def load_sp500_history(force_refresh: bool = False) -> dict[str, list[list[str | None]]]:
    """Load cached membership history, or rebuild if stale (>1 day old)."""
    path = _cache_path()
    if not force_refresh and os.path.exists(path):
        try:
            with open(path) as f:
                data = json.load(f)
            if data.get("date") == str(date.today()):
                return data["intervals"]
        except Exception:
            pass
    print("--- Rebuilding S&P 500 membership history from Wikipedia ---")
    intervals = build_sp500_history()
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(path, "w") as f:
        json.dump({"date": str(date.today()), "intervals": intervals}, f)
    print(f"--- Cached {len(intervals)} tickers' membership intervals ---")
    return intervals


# ---------------------------------------------------------------------------
# Convert intervals → aligned mask DataFrame
# ---------------------------------------------------------------------------

def membership_mask(dates: pd.DatetimeIndex,
                    intervals: dict[str, list[list[str | None]]] | None = None,
                    tickers: list[str] | None = None) -> pd.DataFrame:
    """
    Boolean DataFrame: rows = dates, columns = tickers.
    True where the ticker was an S&P 500 member on that date.

    If `tickers` is given, restrict columns to that list (useful for aligning
    with a price DataFrame). Otherwise, include all tickers ever a member.
    """
    if intervals is None:
        intervals = load_sp500_history()
    cols = tickers if tickers is not None else sorted(intervals.keys())
    # Tickers not in `intervals` (e.g. SPY, benchmarks, ETFs) are treated as
    # always-members so callers can pass the full price matrix through
    # mask_prices without zeroing out benchmark series.
    mask = pd.DataFrame(False, index=dates, columns=cols)
    for t in cols:
        if t not in intervals:
            mask[t] = True
            continue
        for start, end in intervals[t]:
            s = pd.Timestamp(start) if start else dates[0]
            e = pd.Timestamp(end) if end else dates[-1] + pd.Timedelta(days=1)
            # Interval semantics: member on [start, end). A ticker removed on
            # date D was no longer a member on D.
            mask.loc[(mask.index >= s) & (mask.index < e), t] = True
    return mask


def all_tickers_ever(intervals: dict | None = None) -> list[str]:
    """All tickers that were ever S&P 500 members (for price data fetching)."""
    if intervals is None:
        intervals = load_sp500_history()
    return sorted(intervals.keys())


def mask_prices(prices: pd.DataFrame,
                intervals: dict | None = None) -> pd.DataFrame:
    """
    Return a copy of `prices` with NaN set for (date, ticker) pairs where
    the ticker was not an S&P 500 member on that date.

    This is the key survivorship-bias fix: strategies compute signals from
    the masked price data, so they naturally cannot select stocks outside
    the point-in-time index membership.

    Warm-up note: a newly-added member needs sufficient non-NaN history for
    its rolling windows to produce a valid signal. For this codebase's
    ~252-day lookbacks, a stock becomes "selectable" roughly 1 year after
    joining. This is conservative but correct: before that, we have no
    legitimate signal anyway.
    """
    mask = membership_mask(prices.index, intervals, tickers=list(prices.columns))
    return prices.where(mask)
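
A quick check of the interval semantics on synthetic data (tickers OLD and
NEW are made up for illustration): membership holds on [start, end), and a
None bound extends to the edge of the requested date range.

import pandas as pd
import universe_history as uh

dates = pd.date_range("2020-01-01", "2020-01-10")
intervals = {
    "OLD": [[None, "2020-01-04"]],   # removed on Jan 4
    "NEW": [["2020-01-06", None]],   # added on Jan 6, still a member
}
mask = uh.membership_mask(dates, intervals)
print(mask.loc["2020-01-03"])  # OLD True, NEW False
print(mask.loc["2020-01-04"])  # OLD False: not a member ON the removal date
print(mask.loc["2020-01-06"])  # NEW True from its add date onward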