""" Factor-level optimization on the point-in-time S&P 500 universe. Builds on the sweep results in data/sweep_*.csv. Runs four experiments: O1 — RecoveryMomentumPlus hyperparameter grid (top_n × rec_window × rec_weight × rebal), with 2016-2022 train / 2023-2026 test split. Picks best by Sharpe-on-test. O2 — SPY>MA regime filter applied to the 3 highest-Sharpe strategies (10y window). O3 — Top-3 uncorrelated ensemble: greedy corr<0.85 selection → equal-weight blend. O4 — Factor-mix parameter sweep on the FactorCombo "up_cap+mom_gap" signal (top_n × rebal_freq). All experiments run on PIT-masked data. Results printed + written to data/factor_optimize_.csv. Usage: uv run python -m research.factor_optimize """ from __future__ import annotations import os import warnings import numpy as np import pandas as pd import research.pit_backtest as pit from research.strategies_plus import (EnsembleStrategy, RecoveryMomentumPlus, spy_ma200_filter) from strategies.factor_combo import FactorComboStrategy, SIGNAL_REGISTRY from strategies.momentum_quality import MomentumQualityStrategy from strategies.recovery_momentum import RecoveryMomentumStrategy warnings.filterwarnings("ignore", category=FutureWarning) DATA_DIR = "data" BENCHMARK = "SPY" def load_masked_prices(): raw = pit.load_pit_prices() masked = pit.pit_universe(raw) if BENCHMARK in raw.columns: masked[BENCHMARK] = raw[BENCHMARK] return masked def slice_period(df, start=None, end=None): out = df if start: out = out[out.index >= start] if end: out = out[out.index <= end] return out def run(strat, prices, *, regime_filter=None): return pit.backtest( strategy=strat, prices=prices, initial_capital=10_000, transaction_cost=0.001, regime_filter=regime_filter, ) # --------------------------------------------------------------------------- # O1 — RecoveryMomentumPlus hyperparameter grid # --------------------------------------------------------------------------- def o1_hyperparam_sweep(masked): print("\n" + "=" * 100) print("O1 — RecoveryMomentumPlus sweep (train 2016-2022 / test 2023-2026)") print("=" * 100) tickers = [c for c in masked.columns if c != BENCHMARK] prices = masked[tickers] train = slice_period(prices, "2016-04-19", "2022-12-31") test = slice_period(prices, "2023-01-01", None) grid = [] for top_n in (5, 10, 15, 20): for rec_win in (42, 63, 126): for rec_w in (0.3, 0.5, 0.7): for rebal in (5, 10, 21): grid.append((top_n, rec_win, rec_w, rebal)) rows = [] for i, (top_n, rec_win, rec_w, rebal) in enumerate(grid, 1): cfg = dict(top_n=top_n, recovery_window=rec_win, rec_weight=rec_w, rebal_freq=rebal) tr = pit.summarize(run(RecoveryMomentumPlus(**cfg), train), "") te = pit.summarize(run(RecoveryMomentumPlus(**cfg), test), "") rows.append({**cfg, "train_CAGR": tr["CAGR"], "train_Sharpe": tr["Sharpe"], "test_CAGR": te["CAGR"], "test_Sharpe": te["Sharpe"], "test_MaxDD": te["MaxDD"], "test_Calmar": te["Calmar"]}) if i % 12 == 0 or i == len(grid): print(f" … {i}/{len(grid)} configs evaluated") df = pd.DataFrame(rows).sort_values("test_Sharpe", ascending=False) out = os.path.join(DATA_DIR, "factor_optimize_O1.csv") df.to_csv(out, index=False) print("\n --- Top 10 by out-of-sample Sharpe (2023-2026) ---") disp = ["top_n", "recovery_window", "rec_weight", "rebal_freq", "train_Sharpe", "test_Sharpe", "train_CAGR", "test_CAGR", "test_MaxDD", "test_Calmar"] print(df.head(10)[disp].to_string(index=False, formatters={ "train_Sharpe": "{:.2f}".format, "test_Sharpe": "{:.2f}".format, "train_CAGR": "{:.1%}".format, "test_CAGR": "{:.1%}".format, "test_MaxDD": "{:.1%}".format, "test_Calmar": "{:.2f}".format, })) return df # --------------------------------------------------------------------------- # O2 — Regime filter on the top strategies # --------------------------------------------------------------------------- def o2_regime(masked): print("\n" + "=" * 100) print("O2 — SPY > MA regime filter on top strategies (full 10y PIT)") print("=" * 100) tickers = [c for c in masked.columns if c != BENCHMARK] prices = masked[tickers] spy_full = masked[BENCHMARK].dropna() contenders = { "Recovery+Mom Top10": RecoveryMomentumStrategy(top_n=10), "fc_up_cap_mom_gap_monthly": FactorComboStrategy("up_cap+mom_gap", rebal_freq=21, top_n=10), "fc_rec63_mom_gap_monthly": FactorComboStrategy("rec63+mom_gap", rebal_freq=21, top_n=10), } rows = [] for name, strat in contenders.items(): base = run(strat, prices) rows.append({"strategy": name, "filter": "none", **{k: v for k, v in pit.summarize(base, "").items() if k != "name"}}) for ma in (200, 150, 100): filt = spy_ma200_filter(spy_full, ma_window=ma).reindex(prices.index).fillna(False) strat_fresh = _fresh_copy(strat) eq = run(strat_fresh, prices, regime_filter=filt) rows.append({"strategy": name, "filter": f"SPY>MA{ma}", **{k: v for k, v in pit.summarize(eq, "").items() if k != "name"}}) df = pd.DataFrame(rows) df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O2.csv"), index=False) print(f" {'strategy':<32s} {'filter':<12s} {'CAGR':>7s} {'Sharpe':>7s} " f"{'MaxDD':>7s} {'Calmar':>7s}") for _, r in df.iterrows(): print(f" {r['strategy']:<32s} {r['filter']:<12s} " f"{r['CAGR']*100:>6.1f}% {r['Sharpe']:>7.2f} " f"{r['MaxDD']*100:>6.1f}% {r['Calmar']:>7.2f}") return df def _fresh_copy(strat): """Re-instantiate a strategy so state (if any) is reset between backtests.""" if isinstance(strat, RecoveryMomentumStrategy): return RecoveryMomentumStrategy( recovery_window=strat.recovery_window, mom_lookback=strat.mom_lookback, mom_skip=strat.mom_skip, rebal_freq=strat.rebal_freq, top_n=strat.top_n) if isinstance(strat, FactorComboStrategy): return FactorComboStrategy(strat.signal_name, rebal_freq=strat.rebal_freq, top_n=strat.top_n) if isinstance(strat, MomentumQualityStrategy): return MomentumQualityStrategy( momentum_period=strat.momentum_period, skip=strat.skip, quality_window=strat.quality_window, top_n=strat.top_n) return strat # already stateless for our uses # --------------------------------------------------------------------------- # O3 — Uncorrelated ensemble # --------------------------------------------------------------------------- def o3_ensemble(masked): print("\n" + "=" * 100) print("O3 — Greedy uncorrelated ensemble (full 10y PIT)") print("=" * 100) tickers = [c for c in masked.columns if c != BENCHMARK] prices = masked[tickers] spy_full = masked[BENCHMARK].dropna() # Candidate pool: the production strategies that cleared 0.75 Sharpe in 10y sweep. candidates: list[tuple[str, object]] = [ ("Recovery+Mom Top10", RecoveryMomentumStrategy(top_n=10)), ("fc_up_cap_mom_gap_monthly", FactorComboStrategy("up_cap+mom_gap", 21, 10)), ("fc_rec63_mom_gap_monthly", FactorComboStrategy("rec63+mom_gap", 21, 10)), ("fc_up_cap_quality_mom_monthly", FactorComboStrategy("up_cap+quality_mom", 21, 10)), ("fc_rec_mfilt_deep_upvol_monthly", FactorComboStrategy("rec_mfilt+deep_upvol", 21, 10)), ("fc_mom7m_rec126_monthly", FactorComboStrategy("mom7m+rec126", 21, 10)), ("Recovery+Mom Top20", RecoveryMomentumStrategy(top_n=20)), ("fc_down_resil_qual_mom_monthly", FactorComboStrategy("down_resil+qual_mom", 21, 10)), ] equities: dict[str, pd.Series] = {name: run(s, prices) for name, s in candidates} returns = pd.DataFrame({n: eq.pct_change().fillna(0) for n, eq in equities.items()}) sharpes = {n: pit.summarize(eq, n)["Sharpe"] for n, eq in equities.items()} order = sorted(candidates, key=lambda t: sharpes[t[0]], reverse=True) picked_names: list[str] = [] picked: list[tuple[object, float]] = [] for name, strat in order: if any(returns[name].corr(returns[p]) > 0.85 for p in picked_names): continue picked_names.append(name) picked.append((strat, 1.0)) if len(picked) >= 3: break print(f" Selected {len(picked)} uncorrelated components:") for name in picked_names: print(f" - {name} (Sharpe={sharpes[name]:.2f})") ens = EnsembleStrategy(picked) eq_ens = run(ens, prices) filt = spy_ma200_filter(spy_full).reindex(prices.index).fillna(False) eq_ens_reg = run(EnsembleStrategy(picked), prices, regime_filter=filt) spy_bh = (masked[BENCHMARK].dropna().pipe(lambda s: s / s.iloc[0] * 10_000)) rows = [pit.summarize(spy_bh, "SPY buy-and-hold")] for name in picked_names: rows.append(pit.summarize(equities[name], f" component: {name}")) rows.append(pit.summarize(eq_ens, "ENSEMBLE (equal-weight, no filter)")) rows.append(pit.summarize(eq_ens_reg, "ENSEMBLE + SPY>MA200 filter")) for r in rows: print(pit.fmt_row(r)) df = pd.DataFrame(rows) df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O3.csv"), index=False) return df, picked_names # --------------------------------------------------------------------------- # O4 — FactorCombo up_cap+mom_gap: top_n × rebal sweep # --------------------------------------------------------------------------- def o4_factorcombo_sweep(masked): print("\n" + "=" * 100) print("O4 — FactorCombo up_cap+mom_gap: top_n × rebal (full 10y PIT)") print("=" * 100) tickers = [c for c in masked.columns if c != BENCHMARK] prices = masked[tickers] rows = [] for top_n in (5, 8, 10, 15, 20, 30): for rebal in (5, 10, 21, 42): strat = FactorComboStrategy("up_cap+mom_gap", rebal_freq=rebal, top_n=top_n) eq = run(strat, prices) s = pit.summarize(eq, f"top_n={top_n} rebal={rebal}") rows.append({"top_n": top_n, "rebal": rebal, "CAGR": s["CAGR"], "Sharpe": s["Sharpe"], "MaxDD": s["MaxDD"], "Calmar": s["Calmar"]}) df = pd.DataFrame(rows).sort_values("Sharpe", ascending=False) df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O4.csv"), index=False) print(f" {'top_n':<8s}{'rebal':<8s}{'CAGR':>8s}{'Sharpe':>9s}" f"{'MaxDD':>9s}{'Calmar':>9s}") for _, r in df.iterrows(): print(f" {int(r['top_n']):<8d}{int(r['rebal']):<8d}" f"{r['CAGR']*100:>7.1f}%{r['Sharpe']:>9.2f}" f"{r['MaxDD']*100:>8.1f}%{r['Calmar']:>9.2f}") return df def main(): print("Loading PIT-masked price data…") masked = load_masked_prices() print(f" shape={masked.shape} range={masked.index[0].date()} → {masked.index[-1].date()}") o1 = o1_hyperparam_sweep(masked) o2 = o2_regime(masked) o3, picks = o3_ensemble(masked) o4 = o4_factorcombo_sweep(masked) print("\n" + "=" * 100) print("Summary: best config from each experiment") print("=" * 100) best_o1 = o1.iloc[0] print(f" O1 best OOS Sharpe: top_n={int(best_o1['top_n'])} rec_win={int(best_o1['recovery_window'])} " f"rec_w={best_o1['rec_weight']} rebal={int(best_o1['rebal_freq'])} " f"→ test Sharpe={best_o1['test_Sharpe']:.2f} test CAGR={best_o1['test_CAGR']*100:.1f}%") best_o4 = o4.iloc[0] print(f" O4 best overall: top_n={int(best_o4['top_n'])} rebal={int(best_o4['rebal'])} " f"Sharpe={best_o4['Sharpe']:.2f} CAGR={best_o4['CAGR']*100:.1f}% " f"Calmar={best_o4['Calmar']:.2f}") if __name__ == "__main__": main()