295 lines
12 KiB
Python
295 lines
12 KiB
Python
"""
|
||
Factor-level optimization on the point-in-time S&P 500 universe.
|
||
|
||
Builds on the sweep results in data/sweep_*.csv. Runs four experiments:
|
||
|
||
O1 — RecoveryMomentumPlus hyperparameter grid (top_n × rec_window × rec_weight × rebal),
|
||
with 2016-2022 train / 2023-2026 test split. Picks best by Sharpe-on-test.
|
||
O2 — SPY>MA regime filter applied to the 3 highest-Sharpe strategies (10y window).
|
||
O3 — Top-3 uncorrelated ensemble: greedy corr<0.85 selection → equal-weight blend.
|
||
O4 — Factor-mix parameter sweep on the FactorCombo "up_cap+mom_gap" signal
|
||
(top_n × rebal_freq).
|
||
|
||
All experiments run on PIT-masked data. Results printed + written to
|
||
data/factor_optimize_<exp>.csv.
|
||
|
||
Usage:
|
||
uv run python -m research.factor_optimize
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import warnings
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
|
||
import research.pit_backtest as pit
|
||
from research.strategies_plus import (EnsembleStrategy, RecoveryMomentumPlus,
|
||
spy_ma200_filter)
|
||
from strategies.factor_combo import FactorComboStrategy, SIGNAL_REGISTRY
|
||
from strategies.momentum_quality import MomentumQualityStrategy
|
||
from strategies.recovery_momentum import RecoveryMomentumStrategy
|
||
|
||
# Silence FutureWarning messages for the whole run.
warnings.filterwarnings("ignore", category=FutureWarning)

# Directory that every experiment writes its factor_optimize_*.csv file into.
DATA_DIR = "data"
# Column name of the benchmark series inside the price frame.
BENCHMARK = "SPY"
|
||
|
||
|
||
def load_masked_prices():
    """Load prices via ``pit.load_pit_prices`` and pass them through ``pit.pit_universe``.

    If the raw frame has a ``BENCHMARK`` column, that column is copied from
    the raw frame onto the masked frame, so the benchmark series is taken
    as loaded rather than as returned by ``pit.pit_universe``.

    Returns:
        The frame returned by ``pit.pit_universe``, with the benchmark column
        overwritten/added when it is available in the raw data.
    """
    unmasked = pit.load_pit_prices()
    universe = pit.pit_universe(unmasked)
    if BENCHMARK not in unmasked.columns:
        return universe
    universe[BENCHMARK] = unmasked[BENCHMARK]
    return universe
|
||
|
||
|
||
def slice_period(df, start=None, end=None):
    """Restrict *df* to rows whose index lies in ``[start, end]`` (inclusive).

    A falsy bound (``None``, empty string, ...) means "unbounded" on that
    side. When both bounds are falsy the input object itself is returned.

    Args:
        df: Object with an ``index`` that supports ``>=`` / ``<=`` against
            the bounds and boolean-mask selection via ``df[mask]``.
        start: Inclusive lower bound, or a falsy value for no lower bound.
        end: Inclusive upper bound, or a falsy value for no upper bound.

    Returns:
        The filtered object, or *df* unchanged when no bound is given.
    """
    if not start and not end:
        return df
    idx = df.index
    if start and end:
        return df[(idx >= start) & (idx <= end)]
    return df[idx >= start] if start else df[idx <= end]
|
||
|
||
|
||
def run(strat, prices, *, regime_filter=None):
    """Backtest *strat* on *prices* through ``pit.backtest`` with fixed settings.

    Every experiment in this module shares the same ``initial_capital``
    (10_000) and ``transaction_cost`` (0.001); only the strategy, the price
    frame and the optional regime filter vary.

    Args:
        strat: Strategy object forwarded as ``strategy=``.
        prices: Price frame forwarded as ``prices=``.
        regime_filter: Optional filter forwarded unchanged (keyword-only).

    Returns:
        Whatever ``pit.backtest`` returns.
    """
    fixed_settings = {
        "initial_capital": 10_000,
        "transaction_cost": 0.001,
        "regime_filter": regime_filter,
    }
    return pit.backtest(strategy=strat, prices=prices, **fixed_settings)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# O1 — RecoveryMomentumPlus hyperparameter grid
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def o1_hyperparam_sweep(masked):
    """Run experiment O1: grid-search RecoveryMomentumPlus on a train/test split.

    Every combination of ``top_n``, ``recovery_window``, ``rec_weight`` and
    ``rebal_freq`` is backtested twice with a newly constructed strategy:
    once on the train slice (2016-04-19 .. 2022-12-31) and once on the test
    slice (2023-01-01 onwards). The benchmark column is excluded from the
    traded universe.

    Side effects:
        Prints progress and the top-10 table, and writes all rows to
        ``<DATA_DIR>/factor_optimize_O1.csv``.

    Args:
        masked: Price frame whose columns are tickers (plus, optionally,
            the benchmark).

    Returns:
        DataFrame with one row per configuration, sorted by ``test_Sharpe``
        in descending order.
    """
    banner = "=" * 100
    print("\n" + banner)
    print("O1 — RecoveryMomentumPlus sweep (train 2016-2022 / test 2023-2026)")
    print(banner)

    universe = masked[[col for col in masked.columns if col != BENCHMARK]]
    train = slice_period(universe, "2016-04-19", "2022-12-31")
    test = slice_period(universe, "2023-01-01", None)

    # Full cartesian product, enumerated in the same nesting order as the
    # parameter names below (top_n outermost, rebal innermost).
    grid = [
        (top_n, rec_win, rec_w, rebal)
        for top_n in (5, 10, 15, 20)
        for rec_win in (42, 63, 126)
        for rec_w in (0.3, 0.5, 0.7)
        for rebal in (5, 10, 21)
    ]
    total = len(grid)

    rows = []
    for done, (top_n, rec_win, rec_w, rebal) in enumerate(grid, 1):
        cfg = {
            "top_n": top_n,
            "recovery_window": rec_win,
            "rec_weight": rec_w,
            "rebal_freq": rebal,
        }
        # A separate strategy instance per backtest so the train run cannot
        # influence the test run.
        in_sample = pit.summarize(run(RecoveryMomentumPlus(**cfg), train), "")
        out_sample = pit.summarize(run(RecoveryMomentumPlus(**cfg), test), "")

        row = dict(cfg)
        row["train_CAGR"] = in_sample["CAGR"]
        row["train_Sharpe"] = in_sample["Sharpe"]
        row["test_CAGR"] = out_sample["CAGR"]
        row["test_Sharpe"] = out_sample["Sharpe"]
        row["test_MaxDD"] = out_sample["MaxDD"]
        row["test_Calmar"] = out_sample["Calmar"]
        rows.append(row)

        if done == total or done % 12 == 0:
            print(f" … {done}/{total} configs evaluated")

    df = pd.DataFrame(rows).sort_values("test_Sharpe", ascending=False)
    csv_path = os.path.join(DATA_DIR, "factor_optimize_O1.csv")
    df.to_csv(csv_path, index=False)

    print("\n --- Top 10 by out-of-sample Sharpe (2023-2026) ---")
    shown_columns = [
        "top_n", "recovery_window", "rec_weight", "rebal_freq",
        "train_Sharpe", "test_Sharpe", "train_CAGR", "test_CAGR",
        "test_MaxDD", "test_Calmar",
    ]
    format_specs = {
        "train_Sharpe": "{:.2f}",
        "test_Sharpe": "{:.2f}",
        "train_CAGR": "{:.1%}",
        "test_CAGR": "{:.1%}",
        "test_MaxDD": "{:.1%}",
        "test_Calmar": "{:.2f}",
    }
    formatters = {column: spec.format for column, spec in format_specs.items()}
    top_ten = df.head(10)[shown_columns]
    print(top_ten.to_string(index=False, formatters=formatters))
    return df
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# O2 — Regime filter on the top strategies
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def o2_regime(masked):
    """Run experiment O2: compare three strategies with and without a SPY>MA filter.

    Each contender is backtested once unfiltered and then once per moving
    average window (200, 150, 100). For the filtered runs the strategy is
    rebuilt with ``_fresh_copy`` and the filter series is aligned to the
    price index, with missing entries filled with ``False``.

    Side effects:
        Prints a results table and writes it to
        ``<DATA_DIR>/factor_optimize_O2.csv``.

    Args:
        masked: Price frame containing the ticker columns and the
            ``BENCHMARK`` column.

    Returns:
        DataFrame with one row per (strategy, filter) pair, in evaluation
        order.
    """
    banner = "=" * 100
    print("\n" + banner)
    print("O2 — SPY > MA regime filter on top strategies (full 10y PIT)")
    print(banner)

    prices = masked[[col for col in masked.columns if col != BENCHMARK]]
    spy_full = masked[BENCHMARK].dropna()

    contenders = {
        "Recovery+Mom Top10": RecoveryMomentumStrategy(top_n=10),
        "fc_up_cap_mom_gap_monthly": FactorComboStrategy(
            "up_cap+mom_gap", rebal_freq=21, top_n=10),
        "fc_rec63_mom_gap_monthly": FactorComboStrategy(
            "rec63+mom_gap", rebal_freq=21, top_n=10),
    }

    def _metrics(equity):
        # Summary of one equity curve without the (empty) "name" entry.
        summary = pit.summarize(equity, "")
        return {key: value for key, value in summary.items() if key != "name"}

    rows = []
    for label, strat in contenders.items():
        unfiltered = run(strat, prices)
        rows.append({"strategy": label, "filter": "none", **_metrics(unfiltered)})

        for ma in (200, 150, 100):
            gate = spy_ma200_filter(spy_full, ma_window=ma)
            gate = gate.reindex(prices.index).fillna(False)
            rebuilt = _fresh_copy(strat)
            gated = run(rebuilt, prices, regime_filter=gate)
            rows.append({"strategy": label, "filter": f"SPY>MA{ma}", **_metrics(gated)})

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O2.csv"), index=False)

    header = (f" {'strategy':<32s} {'filter':<12s} {'CAGR':>7s} {'Sharpe':>7s} "
              f"{'MaxDD':>7s} {'Calmar':>7s}")
    print(header)
    for _, r in df.iterrows():
        line = (f" {r['strategy']:<32s} {r['filter']:<12s} "
                f"{r['CAGR']*100:>6.1f}% {r['Sharpe']:>7.2f} "
                f"{r['MaxDD']*100:>6.1f}% {r['Calmar']:>7.2f}")
        print(line)
    return df
|
||
|
||
|
||
def _fresh_copy(strat):
    """Return a newly constructed strategy with the same parameters as *strat*.

    Known strategy classes are rebuilt from the attributes listed in the
    table below so that any internal state is reset between backtests.
    Classes are checked in table order and the first ``isinstance`` match
    wins. Objects of any other type are returned as-is.

    Args:
        strat: Strategy instance to duplicate.

    Returns:
        A new instance for recognised classes, otherwise *strat* itself.
    """
    # (class, attributes passed positionally, attributes passed by keyword)
    rebuild_specs = (
        (RecoveryMomentumStrategy, (),
         ("recovery_window", "mom_lookback", "mom_skip", "rebal_freq", "top_n")),
        (FactorComboStrategy, ("signal_name",),
         ("rebal_freq", "top_n")),
        (MomentumQualityStrategy, (),
         ("momentum_period", "skip", "quality_window", "top_n")),
    )
    for cls, positional, keywords in rebuild_specs:
        if isinstance(strat, cls):
            args = [getattr(strat, attr) for attr in positional]
            kwargs = {attr: getattr(strat, attr) for attr in keywords}
            return cls(*args, **kwargs)
    return strat  # already stateless for our uses
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# O3 — Uncorrelated ensemble
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def o3_ensemble(masked):
    """Run experiment O3: greedy selection of up to three low-correlation strategies.

    All candidates are backtested individually, ranked by Sharpe, and then
    walked in that order: a candidate is skipped when its return series has
    correlation above 0.85 with any already selected one. Selection stops
    at three picks. The picks are blended with equal weights (1.0 each) in
    an ``EnsembleStrategy`` that is backtested with and without the SPY
    moving-average regime filter.

    Side effects:
        Prints the selection and a comparison table, and writes the table
        to ``<DATA_DIR>/factor_optimize_O3.csv``.

    Args:
        masked: Price frame containing the ticker columns and the
            ``BENCHMARK`` column.

    Returns:
        ``(df, picked_names)``: the comparison table and the names of the
        selected components in selection order.
    """
    print("\n" + "=" * 100)
    print("O3 — Greedy uncorrelated ensemble (full 10y PIT)")
    print("=" * 100)
    tickers = [c for c in masked.columns if c != BENCHMARK]
    prices = masked[tickers]
    spy_full = masked[BENCHMARK].dropna()

    # Candidate pool: the production strategies that cleared 0.75 Sharpe in 10y sweep.
    candidates: list[tuple[str, object]] = [
        ("Recovery+Mom Top10", RecoveryMomentumStrategy(top_n=10)),
        ("fc_up_cap_mom_gap_monthly", FactorComboStrategy("up_cap+mom_gap", 21, 10)),
        ("fc_rec63_mom_gap_monthly", FactorComboStrategy("rec63+mom_gap", 21, 10)),
        ("fc_up_cap_quality_mom_monthly", FactorComboStrategy("up_cap+quality_mom", 21, 10)),
        ("fc_rec_mfilt_deep_upvol_monthly", FactorComboStrategy("rec_mfilt+deep_upvol", 21, 10)),
        ("fc_mom7m_rec126_monthly", FactorComboStrategy("mom7m+rec126", 21, 10)),
        ("Recovery+Mom Top20", RecoveryMomentumStrategy(top_n=20)),
        ("fc_down_resil_qual_mom_monthly", FactorComboStrategy("down_resil+qual_mom", 21, 10)),
    ]

    equities: dict[str, pd.Series] = {name: run(s, prices) for name, s in candidates}
    returns = pd.DataFrame({n: eq.pct_change().fillna(0) for n, eq in equities.items()})
    sharpes = {n: pit.summarize(eq, n)["Sharpe"] for n, eq in equities.items()}
    order = sorted(candidates, key=lambda t: sharpes[t[0]], reverse=True)

    picked_names: list[str] = []
    picked: list[tuple[object, float]] = []
    for name, strat in order:
        # Reject anything too correlated with a component we already hold.
        if any(returns[name].corr(returns[p]) > 0.85 for p in picked_names):
            continue
        picked_names.append(name)
        picked.append((strat, 1.0))
        if len(picked) >= 3:
            break

    print(f" Selected {len(picked)} uncorrelated components:")
    for name in picked_names:
        print(f" - {name} (Sharpe={sharpes[name]:.2f})")

    def _fresh_components():
        # The picked instances were already backtested individually above.
        # Rebuild them for every ensemble run so no internal state carries
        # over, following the same convention as o2_regime.
        return [(_fresh_copy(strat), weight) for strat, weight in picked]

    eq_ens = run(EnsembleStrategy(_fresh_components()), prices)
    filt = spy_ma200_filter(spy_full).reindex(prices.index).fillna(False)
    eq_ens_reg = run(EnsembleStrategy(_fresh_components()), prices, regime_filter=filt)

    # Benchmark buy-and-hold curve rebased to the backtests' 10_000 starting capital.
    spy_bh = spy_full / spy_full.iloc[0] * 10_000
    rows = [pit.summarize(spy_bh, "SPY buy-and-hold")]
    for name in picked_names:
        rows.append(pit.summarize(equities[name], f" component: {name}"))
    rows.append(pit.summarize(eq_ens, "ENSEMBLE (equal-weight, no filter)"))
    rows.append(pit.summarize(eq_ens_reg, "ENSEMBLE + SPY>MA200 filter"))
    for r in rows:
        print(pit.fmt_row(r))

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O3.csv"), index=False)
    return df, picked_names
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# O4 — FactorCombo up_cap+mom_gap: top_n × rebal sweep
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def o4_factorcombo_sweep(masked):
    """Run experiment O4: sweep ``top_n`` × ``rebal_freq`` for FactorCombo "up_cap+mom_gap".

    Each combination is backtested on the full price frame (benchmark
    column excluded) with a newly constructed ``FactorComboStrategy``.

    Side effects:
        Prints a results table and writes it to
        ``<DATA_DIR>/factor_optimize_O4.csv``.

    Args:
        masked: Price frame whose columns are tickers (plus, optionally,
            the benchmark).

    Returns:
        DataFrame with one row per combination, sorted by ``Sharpe`` in
        descending order.
    """
    banner = "=" * 100
    print("\n" + banner)
    print("O4 — FactorCombo up_cap+mom_gap: top_n × rebal (full 10y PIT)")
    print(banner)

    prices = masked[[col for col in masked.columns if col != BENCHMARK]]

    def _evaluate(top_n, rebal):
        # Backtest one (top_n, rebal) pair and keep the headline metrics.
        strat = FactorComboStrategy("up_cap+mom_gap", rebal_freq=rebal, top_n=top_n)
        stats = pit.summarize(run(strat, prices), f"top_n={top_n} rebal={rebal}")
        return {
            "top_n": top_n,
            "rebal": rebal,
            "CAGR": stats["CAGR"],
            "Sharpe": stats["Sharpe"],
            "MaxDD": stats["MaxDD"],
            "Calmar": stats["Calmar"],
        }

    combos = [
        (top_n, rebal)
        for top_n in (5, 8, 10, 15, 20, 30)
        for rebal in (5, 10, 21, 42)
    ]
    rows = [_evaluate(top_n, rebal) for top_n, rebal in combos]

    df = pd.DataFrame(rows).sort_values("Sharpe", ascending=False)
    df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O4.csv"), index=False)

    print(f" {'top_n':<8s}{'rebal':<8s}{'CAGR':>8s}{'Sharpe':>9s}"
          f"{'MaxDD':>9s}{'Calmar':>9s}")
    for rec in df.itertuples(index=False):
        print(f" {int(rec.top_n):<8d}{int(rec.rebal):<8d}"
              f"{rec.CAGR*100:>7.1f}%{rec.Sharpe:>9.2f}"
              f"{rec.MaxDD*100:>8.1f}%{rec.Calmar:>9.2f}")
    return df
|
||
|
||
|
||
def main():
    """Load the masked price data, run experiments O1-O4, and print a summary.

    The experiments run in order O1, O2, O3, O4; each prints its own table
    and writes its own CSV. The closing summary reports one line per
    experiment: the top row of the sorted O1 and O4 frames, the
    highest-Sharpe row of O2, and the component names selected by O3.
    """
    print("Loading PIT-masked price data…")
    masked = load_masked_prices()
    print(f" shape={masked.shape} range={masked.index[0].date()} → {masked.index[-1].date()}")

    o1 = o1_hyperparam_sweep(masked)
    o2 = o2_regime(masked)
    # Only the selected component names are needed for the summary.
    _, picks = o3_ensemble(masked)
    o4 = o4_factorcombo_sweep(masked)

    print("\n" + "=" * 100)
    print("Summary: best config from each experiment")
    print("=" * 100)

    # o1 is already sorted by test_Sharpe (descending), so row 0 is the best.
    best_o1 = o1.iloc[0]
    print(f" O1 best OOS Sharpe: top_n={int(best_o1['top_n'])} rec_win={int(best_o1['recovery_window'])} "
          f"rec_w={best_o1['rec_weight']} rebal={int(best_o1['rebal_freq'])} "
          f"→ test Sharpe={best_o1['test_Sharpe']:.2f} test CAGR={best_o1['test_CAGR']*100:.1f}%")

    # o2 is returned in evaluation order, so rank it here.
    best_o2 = o2.sort_values("Sharpe", ascending=False).iloc[0]
    print(f" O2 best overall: {best_o2['strategy']} filter={best_o2['filter']} "
          f"Sharpe={best_o2['Sharpe']:.2f} CAGR={best_o2['CAGR']*100:.1f}% "
          f"Calmar={best_o2['Calmar']:.2f}")

    print(f" O3 ensemble components: {', '.join(picks)}")

    # o4 is already sorted by Sharpe (descending).
    best_o4 = o4.iloc[0]
    print(f" O4 best overall: top_n={int(best_o4['top_n'])} rebal={int(best_o4['rebal'])} "
          f"Sharpe={best_o4['Sharpe']:.2f} CAGR={best_o4['CAGR']*100:.1f}% "
          f"Calmar={best_o4['Calmar']:.2f}")
|
||
|
||
|
||
# Script entry point: run all experiments when executed as a program
# (e.g. via ``python -m research.factor_optimize``), not when imported.
if __name__ == "__main__":
    main()
|