Files
quant/research/factor_optimize.py

295 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Factor-level optimization on the point-in-time S&P 500 universe.
Builds on the sweep results in data/sweep_*.csv. Runs four experiments:
O1 — RecoveryMomentumPlus hyperparameter grid (top_n × rec_window × rec_weight × rebal),
with 2016-2022 train / 2023-2026 test split. Picks best by Sharpe-on-test.
O2 — SPY>MA regime filter applied to the 3 highest-Sharpe strategies (10y window).
O3 — Top-3 uncorrelated ensemble: greedy corr<0.85 selection → equal-weight blend.
O4 — Factor-mix parameter sweep on the FactorCombo "up_cap+mom_gap" signal
(top_n × rebal_freq).
All experiments run on PIT-masked data. Results printed + written to
data/factor_optimize_<exp>.csv.
Usage:
uv run python -m research.factor_optimize
"""
from __future__ import annotations
import os
import warnings
import numpy as np
import pandas as pd
import research.pit_backtest as pit
from research.strategies_plus import (EnsembleStrategy, RecoveryMomentumPlus,
spy_ma200_filter)
from strategies.factor_combo import FactorComboStrategy, SIGNAL_REGISTRY
from strategies.momentum_quality import MomentumQualityStrategy
from strategies.recovery_momentum import RecoveryMomentumStrategy
# Silence all FutureWarning messages for the whole process.
warnings.filterwarnings("ignore", category=FutureWarning)
# Directory (relative to the working directory) that receives the
# factor_optimize_*.csv result files written by the experiments below.
DATA_DIR = "data"
# Column name of the benchmark series in the price frame; every experiment
# excludes this column when building its list of tradable tickers.
BENCHMARK = "SPY"
def load_masked_prices():
    """Load the PIT price frame, apply the universe mask, and restore the benchmark.

    The frame returned by ``pit.pit_universe`` is modified in place: when the
    raw frame has a ``BENCHMARK`` column, that column is copied over from the
    raw data so the benchmark series is taken from the unmasked prices.

    Returns:
        The frame produced by ``pit.pit_universe`` (with the benchmark column
        overwritten from the raw data when it exists there).
    """
    raw_prices = pit.load_pit_prices()
    universe = pit.pit_universe(raw_prices)
    if BENCHMARK not in raw_prices.columns:
        return universe
    universe[BENCHMARK] = raw_prices[BENCHMARK]
    return universe
def slice_period(df, start=None, end=None):
    """Return the rows of *df* whose index lies in the closed range [start, end].

    Args:
        df: Frame (or series) whose index supports ``>=`` / ``<=`` comparison
            with the supplied bounds.
        start: Inclusive lower bound, or ``None`` for no lower bound.
        end: Inclusive upper bound, or ``None`` for no upper bound.

    Returns:
        *df* itself when both bounds are ``None``; otherwise the filtered
        selection.
    """
    out = df
    # Compare against None explicitly: a truthiness test would silently skip
    # valid-but-falsy bounds such as 0 on an integer index.
    if start is not None:
        out = out[out.index >= start]
    if end is not None:
        out = out[out.index <= end]
    return out
def run(strat, prices, *, regime_filter=None):
    """Backtest *strat* on *prices* with this module's fixed capital and cost settings.

    Args:
        strat: Strategy object handed to ``pit.backtest`` as ``strategy``.
        prices: Price frame handed to ``pit.backtest`` as ``prices``.
        regime_filter: Optional filter forwarded unchanged to ``pit.backtest``.

    Returns:
        Whatever ``pit.backtest`` returns (callers in this file treat it as an
        equity curve).
    """
    settings = {
        "strategy": strat,
        "prices": prices,
        "initial_capital": 10_000,
        "transaction_cost": 0.001,
        "regime_filter": regime_filter,
    }
    return pit.backtest(**settings)
# ---------------------------------------------------------------------------
# O1 — RecoveryMomentumPlus hyperparameter grid
# ---------------------------------------------------------------------------
def o1_hyperparam_sweep(masked):
    """Grid-search RecoveryMomentumPlus over top_n x recovery_window x rec_weight x rebal_freq.

    Each of the 4 * 3 * 3 * 3 = 108 configurations is backtested twice, each
    time with a newly constructed strategy: once on rows from 2016-04-19
    through 2022-12-31 ("train") and once on rows from 2023-01-01 onward
    ("test"). The table is sorted by test Sharpe (descending), written to
    ``DATA_DIR/factor_optimize_O1.csv`` and the first ten rows are printed.

    NOTE: the ranking key is the test-window Sharpe, so the "test" figures of
    the top rows are selected-on rather than untouched hold-out numbers.

    Args:
        masked: Price frame; the ``BENCHMARK`` column is excluded from trading.

    Returns:
        The sorted results DataFrame.
    """
    print("\n" + "=" * 100)
    print("O1 — RecoveryMomentumPlus sweep (train 2016-2022 / test 2023-2026)")
    print("=" * 100)

    tradable = [col for col in masked.columns if col != BENCHMARK]
    universe = masked[tradable]
    train = slice_period(universe, "2016-04-19", "2022-12-31")
    test = slice_period(universe, "2023-01-01", None)

    # Same iteration order as four nested loops: the rightmost axis varies fastest.
    grid = [
        (top_n, rec_win, rec_w, rebal)
        for top_n in (5, 10, 15, 20)
        for rec_win in (42, 63, 126)
        for rec_w in (0.3, 0.5, 0.7)
        for rebal in (5, 10, 21)
    ]
    param_names = ("top_n", "recovery_window", "rec_weight", "rebal_freq")

    rows = []
    for i, params in enumerate(grid, start=1):
        cfg = dict(zip(param_names, params))
        # A new strategy instance per backtest so no state carries over.
        tr = pit.summarize(run(RecoveryMomentumPlus(**cfg), train), "")
        te = pit.summarize(run(RecoveryMomentumPlus(**cfg), test), "")
        record = dict(cfg)
        record["train_CAGR"] = tr["CAGR"]
        record["train_Sharpe"] = tr["Sharpe"]
        record["test_CAGR"] = te["CAGR"]
        record["test_Sharpe"] = te["Sharpe"]
        record["test_MaxDD"] = te["MaxDD"]
        record["test_Calmar"] = te["Calmar"]
        rows.append(record)
        if i % 12 == 0 or i == len(grid):
            print(f"{i}/{len(grid)} configs evaluated")

    df = pd.DataFrame(rows).sort_values("test_Sharpe", ascending=False)
    df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O1.csv"), index=False)

    print("\n --- Top 10 by out-of-sample Sharpe (2023-2026) ---")
    ratio = "{:.2f}".format
    percent = "{:.1%}".format
    column_formats = {
        "train_Sharpe": ratio,
        "test_Sharpe": ratio,
        "train_CAGR": percent,
        "test_CAGR": percent,
        "test_MaxDD": percent,
        "test_Calmar": ratio,
    }
    shown = [
        "top_n", "recovery_window", "rec_weight", "rebal_freq",
        "train_Sharpe", "test_Sharpe", "train_CAGR", "test_CAGR",
        "test_MaxDD", "test_Calmar",
    ]
    top_rows = df.head(10)[shown]
    print(top_rows.to_string(index=False, formatters=column_formats))
    return df
# ---------------------------------------------------------------------------
# O2 — Regime filter on the top strategies
# ---------------------------------------------------------------------------
def o2_regime(masked):
    """Compare three strategies with and without an SPY moving-average regime filter.

    For each contender the unfiltered backtest is run first, followed by one
    filtered backtest per moving-average window (200, 150, 100). Filtered runs
    use a re-instantiated strategy from ``_fresh_copy``. The filter series is
    aligned to the price index and gaps are filled with ``False``. Results are
    written to ``DATA_DIR/factor_optimize_O2.csv`` and printed as a table.

    Args:
        masked: Price frame that includes the ``BENCHMARK`` column.

    Returns:
        DataFrame with one row per (strategy, filter) pair.
    """
    print("\n" + "=" * 100)
    print("O2 — SPY > MA regime filter on top strategies (full 10y PIT)")
    print("=" * 100)

    tradable = [col for col in masked.columns if col != BENCHMARK]
    universe = masked[tradable]
    spy_full = masked[BENCHMARK].dropna()

    contenders = {
        "Recovery+Mom Top10": RecoveryMomentumStrategy(top_n=10),
        "fc_up_cap_mom_gap_monthly": FactorComboStrategy(
            "up_cap+mom_gap", rebal_freq=21, top_n=10),
        "fc_rec63_mom_gap_monthly": FactorComboStrategy(
            "rec63+mom_gap", rebal_freq=21, top_n=10),
    }

    def metrics(equity):
        # Summary fields of one equity curve, minus the (blank) "name" entry.
        summary = pit.summarize(equity, "")
        return {key: value for key, value in summary.items() if key != "name"}

    rows = []
    for name, strat in contenders.items():
        unfiltered = run(strat, universe)
        rows.append({"strategy": name, "filter": "none", **metrics(unfiltered)})
        for ma in (200, 150, 100):
            regime = spy_ma200_filter(spy_full, ma_window=ma)
            regime = regime.reindex(universe.index).fillna(False)
            filtered = run(_fresh_copy(strat), universe, regime_filter=regime)
            rows.append({"strategy": name, "filter": f"SPY>MA{ma}",
                         **metrics(filtered)})

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O2.csv"), index=False)

    print(f" {'strategy':<32s} {'filter':<12s} {'CAGR':>7s} {'Sharpe':>7s} "
          f"{'MaxDD':>7s} {'Calmar':>7s}")
    for _, r in df.iterrows():
        print(f" {r['strategy']:<32s} {r['filter']:<12s} "
              f"{r['CAGR']*100:>6.1f}% {r['Sharpe']:>7.2f} "
              f"{r['MaxDD']*100:>6.1f}% {r['Calmar']:>7.2f}")
    return df
def _fresh_copy(strat):
    """Return a newly constructed strategy configured like *strat*.

    Known strategy classes are rebuilt from the attributes listed below so
    that any state held by *strat* is not carried into the next backtest.
    Classes are checked in the listed order and the first match wins. An
    object of any other type is returned unchanged.
    """
    # (class, attributes passed positionally, attributes passed by keyword)
    rebuild_specs = (
        (RecoveryMomentumStrategy, (),
         ("recovery_window", "mom_lookback", "mom_skip", "rebal_freq", "top_n")),
        (FactorComboStrategy, ("signal_name",),
         ("rebal_freq", "top_n")),
        (MomentumQualityStrategy, (),
         ("momentum_period", "skip", "quality_window", "top_n")),
    )
    for cls, positional, keywords in rebuild_specs:
        if isinstance(strat, cls):
            args = [getattr(strat, attr) for attr in positional]
            kwargs = {attr: getattr(strat, attr) for attr in keywords}
            return cls(*args, **kwargs)
    return strat  # already stateless for our uses
# ---------------------------------------------------------------------------
# O3 — Uncorrelated ensemble
# ---------------------------------------------------------------------------
def o3_ensemble(masked):
    """Build and evaluate a greedy low-correlation, equal-weight ensemble.

    Steps:
      1. Backtest every candidate strategy on the tradable columns.
      2. Walk the candidates in descending Sharpe order and keep a candidate
         unless its daily-return correlation with an already kept one exceeds
         0.85; stop after three picks.
      3. Backtest the equal-weight ensemble of the picks, once unfiltered and
         once with the default ``spy_ma200_filter`` regime filter.
      4. Print and save (``DATA_DIR/factor_optimize_O3.csv``) the summaries of
         SPY buy-and-hold, each component, and both ensembles.

    Each ensemble backtest receives re-instantiated components from
    ``_fresh_copy``, so the instances that already ran in step 1 are not
    reused. This matches how ``o2_regime`` resets strategies between backtests.

    Args:
        masked: Price frame that includes the ``BENCHMARK`` column.

    Returns:
        ``(df, picked_names)``: the summary DataFrame and the names of the
        selected components in pick order.
    """
    print("\n" + "=" * 100)
    print("O3 — Greedy uncorrelated ensemble (full 10y PIT)")
    print("=" * 100)

    tickers = [c for c in masked.columns if c != BENCHMARK]
    prices = masked[tickers]
    spy_full = masked[BENCHMARK].dropna()

    # Candidate pool: the production strategies that cleared 0.75 Sharpe in 10y sweep.
    candidates: list[tuple[str, object]] = [
        ("Recovery+Mom Top10", RecoveryMomentumStrategy(top_n=10)),
        ("fc_up_cap_mom_gap_monthly", FactorComboStrategy("up_cap+mom_gap", 21, 10)),
        ("fc_rec63_mom_gap_monthly", FactorComboStrategy("rec63+mom_gap", 21, 10)),
        ("fc_up_cap_quality_mom_monthly", FactorComboStrategy("up_cap+quality_mom", 21, 10)),
        ("fc_rec_mfilt_deep_upvol_monthly", FactorComboStrategy("rec_mfilt+deep_upvol", 21, 10)),
        ("fc_mom7m_rec126_monthly", FactorComboStrategy("mom7m+rec126", 21, 10)),
        ("Recovery+Mom Top20", RecoveryMomentumStrategy(top_n=20)),
        ("fc_down_resil_qual_mom_monthly", FactorComboStrategy("down_resil+qual_mom", 21, 10)),
    ]

    equities: dict[str, pd.Series] = {name: run(s, prices) for name, s in candidates}
    returns = pd.DataFrame({n: eq.pct_change().fillna(0) for n, eq in equities.items()})
    sharpes = {n: pit.summarize(eq, n)["Sharpe"] for n, eq in equities.items()}

    # Greedy selection: best Sharpe first, skipping anything too correlated
    # with a component that has already been kept.
    order = sorted(candidates, key=lambda t: sharpes[t[0]], reverse=True)
    picked_names: list[str] = []
    picked: list[tuple[object, float]] = []
    for name, strat in order:
        if any(returns[name].corr(returns[p]) > 0.85 for p in picked_names):
            continue
        picked_names.append(name)
        picked.append((strat, 1.0))
        if len(picked) >= 3:
            break

    print(f" Selected {len(picked)} uncorrelated components:")
    for name in picked_names:
        print(f" - {name} (Sharpe={sharpes[name]:.2f})")

    def fresh_ensemble():
        # The picked instances were already backtested above; rebuild them so
        # no per-run state leaks into (or between) the ensemble backtests.
        return EnsembleStrategy([(_fresh_copy(s), w) for s, w in picked])

    eq_ens = run(fresh_ensemble(), prices)
    filt = spy_ma200_filter(spy_full).reindex(prices.index).fillna(False)
    eq_ens_reg = run(fresh_ensemble(), prices, regime_filter=filt)

    # Buy-and-hold benchmark scaled to the same 10_000 starting capital that
    # run() passes to the backtester.
    spy_bh = spy_full / spy_full.iloc[0] * 10_000

    rows = [pit.summarize(spy_bh, "SPY buy-and-hold")]
    for name in picked_names:
        rows.append(pit.summarize(equities[name], f" component: {name}"))
    rows.append(pit.summarize(eq_ens, "ENSEMBLE (equal-weight, no filter)"))
    rows.append(pit.summarize(eq_ens_reg, "ENSEMBLE + SPY>MA200 filter"))
    for r in rows:
        print(pit.fmt_row(r))

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O3.csv"), index=False)
    return df, picked_names
# ---------------------------------------------------------------------------
# O4 — FactorCombo up_cap+mom_gap: top_n × rebal sweep
# ---------------------------------------------------------------------------
def o4_factorcombo_sweep(masked):
    """Sweep top_n x rebal_freq for the FactorCombo "up_cap+mom_gap" signal.

    Every (top_n, rebal) pair is backtested on the tradable columns of
    *masked*. Results are sorted by Sharpe (descending), written to
    ``DATA_DIR/factor_optimize_O4.csv`` and printed as a table.

    Args:
        masked: Price frame; the ``BENCHMARK`` column is excluded from trading.

    Returns:
        The sorted results DataFrame.
    """
    print("\n" + "=" * 100)
    print("O4 — FactorCombo up_cap+mom_gap: top_n × rebal (full 10y PIT)")
    print("=" * 100)

    tradable = [col for col in masked.columns if col != BENCHMARK]
    universe = masked[tradable]

    # Same order as two nested loops: rebal varies fastest.
    combos = [
        (top_n, rebal)
        for top_n in (5, 8, 10, 15, 20, 30)
        for rebal in (5, 10, 21, 42)
    ]
    metric_keys = ("CAGR", "Sharpe", "MaxDD", "Calmar")

    rows = []
    for top_n, rebal in combos:
        strategy = FactorComboStrategy("up_cap+mom_gap", rebal_freq=rebal, top_n=top_n)
        stats = pit.summarize(run(strategy, universe), f"top_n={top_n} rebal={rebal}")
        record = {"top_n": top_n, "rebal": rebal}
        for key in metric_keys:
            record[key] = stats[key]
        rows.append(record)

    df = pd.DataFrame(rows).sort_values("Sharpe", ascending=False)
    df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O4.csv"), index=False)

    print(f" {'top_n':<8s}{'rebal':<8s}{'CAGR':>8s}{'Sharpe':>9s}"
          f"{'MaxDD':>9s}{'Calmar':>9s}")
    for _, r in df.iterrows():
        print(f" {int(r['top_n']):<8d}{int(r['rebal']):<8d}"
              f"{r['CAGR']*100:>7.1f}%{r['Sharpe']:>9.2f}"
              f"{r['MaxDD']*100:>8.1f}%{r['Calmar']:>9.2f}")
    return df
def main():
    """Run experiments O1-O4 in order and print a one-line summary for each.

    Loads the price frame once via ``load_masked_prices`` and hands it to each
    experiment. Every experiment prints its own table and writes its own CSV;
    the closing summary reports the best row (or selection) per experiment.
    """
    print("Loading PIT-masked price data…")
    masked = load_masked_prices()
    # Separate the two dates explicitly; printing them back to back made the
    # range unreadable.
    print(f" shape={masked.shape} "
          f"range={masked.index[0].date()} → {masked.index[-1].date()}")

    o1 = o1_hyperparam_sweep(masked)
    o2 = o2_regime(masked)
    _, picks = o3_ensemble(masked)
    o4 = o4_factorcombo_sweep(masked)

    print("\n" + "=" * 100)
    print("Summary: best config from each experiment")
    print("=" * 100)

    # o1 and o4 are returned already sorted best-first by their ranking metric.
    best_o1 = o1.iloc[0]
    print(f" O1 best OOS Sharpe: top_n={int(best_o1['top_n'])} rec_win={int(best_o1['recovery_window'])} "
          f"rec_w={best_o1['rec_weight']} rebal={int(best_o1['rebal_freq'])} "
          f"→ test Sharpe={best_o1['test_Sharpe']:.2f} test CAGR={best_o1['test_CAGR']*100:.1f}%")

    # o2 is returned in evaluation order, so rank it by Sharpe here.
    best_o2 = o2.sort_values("Sharpe", ascending=False).iloc[0]
    print(f" O2 best by Sharpe: {best_o2['strategy']} [{best_o2['filter']}] "
          f"Sharpe={best_o2['Sharpe']:.2f} CAGR={best_o2['CAGR']*100:.1f}%")

    print(f" O3 ensemble components: {', '.join(picks)}")

    best_o4 = o4.iloc[0]
    print(f" O4 best overall: top_n={int(best_o4['top_n'])} rebal={int(best_o4['rebal'])} "
          f"Sharpe={best_o4['Sharpe']:.2f} CAGR={best_o4['CAGR']*100:.1f}% "
          f"Calmar={best_o4['Calmar']:.2f}")


if __name__ == "__main__":
    main()