# quant/research/factor_optimize.py

"""
Factor-level optimization on the point-in-time S&P 500 universe.

Builds on the sweep results in data/sweep_*.csv. Runs four experiments:

  O1 — RecoveryMomentumPlus hyperparameter grid (top_n × rec_window × rec_weight × rebal),
       with 2016-2022 train / 2023-2026 test split. Picks best by Sharpe-on-test.
  O2 — SPY>MA regime filter applied to the 3 highest-Sharpe strategies (10y window).
  O3 — Top-3 uncorrelated ensemble: greedy corr<0.85 selection → equal-weight blend.
  O4 — Factor-mix parameter sweep on the FactorCombo "up_cap+mom_gap" signal
       (top_n × rebal_freq).

All experiments run on PIT-masked data. Results printed + written to
data/factor_optimize_<exp>.csv.

Usage:
    uv run python -m research.factor_optimize
"""

from __future__ import annotations

import os
import warnings

import numpy as np
import pandas as pd

import research.pit_backtest as pit
from research.strategies_plus import (EnsembleStrategy, RecoveryMomentumPlus,
                                      spy_ma200_filter)
from strategies.factor_combo import FactorComboStrategy, SIGNAL_REGISTRY
from strategies.momentum_quality import MomentumQualityStrategy
from strategies.recovery_momentum import RecoveryMomentumStrategy

# Install a global "ignore" filter for FutureWarning (presumably to keep the
# experiment tables readable — the warnings themselves are not shown here).
warnings.filterwarnings("ignore", category=FutureWarning)

# Directory (relative path) that the factor_optimize_<exp>.csv files are written to.
DATA_DIR = "data"
# Column name of the benchmark series: excluded from the tradable ticker list,
# and used for the regime filter and the buy-and-hold comparison row.
BENCHMARK = "SPY"


def load_masked_prices():
    """Load prices, run them through ``pit.pit_universe``, and keep the benchmark.

    When the raw frame has a ``BENCHMARK`` column, that column is copied from
    the raw data into the result, so the benchmark series comes straight from
    the raw prices rather than from the ``pit_universe`` output.

    Returns:
        The frame produced by ``pit.pit_universe`` (with the benchmark column
        overwritten/added as described above).
    """
    all_prices = pit.load_pit_prices()
    universe = pit.pit_universe(all_prices)
    if BENCHMARK not in all_prices.columns:
        return universe
    universe[BENCHMARK] = all_prices[BENCHMARK]
    return universe


def slice_period(df, start=None, end=None):
    """Return the rows of *df* whose index lies within ``[start, end]``.

    Both bounds are inclusive. A falsy bound (``None``, ``""``, ...) leaves
    that side open; with neither bound given, *df* itself is returned.
    """
    window = df[df.index >= start] if start else df
    return window[window.index <= end] if end else window


def run(strat, prices, *, regime_filter=None):
    """Backtest *strat* on *prices* with the settings shared by every experiment.

    Calls ``pit.backtest`` with 10,000 initial capital and a 0.001 transaction
    cost, forwarding the optional *regime_filter* unchanged, and returns
    whatever ``pit.backtest`` returns.
    """
    backtest_kwargs = {
        "strategy": strat,
        "prices": prices,
        "initial_capital": 10_000,
        "transaction_cost": 0.001,
        "regime_filter": regime_filter,
    }
    return pit.backtest(**backtest_kwargs)


# ---------------------------------------------------------------------------
# O1 — RecoveryMomentumPlus hyperparameter grid
# ---------------------------------------------------------------------------

def o1_hyperparam_sweep(masked):
    """Grid-search RecoveryMomentumPlus and rank configs by out-of-sample Sharpe.

    Every combination of top_n, recovery_window, rec_weight and rebal_freq is
    backtested twice, each time with a newly constructed strategy: once on the
    2016-04-19..2022-12-31 slice and once on everything from 2023-01-01 on.
    The benchmark column is dropped from the price frame first.

    The table, sorted by ``test_Sharpe`` descending, is written to
    ``factor_optimize_O1.csv`` under ``DATA_DIR`` and its ten best rows are
    printed.

    Returns:
        The sorted results DataFrame, one row per configuration.
    """
    banner = "=" * 100
    print("\n" + banner)
    print("O1 — RecoveryMomentumPlus sweep (train 2016-2022 / test 2023-2026)")
    print(banner)

    universe = masked[[col for col in masked.columns if col != BENCHMARK]]
    in_sample = slice_period(universe, "2016-04-19", "2022-12-31")
    out_sample = slice_period(universe, "2023-01-01", None)

    # Same iteration order as four nested loops, outermost first.
    combos = [
        (top_n, rec_win, rec_w, rebal)
        for top_n in (5, 10, 15, 20)
        for rec_win in (42, 63, 126)
        for rec_w in (0.3, 0.5, 0.7)
        for rebal in (5, 10, 21)
    ]
    total = len(combos)

    records = []
    for done, (top_n, rec_win, rec_w, rebal) in enumerate(combos, start=1):
        params = {
            "top_n": top_n,
            "recovery_window": rec_win,
            "rec_weight": rec_w,
            "rebal_freq": rebal,
        }
        # A separate strategy instance per backtest, as state may not carry over.
        train_stats = pit.summarize(run(RecoveryMomentumPlus(**params), in_sample), "")
        test_stats = pit.summarize(run(RecoveryMomentumPlus(**params), out_sample), "")

        record = dict(params)
        record["train_CAGR"] = train_stats["CAGR"]
        record["train_Sharpe"] = train_stats["Sharpe"]
        record["test_CAGR"] = test_stats["CAGR"]
        record["test_Sharpe"] = test_stats["Sharpe"]
        record["test_MaxDD"] = test_stats["MaxDD"]
        record["test_Calmar"] = test_stats["Calmar"]
        records.append(record)

        if done == total or done % 12 == 0:
            print(f"  … {done}/{total} configs evaluated")

    results = pd.DataFrame(records).sort_values("test_Sharpe", ascending=False)
    csv_path = os.path.join(DATA_DIR, "factor_optimize_O1.csv")
    results.to_csv(csv_path, index=False)

    print("\n  --- Top 10 by out-of-sample Sharpe (2023-2026) ---")
    shown_cols = ["top_n", "recovery_window", "rec_weight", "rebal_freq",
                  "train_Sharpe", "test_Sharpe", "train_CAGR", "test_CAGR",
                  "test_MaxDD", "test_Calmar"]
    two_decimals = "{:.2f}".format
    percent = "{:.1%}".format
    column_formats = {
        "train_Sharpe": two_decimals,
        "test_Sharpe": two_decimals,
        "train_CAGR": percent,
        "test_CAGR": percent,
        "test_MaxDD": percent,
        "test_Calmar": two_decimals,
    }
    top_ten = results.head(10)[shown_cols]
    print(top_ten.to_string(index=False, formatters=column_formats))
    return results


# ---------------------------------------------------------------------------
# O2 — Regime filter on the top strategies
# ---------------------------------------------------------------------------

def o2_regime(masked):
    """Compare three strategies with and without an SPY moving-average filter.

    Each contender is backtested once unfiltered and once per MA window
    (200, 150, 100) on the full price frame with the benchmark column removed.
    Filtered runs use a strategy rebuilt via ``_fresh_copy``. One row per
    (strategy, filter) pair is written to ``factor_optimize_O2.csv`` under
    ``DATA_DIR`` and printed as a table.

    Args:
        masked: Price frame that includes a ``BENCHMARK`` column.

    Returns:
        DataFrame with columns ``strategy``, ``filter`` and every key of
        ``pit.summarize`` except ``name``.

    Raises:
        KeyError: if *masked* has no ``BENCHMARK`` column.
    """
    print("\n" + "=" * 100)
    print("O2 — SPY > MA regime filter on top strategies (full 10y PIT)")
    print("=" * 100)
    tickers = [c for c in masked.columns if c != BENCHMARK]
    prices = masked[tickers]

    spy_full = masked[BENCHMARK].dropna()

    # The filter depends only on SPY and the MA window, not on the strategy,
    # so build each one once instead of once per (strategy, window) pair.
    # Dates missing from the filter after reindexing are filled with False.
    regime_filters = {
        ma: spy_ma200_filter(spy_full, ma_window=ma).reindex(prices.index).fillna(False)
        for ma in (200, 150, 100)
    }

    contenders = {
        "Recovery+Mom Top10":          RecoveryMomentumStrategy(top_n=10),
        "fc_up_cap_mom_gap_monthly":   FactorComboStrategy("up_cap+mom_gap",
                                                             rebal_freq=21, top_n=10),
        "fc_rec63_mom_gap_monthly":    FactorComboStrategy("rec63+mom_gap",
                                                             rebal_freq=21, top_n=10),
    }

    def _metrics(equity):
        # Summary stats minus the label; "strategy"/"filter" identify the row.
        return {k: v for k, v in pit.summarize(equity, "").items() if k != "name"}

    rows = []
    for name, strat in contenders.items():
        rows.append({"strategy": name, "filter": "none",
                     **_metrics(run(strat, prices))})
        for ma, filt in regime_filters.items():
            # Rebuild the strategy so nothing carries over from earlier runs.
            eq = run(_fresh_copy(strat), prices, regime_filter=filt)
            rows.append({"strategy": name, "filter": f"SPY>MA{ma}",
                         **_metrics(eq)})

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O2.csv"), index=False)

    print(f"  {'strategy':<32s} {'filter':<12s} {'CAGR':>7s} {'Sharpe':>7s} "
          f"{'MaxDD':>7s} {'Calmar':>7s}")
    for _, r in df.iterrows():
        print(f"  {r['strategy']:<32s} {r['filter']:<12s} "
              f"{r['CAGR']*100:>6.1f}% {r['Sharpe']:>7.2f} "
              f"{r['MaxDD']*100:>6.1f}% {r['Calmar']:>7.2f}")
    return df


def _fresh_copy(strat):
    """Build a new strategy object configured like *strat*.

    Used so that any state held by a strategy does not leak between backtests.
    Types are checked in the order RecoveryMomentumStrategy,
    FactorComboStrategy, MomentumQualityStrategy; only the constructor
    arguments listed below are carried over. Any other type is returned as-is
    (the same object), on the assumption that it holds no state.
    """
    if isinstance(strat, RecoveryMomentumStrategy):
        fields = ("recovery_window", "mom_lookback", "mom_skip",
                  "rebal_freq", "top_n")
        return RecoveryMomentumStrategy(**{f: getattr(strat, f) for f in fields})

    if isinstance(strat, FactorComboStrategy):
        signal = strat.signal_name
        settings = {"rebal_freq": strat.rebal_freq, "top_n": strat.top_n}
        return FactorComboStrategy(signal, **settings)

    if isinstance(strat, MomentumQualityStrategy):
        fields = ("momentum_period", "skip", "quality_window", "top_n")
        return MomentumQualityStrategy(**{f: getattr(strat, f) for f in fields})

    # Unknown type: hand back the original instance unchanged.
    return strat


# ---------------------------------------------------------------------------
# O3 — Uncorrelated ensemble
# ---------------------------------------------------------------------------

def o3_ensemble(masked):
    """Build and evaluate an equal-weight ensemble of weakly correlated strategies.

    Steps:
      1. Backtest every candidate on the full price frame (benchmark removed).
      2. Walk the candidates in descending Sharpe order and keep one unless
         its daily-return correlation with an already kept candidate exceeds
         0.85; stop after three.
      3. Backtest the equal-weight ensemble of the kept strategies, with and
         without the SPY>MA filter at its default window.
      4. Print and save a comparison of SPY buy-and-hold, each component, and
         both ensemble variants to ``factor_optimize_O3.csv`` under ``DATA_DIR``.

    Args:
        masked: Price frame that includes a ``BENCHMARK`` column.

    Returns:
        ``(df, picked_names)``: the summary DataFrame and the names of the
        selected components in selection order.

    Raises:
        KeyError: if *masked* has no ``BENCHMARK`` column.
    """
    print("\n" + "=" * 100)
    print("O3 — Greedy uncorrelated ensemble (full 10y PIT)")
    print("=" * 100)
    tickers = [c for c in masked.columns if c != BENCHMARK]
    prices = masked[tickers]
    spy_full = masked[BENCHMARK].dropna()

    # Candidate pool: the production strategies that cleared 0.75 Sharpe in 10y sweep.
    candidates: list[tuple[str, object]] = [
        ("Recovery+Mom Top10",          RecoveryMomentumStrategy(top_n=10)),
        ("fc_up_cap_mom_gap_monthly",   FactorComboStrategy("up_cap+mom_gap", 21, 10)),
        ("fc_rec63_mom_gap_monthly",    FactorComboStrategy("rec63+mom_gap", 21, 10)),
        ("fc_up_cap_quality_mom_monthly", FactorComboStrategy("up_cap+quality_mom", 21, 10)),
        ("fc_rec_mfilt_deep_upvol_monthly", FactorComboStrategy("rec_mfilt+deep_upvol", 21, 10)),
        ("fc_mom7m_rec126_monthly",     FactorComboStrategy("mom7m+rec126", 21, 10)),
        ("Recovery+Mom Top20",          RecoveryMomentumStrategy(top_n=20)),
        ("fc_down_resil_qual_mom_monthly", FactorComboStrategy("down_resil+qual_mom", 21, 10)),
    ]

    equities: dict[str, pd.Series] = {name: run(s, prices) for name, s in candidates}
    # The first pct_change value is NaN; treat it as a zero return.
    returns = pd.DataFrame({n: eq.pct_change().fillna(0) for n, eq in equities.items()})
    sharpes = {n: pit.summarize(eq, n)["Sharpe"] for n, eq in equities.items()}
    order = sorted(candidates, key=lambda t: sharpes[t[0]], reverse=True)

    picked_names: list[str] = []
    picked_strats: list[object] = []
    for name, strat in order:
        # Skip anything too correlated with a component that is already in.
        if any(returns[name].corr(returns[p]) > 0.85 for p in picked_names):
            continue
        picked_names.append(name)
        picked_strats.append(strat)
        if len(picked_strats) >= 3:
            break

    print(f"  Selected {len(picked_strats)} uncorrelated components:")
    for name in picked_names:
        print(f"    - {name} (Sharpe={sharpes[name]:.2f})")

    def _fresh_components():
        # The picked instances were already run in the candidate backtests.
        # Rebuild them for each ensemble run, as o2_regime does, so no state
        # carries over from those runs or between the two ensemble backtests.
        return [(_fresh_copy(strat), 1.0) for strat in picked_strats]

    eq_ens = run(EnsembleStrategy(_fresh_components()), prices)
    filt = spy_ma200_filter(spy_full).reindex(prices.index).fillna(False)
    eq_ens_reg = run(EnsembleStrategy(_fresh_components()), prices, regime_filter=filt)

    # Buy-and-hold benchmark scaled to the same 10,000 starting capital.
    spy_bh = spy_full / spy_full.iloc[0] * 10_000
    rows = [pit.summarize(spy_bh, "SPY buy-and-hold")]
    for name in picked_names:
        rows.append(pit.summarize(equities[name], f"  component: {name}"))
    rows.append(pit.summarize(eq_ens, "ENSEMBLE (equal-weight, no filter)"))
    rows.append(pit.summarize(eq_ens_reg, "ENSEMBLE + SPY>MA200 filter"))
    for r in rows:
        print(pit.fmt_row(r))

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(DATA_DIR, "factor_optimize_O3.csv"), index=False)
    return df, picked_names


# ---------------------------------------------------------------------------
# O4 — FactorCombo up_cap+mom_gap: top_n × rebal sweep
# ---------------------------------------------------------------------------

def o4_factorcombo_sweep(masked):
    """Sweep top_n and rebalance frequency for the "up_cap+mom_gap" FactorCombo.

    Each (top_n, rebal) pair is backtested on the full price frame with the
    benchmark column removed. Results are sorted by Sharpe (descending),
    written to ``factor_optimize_O4.csv`` under ``DATA_DIR``, and printed.

    Returns:
        The sorted results DataFrame with columns top_n, rebal, CAGR, Sharpe,
        MaxDD and Calmar.
    """
    rule = "=" * 100
    print("\n" + rule)
    print("O4 — FactorCombo up_cap+mom_gap: top_n × rebal (full 10y PIT)")
    print(rule)

    universe = masked[[col for col in masked.columns if col != BENCHMARK]]

    # top_n varies slowest, matching a nested loop with top_n outermost.
    grid = [(top_n, rebal)
            for top_n in (5, 8, 10, 15, 20, 30)
            for rebal in (5, 10, 21, 42)]

    records = []
    for top_n, rebal in grid:
        combo = FactorComboStrategy("up_cap+mom_gap", rebal_freq=rebal, top_n=top_n)
        stats = pit.summarize(run(combo, universe), f"top_n={top_n} rebal={rebal}")
        record = {"top_n": top_n, "rebal": rebal}
        for metric in ("CAGR", "Sharpe", "MaxDD", "Calmar"):
            record[metric] = stats[metric]
        records.append(record)

    results = pd.DataFrame(records).sort_values("Sharpe", ascending=False)
    results.to_csv(os.path.join(DATA_DIR, "factor_optimize_O4.csv"), index=False)

    print(f"  {'top_n':<8s}{'rebal':<8s}{'CAGR':>8s}{'Sharpe':>9s}"
          f"{'MaxDD':>9s}{'Calmar':>9s}")
    for _, row in results.iterrows():
        n_held = int(row["top_n"])
        every = int(row["rebal"])
        print(f"  {n_held:<8d}{every:<8d}"
              f"{row['CAGR']*100:>7.1f}%{row['Sharpe']:>9.2f}"
              f"{row['MaxDD']*100:>8.1f}%{row['Calmar']:>9.2f}")
    return results


def main():
    """Load the price data, run experiments O1 to O4 in order, and print a recap.

    The recap shows the first row of the O1 and O4 result tables, which those
    experiments return sorted best-first by Sharpe.
    """
    print("Loading PIT-masked price data…")
    masked = load_masked_prices()
    first_day = masked.index[0].date()
    last_day = masked.index[-1].date()
    print(f"  shape={masked.shape} range={first_day} → {last_day}")

    o1_table = o1_hyperparam_sweep(masked)
    # O2 and O3 print and save their own results; their return values are
    # not needed for the recap.
    o2_regime(masked)
    o3_ensemble(masked)
    o4_table = o4_factorcombo_sweep(masked)

    rule = "=" * 100
    print("\n" + rule)
    print("Summary: best config from each experiment")
    print(rule)

    top_o1 = o1_table.iloc[0]
    o1_line = (
        f"  O1 best OOS Sharpe: top_n={int(top_o1['top_n'])} rec_win={int(top_o1['recovery_window'])} "
        f"rec_w={top_o1['rec_weight']} rebal={int(top_o1['rebal_freq'])} "
        f"→ test Sharpe={top_o1['test_Sharpe']:.2f} test CAGR={top_o1['test_CAGR']*100:.1f}%"
    )
    print(o1_line)

    top_o4 = o4_table.iloc[0]
    o4_line = (
        f"  O4 best overall: top_n={int(top_o4['top_n'])} rebal={int(top_o4['rebal'])} "
        f"Sharpe={top_o4['Sharpe']:.2f} CAGR={top_o4['CAGR']*100:.1f}% "
        f"Calmar={top_o4['Calmar']:.2f}"
    )
    print(o4_line)


# Run the full experiment suite only when this module is executed directly
# (e.g. via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()