""" Professional QR-style factor research on the PIT S&P 500 universe. Stage 1 — Factor diagnostics. IC (Spearman, 21d fwd), t-stat, realistic long-short decile backtest (monthly rebalance, 10 bps t-cost). Stage 2 — Composite backtest 1/3/5/10y vs champions. For 1y window we pre-pend 2y of warmup then score returns on the last 1y only, so strategies with 252d+ warmup are actually active in-window. Stage 3 — Config sweep across weight_scheme × top_n × rebal × vol_target. Outputs CSVs to data/alpha_research_*.csv. """ from __future__ import annotations import os import warnings import numpy as np import pandas as pd import research.pit_backtest as pit from research.alpha_factors import (AlphaFactorStrategy, _rolling_beta_and_residvol, f_mom_12_1, f_mom_7_1, f_rev_1m, f_w52_high, f_max5_neg, f_recovery_63, f_trend_strength, xsec_rank, _rolling_ls_sharpe) from strategies.factor_combo import FactorComboStrategy from strategies.recovery_momentum import RecoveryMomentumStrategy warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=RuntimeWarning) DATA_DIR = "data" BENCHMARK = "SPY" def load(): raw = pit.load_pit_prices() masked = pit.pit_universe(raw) if BENCHMARK in raw.columns: masked[BENCHMARK] = raw[BENCHMARK] return masked def warmup_slice(df: pd.DataFrame, years: int, warmup_days: int = 500) -> tuple[pd.DataFrame, pd.Timestamp]: """Return (prices_with_warmup, measurement_start). Strategies are fed the longer series, but metrics must be computed only from measurement_start.""" measurement_start = df.index[-1] - pd.DateOffset(years=years) first_day = df.index[0] # Keep all rows between measurement_start - warmup_days and end. cutoff = max(first_day, measurement_start - pd.Timedelta(days=warmup_days * 1.5)) sliced = df[df.index >= cutoff] return sliced, measurement_start def measure(eq: pd.Series, start: pd.Timestamp, name: str = "") -> dict: eq = eq[eq.index >= start] # Re-base to 10_000 at start eq = eq / eq.iloc[0] * 10_000 return pit.summarize(eq, name=name) # --------------------------------------------------------------------------- # Stage 1 — Factor diagnostics # --------------------------------------------------------------------------- def factor_diagnostics(masked: pd.DataFrame): print("\n" + "=" * 110) print("Stage 1 — Factor diagnostics (full 10y PIT, monthly rebal, 10bps t-cost)") print("=" * 110) tickers = [c for c in masked.columns if c != BENCHMARK] prices = masked[tickers] mkt_ret = masked[BENCHMARK].pct_change(fill_method=None) betas, resid_vol = _rolling_beta_and_residvol(prices, mkt_ret, window=60) from research.alpha_factors import f_mom_residual factor_builders = { "mom_12_1": lambda: f_mom_12_1(prices), "mom_7_1": lambda: f_mom_7_1(prices), "mom_residual": lambda: f_mom_residual(prices, mkt_ret, betas=betas), "rev_1m": lambda: f_rev_1m(prices), "w52_high": lambda: f_w52_high(prices), "max5_neg": lambda: f_max5_neg(prices), "recovery_63": lambda: f_recovery_63(prices), "trend_strength": lambda: f_trend_strength(prices), "idio_vol_neg": lambda: -resid_vol, "low_beta": lambda: -betas, } fwd_21 = prices.shift(-21) / prices - 1 fwd_rank = fwd_21.rank(axis=1, pct=True, na_option="keep") rows = [] for name, build in factor_builders.items(): fac = build() fr = xsec_rank(fac) ic_daily = fr.corrwith(fwd_rank, axis=1).dropna() ic_mean = ic_daily.mean() ic_t = ic_mean / (ic_daily.std() / np.sqrt(len(ic_daily))) if len(ic_daily) > 1 else 0.0 ls = realistic_decile_spread(fr, prices, rebal=21, tcost=0.001) long_only = realistic_top_decile(fr, prices, rebal=21, tcost=0.001) rows.append({ "factor": name, "IC_mean": ic_mean, "IC_t": ic_t, "LS_CAGR": ls["CAGR"], "LS_Sharpe": ls["Sharpe"], "LO_CAGR": long_only["CAGR"], "LO_Sharpe": long_only["Sharpe"], "LO_MaxDD": long_only["MaxDD"], }) df = pd.DataFrame(rows).sort_values("LO_Sharpe", ascending=False) df.to_csv(os.path.join(DATA_DIR, "alpha_research_factors.csv"), index=False) print(df.to_string(index=False, formatters={ "IC_mean": "{:+.4f}".format, "IC_t": "{:+.2f}".format, "LS_CAGR": "{:+.1%}".format, "LS_Sharpe": "{:+.2f}".format, "LO_CAGR": "{:+.1%}".format, "LO_Sharpe": "{:+.2f}".format, "LO_MaxDD": "{:.1%}".format, })) return df def realistic_decile_spread(factor_rank, prices, rebal=21, tcost=0.001): """Long top-decile minus short bottom-decile, monthly rebal, 10bps t-cost.""" long_mask = factor_rank >= 0.9 short_mask = factor_rank <= 0.1 long_w = long_mask.astype(float).div(long_mask.sum(axis=1).replace(0, np.nan), axis=0) short_w = short_mask.astype(float).div(short_mask.sum(axis=1).replace(0, np.nan), axis=0) rebal_mask = pd.Series(False, index=factor_rank.index) rebal_mask.iloc[::rebal] = True long_w[~rebal_mask] = np.nan short_w[~rebal_mask] = np.nan long_w = long_w.ffill().fillna(0.0) short_w = short_w.ffill().fillna(0.0) rets = prices.pct_change(fill_method=None) ls = ((long_w.shift(1) * rets).sum(axis=1) - (short_w.shift(1) * rets).sum(axis=1)) \ - (long_w.diff().abs().sum(axis=1).fillna(0.0) + short_w.diff().abs().sum(axis=1).fillna(0.0)) * tcost ls = ls.fillna(0.0).iloc[252:] eq = (1 + ls).cumprod() * 10_000 return pit.summarize(eq, name="ls") def realistic_top_decile(factor_rank, prices, rebal=21, tcost=0.001): """Long-only top-decile equal-weight portfolio with t-cost.""" long_mask = factor_rank >= 0.9 long_w = long_mask.astype(float).div(long_mask.sum(axis=1).replace(0, np.nan), axis=0) rebal_mask = pd.Series(False, index=factor_rank.index) rebal_mask.iloc[::rebal] = True long_w[~rebal_mask] = np.nan long_w = long_w.ffill().fillna(0.0) rets = prices.pct_change(fill_method=None) port_ret = (long_w.shift(1) * rets).sum(axis=1) \ - long_w.diff().abs().sum(axis=1).fillna(0.0) * tcost port_ret = port_ret.fillna(0.0).iloc[252:] eq = (1 + port_ret).cumprod() * 10_000 return pit.summarize(eq, name="lo") # --------------------------------------------------------------------------- # Stage 2 — Composite backtest # --------------------------------------------------------------------------- def composite_backtest(masked: pd.DataFrame): print("\n" + "=" * 110) print("Stage 2 — IC / LS-Sharpe-weighted composite vs champions (1/3/5/10y)") print("=" * 110) tickers = [c for c in masked.columns if c != BENCHMARK] mkt_ret_full = masked[BENCHMARK].pct_change(fill_method=None) configs = { "Alpha(LS-Sharpe, tn=15, rebal=10)": lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=10, vol_target_annual=None, weight_scheme="ls_sharpe"), "Alpha(LS-Sharpe, tn=15, rebal=21)": lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=21, vol_target_annual=None, weight_scheme="ls_sharpe"), "Alpha(LS-Sharpe+VT18, tn=15, rebal=21)": lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=21, vol_target_annual=0.18, weight_scheme="ls_sharpe"), "Alpha(IC, tn=15, rebal=21)": lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=21, vol_target_annual=None, weight_scheme="ic"), "Recovery+Mom Top10": lambda: RecoveryMomentumStrategy(top_n=10), "fc_up_cap+mom_gap": lambda: FactorComboStrategy("up_cap+mom_gap", rebal_freq=21, top_n=10), } all_rows = [] for years in (10, 5, 3, 1): sliced, measurement_start = warmup_slice(masked, years, warmup_days=500) prices = sliced[tickers] print(f"\n --- Window: last {years}y " f"(measure {measurement_start.date()} → {sliced.index[-1].date()}, " f"warmup from {sliced.index[0].date()}) ---") spy = sliced[BENCHMARK].dropna() spy_eq = (spy / spy.iloc[0]) * 10_000 rows = [{"years": years, "strategy": "SPY buy-and-hold", **{k: v for k, v in measure(spy_eq, measurement_start, "").items() if k != "name"}}] for name, factory in configs.items(): strat = factory() eq = pit.backtest(strategy=strat, prices=prices, initial_capital=10_000, transaction_cost=0.001) m = measure(eq, measurement_start, "") rows.append({"years": years, "strategy": name, **{k: v for k, v in m.items() if k != "name"}}) for r in rows: print(f" {r['strategy']:<42s} " f"CAGR={r['CAGR']*100:>6.1f}% " f"Sharpe={r['Sharpe']:>5.2f} " f"Sortino={r['Sortino']:>5.2f} " f"MaxDD={r['MaxDD']*100:>6.1f}% " f"Calmar={r['Calmar']:>5.2f}") all_rows.extend(rows) df = pd.DataFrame(all_rows) df.to_csv(os.path.join(DATA_DIR, "alpha_research_composite.csv"), index=False) return df # --------------------------------------------------------------------------- # Stage 3 — Config sweep # --------------------------------------------------------------------------- def config_sweep(masked: pd.DataFrame): print("\n" + "=" * 110) print("Stage 3 — AlphaFactor config sweep (10y)") print("=" * 110) tickers = [c for c in masked.columns if c != BENCHMARK] prices = masked[tickers] mkt_ret = masked[BENCHMARK].pct_change(fill_method=None) rows = [] for scheme in ("ls_sharpe", "ic", "equal"): for top_n in (10, 15, 20): for rebal in (10, 21): for vt in (None, 0.18): strat = AlphaFactorStrategy(mkt_ret, top_n=top_n, rebal_freq=rebal, vol_target_annual=vt, weight_scheme=scheme) eq = pit.backtest(strat, prices, initial_capital=10_000, transaction_cost=0.001) s = pit.summarize(eq, "") rows.append({"scheme": scheme, "top_n": top_n, "rebal": rebal, "vt": vt if vt is not None else "none", "CAGR": s["CAGR"], "Sharpe": s["Sharpe"], "MaxDD": s["MaxDD"], "Calmar": s["Calmar"]}) df = pd.DataFrame(rows).sort_values("Sharpe", ascending=False) df.to_csv(os.path.join(DATA_DIR, "alpha_research_sweep.csv"), index=False) print(df.head(15).to_string(index=False, formatters={ "CAGR": "{:.1%}".format, "Sharpe": "{:.2f}".format, "MaxDD": "{:.1%}".format, "Calmar": "{:.2f}".format, })) return df def main(): print("Loading PIT data…") masked = load() print(f" shape={masked.shape} range={masked.index[0].date()} → {masked.index[-1].date()}") factor_diagnostics(masked) composite_backtest(masked) sweep = config_sweep(masked) print("\n" + "=" * 110) print("Top 5 configs:") print("=" * 110) print(sweep.head(5).to_string(index=False, formatters={ "CAGR": "{:.1%}".format, "Sharpe": "{:.2f}".format, "MaxDD": "{:.1%}".format, "Calmar": "{:.2f}".format, })) if __name__ == "__main__": main()