280 lines
12 KiB
Python
280 lines
12 KiB
Python
"""
|
||
Professional QR-style factor research on the PIT S&P 500 universe.
|
||
|
||
Stage 1 — Factor diagnostics.
|
||
IC (Spearman, 21d fwd), t-stat, realistic long-short decile backtest
|
||
(monthly rebalance, 10 bps t-cost).
|
||
|
||
Stage 2 — Composite backtest 1/3/5/10y vs champions.
|
||
For 1y window we pre-pend 2y of warmup then score returns on the last 1y
|
||
only, so strategies with 252d+ warmup are actually active in-window.
|
||
|
||
Stage 3 — Config sweep across weight_scheme × top_n × rebal × vol_target.
|
||
|
||
Outputs CSVs to data/alpha_research_*.csv.
|
||
"""
|
||
|
||
from __future__ import annotations

import itertools
import os
import warnings

import numpy as np
import pandas as pd

import research.pit_backtest as pit
from research.alpha_factors import (AlphaFactorStrategy, _rolling_beta_and_residvol,
                                    f_mom_12_1, f_mom_7_1, f_rev_1m, f_w52_high,
                                    f_max5_neg, f_recovery_63, f_trend_strength,
                                    xsec_rank, _rolling_ls_sharpe)
from strategies.factor_combo import FactorComboStrategy
from strategies.recovery_momentum import RecoveryMomentumStrategy
|
||
|
||
# Silence the two warning categories this script ignores.
for _category in (FutureWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_category)
del _category

# Directory the result CSVs are written to.
DATA_DIR = "data"
# Column name of the benchmark series inside the price frame.
BENCHMARK = "SPY"
|
||
|
||
|
||
def load():
    """Load prices, apply the universe filter and re-attach the benchmark.

    The raw frame comes from ``pit.load_pit_prices()`` and is passed through
    ``pit.pit_universe``. When the raw frame has a ``BENCHMARK`` column, that
    column is copied from the raw frame onto the result, so the benchmark
    values are the unfiltered ones.

    Returns:
        The frame returned by ``pit.pit_universe``, with ``BENCHMARK`` set
        from the raw prices when available.
    """
    raw_prices = pit.load_pit_prices()
    universe = pit.pit_universe(raw_prices)
    if BENCHMARK not in raw_prices.columns:
        return universe
    universe[BENCHMARK] = raw_prices[BENCHMARK]
    return universe
|
||
|
||
|
||
def warmup_slice(df: pd.DataFrame, years: int, warmup_days: int = 500) -> tuple[pd.DataFrame, pd.Timestamp]:
    """Cut ``df`` to a measurement window plus a leading warmup stretch.

    Strategies should be run on the returned (longer) frame, while metrics
    should only be computed from ``measurement_start`` onwards.

    Args:
        df: Frame indexed by date, oldest row first.
        years: Length of the measurement window, counted back from the last
            index entry.
        warmup_days: Warmup length; it is multiplied by 1.5 and subtracted as
            calendar days (presumably to turn trading days into calendar days).

    Returns:
        ``(sliced, measurement_start)`` where ``sliced`` holds every row at or
        after the warmup cutoff. The cutoff never precedes the first row.
    """
    last_day = df.index[-1]
    measurement_start = last_day - pd.DateOffset(years=years)

    lookback = pd.Timedelta(days=warmup_days * 1.5)
    earliest_wanted = measurement_start - lookback
    earliest_available = df.index[0]
    # Clamp to the start of the data when the warmup reaches past it.
    if earliest_wanted > earliest_available:
        cutoff = earliest_wanted
    else:
        cutoff = earliest_available

    return df.loc[df.index >= cutoff], measurement_start
|
||
|
||
|
||
def measure(eq: pd.Series, start: pd.Timestamp, name: str = "") -> dict:
    """Summarise an equity curve over the window beginning at ``start``.

    Rows before ``start`` are dropped and the remainder is rescaled so that
    its first value is 10_000 before being handed to ``pit.summarize``.

    Args:
        eq: Equity curve indexed by date.
        start: First date to include in the measurement.
        name: Label forwarded to ``pit.summarize``.

    Returns:
        Whatever ``pit.summarize`` returns for the re-based curve.
    """
    in_window = eq.loc[eq.index >= start]
    # Re-base so the measured curve starts at 10_000.
    rebased = in_window / in_window.iloc[0] * 10_000
    return pit.summarize(rebased, name=name)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Stage 1 — Factor diagnostics
# ---------------------------------------------------------------------------
|
||
|
||
def factor_diagnostics(masked: pd.DataFrame):
    """Stage 1: per-factor IC and decile-portfolio diagnostics.

    For every factor in the local registry this computes

    * the mean row-wise correlation between the factor's cross-sectional rank
      (``xsec_rank``) and the cross-sectional percentile rank of the 21-row
      forward return, plus a naive t-stat (mean / standard error), and
    * a long-short and a long-only top-decile backtest via
      ``realistic_decile_spread`` and ``realistic_top_decile``.

    The table is printed, written to ``alpha_research_factors.csv`` under
    ``DATA_DIR`` and returned.

    Args:
        masked: Price frame with one column per ticker plus a ``BENCHMARK``
            column.

    Returns:
        DataFrame with one row per factor, sorted by ``LO_Sharpe`` descending.

    Raises:
        KeyError: if ``masked`` has no ``BENCHMARK`` column.
    """
    # Make sure the output directory exists before the long computation so the
    # final to_csv cannot fail on a missing folder after all the work is done.
    os.makedirs(DATA_DIR, exist_ok=True)

    print("\n" + "=" * 110)
    print("Stage 1 — Factor diagnostics (full 10y PIT, monthly rebal, 10bps t-cost)")
    print("=" * 110)
    tickers = [c for c in masked.columns if c != BENCHMARK]
    prices = masked[tickers]
    mkt_ret = masked[BENCHMARK].pct_change(fill_method=None)
    betas, resid_vol = _rolling_beta_and_residvol(prices, mkt_ret, window=60)

    from research.alpha_factors import f_mom_residual
    # Builders are lazy so only one factor frame is materialised at a time.
    factor_builders = {
        "mom_12_1": lambda: f_mom_12_1(prices),
        "mom_7_1": lambda: f_mom_7_1(prices),
        "mom_residual": lambda: f_mom_residual(prices, mkt_ret, betas=betas),
        "rev_1m": lambda: f_rev_1m(prices),
        "w52_high": lambda: f_w52_high(prices),
        "max5_neg": lambda: f_max5_neg(prices),
        "recovery_63": lambda: f_recovery_63(prices),
        "trend_strength": lambda: f_trend_strength(prices),
        # Sign-flipped so that a higher score means lower residual vol / beta.
        "idio_vol_neg": lambda: -resid_vol,
        "low_beta": lambda: -betas,
    }
    # Return from each row to the row 21 positions later, ranked per row.
    fwd_21 = prices.shift(-21) / prices - 1
    fwd_rank = fwd_21.rank(axis=1, pct=True, na_option="keep")

    rows = []
    for name, build in factor_builders.items():
        fac = build()
        fr = xsec_rank(fac)
        ic_daily = fr.corrwith(fwd_rank, axis=1).dropna()
        ic_mean = ic_daily.mean()
        # NOTE(review): the 21-row forward windows overlap from one row to the
        # next, so this t-stat treats correlated observations as independent.
        ic_t = ic_mean / (ic_daily.std() / np.sqrt(len(ic_daily))) if len(ic_daily) > 1 else 0.0
        ls = realistic_decile_spread(fr, prices, rebal=21, tcost=0.001)
        long_only = realistic_top_decile(fr, prices, rebal=21, tcost=0.001)
        rows.append({
            "factor": name,
            "IC_mean": ic_mean, "IC_t": ic_t,
            "LS_CAGR": ls["CAGR"], "LS_Sharpe": ls["Sharpe"],
            "LO_CAGR": long_only["CAGR"], "LO_Sharpe": long_only["Sharpe"],
            "LO_MaxDD": long_only["MaxDD"],
        })

    df = pd.DataFrame(rows).sort_values("LO_Sharpe", ascending=False)
    df.to_csv(os.path.join(DATA_DIR, "alpha_research_factors.csv"), index=False)
    print(df.to_string(index=False, formatters={
        "IC_mean": "{:+.4f}".format, "IC_t": "{:+.2f}".format,
        "LS_CAGR": "{:+.1%}".format, "LS_Sharpe": "{:+.2f}".format,
        "LO_CAGR": "{:+.1%}".format, "LO_Sharpe": "{:+.2f}".format,
        "LO_MaxDD": "{:.1%}".format,
    }))
    return df
|
||
|
||
|
||
def realistic_decile_spread(factor_rank, prices, rebal=21, tcost=0.001):
    """Backtest a top-decile long versus bottom-decile short spread.

    Names with ``factor_rank >= 0.9`` form the long leg and names with
    ``factor_rank <= 0.1`` the short leg, each equally weighted. Weights are
    refreshed every ``rebal`` rows and held in between, applied to the next
    row's return, and charged ``tcost`` per unit of absolute weight change.
    The first 252 rows are dropped before the equity curve is built.

    Args:
        factor_rank: Frame of cross-sectional ranks (assumed to be percentile
            ranks in [0, 1] — confirm against ``xsec_rank``).
        prices: Price frame aligned with ``factor_rank``.
        rebal: Number of rows between weight refreshes.
        tcost: Cost per unit of turnover (default 0.001).

    Returns:
        The result of ``pit.summarize`` for the spread's equity curve.
    """
    is_rebal_day = pd.Series(False, index=factor_rank.index)
    is_rebal_day.iloc[::rebal] = True
    hold_days = ~is_rebal_day

    def held_weights(members):
        # Equal weight across members; rows with no members become NaN.
        count = members.sum(axis=1).replace(0, np.nan)
        weights = members.astype(float).div(count, axis=0)
        # Blank out non-rebalance rows, then carry the last weights forward.
        weights[hold_days] = np.nan
        return weights.ffill().fillna(0.0)

    w_long = held_weights(factor_rank >= 0.9)
    w_short = held_weights(factor_rank <= 0.1)

    daily = prices.pct_change(fill_method=None)
    long_leg = (w_long.shift(1) * daily).sum(axis=1)
    short_leg = (w_short.shift(1) * daily).sum(axis=1)
    long_turnover = w_long.diff().abs().sum(axis=1).fillna(0.0)
    short_turnover = w_short.diff().abs().sum(axis=1).fillna(0.0)

    net = (long_leg - short_leg) - (long_turnover + short_turnover) * tcost
    net = net.fillna(0.0).iloc[252:]
    equity = (1 + net).cumprod() * 10_000
    return pit.summarize(equity, name="ls")
|
||
|
||
|
||
def realistic_top_decile(factor_rank, prices, rebal=21, tcost=0.001):
    """Backtest an equal-weight, long-only top-decile portfolio.

    Names with ``factor_rank >= 0.9`` are held at equal weight. Weights are
    refreshed every ``rebal`` rows and held in between, applied to the next
    row's return, and charged ``tcost`` per unit of absolute weight change.
    The first 252 rows are dropped before the equity curve is built.

    Args:
        factor_rank: Frame of cross-sectional ranks (assumed to be percentile
            ranks in [0, 1] — confirm against ``xsec_rank``).
        prices: Price frame aligned with ``factor_rank``.
        rebal: Number of rows between weight refreshes.
        tcost: Cost per unit of turnover (default 0.001).

    Returns:
        The result of ``pit.summarize`` for the portfolio's equity curve.
    """
    selected = factor_rank >= 0.9
    # Rows with no selected names divide by NaN rather than zero.
    names_held = selected.sum(axis=1).replace(0, np.nan)
    weights = selected.astype(float).div(names_held, axis=0)

    on_schedule = pd.Series(False, index=factor_rank.index)
    on_schedule.iloc[::rebal] = True
    # Blank out non-rebalance rows, then carry the last weights forward.
    weights[~on_schedule] = np.nan
    weights = weights.ffill().fillna(0.0)

    daily = prices.pct_change(fill_method=None)
    gross = (weights.shift(1) * daily).sum(axis=1)
    costs = weights.diff().abs().sum(axis=1).fillna(0.0) * tcost
    net = (gross - costs).fillna(0.0).iloc[252:]
    equity = (1 + net).cumprod() * 10_000
    return pit.summarize(equity, name="lo")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Stage 2 — Composite backtest
# ---------------------------------------------------------------------------
|
||
|
||
def composite_backtest(masked: pd.DataFrame):
    """Stage 2: backtest the composite strategies over 10/5/3/1-year windows.

    For each window the price frame is cut with ``warmup_slice`` so strategies
    see extra history, each configured strategy is run through
    ``pit.backtest``, and metrics are computed by ``measure`` from the
    measurement start only. A ``BENCHMARK`` buy-and-hold row is added per
    window. Results are printed per window, written to
    ``alpha_research_composite.csv`` under ``DATA_DIR`` and returned.

    Args:
        masked: Price frame with one column per ticker plus a ``BENCHMARK``
            column.

    Returns:
        DataFrame with one row per (window, strategy).

    Raises:
        KeyError: if ``masked`` has no ``BENCHMARK`` column.
    """
    # Make sure the output directory exists before the long computation so the
    # final to_csv cannot fail on a missing folder after all the work is done.
    os.makedirs(DATA_DIR, exist_ok=True)

    print("\n" + "=" * 110)
    print("Stage 2 — IC / LS-Sharpe-weighted composite vs champions (1/3/5/10y)")
    print("=" * 110)
    tickers = [c for c in masked.columns if c != BENCHMARK]
    mkt_ret_full = masked[BENCHMARK].pct_change(fill_method=None)

    # Factories rather than instances: a fresh strategy object is built for
    # every window.
    configs = {
        "Alpha(LS-Sharpe, tn=15, rebal=10)":
            lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=10,
                                        vol_target_annual=None, weight_scheme="ls_sharpe"),
        "Alpha(LS-Sharpe, tn=15, rebal=21)":
            lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=21,
                                        vol_target_annual=None, weight_scheme="ls_sharpe"),
        "Alpha(LS-Sharpe+VT18, tn=15, rebal=21)":
            lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=21,
                                        vol_target_annual=0.18, weight_scheme="ls_sharpe"),
        "Alpha(IC, tn=15, rebal=21)":
            lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=21,
                                        vol_target_annual=None, weight_scheme="ic"),
        "Recovery+Mom Top10": lambda: RecoveryMomentumStrategy(top_n=10),
        "fc_up_cap+mom_gap": lambda: FactorComboStrategy("up_cap+mom_gap",
                                                         rebal_freq=21, top_n=10),
    }

    all_rows = []
    for years in (10, 5, 3, 1):
        sliced, measurement_start = warmup_slice(masked, years, warmup_days=500)
        prices = sliced[tickers]
        print(f"\n --- Window: last {years}y "
              f"(measure {measurement_start.date()} → {sliced.index[-1].date()}, "
              f"warmup from {sliced.index[0].date()}) ---")
        # Benchmark buy-and-hold curve, measured over the same window.
        spy = sliced[BENCHMARK].dropna()
        spy_eq = (spy / spy.iloc[0]) * 10_000
        rows = [{"years": years, "strategy": "SPY buy-and-hold",
                 **{k: v for k, v in measure(spy_eq, measurement_start, "").items()
                    if k != "name"}}]
        for name, factory in configs.items():
            strat = factory()
            eq = pit.backtest(strategy=strat, prices=prices,
                              initial_capital=10_000, transaction_cost=0.001)
            m = measure(eq, measurement_start, "")
            rows.append({"years": years, "strategy": name,
                         **{k: v for k, v in m.items() if k != "name"}})
        for r in rows:
            print(f" {r['strategy']:<42s} "
                  f"CAGR={r['CAGR']*100:>6.1f}% "
                  f"Sharpe={r['Sharpe']:>5.2f} "
                  f"Sortino={r['Sortino']:>5.2f} "
                  f"MaxDD={r['MaxDD']*100:>6.1f}% "
                  f"Calmar={r['Calmar']:>5.2f}")
        all_rows.extend(rows)

    df = pd.DataFrame(all_rows)
    df.to_csv(os.path.join(DATA_DIR, "alpha_research_composite.csv"), index=False)
    return df
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Stage 3 — Config sweep
# ---------------------------------------------------------------------------
|
||
|
||
def config_sweep(masked: pd.DataFrame):
    """Stage 3: grid-search ``AlphaFactorStrategy`` settings on the full frame.

    Every combination of weight scheme, ``top_n``, rebalance frequency and
    volatility target is backtested with ``pit.backtest`` and summarised with
    ``pit.summarize`` over the whole equity curve (no warmup trimming here).
    The table is written to ``alpha_research_sweep.csv`` under ``DATA_DIR``,
    its top 15 rows are printed, and the full table is returned.

    Args:
        masked: Price frame with one column per ticker plus a ``BENCHMARK``
            column.

    Returns:
        DataFrame with one row per configuration, sorted by ``Sharpe``
        descending.

    Raises:
        KeyError: if ``masked`` has no ``BENCHMARK`` column.
    """
    # Make sure the output directory exists before the long computation so the
    # final to_csv cannot fail on a missing folder after all the work is done.
    os.makedirs(DATA_DIR, exist_ok=True)

    print("\n" + "=" * 110)
    print("Stage 3 — AlphaFactor config sweep (10y)")
    print("=" * 110)
    tickers = [c for c in masked.columns if c != BENCHMARK]
    prices = masked[tickers]
    mkt_ret = masked[BENCHMARK].pct_change(fill_method=None)

    # product() varies the right-most axis fastest, matching the order of the
    # equivalent nested loops (scheme outermost, vol target innermost).
    grid = itertools.product(
        ("ls_sharpe", "ic", "equal"),
        (10, 15, 20),
        (10, 21),
        (None, 0.18),
    )
    rows = []
    for scheme, top_n, rebal, vt in grid:
        strat = AlphaFactorStrategy(mkt_ret, top_n=top_n, rebal_freq=rebal,
                                    vol_target_annual=vt,
                                    weight_scheme=scheme)
        eq = pit.backtest(strat, prices, initial_capital=10_000,
                          transaction_cost=0.001)
        s = pit.summarize(eq, "")
        rows.append({"scheme": scheme, "top_n": top_n, "rebal": rebal,
                     # "none" keeps the column readable in the CSV output.
                     "vt": vt if vt is not None else "none",
                     "CAGR": s["CAGR"], "Sharpe": s["Sharpe"],
                     "MaxDD": s["MaxDD"], "Calmar": s["Calmar"]})

    df = pd.DataFrame(rows).sort_values("Sharpe", ascending=False)
    df.to_csv(os.path.join(DATA_DIR, "alpha_research_sweep.csv"), index=False)
    print(df.head(15).to_string(index=False, formatters={
        "CAGR": "{:.1%}".format, "Sharpe": "{:.2f}".format,
        "MaxDD": "{:.1%}".format, "Calmar": "{:.2f}".format,
    }))
    return df
|
||
|
||
|
||
def main():
    """Run all three research stages and print the five best sweep configs.

    Loads the price frame with ``load``, then runs ``factor_diagnostics``,
    ``composite_backtest`` and ``config_sweep`` in that order.
    """
    print("Loading PIT data…")
    masked = load()
    first_day = masked.index[0].date()
    last_day = masked.index[-1].date()
    print(f" shape={masked.shape} range={first_day} → {last_day}")

    factor_diagnostics(masked)
    composite_backtest(masked)
    sweep = config_sweep(masked)

    rule = "=" * 110
    as_pct = "{:.1%}".format
    as_ratio = "{:.2f}".format
    column_formats = {
        "CAGR": as_pct, "Sharpe": as_ratio,
        "MaxDD": as_pct, "Calmar": as_ratio,
    }
    print("\n" + rule)
    print("Top 5 configs:")
    print(rule)
    best = sweep.head(5)
    print(best.to_string(index=False, formatters=column_formats))


# Run the pipeline only when executed as a script.
if __name__ == "__main__":
    main()
|