Files
quant/research/alpha_research.py

280 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Professional QR-style factor research on the PIT S&P 500 universe.
Stage 1 — Factor diagnostics.
IC (Spearman, 21d fwd), t-stat, realistic long-short decile backtest
(monthly rebalance, 10 bps t-cost).
Stage 2 — Composite backtest 1/3/5/10y vs champions.
For the 1y window we prepend 2y of warmup and then score returns on the last
1y only, so strategies with 252d+ warmup are actually active in-window.
Stage 3 — Config sweep across weight_scheme × top_n × rebal × vol_target.
Outputs CSVs to data/alpha_research_*.csv.
"""
from __future__ import annotations
import os
import warnings
import numpy as np
import pandas as pd
import research.pit_backtest as pit
from research.alpha_factors import (AlphaFactorStrategy, _rolling_beta_and_residvol,
f_mom_12_1, f_mom_7_1, f_rev_1m, f_w52_high,
f_max5_neg, f_recovery_63, f_trend_strength,
xsec_rank, _rolling_ls_sharpe)
from strategies.factor_combo import FactorComboStrategy
from strategies.recovery_momentum import RecoveryMomentumStrategy
# Ignore every FutureWarning and RuntimeWarning raised after this point.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
# Directory that the stage CSV outputs are written into.
DATA_DIR = "data"
# Column name of the benchmark series inside the price frame.
BENCHMARK = "SPY"
def load():
    """Load PIT prices, apply the universe mask, and re-attach the benchmark.

    The raw panel comes from ``pit.load_pit_prices()`` and is passed through
    ``pit.pit_universe``. When the raw panel has a ``BENCHMARK`` column, that
    column is copied from the raw panel into the masked frame before returning.
    """
    raw_prices = pit.load_pit_prices()
    universe = pit.pit_universe(raw_prices)
    has_benchmark = BENCHMARK in raw_prices.columns
    if has_benchmark:
        # Take the benchmark from the raw panel, not from the masked one.
        universe[BENCHMARK] = raw_prices[BENCHMARK]
    return universe
def warmup_slice(df: pd.DataFrame, years: int, warmup_days: int = 500) -> tuple[pd.DataFrame, pd.Timestamp]:
    """Trim ``df`` to a measurement window plus a leading warmup period.

    Args:
        df: Frame indexed by timestamps in ascending order.
        years: Length of the measurement window, counted back from the last row.
        warmup_days: Warmup length; it is multiplied by 1.5 and subtracted as
            calendar days from the measurement start.

    Returns:
        ``(prices_with_warmup, measurement_start)``. Strategies should be fed
        the longer frame, while metrics should only be computed from
        ``measurement_start`` onward.
    """
    last_day = df.index[-1]
    measurement_start = last_day - pd.DateOffset(years=years)

    # Never reach back further than the data actually goes.
    warmup_span = pd.Timedelta(days=warmup_days * 1.5)
    cutoff = max(df.index[0], measurement_start - warmup_span)

    keep = df.index >= cutoff
    return df[keep], measurement_start
def measure(eq: pd.Series, start: pd.Timestamp, name: str = "") -> dict:
    """Summarize an equity curve over the window beginning at ``start``.

    Observations dated before ``start`` are dropped, the remainder is rescaled
    so that its first value equals 10,000, and the rescaled curve is handed to
    ``pit.summarize`` together with ``name``.
    """
    in_window = eq.loc[eq.index >= start]
    rebased = in_window / in_window.iloc[0] * 10_000
    return pit.summarize(rebased, name=name)
# ---------------------------------------------------------------------------
# Stage 1 — Factor diagnostics
# ---------------------------------------------------------------------------
def factor_diagnostics(masked: pd.DataFrame):
    """Run Stage 1: per-factor IC and decile-portfolio diagnostics.

    For every factor this computes the mean of the daily row-wise correlation
    between the factor's cross-sectional rank and the percentile rank of the
    21-row forward return, a t-statistic for that mean, and the summary
    metrics returned by ``realistic_decile_spread`` and
    ``realistic_top_decile``. The resulting table is sorted by long-only
    Sharpe, written to ``DATA_DIR/alpha_research_factors.csv``, printed, and
    returned.

    Args:
        masked: Price frame that contains a ``BENCHMARK`` column; every other
            column is treated as a ticker.

    Returns:
        DataFrame with one row per factor and the columns ``factor``,
        ``IC_mean``, ``IC_t``, ``LS_CAGR``, ``LS_Sharpe``, ``LO_CAGR``,
        ``LO_Sharpe`` and ``LO_MaxDD``.
    """
    from research.alpha_factors import f_mom_residual

    print("\n" + "=" * 110)
    print("Stage 1 — Factor diagnostics (full 10y PIT, monthly rebal, 10bps t-cost)")
    print("=" * 110)

    tickers = [c for c in masked.columns if c != BENCHMARK]
    prices = masked[tickers]
    mkt_ret = masked[BENCHMARK].pct_change(fill_method=None)
    betas, resid_vol = _rolling_beta_and_residvol(prices, mkt_ret, window=60)

    # Builders are thunks so that only one factor frame is materialized at a time.
    factor_builders = {
        "mom_12_1": lambda: f_mom_12_1(prices),
        "mom_7_1": lambda: f_mom_7_1(prices),
        "mom_residual": lambda: f_mom_residual(prices, mkt_ret, betas=betas),
        "rev_1m": lambda: f_rev_1m(prices),
        "w52_high": lambda: f_w52_high(prices),
        "max5_neg": lambda: f_max5_neg(prices),
        "recovery_63": lambda: f_recovery_63(prices),
        "trend_strength": lambda: f_trend_strength(prices),
        # Sign-flipped so that a higher score means lower residual vol / beta.
        "idio_vol_neg": lambda: -resid_vol,
        "low_beta": lambda: -betas,
    }

    # Forward return over the next 21 rows, ranked within each row.
    fwd_21 = prices.shift(-21) / prices - 1
    fwd_rank = fwd_21.rank(axis=1, pct=True, na_option="keep")

    rows = []
    for name, build in factor_builders.items():
        fac = build()
        fr = xsec_rank(fac)
        # Row-wise correlation of ranks, i.e. a rank IC per date.
        ic_daily = fr.corrwith(fwd_rank, axis=1).dropna()
        ic_mean = ic_daily.mean()
        # NOTE(review): daily ICs built on overlapping 21-row forward returns
        # are likely autocorrelated, so this plain t-stat is probably
        # overstated — confirm before relying on it for significance.
        ic_t = ic_mean / (ic_daily.std() / np.sqrt(len(ic_daily))) if len(ic_daily) > 1 else 0.0
        ls = realistic_decile_spread(fr, prices, rebal=21, tcost=0.001)
        long_only = realistic_top_decile(fr, prices, rebal=21, tcost=0.001)
        rows.append({
            "factor": name,
            "IC_mean": ic_mean, "IC_t": ic_t,
            "LS_CAGR": ls["CAGR"], "LS_Sharpe": ls["Sharpe"],
            "LO_CAGR": long_only["CAGR"], "LO_Sharpe": long_only["Sharpe"],
            "LO_MaxDD": long_only["MaxDD"],
        })

    df = pd.DataFrame(rows).sort_values("LO_Sharpe", ascending=False)
    # Create the output directory up front so a missing folder cannot discard
    # the results of the computation above.
    os.makedirs(DATA_DIR, exist_ok=True)
    df.to_csv(os.path.join(DATA_DIR, "alpha_research_factors.csv"), index=False)
    print(df.to_string(index=False, formatters={
        "IC_mean": "{:+.4f}".format, "IC_t": "{:+.2f}".format,
        "LS_CAGR": "{:+.1%}".format, "LS_Sharpe": "{:+.2f}".format,
        "LO_CAGR": "{:+.1%}".format, "LO_Sharpe": "{:+.2f}".format,
        "LO_MaxDD": "{:.1%}".format,
    }))
    return df
def realistic_decile_spread(factor_rank, prices, rebal=21, tcost=0.001):
    """Summarize a top-decile-minus-bottom-decile portfolio net of trading costs.

    Names ranked at or above 0.9 form the long leg and names ranked at or
    below 0.1 form the short leg, equal-weighted within each leg. Weights are
    refreshed every ``rebal`` rows and carried forward in between; every unit
    of absolute weight change is charged ``tcost``. The first 252 rows of the
    return series are dropped before compounding from 10,000, and the equity
    curve is passed to ``pit.summarize``.
    """
    on_rebal = pd.Series(False, index=factor_rank.index)
    on_rebal.iloc[::rebal] = True
    off_rebal = ~on_rebal

    def held_weights(selected):
        # Equal weight across the selected names; rows with nothing selected
        # divide by NaN and therefore stay NaN.
        counts = selected.sum(axis=1).replace(0, np.nan)
        weights = selected.astype(float).div(counts, axis=0)
        # Blank the in-between rows so ffill carries the last rebalance forward.
        weights[off_rebal] = np.nan
        return weights.ffill().fillna(0.0)

    long_w = held_weights(factor_rank >= 0.9)
    short_w = held_weights(factor_rank <= 0.1)

    rets = prices.pct_change(fill_method=None)
    # The previous row's weights earn the current row's return.
    long_gross = (long_w.shift(1) * rets).sum(axis=1)
    short_gross = (short_w.shift(1) * rets).sum(axis=1)
    long_turnover = long_w.diff().abs().sum(axis=1).fillna(0.0)
    short_turnover = short_w.diff().abs().sum(axis=1).fillna(0.0)

    net = (long_gross - short_gross) - (long_turnover + short_turnover) * tcost
    net = net.fillna(0.0).iloc[252:]
    equity = (1 + net).cumprod() * 10_000
    return pit.summarize(equity, name="ls")
def realistic_top_decile(factor_rank, prices, rebal=21, tcost=0.001):
    """Summarize a long-only, equal-weight top-decile portfolio net of costs.

    Names ranked at or above 0.9 are held with equal weight. Weights are
    refreshed every ``rebal`` rows and carried forward in between; every unit
    of absolute weight change is charged ``tcost``. The first 252 rows of the
    return series are dropped before compounding from 10,000, and the equity
    curve is passed to ``pit.summarize``.
    """
    selected = factor_rank >= 0.9
    n_selected = selected.sum(axis=1).replace(0, np.nan)
    weights = selected.astype(float).div(n_selected, axis=0)

    is_rebal_row = pd.Series(False, index=factor_rank.index)
    is_rebal_row.iloc[::rebal] = True
    # Only rebalance rows keep their fresh weights; the rest are forward-filled.
    weights[~is_rebal_row] = np.nan
    weights = weights.ffill().fillna(0.0)

    daily_ret = prices.pct_change(fill_method=None)
    # The previous row's weights earn the current row's return.
    gross = (weights.shift(1) * daily_ret).sum(axis=1)
    trading_cost = weights.diff().abs().sum(axis=1).fillna(0.0) * tcost

    net = (gross - trading_cost).fillna(0.0).iloc[252:]
    equity = (1 + net).cumprod() * 10_000
    return pit.summarize(equity, name="lo")
# ---------------------------------------------------------------------------
# Stage 2 — Composite backtest
# ---------------------------------------------------------------------------
def composite_backtest(masked: pd.DataFrame):
    """Run Stage 2: backtest composite configs and champions over 10/5/3/1y.

    For each window the price frame is cut with ``warmup_slice`` so that every
    strategy sees warmup history, each strategy is run through
    ``pit.backtest``, and metrics are computed by ``measure`` from the
    measurement start onward. A SPY buy-and-hold row is added per window. All
    rows are printed, written to ``DATA_DIR/alpha_research_composite.csv``,
    and returned.

    Args:
        masked: Price frame that contains a ``BENCHMARK`` column; every other
            column is treated as a ticker.

    Returns:
        DataFrame with one row per (window, strategy) holding ``years``,
        ``strategy`` and every key of the summary dict except ``name``.
    """
    print("\n" + "=" * 110)
    print("Stage 2 — IC / LS-Sharpe-weighted composite vs champions (1/3/5/10y)")
    print("=" * 110)

    tickers = [c for c in masked.columns if c != BENCHMARK]
    mkt_ret_full = masked[BENCHMARK].pct_change(fill_method=None)

    # Factories, not instances: every window gets a freshly built strategy.
    configs = {
        "Alpha(LS-Sharpe, tn=15, rebal=10)":
            lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=10,
                                        vol_target_annual=None, weight_scheme="ls_sharpe"),
        "Alpha(LS-Sharpe, tn=15, rebal=21)":
            lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=21,
                                        vol_target_annual=None, weight_scheme="ls_sharpe"),
        "Alpha(LS-Sharpe+VT18, tn=15, rebal=21)":
            lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=21,
                                        vol_target_annual=0.18, weight_scheme="ls_sharpe"),
        "Alpha(IC, tn=15, rebal=21)":
            lambda: AlphaFactorStrategy(mkt_ret_full, top_n=15, rebal_freq=21,
                                        vol_target_annual=None, weight_scheme="ic"),
        "Recovery+Mom Top10": lambda: RecoveryMomentumStrategy(top_n=10),
        "fc_up_cap+mom_gap": lambda: FactorComboStrategy("up_cap+mom_gap",
                                                         rebal_freq=21, top_n=10),
    }

    all_rows = []
    for years in (10, 5, 3, 1):
        sliced, measurement_start = warmup_slice(masked, years, warmup_days=500)
        prices = sliced[tickers]
        # The two window dates need a visible separator; without one they are
        # printed fused together as a single unreadable token.
        print(f"\n --- Window: last {years}y "
              f"(measure {measurement_start.date()} → {sliced.index[-1].date()}, "
              f"warmup from {sliced.index[0].date()}) ---")

        spy = sliced[BENCHMARK].dropna()
        spy_eq = (spy / spy.iloc[0]) * 10_000
        rows = [{"years": years, "strategy": "SPY buy-and-hold",
                 **{k: v for k, v in measure(spy_eq, measurement_start, "").items()
                    if k != "name"}}]

        for name, factory in configs.items():
            strat = factory()
            eq = pit.backtest(strategy=strat, prices=prices,
                              initial_capital=10_000, transaction_cost=0.001)
            m = measure(eq, measurement_start, "")
            rows.append({"years": years, "strategy": name,
                         **{k: v for k, v in m.items() if k != "name"}})

        for r in rows:
            print(f" {r['strategy']:<42s} "
                  f"CAGR={r['CAGR']*100:>6.1f}% "
                  f"Sharpe={r['Sharpe']:>5.2f} "
                  f"Sortino={r['Sortino']:>5.2f} "
                  f"MaxDD={r['MaxDD']*100:>6.1f}% "
                  f"Calmar={r['Calmar']:>5.2f}")
        all_rows.extend(rows)

    df = pd.DataFrame(all_rows)
    # Create the output directory up front so a missing folder cannot discard
    # the backtest results computed above.
    os.makedirs(DATA_DIR, exist_ok=True)
    df.to_csv(os.path.join(DATA_DIR, "alpha_research_composite.csv"), index=False)
    return df
# ---------------------------------------------------------------------------
# Stage 3 — Config sweep
# ---------------------------------------------------------------------------
def config_sweep(masked: pd.DataFrame):
    """Run Stage 3: sweep ``AlphaFactorStrategy`` settings over the full sample.

    Every combination of weight scheme, ``top_n``, rebalance frequency and
    volatility target is run through ``pit.backtest`` and summarized with
    ``pit.summarize``. The table is sorted by Sharpe (descending), written to
    ``DATA_DIR/alpha_research_sweep.csv``, its first 15 rows are printed, and
    the full table is returned.

    Args:
        masked: Price frame that contains a ``BENCHMARK`` column; every other
            column is treated as a ticker.

    Returns:
        DataFrame with the columns ``scheme``, ``top_n``, ``rebal``, ``vt``,
        ``CAGR``, ``Sharpe``, ``MaxDD`` and ``Calmar``.
    """
    from itertools import product

    print("\n" + "=" * 110)
    print("Stage 3 — AlphaFactor config sweep (10y)")
    print("=" * 110)

    tickers = [c for c in masked.columns if c != BENCHMARK]
    prices = masked[tickers]
    mkt_ret = masked[BENCHMARK].pct_change(fill_method=None)

    schemes = ("ls_sharpe", "ic", "equal")
    top_ns = (10, 15, 20)
    rebals = (10, 21)
    vol_targets = (None, 0.18)

    rows = []
    # product() varies the last axis fastest, matching nested-loop order.
    for scheme, top_n, rebal, vt in product(schemes, top_ns, rebals, vol_targets):
        strat = AlphaFactorStrategy(mkt_ret, top_n=top_n, rebal_freq=rebal,
                                    vol_target_annual=vt,
                                    weight_scheme=scheme)
        eq = pit.backtest(strat, prices, initial_capital=10_000,
                          transaction_cost=0.001)
        s = pit.summarize(eq, "")
        rows.append({"scheme": scheme, "top_n": top_n, "rebal": rebal,
                     # Store a label instead of None for the no-vol-target case.
                     "vt": vt if vt is not None else "none",
                     "CAGR": s["CAGR"], "Sharpe": s["Sharpe"],
                     "MaxDD": s["MaxDD"], "Calmar": s["Calmar"]})

    df = pd.DataFrame(rows).sort_values("Sharpe", ascending=False)
    # Create the output directory up front so a missing folder cannot discard
    # the sweep results computed above.
    os.makedirs(DATA_DIR, exist_ok=True)
    df.to_csv(os.path.join(DATA_DIR, "alpha_research_sweep.csv"), index=False)
    print(df.head(15).to_string(index=False, formatters={
        "CAGR": "{:.1%}".format, "Sharpe": "{:.2f}".format,
        "MaxDD": "{:.1%}".format, "Calmar": "{:.2f}".format,
    }))
    return df
def main():
    """Load the data and run the three research stages in order.

    Prints the shape and date range of the loaded frame, runs
    ``factor_diagnostics``, ``composite_backtest`` and ``config_sweep``, and
    finally prints the first five rows of the sweep table.
    """
    print("Loading PIT data…")
    masked = load()
    # The start and end dates need a visible separator; without one they are
    # printed fused together as a single unreadable token.
    print(f" shape={masked.shape} range={masked.index[0].date()} → {masked.index[-1].date()}")

    factor_diagnostics(masked)
    composite_backtest(masked)
    sweep = config_sweep(masked)

    print("\n" + "=" * 110)
    print("Top 5 configs:")
    print("=" * 110)
    print(sweep.head(5).to_string(index=False, formatters={
        "CAGR": "{:.1%}".format, "Sharpe": "{:.2f}".format,
        "MaxDD": "{:.1%}".format, "Calmar": "{:.2f}".format,
    }))


# Run the full pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()