Files
quant/research/pit_optimization.py
Gahow Wang 541f7bcf5b research: add strategy evaluation and exploration scripts
Add 28 research scripts covering DCA simulation, momentum evaluation,
Sharpe optimization, trend rider analysis, and US fundamentals exploration.
2026-05-14 12:54:08 +08:00

286 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Point-in-time (PIT) compliant strategy optimization.

After fixing survivorship bias, CAGR dropped from 44.7% to 18.1% and Sharpe
from 1.52 to 0.84. The strategy barely beats SPY. Root causes:

1. Many top performers (CVNA, TSLA, MRNA, PLTR, APP) weren't in the S&P 500
   when the biased backtest selected them.
2. "Bad" stocks removed from the S&P 500 (PCG, M) WOULD have been selected by
   recovery signals → losses not captured in the biased backtest.

The parameters need to be re-swept on PIT-corrected data:

- Maybe top_n needs to be different.
- Rebalance frequency might need adjustment.
- DD dampener parameters may need recalibration.
- The signal itself might need modification.
"""
from __future__ import annotations
import os, sys
import numpy as np
import pandas as pd
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from strategies.base import Strategy
import universe_history as uh
from research.pit_backtest import load_pit_prices, pit_universe
def _rank(df):
    """Return the row-wise percentile rank of every cell in ``df``.

    Ranks are computed across columns within each row and expressed as a
    fraction of the non-missing entries in that row; missing cells stay NaN.
    """
    percentile = df.rank(axis="columns", pct=True, na_option="keep")
    return percentile
def compute_metrics(daily_rets: pd.Series, periods_per_year: float = 252.0) -> dict:
    """Summarise a series of periodic simple returns.

    Args:
        daily_rets: Simple (not log) returns, one per period.
        periods_per_year: Number of periods in a year, used to annualise
            CAGR, volatility and Sharpe. Defaults to 252.

    Returns:
        A dict with keys ``cagr``, ``vol``, ``sharpe``, ``max_dd`` and
        ``calmar``. ``sharpe`` is mean/std annualised with no risk-free rate
        subtracted, and is 0 when the standard deviation is not positive.
        ``max_dd`` is the most negative drawdown (<= 0). ``calmar`` is 0 when
        there is no drawdown. Every value is NaN for an empty input.
    """
    n_obs = len(daily_rets)
    if n_obs == 0:
        # No observations: every metric is undefined. Returning NaN avoids the
        # opaque IndexError that eq.iloc[-1] would otherwise raise.
        nan = float("nan")
        return {"cagr": nan, "vol": nan, "sharpe": nan, "max_dd": nan, "calmar": nan}

    eq = (1 + daily_rets).cumprod()
    n_years = n_obs / periods_per_year
    cagr = eq.iloc[-1] ** (1.0 / n_years) - 1.0

    daily_std = daily_rets.std()
    annualiser = np.sqrt(periods_per_year)
    vol = daily_std * annualiser
    sharpe = daily_rets.mean() / daily_std * annualiser if daily_std > 0 else 0

    # The equity curve implicitly starts at 1.0 before the first return, so the
    # running peak must never fall below 1.0; otherwise a loss on the very
    # first observation would not register as a drawdown.
    running_max = eq.cummax().clip(lower=1.0)
    max_dd = (eq / running_max - 1).min()
    calmar = cagr / abs(max_dd) if max_dd != 0 else 0

    return {"cagr": cagr, "vol": vol, "sharpe": sharpe, "max_dd": max_dd, "calmar": calmar}
def yearly_returns(daily_rets: pd.Series) -> pd.Series:
eq = (1 + daily_rets).cumprod()
yearly = eq.resample("YE").last().pct_change()
yearly.iloc[0] = eq.resample("YE").last().iloc[0] - 1
yearly.index = yearly.index.year
return yearly
class PITEnsemble(Strategy):
    """Recovery/momentum ensemble whose knobs are exposed for the PIT sweeps.

    Parameters:
        top_n: Number of names held at each rebalance.
        rebal_freq: Number of rows between rebalances.
        mom_blend: Weight given to the pure momentum rank; the remainder is
            split evenly between the two recovery-based signals.
        asym_vol: Enable the overlay that cuts exposure after volatile losses.
        asym_vol_floor: Exposure multiplier used while that overlay is active.
        dd_dampen: Enable the market-drawdown dampener.
        dd_floor: Lowest exposure multiplier the dampener may apply.
        dd_denom: Drawdown divisor in the dampener's ``1 + dd / dd_denom``.
        mom_filter_on: Drop names with non-positive trailing momentum from the
            126-row recovery signal.
    """

    def __init__(self, top_n=12, rebal_freq=42, mom_blend=0.0,
                 asym_vol=True, asym_vol_floor=0.50,
                 dd_dampen=True, dd_floor=0.70, dd_denom=0.35,
                 mom_filter_on=True):
        # Selection / scheduling.
        self.top_n = top_n
        self.rebal_freq = rebal_freq
        self.mom_blend = mom_blend
        # Asymmetric-volatility overlay.
        self.asym_vol = asym_vol
        self.asym_vol_floor = asym_vol_floor
        # Market-drawdown dampener.
        self.dd_dampen = dd_dampen
        self.dd_floor = dd_floor
        self.dd_denom = dd_denom
        # Signal construction.
        self.mom_filter_on = mom_filter_on

    def generate_signals(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return per-row portfolio weights for the price frame ``data``.

        Pipeline: ensemble score -> equal-weight top ``top_n`` -> hold between
        rebalances and lag one row -> optional asymmetric-vol scaling ->
        optional market-drawdown scaling.
        """
        scores = self._ensemble_scores(data)
        weights = self._hold_between_rebalances(
            self._equal_weight_top_n(scores), data.index
        )
        if self.asym_vol:
            weights = self._scale_for_asymmetric_vol(weights, data)
        if self.dd_dampen:
            weights = self._scale_for_market_drawdown(weights, data)
        return weights

    def _ensemble_scores(self, prices):
        """Blend the recovery and momentum rank signals into one score frame."""
        daily = prices.pct_change()

        # Signal A: filtered 126-row recovery plus recovery x recent up-moves.
        rebound_126 = prices / prices.rolling(126, min_periods=126).min() - 1
        if self.mom_filter_on:
            # Return from 126 rows ago to 21 rows ago; names where it is not
            # positive are blanked out of the recovery signal.
            trailing_mom = prices.shift(21).pct_change(105)
            filtered_rebound = rebound_126.where(trailing_mom > 0, np.nan)
        else:
            filtered_rebound = rebound_126
        # Sum of positive returns only; non-positive and missing returns count as 0.
        gains_20d = daily.where(daily > 0, 0).rolling(20, min_periods=15).sum()
        rebound_x_gains = _rank(rebound_126) * _rank(gains_20d)
        score_a = 0.5 * _rank(filtered_rebound) + 0.5 * _rank(rebound_x_gains)

        # Signal B: 63-row recovery plus momentum from 252 to 21 rows ago.
        rebound_63 = prices / prices.rolling(63, min_periods=63).min() - 1
        momentum_rank = _rank(prices.shift(21).pct_change(231))
        score_b = 0.5 * _rank(rebound_63) + 0.5 * momentum_rank

        # Signal C is the momentum rank itself, mixed in by mom_blend.
        blend = self.mom_blend
        if blend > 0:
            # NOTE(review): with mom_blend == 1.0 the recovery terms are
            # multiplied by zero, but their NaNs still propagate, so names
            # lacking a valid recovery score stay excluded. Confirm intended.
            half_rest = (1 - blend) / 2
            return half_rest * score_a + half_rest * score_b + blend * momentum_rank
        return 0.5 * score_a + 0.5 * score_b

    def _equal_weight_top_n(self, scores):
        """Equal-weight the ``top_n`` best-scored names in every row."""
        position = scores.rank(axis=1, ascending=False, na_option="bottom")
        # Rows with fewer than top_n scored names hold nothing at all.
        has_enough = scores.notna().sum(axis=1) >= self.top_n
        chosen = (position <= self.top_n) & has_enough.values.reshape(-1, 1)
        picks = chosen.astype(float)
        per_row = picks.sum(axis=1).replace(0, np.nan)
        return picks.div(per_row, axis=0).fillna(0.0)

    def _hold_between_rebalances(self, weights, index):
        """Keep weights fixed between rebalance rows, then lag them one row."""
        warmup = 252
        is_rebalance = pd.Series(False, index=index)
        is_rebalance.iloc[list(range(warmup, len(index), self.rebal_freq))] = True
        # Blank every non-rebalance row so the last rebalance carries forward.
        weights[~is_rebalance] = np.nan
        held = weights.ffill().fillna(0.0)
        held.iloc[:warmup] = 0.0
        # Weights decided on one row are applied to the next row's returns.
        return held.shift(1).fillna(0.0)

    def _scale_for_asymmetric_vol(self, weights, data):
        """Cut exposure to ``asym_vol_floor`` after volatile, losing stretches."""
        asset_rets = data.pct_change().fillna(0.0)
        book_rets = (weights * asset_rets).sum(axis=1)
        vol_20d = book_rets.rolling(20, min_periods=10).std() * np.sqrt(252)
        vol_baseline = vol_20d.rolling(252, min_periods=126).median()
        ret_20d = book_rets.rolling(20, min_periods=10).sum()
        # Only volatility that coincides with losses triggers the cut.
        stressed = (vol_20d > vol_baseline * 1.5) & (ret_20d < 0)
        scale = pd.Series(1.0, index=data.index)
        scale[stressed] = self.asym_vol_floor
        return weights.mul(scale.shift(1).fillna(1.0), axis=0)

    def _scale_for_market_drawdown(self, weights, data):
        """Shrink exposure in proportion to the market proxy's drawdown."""
        asset_rets = data.pct_change().fillna(0.0)
        # Market proxy: equal-weight mean across all columns of the
        # zero-filled returns.
        market_curve = (1 + asset_rets.mean(axis=1)).cumprod()
        market_dd = market_curve / market_curve.cummax() - 1
        scale = (1.0 + market_dd / self.dd_denom).clip(lower=self.dd_floor, upper=1.0)
        return weights.mul(scale.shift(1).fillna(1.0), axis=0)
def run_strategy(strat, data, start="2017-06-01", end="2026-05-13"):
    """Backtest ``strat`` on ``data`` and return its returns from ``start`` to ``end``.

    Each row's return is the sum over columns of weight times that column's
    simple return, with missing returns counted as zero. The result is then
    sliced by label to ``start:end``.
    """
    weights = strat.generate_signals(data)
    asset_rets = data.pct_change().fillna(0.0)
    portfolio_rets = weights.mul(asset_rets).sum(axis=1)
    return portfolio_rets.loc[slice(start, end)]
def fmt_row(label, m):
    """Format one results-table row from a label and a metrics dict.

    ``m`` must provide ``cagr``, ``vol``, ``sharpe``, ``max_dd`` and
    ``calmar``; the three fractional metrics are printed as percentages.
    """
    cagr_pct = m["cagr"] * 100
    vol_pct = m["vol"] * 100
    max_dd_pct = m["max_dd"] * 100
    sharpe = m["sharpe"]
    calmar = m["calmar"]
    return (
        f"{label:<50s} {cagr_pct:>6.1f}% {vol_pct:>6.1f}% "
        f"{sharpe:>6.2f} {max_dd_pct:>6.1f}% {calmar:>6.2f}"
    )
def _print_sweep(title, header, rows, data):
    """Print one sweep table: title, column header, rule, then one row per strategy.

    ``rows`` is an iterable of ``(label, strategy)`` pairs. Each strategy is
    backtested on ``data`` with ``run_strategy`` and summarised with
    ``compute_metrics``.
    """
    print(title)
    print(header)
    print("-" * 90)
    for label, strat in rows:
        print(fmt_row(label, compute_metrics(run_strategy(strat, data))))


def main():
    """Run the PIT parameter sweeps and print the report to stdout.

    Steps: load and mask the PIT price data, compute the SPY benchmark, print
    five one-dimensional sweeps, grid-search for the highest-Sharpe
    configuration, then print its yearly returns against SPY and a
    block-bootstrap of its Sharpe ratio.
    """
    from itertools import product

    rule = "=" * 90
    print(rule)
    print("PIT-COMPLIANT STRATEGY OPTIMIZATION")
    print(rule)

    # Load PIT data
    pit_raw = load_pit_prices()
    intervals = uh.load_sp500_history()
    pit_data = uh.mask_prices(pit_raw, intervals)
    print(f"PIT data: {pit_data.shape}")

    # SPY benchmark (taken from the unmasked frame).
    spy_rets = pit_raw["SPY"].pct_change().fillna(0.0).loc["2017-06-01":"2026-05-13"]
    spy_m = compute_metrics(spy_rets)
    print(f"\nSPY benchmark: CAGR {spy_m['cagr']*100:.1f}% Sharpe {spy_m['sharpe']:.2f}")

    header = f"{'Config':<50s} {'CAGR':>7s} {'Vol':>7s} {'Sharpe':>6s} {'MaxDD':>7s} {'Calmar':>6s}"
    no_risk = dict(asym_vol=False, dd_dampen=False)

    # --- Sweep 1: top_n ---
    _print_sweep(
        "\n--- top_n sweep (rebal=42, no risk mgmt) ---",
        header,
        ((f"top_n={n}", PITEnsemble(top_n=n, rebal_freq=42, **no_risk))
         for n in [8, 10, 12, 15, 20, 25, 30]),
        pit_data,
    )

    # --- Sweep 2: rebal frequency ---
    _print_sweep(
        "\n--- rebal sweep (top_n=20, no risk mgmt) ---",
        header,
        ((f"rebal={freq}d, top20", PITEnsemble(top_n=20, rebal_freq=freq, **no_risk))
         for freq in [21, 42, 63]),
        pit_data,
    )

    # --- Sweep 3: momentum blend ---
    def blend_label(blend):
        if blend == 0:
            return "pure recovery"
        if blend == 1.0:
            return "pure momentum"
        return f"mom_blend={blend:.0%}"

    _print_sweep(
        "\n--- momentum blend (top_n=20, rebal=42, no risk mgmt) ---",
        header,
        ((blend_label(blend),
          PITEnsemble(top_n=20, rebal_freq=42, mom_blend=blend, **no_risk))
         for blend in [0.0, 0.20, 0.30, 0.50, 0.70, 1.0]),
        pit_data,
    )

    # --- Sweep 4: without mom_filter (recovery signal catches more stocks) ---
    _print_sweep(
        "\n--- mom_filter ON vs OFF (top_n=20, rebal=42) ---",
        header,
        ((f"mom_filter={'ON' if mf else 'OFF'}",
          PITEnsemble(top_n=20, rebal_freq=42, mom_filter_on=mf, **no_risk))
         for mf in [True, False]),
        pit_data,
    )

    # --- Sweep 5: risk overlays on best raw config ---
    overlays = [
        ("raw (no risk)", dict(asym_vol=False, dd_dampen=False)),
        ("+ asym_vol", dict(asym_vol=True, dd_dampen=False)),
        ("+ DD dampener", dict(asym_vol=False, dd_dampen=True)),
        ("+ both", dict(asym_vol=True, dd_dampen=True)),
    ]
    _print_sweep(
        "\n--- Risk overlays (best raw config) ---",
        header,
        ((f"top{n}, {label}", PITEnsemble(top_n=n, rebal_freq=42, **kwargs))
         for label, kwargs in overlays
         for n in [12, 20]),
        pit_data,
    )

    # --- Best PIT config: yearly breakdown ---
    print(f"\n{'=' * 90}")
    print("BEST PIT CONFIG — yearly analysis")
    print(f"{'=' * 90}")

    # Run a broad sweep to find the best. The search starts from -inf so a
    # winner is still recorded when every configuration has a non-positive
    # Sharpe; starting from 0 left best_m unbound in that case.
    best_sharpe = float("-inf")
    best_label = ""
    best_rets = None
    best_m = None
    grid = product(
        [12, 15, 20, 25],          # top_n
        [21, 42, 63],              # rebal_freq
        [0.0, 0.30, 0.50, 1.0],    # mom_blend
        [False, True],             # asym_vol
        [False, True],             # dd_dampen
    )
    for n, freq, blend, asym, dd in grid:
        strat = PITEnsemble(top_n=n, rebal_freq=freq, mom_blend=blend,
                            asym_vol=asym, dd_dampen=dd)
        rets = run_strategy(strat, pit_data)
        m = compute_metrics(rets)
        if m["sharpe"] > best_sharpe:
            best_sharpe = m["sharpe"]
            best_label = f"top{n}_rebal{freq}_mom{blend:.0%}_asym{asym}_dd{dd}"
            best_rets = rets
            best_m = m

    if best_m is None:
        # Only reachable when no Sharpe compared greater than -inf (e.g. all NaN).
        print("Best config: none (no configuration produced a comparable Sharpe ratio)")
        return

    print(f"Best config: {best_label}")
    print(fmt_row("BEST", best_m))

    print("\n--- Yearly ---")
    yr = yearly_returns(best_rets)
    spy_yr = yearly_returns(spy_rets)
    print(f" {'Year':>4s} {'Strategy':>10s} {'SPY':>10s} {'Alpha':>10s}")
    for year in sorted(yr.index):
        s = spy_yr.get(year, float("nan"))
        alpha = yr[year] - s
        print(f" {year:>4d} {yr[year]*100:>+9.1f}% {s*100:>+9.1f}% {alpha*100:>+9.1f}pp")

    # Bootstrap
    print("\n--- Bootstrap ---")
    # Kept as a function-level import, as in the original.
    from research.trend_rider_p0 import block_bootstrap
    boot = block_bootstrap(best_rets, n_boot=5000, block_len=42)
    print(f" Sharpe: median={boot['sharpe'].median():.2f} "
          f"5th={boot['sharpe'].quantile(0.05):.2f} "
          f"95th={boot['sharpe'].quantile(0.95):.2f}")
    print(f" P(Sharpe > 1.0): {(boot['sharpe'] > 1.0).mean()*100:.1f}%")
    print(f" P(Sharpe > SPY's {spy_m['sharpe']:.2f}): {(boot['sharpe'] > spy_m['sharpe']).mean()*100:.1f}%")
# Run the full optimization report only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()