quant/research/pit_backtest.py

"""
Point-in-time backtest runner.

Key idea: mask price data to NaN outside S&P 500 membership windows before
passing to the strategy. The strategy's signal computations then naturally
exclude non-members — no refactoring of strategies required.

Caveat: a stock joining the index has no signal for ~252 days after joining
(rolling windows need non-NaN warm-up). This is conservative but unbiased.
"""

import os

import numpy as np
import pandas as pd

import metrics
import universe_history as uh

DATA_DIR = "data"
PIT_CSV = os.path.join(DATA_DIR, "us_pit.csv")


# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------

def load_pit_prices() -> pd.DataFrame:
    """Load the full historical S&P 500 price matrix (delisted included)."""
    if not os.path.exists(PIT_CSV):
        raise FileNotFoundError(
            f"{PIT_CSV} not found. Run `uv run python -m research.fetch_historical` first."
        )
    df = pd.read_csv(PIT_CSV, index_col=0, parse_dates=True)
    return df.sort_index()


def pit_universe(prices: pd.DataFrame) -> pd.DataFrame:
    """Return prices masked to S&P 500 membership at each date (NaN outside)."""
    intervals = uh.load_sp500_history()
    return uh.mask_prices(prices, intervals)


# ---------------------------------------------------------------------------
# Backtest engine (mirrors main.backtest but accepts masked prices)
# ---------------------------------------------------------------------------

def backtest(
    strategy,
    prices: pd.DataFrame,
    initial_capital: float = 10_000,
    transaction_cost: float = 0.001,
    fixed_fee: float = 0.0,
    benchmark: pd.Series | None = None,
    regime_filter: pd.Series | None = None,
) -> pd.Series:
    """
    Vectorized backtest with optional regime filter.

    `regime_filter`: boolean series aligned to prices.index. True → be in the
    market (use strategy weights). False → go to cash. When None, always invested.
    """
    weights = strategy.generate_signals(prices)
    weights = weights.reindex(prices.index).fillna(0.0)

    if regime_filter is not None:
        rf = regime_filter.reindex(prices.index).fillna(False).astype(float)
        weights = weights.mul(rf, axis=0)

    daily_returns = prices.pct_change().fillna(0.0)
    portfolio_returns = (daily_returns * weights).sum(axis=1)

    turnover = weights.diff().abs().sum(axis=1).fillna(0.0)
    portfolio_returns -= turnover * transaction_cost

    if fixed_fee > 0:
        weight_changes = weights.diff().fillna(0.0)
        n_trades = (weight_changes.abs() > 1e-8).sum(axis=1)
        equity_running = (1 + portfolio_returns).cumprod() * initial_capital
        fee_impact = (n_trades * fixed_fee) / equity_running.shift(1).fillna(initial_capital)
        portfolio_returns -= fee_impact

    equity = (1 + portfolio_returns).cumprod() * initial_capital
    return equity


# ---------------------------------------------------------------------------
# Metrics helper
# ---------------------------------------------------------------------------

def summarize(equity: pd.Series, name: str = "") -> dict:
    """Return a dict of key performance metrics (no printing)."""
    eq = equity.dropna()
    if len(eq) < 2:
        return {"name": name, "error": "insufficient data"}
    daily = eq.pct_change().dropna()
    total_return = eq.iloc[-1] / eq.iloc[0] - 1
    years = (eq.index[-1] - eq.index[0]).days / 365.25
    cagr = (eq.iloc[-1] / eq.iloc[0]) ** (1 / years) - 1 if years > 0 else 0.0
    vol = daily.std() * np.sqrt(252)
    sharpe = (daily.mean() * 252) / vol if vol > 0 else 0.0
    downside = daily[daily < 0].std() * np.sqrt(252)
    sortino = (daily.mean() * 252) / downside if downside > 0 else 0.0
    dd = (eq / eq.cummax() - 1).min()
    calmar = cagr / abs(dd) if dd < 0 else 0.0
    return {
        "name": name,
        "CAGR": cagr,
        "Sharpe": sharpe,
        "Sortino": sortino,
        "MaxDD": dd,
        "Calmar": calmar,
        "TotalRet": total_return,
        "Vol": vol,
    }


def fmt_row(r: dict) -> str:
    return (f"  {r['name']:<38s} "
            f"CAGR={r['CAGR']*100:>6.1f}%  "
            f"Sharpe={r['Sharpe']:>5.2f}  "
            f"Sortino={r['Sortino']:>5.2f}  "
            f"MaxDD={r['MaxDD']*100:>6.1f}%  "
            f"Calmar={r['Calmar']:>5.2f}  "
            f"Total={r['TotalRet']*100:>7.1f}%")