# quant/strategies/ls_momentum.py

"""Industry-neutral long/short momentum on the S&P 500.

Strategy
--------
At each rebalance date (default: monthly):
  1. Compute 12-1 month momentum for every stock in the panel.
  2. Group stocks by GICS sector.
  3. Within each sector, rank by momentum.
  4. Long the top `long_pct` (default 20%) of each sector.
  5. Short the bottom `short_pct` (default 20%) of each sector.
  6. Equal-weight within long-leg and short-leg, scaled so gross long = 1.0
     and gross short = 1.0 → 200% gross exposure, ~0 net (β ≈ 0).

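The β-neutrality comes from sector-level matching: when `long_pct` equals
`short_pct` and gross long equals gross short, each sector contributes long
and short positions in equal $-amounts, so sector and (mostly) market
exposures cancel out. With unequal percentages or gross targets each leg
still hits its own gross target, but per-sector exposure no longer nets to
zero.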

Output
------
A weights DataFrame with positive (long) and negative (short) entries.
PIT-safe via terminal `.shift(1)`.

Costs
-----
Realistic backtest of L/S requires three additional costs not present in
long-only:
  * borrow fee on the short leg  (handled by the eval script, not here)
  * higher slippage per turnover (this strategy churns more than V5)
  * dividend payment on shorts   (small for SP500 ~ 1.5% × |short_w|)
The strategy reports raw weights; the eval script applies costs.
"""
from __future__ import annotations

import os
import urllib.request
import io
import json

import numpy as np
import pandas as pd

from strategies.base import Strategy


# Relative path of the CSV cache that fetch_sp500_sectors() reads and writes.
SECTOR_CACHE = "data/us_sectors.csv"
# Page downloaded by fetch_sp500_sectors(); its first HTML table is parsed.
WIKIPEDIA_SP500_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"


def fetch_sp500_sectors(force: bool = False) -> pd.DataFrame:
    """Return a DataFrame indexed by ticker with GICS sector / sub-industry.

    The table is cached at ``SECTOR_CACHE``. A cached file is reused unless
    ``force`` is true, the file lacks a ``"GICS Sector"`` column, or it holds
    100 rows or fewer (treated as a truncated / stale download).

    Otherwise the first HTML table of the Wikipedia constituents page is
    downloaded, ``Symbol`` is renamed to ``ticker`` (with ``.`` replaced by
    ``-``, e.g. ``BRK.B`` -> ``BRK-B``) and used as the index, and only the
    ``GICS Sector``, ``GICS Sub-Industry`` and ``Security`` columns are kept.

    The mapping reflects *today's* page content, so a backtest using it
    applies current sector labels to every historical date.

    Args:
        force: Skip the cache and always re-download.

    Returns:
        DataFrame indexed by ticker.

    Raises:
        urllib.error.URLError: On network failure or when the request does
            not complete within the timeout.
    """
    if not force and os.path.exists(SECTOR_CACHE):
        cached = pd.read_csv(SECTOR_CACHE, index_col=0)
        if "GICS Sector" in cached.columns and len(cached) > 100:
            return cached

    print("--- Fetching S&P 500 GICS sectors from Wikipedia ---")
    # Wikipedia rejects requests without a browser-like User-Agent.
    headers = {"User-Agent": "Mozilla/5.0 (quant-backtest)"}
    req = urllib.request.Request(WIKIPEDIA_SP500_URL, headers=headers)
    # Bound the wait: without a timeout a stalled connection would hang the
    # whole backtest indefinitely.
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")

    tables = pd.read_html(io.StringIO(html))
    df = tables[0].rename(columns={"Symbol": "ticker"})
    df["ticker"] = df["ticker"].str.replace(".", "-", regex=False)
    df = df.set_index("ticker")
    keep = [c for c in df.columns if c in ("GICS Sector", "GICS Sub-Industry",
                                           "Security")]
    df = df[keep]

    # os.makedirs("") raises, so only create a directory when the cache path
    # actually has a directory component.
    cache_dir = os.path.dirname(SECTOR_CACHE)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    df.to_csv(SECTOR_CACHE)
    print(f"--- Cached {len(df)} sector mappings to {SECTOR_CACHE} ---")
    return df


def _signal_mom_12_1(prices: pd.DataFrame) -> pd.DataFrame:
    """12-1 month cross-sectional momentum (highest = long)."""
    return prices.shift(21).pct_change(231)


def _signal_reversal_1m(prices: pd.DataFrame) -> pd.DataFrame:
    """1-month reversal: highest 21-day return → SHORT (so we negate)."""
    return -prices.pct_change(21)


def _signal_reversal_5d(prices: pd.DataFrame) -> pd.DataFrame:
    """Short-term 5-day reversal."""
    return -prices.pct_change(5)


def _signal_recovery_63(prices: pd.DataFrame) -> pd.DataFrame:
    """Recovery factor: price / 63d low (V-shape continuation, long-only-friendly)."""
    return prices / prices.rolling(63, min_periods=63).min() - 1


def _signal_low_vol(prices: pd.DataFrame) -> pd.DataFrame:
    """Low-vol: invert 60-day realized vol so low vol → high signal."""
    rets = prices.pct_change(fill_method=None)
    vol = rets.rolling(60, min_periods=40).std() * np.sqrt(252)
    return -vol


def _signal_quality_mom(prices: pd.DataFrame) -> pd.DataFrame:
    """Composite: 12-1 mom + consistency (% positive days over 252d) + low-vol.

    Combines a positive long-side selection (mom × consistency) and avoids the
    fragile far-tail of pure momentum by inverse-vol weighting.
    """
    mom = prices.shift(21).pct_change(231)
    rets = prices.pct_change(fill_method=None)
    pos_days = (rets > 0).rolling(252, min_periods=126).mean()
    vol = rets.rolling(60, min_periods=40).std() * np.sqrt(252)

    mom_r = mom.rank(axis=1, pct=True, na_option="keep")
    cons_r = pos_days.rank(axis=1, pct=True, na_option="keep")
    inv_vol_r = (-vol).rank(axis=1, pct=True, na_option="keep")
    return 0.4 * mom_r + 0.3 * cons_r + 0.3 * inv_vol_r


def _signal_mom_x_lowvol(prices: pd.DataFrame) -> pd.DataFrame:
    """Momentum filtered by low-vol — long winners, short LOW-vol losers.

    Reduces meme-stock blowups on the short leg by avoiding high-vol losers.
    """
    mom = prices.shift(21).pct_change(231)
    rets = prices.pct_change(fill_method=None)
    vol = rets.rolling(60, min_periods=40).std() * np.sqrt(252)
    mom_r = mom.rank(axis=1, pct=True, na_option="keep")
    inv_vol_r = (-vol).rank(axis=1, pct=True, na_option="keep")
    return 0.5 * mom_r + 0.5 * inv_vol_r


# Maps the `signal_name` accepted by IndustryNeutralLSMomentum to the function
# that turns a price panel into a same-shaped score panel. The strategy ranks
# scores in descending order, so the highest scores feed the long leg and the
# lowest feed the short leg.
SIGNAL_REGISTRY = {
    "mom_12_1": _signal_mom_12_1,
    "reversal_1m": _signal_reversal_1m,
    "reversal_5d": _signal_reversal_5d,
    "recovery_63": _signal_recovery_63,
    "low_vol": _signal_low_vol,
    "quality_mom": _signal_quality_mom,
    "mom_x_lowvol": _signal_mom_x_lowvol,
}


class IndustryNeutralLSMomentum(Strategy):
    """Industry-neutral long/short portfolio with selectable signal.

    On every rebalance row the chosen signal is ranked inside each sector:
    the top ``long_pct`` of a sector is bought and the bottom ``short_pct`` is
    sold (at least one name per leg). All longs share ``gross_long`` equally
    and all shorts share ``gross_short`` equally. Weights are held until the
    next rebalance and lagged by one row before being returned.

    Args:
        rebal_freq: Rows between rebalances; must be >= 1.
        mom_lookback: Only sizes the warm-up (``mom_lookback + 5`` leading
            rows carry zero weight); the signal functions use their own
            fixed windows.
        mom_skip: Stored on the instance; not read by this class.
        long_pct: Fraction of each sector's ranked names that go long.
        short_pct: Fraction of each sector's ranked names that go short.
        min_sector_size: A sector is skipped on a date when it has fewer
            non-NaN signal values than this.
        sector_map: Optional Series mapping ticker -> sector label. When
            ``None`` the mapping comes from ``fetch_sp500_sectors()``.
        gross_long: Sum of the long weights on a rebalance row.
        gross_short: Absolute sum of the short weights on a rebalance row.
        signal_name: Key into ``SIGNAL_REGISTRY``.

    Raises:
        ValueError: If ``signal_name`` is not registered or ``rebal_freq``
            is smaller than 1.
    """

    def __init__(
        self,
        rebal_freq: int = 21,
        mom_lookback: int = 252,
        mom_skip: int = 21,
        long_pct: float = 0.20,
        short_pct: float = 0.20,
        min_sector_size: int = 5,
        sector_map: pd.Series | None = None,
        gross_long: float = 1.0,
        gross_short: float = 1.0,
        signal_name: str = "mom_12_1",
    ) -> None:
        if signal_name not in SIGNAL_REGISTRY:
            raise ValueError(f"Unknown signal: {signal_name}")
        # A non-positive step would either crash inside range() (0) or
        # silently produce an all-zero book (negative); fail loudly instead.
        if rebal_freq < 1:
            raise ValueError(f"rebal_freq must be >= 1, got {rebal_freq}")
        self.rebal_freq = rebal_freq
        self.mom_lookback = mom_lookback
        self.mom_skip = mom_skip
        self.long_pct = long_pct
        self.short_pct = short_pct
        self.min_sector_size = min_sector_size
        self.sector_map = sector_map
        self.gross_long = gross_long
        self.gross_short = gross_short
        self.signal_name = signal_name
        self.signal_func = SIGNAL_REGISTRY[signal_name]

    def _resolve_sector_map(self, columns: list[str]) -> pd.Series:
        """Return sector labels aligned to ``columns`` (NaN where unknown)."""
        if self.sector_map is not None:
            return self.sector_map.reindex(columns)
        return fetch_sp500_sectors()["GICS Sector"].reindex(columns)

    def _group_by_sector(self, cols: list[str]) -> dict[str, list[str]]:
        """Bucket column names by sector, dropping names with no sector."""
        sectors = self._resolve_sector_map(cols)
        grouped: dict[str, list[str]] = {}
        for col in cols:
            label = sectors.get(col)
            if pd.isna(label):
                continue
            grouped.setdefault(label, []).append(col)
        return grouped

    def _pick_legs(
        self,
        scores: pd.Series,
        sector_to_cols: dict[str, list[str]],
    ) -> tuple[list[str], list[str]]:
        """Select the long and short names for one rebalance row."""
        longs: list[str] = []
        shorts: list[str] = []
        for members in sector_to_cols.values():
            valid = scores.reindex(members).dropna()
            if len(valid) < self.min_sector_size:
                continue
            n_long = max(1, int(round(len(valid) * self.long_pct)))
            n_short = max(1, int(round(len(valid) * self.short_pct)))
            ranked = valid.sort_values(ascending=False)
            long_picks = list(ranked.head(n_long).index)
            short_picks = list(ranked.tail(n_short).index)

            # With large percentages or tiny sectors the two slices can
            # overlap. A name cannot be both long and short: previously the
            # short weight overwrote the long one after normalisation, so
            # the long leg no longer summed to gross_long. Drop the
            # contested names, and skip the sector if a leg is left empty.
            contested = set(long_picks) & set(short_picks)
            if contested:
                long_picks = [s for s in long_picks if s not in contested]
                short_picks = [s for s in short_picks if s not in contested]
                if not long_picks or not short_picks:
                    continue

            longs.extend(long_picks)
            shorts.extend(short_picks)
        return longs, shorts

    def generate_signals(self, data: pd.DataFrame) -> pd.DataFrame:
        """Build the lagged target-weight panel for a price panel.

        Args:
            data: Price panel, one column per ticker, one row per period.

        Returns:
            DataFrame shaped like ``data``: positive entries are longs,
            negative entries are shorts. Row ``t`` holds the weights decided
            from information up to row ``t - 1``.
        """
        cols = list(data.columns)
        sector_to_cols = self._group_by_sector(cols)
        scores = self.signal_func(data)
        col_pos = {col: i for i, col in enumerate(cols)}  # O(1) column lookup

        warmup = self.mom_lookback + 5
        rebal_rows = range(warmup, len(data), self.rebal_freq)

        # NaN marks "no decision on this row"; rebalance rows are fully
        # specified (0.0 = flat) so the forward-fill below carries them.
        grid = np.full((len(data), len(cols)), np.nan)
        for t in rebal_rows:
            grid[t, :] = 0.0
            longs, shorts = self._pick_legs(scores.iloc[t], sector_to_cols)
            if not longs or not shorts:
                # A one-sided book would not be market neutral: stay flat
                # until the next rebalance.
                continue
            # Equal weight within each leg, scaled to its gross target.
            grid[t, [col_pos[s] for s in longs]] = self.gross_long / len(longs)
            grid[t, [col_pos[s] for s in shorts]] = (
                -self.gross_short / len(shorts)
            )

        weights = pd.DataFrame(grid, index=data.index, columns=cols)
        weights = weights.ffill().fillna(0.0)
        weights.iloc[:warmup] = 0.0

        # Lag by one row so a weight never uses same-row prices.
        return weights.shift(1).fillna(0.0)


__all__ = ["IndustryNeutralLSMomentum", "fetch_sp500_sectors"]