# quant/research/permanent_yearly.py

"""Yearly evaluation of Permanent / TrendRider strategies vs stock pickers.

Two test cases per strategy, 2015-01-01 → 2025-12-31:

    Test 1 (annual reset): each calendar year starts with $10,000.
        We compute that year's compounded return and report the
        end-of-year equity. Years are independent.
    Test 2 (annual contribution): start with $10,000 in 2015, add
        $10,000 cash on the first trading day of each subsequent year.
        Report the running portfolio value at year-end (after all
        contributions and that year's gains/losses).

Strategies covered:
  * PermanentOverlay  — Browne 25/25/25/25 + Faber MA200 stock-slot overlay
  * TrendRiderV3      — risk-on/risk-off basket with regime gates
  * PermanentV4       — improved Permanent (momentum baskets + bond trend)
  * Recovery+Mom Top10 — current top US stock-picking strategy

Run:
    uv run python -m research.permanent_yearly
"""
from __future__ import annotations

import os
import sys
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# Allow running as a script ("python research/permanent_yearly.py") and
# as a module ("python -m research.permanent_yearly")
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import yfinance as yf

import data_manager
from strategies.permanent import (
    ETF_UNIVERSE,
    GLOBAL_ETF_UNIVERSE,
    HK_ETF_UNIVERSE,
    PermanentOverlay,
    PermanentV4,
    TrendRiderV3,
)
from strategies.recovery_momentum import RecoveryMomentumStrategy

# Local CSV caches for downloaded close prices. Both paths are relative to
# the current working directory.
ETF_CACHE = "data/etfs.csv"  # read and rewritten by load_etfs
STOCKS_LONG_CACHE = "data/us_long.csv"  # read and rewritten by load_long_stock_history


def load_long_stock_history(tickers: list[str], start: str = "2014-01-01") -> pd.DataFrame:
    """Return long-history stock closes, served from a CSV cache when possible.

    The data_manager cache only spans 10 years; this loader reaches back
    further so the 252-day momentum warmup completes before 2015.

    Cache hit: STOCKS_LONG_CACHE exists, its latest date is no older than
    yesterday, and every requested ticker is a column. The requested
    columns are returned forward-filled.

    Otherwise the tickers are downloaded from Yahoo (auto_adjust=True)
    starting at ``start``. Columns with at most 50% non-missing rows are
    dropped, the rest is forward-filled, merged with any existing cache
    (cached values win where present), written back to the cache, and
    returned. Note that this path returns every column of the merged
    frame, not only the requested tickers.
    """
    on_disk: pd.DataFrame | None = (
        pd.read_csv(STOCKS_LONG_CACHE, index_col=0, parse_dates=True)
        if os.path.exists(STOCKS_LONG_CACHE)
        else None
    )

    if on_disk is not None:
        oldest_acceptable = pd.Timestamp(datetime.now().date() - timedelta(days=1))
        is_recent = on_disk.index.max() >= oldest_acceptable
        covers_request = set(tickers).issubset(on_disk.columns)
        if is_recent and covers_request:
            return on_disk[tickers].ffill()

    print(f"--- Downloading {len(tickers)} stock tickers (long history) from {start} ---")
    raw = yf.download(tickers, start=start, auto_adjust=True, progress=False, threads=True)
    # A multi-ticker download has (field, ticker) columns; a flat frame has
    # a single "Close" column that is relabelled with the first ticker.
    closes = (
        raw["Close"]
        if isinstance(raw.columns, pd.MultiIndex)
        else raw[["Close"]].rename(columns={"Close": tickers[0]})
    )
    closes = closes.dropna(how="all")
    # Drop tickers with >50% missing — same convention as data_manager
    coverage = closes.notna().mean()
    closes = closes[closes.columns[coverage > 0.5]].ffill()
    if on_disk is not None:
        closes = on_disk.combine_first(closes).sort_index()
    os.makedirs("data", exist_ok=True)
    closes.to_csv(STOCKS_LONG_CACHE)
    print(f"--- Saved {closes.shape[0]} days x {closes.shape[1]} tickers to {STOCKS_LONG_CACHE} ---")
    return closes


# ---------------------------------------------------------------------------
# ETF data loader (separate cache so we don't pollute data/us.csv)
# ---------------------------------------------------------------------------
def load_etfs(tickers: list[str], start: str = "2014-01-01") -> pd.DataFrame:
    """Return ETF closes from the ETF_CACHE CSV, downloading when it is unusable.

    The cache is used as-is when it exists, contains every requested
    ticker, and its latest date is not more than two days old; in that
    case only the requested columns are returned, minus all-NaN rows.

    Otherwise the tickers are downloaded from Yahoo starting at ``start``,
    merged with any existing cache (cached values win where present),
    written back, and the whole merged frame is returned.

    The panel is deliberately NOT forward-filled so callers can tell which
    dates are real trading days for each symbol. Callers are expected to
    anchor it to a master calendar (e.g. SPY) and ffill afterwards.
    """
    on_disk: pd.DataFrame | None = None
    if os.path.exists(ETF_CACHE):
        on_disk = pd.read_csv(ETF_CACHE, index_col=0, parse_dates=True)

    if on_disk is not None:
        missing = [t for t in tickers if t not in on_disk.columns]
        if not missing:
            cutoff = pd.Timestamp(datetime.now() - timedelta(days=2))
            # Only a cache that is strictly older than the cutoff is stale.
            if not on_disk.index.max() < cutoff:
                return on_disk[tickers].dropna(how="all")

    print(f"--- Downloading ETF prices: {tickers} ---")
    raw = yf.download(tickers, start=start, auto_adjust=True, progress=False)
    # A multi-ticker download has (field, ticker) columns; a flat frame has
    # a single "Close" column that is relabelled with the first ticker.
    closes = (
        raw["Close"]
        if isinstance(raw.columns, pd.MultiIndex)
        else raw[["Close"]].rename(columns={"Close": tickers[0]})
    )
    closes = closes.dropna(how="all")
    if on_disk is not None:
        closes = on_disk.combine_first(closes)
    closes = closes.sort_index()
    os.makedirs("data", exist_ok=True)
    closes.to_csv(ETF_CACHE)
    print(f"--- Saved {closes.shape[0]} days x {closes.shape[1]} ETFs to {ETF_CACHE} ---")
    return closes


# ---------------------------------------------------------------------------
# Backtest engine: returns daily portfolio returns from a weights DataFrame.
# ---------------------------------------------------------------------------
def daily_returns(weights: pd.DataFrame, prices: pd.DataFrame,
                  txn_cost: float = 0.001) -> pd.Series:
    """Daily portfolio return after subtracting a turnover-based cost.

    weights  : target weights, already lagged by one day, so weights[t]
               was decided with information through t-1 and earns the
               t-1 → t close-to-close return.
    prices   : price panel; its index and columns define the output grid.
               Weights are reindexed onto it and missing entries count
               as a zero position.
    txn_cost : cost per unit of absolute weight change.

    Returns a Series on prices.index: sum over assets of weight × daily
    pct change, minus txn_cost × that day's summed absolute weight change.
    """
    held = weights.reindex(index=prices.index, columns=prices.columns).fillna(0.0)
    asset_moves = prices.pct_change().fillna(0.0)
    gross = asset_moves.mul(held).sum(axis=1)
    # The first row has no previous weights; its diff is NaN and adds no
    # turnover.
    traded = held.diff().abs().sum(axis=1).fillna(0.0)
    return gross - traded * txn_cost


def equity_with_cashflows(returns: pd.Series, contributions: pd.Series,
                          start_capital: float) -> pd.Series:
    """Roll an equity curve forward through daily returns and cash injections.

    returns       : daily return series; its index is the output index.
    contributions : cash added per date (positive values). It is aligned
                    to returns.index, dates without an entry add nothing,
                    and the cash lands at end-of-day, after that day's
                    return has been applied.
    start_capital : balance going into the first row; the first return is
                    applied to it (assumed to be 0 by convention).
    """
    cash_in = contributions.reindex(returns.index).fillna(0.0).to_numpy()
    balance = start_capital
    path = []
    for growth, added in zip(returns.to_numpy(), cash_in):
        # Grow yesterday's balance first, then credit today's contribution.
        balance = balance * (1.0 + float(growth)) + float(added)
        path.append(balance)
    return pd.Series(path, index=returns.index, dtype=float)


# ---------------------------------------------------------------------------
# Yearly tests
# ---------------------------------------------------------------------------
def test1_annual_reset(returns: pd.Series, years: list[int],
                       start_capital: float = 10_000) -> pd.Series:
    """Year-end value of start_capital invested afresh in each calendar year.

    returns       : daily return series whose index exposes ``.year``
                    (e.g. a DatetimeIndex).
    years         : calendar years to report, in output order.
    start_capital : amount every year starts with.

    Returns a Series named "year_end" keyed by year. Each value is
    start_capital times the compounded daily returns of that year; years
    are independent of each other. A year with no rows in ``returns`` is
    reported as NaN.
    """
    # The calendar year of each row does not depend on the loop variable,
    # so derive it once rather than once per requested year.
    row_year = returns.index.year
    out: dict[int, float] = {}
    for y in years:
        in_year = row_year == y
        if not in_year.any():
            out[y] = float("nan")
            continue
        growth = (1.0 + returns[in_year]).prod()
        out[y] = float(start_capital * growth)
    return pd.Series(out, name="year_end")


# Despite the "test" prefix this is an analysis helper, not a unit test.
# Without this marker, pytest would collect it in any test module that
# imports it and fail looking for fixtures named after its parameters.
test1_annual_reset.__test__ = False


def test2_with_contributions(returns: pd.Series, years: list[int],
                             initial: float = 10_000,
                             annual_contrib: float = 10_000) -> pd.Series:
    """Year-end portfolio value with a yearly cash contribution.

    returns        : daily return series whose index exposes ``.year``
                     (e.g. a DatetimeIndex).
    years          : calendar years to simulate, in chronological order.
    initial        : balance going into the first row that falls in
                     ``years``.
    annual_contrib : cash added on the first trading day of every year in
                     ``years`` except the first one. The cash is credited
                     by equity_with_cashflows at end-of-day, so it does
                     not earn that day's return.

    Returns a Series named "year_end" keyed by year, holding the last
    equity value of each year (NaN for a year without data). If no row of
    ``returns`` falls in ``years`` an empty float Series is returned.
    """
    yr_returns = returns[returns.index.year.isin(years)].copy()
    if yr_returns.empty:
        return pd.Series(dtype=float)

    # Row-wise calendar year, computed once and reused for both the
    # contribution dates and the year-end lookups below.
    row_year = yr_returns.index.year

    contrib = pd.Series(0.0, index=yr_returns.index)
    for y in years[1:]:
        in_year = row_year == y
        if in_year.any():
            first_day = yr_returns.index[in_year][0]
            contrib.at[first_day] = annual_contrib

    eq = equity_with_cashflows(yr_returns, contrib, start_capital=initial)

    # eq shares yr_returns' index, so the same year masks apply to it.
    out: dict[int, float] = {}
    for y in years:
        in_year = row_year == y
        out[y] = float(eq[in_year].iloc[-1]) if in_year.any() else float("nan")
    return pd.Series(out, name="year_end")


# Despite the "test" prefix this is an analysis helper, not a unit test.
# Without this marker, pytest would collect it in any test module that
# imports it and fail looking for fixtures named after its parameters.
test2_with_contributions.__test__ = False


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Run both yearly tests for every strategy, print reports, save CSVs.

    Steps: load the ETF and stock price panels, turn each strategy's
    weights into a daily return series, clip every series to the
    reporting window, run Test 1 (annual reset) and Test 2 (annual
    contribution), print the tables, and write both result tables to
    CSV files under ``data/``.
    """
    years = list(range(2015, 2026))  # 2015 .. 2025 inclusive

    # 1) ETF prices for TAA strategies — include global + HK variants too.
    # Anchor to the US (SPY) trading calendar so rolling windows are
    # consistent across strategies. HK ETFs get reindexed + ffilled onto
    # NYSE dates; on HK holidays we use the latest HK close.
    full_universe = sorted(set(ETF_UNIVERSE + GLOBAL_ETF_UNIVERSE + HK_ETF_UNIVERSE))
    etfs = load_etfs(full_universe, start="2013-06-01")
    # Dates on which SPY has a price define the master calendar.
    nyse_index = etfs["SPY"].dropna().index
    etfs = etfs.reindex(nyse_index).ffill()
    etfs = etfs[(etfs.index >= "2013-06-01") & (etfs.index <= f"{years[-1]}-12-31")]
    print(f"--- ETF panel: {etfs.shape[0]} days x {etfs.shape[1]} cols, "
          f"{etfs.index.min().date()} to {etfs.index.max().date()} ---")

    # 2) S&P 500 prices for stock-picking strategies — needs longer history
    # than data_manager's 10-year cache so that 252-day momentum warmup
    # completes before 2015.
    from universe import UNIVERSES
    universe = UNIVERSES["us"]
    tickers = universe["fetch"]()
    benchmark = universe["benchmark"]
    all_tickers = sorted(set(tickers + [benchmark]))
    stocks = load_long_stock_history(all_tickers, start="2013-06-01")
    stocks = stocks[(stocks.index >= "2013-06-01") & (stocks.index <= f"{years[-1]}-12-31")]
    # Keep only universe members; the benchmark column is dropped here
    # unless it is itself one of the fetched tickers.
    member_cols = [c for c in stocks.columns if c in tickers]
    print(f"--- Stock panel: {stocks.shape[0]} days x {len(member_cols)} members ---")

    # 3) Build strategies and compute their daily return series
    series: dict[str, pd.Series] = {}

    for name, strat in [
        ("PermanentOverlay", PermanentOverlay()),
        ("PermanentV4",      PermanentV4()),
        ("TrendRiderV3-US",  TrendRiderV3()),
        ("TrendRiderV3-Global",
         TrendRiderV3(risk_on=("TQQQ", "UPRO", "YINN", "CHAU"),
                      risk_off=("GLD", "DBC"))),
        ("TrendRiderV3-HK",
         TrendRiderV3(risk_on=("7200.HK", "7500.HK"),
                      risk_off=("GLD", "DBC"))),
    ]:
        print(f"\nRunning: {name}")
        w = strat.generate_signals(etfs)
        # Price only the columns the strategy produced weights for.
        rets = daily_returns(w, etfs[w.columns])
        series[name] = rets

    print("\nRunning: Recovery+Mom Top10")
    rec = RecoveryMomentumStrategy(top_n=10)
    w = rec.generate_signals(stocks[member_cols])
    series["Recovery+Mom Top10"] = daily_returns(w, stocks[member_cols])

    # Buy & hold SPY benchmark for context (raw daily pct change, no
    # transaction cost applied)
    spy = etfs["SPY"]
    series["SPY Buy&Hold"] = spy.pct_change().fillna(0.0)

    # 4) Restrict every series to the reporting window
    # (years[0]-01-01 .. years[-1]-12-31). Only values of existing keys
    # are replaced, so the dict's key set is unchanged while iterating.
    for k, s in series.items():
        series[k] = s[(s.index >= f"{years[0]}-01-01") & (s.index <= f"{years[-1]}-12-31")]

    # 5) Test 1 — annual reset
    t1 = pd.DataFrame({name: test1_annual_reset(s, years) for name, s in series.items()})
    t1.index.name = "year"

    # 6) Test 2 — annual $10k contribution
    t2 = pd.DataFrame({name: test2_with_contributions(s, years) for name, s in series.items()})
    t2.index.name = "year"

    # 7) Print reports
    # display.float_format is a global pandas option; it is switched before
    # each table below and reset to None before the CSVs are written.
    pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

    print("\n" + "=" * 78)
    print("TEST 1 — Each year starts at $10,000 (independent year-end value)")
    print("=" * 78)
    print(t1.to_string())
    # 10_000 matches the default start_capital that test1_annual_reset was
    # called with above.
    annual_ret = (t1 / 10_000.0 - 1.0) * 100
    pd.set_option("display.float_format", lambda x: f"{x:+.2f}%")
    print("\nAnnual returns (%)")
    print(annual_ret.to_string())
    avg = annual_ret.mean(axis=0)
    # NaN years compare as not-up here but still count in the len(years)
    # denominator printed below.
    win_years = (annual_ret > 0).sum(axis=0)
    print("\nMean annual return / years up:")
    for c in annual_ret.columns:
        print(f"  {c:22s}  mean={avg[c]:+6.2f}%   up_years={int(win_years[c])}/{len(years)}")

    pd.set_option("display.float_format", lambda x: f"{x:,.0f}")
    print("\n" + "=" * 78)
    print("TEST 2 — Start $10,000 in 2015, add $10,000 each subsequent year")
    print("=" * 78)
    print(t2.to_string())
    # Cumulative cash paid in: $10,000 per year up to and including year y.
    total_in = pd.Series({y: 10_000 * (years.index(y) + 1) for y in years}, name="contributed")
    print("\nTotal $ contributed by year-end:")
    print(total_in.to_string())

    # Total return on contributions, year-by-year
    print("\nMultiple of contributed capital:")
    pd.set_option("display.float_format", lambda x: f"{x:.2f}x")
    # Row-wise division: each year's portfolio value over cash paid in so far.
    multiple = t2.div(total_in, axis=0)
    print(multiple.to_string())

    # 8) Save CSVs
    os.makedirs("data", exist_ok=True)
    pd.set_option("display.float_format", None)
    t1.to_csv("data/permanent_yearly_test1_reset.csv")
    t2.to_csv("data/permanent_yearly_test2_contrib.csv")
    print("\nSaved: data/permanent_yearly_test1_reset.csv")
    print("Saved: data/permanent_yearly_test2_contrib.csv")


# Run the evaluation only when this file is executed (as a script or via
# ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()