# quant/research/fetch_historical.py

"""
Fetch price history for all tickers that were ever S&P 500 members — including
delisted ones — and save to data/us_pit.csv. This is the foundation for a
survivorship-bias-free backtest.

NOTE: Yahoo Finance no longer serves price data for many fully-delisted tickers
(bankruptcies, old mergers). Those are silently skipped. The result is still
a major improvement over "today's S&P 500 extrapolated 10 years back", but it
is NOT a perfect point-in-time dataset — only a dataset where the universe
mask is correct at each date. A subset of worst-outcome tickers (e.g., ABK,
ACAS) will be missing entirely. This caveat is documented in the run summary.
"""

import os
from datetime import datetime, timedelta

import pandas as pd
import yfinance as yf

import universe_history as uh

# Directory that holds the cached price file; created on demand by
# fetch_all_historical().
DATA_DIR = "data"
# CSV that fetch_all_historical() reads as its cache and writes its result to.
OUT_PATH = os.path.join(DATA_DIR, "us_pit.csv")
# Length of the full-history lookback, in years (converted as 365 * YEARS days).
YEARS = 10
# Number of tickers passed to a single yf.download() call.
BATCH_SIZE = 50


def fetch_all_historical(force: bool = False) -> pd.DataFrame:
    """Build or refresh the cached close-price table and return it.

    Without a usable cache (or with ``force=True``) the full ``YEARS``-year
    history is downloaded for every ticker returned by
    ``uh.all_tickers_ever`` plus ``SPY`` and written to ``OUT_PATH``.

    With a cache, two independent refresh steps run:

    1. If the last cached date is at least two days old, newer dates are
       downloaded for the cached columns.
    2. Tickers that have no column yet are downloaded over the full lookback
       window and merged in.

    Tickers for which nothing can be downloaded never get a column, so step 2
    retries them on every run. Step 1 does not depend on step 2 succeeding,
    so such tickers cannot block date updates.

    Args:
        force: Ignore an existing cache file and download everything again.

    Returns:
        The price table (dates as index, one column per ticker). When nothing
        new was fetched the cached table is returned and the file is left
        untouched.

    Raises:
        RuntimeError: If there is no usable cache and nothing could be
            downloaded. No file is written in that case.
    """
    os.makedirs(DATA_DIR, exist_ok=True)
    intervals = uh.load_sp500_history()
    tickers = sorted(set(uh.all_tickers_ever(intervals)) | {"SPY"})
    full_start = (datetime.now() - timedelta(days=365 * YEARS)).strftime("%Y-%m-%d")

    cached = None
    if os.path.exists(OUT_PATH) and not force:
        cached = pd.read_csv(OUT_PATH, index_col=0, parse_dates=True)
        if cached.empty:
            # A cache with no rows or no columns has no "last date" to extend
            # from; treat it as absent and rebuild.
            cached = None

    if cached is None:
        fresh = _download_batched(tickers, start=full_start)
        if fresh is None or fresh.empty:
            # _download_batched returns None when every batch failed or came
            # back empty; fail loudly instead of calling .to_csv on None.
            raise RuntimeError(
                f"no price data could be downloaded; nothing written to {OUT_PATH}"
            )
        fresh.to_csv(OUT_PATH)
        print(f"--- Saved {fresh.shape} to {OUT_PATH} ---")
        return fresh

    combined = cached
    dirty = False

    # Step 1: extend the cached columns with dates after the last cached one.
    last_date = cached.index.max()
    stale = (datetime.now() - last_date.to_pydatetime()).days >= 2
    if stale:
        start = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")
        known = list(cached.columns)
        print(f"--- Appending new dates from {start} for {len(known)} tickers ---")
        recent = _download_batched(known, start=start)
        if recent is not None and not recent.empty:
            combined = _merge_prices(combined, recent)
            dirty = True

    # Step 2: backfill tickers that have no column yet.
    have = set(combined.columns)
    missing = [t for t in tickers if t not in have]
    if missing:
        print(f"--- Have {combined.shape[1]} cols; need {len(missing)} more ---")
        backfill = _download_batched(missing, start=full_start)
        if backfill is not None:
            # Rows past the table's last date would hold values only for the
            # backfilled tickers and would make the cache look current on the
            # next run. Leave those dates to step 1, which fetches all columns.
            in_range = backfill.index <= combined.index.max()
            backfill = backfill.loc[in_range].dropna(axis=1, how="all")
            if not backfill.empty:
                combined = _merge_prices(combined, backfill)
                dirty = True

    if not dirty:
        if stale or missing:
            print(f"--- No new data fetched; keeping cached {cached.shape} ---")
        else:
            print(f"--- us_pit.csv already up to date: {cached.shape} ---")
        return cached

    combined.to_csv(OUT_PATH)
    print(f"--- Saved {combined.shape} to {OUT_PATH} ---")
    return combined


def _merge_prices(base: pd.DataFrame, update: pd.DataFrame) -> pd.DataFrame:
    """Overlay ``update`` on ``base`` over the union of their rows and columns.

    Non-null values from ``update`` win; everything else comes from ``base``.
    Columns keep ``base``'s order, with columns new in ``update`` appended.
    """
    merged = update.combine_first(base)
    base_cols = set(base.columns)
    ordered = list(base.columns) + [c for c in update.columns if c not in base_cols]
    return merged.loc[:, ordered].sort_index()


def _download_batched(tickers: list[str], start: str) -> pd.DataFrame | None:
    """Download close prices for ``tickers`` in chunks of ``BATCH_SIZE``.

    Each chunk is requested with ``yf.download`` from ``start`` onward using
    auto-adjusted prices. A chunk that raises is reported on stdout and
    skipped, so one failing chunk does not abort the rest.

    Returns:
        A date-sorted frame with one column per ticker that had at least one
        non-null close (first occurrence kept if a column name repeats), or
        ``None`` when no chunk produced any data.
    """
    total = len(tickers)
    pieces = []
    for offset in range(0, total, BATCH_SIZE):
        chunk = tickers[offset:offset + BATCH_SIZE]
        print(f"  [{offset}/{total}] fetching {len(chunk)} tickers...", flush=True)
        try:
            raw = yf.download(
                chunk,
                start=start,
                auto_adjust=True,
                progress=False,
                threads=True,
            )
            closes = None if raw.empty else _close_columns(raw, chunk[0])
            if closes is not None and not closes.empty:
                pieces.append(closes)
        except Exception as err:
            # Best effort: report the failed chunk and move on to the next.
            print(f"    batch failed: {err}")

    if pieces:
        merged = pd.concat(pieces, axis=1).sort_index()
        return merged.loc[:, ~merged.columns.duplicated()]
    return None


def _close_columns(raw: pd.DataFrame, sole_ticker: str) -> pd.DataFrame:
    """Return the close prices from a download result, minus all-NaN columns.

    With MultiIndex columns the ``"Close"`` level is selected as-is; with flat
    columns the single ``"Close"`` column is renamed to ``sole_ticker``.
    """
    if isinstance(raw.columns, pd.MultiIndex):
        closes = raw["Close"]
    else:
        closes = raw[["Close"]].rename(columns={"Close": sole_ticker})
    return closes.dropna(axis=1, how="all")


# Script entry point: running this file directly refreshes the price cache
# using the default arguments (force=False).
if __name__ == "__main__":
    fetch_all_historical()