""" Fetch price history for all tickers that were ever S&P 500 members — including delisted ones — and save to data/us_pit.csv. This is the foundation for a survivorship-bias-free backtest. NOTE: Yahoo Finance no longer serves price data for many fully-delisted tickers (bankruptcies, old mergers). Those are silently skipped. The result is still a major improvement over "today's S&P 500 extrapolated 10 years back", but it is NOT a perfect point-in-time dataset — only a dataset where the universe mask is correct at each date. A subset of worst-outcome tickers (e.g., ABK, ACAS) will be missing entirely. This caveat is documented in the run summary. """ import os from datetime import datetime, timedelta import pandas as pd import yfinance as yf import universe_history as uh DATA_DIR = "data" OUT_PATH = os.path.join(DATA_DIR, "us_pit.csv") YEARS = 10 BATCH_SIZE = 50 def fetch_all_historical(force: bool = False) -> pd.DataFrame: os.makedirs(DATA_DIR, exist_ok=True) intervals = uh.load_sp500_history() tickers = uh.all_tickers_ever(intervals) + ["SPY"] tickers = sorted(set(tickers)) existing = None if os.path.exists(OUT_PATH) and not force: existing = pd.read_csv(OUT_PATH, index_col=0, parse_dates=True) missing = [t for t in tickers if t not in existing.columns] if not missing: # Just append latest dates last_date = existing.index[-1] if (datetime.now() - last_date.to_pydatetime()).days < 2: print(f"--- us_pit.csv already up to date: {existing.shape} ---") return existing tickers = list(existing.columns) start = (last_date + timedelta(days=1)).strftime("%Y-%m-%d") print(f"--- Appending new dates from {start} for {len(tickers)} tickers ---") new = _download_batched(tickers, start=start) if new is not None and not new.empty: combined = pd.concat([existing, new]).sort_index() combined = combined[~combined.index.duplicated(keep="last")] combined.to_csv(OUT_PATH) print(f"--- Saved {combined.shape} to {OUT_PATH} ---") return combined return existing else: print(f"--- Have {existing.shape[1]} cols; need {len(missing)} more ---") tickers = missing start = (datetime.now() - timedelta(days=365 * YEARS)).strftime("%Y-%m-%d") new = _download_batched(tickers, start=start) if existing is not None and new is not None and not new.empty: combined = pd.concat([existing, new.reindex(existing.index)], axis=1) # Add any new rows from `new` not in existing new_only_idx = new.index.difference(existing.index) if len(new_only_idx) > 0: combined_new = new.loc[new_only_idx].reindex(columns=combined.columns) combined = pd.concat([combined, combined_new]).sort_index() else: combined = new combined.to_csv(OUT_PATH) print(f"--- Saved {combined.shape} to {OUT_PATH} ---") return combined def _download_batched(tickers: list[str], start: str) -> pd.DataFrame | None: frames = [] n = len(tickers) for i in range(0, n, BATCH_SIZE): batch = tickers[i:i + BATCH_SIZE] print(f" [{i}/{n}] fetching {len(batch)} tickers...", flush=True) try: raw = yf.download(batch, start=start, auto_adjust=True, progress=False, threads=True) if raw.empty: continue if isinstance(raw.columns, pd.MultiIndex): close = raw["Close"] else: close = raw[["Close"]].rename(columns={"Close": batch[0]}) close = close.dropna(axis=1, how="all") if not close.empty: frames.append(close) except Exception as e: print(f" batch failed: {e}") if not frames: return None result = pd.concat(frames, axis=1).sort_index() result = result.loc[:, ~result.columns.duplicated()] return result if __name__ == "__main__": fetch_all_historical()