from __future__ import annotations

# Loader for the Ken French daily five-factor (FF5) data set. A local CSV cache
# is used as a fallback when the download fails or the upstream file is unusable.

import io
import warnings
import zipfile
from http.client import HTTPException
from pathlib import Path
from urllib.error import URLError
from urllib.request import Request, urlopen

import pandas as pd

KEN_FRENCH_DAILY_FF5_ZIP_URL = (
    "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"
    "F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
)
# Output column order; "Mkt-RF" in the upstream header is renamed to "MKT_RF".
EXPECTED_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]


class ExternalFactorFormatError(ValueError):
    """Raised when the downloaded factor archive lacks the expected layout."""


def _download_kf_zip_bytes() -> bytes:
    """Fetch the FF5 daily ZIP archive and return its raw bytes.

    The request carries an explicit User-Agent header and a 30 second timeout.
    Network failures propagate to the caller unchanged.
    """
    request = Request(
        KEN_FRENCH_DAILY_FF5_ZIP_URL,
        headers={"User-Agent": "quant-factor-attribution/0.1"},
    )
    with urlopen(request, timeout=30) as response:
        return response.read()


def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse the zipped factor file into a date-indexed DataFrame.

    Args:
        raw_bytes: Content of the ZIP archive.

    Returns:
        A frame with a ``DatetimeIndex`` (index name ``None``) and the columns
        listed in ``EXPECTED_FACTOR_COLUMNS``, each value divided by 100.

    Raises:
        ExternalFactorFormatError: If the archive has no CSV/TXT member, the
            member is not UTF-8, the header row is missing, the table cannot
            be parsed, expected columns are absent, no daily rows exist, or
            dates/values are malformed.
        zipfile.BadZipFile: If ``raw_bytes`` is not a readable ZIP archive.
    """
    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
        member_names = [
            name
            for name in archive.namelist()
            if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
        ]
        if not member_names:
            raise ExternalFactorFormatError(
                "Ken French archive did not contain a CSV or TXT file"
            )
        try:
            # "utf-8-sig" transparently drops a leading byte-order mark.
            text = archive.read(member_names[0]).decode("utf-8-sig")
        except UnicodeDecodeError as exc:
            raise ExternalFactorFormatError(
                "Ken French factor file was not valid UTF-8 text"
            ) from exc

    # Blank lines are dropped so the table starts directly at the header row.
    lines = [line for line in text.splitlines() if line.strip()]
    try:
        header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
    except StopIteration as exc:
        raise ExternalFactorFormatError(
            "Ken French factor file was missing the daily factor header"
        ) from exc

    table = "\n".join(lines[header_index:])
    try:
        factors = pd.read_csv(io.StringIO(table))
    except pd.errors.ParserError as exc:
        raise ExternalFactorFormatError(
            "Ken French factor table could not be parsed"
        ) from exc

    # Header cells may carry padding around the delimiter; normalise the names
    # so the rename and the column check below are not defeated by whitespace.
    factors.columns = [str(column).strip() for column in factors.columns]
    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})
    date_column = factors.columns[0]

    missing_columns = [
        column for column in EXPECTED_FACTOR_COLUMNS if column not in factors.columns
    ]
    if missing_columns:
        raise ExternalFactorFormatError(
            f"Ken French factor table was missing columns: {', '.join(missing_columns)}"
        )

    # Keep only rows whose first cell is an 8-digit date; this discards any
    # trailing non-data lines that were read in as rows.
    date_text = factors[date_column].astype(str).str.strip()
    is_daily_row = date_text.str.fullmatch(r"\d{8}")
    # Explicit copy: the column assignment below must act on an independent frame.
    factors = factors.loc[is_daily_row].copy()
    if factors.empty:
        raise ExternalFactorFormatError("Ken French factor table did not contain daily rows")

    try:
        factors[date_column] = pd.to_datetime(date_text[is_daily_row], format="%Y%m%d")
    except ValueError as exc:
        raise ExternalFactorFormatError(
            "Ken French factor table contained invalid dates"
        ) from exc

    factors = factors.set_index(date_column)
    factors.index.name = None

    try:
        # NOTE(review): the division by 100 presumably converts percentages to
        # decimal returns - confirm against the upstream file description.
        factors = factors[EXPECTED_FACTOR_COLUMNS].astype(float) / 100.0
    except ValueError as exc:
        raise ExternalFactorFormatError(
            "Ken French factor table contained non-numeric values"
        ) from exc

    return factors


def _warn_and_load_cached_factors(cache_path: Path, reason: str) -> pd.DataFrame:
    """Emit a ``UserWarning`` naming ``reason`` and return the cached CSV.

    ``stacklevel=2`` attributes the warning to the function calling this helper.
    """
    warnings.warn(
        f"Using cached data from {cache_path} because {reason}.",
        UserWarning,
        stacklevel=2,
    )
    return pd.read_csv(cache_path, index_col=0, parse_dates=True)


def _write_cache_atomically(factors: pd.DataFrame, cache_path: Path) -> None:
    """Write ``factors`` to ``cache_path`` via a sibling temp file and a rename.

    An interrupted write therefore cannot leave a truncated cache file behind
    for a later fallback read.
    """
    temp_path = cache_path.with_name(cache_path.name + ".tmp")
    try:
        factors.to_csv(temp_path)
        temp_path.replace(cache_path)
    finally:
        # After a successful rename the temp file no longer exists.
        if temp_path.exists():
            temp_path.unlink()


def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
    """Download and parse the daily FF5 factors, falling back to the cache.

    Args:
        cache_dir: Directory holding ``ff5_us_daily.csv``; created if absent.

    Returns:
        The freshly parsed factors, which are also written to the cache. If
        the download or the parsing fails and a cache file exists, the cached
        frame is returned instead and a ``UserWarning`` is emitted.

    Raises:
        URLError, TimeoutError, ConnectionError, HTTPException: Download
            failed and no cache file exists.
        zipfile.BadZipFile, ExternalFactorFormatError: The upstream payload
            was unusable and no cache file exists.
    """
    cache_path = Path(cache_dir) / "ff5_us_daily.csv"
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        raw_bytes = _download_kf_zip_bytes()
    # HTTPException covers truncated responses (e.g. IncompleteRead) raised
    # while reading the body, which urllib does not wrap in URLError.
    except (URLError, TimeoutError, ConnectionError, HTTPException) as exc:
        if cache_path.exists():
            return _warn_and_load_cached_factors(cache_path, f"download failed: {exc}")
        raise

    try:
        factors = _parse_kf_daily_csv(raw_bytes)
    except zipfile.BadZipFile as exc:
        if cache_path.exists():
            return _warn_and_load_cached_factors(
                cache_path,
                f"the upstream ZIP was invalid: {exc}",
            )
        raise
    except ExternalFactorFormatError as exc:
        if cache_path.exists():
            return _warn_and_load_cached_factors(
                cache_path,
                f"the upstream factor format was invalid: {exc}",
            )
        raise

    _write_cache_atomically(factors, cache_path)
    return factors