"""Load the Ken French daily five-factor (FF5) data set with a CSV cache.

The archive is downloaded from ``KEN_FRENCH_DAILY_FF5_ZIP_URL``, parsed into a
date-indexed frame and written to a local cache. When the download or the
parse fails, the previously cached copy is returned instead if one exists.
"""

from __future__ import annotations

import io
import zipfile
from http.client import HTTPException
from pathlib import Path
from urllib.error import URLError
from urllib.request import Request, urlopen

import pandas as pd

KEN_FRENCH_DAILY_FF5_ZIP_URL = (
    "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"
    "F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
)

# Errors that mean the downloaded archive does not have the expected layout.
# ``load_external_us_factors`` falls back to the cache when one of these is
# raised by the parser.
SOURCE_PARSE_EXCEPTIONS = (
    zipfile.BadZipFile,
    UnicodeDecodeError,
    StopIteration,
    KeyError,
    ValueError,
    pd.errors.ParserError,
)

# Errors that mean the download itself failed. ``HTTPException`` (for example
# ``IncompleteRead`` raised by ``response.read()``) is not an ``OSError``
# subclass, so it has to be listed explicitly to reach the cache fallback.
_DOWNLOAD_EXCEPTIONS = (
    URLError,
    TimeoutError,
    ConnectionError,
    OSError,
    HTTPException,
)

# Columns returned by the parser, in output order.
_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]


def _download_kf_zip_bytes() -> bytes:
    """Download the zip archive and return its raw bytes.

    Raises:
        OSError: (including ``URLError``) when the request fails or times out.
        http.client.HTTPException: when the response body cannot be read.
    """
    request = Request(
        KEN_FRENCH_DAILY_FF5_ZIP_URL,
        headers={"User-Agent": "quant-factor-attribution/0.1"},
    )
    with urlopen(request, timeout=30) as response:
        return response.read()


def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse the zipped factor file into a date-indexed frame.

    The first ``.csv``/``.txt`` member of the archive is read. Everything
    before the line containing ``Mkt-RF`` is discarded, and only rows whose
    first field is an eight-digit ``YYYYMMDD`` date are kept.

    Args:
        raw_bytes: Content of the zip archive.

    Returns:
        A frame indexed by date with the columns ``MKT_RF``, ``SMB``, ``HML``,
        ``RMW``, ``CMA`` and ``RF``. Values are the source values divided by
        100 (presumably the source quotes percentages; confirm against the
        publisher's notes).

    Raises:
        One of ``SOURCE_PARSE_EXCEPTIONS`` when the archive is not a zip, has
        no text member, has no header line, lacks an expected column, or
        contains no dated rows.
    """
    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
        member_name = next(
            name
            for name in archive.namelist()
            if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
        )
        text = archive.read(member_name).decode("utf-8-sig")

    lines = [line for line in text.splitlines() if line.strip()]
    header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
    table = "\n".join(lines[header_index:])

    factors = pd.read_csv(io.StringIO(table))
    # Tolerate padded header cells such as " Mkt-RF " before renaming.
    factors.columns = [str(column).strip() for column in factors.columns]
    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})

    date_column = factors.columns[0]
    date_text = factors[date_column].astype(str).str.strip()
    is_date = date_text.str.fullmatch(r"\d{8}")
    if not is_date.any():
        # An empty table would otherwise be returned and overwrite a good
        # cache; ValueError is part of SOURCE_PARSE_EXCEPTIONS.
        raise ValueError("no YYYYMMDD rows found in the factor table")

    # Copy the filtered rows so the assignment below targets an independent
    # frame rather than a slice of the parsed table.
    factors = factors.loc[is_date].copy()
    factors[date_column] = pd.to_datetime(date_text[is_date], format="%Y%m%d")
    factors = factors.set_index(date_column)
    factors.index.name = None

    # Select first so unrelated extra columns cannot break the conversion.
    return factors[_FACTOR_COLUMNS].astype(float) / 100.0


def _read_cached_factors(cache_path: Path) -> pd.DataFrame:
    """Read a cache file written by ``load_external_us_factors``."""
    return pd.read_csv(cache_path, index_col=0, parse_dates=True)


def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
    """Return the daily factor table, refreshing the local cache when possible.

    The cache file is ``<cache_dir>/ff5_us_daily.csv``; the directory is
    created if it does not exist. A successful download and parse overwrites
    the cache. If either step fails and a cache file exists, the cached table
    is returned instead.

    Args:
        cache_dir: Directory holding the cache file.

    Returns:
        The freshly parsed table, or the cached one on failure.

    Raises:
        The original download or parse exception when no cache file exists.
    """
    cache_path = Path(cache_dir) / "ff5_us_daily.csv"
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        raw_bytes = _download_kf_zip_bytes()
    except _DOWNLOAD_EXCEPTIONS:
        if cache_path.exists():
            return _read_cached_factors(cache_path)
        raise

    try:
        factors = _parse_kf_daily_csv(raw_bytes)
    except SOURCE_PARSE_EXCEPTIONS:
        if cache_path.exists():
            return _read_cached_factors(cache_path)
        raise

    factors.to_csv(cache_path)
    return factors