# quant/factor_attribution.py

from __future__ import annotations

import io
import warnings
import zipfile
from pathlib import Path
from urllib.error import URLError
from urllib.request import Request, urlopen

import pandas as pd

# ZIP archive holding the daily 5-factor (2x3) CSV, served from Ken French's
# faculty pages on the Dartmouth (Tuck) site.
KEN_FRENCH_DAILY_FF5_ZIP_URL = (
    "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"
    "F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
)

# Columns the parsed table must provide, in the order they appear in the
# returned frame. "MKT_RF" is the upstream "Mkt-RF" column after renaming.
# Kept as a list because it is used directly as a DataFrame column selector.
EXPECTED_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]


class ExternalFactorFormatError(ValueError):
    """Raised when the downloaded factor file does not have the expected layout.

    Subclasses ``ValueError`` so callers that already catch ``ValueError`` for
    malformed input keep working.
    """


def _download_kf_zip_bytes() -> bytes:
    """Fetch the factor archive and return its raw bytes.

    The request targets ``KEN_FRENCH_DAILY_FF5_ZIP_URL`` with an explicit
    ``User-Agent`` header and a 30 second timeout. Errors raised by
    ``urlopen`` or by reading the response propagate to the caller.
    """
    request_headers = {"User-Agent": "quant-factor-attribution/0.1"}
    req = Request(KEN_FRENCH_DAILY_FF5_ZIP_URL, headers=request_headers)
    with urlopen(req, timeout=30) as resp:
        payload = resp.read()
    return payload


def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse the zipped daily factor file into a date-indexed frame.

    The first ``.csv``/``.txt`` member of the archive is decoded as UTF-8 (a
    leading BOM is tolerated). Blank lines and everything before the first
    line containing ``Mkt-RF`` are discarded, and only rows whose first field
    is an eight-digit ``YYYYMMDD`` value are kept.

    Args:
        raw_bytes: Contents of the ZIP archive.

    Returns:
        A frame with an unnamed ``DatetimeIndex`` whose columns are
        ``EXPECTED_FACTOR_COLUMNS`` (in that order), holding the file's
        values divided by 100.

    Raises:
        zipfile.BadZipFile: If ``raw_bytes`` is not a readable ZIP archive.
        ExternalFactorFormatError: If the archive has no CSV/TXT member, the
            member is not UTF-8, the ``Mkt-RF`` header line is absent, the
            table cannot be parsed, an expected column is missing, no daily
            rows are present, or a date or value cannot be converted.
    """
    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
        member_names = [
            name
            for name in archive.namelist()
            if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
        ]
        if not member_names:
            raise ExternalFactorFormatError("Ken French archive did not contain a CSV or TXT file")

        try:
            text = archive.read(member_names[0]).decode("utf-8-sig")
        except UnicodeDecodeError as exc:
            raise ExternalFactorFormatError("Ken French factor file was not valid UTF-8 text") from exc

    # Drop blank lines, then skip the free-text preamble above the header row.
    lines = [line for line in text.splitlines() if line.strip()]
    try:
        header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
    except StopIteration as exc:
        raise ExternalFactorFormatError("Ken French factor file was missing the daily factor header") from exc

    table = "\n".join(lines[header_index:])
    try:
        factors = pd.read_csv(io.StringIO(table))
    except pd.errors.ParserError as exc:
        raise ExternalFactorFormatError("Ken French factor table could not be parsed") from exc

    # Header cells may carry padding (e.g. ", Mkt-RF"); normalise them so the
    # rename and the required-column check do not fail on whitespace alone.
    factors.columns = [str(column).strip() for column in factors.columns]
    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})
    date_column = factors.columns[0]
    missing_columns = [column for column in EXPECTED_FACTOR_COLUMNS if column not in factors.columns]
    if missing_columns:
        raise ExternalFactorFormatError(
            f"Ken French factor table was missing columns: {', '.join(missing_columns)}"
        )

    # Compare on a stripped string key so padded dates still match, and treat
    # missing keys as "not a daily row" instead of propagating NA into the mask.
    date_key = factors[date_column].astype(str).str.strip()
    is_daily = date_key.str.fullmatch(r"\d{8}", na=False)
    if not is_daily.any():
        raise ExternalFactorFormatError("Ken French factor table did not contain daily rows")

    try:
        dates = pd.to_datetime(date_key[is_daily], format="%Y%m%d")
    except ValueError as exc:
        raise ExternalFactorFormatError("Ken French factor table contained invalid dates") from exc

    try:
        factors = factors.loc[is_daily, EXPECTED_FACTOR_COLUMNS].astype(float) / 100.0
    except ValueError as exc:
        raise ExternalFactorFormatError("Ken French factor table contained non-numeric values") from exc

    # Attach the dates to the freshly computed frame rather than assigning a
    # column into a filtered slice, which avoids chained-assignment warnings.
    # Going through to_numpy() keeps the resulting index unnamed.
    factors.index = pd.DatetimeIndex(dates.to_numpy())

    return factors


def _warn_and_load_cached_factors(cache_path: Path, reason: str) -> pd.DataFrame:
    """Warn that cached data is being used, then read it from disk.

    Args:
        cache_path: CSV file to read; its first column becomes the index and
            is parsed as dates.
        reason: Text inserted into the warning to explain the fallback.

    Returns:
        The frame read from ``cache_path``.
    """
    message = f"Using cached data from {cache_path} because {reason}."
    # stacklevel=2 attributes the warning to whoever called this helper.
    warnings.warn(message, category=UserWarning, stacklevel=2)
    cached = pd.read_csv(cache_path, parse_dates=True, index_col=0)
    return cached


def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
    """Return the daily US factor table, refreshing the on-disk cache when possible.

    The archive is downloaded and parsed on every call. On success the parsed
    frame is written to ``<cache_dir>/ff5_us_daily.csv`` and returned. If the
    download fails, the archive is not a valid ZIP, or its contents do not
    have the expected layout, the previously cached CSV is returned instead
    (with a ``UserWarning``) when one exists.

    Args:
        cache_dir: Directory holding the cache file; created if missing.

    Returns:
        The freshly parsed factor frame, or the cached frame on fallback.

    Raises:
        URLError, TimeoutError, ConnectionError: If the download fails and no
            cache file exists.
        zipfile.BadZipFile: If the download is not a valid ZIP and no cache
            file exists.
        ExternalFactorFormatError: If the file layout is invalid and no cache
            file exists.
    """
    cache_path = Path(cache_dir) / "ff5_us_daily.csv"
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        raw_bytes = _download_kf_zip_bytes()
    except (URLError, TimeoutError, ConnectionError) as exc:
        if not cache_path.exists():
            raise
        return _warn_and_load_cached_factors(cache_path, f"download failed: {exc}")

    try:
        factors = _parse_kf_daily_csv(raw_bytes)
    except zipfile.BadZipFile as exc:
        if not cache_path.exists():
            raise
        return _warn_and_load_cached_factors(cache_path, f"the upstream ZIP was invalid: {exc}")
    except ExternalFactorFormatError as exc:
        if not cache_path.exists():
            raise
        return _warn_and_load_cached_factors(
            cache_path,
            f"the upstream factor format was invalid: {exc}",
        )

    # Write to a sibling file and swap it in, so an interrupted write can never
    # leave a truncated cache that a later fallback would silently load.
    staging_path = cache_path.with_name(cache_path.name + ".tmp")
    try:
        factors.to_csv(staging_path)
        staging_path.replace(cache_path)
    finally:
        # Only present if the write or the swap failed part-way through.
        if staging_path.exists():
            staging_path.unlink()
    return factors