Files
quant/factor_attribution.py

132 lines
4.5 KiB
Python

from __future__ import annotations

import http.client
import io
import socket
import ssl
import warnings
import zipfile
from pathlib import Path
from urllib.error import URLError
from urllib.request import Request, urlopen

import pandas as pd
# Location of the zipped daily five-factor CSV that this module downloads.
KEN_FRENCH_DAILY_FF5_ZIP_URL = (
    "https://mba.tuck.dartmouth.edu"
    "/pages/faculty/ken.french/ftp"
    "/F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
)

# Columns the parsed factor table must provide, in the order they are returned
# ("Mkt-RF" from the upstream header is renamed to "MKT_RF" during parsing).
EXPECTED_FACTOR_COLUMNS = [
    "MKT_RF",
    "SMB",
    "HML",
    "RMW",
    "CMA",
    "RF",
]
class ExternalFactorFormatError(ValueError):
    """Raised when the downloaded factor archive or table has an unexpected layout."""
class ExternalFactorDownloadError(OSError):
    """Raised when the external factor archive cannot be downloaded."""
def _download_kf_zip_bytes() -> bytes:
    """Download the factor ZIP archive and return its raw bytes.

    Returns:
        The undecoded response body fetched from ``KEN_FRENCH_DAILY_FF5_ZIP_URL``.

    Raises:
        ExternalFactorDownloadError: If opening the URL or reading the body
            fails with a network, timeout, TLS, or HTTP protocol error. The
            original exception is chained as ``__cause__``.
    """
    request = Request(
        KEN_FRENCH_DAILY_FF5_ZIP_URL,
        headers={"User-Agent": "quant-factor-attribution/0.1"},
    )
    # urllib wraps only OSError in URLError. Protocol-level failures such as
    # http.client.IncompleteRead (truncated body) or BadStatusLine derive from
    # HTTPException instead, so they must be listed explicitly to be reported
    # as a download failure rather than escaping as an unrelated exception.
    download_failures = (
        URLError,
        TimeoutError,
        ConnectionError,
        socket.timeout,
        ssl.SSLError,
        http.client.HTTPException,
    )
    try:
        with urlopen(request, timeout=30) as response:
            return response.read()
    except download_failures as exc:
        raise ExternalFactorDownloadError(f"Failed to download external factor data: {exc}") from exc
def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse the zipped daily factor file into a date-indexed frame.

    The first ``.csv``/``.txt`` member of the archive is decoded as UTF-8
    (a BOM is tolerated), blank lines are dropped, and everything from the
    first line containing ``"Mkt-RF"`` onward is read as a CSV table. Only
    rows whose first column is an 8-digit ``YYYYMMDD`` value are kept, which
    discards any trailing non-data lines.

    Args:
        raw_bytes: Bytes of the downloaded ZIP archive.

    Returns:
        A frame with an unnamed ``DatetimeIndex`` and the columns listed in
        ``EXPECTED_FACTOR_COLUMNS`` as floats, each divided by 100
        (presumably converting upstream percentages to decimals).

    Raises:
        ExternalFactorFormatError: If the archive has no CSV/TXT member, the
            text is not UTF-8, the header or expected columns are missing,
            no daily rows exist, or dates/values cannot be converted.
        zipfile.BadZipFile: Propagated unchanged when ``raw_bytes`` is not a
            readable ZIP archive.
    """
    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
        member_names = [
            name
            for name in archive.namelist()
            if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
        ]
        if not member_names:
            raise ExternalFactorFormatError("Ken French archive did not contain a CSV or TXT file")
        try:
            text = archive.read(member_names[0]).decode("utf-8-sig")
        except UnicodeDecodeError as exc:
            raise ExternalFactorFormatError("Ken French factor file was not valid UTF-8 text") from exc

    lines = [line for line in text.splitlines() if line.strip()]
    try:
        header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
    except StopIteration as exc:
        raise ExternalFactorFormatError("Ken French factor file was missing the daily factor header") from exc

    table = "\n".join(lines[header_index:])
    try:
        factors = pd.read_csv(io.StringIO(table))
    except pd.errors.ParserError as exc:
        raise ExternalFactorFormatError("Ken French factor table could not be parsed") from exc

    # Tolerate padding around header names so " SMB" is not reported missing.
    factors.columns = [str(column).strip() for column in factors.columns]
    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})
    date_column = factors.columns[0]

    missing_columns = [column for column in EXPECTED_FACTOR_COLUMNS if column not in factors.columns]
    if missing_columns:
        raise ExternalFactorFormatError(
            f"Ken French factor table was missing columns: {', '.join(missing_columns)}"
        )

    # Work on the textual form of the first column: it may hold integers or,
    # when non-data lines are present, a mix of strings.
    date_text = factors[date_column].astype(str).str.strip()
    is_daily = date_text.str.fullmatch(r"\d{8}").astype(bool)
    if not is_daily.any():
        raise ExternalFactorFormatError("Ken French factor table did not contain daily rows")

    try:
        # A list input yields an unnamed DatetimeIndex directly.
        dates = pd.to_datetime(date_text[is_daily].tolist(), format="%Y%m%d")
    except ValueError as exc:
        raise ExternalFactorFormatError("Ken French factor table contained invalid dates") from exc

    try:
        # Select rows and columns in one step and attach the index afterwards,
        # rather than assigning a column on a filtered slice, which emits
        # SettingWithCopyWarning on pandas versions without Copy-on-Write.
        values = factors.loc[is_daily, EXPECTED_FACTOR_COLUMNS].astype(float) / 100.0
    except ValueError as exc:
        raise ExternalFactorFormatError("Ken French factor table contained non-numeric values") from exc

    values.index = dates
    values.index.name = None
    return values
def _warn_and_load_cached_factors(cache_path: Path, reason: str) -> pd.DataFrame:
warnings.warn(
f"Using cached data from {cache_path} because {reason}.",
UserWarning,
stacklevel=2,
)
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
    """Return the external daily factor table, refreshing the CSV cache on success.

    The archive is downloaded and parsed; on success the result is written to
    ``<cache_dir>/ff5_us_daily.csv`` and returned. If the download fails, the
    ZIP is invalid, or the table format is rejected, the cached CSV is loaded
    instead (with a ``UserWarning``) when it exists; otherwise the original
    exception is re-raised.

    Args:
        cache_dir: Directory holding the cache file; created if missing.

    Returns:
        The freshly parsed factor frame, or the cached frame on fallback.

    Raises:
        ExternalFactorDownloadError: Download failed and no cache exists.
        zipfile.BadZipFile: Upstream ZIP was invalid and no cache exists.
        ExternalFactorFormatError: Upstream format was invalid and no cache exists.
    """
    cache_path = Path(cache_dir) / "ff5_us_daily.csv"
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        payload = _download_kf_zip_bytes()
    except ExternalFactorDownloadError as exc:
        if not cache_path.exists():
            raise
        return _warn_and_load_cached_factors(cache_path, f"download failed: {exc}")

    try:
        fresh = _parse_kf_daily_csv(payload)
    except (zipfile.BadZipFile, ExternalFactorFormatError) as exc:
        if not cache_path.exists():
            raise
        if isinstance(exc, zipfile.BadZipFile):
            reason = f"the upstream ZIP was invalid: {exc}"
        else:
            reason = f"the upstream factor format was invalid: {exc}"
        return _warn_and_load_cached_factors(cache_path, reason)

    # Only a successfully parsed download overwrites the cache.
    fresh.to_csv(cache_path)
    return fresh