78 lines
2.4 KiB
Python
78 lines
2.4 KiB
Python
from __future__ import annotations
|
|
|
|
import io
|
|
import zipfile
|
|
from pathlib import Path
|
|
from urllib.error import URLError
|
|
from urllib.request import Request, urlopen
|
|
|
|
import pandas as pd
|
|
|
|
# Download location of the zipped daily five-factor (2x3) CSV. The literal is
# split in two only to keep the directory and the file name on separate lines.
KEN_FRENCH_DAILY_FF5_ZIP_URL = (
    "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"
    "F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
)

# Exception types the loader treats as "the downloaded payload could not be
# parsed"; when one of these is raised the loader falls back to the cached CSV.
SOURCE_PARSE_EXCEPTIONS = (
    zipfile.BadZipFile,  # payload is not a ZIP archive
    UnicodeDecodeError,  # archive member is not valid UTF-8
    StopIteration,  # a bare next() found no matching member / header line
    KeyError,  # an expected column is missing
    ValueError,  # a value could not be converted
    pd.errors.ParserError,  # pandas could not tokenize the CSV text
)
|
|
|
|
|
|
def _download_kf_zip_bytes() -> bytes:
    """Fetch ``KEN_FRENCH_DAILY_FF5_ZIP_URL`` and return the response body.

    The request carries an explicit ``User-Agent`` header and uses a
    30-second timeout. Errors raised by ``urlopen`` or by reading the
    response are not handled here and propagate to the caller.
    """
    request_headers = {"User-Agent": "quant-factor-attribution/0.1"}
    kf_request = Request(KEN_FRENCH_DAILY_FF5_ZIP_URL, headers=request_headers)
    with urlopen(kf_request, timeout=30) as http_response:
        payload = http_response.read()
    return payload
|
|
|
|
|
|
def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse a zipped daily five-factor CSV into a date-indexed frame.

    The first ``.csv``/``.txt`` member of the archive is decoded as UTF-8
    (a leading BOM is tolerated). Blank lines are dropped, everything before
    the header row -- the first line containing ``Mkt-RF`` -- is discarded,
    and only rows whose first field is an eight-digit ``YYYYMMDD`` date are
    kept, which removes any non-data trailer lines.

    Args:
        raw_bytes: Contents of the ZIP archive.

    Returns:
        A frame with an unnamed ``DatetimeIndex`` and float columns
        ``MKT_RF``, ``SMB``, ``HML``, ``RMW``, ``CMA`` and ``RF`` (in that
        order). Source values are divided by 100 (presumably converting
        percent to decimal returns -- confirm against the data source).

    Raises:
        zipfile.BadZipFile: If ``raw_bytes`` is not a ZIP archive.
        UnicodeDecodeError: If the archive member is not valid UTF-8.
        StopIteration: If the archive has no ``.csv``/``.txt`` member, or no
            line contains ``Mkt-RF``.
        KeyError: If one of the six expected columns is missing.
        ValueError: If a date or factor value cannot be converted.
        pandas.errors.ParserError: If pandas cannot tokenize the table.
    """
    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
        member_name = next(
            name
            for name in archive.namelist()
            if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
        )
        text = archive.read(member_name).decode("utf-8-sig")

    lines = [line for line in text.splitlines() if line.strip()]
    header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
    table = "\n".join(lines[header_index:])

    # skipinitialspace + strip: header cells may be space-padded for alignment,
    # and a padded name such as "  SMB" would otherwise miss the rename and the
    # final column selection.
    factors = pd.read_csv(io.StringIO(table), skipinitialspace=True)
    factors.columns = [str(column).strip() for column in factors.columns]
    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})

    date_column = factors.columns[0]
    date_text = factors[date_column].astype(str).str.strip()
    is_daily_row = date_text.str.fullmatch(r"\d{8}")

    # Copy the filtered rows so the column assignment below targets an
    # independent frame rather than a slice of the parsed table.
    factors = factors.loc[is_daily_row].copy()
    factors[date_column] = pd.to_datetime(date_text.loc[is_daily_row], format="%Y%m%d")
    factors = factors.set_index(date_column)
    factors.index.name = None
    factors = factors.astype(float) / 100.0
    return factors[["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]]
|
|
|
|
|
|
def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
    """Download and parse the daily factor file, falling back to a CSV cache.

    On success the parsed frame is written to ``<cache_dir>/ff5_us_daily.csv``
    and returned. If the download or the parse fails and that cache file
    exists, the cached CSV is returned instead; if no cache exists the
    original exception is re-raised.

    Args:
        cache_dir: Directory holding the cache file. It is created (including
            parents) if missing.

    Returns:
        The freshly parsed frame, or the cached frame read with the first
        column as a parsed-date index.

    Raises:
        OSError: If the download fails (this covers ``URLError``,
            ``TimeoutError`` and ``ConnectionError``) and no cache exists, or
            if the cache directory or file cannot be written.
        http.client.HTTPException: If the HTTP exchange fails at protocol
            level (for example a truncated body) and no cache exists.
        Exception: Any member of ``SOURCE_PARSE_EXCEPTIONS`` if parsing fails
            and no cache exists.
    """
    # Function-scope import: the module's import block does not bring
    # http.client into scope.
    from http.client import HTTPException

    cache_path = Path(cache_dir) / "ff5_us_daily.csv"
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        raw_bytes = _download_kf_zip_bytes()
    except (OSError, HTTPException):
        # URLError, TimeoutError and ConnectionError are all OSError
        # subclasses. HTTPException is separate and covers failures such as
        # IncompleteRead, which should also use the cache.
        if cache_path.exists():
            return pd.read_csv(cache_path, index_col=0, parse_dates=True)
        raise

    try:
        factors = _parse_kf_daily_csv(raw_bytes)
    except SOURCE_PARSE_EXCEPTIONS:
        if cache_path.exists():
            return pd.read_csv(cache_path, index_col=0, parse_dates=True)
        raise

    # Write to a sibling file and rename it over the cache, so an interrupted
    # write cannot leave a truncated CSV for a later fallback to load.
    staging_path = cache_path.with_name(cache_path.name + ".tmp")
    factors.to_csv(staging_path)
    staging_path.replace(cache_path)
    return factors
|