Add factor loader and cache scaffolding

2026-04-07 15:27:44 +08:00
parent 80493cb6af
commit feb1864a4d
2 changed files with 99 additions and 0 deletions
--- a/factor_attribution.py
+++ b/factor_attribution.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import io
+from pathlib import Path
+
+import pandas as pd
+
+
+def _download_kf_zip_bytes() -> bytes:
+    """Placeholder for the factor-data download step.
+
+    Not implemented yet: every call raises ``NotImplementedError``.
+
+    NOTE(review): the name suggests this will eventually return the raw
+    bytes of a zipped factor file -- confirm once it is implemented.
+    """
+    raise NotImplementedError
+
+
+def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
+    """Parse a daily five-factor CSV payload into a date-indexed frame.
+
+    Everything before the first line containing ``Mkt-RF`` is treated as
+    preamble and dropped. Only rows whose first field is exactly eight
+    digits (``YYYYMMDD``) are kept, which discards trailing footer lines.
+
+    Args:
+        raw_bytes: UTF-8 encoded CSV text (already-extracted CSV content,
+            not a zip archive).
+
+    Returns:
+        A frame with an unnamed ``DatetimeIndex`` and the float columns
+        ``MKT_RF``, ``SMB``, ``HML``, ``RMW``, ``CMA`` and ``RF``. Every
+        value is the source value divided by 100 (presumably converting
+        percent to decimal returns -- confirm against the data source).
+
+    Raises:
+        UnicodeDecodeError: If ``raw_bytes`` is not valid UTF-8.
+        ValueError: If no header line containing ``Mkt-RF`` is found.
+        KeyError: If one of the six expected factor columns is missing.
+    """
+    factor_columns = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]
+
+    text = raw_bytes.decode("utf-8")
+    lines = [line for line in text.splitlines() if line.strip()]
+    header_index = next(
+        (i for i, line in enumerate(lines) if "Mkt-RF" in line), None
+    )
+    if header_index is None:
+        # A bare StopIteration from next() would hide what went wrong.
+        raise ValueError("no header line containing 'Mkt-RF' found in factor CSV")
+
+    table = "\n".join(lines[header_index:])
+    # skipinitialspace tolerates the space padding used in fixed-width-ish CSVs.
+    factors = pd.read_csv(io.StringIO(table), skipinitialspace=True)
+    factors.columns = [str(name).strip() for name in factors.columns]
+    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})
+
+    date_column = factors.columns[0]
+    date_text = factors[date_column].astype(str).str.strip()
+    is_daily_row = date_text.str.fullmatch(r"\d{8}")
+    # Copy so the assignment below never writes into a view of the raw table.
+    factors = factors.loc[is_daily_row].copy()
+    factors[date_column] = pd.to_datetime(
+        date_text[is_daily_row], format="%Y%m%d"
+    )
+    factors = factors.set_index(date_column)
+    factors.index.name = None
+
+    # Select first so an unrelated non-numeric column cannot break the cast.
+    return factors[factor_columns].astype(float) / 100.0
+
+
+def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
+    """Load the US factor table, falling back to a local CSV cache.
+
+    A fresh copy is obtained via ``_download_kf_zip_bytes`` and
+    ``_parse_kf_daily_csv``. On success the result is written to
+    ``<cache_dir>/ff5_us_daily.csv`` and returned. If fetching or parsing
+    fails, the cached file is returned instead when it exists.
+
+    Writing the cache is best-effort: a failure to create the directory or
+    write the file is logged and the freshly loaded data is still returned,
+    rather than being discarded in favour of a stale cache.
+
+    Args:
+        cache_dir: Directory holding the cache file; created on demand when
+            fresh data is about to be written.
+
+    Returns:
+        The factor frame, either freshly parsed or read back from the cache
+        with its first column as a parsed date index.
+
+    Raises:
+        Exception: Whatever the download/parse step raised, re-raised
+            unchanged when no cache file is available.
+    """
+    import logging  # local import keeps this block self-contained
+
+    logger = logging.getLogger(__name__)
+    cache_path = Path(cache_dir) / "ff5_us_daily.csv"
+
+    # Keep the try body to the fetch/parse step only, so that a cache-write
+    # problem can never be mistaken for a download failure.
+    try:
+        factors = _parse_kf_daily_csv(_download_kf_zip_bytes())
+    except Exception:
+        if cache_path.exists():
+            logger.warning(
+                "Factor download failed; using cached copy at %s",
+                cache_path,
+                exc_info=True,
+            )
+            return pd.read_csv(cache_path, index_col=0, parse_dates=True)
+        raise
+
+    try:
+        cache_path.parent.mkdir(parents=True, exist_ok=True)
+        factors.to_csv(cache_path)
+    except OSError:
+        logger.warning(
+            "Could not write factor cache to %s", cache_path, exc_info=True
+        )
+    return factors