Add factor loader and cache scaffolding
This commit is contained in:
40
factor_attribution.py
Normal file
40
factor_attribution.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def _download_kf_zip_bytes() -> bytes:
    """Return the raw bytes of the daily five-factor data set.

    This is a placeholder: no download is implemented, so every call raises.
    ``load_external_us_factors`` treats that like any other download failure
    and falls back to its on-disk cache when one exists.

    Raises:
        NotImplementedError: Always.
    """
    # An explicit message makes the cause obvious in tracebacks and logs.
    raise NotImplementedError(
        "_download_kf_zip_bytes is a stub; the factor download is not implemented"
    )
|
||||
|
||||
|
||||
def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse a daily five-factor CSV payload into a decimal-return frame.

    The payload may carry free-text lines before the table and non-data rows
    after it. The table starts at the first line containing ``Mkt-RF``; only
    rows whose first field is an 8-digit ``YYYYMMDD`` date are kept.

    Args:
        raw_bytes: UTF-8 encoded CSV text, or a zip archive holding that CSV
            (the first ``.csv`` member is used, else the first member).

    Returns:
        A frame indexed by an unnamed ``DatetimeIndex`` with float columns
        ``MKT_RF, SMB, HML, RMW, CMA, RF``, each divided by 100 (percent to
        decimal).

    Raises:
        ValueError: If no header line containing ``Mkt-RF`` is found, or the
            zip archive is empty.
        KeyError: If any of the six expected columns is missing.
    """
    import zipfile

    # The upstream helper is named for zip bytes; unwrap an archive if we are
    # actually handed one instead of failing in the UTF-8 decode below.
    if raw_bytes.startswith(b"PK\x03\x04"):
        with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
            members = archive.namelist()
            if not members:
                raise ValueError("factor archive contains no files")
            csv_members = [m for m in members if m.lower().endswith(".csv")]
            raw_bytes = archive.read((csv_members or members)[0])

    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM.
    text = raw_bytes.decode("utf-8-sig")
    lines = [line for line in text.splitlines() if line.strip()]

    header_index = next(
        (i for i, line in enumerate(lines) if "Mkt-RF" in line), None
    )
    if header_index is None:
        # A bare StopIteration from next() would be an opaque failure.
        raise ValueError("no 'Mkt-RF' header row found in factor data")

    factors = pd.read_csv(io.StringIO("\n".join(lines[header_index:])))
    factors.columns = [str(name).strip() for name in factors.columns]
    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})

    # Trailing non-data rows make the first column textual, so match on the
    # stripped string form; na=False keeps missing cells out of the mask.
    date_text = factors[factors.columns[0]].astype(str).str.strip()
    is_daily_row = date_text.str.fullmatch(r"\d{8}", na=False)
    dates = pd.to_datetime(date_text[is_daily_row], format="%Y%m%d")

    # Select the wanted columns before converting so unrelated columns cannot
    # break the float cast, and build a fresh frame rather than assigning
    # into a filtered slice.
    wanted = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]
    result = factors.loc[is_daily_row, wanted].astype(float) / 100.0
    result.index = pd.DatetimeIndex(dates)
    result.index.name = None
    return result
|
||||
|
||||
|
||||
def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
    """Load daily US five-factor returns, refreshing an on-disk CSV cache.

    A fresh copy is obtained via ``_download_kf_zip_bytes`` and
    ``_parse_kf_daily_csv``. On success the result is written to
    ``<cache_dir>/ff5_us_daily.csv`` and returned. If the download or parse
    fails, the cached CSV is returned when present.

    Args:
        cache_dir: Directory holding ``ff5_us_daily.csv``. It is created
            (with parents) when a fresh copy is written.

    Returns:
        The factor frame, either freshly parsed or read back from the cache
        with its first column parsed as the date index.

    Raises:
        Exception: Whatever the download/parse step raised, when no cache
            file exists to fall back on.
    """
    import logging

    log = logging.getLogger(__name__)
    cache_path = Path(cache_dir) / "ff5_us_daily.csv"

    # Only the fetch/parse step is guarded by the cache fallback; a failure
    # while *writing* the cache must not throw away freshly fetched data.
    try:
        factors = _parse_kf_daily_csv(_download_kf_zip_bytes())
    except Exception:
        if cache_path.exists():
            log.warning(
                "Factor download failed; using cached copy at %s",
                cache_path,
                exc_info=True,
            )
            return pd.read_csv(cache_path, index_col=0, parse_dates=True)
        raise

    # Caching is best-effort. Write to a sibling temp file and rename it over
    # the target so an interrupted write cannot leave a truncated cache.
    try:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = cache_path.with_name(cache_path.name + ".tmp")
        factors.to_csv(tmp_path)
        tmp_path.replace(cache_path)
    except OSError:
        log.warning("Could not update factor cache at %s", cache_path, exc_info=True)

    return factors
|
||||
59
tests/test_factor_attribution.py
Normal file
59
tests/test_factor_attribution.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from factor_attribution import load_external_us_factors
|
||||
|
||||
|
||||
class ExternalFactorLoaderTests(unittest.TestCase):
    """Exercise ``load_external_us_factors`` with the download step patched.

    Every test replaces ``factor_attribution._download_kf_zip_bytes`` so no
    network access happens, and uses a temporary directory as the cache.
    """

    DOWNLOAD_TARGET = "factor_attribution._download_kf_zip_bytes"
    CACHE_FILENAME = "ff5_us_daily.csv"
    FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]

    def test_load_external_us_factors_parses_percent_values_and_dates(self):
        """A successful download is parsed, scaled to decimals, and cached."""
        csv_text = (
            "This line is ignored\n"
            ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
            "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
            "20260105,-0.20,0.10,0.30,-0.15,0.05,0.02\n"
            "\n"
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            with mock.patch(
                self.DOWNLOAD_TARGET,
                return_value=csv_text.encode("utf-8"),
            ):
                factors = load_external_us_factors(cache_dir=Path(tmpdir))
            # Check inside the context: the directory is removed on exit.
            cache_written = (Path(tmpdir) / self.CACHE_FILENAME).exists()

        self.assertListEqual(list(factors.columns), self.FACTOR_COLUMNS)
        self.assertEqual(len(factors), 2)
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
        self.assertAlmostEqual(factors.iloc[0]["RF"], 0.0002)
        self.assertEqual(str(factors.index[0].date()), "2026-01-02")
        self.assertTrue(cache_written)

    def test_load_external_us_factors_falls_back_to_cache_when_download_fails(self):
        """A failed download returns the existing cache with dates parsed."""
        cached = pd.DataFrame(
            {
                "MKT_RF": [0.01],
                "SMB": [0.0],
                "HML": [0.0],
                "RMW": [0.0],
                "CMA": [0.0],
                "RF": [0.0001],
            },
            index=pd.to_datetime(["2026-01-02"]),
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            cache_dir = Path(tmpdir)
            cached.to_csv(cache_dir / self.CACHE_FILENAME)
            with mock.patch(
                self.DOWNLOAD_TARGET,
                side_effect=RuntimeError("boom"),
            ):
                factors = load_external_us_factors(cache_dir=cache_dir)

        self.assertEqual(len(factors), 1)
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
        self.assertEqual(str(factors.index[0].date()), "2026-01-02")

    def test_load_external_us_factors_reraises_when_download_fails_without_cache(self):
        """With no cache to fall back on, the download error propagates."""
        with tempfile.TemporaryDirectory() as tmpdir:
            with mock.patch(
                self.DOWNLOAD_TARGET,
                side_effect=RuntimeError("boom"),
            ):
                with self.assertRaises(RuntimeError):
                    load_external_us_factors(cache_dir=Path(tmpdir))
|
||||
Reference in New Issue
Block a user