diff --git a/factor_attribution.py b/factor_attribution.py
new file mode 100644
index 0000000..add7cf4
--- /dev/null
+++ b/factor_attribution.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import io
+import logging
+from pathlib import Path
+
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]
+
+
+def _download_kf_zip_bytes() -> bytes:
+    """Return the raw bytes of the daily factor data set.
+
+    Placeholder: downloading is not implemented yet, so this always raises.
+    The result is passed unchanged to ``_parse_kf_daily_csv``, which decodes
+    it as UTF-8 CSV text, so an implementation has to return the extracted
+    CSV payload rather than a compressed archive.
+    """
+    raise NotImplementedError
+
+
+def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
+    """Parse the factor CSV payload into a date-indexed frame of decimals.
+
+    Lines before the header row (the first line containing ``Mkt-RF``) are
+    skipped, and only rows whose first field is an eight-digit ``YYYYMMDD``
+    date are kept, so trailing notes are dropped. Values are divided by 100
+    to turn percentages into decimal fractions.
+
+    Raises:
+        ValueError: If no line contains ``Mkt-RF``.
+        KeyError: If an expected factor column is missing.
+    """
+    text = raw_bytes.decode("utf-8")
+    lines = [line for line in text.splitlines() if line.strip()]
+    header_index = next(
+        (i for i, line in enumerate(lines) if "Mkt-RF" in line), None
+    )
+    if header_index is None:
+        # A bare next() would surface as an uninformative StopIteration.
+        raise ValueError("no 'Mkt-RF' header row found in factor CSV")
+    table = "\n".join(lines[header_index:])
+    raw = pd.read_csv(io.StringIO(table)).rename(columns={"Mkt-RF": "MKT_RF"})
+
+    date_text = raw.iloc[:, 0].astype(str).str.strip()
+    is_daily_row = date_text.str.fullmatch(r"\d{8}")
+    # Build a new frame instead of assigning into the filtered slice, which
+    # makes pandas emit SettingWithCopyWarning.
+    factors = raw.loc[is_daily_row, _FACTOR_COLUMNS].astype(float) / 100.0
+    dates = pd.to_datetime(date_text[is_daily_row], format="%Y%m%d")
+    factors.index = pd.DatetimeIndex(dates)
+    factors.index.name = None
+    return factors
+
+
+def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
+    """Load the daily factor table, with an on-disk CSV cache as fallback.
+
+    A successful download is parsed, saved to ``ff5_us_daily.csv`` inside
+    ``cache_dir`` and returned. If downloading or parsing fails, the cached
+    file is returned when it exists; otherwise the exception propagates.
+
+    Args:
+        cache_dir: Directory holding the cache file; created if missing.
+    """
+    cache_path = Path(cache_dir) / "ff5_us_daily.csv"
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+
+    try:
+        factors = _parse_kf_daily_csv(_download_kf_zip_bytes())
+    except Exception:
+        # Deliberately broad: any download or parse failure is recoverable
+        # as long as an earlier run left a cache behind.
+        if not cache_path.exists():
+            raise
+        logger.warning(
+            "Factor download failed; using cache at %s",
+            cache_path,
+            exc_info=True,
+        )
+        return pd.read_csv(cache_path, index_col=0, parse_dates=True)
+
+    # Write to a sibling temp file and rename it into place so a failed
+    # write cannot truncate the cache, and keep the fresh data regardless.
+    temp_path = cache_path.with_name(cache_path.name + ".tmp")
+    try:
+        factors.to_csv(temp_path)
+        temp_path.replace(cache_path)
+    except OSError:
+        logger.warning(
+            "Could not update factor cache at %s", cache_path, exc_info=True
+        )
+    return factors
diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py
new file mode 100644
index 0000000..81b12fc
--- /dev/null
+++ b/tests/test_factor_attribution.py
@@ -0,0 +1,85 @@
+import tempfile
+import unittest
+from pathlib import Path
+from unittest import mock
+
+import pandas as pd
+
+from factor_attribution import load_external_us_factors
+
+
+class ExternalFactorLoaderTests(unittest.TestCase):
+    def test_load_external_us_factors_parses_percent_values_and_dates(self):
+        csv_text = (
+            "This line is ignored\n"
+            ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
+            "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
+            "20260105,-0.20,0.10,0.30,-0.15,0.05,0.02\n"
+            "\n"
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with mock.patch(
+                "factor_attribution._download_kf_zip_bytes",
+                return_value=csv_text.encode("utf-8"),
+            ):
+                factors = load_external_us_factors(cache_dir=Path(tmpdir))
+
+        self.assertListEqual(
+            list(factors.columns),
+            ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"],
+        )
+        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
+        self.assertAlmostEqual(factors.iloc[0]["RF"], 0.0002)
+        self.assertEqual(str(factors.index[0].date()), "2026-01-02")
+
+    def test_load_external_us_factors_falls_back_to_cache_when_download_fails(self):
+        cached = pd.DataFrame(
+            {
+                "MKT_RF": [0.01],
+                "SMB": [0.0],
+                "HML": [0.0],
+                "RMW": [0.0],
+                "CMA": [0.0],
+                "RF": [0.0001],
+            },
+            index=pd.to_datetime(["2026-01-02"]),
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_dir = Path(tmpdir)
+            cached.to_csv(cache_dir / "ff5_us_daily.csv")
+            with mock.patch(
+                "factor_attribution._download_kf_zip_bytes",
+                side_effect=RuntimeError("boom"),
+            ):
+                factors = load_external_us_factors(cache_dir=cache_dir)
+
+        self.assertEqual(len(factors), 1)
+        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
+
+    def test_load_external_us_factors_ignores_trailing_non_data_rows(self):
+        csv_text = (
+            ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
+            "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
+            "Trailing note\n"
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with mock.patch(
+                "factor_attribution._download_kf_zip_bytes",
+                return_value=csv_text.encode("utf-8"),
+            ):
+                factors = load_external_us_factors(cache_dir=Path(tmpdir))
+
+        self.assertEqual(len(factors), 1)
+        self.assertAlmostEqual(factors.iloc[0]["SMB"], 0.005)
+
+    def test_load_external_us_factors_raises_when_header_missing_and_no_cache(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with mock.patch(
+                "factor_attribution._download_kf_zip_bytes",
+                return_value=b"no header here\n",
+            ):
+                with self.assertRaises(ValueError):
+                    load_external_us_factors(cache_dir=Path(tmpdir))