From 9e6da727a346bf61fe5b4436b8e74927ccd9664e Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Tue, 7 Apr 2026 15:44:46 +0800
Subject: [PATCH] Implement Ken French factor download and cache fallback

---
 factor_attribution.py            | 30 +++++++++++++++++--
 tests/test_factor_attribution.py | 49 +++++++++++++++++++++++++++++---
 2 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/factor_attribution.py b/factor_attribution.py
index a635c85..1c1f5f5 100644
--- a/factor_attribution.py
+++ b/factor_attribution.py
@@ -4,12 +4,32 @@ import io
 import zipfile
 from pathlib import Path
 from urllib.error import URLError
+from urllib.request import Request, urlopen
 
 import pandas as pd
 
+KEN_FRENCH_DAILY_FF5_ZIP_URL = (
+    "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"
+    "F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
+)
+
+SOURCE_PARSE_EXCEPTIONS = (
+    zipfile.BadZipFile,
+    UnicodeDecodeError,
+    StopIteration,
+    KeyError,
+    ValueError,
+    pd.errors.ParserError,
+)
+
 
 def _download_kf_zip_bytes() -> bytes:
-    raise NotImplementedError
+    request = Request(
+        KEN_FRENCH_DAILY_FF5_ZIP_URL,
+        headers={"User-Agent": "quant-factor-attribution/0.1"},
+    )
+    with urlopen(request, timeout=30) as response:
+        return response.read()
 
 
 def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
@@ -46,6 +66,12 @@ def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataF
             return pd.read_csv(cache_path, index_col=0, parse_dates=True)
         raise
 
-    factors = _parse_kf_daily_csv(raw_bytes)
+    try:
+        factors = _parse_kf_daily_csv(raw_bytes)
+    except SOURCE_PARSE_EXCEPTIONS:
+        if cache_path.exists():
+            return pd.read_csv(cache_path, index_col=0, parse_dates=True)
+        raise
+
     factors.to_csv(cache_path)
     return factors
diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py
index 8950bca..9a1802b 100644
--- a/tests/test_factor_attribution.py
+++ b/tests/test_factor_attribution.py
@@ -8,10 +8,28 @@ from unittest import mock
 
 import pandas as pd
 
-from factor_attribution import load_external_us_factors
+from factor_attribution import (
+    KEN_FRENCH_DAILY_FF5_ZIP_URL,
+    _download_kf_zip_bytes,
+    load_external_us_factors,
+)
 
 
 class ExternalFactorLoaderTests(unittest.TestCase):
+    def test_download_kf_zip_bytes_fetches_official_ken_french_zip(self):
+        response = mock.MagicMock()
+        response.read.return_value = b"zip-bytes"
+        response.__enter__.return_value = response
+        response.__exit__.return_value = False
+
+        with mock.patch("factor_attribution.urlopen", return_value=response) as mocked_urlopen:
+            raw_bytes = _download_kf_zip_bytes()
+
+        self.assertEqual(raw_bytes, b"zip-bytes")
+        request = mocked_urlopen.call_args.args[0]
+        self.assertEqual(request.full_url, KEN_FRENCH_DAILY_FF5_ZIP_URL)
+        self.assertEqual(mocked_urlopen.call_args.kwargs["timeout"], 30)
+
     def test_load_external_us_factors_parses_percent_values_and_dates_from_zip_payload(self):
         csv_text = (
             "This line is ignored\n"
@@ -65,7 +83,7 @@ class ExternalFactorLoaderTests(unittest.TestCase):
         self.assertEqual(len(factors), 1)
         self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
 
-    def test_load_external_us_factors_raises_parse_errors_instead_of_using_cache(self):
+    def test_load_external_us_factors_falls_back_to_cache_when_parse_fails(self):
         cached = pd.DataFrame(
             {
                 "MKT_RF": [0.01],
@@ -85,8 +103,31 @@ class ExternalFactorLoaderTests(unittest.TestCase):
                 "factor_attribution._download_kf_zip_bytes",
                 return_value=b"not-a-zip-file",
             ):
-                with self.assertRaises(zipfile.BadZipFile):
-                    load_external_us_factors(cache_dir=cache_dir)
+                factors = load_external_us_factors(cache_dir=cache_dir)
+
+        self.assertEqual(len(factors), 1)
+        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
+
+    def test_load_external_us_factors_surfaces_cache_write_failures(self):
+        csv_text = (
+            "This line is ignored\n"
+            ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
+            "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
+            "\n"
+        )
+        zip_bytes = self._make_zip_bytes(
+            "F-F_Research_Data_5_Factors_2x3_daily.csv",
+            csv_text,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with mock.patch(
+                "factor_attribution._download_kf_zip_bytes",
+                return_value=zip_bytes,
+            ):
+                with mock.patch("pandas.DataFrame.to_csv", side_effect=OSError("disk full")):
+                    with self.assertRaises(OSError):
+                        load_external_us_factors(cache_dir=Path(tmpdir))
 
     def _make_zip_bytes(self, filename: str, contents: str) -> bytes:
         buffer = io.BytesIO()