From e70922d9afddb785143bfc2eeb98b8ab2124d37d Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Tue, 7 Apr 2026 15:38:49 +0800
Subject: [PATCH] Harden factor loader zip parsing and fallback

---
Notes (text in this section is not part of the commit message):

What the patch does
  * _parse_kf_daily_csv now opens the downloaded bytes as a zip archive,
    picks the first non-directory member whose name ends in .csv or .txt
    (case-insensitive) and decodes it as utf-8-sig, instead of decoding the
    raw payload as UTF-8 text.
  * load_external_us_factors only falls back to the cached CSV when the
    download call raises one of the listed network/OS errors. Parsing and
    cache writing moved out of the try block, so their errors propagate
    (the new test expects zipfile.BadZipFile even when a cache exists).

Review remarks
  * URLError, TimeoutError and ConnectionError are all subclasses of
    OSError, so the except tuple is equivalent to `except OSError:`.
  * next(...) is called without a default: an archive with no .csv/.txt
    member surfaces as a bare StopIteration. An explicit error naming the
    archive contents would be easier to diagnose.

 factor_attribution.py            | 21 ++++++++++++----
 tests/test_factor_attribution.py | 42 +++++++++++++++++++++++++++++---
 2 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/factor_attribution.py b/factor_attribution.py
index add7cf4..a635c85 100644
--- a/factor_attribution.py
+++ b/factor_attribution.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
 import io
+import zipfile
 from pathlib import Path
+from urllib.error import URLError
 
 import pandas as pd
 
@@ -11,7 +13,14 @@ def _download_kf_zip_bytes() -> bytes:
 
 
 def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
-    text = raw_bytes.decode("utf-8")
+    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
+        member_name = next(
+            name
+            for name in archive.namelist()
+            if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
+        )
+        text = archive.read(member_name).decode("utf-8-sig")
+
     lines = [line for line in text.splitlines() if line.strip()]
     header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
     table = "\n".join(lines[header_index:])
@@ -31,10 +40,12 @@ def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataF
     cache_path.parent.mkdir(parents=True, exist_ok=True)
 
     try:
-        factors = _parse_kf_daily_csv(_download_kf_zip_bytes())
-        factors.to_csv(cache_path)
-        return factors
-    except Exception:
+        raw_bytes = _download_kf_zip_bytes()
+    except (URLError, TimeoutError, ConnectionError, OSError):
         if cache_path.exists():
             return pd.read_csv(cache_path, index_col=0, parse_dates=True)
         raise
+
+    factors = _parse_kf_daily_csv(raw_bytes)
+    factors.to_csv(cache_path)
+    return factors
diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py
index 81b12fc..8950bca 100644
--- a/tests/test_factor_attribution.py
+++ b/tests/test_factor_attribution.py
@@ -1,6 +1,9 @@
+import io
 import tempfile
 import unittest
+import zipfile
 from pathlib import Path
+from urllib.error import URLError
 from unittest import mock
 
 import pandas as pd
@@ -9,7 +12,7 @@ from factor_attribution import load_external_us_factors
 
 
 class ExternalFactorLoaderTests(unittest.TestCase):
-    def test_load_external_us_factors_parses_percent_values_and_dates(self):
+    def test_load_external_us_factors_parses_percent_values_and_dates_from_zip_payload(self):
         csv_text = (
             "This line is ignored\n"
             ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
@@ -17,11 +20,15 @@ class ExternalFactorLoaderTests(unittest.TestCase):
             "20260105,-0.20,0.10,0.30,-0.15,0.05,0.02\n"
             "\n"
         )
+        zip_bytes = self._make_zip_bytes(
+            "F-F_Research_Data_5_Factors_2x3_daily.csv",
+            csv_text,
+        )
 
         with tempfile.TemporaryDirectory() as tmpdir:
             with mock.patch(
                 "factor_attribution._download_kf_zip_bytes",
-                return_value=csv_text.encode("utf-8"),
+                return_value=zip_bytes,
             ):
                 factors = load_external_us_factors(cache_dir=Path(tmpdir))
 
@@ -51,9 +58,38 @@ class ExternalFactorLoaderTests(unittest.TestCase):
             cached.to_csv(cache_dir / "ff5_us_daily.csv")
             with mock.patch(
                 "factor_attribution._download_kf_zip_bytes",
-                side_effect=RuntimeError("boom"),
+                side_effect=URLError("boom"),
             ):
                 factors = load_external_us_factors(cache_dir=cache_dir)
 
         self.assertEqual(len(factors), 1)
         self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
+
+    def test_load_external_us_factors_raises_parse_errors_instead_of_using_cache(self):
+        cached = pd.DataFrame(
+            {
+                "MKT_RF": [0.01],
+                "SMB": [0.0],
+                "HML": [0.0],
+                "RMW": [0.0],
+                "CMA": [0.0],
+                "RF": [0.0001],
+            },
+            index=pd.to_datetime(["2026-01-02"]),
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_dir = Path(tmpdir)
+            cached.to_csv(cache_dir / "ff5_us_daily.csv")
+            with mock.patch(
+                "factor_attribution._download_kf_zip_bytes",
+                return_value=b"not-a-zip-file",
+            ):
+                with self.assertRaises(zipfile.BadZipFile):
+                    load_external_us_factors(cache_dir=cache_dir)
+
+    def _make_zip_bytes(self, filename: str, contents: str) -> bytes:
+        buffer = io.BytesIO()
+        with zipfile.ZipFile(buffer, mode="w") as archive:
+            archive.writestr(filename, contents)
+        return buffer.getvalue()