Harden factor loader zip parsing and fallback

This commit is contained in:
2026-04-07 15:38:49 +08:00
parent feb1864a4d
commit e70922d9af
2 changed files with 55 additions and 8 deletions

View File

@@ -1,7 +1,9 @@
 from __future__ import annotations
 import io
+import zipfile
 from pathlib import Path
+from urllib.error import URLError
 import pandas as pd
@@ -11,7 +13,14 @@ def _download_kf_zip_bytes() -> bytes:
 def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
-    text = raw_bytes.decode("utf-8")
+    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
+        member_name = next(
+            name
+            for name in archive.namelist()
+            if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
+        )
+        text = archive.read(member_name).decode("utf-8-sig")
     lines = [line for line in text.splitlines() if line.strip()]
     header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
     table = "\n".join(lines[header_index:])
@@ -31,10 +40,12 @@ def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataF
     cache_path.parent.mkdir(parents=True, exist_ok=True)
     try:
-        factors = _parse_kf_daily_csv(_download_kf_zip_bytes())
-        factors.to_csv(cache_path)
-        return factors
-    except Exception:
+        raw_bytes = _download_kf_zip_bytes()
+    except (URLError, TimeoutError, ConnectionError, OSError):
         if cache_path.exists():
             return pd.read_csv(cache_path, index_col=0, parse_dates=True)
         raise
+    factors = _parse_kf_daily_csv(raw_bytes)
+    factors.to_csv(cache_path)
+    return factors

View File

@@ -1,6 +1,9 @@
+import io
 import tempfile
 import unittest
+import zipfile
 from pathlib import Path
+from urllib.error import URLError
 from unittest import mock
 import pandas as pd
@@ -9,7 +12,7 @@ from factor_attribution import load_external_us_factors
 class ExternalFactorLoaderTests(unittest.TestCase):
-    def test_load_external_us_factors_parses_percent_values_and_dates(self):
+    def test_load_external_us_factors_parses_percent_values_and_dates_from_zip_payload(self):
         csv_text = (
             "This line is ignored\n"
             ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
@@ -17,11 +20,15 @@ class ExternalFactorLoaderTests(unittest.TestCase):
             "20260105,-0.20,0.10,0.30,-0.15,0.05,0.02\n"
             "\n"
         )
+        zip_bytes = self._make_zip_bytes(
+            "F-F_Research_Data_5_Factors_2x3_daily.csv",
+            csv_text,
+        )
         with tempfile.TemporaryDirectory() as tmpdir:
             with mock.patch(
                 "factor_attribution._download_kf_zip_bytes",
-                return_value=csv_text.encode("utf-8"),
+                return_value=zip_bytes,
             ):
                 factors = load_external_us_factors(cache_dir=Path(tmpdir))
@@ -51,9 +58,38 @@ class ExternalFactorLoaderTests(unittest.TestCase):
         cached.to_csv(cache_dir / "ff5_us_daily.csv")
         with mock.patch(
             "factor_attribution._download_kf_zip_bytes",
-            side_effect=RuntimeError("boom"),
+            side_effect=URLError("boom"),
         ):
             factors = load_external_us_factors(cache_dir=cache_dir)
         self.assertEqual(len(factors), 1)
         self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)

+    def test_load_external_us_factors_raises_parse_errors_instead_of_using_cache(self):
+        cached = pd.DataFrame(
+            {
+                "MKT_RF": [0.01],
+                "SMB": [0.0],
+                "HML": [0.0],
+                "RMW": [0.0],
+                "CMA": [0.0],
+                "RF": [0.0001],
+            },
+            index=pd.to_datetime(["2026-01-02"]),
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_dir = Path(tmpdir)
+            cached.to_csv(cache_dir / "ff5_us_daily.csv")
+            with mock.patch(
+                "factor_attribution._download_kf_zip_bytes",
+                return_value=b"not-a-zip-file",
+            ):
+                with self.assertRaises(zipfile.BadZipFile):
+                    load_external_us_factors(cache_dir=cache_dir)
+
+    def _make_zip_bytes(self, filename: str, contents: str) -> bytes:
+        buffer = io.BytesIO()
+        with zipfile.ZipFile(buffer, mode="w") as archive:
+            archive.writestr(filename, contents)
+        return buffer.getvalue()