Harden factor loader zip parsing and fallback
This commit is contained in:
@@ -1,7 +1,9 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import io
|
import io
|
||||||
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from urllib.error import URLError
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
@@ -11,7 +13,14 @@ def _download_kf_zip_bytes() -> bytes:
|
|||||||
|
|
||||||
|
|
||||||
def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
|
def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
|
||||||
text = raw_bytes.decode("utf-8")
|
with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
|
||||||
|
member_name = next(
|
||||||
|
name
|
||||||
|
for name in archive.namelist()
|
||||||
|
if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
|
||||||
|
)
|
||||||
|
text = archive.read(member_name).decode("utf-8-sig")
|
||||||
|
|
||||||
lines = [line for line in text.splitlines() if line.strip()]
|
lines = [line for line in text.splitlines() if line.strip()]
|
||||||
header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
|
header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
|
||||||
table = "\n".join(lines[header_index:])
|
table = "\n".join(lines[header_index:])
|
||||||
@@ -31,10 +40,12 @@ def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataF
|
|||||||
cache_path.parent.mkdir(parents=True, exist_ok=True)
|
cache_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
factors = _parse_kf_daily_csv(_download_kf_zip_bytes())
|
raw_bytes = _download_kf_zip_bytes()
|
||||||
factors.to_csv(cache_path)
|
except (URLError, TimeoutError, ConnectionError, OSError):
|
||||||
return factors
|
|
||||||
except Exception:
|
|
||||||
if cache_path.exists():
|
if cache_path.exists():
|
||||||
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
|
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
factors = _parse_kf_daily_csv(raw_bytes)
|
||||||
|
factors.to_csv(cache_path)
|
||||||
|
return factors
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
|
import io
|
||||||
import tempfile
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from urllib.error import URLError
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -9,7 +12,7 @@ from factor_attribution import load_external_us_factors
|
|||||||
|
|
||||||
|
|
||||||
class ExternalFactorLoaderTests(unittest.TestCase):
|
class ExternalFactorLoaderTests(unittest.TestCase):
|
||||||
def test_load_external_us_factors_parses_percent_values_and_dates(self):
|
def test_load_external_us_factors_parses_percent_values_and_dates_from_zip_payload(self):
|
||||||
csv_text = (
|
csv_text = (
|
||||||
"This line is ignored\n"
|
"This line is ignored\n"
|
||||||
",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
|
",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
|
||||||
@@ -17,11 +20,15 @@ class ExternalFactorLoaderTests(unittest.TestCase):
|
|||||||
"20260105,-0.20,0.10,0.30,-0.15,0.05,0.02\n"
|
"20260105,-0.20,0.10,0.30,-0.15,0.05,0.02\n"
|
||||||
"\n"
|
"\n"
|
||||||
)
|
)
|
||||||
|
zip_bytes = self._make_zip_bytes(
|
||||||
|
"F-F_Research_Data_5_Factors_2x3_daily.csv",
|
||||||
|
csv_text,
|
||||||
|
)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
with mock.patch(
|
with mock.patch(
|
||||||
"factor_attribution._download_kf_zip_bytes",
|
"factor_attribution._download_kf_zip_bytes",
|
||||||
return_value=csv_text.encode("utf-8"),
|
return_value=zip_bytes,
|
||||||
):
|
):
|
||||||
factors = load_external_us_factors(cache_dir=Path(tmpdir))
|
factors = load_external_us_factors(cache_dir=Path(tmpdir))
|
||||||
|
|
||||||
@@ -51,9 +58,38 @@ class ExternalFactorLoaderTests(unittest.TestCase):
|
|||||||
cached.to_csv(cache_dir / "ff5_us_daily.csv")
|
cached.to_csv(cache_dir / "ff5_us_daily.csv")
|
||||||
with mock.patch(
|
with mock.patch(
|
||||||
"factor_attribution._download_kf_zip_bytes",
|
"factor_attribution._download_kf_zip_bytes",
|
||||||
side_effect=RuntimeError("boom"),
|
side_effect=URLError("boom"),
|
||||||
):
|
):
|
||||||
factors = load_external_us_factors(cache_dir=cache_dir)
|
factors = load_external_us_factors(cache_dir=cache_dir)
|
||||||
|
|
||||||
self.assertEqual(len(factors), 1)
|
self.assertEqual(len(factors), 1)
|
||||||
self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
|
self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
|
||||||
|
|
||||||
|
def test_load_external_us_factors_raises_parse_errors_instead_of_using_cache(self):
|
||||||
|
cached = pd.DataFrame(
|
||||||
|
{
|
||||||
|
"MKT_RF": [0.01],
|
||||||
|
"SMB": [0.0],
|
||||||
|
"HML": [0.0],
|
||||||
|
"RMW": [0.0],
|
||||||
|
"CMA": [0.0],
|
||||||
|
"RF": [0.0001],
|
||||||
|
},
|
||||||
|
index=pd.to_datetime(["2026-01-02"]),
|
||||||
|
)
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
cache_dir = Path(tmpdir)
|
||||||
|
cached.to_csv(cache_dir / "ff5_us_daily.csv")
|
||||||
|
with mock.patch(
|
||||||
|
"factor_attribution._download_kf_zip_bytes",
|
||||||
|
return_value=b"not-a-zip-file",
|
||||||
|
):
|
||||||
|
with self.assertRaises(zipfile.BadZipFile):
|
||||||
|
load_external_us_factors(cache_dir=cache_dir)
|
||||||
|
|
||||||
|
def _make_zip_bytes(self, filename: str, contents: str) -> bytes:
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
with zipfile.ZipFile(buffer, mode="w") as archive:
|
||||||
|
archive.writestr(filename, contents)
|
||||||
|
return buffer.getvalue()
|
||||||
|
|||||||
Reference in New Issue
Block a user