Narrow factor loader format fallback handling

This commit is contained in:
2026-04-07 15:51:57 +08:00
parent 9e6da727a3
commit 0e94688066
2 changed files with 148 additions and 23 deletions

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import io
import warnings
import zipfile
from pathlib import Path
from urllib.error import URLError
@@ -13,14 +14,11 @@ KEN_FRENCH_DAILY_FF5_ZIP_URL = (
"F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
)
SOURCE_PARSE_EXCEPTIONS = (
zipfile.BadZipFile,
UnicodeDecodeError,
StopIteration,
KeyError,
ValueError,
pd.errors.ParserError,
)
EXPECTED_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]
class ExternalFactorFormatError(ValueError):
    """Raised when the downloaded Ken French data does not match the expected format."""
def _download_kf_zip_bytes() -> bytes:
@@ -34,25 +32,65 @@ def _download_kf_zip_bytes() -> bytes:
def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse the Ken French daily FF5 ZIP payload into a factor DataFrame.

    Parameters
    ----------
    raw_bytes:
        Raw bytes of the downloaded ZIP archive.

    Returns
    -------
    pd.DataFrame
        Daily factor returns in decimal units (source publishes percent),
        indexed by date, with columns MKT_RF, SMB, HML, RMW, CMA, RF.

    Raises
    ------
    zipfile.BadZipFile
        If ``raw_bytes`` is not a valid ZIP archive.
    ExternalFactorFormatError
        If the archive contents do not match the expected factor layout.
    """
    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
        # Keep only real file entries (not directories) with a text-like suffix.
        member_names = [
            name
            for name in archive.namelist()
            if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
        ]
        if not member_names:
            raise ExternalFactorFormatError("Ken French archive did not contain a CSV or TXT file")
        try:
            # utf-8-sig strips a leading BOM if the file carries one.
            text = archive.read(member_names[0]).decode("utf-8-sig")
        except UnicodeDecodeError as exc:
            raise ExternalFactorFormatError("Ken French factor file was not valid UTF-8 text") from exc
    lines = [line for line in text.splitlines() if line.strip()]
    try:
        # The daily table starts at the header row that names the "Mkt-RF" column.
        header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
    except StopIteration as exc:
        raise ExternalFactorFormatError("Ken French factor file was missing the daily factor header") from exc
    table = "\n".join(lines[header_index:])
    try:
        factors = pd.read_csv(io.StringIO(table))
    except pd.errors.ParserError as exc:
        raise ExternalFactorFormatError("Ken French factor table could not be parsed") from exc
    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})
    # The first (unnamed) column of the published table holds YYYYMMDD dates.
    date_column = factors.columns[0]
    missing_columns = [column for column in EXPECTED_FACTOR_COLUMNS if column not in factors.columns]
    if missing_columns:
        raise ExternalFactorFormatError(
            f"Ken French factor table was missing columns: {', '.join(missing_columns)}"
        )
    # Keep only 8-digit daily rows; this drops any trailing annual/copyright blocks.
    factors = factors[factors[date_column].astype(str).str.fullmatch(r"\d{8}")]
    if factors.empty:
        raise ExternalFactorFormatError("Ken French factor table did not contain daily rows")
    try:
        factors[date_column] = pd.to_datetime(factors[date_column], format="%Y%m%d")
    except ValueError as exc:
        raise ExternalFactorFormatError("Ken French factor table contained invalid dates") from exc
    factors = factors.set_index(date_column)
    factors.index.name = None
    try:
        # Source values are percentages; convert to decimal returns.
        factors = factors[EXPECTED_FACTOR_COLUMNS].astype(float) / 100.0
    except ValueError as exc:
        raise ExternalFactorFormatError("Ken French factor table contained non-numeric values") from exc
    return factors
def _warn_and_load_cached_factors(cache_path: Path, reason: str) -> pd.DataFrame:
warnings.warn(
f"Using cached data from {cache_path} because {reason}.",
UserWarning,
stacklevel=2,
)
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
@@ -61,16 +99,23 @@ def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataF
try:
raw_bytes = _download_kf_zip_bytes()
except (URLError, TimeoutError, ConnectionError, OSError):
except (URLError, TimeoutError, ConnectionError) as exc:
if cache_path.exists():
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
return _warn_and_load_cached_factors(cache_path, f"download failed: {exc}")
raise
try:
factors = _parse_kf_daily_csv(raw_bytes)
except SOURCE_PARSE_EXCEPTIONS:
except zipfile.BadZipFile as exc:
if cache_path.exists():
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
return _warn_and_load_cached_factors(cache_path, f"the upstream ZIP was invalid: {exc}")
raise
except ExternalFactorFormatError as exc:
if cache_path.exists():
return _warn_and_load_cached_factors(
cache_path,
f"the upstream factor format was invalid: {exc}",
)
raise
factors.to_csv(cache_path)

View File

@@ -9,8 +9,10 @@ from unittest import mock
import pandas as pd
from factor_attribution import (
ExternalFactorFormatError,
KEN_FRENCH_DAILY_FF5_ZIP_URL,
_download_kf_zip_bytes,
_parse_kf_daily_csv,
load_external_us_factors,
)
@@ -78,12 +80,52 @@ class ExternalFactorLoaderTests(unittest.TestCase):
"factor_attribution._download_kf_zip_bytes",
side_effect=URLError("boom"),
):
with self.assertWarnsRegex(UserWarning, "cached data"):
factors = load_external_us_factors(cache_dir=cache_dir)
self.assertEqual(len(factors), 1)
self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
def test_load_external_us_factors_falls_back_to_cache_when_parse_fails(self):
def test_parse_kf_daily_csv_raises_external_factor_format_error_for_missing_header(self):
# A structurally valid ZIP whose CSV lacks the "Mkt-RF" header row: the
# parser should surface the typed format error, not a raw StopIteration.
zip_bytes = self._make_zip_bytes(
"F-F_Research_Data_5_Factors_2x3_daily.csv",
"not the expected file format\n20260102,1.00\n",
)
with self.assertRaises(ExternalFactorFormatError):
_parse_kf_daily_csv(zip_bytes)
def test_load_external_us_factors_warns_and_falls_back_to_cache_when_source_format_is_invalid(self):
# Seed the cache with one known daily row so the fallback has data to serve.
cached = pd.DataFrame(
{
"MKT_RF": [0.01],
"SMB": [0.0],
"HML": [0.0],
"RMW": [0.0],
"CMA": [0.0],
"RF": [0.0001],
},
index=pd.to_datetime(["2026-01-02"]),
)
with tempfile.TemporaryDirectory() as tmpdir:
cache_dir = Path(tmpdir)
cached.to_csv(cache_dir / "ff5_us_daily.csv")
# Valid ZIP, but the CSV inside has no daily factor header.
malformed_zip_bytes = self._make_zip_bytes(
"F-F_Research_Data_5_Factors_2x3_daily.csv",
"not the expected file format\n20260102,1.00\n",
)
with mock.patch(
"factor_attribution._download_kf_zip_bytes",
return_value=malformed_zip_bytes,
):
# The loader must warn about the cache fallback and return the cached row.
with self.assertWarnsRegex(UserWarning, "cached data"):
factors = load_external_us_factors(cache_dir=cache_dir)
self.assertEqual(len(factors), 1)
self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
def test_load_external_us_factors_warns_and_falls_back_to_cache_when_zip_is_invalid(self):
cached = pd.DataFrame(
{
"MKT_RF": [0.01],
@@ -103,6 +145,7 @@ class ExternalFactorLoaderTests(unittest.TestCase):
"factor_attribution._download_kf_zip_bytes",
return_value=b"not-a-zip-file",
):
with self.assertWarnsRegex(UserWarning, "cached data"):
factors = load_external_us_factors(cache_dir=cache_dir)
self.assertEqual(len(factors), 1)
@@ -129,6 +172,43 @@ class ExternalFactorLoaderTests(unittest.TestCase):
with self.assertRaises(OSError):
load_external_us_factors(cache_dir=Path(tmpdir))
def test_load_external_us_factors_does_not_swallow_unrelated_local_failures(self):
# A fully well-formed upstream payload, so any failure comes from our patch below.
csv_text = (
"This line is ignored\n"
",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
"20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
"\n"
)
zip_bytes = self._make_zip_bytes(
"F-F_Research_Data_5_Factors_2x3_daily.csv",
csv_text,
)
# A populated cache is present, tempting the loader to fall back silently.
cached = pd.DataFrame(
{
"MKT_RF": [0.01],
"SMB": [0.0],
"HML": [0.0],
"RMW": [0.0],
"CMA": [0.0],
"RF": [0.0001],
},
index=pd.to_datetime(["2026-01-02"]),
)
with tempfile.TemporaryDirectory() as tmpdir:
cache_dir = Path(tmpdir)
cached.to_csv(cache_dir / "ff5_us_daily.csv")
with mock.patch(
"factor_attribution._download_kf_zip_bytes",
return_value=zip_bytes,
):
# Simulate an unrelated local bug inside parsing: the loader must
# propagate it rather than masking it with the cache fallback.
with mock.patch(
"factor_attribution._parse_kf_daily_csv",
side_effect=RuntimeError("unexpected local bug"),
):
with self.assertRaises(RuntimeError):
load_external_us_factors(cache_dir=cache_dir)
def _make_zip_bytes(self, filename: str, contents: str) -> bytes:
buffer = io.BytesIO()
with zipfile.ZipFile(buffer, mode="w") as archive: