From 0e946880660cb0f8b649c160e5d0e12b9b88fb97 Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Tue, 7 Apr 2026 15:51:57 +0800
Subject: [PATCH] Narrow factor loader format fallback handling

---
Review notes (text between the "---" line and the first "diff --git" line is
not part of the commit message):

  * _parse_kf_daily_csv now raises ExternalFactorFormatError (a ValueError
    subclass) for: no CSV/TXT member in the archive, non-UTF-8 content, a
    missing "Mkt-RF" header line, a pandas ParserError, missing expected
    columns, no 8-digit date rows, unparseable dates, and non-numeric values.
  * load_external_us_factors falls back to the cached CSV only for
    URLError / TimeoutError / ConnectionError on download and for
    zipfile.BadZipFile / ExternalFactorFormatError on parse; every fallback
    goes through _warn_and_load_cached_factors, which emits a UserWarning.
    Any other exception propagates even when a cache file exists.
  * NOTE(review): factors[date_column] = pd.to_datetime(...) assigns into the
    frame returned by the boolean-mask filter just above it; pandas may emit
    SettingWithCopyWarning there. This predates the patch; a follow-up could
    filter with .copy().
  * NOTE(review): warnings.warn(..., stacklevel=2) is issued from inside
    _warn_and_load_cached_factors, so the warning is attributed to
    load_external_us_factors rather than to its caller; stacklevel=3 would
    point at the caller. Confirm which location is intended.

 factor_attribution.py            | 85 +++++++++++++++++++++++--------
 tests/test_factor_attribution.py | 86 ++++++++++++++++++++++++++++++--
 2 files changed, 148 insertions(+), 23 deletions(-)

diff --git a/factor_attribution.py b/factor_attribution.py
index 1c1f5f5..b81180b 100644
--- a/factor_attribution.py
+++ b/factor_attribution.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import io
+import warnings
 import zipfile
 from pathlib import Path
 from urllib.error import URLError
@@ -13,14 +14,11 @@ KEN_FRENCH_DAILY_FF5_ZIP_URL = (
     "F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
 )
 
-SOURCE_PARSE_EXCEPTIONS = (
-    zipfile.BadZipFile,
-    UnicodeDecodeError,
-    StopIteration,
-    KeyError,
-    ValueError,
-    pd.errors.ParserError,
-)
+EXPECTED_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]
+
+
+class ExternalFactorFormatError(ValueError):
+    pass
 
 
 def _download_kf_zip_bytes() -> bytes:
@@ -34,25 +32,65 @@ def _download_kf_zip_bytes() -> bytes:
 
 def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
     with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
-        member_name = next(
+        member_names = [
             name
             for name in archive.namelist()
             if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
-        )
-        text = archive.read(member_name).decode("utf-8-sig")
+        ]
+        if not member_names:
+            raise ExternalFactorFormatError("Ken French archive did not contain a CSV or TXT file")
+
+        try:
+            text = archive.read(member_names[0]).decode("utf-8-sig")
+        except UnicodeDecodeError as exc:
+            raise ExternalFactorFormatError("Ken French factor file was not valid UTF-8 text") from exc
 
     lines = [line for line in text.splitlines() if line.strip()]
-    header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
+    try:
+        header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
+    except StopIteration as exc:
+        raise ExternalFactorFormatError("Ken French factor file was missing the daily factor header") from exc
+
     table = "\n".join(lines[header_index:])
-    factors = pd.read_csv(io.StringIO(table))
+    try:
+        factors = pd.read_csv(io.StringIO(table))
+    except pd.errors.ParserError as exc:
+        raise ExternalFactorFormatError("Ken French factor table could not be parsed") from exc
+
     factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})
     date_column = factors.columns[0]
+    missing_columns = [column for column in EXPECTED_FACTOR_COLUMNS if column not in factors.columns]
+    if missing_columns:
+        raise ExternalFactorFormatError(
+            f"Ken French factor table was missing columns: {', '.join(missing_columns)}"
+        )
+
     factors = factors[factors[date_column].astype(str).str.fullmatch(r"\d{8}")]
-    factors[date_column] = pd.to_datetime(factors[date_column], format="%Y%m%d")
+    if factors.empty:
+        raise ExternalFactorFormatError("Ken French factor table did not contain daily rows")
+
+    try:
+        factors[date_column] = pd.to_datetime(factors[date_column], format="%Y%m%d")
+    except ValueError as exc:
+        raise ExternalFactorFormatError("Ken French factor table contained invalid dates") from exc
+
     factors = factors.set_index(date_column)
     factors.index.name = None
-    factors = factors.astype(float) / 100.0
-    return factors[["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]]
+    try:
+        factors = factors[EXPECTED_FACTOR_COLUMNS].astype(float) / 100.0
+    except ValueError as exc:
+        raise ExternalFactorFormatError("Ken French factor table contained non-numeric values") from exc
+
+    return factors
+
+
+def _warn_and_load_cached_factors(cache_path: Path, reason: str) -> pd.DataFrame:
+    warnings.warn(
+        f"Using cached data from {cache_path} because {reason}.",
+        UserWarning,
+        stacklevel=2,
+    )
+    return pd.read_csv(cache_path, index_col=0, parse_dates=True)
 
 
 def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
@@ -61,16 +99,23 @@ def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataF
 
     try:
         raw_bytes = _download_kf_zip_bytes()
-    except (URLError, TimeoutError, ConnectionError, OSError):
+    except (URLError, TimeoutError, ConnectionError) as exc:
         if cache_path.exists():
-            return pd.read_csv(cache_path, index_col=0, parse_dates=True)
+            return _warn_and_load_cached_factors(cache_path, f"download failed: {exc}")
         raise
 
     try:
         factors = _parse_kf_daily_csv(raw_bytes)
-    except SOURCE_PARSE_EXCEPTIONS:
+    except zipfile.BadZipFile as exc:
         if cache_path.exists():
-            return pd.read_csv(cache_path, index_col=0, parse_dates=True)
+            return _warn_and_load_cached_factors(cache_path, f"the upstream ZIP was invalid: {exc}")
+        raise
+    except ExternalFactorFormatError as exc:
+        if cache_path.exists():
+            return _warn_and_load_cached_factors(
+                cache_path,
+                f"the upstream factor format was invalid: {exc}",
+            )
         raise
 
     factors.to_csv(cache_path)
diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py
index 9a1802b..0ad12eb 100644
--- a/tests/test_factor_attribution.py
+++ b/tests/test_factor_attribution.py
@@ -9,8 +9,10 @@ from unittest import mock
 import pandas as pd
 
 from factor_attribution import (
+    ExternalFactorFormatError,
     KEN_FRENCH_DAILY_FF5_ZIP_URL,
     _download_kf_zip_bytes,
+    _parse_kf_daily_csv,
     load_external_us_factors,
 )
 
@@ -78,12 +80,52 @@ class ExternalFactorLoaderTests(unittest.TestCase):
                 "factor_attribution._download_kf_zip_bytes",
                 side_effect=URLError("boom"),
             ):
-                factors = load_external_us_factors(cache_dir=cache_dir)
+                with self.assertWarnsRegex(UserWarning, "cached data"):
+                    factors = load_external_us_factors(cache_dir=cache_dir)
 
         self.assertEqual(len(factors), 1)
         self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
 
-    def test_load_external_us_factors_falls_back_to_cache_when_parse_fails(self):
+    def test_parse_kf_daily_csv_raises_external_factor_format_error_for_missing_header(self):
+        zip_bytes = self._make_zip_bytes(
+            "F-F_Research_Data_5_Factors_2x3_daily.csv",
+            "not the expected file format\n20260102,1.00\n",
+        )
+
+        with self.assertRaises(ExternalFactorFormatError):
+            _parse_kf_daily_csv(zip_bytes)
+
+    def test_load_external_us_factors_warns_and_falls_back_to_cache_when_source_format_is_invalid(self):
+        cached = pd.DataFrame(
+            {
+                "MKT_RF": [0.01],
+                "SMB": [0.0],
+                "HML": [0.0],
+                "RMW": [0.0],
+                "CMA": [0.0],
+                "RF": [0.0001],
+            },
+            index=pd.to_datetime(["2026-01-02"]),
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_dir = Path(tmpdir)
+            cached.to_csv(cache_dir / "ff5_us_daily.csv")
+            malformed_zip_bytes = self._make_zip_bytes(
+                "F-F_Research_Data_5_Factors_2x3_daily.csv",
+                "not the expected file format\n20260102,1.00\n",
+            )
+            with mock.patch(
+                "factor_attribution._download_kf_zip_bytes",
+                return_value=malformed_zip_bytes,
+            ):
+                with self.assertWarnsRegex(UserWarning, "cached data"):
+                    factors = load_external_us_factors(cache_dir=cache_dir)
+
+        self.assertEqual(len(factors), 1)
+        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
+
+    def test_load_external_us_factors_warns_and_falls_back_to_cache_when_zip_is_invalid(self):
         cached = pd.DataFrame(
             {
                 "MKT_RF": [0.01],
@@ -103,7 +145,8 @@ class ExternalFactorLoaderTests(unittest.TestCase):
                 "factor_attribution._download_kf_zip_bytes",
                 return_value=b"not-a-zip-file",
             ):
-                factors = load_external_us_factors(cache_dir=cache_dir)
+                with self.assertWarnsRegex(UserWarning, "cached data"):
+                    factors = load_external_us_factors(cache_dir=cache_dir)
 
         self.assertEqual(len(factors), 1)
         self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
@@ -129,6 +172,43 @@ class ExternalFactorLoaderTests(unittest.TestCase):
                 with self.assertRaises(OSError):
                     load_external_us_factors(cache_dir=Path(tmpdir))
 
+    def test_load_external_us_factors_does_not_swallow_unrelated_local_failures(self):
+        csv_text = (
+            "This line is ignored\n"
+            ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
+            "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
+            "\n"
+        )
+        zip_bytes = self._make_zip_bytes(
+            "F-F_Research_Data_5_Factors_2x3_daily.csv",
+            csv_text,
+        )
+        cached = pd.DataFrame(
+            {
+                "MKT_RF": [0.01],
+                "SMB": [0.0],
+                "HML": [0.0],
+                "RMW": [0.0],
+                "CMA": [0.0],
+                "RF": [0.0001],
+            },
+            index=pd.to_datetime(["2026-01-02"]),
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_dir = Path(tmpdir)
+            cached.to_csv(cache_dir / "ff5_us_daily.csv")
+            with mock.patch(
+                "factor_attribution._download_kf_zip_bytes",
+                return_value=zip_bytes,
+            ):
+                with mock.patch(
+                    "factor_attribution._parse_kf_daily_csv",
+                    side_effect=RuntimeError("unexpected local bug"),
+                ):
+                    with self.assertRaises(RuntimeError):
+                        load_external_us_factors(cache_dir=cache_dir)
+
     def _make_zip_bytes(self, filename: str, contents: str) -> bytes:
         buffer = io.BytesIO()
         with zipfile.ZipFile(buffer, mode="w") as archive: