Implement Ken French factor download and cache fallback

This commit is contained in:
2026-04-07 15:44:46 +08:00
parent e70922d9af
commit 9e6da727a3
2 changed files with 73 additions and 6 deletions

View File

@@ -4,12 +4,32 @@ import io
import zipfile import zipfile
from pathlib import Path from pathlib import Path
from urllib.error import URLError from urllib.error import URLError
from urllib.request import Request, urlopen
import pandas as pd import pandas as pd
# URL of the zipped daily 5-factor (2x3) CSV published under Ken French's
# faculty pages; this is the address _download_kf_zip_bytes requests.
KEN_FRENCH_DAILY_FF5_ZIP_URL = (
"https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"
"F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
)
# Exception types that load_external_us_factors catches around its call to
# _parse_kf_daily_csv. When one of them is raised and the cache file exists,
# the cached CSV is returned instead; otherwise the exception is re-raised.
SOURCE_PARSE_EXCEPTIONS = (
zipfile.BadZipFile,
UnicodeDecodeError,
StopIteration,
KeyError,
ValueError,
pd.errors.ParserError,
)
def _download_kf_zip_bytes() -> bytes:
    """Download the Ken French daily 5-factor zip archive as raw bytes.

    Returns:
        The complete, undecoded body of the HTTP response.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
    """
    request = Request(
        KEN_FRENCH_DAILY_FF5_ZIP_URL,
        # Identify this client explicitly instead of using urllib's default
        # User-Agent string.
        headers={"User-Agent": "quant-factor-attribution/0.1"},
    )
    # The timeout keeps a stalled connection from blocking indefinitely; the
    # context manager closes the response once the body has been read.
    with urlopen(request, timeout=30) as response:
        return response.read()
def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame: def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
@@ -46,6 +66,12 @@ def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataF
return pd.read_csv(cache_path, index_col=0, parse_dates=True) return pd.read_csv(cache_path, index_col=0, parse_dates=True)
raise raise
try:
factors = _parse_kf_daily_csv(raw_bytes) factors = _parse_kf_daily_csv(raw_bytes)
except SOURCE_PARSE_EXCEPTIONS:
if cache_path.exists():
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
raise
factors.to_csv(cache_path) factors.to_csv(cache_path)
return factors return factors

View File

@@ -8,10 +8,28 @@ from unittest import mock
import pandas as pd import pandas as pd
from factor_attribution import load_external_us_factors from factor_attribution import (
KEN_FRENCH_DAILY_FF5_ZIP_URL,
_download_kf_zip_bytes,
load_external_us_factors,
)
class ExternalFactorLoaderTests(unittest.TestCase): class ExternalFactorLoaderTests(unittest.TestCase):
def test_download_kf_zip_bytes_fetches_official_ken_french_zip(self):
    """The downloader requests the Ken French URL with a 30 s timeout and returns the body."""
    fake_response = mock.MagicMock()
    fake_response.read.return_value = b"zip-bytes"
    # Let the mock act as a context manager that yields itself and does not
    # suppress exceptions on exit.
    fake_response.__enter__.return_value = fake_response
    fake_response.__exit__.return_value = False

    with mock.patch("factor_attribution.urlopen", return_value=fake_response) as urlopen_mock:
        payload = _download_kf_zip_bytes()

    self.assertEqual(payload, b"zip-bytes")

    recorded_call = urlopen_mock.call_args
    sent_request = recorded_call.args[0]
    self.assertEqual(sent_request.full_url, KEN_FRENCH_DAILY_FF5_ZIP_URL)
    self.assertEqual(recorded_call.kwargs["timeout"], 30)
def test_load_external_us_factors_parses_percent_values_and_dates_from_zip_payload(self): def test_load_external_us_factors_parses_percent_values_and_dates_from_zip_payload(self):
csv_text = ( csv_text = (
"This line is ignored\n" "This line is ignored\n"
@@ -65,7 +83,7 @@ class ExternalFactorLoaderTests(unittest.TestCase):
self.assertEqual(len(factors), 1) self.assertEqual(len(factors), 1)
self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01) self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
def test_load_external_us_factors_raises_parse_errors_instead_of_using_cache(self): def test_load_external_us_factors_falls_back_to_cache_when_parse_fails(self):
cached = pd.DataFrame( cached = pd.DataFrame(
{ {
"MKT_RF": [0.01], "MKT_RF": [0.01],
@@ -85,8 +103,31 @@ class ExternalFactorLoaderTests(unittest.TestCase):
"factor_attribution._download_kf_zip_bytes", "factor_attribution._download_kf_zip_bytes",
return_value=b"not-a-zip-file", return_value=b"not-a-zip-file",
): ):
with self.assertRaises(zipfile.BadZipFile): factors = load_external_us_factors(cache_dir=cache_dir)
load_external_us_factors(cache_dir=cache_dir)
self.assertEqual(len(factors), 1)
self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
def test_load_external_us_factors_surfaces_cache_write_failures(self):
    """An OSError raised while writing the cache CSV propagates to the caller."""
    csv_text = (
        "This line is ignored\n"
        ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
        "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
        "\n"
    )
    archive = self._make_zip_bytes(
        "F-F_Research_Data_5_Factors_2x3_daily.csv",
        csv_text,
    )

    # Build both patchers up front; they only take effect inside the
    # ``with`` statement below, entered in the same order as before.
    download_patch = mock.patch(
        "factor_attribution._download_kf_zip_bytes",
        return_value=archive,
    )
    write_patch = mock.patch("pandas.DataFrame.to_csv", side_effect=OSError("disk full"))

    with tempfile.TemporaryDirectory() as tmpdir:
        with download_patch, write_patch:
            with self.assertRaises(OSError):
                load_external_us_factors(cache_dir=Path(tmpdir))
def _make_zip_bytes(self, filename: str, contents: str) -> bytes: def _make_zip_bytes(self, filename: str, contents: str) -> bytes:
buffer = io.BytesIO() buffer = io.BytesIO()