Narrow factor loader format fallback handling
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import warnings
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from urllib.error import URLError
|
||||
@@ -13,14 +14,11 @@ KEN_FRENCH_DAILY_FF5_ZIP_URL = (
|
||||
"F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
|
||||
)
|
||||
|
||||
SOURCE_PARSE_EXCEPTIONS = (
|
||||
zipfile.BadZipFile,
|
||||
UnicodeDecodeError,
|
||||
StopIteration,
|
||||
KeyError,
|
||||
ValueError,
|
||||
pd.errors.ParserError,
|
||||
)
|
||||
# Factor columns a parsed table must contain; also the column order of the
# frame returned by ``_parse_kf_daily_csv``.
EXPECTED_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]
|
||||
|
||||
|
||||
class ExternalFactorFormatError(ValueError):
    """Raised when the upstream factor file does not have the expected layout.

    Subclasses ``ValueError`` so handlers written for ``ValueError`` keep
    catching it.
    """
|
||||
|
||||
|
||||
def _download_kf_zip_bytes() -> bytes:
|
||||
@@ -34,25 +32,65 @@ def _download_kf_zip_bytes() -> bytes:
|
||||
|
||||
def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse the Ken French daily five-factor ZIP payload into a factor frame.

    The first CSV/TXT member of the archive is decoded, everything before the
    header line containing ``Mkt-RF`` is dropped, and only rows whose first
    column is an eight-digit ``YYYYMMDD`` value are kept.  Factor values are
    divided by 100 (presumably percent -> fraction; confirm against the
    upstream file description).

    Args:
        raw_bytes: Raw bytes of the downloaded ZIP archive.

    Returns:
        A frame indexed by parsed dates (index name cleared) holding the
        columns of ``EXPECTED_FACTOR_COLUMNS`` in that order, as floats.

    Raises:
        zipfile.BadZipFile: If ``raw_bytes`` is not a readable ZIP archive.
        ExternalFactorFormatError: If the archive has no CSV/TXT member, the
            member is not UTF-8 text, the header row or an expected column is
            missing, no daily rows remain, or dates/values cannot be
            converted.
    """
    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
        member_names = [
            name
            for name in archive.namelist()
            if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
        ]
        if not member_names:
            raise ExternalFactorFormatError("Ken French archive did not contain a CSV or TXT file")

        try:
            # "utf-8-sig" also strips a leading byte-order mark if present.
            text = archive.read(member_names[0]).decode("utf-8-sig")
        except UnicodeDecodeError as exc:
            raise ExternalFactorFormatError("Ken French factor file was not valid UTF-8 text") from exc

    # Blank lines are dropped so the table can be sliced from the header on.
    lines = [line for line in text.splitlines() if line.strip()]
    try:
        header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
    except StopIteration as exc:
        raise ExternalFactorFormatError("Ken French factor file was missing the daily factor header") from exc

    table = "\n".join(lines[header_index:])
    try:
        factors = pd.read_csv(io.StringIO(table))
    except pd.errors.ParserError as exc:
        raise ExternalFactorFormatError("Ken French factor table could not be parsed") from exc

    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})
    date_column = factors.columns[0]
    missing_columns = [column for column in EXPECTED_FACTOR_COLUMNS if column not in factors.columns]
    if missing_columns:
        raise ExternalFactorFormatError(
            f"Ken French factor table was missing columns: {', '.join(missing_columns)}"
        )

    # Keep only rows keyed by an eight-digit date; anything else in the first
    # column (e.g. trailing notes) is discarded.
    is_daily_row = factors[date_column].astype(str).str.fullmatch(r"\d{8}")
    # Take an explicit copy: assigning into a boolean-filtered frame is the
    # chained-assignment pattern pandas flags with SettingWithCopyWarning.
    factors = factors[is_daily_row].copy()
    if factors.empty:
        raise ExternalFactorFormatError("Ken French factor table did not contain daily rows")

    try:
        factors[date_column] = pd.to_datetime(factors[date_column], format="%Y%m%d")
    except ValueError as exc:
        raise ExternalFactorFormatError("Ken French factor table contained invalid dates") from exc

    factors = factors.set_index(date_column)
    factors.index.name = None
    try:
        factors = factors[EXPECTED_FACTOR_COLUMNS].astype(float) / 100.0
    except ValueError as exc:
        raise ExternalFactorFormatError("Ken French factor table contained non-numeric values") from exc

    return factors
|
||||
|
||||
|
||||
def _warn_and_load_cached_factors(cache_path: Path, reason: str) -> pd.DataFrame:
    """Emit a ``UserWarning`` about the cache fallback, then load the cache.

    Args:
        cache_path: CSV file holding the previously cached factors.
        reason: Explanation inserted verbatim into the warning message.

    Returns:
        The cached frame, read with its first column as a date-parsed index.
    """
    warnings.warn(
        f"Using cached data from {cache_path} because {reason}.",
        UserWarning,
        # Level 1 is this helper and level 2 is ``load_external_us_factors``
        # (its only caller); level 3 attributes the warning to the code that
        # asked for the factors instead of to this module's internals.
        stacklevel=3,
    )
    return pd.read_csv(cache_path, index_col=0, parse_dates=True)
|
||||
|
||||
|
||||
def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
|
||||
@@ -61,16 +99,23 @@ def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataF
|
||||
|
||||
try:
|
||||
raw_bytes = _download_kf_zip_bytes()
|
||||
except (URLError, TimeoutError, ConnectionError, OSError):
|
||||
except (URLError, TimeoutError, ConnectionError) as exc:
|
||||
if cache_path.exists():
|
||||
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
|
||||
return _warn_and_load_cached_factors(cache_path, f"download failed: {exc}")
|
||||
raise
|
||||
|
||||
try:
|
||||
factors = _parse_kf_daily_csv(raw_bytes)
|
||||
except SOURCE_PARSE_EXCEPTIONS:
|
||||
except zipfile.BadZipFile as exc:
|
||||
if cache_path.exists():
|
||||
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
|
||||
return _warn_and_load_cached_factors(cache_path, f"the upstream ZIP was invalid: {exc}")
|
||||
raise
|
||||
except ExternalFactorFormatError as exc:
|
||||
if cache_path.exists():
|
||||
return _warn_and_load_cached_factors(
|
||||
cache_path,
|
||||
f"the upstream factor format was invalid: {exc}",
|
||||
)
|
||||
raise
|
||||
|
||||
factors.to_csv(cache_path)
|
||||
|
||||
Reference in New Issue
Block a user