from __future__ import annotations

import io
import zipfile
from pathlib import Path
from urllib.error import URLError

import pandas as pd


def _download_kf_zip_bytes() -> bytes:
    """Placeholder for fetching the raw factor ZIP archive.

    No download logic is implemented here; the function fails
    unconditionally.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()


def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse a zipped daily factor table into a date-indexed DataFrame.

    The first ``.csv``/``.txt`` member of the archive is decoded as UTF-8
    (a leading BOM is tolerated). Everything before the header line -- the
    first line containing ``"Mkt-RF"`` -- is discarded, and only rows whose
    first field is an eight-digit ``YYYYMMDD`` date are kept, which drops
    trailing footer text or any non-daily section.

    Args:
        raw_bytes: Contents of the ZIP archive.

    Returns:
        A DataFrame indexed by an unnamed ``DatetimeIndex`` with float
        columns ``MKT_RF, SMB, HML, RMW, CMA, RF``. Source values are
        divided by 100 (presumably percent -> decimal fraction).

    Raises:
        ValueError: If the archive has no ``.csv``/``.txt`` member, or the
            member has no header line containing ``"Mkt-RF"``.
        KeyError: If any of the expected factor columns is missing.
        zipfile.BadZipFile: If ``raw_bytes`` is not a valid ZIP archive.
    """
    factor_columns = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]

    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
        # A default avoids leaking a bare StopIteration when nothing matches.
        member_name = next(
            (
                name
                for name in archive.namelist()
                if not name.endswith("/")
                and name.lower().endswith((".csv", ".txt"))
            ),
            None,
        )
        if member_name is None:
            raise ValueError("factor archive contains no .csv or .txt member")
        text = archive.read(member_name).decode("utf-8-sig")

    lines = [line for line in text.splitlines() if line.strip()]
    header_index = next(
        (i for i, line in enumerate(lines) if "Mkt-RF" in line),
        None,
    )
    if header_index is None:
        raise ValueError(
            f"no header line containing 'Mkt-RF' found in {member_name!r}"
        )

    table = "\n".join(lines[header_index:])
    # Read every cell as text: footer lines would otherwise give the date
    # column a mixed (or float) dtype, which breaks the YYYYMMDD match below.
    factors = pd.read_csv(io.StringIO(table), dtype=str, skipinitialspace=True)
    # Header cells may be space-padded; normalise before renaming/selecting.
    factors.columns = [str(name).strip() for name in factors.columns]
    factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})

    date_text = factors.iloc[:, 0].astype(str).str.strip()
    is_daily_row = date_text.str.fullmatch(r"\d{8}", na=False)

    # Build a fresh frame rather than assigning into a filtered slice, which
    # would trigger pandas' chained-assignment warning. Selecting the factor
    # columns first keeps unrelated extra columns from breaking the cast.
    result = factors.loc[is_daily_row, factor_columns].astype(float) / 100.0
    result.index = pd.DatetimeIndex(
        pd.to_datetime(date_text[is_daily_row], format="%Y%m%d")
    )
    result.index.name = None
    return result


def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
    """Return the daily factor table, refreshing the on-disk cache.

    The cache directory is created if needed. A fresh archive is then
    requested; on success it is parsed, written to
    ``<cache_dir>/ff5_us_daily.csv`` and returned. If the download fails
    with a network/OS-level error, the previously cached CSV is returned
    instead when one exists.

    Args:
        cache_dir: Directory holding the cached CSV.

    Returns:
        The freshly parsed factors, or the cached copy read back with its
        first column as a parsed date index.

    Raises:
        URLError, TimeoutError, ConnectionError, OSError: Re-raised from the
            download step when no cached file is available.
    """
    cached_csv = Path(cache_dir).joinpath("ff5_us_daily.csv")
    cached_csv.parent.mkdir(parents=True, exist_ok=True)

    try:
        payload = _download_kf_zip_bytes()
    except (URLError, TimeoutError, ConnectionError, OSError):
        # Without a cached copy there is nothing to fall back on.
        if not cached_csv.exists():
            raise
        return pd.read_csv(cached_csv, index_col=0, parse_dates=True)
    else:
        fresh = _parse_kf_daily_csv(payload)
        fresh.to_csv(cached_csv)
        return fresh