From f2e14ec2005a6261dbd2c306af9c33b6acdc75a3 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Tue, 7 Apr 2026 16:40:24 +0800 Subject: [PATCH] Add factor attribution regression engine --- factor_attribution.py | 117 +++++++++++++++++++++++++++++++ tests/test_factor_attribution.py | 79 +++++++++++++++++++++ 2 files changed, 196 insertions(+) diff --git a/factor_attribution.py b/factor_attribution.py index 67ec85a..8a9037d 100644 --- a/factor_attribution.py +++ b/factor_attribution.py @@ -12,6 +12,7 @@ from urllib.request import Request, urlopen import numpy as np import pandas as pd +from scipy import stats KEN_FRENCH_DAILY_FF5_ZIP_URL = ( "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/" @@ -19,6 +20,18 @@ KEN_FRENCH_DAILY_FF5_ZIP_URL = ( ) EXPECTED_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"] +CAPM_FACTOR_COLUMNS = ["MKT_RF"] +FF5_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA"] +EXTENSION_FACTOR_COLUMNS = ["MOM", "LOWVOL", "RECOVERY"] +FF5PLUS_FACTOR_COLUMNS = FF5_FACTOR_COLUMNS + EXTENSION_FACTOR_COLUMNS +PROXY_FACTOR_COLUMNS = [ + "MKT", + "SMB_PROXY", + "HML_PROXY", + "RMW_PROXY", + "CMA_PROXY", +] + EXTENSION_FACTOR_COLUMNS +TRADING_DAYS_PER_YEAR = 252 class ExternalFactorFormatError(ValueError): @@ -222,3 +235,107 @@ def build_proxy_core_factors( }, index=price_data.index, ) + + +def prepare_factor_models( + market: str, + extension_factors: pd.DataFrame, + proxy_factors: pd.DataFrame | None = None, + external_factors: pd.DataFrame | None = None, +) -> dict[str, object]: + market_name = market.lower() + if market_name == "us" and external_factors is not None: + factor_frame = pd.concat([external_factors, extension_factors], axis=1) + return { + "factor_frame": factor_frame, + "models": { + "capm": CAPM_FACTOR_COLUMNS.copy(), + "ff5": FF5_FACTOR_COLUMNS.copy(), + "ff5plus": FF5PLUS_FACTOR_COLUMNS.copy(), + }, + "risk_free_col": "RF", + "factor_source": "external+local", + "proxy_only": False, + "model_family": "standard", + } + + if proxy_factors is None: + raise ValueError("proxy_factors are required when external factors are unavailable") + + factor_frame = pd.concat([proxy_factors, extension_factors], axis=1) + return { + "factor_frame": factor_frame, + "models": {"proxy": PROXY_FACTOR_COLUMNS.copy()}, + "risk_free_col": None, + "factor_source": "proxy_only", + "proxy_only": True, + "model_family": "proxy", + } + + +def run_factor_regression( + strategy_returns: pd.Series, + factor_frame: pd.DataFrame, + factor_cols: list[str], + risk_free_col: str | None = None, +) -> dict[str, object]: + regression_frame = pd.concat( + [strategy_returns.rename("strategy"), factor_frame[factor_cols + ([risk_free_col] if risk_free_col else [])]], + axis=1, + ).dropna() + + if regression_frame.empty: + raise ValueError("No overlapping strategy and factor observations were available for regression") + + y = regression_frame["strategy"].astype(float) + if risk_free_col is not None: + y = y - regression_frame[risk_free_col].astype(float) + + x = regression_frame[factor_cols].astype(float).to_numpy() + x = np.column_stack([np.ones(len(regression_frame)), x]) + + coefficients, _, _, _ = np.linalg.lstsq(x, y.to_numpy(), rcond=None) + fitted = x @ coefficients + residuals = y.to_numpy() - fitted + + n_obs = len(regression_frame) + param_count = x.shape[1] + dof = max(n_obs - param_count, 1) + residual_variance = float((residuals @ residuals) / dof) + covariance = residual_variance * np.linalg.pinv(x.T @ x) + standard_errors = np.sqrt(np.diag(covariance)) + + with np.errstate(divide="ignore", invalid="ignore"): + t_stats = np.divide( + coefficients, + standard_errors, + out=np.full_like(coefficients, np.nan, dtype=float), + where=standard_errors > 0, + ) + p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof) + + ss_total = float(((y - y.mean()) ** 2).sum()) + ss_residual = float(np.sum(residuals**2)) + r_squared = 1.0 - ss_residual / ss_total if ss_total else 0.0 + if n_obs > param_count: + adj_r_squared = 1.0 - (1.0 - r_squared) * (n_obs - 1) / (n_obs - param_count) + else: + adj_r_squared = 0.0 + + factor_slice = slice(1, None) + residual_series = pd.Series(residuals, index=regression_frame.index) + return { + "alpha_daily": float(coefficients[0]), + "alpha_ann": float(coefficients[0] * TRADING_DAYS_PER_YEAR), + "alpha_t_stat": float(t_stats[0]), + "alpha_p_value": float(p_values[0]), + "betas": {name: float(value) for name, value in zip(factor_cols, coefficients[factor_slice])}, + "t_stats": {name: float(value) for name, value in zip(factor_cols, t_stats[factor_slice])}, + "p_values": {name: float(value) for name, value in zip(factor_cols, p_values[factor_slice])}, + "r_squared": float(r_squared), + "adj_r_squared": float(adj_r_squared), + "residual_vol_ann": float(residual_series.std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)), + "start_date": regression_frame.index.min().date().isoformat(), + "end_date": regression_frame.index.max().date().isoformat(), + "n_obs": n_obs, + } diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py index 72f31c8..3376610 100644 --- a/tests/test_factor_attribution.py +++ b/tests/test_factor_attribution.py @@ -21,6 +21,8 @@ from factor_attribution import ( build_extension_factors, build_proxy_core_factors, load_external_us_factors, + prepare_factor_models, + run_factor_regression, ) @@ -421,3 +423,80 @@ class LocalFactorConstructionTests(unittest.TestCase): benchmark_path = 0.0004 * steps + 0.018 * np.sin(steps / 27.0 + 0.3) data[benchmark] = 250.0 * np.exp(benchmark_path) return pd.DataFrame(data, index=dates) + + +class RegressionTests(unittest.TestCase): + def test_run_factor_regression_recovers_known_coefficients(self): + dates = pd.date_range("2024-01-01", periods=300, freq="B") + angles = np.linspace(0.0, 18.0, len(dates)) + factors = pd.DataFrame( + { + "MKT_RF": 0.012 * np.sin(angles), + "SMB": 0.007 * np.cos(angles * 0.7) + np.linspace(-0.002, 0.003, len(dates)), + "RF": np.full(len(dates), 0.0001), + }, + index=dates, + ) + factors.loc[dates[:4], "SMB"] = np.nan + + strategy = ( + 0.0005 + + 1.2 * factors["MKT_RF"] + + 0.4 * factors["SMB"] + + factors["RF"] + ) + + result = run_factor_regression( + strategy, + factors, + factor_cols=["MKT_RF", "SMB"], + risk_free_col="RF", + ) + + self.assertAlmostEqual(result["alpha_daily"], 0.0005, places=6) + self.assertAlmostEqual(result["betas"]["MKT_RF"], 1.2, places=6) + self.assertAlmostEqual(result["betas"]["SMB"], 0.4, places=6) + self.assertGreater(result["r_squared"], 0.999999) + self.assertEqual(result["start_date"], "2024-01-05") + self.assertEqual(result["end_date"], "2025-02-21") + self.assertEqual(result["n_obs"], 296) + + def test_prepare_factor_models_uses_proxy_family_without_external_us_factors(self): + dates = pd.date_range("2024-01-01", periods=5, freq="B") + extension = pd.DataFrame( + { + "MOM": np.linspace(0.001, 0.005, len(dates)), + "LOWVOL": np.linspace(-0.002, 0.002, len(dates)), + "RECOVERY": np.linspace(0.003, -0.001, len(dates)), + }, + index=dates, + ) + proxy = pd.DataFrame( + { + "MKT": np.linspace(-0.01, 0.01, len(dates)), + "SMB_PROXY": np.linspace(0.002, 0.004, len(dates)), + "HML_PROXY": np.linspace(-0.003, 0.001, len(dates)), + "RMW_PROXY": np.linspace(0.005, 0.001, len(dates)), + "CMA_PROXY": np.linspace(-0.004, -0.002, len(dates)), + }, + index=dates, + ) + + prepared = prepare_factor_models( + market="us", + extension_factors=extension, + proxy_factors=proxy, + external_factors=None, + ) + + self.assertEqual(prepared["factor_source"], "proxy_only") + self.assertIsNone(prepared["risk_free_col"]) + self.assertListEqual(list(prepared["models"]), ["proxy"]) + self.assertListEqual( + prepared["models"]["proxy"], + ["MKT", "SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY", "MOM", "LOWVOL", "RECOVERY"], + ) + self.assertListEqual( + list(prepared["factor_frame"].columns), + ["MKT", "SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY", "MOM", "LOWVOL", "RECOVERY"], + )