From 0876c0b6afa202a7b0435110131b56aa3e2a8ee3 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Tue, 7 Apr 2026 16:48:23 +0800 Subject: [PATCH] Guard factor regressions against unidentified models --- factor_attribution.py | 17 +++++++++++++---- tests/test_factor_attribution.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/factor_attribution.py b/factor_attribution.py index 8a9037d..d197f0b 100644 --- a/factor_attribution.py +++ b/factor_attribution.py @@ -293,14 +293,23 @@ def run_factor_regression( x = regression_frame[factor_cols].astype(float).to_numpy() x = np.column_stack([np.ones(len(regression_frame)), x]) + n_obs = len(regression_frame) + param_count = x.shape[1] + if n_obs <= param_count: + raise ValueError( + f"Insufficient observations for regression: need more than {param_count} rows, got {n_obs}" + ) + + coefficients, _, rank, _ = np.linalg.lstsq(x, y.to_numpy(), rcond=None) + if rank < param_count: + raise ValueError( + "Regression design matrix is rank-deficient; coefficients are not uniquely identified" + ) - coefficients, _, _, _ = np.linalg.lstsq(x, y.to_numpy(), rcond=None) fitted = x @ coefficients residuals = y.to_numpy() - fitted - n_obs = len(regression_frame) - param_count = x.shape[1] - dof = max(n_obs - param_count, 1) + dof = n_obs - param_count residual_variance = float((residuals @ residuals) / dof) covariance = residual_variance * np.linalg.pinv(x.T @ x) standard_errors = np.sqrt(np.diag(covariance)) diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py index 3376610..4a5cd6a 100644 --- a/tests/test_factor_attribution.py +++ b/tests/test_factor_attribution.py @@ -461,6 +461,35 @@ class RegressionTests(unittest.TestCase): self.assertEqual(result["end_date"], "2025-02-21") self.assertEqual(result["n_obs"], 296) + def test_run_factor_regression_rejects_underdetermined_designs(self): + dates = pd.date_range("2024-01-01", periods=3, freq="B") + factors = pd.DataFrame( + { + "MKT_RF": [0.01, -0.02, 0.015], + "SMB": [0.005, 0.004, -0.001], + }, + index=dates, + ) + strategy = pd.Series([0.012, -0.018, 0.019], index=dates) + + with self.assertRaisesRegex(ValueError, "Insufficient observations"): + run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"]) + + def test_run_factor_regression_rejects_rank_deficient_designs(self): + dates = pd.date_range("2024-01-01", periods=6, freq="B") + market = np.array([0.01, -0.02, 0.015, 0.005, -0.01, 0.02]) + factors = pd.DataFrame( + { + "MKT_RF": market, + "SMB": market * 2.0, + }, + index=dates, + ) + strategy = pd.Series(0.0005 + 1.0 * factors["MKT_RF"] + 0.5 * factors["SMB"], index=dates) + + with self.assertRaisesRegex(ValueError, "rank-deficient"): + run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"]) + def test_prepare_factor_models_uses_proxy_family_without_external_us_factors(self): dates = pd.date_range("2024-01-01", periods=5, freq="B") extension = pd.DataFrame(