diff --git a/factor_attribution.py b/factor_attribution.py index d197f0b..ee90b95 100644 --- a/factor_attribution.py +++ b/factor_attribution.py @@ -295,9 +295,9 @@ def run_factor_regression( x = np.column_stack([np.ones(len(regression_frame)), x]) n_obs = len(regression_frame) param_count = x.shape[1] - if n_obs <= param_count: + if n_obs < param_count: raise ValueError( - f"Insufficient observations for regression: need more than {param_count} rows, got {n_obs}" + f"Insufficient observations for regression: need at least {param_count} rows, got {n_obs}" ) coefficients, _, rank, _ = np.linalg.lstsq(x, y.to_numpy(), rcond=None) @@ -310,29 +310,36 @@ def run_factor_regression( residuals = y.to_numpy() - fitted dof = n_obs - param_count - residual_variance = float((residuals @ residuals) / dof) - covariance = residual_variance * np.linalg.pinv(x.T @ x) - standard_errors = np.sqrt(np.diag(covariance)) + if dof > 0: + residual_variance = float((residuals @ residuals) / dof) + covariance = residual_variance * np.linalg.pinv(x.T @ x) + standard_errors = np.sqrt(np.diag(covariance)) - with np.errstate(divide="ignore", invalid="ignore"): - t_stats = np.divide( - coefficients, - standard_errors, - out=np.full_like(coefficients, np.nan, dtype=float), - where=standard_errors > 0, - ) - p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof) + with np.errstate(divide="ignore", invalid="ignore"): + t_stats = np.divide( + coefficients, + standard_errors, + out=np.full_like(coefficients, np.nan, dtype=float), + where=standard_errors > 0, + ) + p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof) + residual_vol_ann = float(pd.Series(residuals, index=regression_frame.index).std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)) + adj_r_squared_is_defined = True + else: + t_stats = np.full_like(coefficients, np.nan, dtype=float) + p_values = np.full_like(coefficients, np.nan, dtype=float) + residual_vol_ann = float("nan") + adj_r_squared_is_defined = False ss_total = float(((y - y.mean()) ** 2).sum()) ss_residual = float(np.sum(residuals**2)) r_squared = 1.0 - ss_residual / ss_total if ss_total else 0.0 - if n_obs > param_count: + if adj_r_squared_is_defined: adj_r_squared = 1.0 - (1.0 - r_squared) * (n_obs - 1) / (n_obs - param_count) else: - adj_r_squared = 0.0 + adj_r_squared = float("nan") factor_slice = slice(1, None) - residual_series = pd.Series(residuals, index=regression_frame.index) return { "alpha_daily": float(coefficients[0]), "alpha_ann": float(coefficients[0] * TRADING_DAYS_PER_YEAR), @@ -343,7 +350,7 @@ def run_factor_regression( "p_values": {name: float(value) for name, value in zip(factor_cols, p_values[factor_slice])}, "r_squared": float(r_squared), "adj_r_squared": float(adj_r_squared), - "residual_vol_ann": float(residual_series.std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)), + "residual_vol_ann": residual_vol_ann, "start_date": regression_frame.index.min().date().isoformat(), "end_date": regression_frame.index.max().date().isoformat(), "n_obs": n_obs, diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py index 4a5cd6a..e615c0b 100644 --- a/tests/test_factor_attribution.py +++ b/tests/test_factor_attribution.py @@ -462,19 +462,45 @@ class RegressionTests(unittest.TestCase): self.assertEqual(result["n_obs"], 296) def test_run_factor_regression_rejects_underdetermined_designs(self): - dates = pd.date_range("2024-01-01", periods=3, freq="B") + dates = pd.date_range("2024-01-01", periods=2, freq="B") factors = pd.DataFrame( { - "MKT_RF": [0.01, -0.02, 0.015], - "SMB": [0.005, 0.004, -0.001], + "MKT_RF": [0.01, -0.02], + "SMB": [0.005, 0.004], }, index=dates, ) - strategy = pd.Series([0.012, -0.018, 0.019], index=dates) + strategy = pd.Series([0.012, -0.018], index=dates) with self.assertRaisesRegex(ValueError, "Insufficient observations"): run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"]) + def test_run_factor_regression_allows_square_full_rank_design_without_inference(self): + dates = pd.date_range("2024-01-01", periods=3, freq="B") + factors = pd.DataFrame( + { + "MKT_RF": [0.0, 1.0, 0.0], + "SMB": [0.0, 0.0, 1.0], + }, + index=dates, + ) + strategy = pd.Series([0.0005, 1.2005, -0.3995], index=dates) + + result = run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"]) + + self.assertAlmostEqual(result["alpha_daily"], 0.0005, places=10) + self.assertAlmostEqual(result["betas"]["MKT_RF"], 1.2, places=10) + self.assertAlmostEqual(result["betas"]["SMB"], -0.4, places=10) + self.assertEqual(result["r_squared"], 1.0) + self.assertTrue(np.isnan(result["alpha_t_stat"])) + self.assertTrue(np.isnan(result["alpha_p_value"])) + self.assertTrue(np.isnan(result["t_stats"]["MKT_RF"])) + self.assertTrue(np.isnan(result["t_stats"]["SMB"])) + self.assertTrue(np.isnan(result["p_values"]["MKT_RF"])) + self.assertTrue(np.isnan(result["p_values"]["SMB"])) + self.assertTrue(np.isnan(result["adj_r_squared"])) + self.assertTrue(np.isnan(result["residual_vol_ann"])) + def test_run_factor_regression_rejects_rank_deficient_designs(self): dates = pd.date_range("2024-01-01", periods=6, freq="B") market = np.array([0.01, -0.02, 0.015, 0.005, -0.01, 0.02])