Handle square factor regressions without inference

This commit is contained in:
2026-04-07 16:53:16 +08:00
parent 0876c0b6af
commit 3d934b3316
2 changed files with 54 additions and 21 deletions

View File

@@ -295,9 +295,9 @@ def run_factor_regression(
x = np.column_stack([np.ones(len(regression_frame)), x])
n_obs = len(regression_frame)
param_count = x.shape[1]
if n_obs <= param_count:
if n_obs < param_count:
raise ValueError(
f"Insufficient observations for regression: need more than {param_count} rows, got {n_obs}"
f"Insufficient observations for regression: need at least {param_count} rows, got {n_obs}"
)
coefficients, _, rank, _ = np.linalg.lstsq(x, y.to_numpy(), rcond=None)
@@ -310,29 +310,36 @@ def run_factor_regression(
residuals = y.to_numpy() - fitted
dof = n_obs - param_count
residual_variance = float((residuals @ residuals) / dof)
covariance = residual_variance * np.linalg.pinv(x.T @ x)
standard_errors = np.sqrt(np.diag(covariance))
if dof > 0:
residual_variance = float((residuals @ residuals) / dof)
covariance = residual_variance * np.linalg.pinv(x.T @ x)
standard_errors = np.sqrt(np.diag(covariance))
with np.errstate(divide="ignore", invalid="ignore"):
t_stats = np.divide(
coefficients,
standard_errors,
out=np.full_like(coefficients, np.nan, dtype=float),
where=standard_errors > 0,
)
p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof)
with np.errstate(divide="ignore", invalid="ignore"):
t_stats = np.divide(
coefficients,
standard_errors,
out=np.full_like(coefficients, np.nan, dtype=float),
where=standard_errors > 0,
)
p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof)
residual_vol_ann = float(pd.Series(residuals, index=regression_frame.index).std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR))
adj_r_squared_is_defined = True
else:
t_stats = np.full_like(coefficients, np.nan, dtype=float)
p_values = np.full_like(coefficients, np.nan, dtype=float)
residual_vol_ann = float("nan")
adj_r_squared_is_defined = False
ss_total = float(((y - y.mean()) ** 2).sum())
ss_residual = float(np.sum(residuals**2))
r_squared = 1.0 - ss_residual / ss_total if ss_total else 0.0
if n_obs > param_count:
if adj_r_squared_is_defined:
adj_r_squared = 1.0 - (1.0 - r_squared) * (n_obs - 1) / (n_obs - param_count)
else:
adj_r_squared = 0.0
adj_r_squared = float("nan")
factor_slice = slice(1, None)
residual_series = pd.Series(residuals, index=regression_frame.index)
return {
"alpha_daily": float(coefficients[0]),
"alpha_ann": float(coefficients[0] * TRADING_DAYS_PER_YEAR),
@@ -343,7 +350,7 @@ def run_factor_regression(
"p_values": {name: float(value) for name, value in zip(factor_cols, p_values[factor_slice])},
"r_squared": float(r_squared),
"adj_r_squared": float(adj_r_squared),
"residual_vol_ann": float(residual_series.std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)),
"residual_vol_ann": residual_vol_ann,
"start_date": regression_frame.index.min().date().isoformat(),
"end_date": regression_frame.index.max().date().isoformat(),
"n_obs": n_obs,

View File

@@ -462,19 +462,45 @@ class RegressionTests(unittest.TestCase):
self.assertEqual(result["n_obs"], 296)
def test_run_factor_regression_rejects_underdetermined_designs(self):
dates = pd.date_range("2024-01-01", periods=3, freq="B")
dates = pd.date_range("2024-01-01", periods=2, freq="B")
factors = pd.DataFrame(
{
"MKT_RF": [0.01, -0.02, 0.015],
"SMB": [0.005, 0.004, -0.001],
"MKT_RF": [0.01, -0.02],
"SMB": [0.005, 0.004],
},
index=dates,
)
strategy = pd.Series([0.012, -0.018, 0.019], index=dates)
strategy = pd.Series([0.012, -0.018], index=dates)
with self.assertRaisesRegex(ValueError, "Insufficient observations"):
run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"])
def test_run_factor_regression_allows_square_full_rank_design_without_inference(self):
dates = pd.date_range("2024-01-01", periods=3, freq="B")
factors = pd.DataFrame(
{
"MKT_RF": [0.0, 1.0, 0.0],
"SMB": [0.0, 0.0, 1.0],
},
index=dates,
)
strategy = pd.Series([0.0005, 1.2005, -0.3995], index=dates)
result = run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"])
self.assertAlmostEqual(result["alpha_daily"], 0.0005, places=10)
self.assertAlmostEqual(result["betas"]["MKT_RF"], 1.2, places=10)
self.assertAlmostEqual(result["betas"]["SMB"], -0.4, places=10)
self.assertEqual(result["r_squared"], 1.0)
self.assertTrue(np.isnan(result["alpha_t_stat"]))
self.assertTrue(np.isnan(result["alpha_p_value"]))
self.assertTrue(np.isnan(result["t_stats"]["MKT_RF"]))
self.assertTrue(np.isnan(result["t_stats"]["SMB"]))
self.assertTrue(np.isnan(result["p_values"]["MKT_RF"]))
self.assertTrue(np.isnan(result["p_values"]["SMB"]))
self.assertTrue(np.isnan(result["adj_r_squared"]))
self.assertTrue(np.isnan(result["residual_vol_ann"]))
def test_run_factor_regression_rejects_rank_deficient_designs(self):
dates = pd.date_range("2024-01-01", periods=6, freq="B")
market = np.array([0.01, -0.02, 0.015, 0.005, -0.01, 0.02])