Handle square factor regressions without inference
This commit is contained in:
@@ -295,9 +295,9 @@ def run_factor_regression(
|
||||
x = np.column_stack([np.ones(len(regression_frame)), x])
|
||||
n_obs = len(regression_frame)
|
||||
param_count = x.shape[1]
|
||||
if n_obs <= param_count:
|
||||
if n_obs < param_count:
|
||||
raise ValueError(
|
||||
f"Insufficient observations for regression: need more than {param_count} rows, got {n_obs}"
|
||||
f"Insufficient observations for regression: need at least {param_count} rows, got {n_obs}"
|
||||
)
|
||||
|
||||
coefficients, _, rank, _ = np.linalg.lstsq(x, y.to_numpy(), rcond=None)
|
||||
@@ -310,29 +310,36 @@ def run_factor_regression(
|
||||
residuals = y.to_numpy() - fitted
|
||||
|
||||
dof = n_obs - param_count
|
||||
residual_variance = float((residuals @ residuals) / dof)
|
||||
covariance = residual_variance * np.linalg.pinv(x.T @ x)
|
||||
standard_errors = np.sqrt(np.diag(covariance))
|
||||
if dof > 0:
|
||||
residual_variance = float((residuals @ residuals) / dof)
|
||||
covariance = residual_variance * np.linalg.pinv(x.T @ x)
|
||||
standard_errors = np.sqrt(np.diag(covariance))
|
||||
|
||||
with np.errstate(divide="ignore", invalid="ignore"):
|
||||
t_stats = np.divide(
|
||||
coefficients,
|
||||
standard_errors,
|
||||
out=np.full_like(coefficients, np.nan, dtype=float),
|
||||
where=standard_errors > 0,
|
||||
)
|
||||
p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof)
|
||||
with np.errstate(divide="ignore", invalid="ignore"):
|
||||
t_stats = np.divide(
|
||||
coefficients,
|
||||
standard_errors,
|
||||
out=np.full_like(coefficients, np.nan, dtype=float),
|
||||
where=standard_errors > 0,
|
||||
)
|
||||
p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof)
|
||||
residual_vol_ann = float(pd.Series(residuals, index=regression_frame.index).std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR))
|
||||
adj_r_squared_is_defined = True
|
||||
else:
|
||||
t_stats = np.full_like(coefficients, np.nan, dtype=float)
|
||||
p_values = np.full_like(coefficients, np.nan, dtype=float)
|
||||
residual_vol_ann = float("nan")
|
||||
adj_r_squared_is_defined = False
|
||||
|
||||
ss_total = float(((y - y.mean()) ** 2).sum())
|
||||
ss_residual = float(np.sum(residuals**2))
|
||||
r_squared = 1.0 - ss_residual / ss_total if ss_total else 0.0
|
||||
if n_obs > param_count:
|
||||
if adj_r_squared_is_defined:
|
||||
adj_r_squared = 1.0 - (1.0 - r_squared) * (n_obs - 1) / (n_obs - param_count)
|
||||
else:
|
||||
adj_r_squared = 0.0
|
||||
adj_r_squared = float("nan")
|
||||
|
||||
factor_slice = slice(1, None)
|
||||
residual_series = pd.Series(residuals, index=regression_frame.index)
|
||||
return {
|
||||
"alpha_daily": float(coefficients[0]),
|
||||
"alpha_ann": float(coefficients[0] * TRADING_DAYS_PER_YEAR),
|
||||
@@ -343,7 +350,7 @@ def run_factor_regression(
|
||||
"p_values": {name: float(value) for name, value in zip(factor_cols, p_values[factor_slice])},
|
||||
"r_squared": float(r_squared),
|
||||
"adj_r_squared": float(adj_r_squared),
|
||||
"residual_vol_ann": float(residual_series.std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)),
|
||||
"residual_vol_ann": residual_vol_ann,
|
||||
"start_date": regression_frame.index.min().date().isoformat(),
|
||||
"end_date": regression_frame.index.max().date().isoformat(),
|
||||
"n_obs": n_obs,
|
||||
|
||||
@@ -462,19 +462,45 @@ class RegressionTests(unittest.TestCase):
|
||||
self.assertEqual(result["n_obs"], 296)
|
||||
|
||||
def test_run_factor_regression_rejects_underdetermined_designs(self):
|
||||
dates = pd.date_range("2024-01-01", periods=3, freq="B")
|
||||
dates = pd.date_range("2024-01-01", periods=2, freq="B")
|
||||
factors = pd.DataFrame(
|
||||
{
|
||||
"MKT_RF": [0.01, -0.02, 0.015],
|
||||
"SMB": [0.005, 0.004, -0.001],
|
||||
"MKT_RF": [0.01, -0.02],
|
||||
"SMB": [0.005, 0.004],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
strategy = pd.Series([0.012, -0.018, 0.019], index=dates)
|
||||
strategy = pd.Series([0.012, -0.018], index=dates)
|
||||
|
||||
with self.assertRaisesRegex(ValueError, "Insufficient observations"):
|
||||
run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"])
|
||||
|
||||
def test_run_factor_regression_allows_square_full_rank_design_without_inference(self):
|
||||
dates = pd.date_range("2024-01-01", periods=3, freq="B")
|
||||
factors = pd.DataFrame(
|
||||
{
|
||||
"MKT_RF": [0.0, 1.0, 0.0],
|
||||
"SMB": [0.0, 0.0, 1.0],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
strategy = pd.Series([0.0005, 1.2005, -0.3995], index=dates)
|
||||
|
||||
result = run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"])
|
||||
|
||||
self.assertAlmostEqual(result["alpha_daily"], 0.0005, places=10)
|
||||
self.assertAlmostEqual(result["betas"]["MKT_RF"], 1.2, places=10)
|
||||
self.assertAlmostEqual(result["betas"]["SMB"], -0.4, places=10)
|
||||
self.assertEqual(result["r_squared"], 1.0)
|
||||
self.assertTrue(np.isnan(result["alpha_t_stat"]))
|
||||
self.assertTrue(np.isnan(result["alpha_p_value"]))
|
||||
self.assertTrue(np.isnan(result["t_stats"]["MKT_RF"]))
|
||||
self.assertTrue(np.isnan(result["t_stats"]["SMB"]))
|
||||
self.assertTrue(np.isnan(result["p_values"]["MKT_RF"]))
|
||||
self.assertTrue(np.isnan(result["p_values"]["SMB"]))
|
||||
self.assertTrue(np.isnan(result["adj_r_squared"]))
|
||||
self.assertTrue(np.isnan(result["residual_vol_ann"]))
|
||||
|
||||
def test_run_factor_regression_rejects_rank_deficient_designs(self):
|
||||
dates = pd.date_range("2024-01-01", periods=6, freq="B")
|
||||
market = np.array([0.01, -0.02, 0.015, 0.005, -0.01, 0.02])
|
||||
|
||||
Reference in New Issue
Block a user