Handle square factor regressions (n_obs == param_count) by returning NaN for inference statistics

This commit is contained in:
2026-04-07 16:53:16 +08:00
parent 0876c0b6af
commit 3d934b3316
2 changed files with 54 additions and 21 deletions

View File

@@ -295,9 +295,9 @@ def run_factor_regression(
x = np.column_stack([np.ones(len(regression_frame)), x])
n_obs = len(regression_frame)
param_count = x.shape[1]
if n_obs <= param_count:
if n_obs < param_count:
raise ValueError(
f"Insufficient observations for regression: need more than {param_count} rows, got {n_obs}"
f"Insufficient observations for regression: need at least {param_count} rows, got {n_obs}"
)
coefficients, _, rank, _ = np.linalg.lstsq(x, y.to_numpy(), rcond=None)
@@ -310,29 +310,36 @@ def run_factor_regression(
residuals = y.to_numpy() - fitted
dof = n_obs - param_count
residual_variance = float((residuals @ residuals) / dof)
covariance = residual_variance * np.linalg.pinv(x.T @ x)
standard_errors = np.sqrt(np.diag(covariance))
if dof > 0:
residual_variance = float((residuals @ residuals) / dof)
covariance = residual_variance * np.linalg.pinv(x.T @ x)
standard_errors = np.sqrt(np.diag(covariance))
with np.errstate(divide="ignore", invalid="ignore"):
t_stats = np.divide(
coefficients,
standard_errors,
out=np.full_like(coefficients, np.nan, dtype=float),
where=standard_errors > 0,
)
p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof)
with np.errstate(divide="ignore", invalid="ignore"):
t_stats = np.divide(
coefficients,
standard_errors,
out=np.full_like(coefficients, np.nan, dtype=float),
where=standard_errors > 0,
)
p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof)
residual_vol_ann = float(pd.Series(residuals, index=regression_frame.index).std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR))
adj_r_squared_is_defined = True
else:
t_stats = np.full_like(coefficients, np.nan, dtype=float)
p_values = np.full_like(coefficients, np.nan, dtype=float)
residual_vol_ann = float("nan")
adj_r_squared_is_defined = False
ss_total = float(((y - y.mean()) ** 2).sum())
ss_residual = float(np.sum(residuals**2))
r_squared = 1.0 - ss_residual / ss_total if ss_total else 0.0
if n_obs > param_count:
if adj_r_squared_is_defined:
adj_r_squared = 1.0 - (1.0 - r_squared) * (n_obs - 1) / (n_obs - param_count)
else:
adj_r_squared = 0.0
adj_r_squared = float("nan")
factor_slice = slice(1, None)
residual_series = pd.Series(residuals, index=regression_frame.index)
return {
"alpha_daily": float(coefficients[0]),
"alpha_ann": float(coefficients[0] * TRADING_DAYS_PER_YEAR),
@@ -343,7 +350,7 @@ def run_factor_regression(
"p_values": {name: float(value) for name, value in zip(factor_cols, p_values[factor_slice])},
"r_squared": float(r_squared),
"adj_r_squared": float(adj_r_squared),
"residual_vol_ann": float(residual_series.std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)),
"residual_vol_ann": residual_vol_ann,
"start_date": regression_frame.index.min().date().isoformat(),
"end_date": regression_frame.index.max().date().isoformat(),
"n_obs": n_obs,