Integrate factor attribution into backtest CLI
This commit is contained in:
@@ -32,6 +32,57 @@ PROXY_FACTOR_COLUMNS = [
|
||||
"CMA_PROXY",
|
||||
] + EXTENSION_FACTOR_COLUMNS
|
||||
# Trading days assumed per year.
TRADING_DAYS_PER_YEAR = 252

# Maps a regression factor name to the summary-table column holding its beta.
# A factor and its "_PROXY" variant share one column, as do "MKT_RF" and "MKT".
SUMMARY_BETA_COLUMN_BY_FACTOR = {
    "MKT_RF": "beta_mkt", "MKT": "beta_mkt",
    "SMB": "beta_smb", "SMB_PROXY": "beta_smb",
    "HML": "beta_hml", "HML_PROXY": "beta_hml",
    "RMW": "beta_rmw", "RMW_PROXY": "beta_rmw",
    "CMA": "beta_cma", "CMA_PROXY": "beta_cma",
    "MOM": "beta_mom",
    "LOWVOL": "beta_lowvol",
    "RECOVERY": "beta_recovery",
}

# Column order of the per-(strategy, model) summary table.
SUMMARY_COLUMNS = [
    # Which regression the row describes.
    "strategy", "market", "model", "factor_source", "proxy_only",
    # Sample covered by the regression.
    "start_date", "end_date", "n_obs",
    # Intercept statistics.
    "alpha_daily", "alpha_ann", "alpha_t_stat", "alpha_p_value",
    # Goodness of fit.
    "r_squared", "adj_r_squared", "residual_vol_ann",
    # One beta column per distinct value in SUMMARY_BETA_COLUMN_BY_FACTOR.
    "beta_mkt", "beta_smb", "beta_hml", "beta_rmw", "beta_cma",
    "beta_mom", "beta_lowvol", "beta_recovery",
]

# Column order of the long-format table with one row per estimated factor loading.
LOADING_COLUMNS = [
    "strategy", "market", "model", "factor_source", "proxy_only",
    "factor", "beta", "t_stat", "p_value",
]
|
||||
|
||||
|
||||
class ExternalFactorFormatError(ValueError):
|
||||
@@ -358,3 +409,218 @@ def run_factor_regression(
|
||||
"end_date": regression_frame.index.max().date().isoformat(),
|
||||
"n_obs": n_obs,
|
||||
}
|
||||
|
||||
|
||||
def _empty_attribution_frames() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return zero-row (summary, loadings) frames that still carry the standard columns."""
    empty_summary = pd.DataFrame(columns=SUMMARY_COLUMNS)
    empty_loadings = pd.DataFrame(columns=LOADING_COLUMNS)
    return empty_summary, empty_loadings
|
||||
|
||||
|
||||
def _select_model_names(
|
||||
model_selection: str,
|
||||
available_models: dict[str, list[str]],
|
||||
) -> list[str]:
|
||||
if model_selection == "all":
|
||||
return list(available_models)
|
||||
if model_selection in available_models:
|
||||
return [model_selection]
|
||||
return list(available_models)
|
||||
|
||||
|
||||
def _infer_benchmark_symbol(price_data: pd.DataFrame, benchmark_label: str) -> str:
    """Pick the price column that the benchmark label most plausibly names.

    A column qualifies when its non-empty string label occurs inside
    ``benchmark_label``.  Among several matches the longest wins, so a short
    label that is merely a substring of the intended one cannot shadow it;
    equally long matches keep column order.  When nothing matches, the last
    price column is returned as-is.
    """
    matches = [
        column
        for column in price_data.columns
        # Non-string labels cannot be substring-tested against the label.
        if isinstance(column, str) and column and column in benchmark_label
    ]
    if matches:
        return max(matches, key=len)
    return price_data.columns[-1]


def attribute_strategies(
    results_df: pd.DataFrame,
    benchmark_label: str,
    price_data: pd.DataFrame,
    market: str,
    model_selection: str = "all",
    benchmark: str | None = None,
    external_factors: pd.DataFrame | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Run factor regressions for every strategy column and tabulate the results.

    Args:
        results_df: One column per strategy; returns are derived from it with
            ``sort_index().pct_change(fill_method=None)``.
        benchmark_label: Column of ``results_df`` to exclude from attribution.
            Also used to infer the benchmark price column when ``benchmark``
            is not given.
        price_data: Price frame handed to the factor builders.
        market: Market identifier; compared case-insensitively against ``"us"``.
        model_selection: ``"all"`` or a single model name.
        benchmark: Explicit benchmark column in ``price_data``.
        external_factors: Pre-loaded external factors. For the US market they
            are loaded on demand when omitted.

    Returns:
        ``(summary_df, loadings_df)`` with columns ``SUMMARY_COLUMNS`` and
        ``LOADING_COLUMNS``.  Both are empty when there are no returns to
        regress or every regression was skipped.

    Warns:
        UserWarning: when external US factors cannot be loaded (proxy factors
            are used instead), and for each regression skipped because
            ``run_factor_regression`` raised ``ValueError``.
    """
    # Check for work first so an empty result set never triggers factor
    # construction or an external factor download.
    strategy_returns = results_df.sort_index().pct_change(fill_method=None)
    if strategy_returns.empty:
        return _empty_attribution_frames()

    benchmark_symbol = benchmark
    if benchmark_symbol is None:
        benchmark_symbol = _infer_benchmark_symbol(price_data, benchmark_label)

    extension_factors = build_extension_factors(price_data, benchmark=benchmark_symbol, market=market)

    resolved_external_factors = external_factors
    market_name = market.lower()
    if market_name == "us" and resolved_external_factors is None:
        try:
            resolved_external_factors = load_external_us_factors()
        except (ExternalFactorDownloadError, ExternalFactorFormatError, zipfile.BadZipFile) as exc:
            # resolved_external_factors stays None, which selects the proxy path below.
            warnings.warn(
                f"Falling back to proxy factor attribution because external US factors were unavailable: {exc}",
                UserWarning,
                stacklevel=2,
            )

    proxy_factors = None
    if market_name != "us" or resolved_external_factors is None:
        proxy_factors = build_proxy_core_factors(price_data, benchmark=benchmark_symbol, market=market)

    prepared = prepare_factor_models(
        market=market,
        extension_factors=extension_factors,
        proxy_factors=proxy_factors,
        external_factors=resolved_external_factors,
    )
    model_names = _select_model_names(model_selection, prepared["models"])

    # Regression outputs copied verbatim into each summary row.
    regression_fields = (
        "start_date",
        "end_date",
        "n_obs",
        "alpha_daily",
        "alpha_ann",
        "alpha_t_stat",
        "alpha_p_value",
        "r_squared",
        "adj_r_squared",
        "residual_vol_ann",
    )
    # Every beta column starts as NaN so factors absent from a model stay blank.
    beta_placeholders = dict.fromkeys(
        (column for column in SUMMARY_COLUMNS if column.startswith("beta_")),
        np.nan,
    )

    summary_rows: list[dict[str, object]] = []
    loading_rows: list[dict[str, object]] = []
    for strategy_name in strategy_returns.columns:
        if strategy_name == benchmark_label:
            continue

        for model_name in model_names:
            try:
                regression_result = run_factor_regression(
                    strategy_returns=strategy_returns[strategy_name],
                    factor_frame=prepared["factor_frame"],
                    factor_cols=prepared["models"][model_name],
                    risk_free_col=prepared["risk_free_col"],
                )
            except ValueError as exc:
                warnings.warn(
                    f"Skipping factor attribution for {strategy_name} ({model_name}): {exc}",
                    UserWarning,
                    stacklevel=2,
                )
                continue

            # Identification fields shared by the summary row and its loading rows.
            identity: dict[str, object] = {
                "strategy": strategy_name,
                "market": market_name,
                "model": model_name,
                "factor_source": prepared["factor_source"],
                "proxy_only": prepared["proxy_only"],
            }
            summary_row: dict[str, object] = {
                **identity,
                **{field: regression_result[field] for field in regression_fields},
                **beta_placeholders,
            }
            for factor_name, beta in regression_result["betas"].items():
                summary_column = SUMMARY_BETA_COLUMN_BY_FACTOR.get(factor_name)
                if summary_column is not None:
                    summary_row[summary_column] = beta
                loading_rows.append(
                    {
                        **identity,
                        "factor": factor_name,
                        "beta": beta,
                        "t_stat": regression_result["t_stats"][factor_name],
                        "p_value": regression_result["p_values"][factor_name],
                    }
                )

            summary_rows.append(summary_row)

    summary_df = pd.DataFrame(summary_rows, columns=SUMMARY_COLUMNS)
    loadings_df = pd.DataFrame(loading_rows, columns=LOADING_COLUMNS)
    return summary_df, loadings_df
|
||||
|
||||
|
||||
def export_attribution(
|
||||
summary_df: pd.DataFrame,
|
||||
loadings_df: pd.DataFrame,
|
||||
output_dir: Path | str,
|
||||
) -> None:
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
summary_df.to_csv(output_path / "summary.csv", index=False)
|
||||
loadings_df.to_csv(output_path / "loadings.csv", index=False)
|
||||
|
||||
|
||||
def _describe_alpha(alpha_ann: float) -> str:
|
||||
if alpha_ann > 0.02:
|
||||
return "positive"
|
||||
if alpha_ann < -0.02:
|
||||
return "negative"
|
||||
return "close to flat"
|
||||
|
||||
|
||||
def _describe_fit(r_squared: float) -> str:
|
||||
if r_squared >= 0.75:
|
||||
return "strong"
|
||||
if r_squared >= 0.4:
|
||||
return "moderate"
|
||||
return "weak"
|
||||
|
||||
|
||||
def _top_loading_descriptions(row: pd.Series, limit: int = 2) -> str:
    """Describe the largest-magnitude betas found in a summary row.

    Only the ``beta_*`` columns listed in ``SUMMARY_COLUMNS`` are considered,
    and missing or NaN values are ignored.  The result is a comma-separated
    string of ``NAME value`` pairs (value to two decimals) for the ``limit``
    betas with the largest absolute value, or a fixed sentence when the row
    has no betas at all.
    """
    raw_betas = (
        (name, row.get(name)) for name in SUMMARY_COLUMNS if name.startswith("beta_")
    )
    estimated = [
        (name.removeprefix("beta_").upper(), float(raw))
        for name, raw in raw_betas
        if pd.notna(raw)
    ]
    if len(estimated) == 0:
        return "no material factor loadings were estimated"

    # Stable sort: betas of equal magnitude keep their SUMMARY_COLUMNS order.
    estimated.sort(key=lambda pair: abs(pair[1]), reverse=True)
    return ", ".join(f"{label} {loading:.2f}" for label, loading in estimated[:limit])
|
||||
|
||||
|
||||
def print_attribution_summary(summary_df: pd.DataFrame) -> None:
    """Print the attribution table followed by a one-line reading of each row.

    An empty frame produces a single notice.  Otherwise the selected columns
    are shown with numeric values rounded to four decimals and missing values
    left blank, then every row gets a sentence covering its alpha, its
    strongest loadings, and its model fit.
    """
    if summary_df.empty:
        print("Factor attribution: no usable regressions were produced.")
        return

    label_columns = ["strategy", "market", "model"]
    metric_columns = [
        "alpha_ann",
        "r_squared",
        "residual_vol_ann",
        "beta_mkt",
        "beta_smb",
        "beta_hml",
        "beta_rmw",
        "beta_cma",
        "beta_mom",
        "beta_lowvol",
        "beta_recovery",
    ]
    # Work on a copy so rounding for display never touches the caller's frame.
    table = summary_df.loc[:, label_columns + metric_columns].copy()
    table.loc[:, metric_columns] = table.loc[:, metric_columns].round(4)

    print("\nFactor attribution")
    print(table.to_string(index=False, na_rep=""))
    print("\nInterpretation")
    for _, record in summary_df.iterrows():
        alpha_ann = record["alpha_ann"]
        r_squared = record["r_squared"]
        alpha_text = _describe_alpha(float(alpha_ann))
        loadings_text = _top_loading_descriptions(record)
        fit_text = _describe_fit(float(r_squared))
        print(
            f"- {record['strategy']} / {record['model']}: estimated annualized alpha is "
            f"{alpha_text} ({alpha_ann:.2%}); "
            f"strongest loadings are {loadings_text}; "
            f"model fit looks {fit_text} (R^2={r_squared:.2f})."
        )
|
||||
|
||||
Reference in New Issue
Block a user