Fix proxy attribution benchmark and labeling

This commit is contained in:
2026-04-07 17:18:29 +08:00
parent 9c4a219c68
commit 69a03f52d9
2 changed files with 117 additions and 10 deletions

View File

@@ -32,17 +32,18 @@ PROXY_FACTOR_COLUMNS = [
"CMA_PROXY",
] + EXTENSION_FACTOR_COLUMNS
# Trading days per year; presumably the annualization basis — its use is not visible here, confirm.
TRADING_DAYS_PER_YEAR = 252
# Placeholder symbol returned by _resolve_benchmark_symbol when the caller passes no benchmark.
MISSING_BENCHMARK_SENTINEL = "__missing_benchmark__"
SUMMARY_BETA_COLUMN_BY_FACTOR = {
"MKT_RF": "beta_mkt",
"MKT": "beta_mkt",
"SMB": "beta_smb",
"SMB_PROXY": "beta_smb",
"SMB_PROXY": "beta_smb_proxy",
"HML": "beta_hml",
"HML_PROXY": "beta_hml",
"HML_PROXY": "beta_hml_proxy",
"RMW": "beta_rmw",
"RMW_PROXY": "beta_rmw",
"RMW_PROXY": "beta_rmw_proxy",
"CMA": "beta_cma",
"CMA_PROXY": "beta_cma",
"CMA_PROXY": "beta_cma_proxy",
"MOM": "beta_mom",
"LOWVOL": "beta_lowvol",
"RECOVERY": "beta_recovery",
@@ -68,6 +69,10 @@ SUMMARY_COLUMNS = [
"beta_hml",
"beta_rmw",
"beta_cma",
"beta_smb_proxy",
"beta_hml_proxy",
"beta_rmw_proxy",
"beta_cma_proxy",
"beta_mom",
"beta_lowvol",
"beta_recovery",
@@ -429,6 +434,12 @@ def _select_model_names(
return list(available_models)
def _resolve_benchmark_symbol(benchmark: str | None) -> str:
    """Return the benchmark symbol to use for attribution.

    Args:
        benchmark: The benchmark symbol supplied by the caller, or ``None``
            when no benchmark was given.

    Returns:
        ``benchmark`` unchanged when it is not ``None``; otherwise
        ``MISSING_BENCHMARK_SENTINEL``.
    """
    # Only ``None`` triggers the sentinel; any other value (including an
    # empty string) is passed through untouched.
    return MISSING_BENCHMARK_SENTINEL if benchmark is None else benchmark
def attribute_strategies(
results_df: pd.DataFrame,
benchmark_label: str,
@@ -438,10 +449,7 @@ def attribute_strategies(
benchmark: str | None = None,
external_factors: pd.DataFrame | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
benchmark_symbol = benchmark
if benchmark_symbol is None:
matching_columns = [column for column in price_data.columns if column in benchmark_label]
benchmark_symbol = matching_columns[0] if matching_columns else price_data.columns[-1]
benchmark_symbol = _resolve_benchmark_symbol(benchmark)
extension_factors = build_extension_factors(price_data, benchmark=benchmark_symbol, market=market)
@@ -518,6 +526,10 @@ def attribute_strategies(
"beta_hml": np.nan,
"beta_rmw": np.nan,
"beta_cma": np.nan,
"beta_smb_proxy": np.nan,
"beta_hml_proxy": np.nan,
"beta_rmw_proxy": np.nan,
"beta_cma_proxy": np.nan,
"beta_mom": np.nan,
"beta_lowvol": np.nan,
"beta_recovery": np.nan,
@@ -575,7 +587,7 @@ def _describe_fit(r_squared: float) -> str:
def _top_loading_descriptions(row: pd.Series, limit: int = 2) -> str:
beta_columns = [column for column in SUMMARY_COLUMNS if column.startswith("beta_")]
beta_columns = [column for column in row.index if column.startswith("beta_")]
present = []
for column in beta_columns:
value = row.get(column)
@@ -606,11 +618,15 @@ def print_attribution_summary(summary_df: pd.DataFrame) -> None:
"beta_hml",
"beta_rmw",
"beta_cma",
"beta_smb_proxy",
"beta_hml_proxy",
"beta_rmw_proxy",
"beta_cma_proxy",
"beta_mom",
"beta_lowvol",
"beta_recovery",
]
table = summary_df.loc[:, display_columns].copy()
table = summary_df.reindex(columns=display_columns).copy()
numeric_columns = [column for column in display_columns if column not in {"strategy", "market", "model"}]
table.loc[:, numeric_columns] = table.loc[:, numeric_columns].round(4)