Harden attribution beta semantics fallback

This commit is contained in:
2026-04-07 17:43:22 +08:00
parent 097131d962
commit b3d87b3d92
2 changed files with 158 additions and 14 deletions

View File

@@ -86,6 +86,16 @@ LOADING_COLUMNS = [
"t_stat", "t_stat",
"p_value", "p_value",
] ]
# Beta loading columns that a serialized ``beta_semantics`` mapping must cover
# exactly (no missing and no extra keys) before it is trusted.
SEMANTIC_BETA_COLUMNS = [
    "beta_mkt", "beta_smb", "beta_hml", "beta_rmw",
    "beta_cma", "beta_mom", "beta_lowvol", "beta_recovery",
]
class ExternalFactorFormatError(ValueError): class ExternalFactorFormatError(ValueError):
@@ -451,17 +461,34 @@ def _beta_semantics_map(proxy_only: bool) -> dict[str, str]:
} }
def _resolve_beta_semantics(row: pd.Series) -> dict[str, str]:
    """Return the beta-column -> factor-label mapping to use for ``row``.

    The canonical mapping comes from ``_beta_semantics_map`` and is selected by
    the truthiness of the row's ``proxy_only`` value (treated as ``False`` when
    the key is absent). The row's serialized ``beta_semantics`` value is only
    honoured when it is a non-empty JSON string that decodes to an object whose
    keys are exactly ``SEMANTIC_BETA_COLUMNS`` and whose stringified contents
    equal the canonical mapping; every other case falls back to the canonical
    mapping.

    Args:
        row: One summary row; ``proxy_only`` and ``beta_semantics`` are read
            with ``row.get``.

    Returns:
        The decoded mapping when it passes validation, otherwise the canonical
        mapping.
    """
    fallback = _beta_semantics_map(bool(row.get("proxy_only", False)))

    serialized = row.get("beta_semantics")
    if not isinstance(serialized, str) or not serialized:
        return fallback

    try:
        decoded = json.loads(serialized)
    except json.JSONDecodeError:
        # Malformed JSON is treated the same as a missing value.
        return fallback

    if not isinstance(decoded, dict):
        return fallback

    candidate = {str(key): str(value) for key, value in decoded.items()}
    # Reject mappings with missing/extra keys or labels that disagree with the
    # canonical mapping for this row.
    if set(candidate) != set(SEMANTIC_BETA_COLUMNS) or candidate != fallback:
        return fallback
    return candidate
def _section_beta_header_map(summary_df: pd.DataFrame) -> dict[str, str]:
    """Build the beta-column rename map for one attribution section.

    Factor labels are resolved from the first row of ``summary_df`` only. Each
    label is lower-cased and prefixed with ``beta_``; the ``mkt_rf`` label is
    shortened so its header is ``beta_mkt``.

    Args:
        summary_df: Summary rows belonging to a single printed section.

    Returns:
        Mapping of beta column name to display header, or an empty dict when
        ``summary_df`` is empty.
    """
    if summary_df.empty:
        return {}

    def _header_for(factor_label: str) -> str:
        lowered = factor_label.lower()
        return "beta_mkt" if lowered == "mkt_rf" else f"beta_{lowered}"

    labels = _resolve_beta_semantics(summary_df.iloc[0])
    return {column: _header_for(label) for column, label in labels.items()}
def attribute_strategies( def attribute_strategies(
@@ -609,7 +636,7 @@ def _describe_fit(r_squared: float) -> str:
def _top_loading_descriptions(row: pd.Series, limit: int = 2) -> str: def _top_loading_descriptions(row: pd.Series, limit: int = 2) -> str:
beta_columns = [column for column in row.index if column.startswith("beta_")] beta_columns = [column for column in row.index if column.startswith("beta_")]
factor_labels = _parse_beta_semantics(row) factor_labels = _resolve_beta_semantics(row)
present = [] present = []
for column in beta_columns: for column in beta_columns:
value = row.get(column) value = row.get(column)
@@ -644,15 +671,8 @@ def _print_attribution_section(summary_df: pd.DataFrame, title: str, proxy_label
"beta_recovery", "beta_recovery",
] ]
table = summary_df.reindex(columns=display_columns).copy() table = summary_df.reindex(columns=display_columns).copy()
if proxy_labels: del proxy_labels
table = table.rename( table = table.rename(columns=_section_beta_header_map(summary_df))
columns={
"beta_smb": "beta_smb_proxy",
"beta_hml": "beta_hml_proxy",
"beta_rmw": "beta_rmw_proxy",
"beta_cma": "beta_cma_proxy",
}
)
numeric_columns = [ numeric_columns = [
column column
for column in table.columns for column in table.columns

View File

@@ -881,6 +881,48 @@ class AttributionIntegrationTests(unittest.TestCase):
self.assertIn("SMB_PROXY", output) self.assertIn("SMB_PROXY", output)
self.assertNotIn(" beta_smb ", output) self.assertNotIn(" beta_smb ", output)
def test_print_attribution_summary_ignores_malformed_proxy_beta_semantics(self):
    # A proxy-only row whose ``beta_semantics`` value is not decodable JSON;
    # the printed summary is still expected to carry the proxy labels.
    proxy_row = {
        "strategy": "Strategy",
        "market": "cn",
        "model": "proxy",
        "factor_source": "proxy_only",
        "proxy_only": True,
        "beta_semantics": "{not-json",
        "start_date": "2025-01-02",
        "end_date": "2026-03-24",
        "n_obs": 319,
        "alpha_daily": 0.0002,
        "alpha_ann": 0.0504,
        "alpha_t_stat": 1.5,
        "alpha_p_value": 0.12,
        "r_squared": 0.72,
        "adj_r_squared": 0.70,
        "residual_vol_ann": 0.14,
        "beta_mkt": 0.85,
        "beta_smb": -0.30,
        "beta_hml": 0.25,
        "beta_rmw": 0.10,
        "beta_cma": -0.05,
        "beta_mom": 0.20,
        "beta_lowvol": np.nan,
        "beta_recovery": np.nan,
    }
    summary = pd.DataFrame([proxy_row])

    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        print_attribution_summary(summary)
    output = captured.getvalue()

    for expected in ("Proxy factor attribution", "beta_smb_proxy", "SMB_PROXY"):
        self.assertIn(expected, output)
    # The unsuffixed header must not appear as a standalone column.
    self.assertNotIn(" beta_smb ", output)
def test_print_attribution_summary_splits_standard_and_proxy_sections_for_mixed_frames(self): def test_print_attribution_summary_splits_standard_and_proxy_sections_for_mixed_frames(self):
summary = pd.DataFrame( summary = pd.DataFrame(
[ [
@@ -971,6 +1013,88 @@ class AttributionIntegrationTests(unittest.TestCase):
self.assertIn("beta_smb_proxy", output) self.assertIn("beta_smb_proxy", output)
self.assertIn("beta_smb ", output) self.assertIn("beta_smb ", output)
def test_print_attribution_summary_ignores_mismatched_beta_semantics_in_mixed_frames(self):
    # Standard row: serialized semantics carry proxy-style labels, a bare
    # "MKT" label and an unexpected extra key.
    standard_semantics = json.dumps(
        {
            "beta_mkt": "MKT",
            "beta_smb": "SMB_PROXY",
            "beta_hml": "HML_PROXY",
            "beta_rmw": "RMW_PROXY",
            "beta_cma": "CMA_PROXY",
            "beta_mom": "MOM",
            "beta_lowvol": "LOWVOL",
            "beta_recovery": "RECOVERY",
            "extra": "BAD",
        }
    )
    # Proxy row: serialized semantics are partial and use standard labels.
    proxy_semantics = json.dumps({"beta_smb": "SMB", "beta_hml": "HML"})

    standard_row = {
        "strategy": "US Strategy",
        "market": "us",
        "model": "ff5",
        "factor_source": "external+local",
        "proxy_only": False,
        "beta_semantics": standard_semantics,
        "start_date": "2025-01-02",
        "end_date": "2026-03-24",
        "n_obs": 319,
        "alpha_daily": 0.0004,
        "alpha_ann": 0.1008,
        "alpha_t_stat": 2.1,
        "alpha_p_value": 0.04,
        "r_squared": 0.82,
        "adj_r_squared": 0.81,
        "residual_vol_ann": 0.12,
        "beta_mkt": 1.05,
        "beta_smb": -0.20,
        "beta_hml": 0.30,
        "beta_rmw": 0.05,
        "beta_cma": np.nan,
        "beta_mom": np.nan,
        "beta_lowvol": np.nan,
        "beta_recovery": np.nan,
    }
    proxy_row = {
        "strategy": "CN Strategy",
        "market": "cn",
        "model": "proxy",
        "factor_source": "proxy_only",
        "proxy_only": True,
        "beta_semantics": proxy_semantics,
        "start_date": "2025-01-02",
        "end_date": "2026-03-24",
        "n_obs": 319,
        "alpha_daily": 0.0002,
        "alpha_ann": 0.0504,
        "alpha_t_stat": 1.5,
        "alpha_p_value": 0.12,
        "r_squared": 0.72,
        "adj_r_squared": 0.70,
        "residual_vol_ann": 0.14,
        "beta_mkt": 0.85,
        "beta_smb": -0.30,
        "beta_hml": 0.25,
        "beta_rmw": 0.10,
        "beta_cma": -0.05,
        "beta_mom": 0.20,
        "beta_lowvol": np.nan,
        "beta_recovery": np.nan,
    }
    summary = pd.DataFrame([standard_row, proxy_row])

    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        print_attribution_summary(summary)
    output = captured.getvalue()

    expected_fragments = (
        "Standard factor attribution",
        "Proxy factor attribution",
        "MKT_RF 1.05",
        "SMB_PROXY -0.30",
        "beta_smb_proxy",
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, output)
    # The standard row's HML loading must not be reported under a proxy label.
    self.assertNotIn("HML_PROXY 0.30", output)
def _make_price_frame(self, dates: pd.DatetimeIndex, benchmark: str) -> pd.DataFrame: def _make_price_frame(self, dates: pd.DatetimeIndex, benchmark: str) -> pd.DataFrame:
steps = np.arange(len(dates), dtype=float) steps = np.arange(len(dates), dtype=float)
data = {} data = {}