From 69a03f52d90e6aea29cb114613fa0f00d2111e60 Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Tue, 7 Apr 2026 17:18:29 +0800
Subject: [PATCH] Fix proxy attribution benchmark and labeling

---
Reviewer note: this copy of the patch had lost its line breaks and leading
whitespace. Line structure was restored from the hunk headers and diffstat
(each hunk's old/new line counts and the 117/10 totals check out). Leading
whitespace inside the hunks is inferred as 4-space steps; the nesting depth
of the context lines in the hunk at old line 518 of factor_attribution.py and
at old line 655 of the test file is a guess. If a hunk is rejected, confirm
the indentation against the tree or retry with `git apply --ignore-whitespace`.

 factor_attribution.py            | 36 +++++++++----
 tests/test_factor_attribution.py | 91 ++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+), 10 deletions(-)

diff --git a/factor_attribution.py b/factor_attribution.py
index 16ca955..aa2d3f7 100644
--- a/factor_attribution.py
+++ b/factor_attribution.py
@@ -32,17 +32,18 @@ PROXY_FACTOR_COLUMNS = [
     "CMA_PROXY",
 ] + EXTENSION_FACTOR_COLUMNS
 TRADING_DAYS_PER_YEAR = 252
+MISSING_BENCHMARK_SENTINEL = "__missing_benchmark__"
 SUMMARY_BETA_COLUMN_BY_FACTOR = {
     "MKT_RF": "beta_mkt",
     "MKT": "beta_mkt",
     "SMB": "beta_smb",
-    "SMB_PROXY": "beta_smb",
+    "SMB_PROXY": "beta_smb_proxy",
     "HML": "beta_hml",
-    "HML_PROXY": "beta_hml",
+    "HML_PROXY": "beta_hml_proxy",
     "RMW": "beta_rmw",
-    "RMW_PROXY": "beta_rmw",
+    "RMW_PROXY": "beta_rmw_proxy",
     "CMA": "beta_cma",
-    "CMA_PROXY": "beta_cma",
+    "CMA_PROXY": "beta_cma_proxy",
     "MOM": "beta_mom",
     "LOWVOL": "beta_lowvol",
     "RECOVERY": "beta_recovery",
@@ -68,6 +69,10 @@ SUMMARY_COLUMNS = [
     "beta_hml",
     "beta_rmw",
     "beta_cma",
+    "beta_smb_proxy",
+    "beta_hml_proxy",
+    "beta_rmw_proxy",
+    "beta_cma_proxy",
     "beta_mom",
     "beta_lowvol",
     "beta_recovery",
@@ -429,6 +434,12 @@ def _select_model_names(
     return list(available_models)
 
 
+def _resolve_benchmark_symbol(benchmark: str | None) -> str:
+    if benchmark is None:
+        return MISSING_BENCHMARK_SENTINEL
+    return benchmark
+
+
 def attribute_strategies(
     results_df: pd.DataFrame,
     benchmark_label: str,
@@ -438,10 +449,7 @@ def attribute_strategies(
     benchmark: str | None = None,
     external_factors: pd.DataFrame | None = None,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
-    benchmark_symbol = benchmark
-    if benchmark_symbol is None:
-        matching_columns = [column for column in price_data.columns if column in benchmark_label]
-        benchmark_symbol = matching_columns[0] if matching_columns else price_data.columns[-1]
+    benchmark_symbol = _resolve_benchmark_symbol(benchmark)
     extension_factors = build_extension_factors(price_data, benchmark=benchmark_symbol, market=market)
 
 
@@ -518,6 +526,10 @@ def attribute_strategies(
             "beta_hml": np.nan,
             "beta_rmw": np.nan,
             "beta_cma": np.nan,
+            "beta_smb_proxy": np.nan,
+            "beta_hml_proxy": np.nan,
+            "beta_rmw_proxy": np.nan,
+            "beta_cma_proxy": np.nan,
             "beta_mom": np.nan,
             "beta_lowvol": np.nan,
             "beta_recovery": np.nan,
@@ -575,7 +587,7 @@ def _describe_fit(r_squared: float) -> str:
 
 
 def _top_loading_descriptions(row: pd.Series, limit: int = 2) -> str:
-    beta_columns = [column for column in SUMMARY_COLUMNS if column.startswith("beta_")]
+    beta_columns = [column for column in row.index if column.startswith("beta_")]
     present = []
     for column in beta_columns:
         value = row.get(column)
@@ -606,11 +618,15 @@ def print_attribution_summary(summary_df: pd.DataFrame) -> None:
         "beta_hml",
         "beta_rmw",
         "beta_cma",
+        "beta_smb_proxy",
+        "beta_hml_proxy",
+        "beta_rmw_proxy",
+        "beta_cma_proxy",
         "beta_mom",
         "beta_lowvol",
         "beta_recovery",
     ]
-    table = summary_df.loc[:, display_columns].copy()
+    table = summary_df.reindex(columns=display_columns).copy()
     numeric_columns = [column for column in display_columns if column not in {"strategy", "market", "model"}]
     table.loc[:, numeric_columns] = table.loc[:, numeric_columns].round(4)
 
diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py
index 170ad80..b1eeb4d 100644
--- a/tests/test_factor_attribution.py
+++ b/tests/test_factor_attribution.py
@@ -655,6 +655,10 @@ class AttributionIntegrationTests(unittest.TestCase):
                 "beta_hml",
                 "beta_rmw",
                 "beta_cma",
+                "beta_smb_proxy",
+                "beta_hml_proxy",
+                "beta_rmw_proxy",
+                "beta_cma_proxy",
                 "beta_mom",
                 "beta_lowvol",
                 "beta_recovery",
@@ -667,6 +671,7 @@ class AttributionIntegrationTests(unittest.TestCase):
         self.assertAlmostEqual(summary.loc[0, "beta_mkt"], 1.10, places=3)
         self.assertAlmostEqual(summary.loc[0, "beta_smb"], -0.25, places=3)
         self.assertAlmostEqual(summary.loc[0, "beta_hml"], 0.35, places=3)
+        self.assertTrue(np.isnan(summary.loc[0, "beta_smb_proxy"]))
         self.assertTrue(np.isnan(summary.loc[0, "beta_mom"]))
 
         self.assertListEqual(
@@ -704,11 +709,53 @@ class AttributionIntegrationTests(unittest.TestCase):
         self.assertEqual(summary.loc[0, "model"], "proxy")
         self.assertEqual(summary.loc[0, "factor_source"], "proxy_only")
         self.assertTrue(bool(summary.loc[0, "proxy_only"]))
+        self.assertIn("beta_smb_proxy", summary.columns)
+        self.assertIn("beta_hml_proxy", summary.columns)
+        self.assertIn("beta_rmw_proxy", summary.columns)
+        self.assertIn("beta_cma_proxy", summary.columns)
+        self.assertTrue(np.isnan(summary.loc[0, "beta_smb"]))
+        self.assertTrue(np.isnan(summary.loc[0, "beta_hml"]))
+        self.assertTrue(np.isnan(summary.loc[0, "beta_rmw"]))
+        self.assertTrue(np.isnan(summary.loc[0, "beta_cma"]))
         self.assertEqual(
             set(loadings["factor"]),
             {"MKT", "SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY", "MOM", "LOWVOL", "RECOVERY"},
         )
 
+    def test_attribute_strategies_without_benchmark_uses_equal_weight_proxy_market(self):
+        dates = pd.date_range("2025-01-01", periods=320, freq="B")
+        prices = self._make_price_frame(dates, benchmark="000300.SS").drop(columns=["000300.SS"])
+        equal_weight_returns = prices.pct_change().mean(axis=1).fillna(0.0)
+        results = pd.DataFrame(
+            {
+                "Strategy": 100_000.0 * (1.0 + 0.0002 + 0.8 * equal_weight_returns).cumprod(),
+                "External Benchmark": 100_000.0 * (1.0 + 0.0001 + 0.6 * equal_weight_returns).cumprod(),
+            },
+            index=dates,
+        )
+
+        summary_missing, loadings_missing = attribute_strategies(
+            results_df=results,
+            benchmark_label="External Benchmark",
+            benchmark=None,
+            price_data=prices,
+            market="cn",
+            model_selection="ff5",
+            external_factors=None,
+        )
+        summary_explicit, loadings_explicit = attribute_strategies(
+            results_df=results,
+            benchmark_label="External Benchmark",
+            benchmark="MISSING_BENCHMARK",
+            price_data=prices,
+            market="cn",
+            model_selection="ff5",
+            external_factors=None,
+        )
+
+        pd.testing.assert_frame_equal(summary_missing, summary_explicit, check_dtype=False)
+        pd.testing.assert_frame_equal(loadings_missing, loadings_explicit, check_dtype=False)
+
     def test_print_attribution_summary_prints_compact_table_and_interpretation(self):
         summary = pd.DataFrame(
             [
@@ -751,6 +798,50 @@ class AttributionIntegrationTests(unittest.TestCase):
         self.assertIn("alpha_ann", output)
         self.assertIn("Interpretation", output)
 
+    def test_print_attribution_summary_keeps_proxy_factor_labels_in_output(self):
+        summary = pd.DataFrame(
+            [
+                {
+                    "strategy": "Strategy",
+                    "market": "cn",
+                    "model": "proxy",
+                    "factor_source": "proxy_only",
+                    "proxy_only": True,
+                    "start_date": "2025-01-02",
+                    "end_date": "2026-03-24",
+                    "n_obs": 319,
+                    "alpha_daily": 0.0002,
+                    "alpha_ann": 0.0504,
+                    "alpha_t_stat": 1.5,
+                    "alpha_p_value": 0.12,
+                    "r_squared": 0.72,
+                    "adj_r_squared": 0.70,
+                    "residual_vol_ann": 0.14,
+                    "beta_mkt": 0.85,
+                    "beta_smb": np.nan,
+                    "beta_hml": np.nan,
+                    "beta_rmw": np.nan,
+                    "beta_cma": np.nan,
+                    "beta_smb_proxy": -0.30,
+                    "beta_hml_proxy": 0.25,
+                    "beta_rmw_proxy": 0.10,
+                    "beta_cma_proxy": -0.05,
+                    "beta_mom": 0.20,
+                    "beta_lowvol": np.nan,
+                    "beta_recovery": np.nan,
+                }
+            ]
+        )
+
+        buffer = io.StringIO()
+        with contextlib.redirect_stdout(buffer):
+            print_attribution_summary(summary)
+
+        output = buffer.getvalue()
+        self.assertIn("beta_smb_proxy", output)
+        self.assertIn("beta_hml_proxy", output)
+        self.assertIn("SMB_PROXY", output)
+
     def _make_price_frame(self, dates: pd.DatetimeIndex, benchmark: str) -> pd.DataFrame:
         steps = np.arange(len(dates), dtype=float)
         data = {}