From 35a91ba6cc94088677cfa5c79447b804aa1e6f93 Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Tue, 7 Apr 2026 17:51:57 +0800
Subject: [PATCH] Honor complete attribution beta semantics labels

---
 factor_attribution.py            | 56 ++++++++++---------
 tests/test_factor_attribution.py | 94 ++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+), 26 deletions(-)

diff --git a/factor_attribution.py b/factor_attribution.py
index a09c3b0..47979e8 100644
--- a/factor_attribution.py
+++ b/factor_attribution.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import json
 import http.client
 import io
+import re
 import socket
 import ssl
 import warnings
@@ -472,25 +473,33 @@ def _resolve_beta_semantics(row: pd.Series) -> dict[str, str]:
         else:
             if isinstance(parsed, dict):
                 parsed_mapping = {str(key): str(value) for key, value in parsed.items()}
-                if set(parsed_mapping) == set(SEMANTIC_BETA_COLUMNS) and parsed_mapping == canonical:
+                if set(parsed_mapping) == set(SEMANTIC_BETA_COLUMNS) and all(
+                    value.strip() for value in parsed_mapping.values()
+                ):
                     return parsed_mapping
     return canonical
 
 
-def _section_beta_header_map(summary_df: pd.DataFrame) -> dict[str, str]:
-    if summary_df.empty:
-        return {}
+def _beta_header_name(factor_name: str) -> str:
+    """Build the ``beta_<slug>`` display header for a factor label."""
+    # Lowercase, collapse non-alphanumeric runs to "_", and trim edge underscores.
+    slug = re.sub(r"[^a-z0-9]+", "_", factor_name.strip().lower()).strip("_")
+    slug = "mkt" if slug == "mkt_rf" else slug  # market factor keeps its short header
+    return f"beta_{slug}"
 
-    semantics = _resolve_beta_semantics(summary_df.iloc[0])
+
+def _section_beta_header_map(semantics: dict[str, str]) -> dict[str, str]:  # beta column -> display header
     header_map: dict[str, str] = {}
     for beta_column, factor_name in semantics.items():
-        suffix = factor_name.lower()
-        if suffix == "mkt_rf":
-            suffix = "mkt"
-        header_map[beta_column] = f"beta_{suffix}"
+        header_map[beta_column] = _beta_header_name(factor_name)
     return header_map
 
 
+def _section_key(row: pd.Series) -> tuple[bool, tuple[tuple[str, str], ...]]:  # (is_proxy, ordered labels)
+    semantics, flag = _resolve_beta_semantics(row), row.get("proxy_only", False)  # flag may be NaN/None
+    return (not pd.isna(flag) and bool(flag)), tuple((key, semantics[key]) for key in SEMANTIC_BETA_COLUMNS)
+
+
 def attribute_strategies(
     results_df: pd.DataFrame,
     benchmark_label: str,
@@ -651,7 +660,7 @@ def _top_loading_descriptions(row: pd.Series, limit: int = 2) -> str:
     return ", ".join(f"{name} {value:.2f}" for name, value in top_loadings)
 
 
-def _print_attribution_section(summary_df: pd.DataFrame, title: str, proxy_labels: bool) -> None:
+def _print_attribution_section(summary_df: pd.DataFrame, title: str, semantics: dict[str, str]) -> None:
     display_columns = [
         "strategy",
         "market",
@@ -671,8 +680,7 @@ def _print_attribution_section(summary_df: pd.DataFrame, title: str, proxy_label
         "beta_recovery",
     ]
     table = summary_df.reindex(columns=display_columns).copy()
-    del proxy_labels
-    table = table.rename(columns=_section_beta_header_map(summary_df))
+    table = table.rename(columns=_section_beta_header_map(semantics))
     numeric_columns = [
         column
         for column in table.columns
@@ -689,22 +697,18 @@ def print_attribution_summary(summary_df: pd.DataFrame) -> None:
         print("Factor attribution: no usable regressions were produced.")
         return
 
-    proxy_mask = summary_df["proxy_only"].fillna(False).astype(bool)
-    standard_rows = summary_df.loc[~proxy_mask]
-    proxy_rows = summary_df.loc[proxy_mask]
-
     print("\nFactor attribution")
-    if not standard_rows.empty:
+    sections: dict[tuple[bool, tuple[tuple[str, str], ...]], list[int]] = {}
+    for position, (_, row) in enumerate(summary_df.iterrows()):  # positions, not labels: index may repeat
+        sections.setdefault(_section_key(row), []).append(position)
+
+    for (is_proxy, semantics_items), row_indexes in sections.items():
+        section_rows = summary_df.iloc[row_indexes]
+        title = "Proxy factor attribution" if is_proxy else "Standard factor attribution"
         _print_attribution_section(
-            standard_rows,
-            title="Standard factor attribution",
-            proxy_labels=False,
-        )
-    if not proxy_rows.empty:
-        _print_attribution_section(
-            proxy_rows,
-            title="Proxy factor attribution",
-            proxy_labels=True,
+            section_rows,
+            title=title,
+            semantics=dict(semantics_items),
         )
     print("\nInterpretation")
     for _, row in summary_df.iterrows():
diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py
index cb97aef..1a52991 100644
--- a/tests/test_factor_attribution.py
+++ b/tests/test_factor_attribution.py
@@ -923,6 +923,100 @@ class AttributionIntegrationTests(unittest.TestCase):
         self.assertIn("SMB_PROXY", output)
         self.assertNotIn(" beta_smb ", output)
 
+    def test_print_attribution_summary_honors_complete_noncanonical_beta_semantics(self):  # custom labels must show in the printed output
+        summary = pd.DataFrame(
+            [
+                {
+                    "strategy": "US Strategy",
+                    "market": "us",
+                    "model": "ff5",
+                    "factor_source": "external+local",
+                    "proxy_only": False,
+                    "beta_semantics": json.dumps(
+                        {
+                            "beta_mkt": "MARKET_EXCESS",
+                            "beta_smb": "SIZE",
+                            "beta_hml": "VALUE",
+                            "beta_rmw": "QUALITY",
+                            "beta_cma": "INVESTMENT",
+                            "beta_mom": "MOMENTUM",
+                            "beta_lowvol": "MINVOL",
+                            "beta_recovery": "BOUNCE",
+                        }
+                    ),
+                    "start_date": "2025-01-02",
+                    "end_date": "2026-03-24",
+                    "n_obs": 319,
+                    "alpha_daily": 0.0004,
+                    "alpha_ann": 0.1008,
+                    "alpha_t_stat": 2.1,
+                    "alpha_p_value": 0.04,
+                    "r_squared": 0.82,
+                    "adj_r_squared": 0.81,
+                    "residual_vol_ann": 0.12,
+                    "beta_mkt": 1.05,
+                    "beta_smb": -0.20,
+                    "beta_hml": 0.30,
+                    "beta_rmw": 0.05,
+                    "beta_cma": np.nan,
+                    "beta_mom": np.nan,
+                    "beta_lowvol": np.nan,
+                    "beta_recovery": np.nan,
+                },
+                {
+                    "strategy": "CN Strategy",
+                    "market": "cn",
+                    "model": "proxy",
+                    "factor_source": "proxy_only",
+                    "proxy_only": True,
+                    "beta_semantics": json.dumps(
+                        {
+                            "beta_mkt": "LOCAL_MARKET",
+                            "beta_smb": "SIZE_PROXY_CUSTOM",
+                            "beta_hml": "VALUE_PROXY_CUSTOM",
+                            "beta_rmw": "QUALITY_PROXY_CUSTOM",
+                            "beta_cma": "INVEST_PROXY_CUSTOM",
+                            "beta_mom": "TREND",
+                            "beta_lowvol": "DEFENSIVE",
+                            "beta_recovery": "RECOVERY_PROXY",
+                        }
+                    ),
+                    "start_date": "2025-01-02",
+                    "end_date": "2026-03-24",
+                    "n_obs": 319,
+                    "alpha_daily": 0.0002,
+                    "alpha_ann": 0.0504,
+                    "alpha_t_stat": 1.5,
+                    "alpha_p_value": 0.12,
+                    "r_squared": 0.72,
+                    "adj_r_squared": 0.70,
+                    "residual_vol_ann": 0.14,
+                    "beta_mkt": 0.85,
+                    "beta_smb": -0.30,
+                    "beta_hml": 0.25,
+                    "beta_rmw": 0.10,
+                    "beta_cma": -0.05,
+                    "beta_mom": 0.20,
+                    "beta_lowvol": np.nan,
+                    "beta_recovery": np.nan,
+                },
+            ]
+        )
+
+        buffer = io.StringIO()
+        with contextlib.redirect_stdout(buffer):
+            print_attribution_summary(summary)
+
+        output = buffer.getvalue()
+        self.assertIn("Standard factor attribution", output)
+        self.assertIn("Proxy factor attribution", output)
+        self.assertIn("beta_market_excess", output)  # header is the slugified custom label
+        self.assertIn("beta_size_proxy_custom", output)
+        self.assertIn("MARKET_EXCESS 1.05", output)
+        self.assertIn("SIZE_PROXY_CUSTOM -0.30", output)
+        self.assertNotIn("MKT_RF 1.05", output)  # presumably the canonical label; must not appear
+        self.assertNotIn("SMB_PROXY -0.30", output)
+
     def test_print_attribution_summary_splits_standard_and_proxy_sections_for_mixed_frames(self):
         summary = pd.DataFrame(
             [