Honor complete attribution beta semantics labels

2026-04-07 17:51:57 +08:00
parent b3d87b3d92
commit 35a91ba6cc
2 changed files with 124 additions and 26 deletions
--- a/factor_attribution.py
+++ b/factor_attribution.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import json
 import http.client
 import io
+import re
 import socket
 import ssl
 import warnings
@@ -472,25 +473,33 @@ def _resolve_beta_semantics(row: pd.Series) -> dict[str, str]:
        else:
            if isinstance(parsed, dict):
                parsed_mapping = {str(key): str(value) for key, value in parsed.items()}
-                if set(parsed_mapping) == set(SEMANTIC_BETA_COLUMNS) and parsed_mapping == canonical:
+                if set(parsed_mapping) == set(SEMANTIC_BETA_COLUMNS) and all(
+                    value.strip() for value in parsed_mapping.values()
+                ):
                    return parsed_mapping
    return canonical


-def _section_beta_header_map(summary_df: pd.DataFrame) -> dict[str, str]:
-    if summary_df.empty:
-        return {}
+def _beta_header_name(factor_name: str) -> str:  # display header "beta_<slug>" for a factor label
+    suffix = factor_name.strip().lower()
+    suffix = re.sub(r"[^a-z0-9]+", "_", suffix).strip("_")  # each non-alphanumeric run -> "_", then trim edge "_"
+    if suffix == "mkt_rf":  # special case: "mkt_rf" is shortened to "mkt"
+        suffix = "mkt"
+    return f"beta_{suffix}"

-    semantics = _resolve_beta_semantics(summary_df.iloc[0])
+
+def _section_beta_header_map(semantics: dict[str, str]) -> dict[str, str]:
    header_map: dict[str, str] = {}
    for beta_column, factor_name in semantics.items():
-        suffix = factor_name.lower()
-        if suffix == "mkt_rf":
-            suffix = "mkt"
-        header_map[beta_column] = f"beta_{suffix}"
+        header_map[beta_column] = _beta_header_name(factor_name)
    return header_map


+def _section_key(row: pd.Series) -> tuple[bool, tuple[tuple[str, str], ...]]:
+    semantics, proxy = _resolve_beta_semantics(row), row.get("proxy_only", False)  # NaN/NA proxy flag => not proxy
+    return bool(pd.notna(proxy) and proxy), tuple((key, semantics[key]) for key in SEMANTIC_BETA_COLUMNS)
+
+
 def attribute_strategies(
    results_df: pd.DataFrame,
    benchmark_label: str,
@@ -651,7 +660,7 @@ def _top_loading_descriptions(row: pd.Series, limit: int = 2) -> str:
    return ", ".join(f"{name} {value:.2f}" for name, value in top_loadings)


-def _print_attribution_section(summary_df: pd.DataFrame, title: str, proxy_labels: bool) -> None:
+def _print_attribution_section(summary_df: pd.DataFrame, title: str, semantics: dict[str, str]) -> None:
    display_columns = [
        "strategy",
        "market",
@@ -671,8 +680,7 @@ def _print_attribution_section(summary_df: pd.DataFrame, title: str, proxy_label
        "beta_recovery",
    ]
    table = summary_df.reindex(columns=display_columns).copy()
-    del proxy_labels
-    table = table.rename(columns=_section_beta_header_map(summary_df))
+    table = table.rename(columns=_section_beta_header_map(semantics))
    numeric_columns = [
        column
        for column in table.columns
@@ -689,22 +697,18 @@ def print_attribution_summary(summary_df: pd.DataFrame) -> None:
        print("Factor attribution: no usable regressions were produced.")
        return

-    proxy_mask = summary_df["proxy_only"].fillna(False).astype(bool)
-    standard_rows = summary_df.loc[~proxy_mask]
-    proxy_rows = summary_df.loc[proxy_mask]
-
    print("\nFactor attribution")
-    if not standard_rows.empty:
+    sections: dict[tuple[bool, tuple[tuple[str, str], ...]], list[int]] = {}
+    for index, row in summary_df.iterrows():
+        sections.setdefault(_section_key(row), []).append(index)
+
+    for (is_proxy, semantics_items), row_indexes in sections.items():
+        section_rows = summary_df.loc[row_indexes]
+        title = "Proxy factor attribution" if is_proxy else "Standard factor attribution"
        _print_attribution_section(
-            standard_rows,
-            title="Standard factor attribution",
-            proxy_labels=False,
-        )
-    if not proxy_rows.empty:
-        _print_attribution_section(
-            proxy_rows,
-            title="Proxy factor attribution",
-            proxy_labels=True,
+            section_rows,
+            title=title,
+            semantics=dict(semantics_items),
        )
    print("\nInterpretation")
    for _, row in summary_df.iterrows():