diff --git a/factor_attribution.py b/factor_attribution.py
index c851457..a09c3b0 100644
--- a/factor_attribution.py
+++ b/factor_attribution.py
@@ -86,6 +86,16 @@ LOADING_COLUMNS = [
     "t_stat",
     "p_value",
 ]
+SEMANTIC_BETA_COLUMNS = [
+    "beta_mkt",
+    "beta_smb",
+    "beta_hml",
+    "beta_rmw",
+    "beta_cma",
+    "beta_mom",
+    "beta_lowvol",
+    "beta_recovery",
+]
 
 
 class ExternalFactorFormatError(ValueError):
@@ -451,17 +461,46 @@ def _beta_semantics_map(proxy_only: bool) -> dict[str, str]:
     }
 
 
-def _parse_beta_semantics(row: pd.Series) -> dict[str, str]:
+def _resolve_beta_semantics(row: pd.Series) -> dict[str, str]:
+    """Return the beta-column -> factor-label mapping to use for ``row``.
+
+    The serialized ``beta_semantics`` payload is honoured only when it parses
+    to a dict whose keys are exactly ``SEMANTIC_BETA_COLUMNS`` and whose
+    contents equal the mapping implied by ``proxy_only``. Malformed, partial,
+    or mismatched payloads fall back to that canonical mapping.
+    """
+    canonical = _beta_semantics_map(bool(row.get("proxy_only", False)))
     raw_value = row.get("beta_semantics")
     if isinstance(raw_value, str) and raw_value:
         try:
             parsed = json.loads(raw_value)
         except json.JSONDecodeError:
-            parsed = None
+            return canonical
         else:
             if isinstance(parsed, dict):
-                return {str(key): str(value) for key, value in parsed.items()}
-    return _beta_semantics_map(bool(row.get("proxy_only", False)))
+                parsed_mapping = {str(key): str(value) for key, value in parsed.items()}
+                if set(parsed_mapping) == set(SEMANTIC_BETA_COLUMNS) and parsed_mapping == canonical:
+                    return parsed_mapping
+    return canonical
+
+
+def _section_beta_header_map(summary_df: pd.DataFrame) -> dict[str, str]:
+    """Build the printed header for each beta column of one summary section.
+
+    Labels come from the first row only and are lowercased into
+    ``beta_<label>``; ``mkt_rf`` is shortened to ``mkt``. Empty frames give ``{}``.
+    """
+    if summary_df.empty:
+        return {}
+
+    semantics = _resolve_beta_semantics(summary_df.iloc[0])
+    header_map: dict[str, str] = {}
+    for beta_column, factor_name in semantics.items():
+        suffix = factor_name.lower()
+        if suffix == "mkt_rf":
+            suffix = "mkt"
+        header_map[beta_column] = f"beta_{suffix}"
+    return header_map
 
 
 def attribute_strategies(
@@ -609,7 +648,7 @@ def _describe_fit(r_squared: float) -> str:
 
 def _top_loading_descriptions(row: pd.Series, limit: int = 2) -> str:
     beta_columns = [column for column in row.index if column.startswith("beta_")]
-    factor_labels = _parse_beta_semantics(row)
+    factor_labels = _resolve_beta_semantics(row)
     present = []
     for column in beta_columns:
         value = row.get(column)
@@ -644,15 +683,8 @@ def _print_attribution_section(summary_df: pd.DataFrame, title: str, proxy_label
         "beta_recovery",
     ]
     table = summary_df.reindex(columns=display_columns).copy()
-    if proxy_labels:
-        table = table.rename(
-            columns={
-                "beta_smb": "beta_smb_proxy",
-                "beta_hml": "beta_hml_proxy",
-                "beta_rmw": "beta_rmw_proxy",
-                "beta_cma": "beta_cma_proxy",
-            }
-        )
+    del proxy_labels
+    table = table.rename(columns=_section_beta_header_map(summary_df))
     numeric_columns = [
         column
         for column in table.columns
diff --git a/tests/test_factor_attribution.py b/tests/test_factor_attribution.py
index e9396b7..cb97aef 100644
--- a/tests/test_factor_attribution.py
+++ b/tests/test_factor_attribution.py
@@ -881,6 +881,48 @@ class AttributionIntegrationTests(unittest.TestCase):
         self.assertIn("SMB_PROXY", output)
         self.assertNotIn(" beta_smb ", output)
 
+    def test_print_attribution_summary_ignores_malformed_proxy_beta_semantics(self):
+        summary = pd.DataFrame(
+            [
+                {
+                    "strategy": "Strategy",
+                    "market": "cn",
+                    "model": "proxy",
+                    "factor_source": "proxy_only",
+                    "proxy_only": True,
+                    "beta_semantics": "{not-json",
+                    "start_date": "2025-01-02",
+                    "end_date": "2026-03-24",
+                    "n_obs": 319,
+                    "alpha_daily": 0.0002,
+                    "alpha_ann": 0.0504,
+                    "alpha_t_stat": 1.5,
+                    "alpha_p_value": 0.12,
+                    "r_squared": 0.72,
+                    "adj_r_squared": 0.70,
+                    "residual_vol_ann": 0.14,
+                    "beta_mkt": 0.85,
+                    "beta_smb": -0.30,
+                    "beta_hml": 0.25,
+                    "beta_rmw": 0.10,
+                    "beta_cma": -0.05,
+                    "beta_mom": 0.20,
+                    "beta_lowvol": np.nan,
+                    "beta_recovery": np.nan,
+                }
+            ]
+        )
+
+        buffer = io.StringIO()
+        with contextlib.redirect_stdout(buffer):
+            print_attribution_summary(summary)
+
+        output = buffer.getvalue()
+        self.assertIn("Proxy factor attribution", output)
+        self.assertIn("beta_smb_proxy", output)
+        self.assertIn("SMB_PROXY", output)
+        self.assertNotIn(" beta_smb ", output)
+
     def test_print_attribution_summary_splits_standard_and_proxy_sections_for_mixed_frames(self):
         summary = pd.DataFrame(
             [
@@ -971,6 +1013,88 @@ class AttributionIntegrationTests(unittest.TestCase):
         self.assertIn("beta_smb_proxy", output)
         self.assertIn("beta_smb ", output)
 
+    def test_print_attribution_summary_ignores_mismatched_beta_semantics_in_mixed_frames(self):
+        summary = pd.DataFrame(
+            [
+                {
+                    "strategy": "US Strategy",
+                    "market": "us",
+                    "model": "ff5",
+                    "factor_source": "external+local",
+                    "proxy_only": False,
+                    "beta_semantics": json.dumps(
+                        {
+                            "beta_mkt": "MKT",
+                            "beta_smb": "SMB_PROXY",
+                            "beta_hml": "HML_PROXY",
+                            "beta_rmw": "RMW_PROXY",
+                            "beta_cma": "CMA_PROXY",
+                            "beta_mom": "MOM",
+                            "beta_lowvol": "LOWVOL",
+                            "beta_recovery": "RECOVERY",
+                            "extra": "BAD",
+                        }
+                    ),
+                    "start_date": "2025-01-02",
+                    "end_date": "2026-03-24",
+                    "n_obs": 319,
+                    "alpha_daily": 0.0004,
+                    "alpha_ann": 0.1008,
+                    "alpha_t_stat": 2.1,
+                    "alpha_p_value": 0.04,
+                    "r_squared": 0.82,
+                    "adj_r_squared": 0.81,
+                    "residual_vol_ann": 0.12,
+                    "beta_mkt": 1.05,
+                    "beta_smb": -0.20,
+                    "beta_hml": 0.30,
+                    "beta_rmw": 0.05,
+                    "beta_cma": np.nan,
+                    "beta_mom": np.nan,
+                    "beta_lowvol": np.nan,
+                    "beta_recovery": np.nan,
+                },
+                {
+                    "strategy": "CN Strategy",
+                    "market": "cn",
+                    "model": "proxy",
+                    "factor_source": "proxy_only",
+                    "proxy_only": True,
+                    "beta_semantics": json.dumps({"beta_smb": "SMB", "beta_hml": "HML"}),
+                    "start_date": "2025-01-02",
+                    "end_date": "2026-03-24",
+                    "n_obs": 319,
+                    "alpha_daily": 0.0002,
+                    "alpha_ann": 0.0504,
+                    "alpha_t_stat": 1.5,
+                    "alpha_p_value": 0.12,
+                    "r_squared": 0.72,
+                    "adj_r_squared": 0.70,
+                    "residual_vol_ann": 0.14,
+                    "beta_mkt": 0.85,
+                    "beta_smb": -0.30,
+                    "beta_hml": 0.25,
+                    "beta_rmw": 0.10,
+                    "beta_cma": -0.05,
+                    "beta_mom": 0.20,
+                    "beta_lowvol": np.nan,
+                    "beta_recovery": np.nan,
+                },
+            ]
+        )
+
+        buffer = io.StringIO()
+        with contextlib.redirect_stdout(buffer):
+            print_attribution_summary(summary)
+
+        output = buffer.getvalue()
+        self.assertIn("Standard factor attribution", output)
+        self.assertIn("Proxy factor attribution", output)
+        self.assertIn("MKT_RF 1.05", output)
+        self.assertIn("SMB_PROXY -0.30", output)
+        self.assertIn("beta_smb_proxy", output)
+        self.assertNotIn("HML_PROXY 0.30", output)
+
     def _make_price_frame(self, dates: pd.DatetimeIndex, benchmark: str) -> pd.DataFrame:
         steps = np.arange(len(dates), dtype=float)
         data = {}