"""Tests for factor_attribution: external factor loading, local factor
construction, factor regressions, and attribution reporting."""

import http.client
import contextlib
import json
import io
import socket
import ssl
import tempfile
import unittest
import zipfile
from pathlib import Path
from urllib.error import URLError
from unittest import mock

import numpy as np
import pandas as pd

from factor_attribution import (
    ExternalFactorDownloadError,
    ExternalFactorFormatError,
    KEN_FRENCH_DAILY_FF5_ZIP_URL,
    _download_kf_zip_bytes,
    _parse_kf_daily_csv,
    attribute_strategies,
    build_extension_factors,
    build_proxy_core_factors,
    export_attribution,
    load_external_us_factors,
    print_attribution_summary,
    prepare_factor_models,
    run_factor_regression,
)


class ExternalFactorLoaderTests(unittest.TestCase):
    """Download/parse/cache behavior of the Ken French FF5 daily loader."""

    def test_download_kf_zip_bytes_fetches_official_ken_french_zip(self):
        # Mock urlopen's context-manager protocol so no network is touched.
        response = mock.MagicMock()
        response.read.return_value = b"zip-bytes"
        response.__enter__.return_value = response
        response.__exit__.return_value = False
        with mock.patch("factor_attribution.urlopen", return_value=response) as mocked_urlopen:
            raw_bytes = _download_kf_zip_bytes()
        self.assertEqual(raw_bytes, b"zip-bytes")
        request = mocked_urlopen.call_args.args[0]
        self.assertEqual(request.full_url, KEN_FRENCH_DAILY_FF5_ZIP_URL)
        self.assertEqual(mocked_urlopen.call_args.kwargs["timeout"], 30)

    def test_download_kf_zip_bytes_wraps_transport_errors(self):
        # Every transport-level failure must surface as the domain error.
        for error in (
            URLError("boom"),
            TimeoutError("timed out"),
            ConnectionError("conn reset"),
            socket.timeout("socket timed out"),
            socket.gaierror("dns failed"),
            ssl.SSLError("tls failed"),
        ):
            with self.subTest(error_type=type(error).__name__):
                with mock.patch("factor_attribution.urlopen", side_effect=error):
                    with self.assertRaises(ExternalFactorDownloadError):
                        _download_kf_zip_bytes()

    def test_download_kf_zip_bytes_wraps_incomplete_read_errors(self):
        response = mock.MagicMock()
        response.read.side_effect = http.client.IncompleteRead(b"partial", 10)
        response.__enter__.return_value = response
        response.__exit__.return_value = False
        with mock.patch("factor_attribution.urlopen", return_value=response):
            with self.assertRaises(ExternalFactorDownloadError):
                _download_kf_zip_bytes()

    def test_load_external_us_factors_parses_percent_values_and_dates_from_zip_payload(self):
        # Ken French CSVs quote returns in percent; the loader converts to decimals.
        csv_text = (
            "This line is ignored\n"
            ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
            "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
            "20260105,-0.20,0.10,0.30,-0.15,0.05,0.02\n"
            "\n"
        )
        zip_bytes = self._make_zip_bytes(
            "F-F_Research_Data_5_Factors_2x3_daily.csv",
            csv_text,
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            with mock.patch(
                "factor_attribution._download_kf_zip_bytes",
                return_value=zip_bytes,
            ):
                factors = load_external_us_factors(cache_dir=Path(tmpdir))
        self.assertListEqual(
            list(factors.columns),
            ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"],
        )
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
        self.assertAlmostEqual(factors.iloc[0]["RF"], 0.0002)
        self.assertEqual(str(factors.index[0].date()), "2026-01-02")

    def test_load_external_us_factors_falls_back_to_cache_when_download_fails(self):
        cached = pd.DataFrame(
            {
                "MKT_RF": [0.01],
                "SMB": [0.0],
                "HML": [0.0],
                "RMW": [0.0],
                "CMA": [0.0],
                "RF": [0.0001],
            },
            index=pd.to_datetime(["2026-01-02"]),
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            cache_dir = Path(tmpdir)
            cached.to_csv(cache_dir / "ff5_us_daily.csv")
            with mock.patch("factor_attribution.urlopen", side_effect=socket.gaierror("dns failed")):
                with self.assertWarnsRegex(UserWarning, "cached data"):
                    factors = load_external_us_factors(cache_dir=cache_dir)
        self.assertEqual(len(factors), 1)
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)

    def test_load_external_us_factors_falls_back_to_cache_when_download_read_is_incomplete(self):
        cached = pd.DataFrame(
            {
                "MKT_RF": [0.01],
                "SMB": [0.0],
                "HML": [0.0],
                "RMW": [0.0],
                "CMA": [0.0],
                "RF": [0.0001],
            },
            index=pd.to_datetime(["2026-01-02"]),
        )
        response = mock.MagicMock()
        response.read.side_effect = http.client.IncompleteRead(b"partial", 10)
        response.__enter__.return_value = response
        response.__exit__.return_value = False
        with tempfile.TemporaryDirectory() as tmpdir:
            cache_dir = Path(tmpdir)
            cached.to_csv(cache_dir / "ff5_us_daily.csv")
            with mock.patch("factor_attribution.urlopen", return_value=response):
                with self.assertWarnsRegex(UserWarning, "cached data"):
                    factors = load_external_us_factors(cache_dir=cache_dir)
        self.assertEqual(len(factors), 1)
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)

    def test_load_external_us_factors_falls_back_to_cache_when_http_status_line_is_bad(self):
        cached = pd.DataFrame(
            {
                "MKT_RF": [0.01],
                "SMB": [0.0],
                "HML": [0.0],
                "RMW": [0.0],
                "CMA": [0.0],
                "RF": [0.0001],
            },
            index=pd.to_datetime(["2026-01-02"]),
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            cache_dir = Path(tmpdir)
            cached.to_csv(cache_dir / "ff5_us_daily.csv")
            with mock.patch("factor_attribution.urlopen", side_effect=http.client.BadStatusLine("HTTP/1.1 ???")):
                with self.assertWarnsRegex(UserWarning, "cached data"):
                    factors = load_external_us_factors(cache_dir=cache_dir)
        self.assertEqual(len(factors), 1)
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)

    def test_parse_kf_daily_csv_raises_external_factor_format_error_for_missing_header(self):
        zip_bytes = self._make_zip_bytes(
            "F-F_Research_Data_5_Factors_2x3_daily.csv",
            "not the expected file format\n20260102,1.00\n",
        )
        with self.assertRaises(ExternalFactorFormatError):
            _parse_kf_daily_csv(zip_bytes)

    def test_load_external_us_factors_warns_and_falls_back_to_cache_when_source_format_is_invalid(self):
        cached = pd.DataFrame(
            {
                "MKT_RF": [0.01],
                "SMB": [0.0],
                "HML": [0.0],
                "RMW": [0.0],
                "CMA": [0.0],
                "RF": [0.0001],
            },
            index=pd.to_datetime(["2026-01-02"]),
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            cache_dir = Path(tmpdir)
            cached.to_csv(cache_dir / "ff5_us_daily.csv")
            malformed_zip_bytes = self._make_zip_bytes(
                "F-F_Research_Data_5_Factors_2x3_daily.csv",
                "not the expected file format\n20260102,1.00\n",
            )
            with mock.patch(
                "factor_attribution._download_kf_zip_bytes",
                return_value=malformed_zip_bytes,
            ):
                with self.assertWarnsRegex(UserWarning, "cached data"):
                    factors = load_external_us_factors(cache_dir=cache_dir)
        self.assertEqual(len(factors), 1)
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)

    def test_load_external_us_factors_warns_and_falls_back_to_cache_when_zip_is_invalid(self):
        cached = pd.DataFrame(
            {
                "MKT_RF": [0.01],
                "SMB": [0.0],
                "HML": [0.0],
                "RMW": [0.0],
                "CMA": [0.0],
                "RF": [0.0001],
            },
            index=pd.to_datetime(["2026-01-02"]),
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            cache_dir = Path(tmpdir)
            cached.to_csv(cache_dir / "ff5_us_daily.csv")
            with mock.patch(
                "factor_attribution._download_kf_zip_bytes",
                return_value=b"not-a-zip-file",
            ):
                with self.assertWarnsRegex(UserWarning, "cached data"):
                    factors = load_external_us_factors(cache_dir=cache_dir)
        self.assertEqual(len(factors), 1)
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)

    def test_load_external_us_factors_surfaces_cache_write_failures(self):
        # Cache write errors are local problems and must not be swallowed.
        csv_text = (
            "This line is ignored\n"
            ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
            "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
            "\n"
        )
        zip_bytes = self._make_zip_bytes(
            "F-F_Research_Data_5_Factors_2x3_daily.csv",
            csv_text,
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            with mock.patch(
                "factor_attribution._download_kf_zip_bytes",
                return_value=zip_bytes,
            ):
                with mock.patch("pandas.DataFrame.to_csv", side_effect=OSError("disk full")):
                    with self.assertRaises(OSError):
                        load_external_us_factors(cache_dir=Path(tmpdir))

    def test_load_external_us_factors_does_not_swallow_unrelated_local_failures(self):
        csv_text = (
            "This line is ignored\n"
            ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
            "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
            "\n"
        )
        zip_bytes = self._make_zip_bytes(
            "F-F_Research_Data_5_Factors_2x3_daily.csv",
            csv_text,
        )
        cached = pd.DataFrame(
            {
                "MKT_RF": [0.01],
                "SMB": [0.0],
                "HML": [0.0],
                "RMW": [0.0],
                "CMA": [0.0],
                "RF": [0.0001],
            },
            index=pd.to_datetime(["2026-01-02"]),
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            cache_dir = Path(tmpdir)
            cached.to_csv(cache_dir / "ff5_us_daily.csv")
            with mock.patch(
                "factor_attribution._download_kf_zip_bytes",
                return_value=zip_bytes,
            ):
                with mock.patch(
                    "factor_attribution._parse_kf_daily_csv",
                    side_effect=RuntimeError("unexpected local bug"),
                ):
                    with self.assertRaises(RuntimeError):
                        load_external_us_factors(cache_dir=cache_dir)

    def _make_zip_bytes(self, filename: str, contents: str) -> bytes:
        """Build an in-memory zip archive holding a single text file."""
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, mode="w") as archive:
            archive.writestr(filename, contents)
        return buffer.getvalue()


class LocalFactorConstructionTests(unittest.TestCase):
    """Shape, warm-up, and look-ahead guarantees of locally built factors."""

    def test_build_extension_factors_returns_expected_columns_with_non_null_values_after_warmup(self):
        prices = self._make_price_frame(benchmark="SPY")
        factors = build_extension_factors(prices, benchmark="SPY", market="us")
        self.assertListEqual(list(factors.columns), ["MOM", "LOWVOL", "RECOVERY"])
        # Rows after the warm-up window (index 260+) must be fully populated.
        self.assertTrue(factors.iloc[260:].notna().all().all())
        self.assertGreater(factors.iloc[260:].abs().sum().sum(), 0.0)

    def test_build_proxy_core_factors_returns_expected_columns_with_non_null_values_after_warmup(self):
        prices = self._make_price_frame(benchmark="000300.SS")
        factors = build_proxy_core_factors(prices, benchmark="000300.SS", market="cn")
        self.assertListEqual(
            list(factors.columns),
            ["MKT", "SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY"],
        )
        self.assertTrue(factors.iloc[260:].notna().all().all())
        self.assertGreater(factors.iloc[260:].abs().sum().sum(), 0.0)

    def test_build_extension_factors_does_not_use_future_prices(self):
        # Perturb only the tail of the series; earlier factor values must not move.
        prices = self._make_price_frame(benchmark="SPY")
        mutated = prices.copy()
        future_start = prices.index[280]
        mutated.loc[future_start:, "A"] = mutated.loc[future_start:, "A"] * 1.8
        mutated.loc[future_start:, "B"] = mutated.loc[future_start:, "B"] * 0.4
        original = build_extension_factors(prices, benchmark="SPY", market="us")
        changed = build_extension_factors(mutated, benchmark="SPY", market="us")
        comparison_end = prices.index[279]
        pd.testing.assert_frame_equal(original.loc[:comparison_end], changed.loc[:comparison_end])
self.assertGreater( (original.loc[future_start:] - changed.loc[future_start:]).abs().sum().sum(), 0.0, ) def test_build_proxy_core_factors_market_branch_does_not_use_future_benchmark_prices(self): prices = self._make_price_frame(benchmark="000300.SS") mutated = prices.copy() future_start = prices.index[280] mutated.loc[future_start:, "000300.SS"] = mutated.loc[future_start:, "000300.SS"] * 1.4 original = build_proxy_core_factors(prices, benchmark="000300.SS", market="cn") changed = build_proxy_core_factors(mutated, benchmark="000300.SS", market="cn") comparison_end = prices.index[279] pd.testing.assert_series_equal( original.loc[:comparison_end, "MKT"], changed.loc[:comparison_end, "MKT"], check_names=False, ) proxy_columns = ["SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY"] pd.testing.assert_frame_equal( original.loc[:, proxy_columns], changed.loc[:, proxy_columns], ) self.assertGreater( (original.loc[future_start:, "MKT"] - changed.loc[future_start:, "MKT"]).abs().sum(), 0.0, ) def test_build_proxy_core_factors_proxy_columns_do_not_use_future_stock_prices(self): prices = self._make_price_frame(benchmark="000300.SS") mutated = prices.copy() future_start = prices.index[280] mutated.loc[future_start:, "C"] = mutated.loc[future_start:, "C"] * 0.35 mutated.loc[future_start:, "D"] = mutated.loc[future_start:, "D"] * 1.6 original = build_proxy_core_factors(prices, benchmark="000300.SS", market="cn") changed = build_proxy_core_factors(mutated, benchmark="000300.SS", market="cn") comparison_end = prices.index[279] proxy_columns = ["SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY"] pd.testing.assert_frame_equal( original.loc[:comparison_end, proxy_columns], changed.loc[:comparison_end, proxy_columns], ) self.assertGreater( (original.loc[future_start:, proxy_columns] - changed.loc[future_start:, proxy_columns]).abs().sum().sum(), 0.0, ) def test_build_proxy_core_factors_falls_back_to_equal_weight_market_when_benchmark_missing(self): prices_with_benchmark = 
self._make_price_frame(benchmark="CN_BENCH") prices = prices_with_benchmark.drop(columns=["CN_BENCH"]) factors = build_proxy_core_factors(prices, benchmark="000300.SS", market="cn") reference = build_proxy_core_factors(prices_with_benchmark, benchmark="CN_BENCH", market="cn") expected_market = prices.pct_change().mean(axis=1) pd.testing.assert_series_equal(factors["MKT"], expected_market, check_names=False) self.assertListEqual( list(factors.columns), ["MKT", "SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY"], ) self.assertTrue(factors.iloc[260:][["SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY"]].notna().all().all()) self.assertGreater( factors.iloc[260:][["SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY"]].abs().sum().sum(), 0.0, ) pd.testing.assert_frame_equal( factors[["SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY"]], reference[["SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY"]], ) def _make_price_frame(self, benchmark: str) -> pd.DataFrame: dates = pd.date_range("2025-01-01", periods=320, freq="B") steps = np.arange(len(dates), dtype=float) symbols = [ ("A", 45.0, 0.0006, 0.030, 19.0, 0.1), ("B", 60.0, 0.0003, 0.025, 23.0, 0.8), ("C", 75.0, -0.0002, 0.035, 17.0, 1.4), ("D", 90.0, 0.0008, 0.020, 29.0, 0.5), ("E", 55.0, -0.0001, 0.028, 31.0, 1.9), ("F", 70.0, 0.0005, 0.032, 21.0, 2.5), ] data = {} for symbol, base, drift, amplitude, frequency, phase in symbols: log_path = drift * steps + amplitude * np.sin(steps / frequency + phase) data[symbol] = base * np.exp(log_path) benchmark_path = 0.0004 * steps + 0.018 * np.sin(steps / 27.0 + 0.3) data[benchmark] = 250.0 * np.exp(benchmark_path) return pd.DataFrame(data, index=dates) class RegressionTests(unittest.TestCase): def test_run_factor_regression_recovers_known_coefficients(self): dates = pd.date_range("2024-01-01", periods=300, freq="B") angles = np.linspace(0.0, 18.0, len(dates)) factors = pd.DataFrame( { "MKT_RF": 0.012 * np.sin(angles), "SMB": 0.007 * np.cos(angles * 0.7) + np.linspace(-0.002, 
0.003, len(dates)), "RF": np.full(len(dates), 0.0001), }, index=dates, ) factors.loc[dates[:4], "SMB"] = np.nan strategy = ( 0.0005 + 1.2 * factors["MKT_RF"] + 0.4 * factors["SMB"] + factors["RF"] ) result = run_factor_regression( strategy, factors, factor_cols=["MKT_RF", "SMB"], risk_free_col="RF", ) self.assertAlmostEqual(result["alpha_daily"], 0.0005, places=6) self.assertAlmostEqual(result["betas"]["MKT_RF"], 1.2, places=6) self.assertAlmostEqual(result["betas"]["SMB"], 0.4, places=6) self.assertGreater(result["r_squared"], 0.999999) self.assertEqual(result["start_date"], "2024-01-05") self.assertEqual(result["end_date"], "2025-02-21") self.assertEqual(result["n_obs"], 296) def test_run_factor_regression_rejects_underdetermined_designs(self): dates = pd.date_range("2024-01-01", periods=2, freq="B") factors = pd.DataFrame( { "MKT_RF": [0.01, -0.02], "SMB": [0.005, 0.004], }, index=dates, ) strategy = pd.Series([0.012, -0.018], index=dates) with self.assertRaisesRegex(ValueError, "Insufficient observations"): run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"]) def test_run_factor_regression_allows_square_full_rank_design_without_inference(self): dates = pd.date_range("2024-01-01", periods=3, freq="B") factors = pd.DataFrame( { "MKT_RF": [0.0, 1.0, 0.0], "SMB": [0.0, 0.0, 1.0], }, index=dates, ) strategy = pd.Series([0.0005, 1.2005, -0.3995], index=dates) result = run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"]) self.assertAlmostEqual(result["alpha_daily"], 0.0005, places=10) self.assertAlmostEqual(result["betas"]["MKT_RF"], 1.2, places=10) self.assertAlmostEqual(result["betas"]["SMB"], -0.4, places=10) self.assertEqual(result["r_squared"], 1.0) self.assertTrue(np.isnan(result["alpha_t_stat"])) self.assertTrue(np.isnan(result["alpha_p_value"])) self.assertTrue(np.isnan(result["t_stats"]["MKT_RF"])) self.assertTrue(np.isnan(result["t_stats"]["SMB"])) self.assertTrue(np.isnan(result["p_values"]["MKT_RF"])) 
self.assertTrue(np.isnan(result["p_values"]["SMB"])) self.assertTrue(np.isnan(result["adj_r_squared"])) self.assertAlmostEqual(result["residual_vol_ann"], 0.0, places=12) def test_run_factor_regression_single_observation_intercept_only_has_zero_residual_vol(self): dates = pd.date_range("2024-01-01", periods=1, freq="B") factors = pd.DataFrame(index=dates) strategy = pd.Series([0.0015], index=dates) result = run_factor_regression(strategy, factors, factor_cols=[]) self.assertAlmostEqual(result["alpha_daily"], 0.0015, places=12) self.assertEqual(result["betas"], {}) self.assertEqual(result["t_stats"], {}) self.assertEqual(result["p_values"], {}) self.assertEqual(result["r_squared"], 0.0) self.assertTrue(np.isnan(result["alpha_t_stat"])) self.assertTrue(np.isnan(result["alpha_p_value"])) self.assertTrue(np.isnan(result["adj_r_squared"])) self.assertEqual(result["n_obs"], 1) self.assertAlmostEqual(result["residual_vol_ann"], 0.0, places=12) def test_run_factor_regression_rejects_rank_deficient_designs(self): dates = pd.date_range("2024-01-01", periods=6, freq="B") market = np.array([0.01, -0.02, 0.015, 0.005, -0.01, 0.02]) factors = pd.DataFrame( { "MKT_RF": market, "SMB": market * 2.0, }, index=dates, ) strategy = pd.Series(0.0005 + 1.0 * factors["MKT_RF"] + 0.5 * factors["SMB"], index=dates) with self.assertRaisesRegex(ValueError, "rank-deficient"): run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"]) def test_prepare_factor_models_uses_proxy_family_without_external_us_factors(self): dates = pd.date_range("2024-01-01", periods=5, freq="B") extension = pd.DataFrame( { "MOM": np.linspace(0.001, 0.005, len(dates)), "LOWVOL": np.linspace(-0.002, 0.002, len(dates)), "RECOVERY": np.linspace(0.003, -0.001, len(dates)), }, index=dates, ) proxy = pd.DataFrame( { "MKT": np.linspace(-0.01, 0.01, len(dates)), "SMB_PROXY": np.linspace(0.002, 0.004, len(dates)), "HML_PROXY": np.linspace(-0.003, 0.001, len(dates)), "RMW_PROXY": np.linspace(0.005, 0.001, 
len(dates)), "CMA_PROXY": np.linspace(-0.004, -0.002, len(dates)), }, index=dates, ) prepared = prepare_factor_models( market="us", extension_factors=extension, proxy_factors=proxy, external_factors=None, ) self.assertEqual(prepared["factor_source"], "proxy_only") self.assertIsNone(prepared["risk_free_col"]) self.assertListEqual(list(prepared["models"]), ["proxy"]) self.assertListEqual( prepared["models"]["proxy"], ["MKT", "SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY", "MOM", "LOWVOL", "RECOVERY"], ) self.assertListEqual( list(prepared["factor_frame"].columns), ["MKT", "SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY", "MOM", "LOWVOL", "RECOVERY"], ) class AttributionIntegrationTests(unittest.TestCase): def test_attribute_strategies_exports_standard_model_summary_and_loadings(self): dates = pd.date_range("2025-01-01", periods=320, freq="B") angles = np.linspace(0.0, 24.0, len(dates)) factors = pd.DataFrame( { "MKT_RF": 0.010 * np.sin(angles), "SMB": 0.006 * np.cos(angles * 0.7), "HML": 0.004 * np.sin(angles * 1.3 + 0.4), "RMW": 0.003 * np.cos(angles * 1.1 + 0.2), "CMA": 0.002 * np.sin(angles * 0.5 + 0.8), "RF": np.full(len(dates), 0.0001), }, index=dates, ) strategy_returns = ( 0.0004 + 1.10 * factors["MKT_RF"] - 0.25 * factors["SMB"] + 0.35 * factors["HML"] + 0.10 * factors["RMW"] - 0.05 * factors["CMA"] + factors["RF"] ) benchmark_returns = 0.95 * factors["MKT_RF"] + factors["RF"] results = pd.DataFrame( { "Strategy": 100_000.0 * (1.0 + strategy_returns).cumprod(), "SPY (Benchmark)": 100_000.0 * (1.0 + benchmark_returns).cumprod(), }, index=dates, ) prices = self._make_price_frame(dates, benchmark="SPY") with tempfile.TemporaryDirectory() as tmpdir: summary, loadings = attribute_strategies( results_df=results, benchmark_label="SPY (Benchmark)", benchmark="SPY", price_data=prices, market="us", model_selection="ff5", external_factors=factors, ) export_attribution(summary, loadings, tmpdir) self.assertTrue((Path(tmpdir) / "summary.csv").exists()) 
self.assertTrue((Path(tmpdir) / "loadings.csv").exists()) exported_summary = pd.read_csv(Path(tmpdir) / "summary.csv") exported_loadings = pd.read_csv(Path(tmpdir) / "loadings.csv") self.assertEqual(len(summary), 1) self.assertListEqual( list(summary.columns), [ "strategy", "market", "model", "factor_source", "proxy_only", "beta_semantics", "start_date", "end_date", "n_obs", "alpha_daily", "alpha_ann", "alpha_t_stat", "alpha_p_value", "r_squared", "adj_r_squared", "residual_vol_ann", "beta_mkt", "beta_smb", "beta_hml", "beta_rmw", "beta_cma", "beta_mom", "beta_lowvol", "beta_recovery", ], ) self.assertEqual(summary.loc[0, "strategy"], "Strategy") self.assertEqual(summary.loc[0, "model"], "ff5") self.assertEqual(summary.loc[0, "factor_source"], "external+local") self.assertFalse(bool(summary.loc[0, "proxy_only"])) self.assertEqual( json.loads(summary.loc[0, "beta_semantics"]), { "beta_mkt": "MKT_RF", "beta_smb": "SMB", "beta_hml": "HML", "beta_rmw": "RMW", "beta_cma": "CMA", "beta_mom": "MOM", "beta_lowvol": "LOWVOL", "beta_recovery": "RECOVERY", }, ) self.assertAlmostEqual(summary.loc[0, "beta_mkt"], 1.10, places=3) self.assertAlmostEqual(summary.loc[0, "beta_smb"], -0.25, places=3) self.assertAlmostEqual(summary.loc[0, "beta_hml"], 0.35, places=3) self.assertTrue(np.isnan(summary.loc[0, "beta_mom"])) self.assertListEqual( list(loadings.columns), ["strategy", "market", "model", "factor_source", "proxy_only", "factor", "beta", "t_stat", "p_value"], ) self.assertEqual(set(loadings["factor"]), {"MKT_RF", "SMB", "HML", "RMW", "CMA"}) self.assertEqual(len(loadings), 5) pd.testing.assert_frame_equal(summary, exported_summary, check_dtype=False) pd.testing.assert_frame_equal(loadings, exported_loadings, check_dtype=False) def test_attribute_strategies_uses_proxy_model_for_cn_runs(self): dates = pd.date_range("2025-01-01", periods=320, freq="B") prices = self._make_price_frame(dates, benchmark="000300.SS") returns = prices["000300.SS"].pct_change().fillna(0.0) * 0.7 + 
0.0002 results = pd.DataFrame( { "Strategy": 100_000.0 * (1.0 + returns).cumprod(), "CSI 300 (Benchmark)": 100_000.0 * (1.0 + prices["000300.SS"].pct_change().fillna(0.0)).cumprod(), }, index=dates, ) summary, loadings = attribute_strategies( results_df=results, benchmark_label="CSI 300 (Benchmark)", benchmark="000300.SS", price_data=prices, market="cn", model_selection="ff5", external_factors=None, ) self.assertEqual(len(summary), 1) self.assertEqual(summary.loc[0, "model"], "proxy") self.assertEqual(summary.loc[0, "factor_source"], "proxy_only") self.assertTrue(bool(summary.loc[0, "proxy_only"])) self.assertEqual( json.loads(summary.loc[0, "beta_semantics"]), { "beta_mkt": "MKT", "beta_smb": "SMB_PROXY", "beta_hml": "HML_PROXY", "beta_rmw": "RMW_PROXY", "beta_cma": "CMA_PROXY", "beta_mom": "MOM", "beta_lowvol": "LOWVOL", "beta_recovery": "RECOVERY", }, ) self.assertNotIn("beta_smb_proxy", summary.columns) self.assertNotIn("beta_hml_proxy", summary.columns) self.assertNotIn("beta_rmw_proxy", summary.columns) self.assertNotIn("beta_cma_proxy", summary.columns) self.assertFalse(np.isnan(summary.loc[0, "beta_smb"])) self.assertFalse(np.isnan(summary.loc[0, "beta_hml"])) self.assertFalse(np.isnan(summary.loc[0, "beta_rmw"])) self.assertFalse(np.isnan(summary.loc[0, "beta_cma"])) self.assertEqual( set(loadings["factor"]), {"MKT", "SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY", "MOM", "LOWVOL", "RECOVERY"}, ) loadings_by_factor = loadings.set_index("factor")["beta"] semantics = json.loads(summary.loc[0, "beta_semantics"]) self.assertAlmostEqual(summary.loc[0, "beta_mkt"], loadings_by_factor[semantics["beta_mkt"]], places=10) self.assertAlmostEqual(summary.loc[0, "beta_smb"], loadings_by_factor[semantics["beta_smb"]], places=10) self.assertAlmostEqual(summary.loc[0, "beta_hml"], loadings_by_factor[semantics["beta_hml"]], places=10) self.assertAlmostEqual(summary.loc[0, "beta_rmw"], loadings_by_factor[semantics["beta_rmw"]], places=10) 
self.assertAlmostEqual(summary.loc[0, "beta_cma"], loadings_by_factor[semantics["beta_cma"]], places=10) def test_attribute_strategies_without_benchmark_uses_equal_weight_proxy_market(self): dates = pd.date_range("2025-01-01", periods=320, freq="B") prices = self._make_price_frame(dates, benchmark="000300.SS").drop(columns=["000300.SS"]) equal_weight_returns = prices.pct_change().mean(axis=1).fillna(0.0) results = pd.DataFrame( { "Strategy": 100_000.0 * (1.0 + 0.0002 + 0.8 * equal_weight_returns).cumprod(), "External Benchmark": 100_000.0 * (1.0 + 0.0001 + 0.6 * equal_weight_returns).cumprod(), }, index=dates, ) summary_missing, loadings_missing = attribute_strategies( results_df=results, benchmark_label="External Benchmark", benchmark=None, price_data=prices, market="cn", model_selection="ff5", external_factors=None, ) summary_explicit, loadings_explicit = attribute_strategies( results_df=results, benchmark_label="External Benchmark", benchmark="MISSING_BENCHMARK", price_data=prices, market="cn", model_selection="ff5", external_factors=None, ) pd.testing.assert_frame_equal(summary_missing, summary_explicit, check_dtype=False) pd.testing.assert_frame_equal(loadings_missing, loadings_explicit, check_dtype=False) def test_print_attribution_summary_prints_compact_table_and_interpretation(self): summary = pd.DataFrame( [ { "strategy": "Strategy", "market": "us", "model": "ff5", "factor_source": "external+local", "proxy_only": False, "start_date": "2025-01-02", "end_date": "2026-03-24", "n_obs": 319, "alpha_daily": 0.0004, "alpha_ann": 0.1008, "alpha_t_stat": 2.1, "alpha_p_value": 0.04, "r_squared": 0.82, "adj_r_squared": 0.81, "residual_vol_ann": 0.12, "beta_mkt": 1.05, "beta_smb": -0.20, "beta_hml": 0.30, "beta_rmw": 0.05, "beta_cma": np.nan, "beta_mom": np.nan, "beta_lowvol": np.nan, "beta_recovery": np.nan, } ] ) buffer = io.StringIO() with contextlib.redirect_stdout(buffer): print_attribution_summary(summary) output = buffer.getvalue() self.assertIn("Factor 
attribution", output) self.assertIn("Strategy", output) self.assertIn("ff5", output) self.assertIn("alpha_ann", output) self.assertIn("Interpretation", output) def test_print_attribution_summary_keeps_proxy_factor_labels_in_output(self): summary = pd.DataFrame( [ { "strategy": "Strategy", "market": "cn", "model": "proxy", "factor_source": "proxy_only", "proxy_only": True, "beta_semantics": json.dumps( { "beta_mkt": "MKT", "beta_smb": "SMB_PROXY", "beta_hml": "HML_PROXY", "beta_rmw": "RMW_PROXY", "beta_cma": "CMA_PROXY", "beta_mom": "MOM", "beta_lowvol": "LOWVOL", "beta_recovery": "RECOVERY", } ), "start_date": "2025-01-02", "end_date": "2026-03-24", "n_obs": 319, "alpha_daily": 0.0002, "alpha_ann": 0.0504, "alpha_t_stat": 1.5, "alpha_p_value": 0.12, "r_squared": 0.72, "adj_r_squared": 0.70, "residual_vol_ann": 0.14, "beta_mkt": 0.85, "beta_smb": -0.30, "beta_hml": 0.25, "beta_rmw": 0.10, "beta_cma": -0.05, "beta_mom": 0.20, "beta_lowvol": np.nan, "beta_recovery": np.nan, } ] ) buffer = io.StringIO() with contextlib.redirect_stdout(buffer): print_attribution_summary(summary) output = buffer.getvalue() self.assertIn("beta_smb_proxy", output) self.assertIn("beta_hml_proxy", output) self.assertIn("SMB_PROXY", output) self.assertNotIn(" beta_smb ", output) def test_print_attribution_summary_ignores_malformed_proxy_beta_semantics(self): summary = pd.DataFrame( [ { "strategy": "Strategy", "market": "cn", "model": "proxy", "factor_source": "proxy_only", "proxy_only": True, "beta_semantics": "{not-json", "start_date": "2025-01-02", "end_date": "2026-03-24", "n_obs": 319, "alpha_daily": 0.0002, "alpha_ann": 0.0504, "alpha_t_stat": 1.5, "alpha_p_value": 0.12, "r_squared": 0.72, "adj_r_squared": 0.70, "residual_vol_ann": 0.14, "beta_mkt": 0.85, "beta_smb": -0.30, "beta_hml": 0.25, "beta_rmw": 0.10, "beta_cma": -0.05, "beta_mom": 0.20, "beta_lowvol": np.nan, "beta_recovery": np.nan, } ] ) buffer = io.StringIO() with contextlib.redirect_stdout(buffer): 
print_attribution_summary(summary) output = buffer.getvalue() self.assertIn("Proxy factor attribution", output) self.assertIn("beta_smb_proxy", output) self.assertIn("SMB_PROXY", output) self.assertNotIn(" beta_smb ", output) def test_print_attribution_summary_honors_complete_noncanonical_beta_semantics(self): summary = pd.DataFrame( [ { "strategy": "US Strategy", "market": "us", "model": "ff5", "factor_source": "external+local", "proxy_only": False, "beta_semantics": json.dumps( { "beta_mkt": "MARKET_EXCESS", "beta_smb": "SIZE", "beta_hml": "VALUE", "beta_rmw": "QUALITY", "beta_cma": "INVESTMENT", "beta_mom": "MOMENTUM", "beta_lowvol": "MINVOL", "beta_recovery": "BOUNCE", } ), "start_date": "2025-01-02", "end_date": "2026-03-24", "n_obs": 319, "alpha_daily": 0.0004, "alpha_ann": 0.1008, "alpha_t_stat": 2.1, "alpha_p_value": 0.04, "r_squared": 0.82, "adj_r_squared": 0.81, "residual_vol_ann": 0.12, "beta_mkt": 1.05, "beta_smb": -0.20, "beta_hml": 0.30, "beta_rmw": 0.05, "beta_cma": np.nan, "beta_mom": np.nan, "beta_lowvol": np.nan, "beta_recovery": np.nan, }, { "strategy": "CN Strategy", "market": "cn", "model": "proxy", "factor_source": "proxy_only", "proxy_only": True, "beta_semantics": json.dumps( { "beta_mkt": "LOCAL_MARKET", "beta_smb": "SIZE_PROXY_CUSTOM", "beta_hml": "VALUE_PROXY_CUSTOM", "beta_rmw": "QUALITY_PROXY_CUSTOM", "beta_cma": "INVEST_PROXY_CUSTOM", "beta_mom": "TREND", "beta_lowvol": "DEFENSIVE", "beta_recovery": "RECOVERY_PROXY", } ), "start_date": "2025-01-02", "end_date": "2026-03-24", "n_obs": 319, "alpha_daily": 0.0002, "alpha_ann": 0.0504, "alpha_t_stat": 1.5, "alpha_p_value": 0.12, "r_squared": 0.72, "adj_r_squared": 0.70, "residual_vol_ann": 0.14, "beta_mkt": 0.85, "beta_smb": -0.30, "beta_hml": 0.25, "beta_rmw": 0.10, "beta_cma": -0.05, "beta_mom": 0.20, "beta_lowvol": np.nan, "beta_recovery": np.nan, }, ] ) buffer = io.StringIO() with contextlib.redirect_stdout(buffer): print_attribution_summary(summary) output = buffer.getvalue() 
        # Tail of the preceding test (its `def` is above this chunk): both the
        # standard and the proxy sections must be printed, with the custom
        # semantics labels substituted and the default labels absent.
        self.assertIn("Standard factor attribution", output)
        self.assertIn("Proxy factor attribution", output)
        self.assertIn("beta_market_excess", output)
        self.assertIn("beta_size_proxy_custom", output)
        self.assertIn("MARKET_EXCESS 1.05", output)
        self.assertIn("SIZE_PROXY_CUSTOM -0.30", output)
        self.assertNotIn("MKT_RF 1.05", output)
        self.assertNotIn("SMB_PROXY -0.30", output)

    def test_print_attribution_summary_ignores_duplicate_beta_semantics_labels(self):
        """Duplicate semantics labels must be discarded in favor of defaults.

        Both beta_mkt and beta_smb map to the same label ("DUPLICATE"), which
        would make the renamed columns collide; the printer is expected to
        fall back to the standard FF5 labels instead.
        """
        summary = pd.DataFrame(
            [
                {
                    "strategy": "US Strategy",
                    "market": "us",
                    "model": "ff5",
                    "factor_source": "external+local",
                    "proxy_only": False,
                    # Semantics payload is a JSON string, as produced upstream.
                    "beta_semantics": json.dumps(
                        {
                            "beta_mkt": "DUPLICATE",
                            "beta_smb": "DUPLICATE",
                            "beta_hml": "VALUE",
                            "beta_rmw": "QUALITY",
                            "beta_cma": "INVESTMENT",
                            "beta_mom": "MOMENTUM",
                            "beta_lowvol": "MINVOL",
                            "beta_recovery": "BOUNCE",
                        }
                    ),
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0004,
                    "alpha_ann": 0.1008,
                    "alpha_t_stat": 2.1,
                    "alpha_p_value": 0.04,
                    "r_squared": 0.82,
                    "adj_r_squared": 0.81,
                    "residual_vol_ann": 0.12,
                    "beta_mkt": 1.05,
                    "beta_smb": -0.20,
                    "beta_hml": 0.30,
                    "beta_rmw": 0.05,
                    # NaN betas represent factors absent from this model fit.
                    "beta_cma": np.nan,
                    "beta_mom": np.nan,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                }
            ]
        )
        # Capture stdout so the printed table can be inspected as text.
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            print_attribution_summary(summary)
        output = buffer.getvalue()
        self.assertIn("beta_smb", output)
        self.assertIn("MKT_RF 1.05", output)
        self.assertNotIn("beta_duplicate", output)
        self.assertNotIn("DUPLICATE 1.05", output)

    def test_print_attribution_summary_ignores_colliding_normalized_beta_semantics_labels(self):
        """Labels that normalize to the same identifier must be rejected.

        "A-B" and "A B" both normalize to "a_b"; since the normalized names
        collide, the printer should keep the default proxy labels instead.
        """
        summary = pd.DataFrame(
            [
                {
                    "strategy": "CN Strategy",
                    "market": "cn",
                    "model": "proxy",
                    "factor_source": "proxy_only",
                    "proxy_only": True,
                    "beta_semantics": json.dumps(
                        {
                            # These two collide once punctuation/whitespace is
                            # normalized to underscores.
                            "beta_mkt": "A-B",
                            "beta_smb": "A B",
                            "beta_hml": "VALUE_PROXY_CUSTOM",
                            "beta_rmw": "QUALITY_PROXY_CUSTOM",
                            "beta_cma": "INVEST_PROXY_CUSTOM",
                            "beta_mom": "TREND",
                            "beta_lowvol": "DEFENSIVE",
                            "beta_recovery": "RECOVERY_PROXY",
                        }
                    ),
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0002,
                    "alpha_ann": 0.0504,
                    "alpha_t_stat": 1.5,
                    "alpha_p_value": 0.12,
                    "r_squared": 0.72,
                    "adj_r_squared": 0.70,
                    "residual_vol_ann": 0.14,
                    "beta_mkt": 0.85,
                    "beta_smb": -0.30,
                    "beta_hml": 0.25,
                    "beta_rmw": 0.10,
                    "beta_cma": -0.05,
                    "beta_mom": 0.20,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                }
            ]
        )
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            print_attribution_summary(summary)
        output = buffer.getvalue()
        # Defaults survive; the colliding custom labels never appear.
        self.assertIn("beta_smb_proxy", output)
        self.assertIn("SMB_PROXY -0.30", output)
        self.assertNotIn("beta_a_b", output)
        self.assertNotIn("A B -0.30", output)

    def test_print_attribution_summary_ignores_punctuation_only_beta_semantics_labels(self):
        """A punctuation-only label ("!!!") must be dropped for every factor.

        One invalid label should invalidate the whole custom-semantics map,
        so the remaining (valid) custom labels are ignored as well.
        """
        summary = pd.DataFrame(
            [
                {
                    "strategy": "CN Strategy",
                    "market": "cn",
                    "model": "proxy",
                    "factor_source": "proxy_only",
                    "proxy_only": True,
                    "beta_semantics": json.dumps(
                        {
                            # Punctuation-only label cannot produce a valid
                            # normalized column suffix.
                            "beta_mkt": "!!!",
                            "beta_smb": "SIZE_PROXY_CUSTOM",
                            "beta_hml": "VALUE_PROXY_CUSTOM",
                            "beta_rmw": "QUALITY_PROXY_CUSTOM",
                            "beta_cma": "INVEST_PROXY_CUSTOM",
                            "beta_mom": "TREND",
                            "beta_lowvol": "DEFENSIVE",
                            "beta_recovery": "RECOVERY_PROXY",
                        }
                    ),
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0002,
                    "alpha_ann": 0.0504,
                    "alpha_t_stat": 1.5,
                    "alpha_p_value": 0.12,
                    "r_squared": 0.72,
                    "adj_r_squared": 0.70,
                    "residual_vol_ann": 0.14,
                    "beta_mkt": 0.85,
                    "beta_smb": -0.30,
                    "beta_hml": 0.25,
                    "beta_rmw": 0.10,
                    "beta_cma": -0.05,
                    "beta_mom": 0.20,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                }
            ]
        )
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            print_attribution_summary(summary)
        output = buffer.getvalue()
        self.assertIn("beta_smb_proxy", output)
        self.assertIn("SMB_PROXY -0.30", output)
        self.assertIn("MKT 0.85", output)
        # Even the well-formed custom label must NOT be applied, because the
        # semantics map as a whole was rejected.
        self.assertNotIn("beta_size_proxy_custom", output)
        self.assertNotIn("!!! 0.85", output)

    def test_print_attribution_summary_ignores_labels_that_normalize_to_empty_suffix(self):
        """Labels of underscores only ("___") normalize to an empty suffix.

        Such a label cannot build a usable column name, so the full default
        FF5 label set is used instead.
        """
        summary = pd.DataFrame(
            [
                {
                    "strategy": "US Strategy",
                    "market": "us",
                    "model": "ff5",
                    "factor_source": "external+local",
                    "proxy_only": False,
                    "beta_semantics": json.dumps(
                        {
                            # Normalizes to "" — invalid suffix.
                            "beta_mkt": "___",
                            "beta_smb": "SIZE",
                            "beta_hml": "VALUE",
                            "beta_rmw": "QUALITY",
                            "beta_cma": "INVESTMENT",
                            "beta_mom": "MOMENTUM",
                            "beta_lowvol": "MINVOL",
                            "beta_recovery": "BOUNCE",
                        }
                    ),
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0004,
                    "alpha_ann": 0.1008,
                    "alpha_t_stat": 2.1,
                    "alpha_p_value": 0.04,
                    "r_squared": 0.82,
                    "adj_r_squared": 0.81,
                    "residual_vol_ann": 0.12,
                    "beta_mkt": 1.05,
                    "beta_smb": -0.20,
                    "beta_hml": 0.30,
                    "beta_rmw": 0.05,
                    "beta_cma": np.nan,
                    "beta_mom": np.nan,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                }
            ]
        )
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            print_attribution_summary(summary)
        output = buffer.getvalue()
        self.assertIn("beta_smb", output)
        self.assertIn("MKT_RF 1.05", output)
        self.assertNotIn("beta_size", output)
        self.assertNotIn("___ 1.05", output)

    def test_print_attribution_summary_splits_standard_and_proxy_sections_for_mixed_frames(self):
        """A frame mixing standard and proxy rows prints two sections.

        One US/FF5 row and one CN/proxy row: each should appear under its own
        heading with its own beta column naming (beta_smb vs beta_smb_proxy).
        """
        summary = pd.DataFrame(
            [
                {
                    "strategy": "US Strategy",
                    "market": "us",
                    "model": "ff5",
                    "factor_source": "external+local",
                    "proxy_only": False,
                    # Semantics exactly match the defaults for the FF5 model.
                    "beta_semantics": json.dumps(
                        {
                            "beta_mkt": "MKT_RF",
                            "beta_smb": "SMB",
                            "beta_hml": "HML",
                            "beta_rmw": "RMW",
                            "beta_cma": "CMA",
                            "beta_mom": "MOM",
                            "beta_lowvol": "LOWVOL",
                            "beta_recovery": "RECOVERY",
                        }
                    ),
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0004,
                    "alpha_ann": 0.1008,
                    "alpha_t_stat": 2.1,
                    "alpha_p_value": 0.04,
                    "r_squared": 0.82,
                    "adj_r_squared": 0.81,
                    "residual_vol_ann": 0.12,
                    "beta_mkt": 1.05,
                    "beta_smb": -0.20,
                    "beta_hml": 0.30,
                    "beta_rmw": 0.05,
                    "beta_cma": np.nan,
                    "beta_mom": np.nan,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                },
                {
                    "strategy": "CN Strategy",
                    "market": "cn",
                    "model": "proxy",
                    "factor_source": "proxy_only",
                    "proxy_only": True,
                    # Semantics exactly match the defaults for the proxy model.
                    "beta_semantics": json.dumps(
                        {
                            "beta_mkt": "MKT",
                            "beta_smb": "SMB_PROXY",
                            "beta_hml": "HML_PROXY",
                            "beta_rmw": "RMW_PROXY",
                            "beta_cma": "CMA_PROXY",
                            "beta_mom": "MOM",
                            "beta_lowvol": "LOWVOL",
                            "beta_recovery": "RECOVERY",
                        }
                    ),
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0002,
                    "alpha_ann": 0.0504,
                    "alpha_t_stat": 1.5,
                    "alpha_p_value": 0.12,
                    "r_squared": 0.72,
                    "adj_r_squared": 0.70,
                    "residual_vol_ann": 0.14,
                    "beta_mkt": 0.85,
                    "beta_smb": -0.30,
                    "beta_hml": 0.25,
                    "beta_rmw": 0.10,
                    "beta_cma": -0.05,
                    "beta_mom": 0.20,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                },
            ]
        )
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            print_attribution_summary(summary)
        output = buffer.getvalue()
        self.assertIn("Standard factor attribution", output)
        self.assertIn("Proxy factor attribution", output)
        self.assertIn("beta_smb_proxy", output)
        # Trailing space distinguishes the plain "beta_smb" column header
        # from the longer "beta_smb_proxy" one.
        self.assertIn("beta_smb ", output)

    def test_print_attribution_summary_ignores_mismatched_beta_semantics_in_mixed_frames(self):
        """Per-row semantics that don't fit that row's model are discarded.

        Row 1 (US/FF5) carries proxy-style labels plus an unexpected "extra"
        key; row 2 (CN/proxy) carries an incomplete standard-style map. Both
        should be rejected, leaving each row with its model's default labels.
        """
        summary = pd.DataFrame(
            [
                {
                    "strategy": "US Strategy",
                    "market": "us",
                    "model": "ff5",
                    "factor_source": "external+local",
                    "proxy_only": False,
                    # Proxy labels on an ff5 row, plus an unknown key — the
                    # whole map is expected to be ignored.
                    "beta_semantics": json.dumps(
                        {
                            "beta_mkt": "MKT",
                            "beta_smb": "SMB_PROXY",
                            "beta_hml": "HML_PROXY",
                            "beta_rmw": "RMW_PROXY",
                            "beta_cma": "CMA_PROXY",
                            "beta_mom": "MOM",
                            "beta_lowvol": "LOWVOL",
                            "beta_recovery": "RECOVERY",
                            "extra": "BAD",
                        }
                    ),
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0004,
                    "alpha_ann": 0.1008,
                    "alpha_t_stat": 2.1,
                    "alpha_p_value": 0.04,
                    "r_squared": 0.82,
                    "adj_r_squared": 0.81,
                    "residual_vol_ann": 0.12,
                    "beta_mkt": 1.05,
                    "beta_smb": -0.20,
                    "beta_hml": 0.30,
                    "beta_rmw": 0.05,
                    "beta_cma": np.nan,
                    "beta_mom": np.nan,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                },
                {
                    "strategy": "CN Strategy",
                    "market": "cn",
                    "model": "proxy",
                    "factor_source": "proxy_only",
                    "proxy_only": True,
                    # Incomplete map (only two keys) on a proxy row.
                    "beta_semantics": json.dumps({"beta_smb": "SMB", "beta_hml": "HML"}),
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0002,
                    "alpha_ann": 0.0504,
                    "alpha_t_stat": 1.5,
                    "alpha_p_value": 0.12,
                    "r_squared": 0.72,
                    "adj_r_squared": 0.70,
                    "residual_vol_ann": 0.14,
                    "beta_mkt": 0.85,
                    "beta_smb": -0.30,
                    "beta_hml": 0.25,
                    "beta_rmw": 0.10,
                    "beta_cma": -0.05,
                    "beta_mom": 0.20,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                },
            ]
        )
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            print_attribution_summary(summary)
        output = buffer.getvalue()
        self.assertIn("Standard factor attribution", output)
        self.assertIn("Proxy factor attribution", output)
        # Default labels win in both sections.
        self.assertIn("MKT_RF 1.05", output)
        self.assertIn("SMB_PROXY -0.30", output)
        self.assertIn("beta_smb_proxy", output)
        # The mismatched proxy label must not leak into the US row.
        self.assertNotIn("HML_PROXY 0.30", output)

    def _make_price_frame(self, dates: pd.DatetimeIndex, benchmark: str) -> pd.DataFrame:
        """Build a deterministic synthetic price frame for regression tests.

        Six symbols follow smooth exponential paths (drift plus a sinusoidal
        wiggle, so returns vary but are fully reproducible), and a benchmark
        column is appended under the given name.
        """
        steps = np.arange(len(dates), dtype=float)
        data = {}
        # (symbol, base price, daily drift, sine amplitude, sine period scale,
        #  phase offset) — distinct parameters keep the series decorrelated.
        for symbol, base, drift, amplitude, frequency, phase in (
            ("AAA", 45.0, 0.0005, 0.030, 19.0, 0.1),
            ("BBB", 60.0, 0.0002, 0.025, 23.0, 0.8),
            ("CCC", 75.0, -0.0001, 0.035, 17.0, 1.4),
            ("DDD", 90.0, 0.0007, 0.020, 29.0, 0.5),
            ("EEE", 55.0, -0.0002, 0.028, 31.0, 1.9),
            ("FFF", 70.0, 0.0004, 0.032, 21.0, 2.5),
        ):
            # Price path is exp(log-path) so prices stay strictly positive.
            log_path = drift * steps + amplitude * np.sin(steps / frequency + phase)
            data[symbol] = base * np.exp(log_path)
        benchmark_path = 0.0004 * steps + 0.018 * np.sin(steps / 27.0 + 0.3)
        data[benchmark] = 250.0 * np.exp(benchmark_path)
        return pd.DataFrame(data, index=dates)