# Unit tests for ``factor_attribution``: external Ken French factor loading
# (download, parse, cache fallback), local factor construction, the OLS
# regression helper, and the end-to-end attribution/export/print entry points.

import contextlib
import http.client
import io
import socket
import ssl
import tempfile
import unittest
import zipfile
from pathlib import Path
from unittest import mock
from urllib.error import URLError

import numpy as np
import pandas as pd

from factor_attribution import (
    ExternalFactorDownloadError,
    ExternalFactorFormatError,
    KEN_FRENCH_DAILY_FF5_ZIP_URL,
    _download_kf_zip_bytes,
    _parse_kf_daily_csv,
    attribute_strategies,
    build_extension_factors,
    build_proxy_core_factors,
    export_attribution,
    load_external_us_factors,
    print_attribution_summary,
    prepare_factor_models,
    run_factor_regression,
)


def _synthetic_price_frame(dates, symbol_specs, benchmark: str) -> pd.DataFrame:
    """Build a deterministic price frame of drifting sinusoidal log-paths.

    Args:
        dates: Index for the resulting frame.
        symbol_specs: Iterable of
            ``(symbol, base, drift, amplitude, frequency, phase)`` tuples; each
            yields the column ``base * exp(drift * t + amplitude * sin(t / frequency + phase))``
            where ``t`` is the row position.
        benchmark: Name of the benchmark column, appended after the symbols.

    Returns:
        A frame with one column per symbol followed by the benchmark column.
    """
    steps = np.arange(len(dates), dtype=float)
    data = {}
    for symbol, base, drift, amplitude, frequency, phase in symbol_specs:
        log_path = drift * steps + amplitude * np.sin(steps / frequency + phase)
        data[symbol] = base * np.exp(log_path)
    benchmark_path = 0.0004 * steps + 0.018 * np.sin(steps / 27.0 + 0.3)
    data[benchmark] = 250.0 * np.exp(benchmark_path)
    return pd.DataFrame(data, index=dates)


class ExternalFactorLoaderTests(unittest.TestCase):
    """Tests for downloading, parsing and caching the external FF5 factors."""

    KF_CSV_NAME = "F-F_Research_Data_5_Factors_2x3_daily.csv"
    MALFORMED_CSV_TEXT = "not the expected file format\n20260102,1.00\n"
    SINGLE_ROW_CSV_TEXT = (
        "This line is ignored\n"
        ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
        "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
        "\n"
    )

    def test_download_kf_zip_bytes_fetches_official_ken_french_zip(self):
        # The downloader must hit the published URL with a 30 second timeout
        # and return the response body untouched.
        response = self._make_urlopen_response(payload=b"zip-bytes")

        with mock.patch("factor_attribution.urlopen", return_value=response) as mocked_urlopen:
            raw_bytes = _download_kf_zip_bytes()

        self.assertEqual(raw_bytes, b"zip-bytes")
        request = mocked_urlopen.call_args.args[0]
        self.assertEqual(request.full_url, KEN_FRENCH_DAILY_FF5_ZIP_URL)
        self.assertEqual(mocked_urlopen.call_args.kwargs["timeout"], 30)

    def test_download_kf_zip_bytes_wraps_transport_errors(self):
        # Every transport-level failure raised by urlopen should surface as
        # ExternalFactorDownloadError.
        transport_errors = (
            URLError("boom"),
            TimeoutError("timed out"),
            ConnectionError("conn reset"),
            socket.timeout("socket timed out"),
            socket.gaierror("dns failed"),
            ssl.SSLError("tls failed"),
        )
        for error in transport_errors:
            with self.subTest(error_type=type(error).__name__):
                with mock.patch("factor_attribution.urlopen", side_effect=error):
                    with self.assertRaises(ExternalFactorDownloadError):
                        _download_kf_zip_bytes()

    def test_download_kf_zip_bytes_wraps_incomplete_read_errors(self):
        # A truncated body (failure inside response.read) is also wrapped.
        response = self._make_urlopen_response(
            read_error=http.client.IncompleteRead(b"partial", 10),
        )

        with mock.patch("factor_attribution.urlopen", return_value=response):
            with self.assertRaises(ExternalFactorDownloadError):
                _download_kf_zip_bytes()

    def test_load_external_us_factors_parses_percent_values_and_dates_from_zip_payload(self):
        # Percent inputs (1.00, 0.02) are expected back as decimals
        # (0.01, 0.0002), indexed by the parsed YYYYMMDD date.
        csv_text = (
            "This line is ignored\n"
            ",Mkt-RF,SMB,HML,RMW,CMA,RF\n"
            "20260102,1.00,0.50,-0.25,0.10,-0.05,0.02\n"
            "20260105,-0.20,0.10,0.30,-0.15,0.05,0.02\n"
            "\n"
        )
        zip_bytes = self._make_zip_bytes(self.KF_CSV_NAME, csv_text)

        with tempfile.TemporaryDirectory() as tmpdir:
            with mock.patch("factor_attribution._download_kf_zip_bytes", return_value=zip_bytes):
                factors = load_external_us_factors(cache_dir=Path(tmpdir))

        self.assertListEqual(
            list(factors.columns),
            ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"],
        )
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)
        self.assertAlmostEqual(factors.iloc[0]["RF"], 0.0002)
        self.assertEqual(str(factors.index[0].date()), "2026-01-02")

    def test_load_external_us_factors_falls_back_to_cache_when_download_fails(self):
        self._assert_cache_fallback(
            "factor_attribution.urlopen",
            side_effect=socket.gaierror("dns failed"),
        )

    def test_load_external_us_factors_falls_back_to_cache_when_download_read_is_incomplete(self):
        response = self._make_urlopen_response(
            read_error=http.client.IncompleteRead(b"partial", 10),
        )
        self._assert_cache_fallback("factor_attribution.urlopen", return_value=response)

    def test_load_external_us_factors_falls_back_to_cache_when_http_status_line_is_bad(self):
        self._assert_cache_fallback(
            "factor_attribution.urlopen",
            side_effect=http.client.BadStatusLine("HTTP/1.1 ???"),
        )

    def test_parse_kf_daily_csv_raises_external_factor_format_error_for_missing_header(self):
        zip_bytes = self._make_zip_bytes(self.KF_CSV_NAME, self.MALFORMED_CSV_TEXT)

        with self.assertRaises(ExternalFactorFormatError):
            _parse_kf_daily_csv(zip_bytes)

    def test_load_external_us_factors_warns_and_falls_back_to_cache_when_source_format_is_invalid(self):
        malformed_zip_bytes = self._make_zip_bytes(self.KF_CSV_NAME, self.MALFORMED_CSV_TEXT)
        self._assert_cache_fallback(
            "factor_attribution._download_kf_zip_bytes",
            return_value=malformed_zip_bytes,
        )

    def test_load_external_us_factors_warns_and_falls_back_to_cache_when_zip_is_invalid(self):
        self._assert_cache_fallback(
            "factor_attribution._download_kf_zip_bytes",
            return_value=b"not-a-zip-file",
        )

    def test_load_external_us_factors_surfaces_cache_write_failures(self):
        # A failure while writing the cache must propagate, not be hidden.
        zip_bytes = self._make_zip_bytes(self.KF_CSV_NAME, self.SINGLE_ROW_CSV_TEXT)

        with tempfile.TemporaryDirectory() as tmpdir:
            with mock.patch("factor_attribution._download_kf_zip_bytes", return_value=zip_bytes):
                with mock.patch("pandas.DataFrame.to_csv", side_effect=OSError("disk full")):
                    with self.assertRaises(OSError):
                        load_external_us_factors(cache_dir=Path(tmpdir))

    def test_load_external_us_factors_does_not_swallow_unrelated_local_failures(self):
        # Even with a usable cache on disk, an unexpected RuntimeError from the
        # parser must reach the caller rather than trigger the cache fallback.
        zip_bytes = self._make_zip_bytes(self.KF_CSV_NAME, self.SINGLE_ROW_CSV_TEXT)

        with tempfile.TemporaryDirectory() as tmpdir:
            cache_dir = Path(tmpdir)
            self._write_cached_factors(cache_dir)

            with mock.patch("factor_attribution._download_kf_zip_bytes", return_value=zip_bytes):
                with mock.patch(
                    "factor_attribution._parse_kf_daily_csv",
                    side_effect=RuntimeError("unexpected local bug"),
                ):
                    with self.assertRaises(RuntimeError):
                        load_external_us_factors(cache_dir=cache_dir)

    def _assert_cache_fallback(self, patch_target: str, **patch_kwargs) -> None:
        """Assert the loader warns and serves the cached row under a patch.

        Seeds a temporary cache directory, applies
        ``mock.patch(patch_target, **patch_kwargs)``, and checks that
        ``load_external_us_factors`` emits a ``UserWarning`` matching
        ``"cached data"`` and returns the single cached row.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            cache_dir = Path(tmpdir)
            self._write_cached_factors(cache_dir)

            with mock.patch(patch_target, **patch_kwargs):
                with self.assertWarnsRegex(UserWarning, "cached data"):
                    factors = load_external_us_factors(cache_dir=cache_dir)

        self.assertEqual(len(factors), 1)
        self.assertAlmostEqual(factors.iloc[0]["MKT_RF"], 0.01)

    def _write_cached_factors(self, cache_dir: Path) -> None:
        """Write the one-row ``ff5_us_daily.csv`` cache fixture into *cache_dir*."""
        cached = pd.DataFrame(
            {
                "MKT_RF": [0.01],
                "SMB": [0.0],
                "HML": [0.0],
                "RMW": [0.0],
                "CMA": [0.0],
                "RF": [0.0001],
            },
            index=pd.to_datetime(["2026-01-02"]),
        )
        cached.to_csv(cache_dir / "ff5_us_daily.csv")

    def _make_urlopen_response(self, *, payload=None, read_error=None) -> mock.MagicMock:
        """Return a context-manager mock standing in for a ``urlopen`` response.

        ``read()`` raises *read_error* when given, otherwise returns *payload*.
        """
        response = mock.MagicMock()
        if read_error is not None:
            response.read.side_effect = read_error
        else:
            response.read.return_value = payload
        response.__enter__.return_value = response
        response.__exit__.return_value = False
        return response

    def _make_zip_bytes(self, filename: str, contents: str) -> bytes:
        """Return the bytes of an in-memory zip holding one member *filename*."""
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, mode="w") as archive:
            archive.writestr(filename, contents)
        return buffer.getvalue()


class LocalFactorConstructionTests(unittest.TestCase):
    """Tests for factors built locally from a price frame."""

    EXTENSION_COLUMNS = ["MOM", "LOWVOL", "RECOVERY"]
    PROXY_COLUMNS = ["SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY"]
    CORE_COLUMNS = ["MKT"] + PROXY_COLUMNS
    SYMBOL_SPECS = (
        ("A", 45.0, 0.0006, 0.030, 19.0, 0.1),
        ("B", 60.0, 0.0003, 0.025, 23.0, 0.8),
        ("C", 75.0, -0.0002, 0.035, 17.0, 1.4),
        ("D", 90.0, 0.0008, 0.020, 29.0, 0.5),
        ("E", 55.0, -0.0001, 0.028, 31.0, 1.9),
        ("F", 70.0, 0.0005, 0.032, 21.0, 2.5),
    )

    def test_build_extension_factors_returns_expected_columns_with_non_null_values_after_warmup(self):
        prices = self._make_price_frame(benchmark="SPY")

        factors = build_extension_factors(prices, benchmark="SPY", market="us")

        self.assertListEqual(list(factors.columns), self.EXTENSION_COLUMNS)
        # Rows from position 260 onward are treated as past the warm-up window.
        after_warmup = factors.iloc[260:]
        self.assertTrue(after_warmup.notna().all().all())
        self.assertGreater(after_warmup.abs().sum().sum(), 0.0)

    def test_build_proxy_core_factors_returns_expected_columns_with_non_null_values_after_warmup(self):
        prices = self._make_price_frame(benchmark="000300.SS")

        factors = build_proxy_core_factors(prices, benchmark="000300.SS", market="cn")

        self.assertListEqual(list(factors.columns), self.CORE_COLUMNS)
        after_warmup = factors.iloc[260:]
        self.assertTrue(after_warmup.notna().all().all())
        self.assertGreater(after_warmup.abs().sum().sum(), 0.0)

    def test_build_extension_factors_does_not_use_future_prices(self):
        # Look-ahead check: perturbing prices from row 280 onward must leave
        # every factor value up to row 279 untouched, while later values move.
        prices = self._make_price_frame(benchmark="SPY")
        mutated = prices.copy()
        future_start = prices.index[280]
        mutated.loc[future_start:, "A"] = mutated.loc[future_start:, "A"] * 1.8
        mutated.loc[future_start:, "B"] = mutated.loc[future_start:, "B"] * 0.4

        original = build_extension_factors(prices, benchmark="SPY", market="us")
        changed = build_extension_factors(mutated, benchmark="SPY", market="us")

        comparison_end = prices.index[279]
        pd.testing.assert_frame_equal(original.loc[:comparison_end], changed.loc[:comparison_end])
        self.assertGreater(
            (original.loc[future_start:] - changed.loc[future_start:]).abs().sum().sum(),
            0.0,
        )

    def test_build_proxy_core_factors_market_branch_does_not_use_future_benchmark_prices(self):
        # Only the benchmark is perturbed, so MKT may change after row 280
        # while the proxy columns must be identical over the full sample.
        prices = self._make_price_frame(benchmark="000300.SS")
        mutated = prices.copy()
        future_start = prices.index[280]
        mutated.loc[future_start:, "000300.SS"] = mutated.loc[future_start:, "000300.SS"] * 1.4

        original = build_proxy_core_factors(prices, benchmark="000300.SS", market="cn")
        changed = build_proxy_core_factors(mutated, benchmark="000300.SS", market="cn")

        comparison_end = prices.index[279]
        pd.testing.assert_series_equal(
            original.loc[:comparison_end, "MKT"],
            changed.loc[:comparison_end, "MKT"],
            check_names=False,
        )
        proxy_columns = self.PROXY_COLUMNS
        pd.testing.assert_frame_equal(
            original.loc[:, proxy_columns],
            changed.loc[:, proxy_columns],
        )
        self.assertGreater(
            (original.loc[future_start:, "MKT"] - changed.loc[future_start:, "MKT"]).abs().sum(),
            0.0,
        )

    def test_build_proxy_core_factors_proxy_columns_do_not_use_future_stock_prices(self):
        prices = self._make_price_frame(benchmark="000300.SS")
        mutated = prices.copy()
        future_start = prices.index[280]
        mutated.loc[future_start:, "C"] = mutated.loc[future_start:, "C"] * 0.35
        mutated.loc[future_start:, "D"] = mutated.loc[future_start:, "D"] * 1.6

        original = build_proxy_core_factors(prices, benchmark="000300.SS", market="cn")
        changed = build_proxy_core_factors(mutated, benchmark="000300.SS", market="cn")

        comparison_end = prices.index[279]
        proxy_columns = self.PROXY_COLUMNS
        pd.testing.assert_frame_equal(
            original.loc[:comparison_end, proxy_columns],
            changed.loc[:comparison_end, proxy_columns],
        )
        self.assertGreater(
            (original.loc[future_start:, proxy_columns] - changed.loc[future_start:, proxy_columns])
            .abs()
            .sum()
            .sum(),
            0.0,
        )

    def test_build_proxy_core_factors_falls_back_to_equal_weight_market_when_benchmark_missing(self):
        # With the requested benchmark absent from the frame, MKT is expected
        # to equal the equal-weight mean of the remaining columns' returns, and
        # the proxy columns to match a run where the benchmark is present.
        prices_with_benchmark = self._make_price_frame(benchmark="CN_BENCH")
        prices = prices_with_benchmark.drop(columns=["CN_BENCH"])

        factors = build_proxy_core_factors(prices, benchmark="000300.SS", market="cn")
        reference = build_proxy_core_factors(prices_with_benchmark, benchmark="CN_BENCH", market="cn")

        expected_market = prices.pct_change().mean(axis=1)
        pd.testing.assert_series_equal(factors["MKT"], expected_market, check_names=False)
        self.assertListEqual(list(factors.columns), self.CORE_COLUMNS)

        proxy_columns = self.PROXY_COLUMNS
        proxies_after_warmup = factors.iloc[260:][proxy_columns]
        self.assertTrue(proxies_after_warmup.notna().all().all())
        self.assertGreater(proxies_after_warmup.abs().sum().sum(), 0.0)
        pd.testing.assert_frame_equal(factors[proxy_columns], reference[proxy_columns])

    def _make_price_frame(self, benchmark: str) -> pd.DataFrame:
        """Return 320 business days of synthetic prices for A-F plus *benchmark*."""
        dates = pd.date_range("2025-01-01", periods=320, freq="B")
        return _synthetic_price_frame(dates, self.SYMBOL_SPECS, benchmark)


class RegressionTests(unittest.TestCase):
    """Tests for ``run_factor_regression`` and ``prepare_factor_models``."""

    def test_run_factor_regression_recovers_known_coefficients(self):
        dates = pd.date_range("2024-01-01", periods=300, freq="B")
        angles = np.linspace(0.0, 18.0, len(dates))
        factors = pd.DataFrame(
            {
                "MKT_RF": 0.012 * np.sin(angles),
                "SMB": 0.007 * np.cos(angles * 0.7) + np.linspace(-0.002, 0.003, len(dates)),
                "RF": np.full(len(dates), 0.0001),
            },
            index=dates,
        )
        # Blank the first four SMB rows; the assertions below expect them to
        # be excluded (296 observations starting 2024-01-05).
        factors.loc[dates[:4], "SMB"] = np.nan
        strategy = 0.0005 + 1.2 * factors["MKT_RF"] + 0.4 * factors["SMB"] + factors["RF"]

        result = run_factor_regression(
            strategy,
            factors,
            factor_cols=["MKT_RF", "SMB"],
            risk_free_col="RF",
        )

        self.assertAlmostEqual(result["alpha_daily"], 0.0005, places=6)
        self.assertAlmostEqual(result["betas"]["MKT_RF"], 1.2, places=6)
        self.assertAlmostEqual(result["betas"]["SMB"], 0.4, places=6)
        self.assertGreater(result["r_squared"], 0.999999)
        self.assertEqual(result["start_date"], "2024-01-05")
        self.assertEqual(result["end_date"], "2025-02-21")
        self.assertEqual(result["n_obs"], 296)

    def test_run_factor_regression_rejects_underdetermined_designs(self):
        # Two observations cannot identify an intercept plus two betas.
        dates = pd.date_range("2024-01-01", periods=2, freq="B")
        factors = pd.DataFrame(
            {
                "MKT_RF": [0.01, -0.02],
                "SMB": [0.005, 0.004],
            },
            index=dates,
        )
        strategy = pd.Series([0.012, -0.018], index=dates)

        with self.assertRaisesRegex(ValueError, "Insufficient observations"):
            run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"])

    def test_run_factor_regression_allows_square_full_rank_design_without_inference(self):
        # Three observations for three parameters: coefficients are recovered
        # exactly, but every inference statistic is expected to be NaN.
        dates = pd.date_range("2024-01-01", periods=3, freq="B")
        factors = pd.DataFrame(
            {
                "MKT_RF": [0.0, 1.0, 0.0],
                "SMB": [0.0, 0.0, 1.0],
            },
            index=dates,
        )
        strategy = pd.Series([0.0005, 1.2005, -0.3995], index=dates)

        result = run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"])

        self.assertAlmostEqual(result["alpha_daily"], 0.0005, places=10)
        self.assertAlmostEqual(result["betas"]["MKT_RF"], 1.2, places=10)
        self.assertAlmostEqual(result["betas"]["SMB"], -0.4, places=10)
        self.assertEqual(result["r_squared"], 1.0)
        self.assertTrue(np.isnan(result["alpha_t_stat"]))
        self.assertTrue(np.isnan(result["alpha_p_value"]))
        self.assertTrue(np.isnan(result["t_stats"]["MKT_RF"]))
        self.assertTrue(np.isnan(result["t_stats"]["SMB"]))
        self.assertTrue(np.isnan(result["p_values"]["MKT_RF"]))
        self.assertTrue(np.isnan(result["p_values"]["SMB"]))
        self.assertTrue(np.isnan(result["adj_r_squared"]))
        self.assertAlmostEqual(result["residual_vol_ann"], 0.0, places=12)

    def test_run_factor_regression_single_observation_intercept_only_has_zero_residual_vol(self):
        dates = pd.date_range("2024-01-01", periods=1, freq="B")
        factors = pd.DataFrame(index=dates)
        strategy = pd.Series([0.0015], index=dates)

        result = run_factor_regression(strategy, factors, factor_cols=[])

        self.assertAlmostEqual(result["alpha_daily"], 0.0015, places=12)
        self.assertEqual(result["betas"], {})
        self.assertEqual(result["t_stats"], {})
        self.assertEqual(result["p_values"], {})
        self.assertEqual(result["r_squared"], 0.0)
        self.assertTrue(np.isnan(result["alpha_t_stat"]))
        self.assertTrue(np.isnan(result["alpha_p_value"]))
        self.assertTrue(np.isnan(result["adj_r_squared"]))
        self.assertEqual(result["n_obs"], 1)
        self.assertAlmostEqual(result["residual_vol_ann"], 0.0, places=12)

    def test_run_factor_regression_rejects_rank_deficient_designs(self):
        # SMB is an exact multiple of MKT_RF, so the design is collinear.
        dates = pd.date_range("2024-01-01", periods=6, freq="B")
        market = np.array([0.01, -0.02, 0.015, 0.005, -0.01, 0.02])
        factors = pd.DataFrame(
            {
                "MKT_RF": market,
                "SMB": market * 2.0,
            },
            index=dates,
        )
        strategy = pd.Series(
            0.0005 + 1.0 * factors["MKT_RF"] + 0.5 * factors["SMB"],
            index=dates,
        )

        with self.assertRaisesRegex(ValueError, "rank-deficient"):
            run_factor_regression(strategy, factors, factor_cols=["MKT_RF", "SMB"])

    def test_prepare_factor_models_uses_proxy_family_without_external_us_factors(self):
        dates = pd.date_range("2024-01-01", periods=5, freq="B")
        extension = pd.DataFrame(
            {
                "MOM": np.linspace(0.001, 0.005, len(dates)),
                "LOWVOL": np.linspace(-0.002, 0.002, len(dates)),
                "RECOVERY": np.linspace(0.003, -0.001, len(dates)),
            },
            index=dates,
        )
        proxy = pd.DataFrame(
            {
                "MKT": np.linspace(-0.01, 0.01, len(dates)),
                "SMB_PROXY": np.linspace(0.002, 0.004, len(dates)),
                "HML_PROXY": np.linspace(-0.003, 0.001, len(dates)),
                "RMW_PROXY": np.linspace(0.005, 0.001, len(dates)),
                "CMA_PROXY": np.linspace(-0.004, -0.002, len(dates)),
            },
            index=dates,
        )

        prepared = prepare_factor_models(
            market="us",
            extension_factors=extension,
            proxy_factors=proxy,
            external_factors=None,
        )

        expected_columns = [
            "MKT",
            "SMB_PROXY",
            "HML_PROXY",
            "RMW_PROXY",
            "CMA_PROXY",
            "MOM",
            "LOWVOL",
            "RECOVERY",
        ]
        self.assertEqual(prepared["factor_source"], "proxy_only")
        self.assertIsNone(prepared["risk_free_col"])
        self.assertListEqual(list(prepared["models"]), ["proxy"])
        self.assertListEqual(prepared["models"]["proxy"], expected_columns)
        self.assertListEqual(list(prepared["factor_frame"].columns), expected_columns)


class AttributionIntegrationTests(unittest.TestCase):
    """End-to-end tests for attribution, CSV export and console summary."""

    SYMBOL_SPECS = (
        ("AAA", 45.0, 0.0005, 0.030, 19.0, 0.1),
        ("BBB", 60.0, 0.0002, 0.025, 23.0, 0.8),
        ("CCC", 75.0, -0.0001, 0.035, 17.0, 1.4),
        ("DDD", 90.0, 0.0007, 0.020, 29.0, 0.5),
        ("EEE", 55.0, -0.0002, 0.028, 31.0, 1.9),
        ("FFF", 70.0, 0.0004, 0.032, 21.0, 2.5),
    )

    def test_attribute_strategies_exports_standard_model_summary_and_loadings(self):
        dates = pd.date_range("2025-01-01", periods=320, freq="B")
        angles = np.linspace(0.0, 24.0, len(dates))
        factors = pd.DataFrame(
            {
                "MKT_RF": 0.010 * np.sin(angles),
                "SMB": 0.006 * np.cos(angles * 0.7),
                "HML": 0.004 * np.sin(angles * 1.3 + 0.4),
                "RMW": 0.003 * np.cos(angles * 1.1 + 0.2),
                "CMA": 0.002 * np.sin(angles * 0.5 + 0.8),
                "RF": np.full(len(dates), 0.0001),
            },
            index=dates,
        )
        strategy_returns = (
            0.0004
            + 1.10 * factors["MKT_RF"]
            - 0.25 * factors["SMB"]
            + 0.35 * factors["HML"]
            + 0.10 * factors["RMW"]
            - 0.05 * factors["CMA"]
            + factors["RF"]
        )
        benchmark_returns = 0.95 * factors["MKT_RF"] + factors["RF"]
        results = pd.DataFrame(
            {
                "Strategy": 100_000.0 * (1.0 + strategy_returns).cumprod(),
                "SPY (Benchmark)": 100_000.0 * (1.0 + benchmark_returns).cumprod(),
            },
            index=dates,
        )
        prices = self._make_price_frame(dates, benchmark="SPY")

        # Everything that touches the exported files must run while the
        # temporary directory still exists.
        with tempfile.TemporaryDirectory() as tmpdir:
            summary, loadings = attribute_strategies(
                results_df=results,
                benchmark_label="SPY (Benchmark)",
                benchmark="SPY",
                price_data=prices,
                market="us",
                model_selection="ff5",
                external_factors=factors,
            )
            export_attribution(summary, loadings, tmpdir)

            summary_path = Path(tmpdir) / "summary.csv"
            loadings_path = Path(tmpdir) / "loadings.csv"
            self.assertTrue(summary_path.exists())
            self.assertTrue(loadings_path.exists())
            exported_summary = pd.read_csv(summary_path)
            exported_loadings = pd.read_csv(loadings_path)

        self.assertEqual(len(summary), 1)
        self.assertListEqual(
            list(summary.columns),
            [
                "strategy",
                "market",
                "model",
                "factor_source",
                "proxy_only",
                "start_date",
                "end_date",
                "n_obs",
                "alpha_daily",
                "alpha_ann",
                "alpha_t_stat",
                "alpha_p_value",
                "r_squared",
                "adj_r_squared",
                "residual_vol_ann",
                "beta_mkt",
                "beta_smb",
                "beta_hml",
                "beta_rmw",
                "beta_cma",
                "beta_mom",
                "beta_lowvol",
                "beta_recovery",
            ],
        )
        self.assertEqual(summary.loc[0, "strategy"], "Strategy")
        self.assertEqual(summary.loc[0, "model"], "ff5")
        self.assertEqual(summary.loc[0, "factor_source"], "external+local")
        self.assertFalse(bool(summary.loc[0, "proxy_only"]))
        self.assertAlmostEqual(summary.loc[0, "beta_mkt"], 1.10, places=3)
        self.assertAlmostEqual(summary.loc[0, "beta_smb"], -0.25, places=3)
        self.assertAlmostEqual(summary.loc[0, "beta_hml"], 0.35, places=3)
        self.assertTrue(np.isnan(summary.loc[0, "beta_mom"]))
        self.assertListEqual(
            list(loadings.columns),
            [
                "strategy",
                "market",
                "model",
                "factor_source",
                "proxy_only",
                "factor",
                "beta",
                "t_stat",
                "p_value",
            ],
        )
        self.assertEqual(set(loadings["factor"]), {"MKT_RF", "SMB", "HML", "RMW", "CMA"})
        self.assertEqual(len(loadings), 5)
        # The CSV round trip should reproduce both frames (dtypes aside).
        pd.testing.assert_frame_equal(summary, exported_summary, check_dtype=False)
        pd.testing.assert_frame_equal(loadings, exported_loadings, check_dtype=False)

    def test_attribute_strategies_uses_proxy_model_for_cn_runs(self):
        dates = pd.date_range("2025-01-01", periods=320, freq="B")
        prices = self._make_price_frame(dates, benchmark="000300.SS")
        benchmark_returns = prices["000300.SS"].pct_change().fillna(0.0)
        returns = benchmark_returns * 0.7 + 0.0002
        results = pd.DataFrame(
            {
                "Strategy": 100_000.0 * (1.0 + returns).cumprod(),
                "CSI 300 (Benchmark)": 100_000.0 * (1.0 + benchmark_returns).cumprod(),
            },
            index=dates,
        )

        summary, loadings = attribute_strategies(
            results_df=results,
            benchmark_label="CSI 300 (Benchmark)",
            benchmark="000300.SS",
            price_data=prices,
            market="cn",
            model_selection="ff5",
            external_factors=None,
        )

        self.assertEqual(len(summary), 1)
        self.assertEqual(summary.loc[0, "model"], "proxy")
        self.assertEqual(summary.loc[0, "factor_source"], "proxy_only")
        self.assertTrue(bool(summary.loc[0, "proxy_only"]))
        # Proxy betas are reported under the standard beta_* column names,
        # never under *_proxy-suffixed columns.
        standard_beta_columns = ("beta_smb", "beta_hml", "beta_rmw", "beta_cma")
        for column in standard_beta_columns:
            self.assertNotIn(f"{column}_proxy", summary.columns)
        for column in standard_beta_columns:
            self.assertFalse(np.isnan(summary.loc[0, column]))
        self.assertEqual(
            set(loadings["factor"]),
            {"MKT", "SMB_PROXY", "HML_PROXY", "RMW_PROXY", "CMA_PROXY", "MOM", "LOWVOL", "RECOVERY"},
        )

    def test_attribute_strategies_without_benchmark_uses_equal_weight_proxy_market(self):
        # benchmark=None and a benchmark name missing from the price frame are
        # expected to produce identical attribution output.
        dates = pd.date_range("2025-01-01", periods=320, freq="B")
        prices = self._make_price_frame(dates, benchmark="000300.SS").drop(columns=["000300.SS"])
        equal_weight_returns = prices.pct_change().mean(axis=1).fillna(0.0)
        results = pd.DataFrame(
            {
                "Strategy": 100_000.0 * (1.0 + 0.0002 + 0.8 * equal_weight_returns).cumprod(),
                "External Benchmark": 100_000.0 * (1.0 + 0.0001 + 0.6 * equal_weight_returns).cumprod(),
            },
            index=dates,
        )

        summary_missing, loadings_missing = attribute_strategies(
            results_df=results,
            benchmark_label="External Benchmark",
            benchmark=None,
            price_data=prices,
            market="cn",
            model_selection="ff5",
            external_factors=None,
        )
        summary_explicit, loadings_explicit = attribute_strategies(
            results_df=results,
            benchmark_label="External Benchmark",
            benchmark="MISSING_BENCHMARK",
            price_data=prices,
            market="cn",
            model_selection="ff5",
            external_factors=None,
        )

        pd.testing.assert_frame_equal(summary_missing, summary_explicit, check_dtype=False)
        pd.testing.assert_frame_equal(loadings_missing, loadings_explicit, check_dtype=False)

    def test_print_attribution_summary_prints_compact_table_and_interpretation(self):
        summary = pd.DataFrame(
            [
                {
                    "strategy": "Strategy",
                    "market": "us",
                    "model": "ff5",
                    "factor_source": "external+local",
                    "proxy_only": False,
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0004,
                    "alpha_ann": 0.1008,
                    "alpha_t_stat": 2.1,
                    "alpha_p_value": 0.04,
                    "r_squared": 0.82,
                    "adj_r_squared": 0.81,
                    "residual_vol_ann": 0.12,
                    "beta_mkt": 1.05,
                    "beta_smb": -0.20,
                    "beta_hml": 0.30,
                    "beta_rmw": 0.05,
                    "beta_cma": np.nan,
                    "beta_mom": np.nan,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                }
            ]
        )

        output = self._capture_printed_summary(summary)

        self.assertIn("Factor attribution", output)
        self.assertIn("Strategy", output)
        self.assertIn("ff5", output)
        self.assertIn("alpha_ann", output)
        self.assertIn("Interpretation", output)

    def test_print_attribution_summary_keeps_proxy_factor_labels_in_output(self):
        summary = pd.DataFrame(
            [
                {
                    "strategy": "Strategy",
                    "market": "cn",
                    "model": "proxy",
                    "factor_source": "proxy_only",
                    "proxy_only": True,
                    "start_date": "2025-01-02",
                    "end_date": "2026-03-24",
                    "n_obs": 319,
                    "alpha_daily": 0.0002,
                    "alpha_ann": 0.0504,
                    "alpha_t_stat": 1.5,
                    "alpha_p_value": 0.12,
                    "r_squared": 0.72,
                    "adj_r_squared": 0.70,
                    "residual_vol_ann": 0.14,
                    "beta_mkt": 0.85,
                    "beta_smb": -0.30,
                    "beta_hml": 0.25,
                    "beta_rmw": 0.10,
                    "beta_cma": -0.05,
                    "beta_mom": 0.20,
                    "beta_lowvol": np.nan,
                    "beta_recovery": np.nan,
                }
            ]
        )

        output = self._capture_printed_summary(summary)

        self.assertIn("beta_smb_proxy", output)
        self.assertIn("beta_hml_proxy", output)
        self.assertIn("SMB_PROXY", output)
        self.assertNotIn(" beta_smb ", output)

    def _capture_printed_summary(self, summary: pd.DataFrame) -> str:
        """Return everything ``print_attribution_summary`` writes to stdout."""
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            print_attribution_summary(summary)
        return buffer.getvalue()

    def _make_price_frame(self, dates: pd.DatetimeIndex, benchmark: str) -> pd.DataFrame:
        """Return synthetic prices for AAA-FFF plus *benchmark*, indexed by *dates*."""
        return _synthetic_price_frame(dates, self.SYMBOL_SPECS, benchmark)