""" Iterative Factor Discovery Loop. Round 1: Academic & practitioner hypotheses (30+ factors) Round 2: Data-driven variations on Round 1 winners Round 3: Interaction and conditional factors Round 4: Parameter optimization on finalists Round 5: Best combinations Each factor is tested immediately as a top-10 equal-weight strategy with monthly rebalancing and 10bps transaction costs. """ from __future__ import annotations import argparse import warnings from typing import Callable import numpy as np import pandas as pd import data_manager from universe import UNIVERSES warnings.filterwarnings("ignore") FactorFunc = Callable[[pd.DataFrame], pd.DataFrame] # --------------------------------------------------------------------------- # Backtest infrastructure # --------------------------------------------------------------------------- def strat( prices: pd.DataFrame, signal_func: FactorFunc, top_n: int = 10, rebal: int = 21, warmup: int = 252, ) -> pd.DataFrame: sig = signal_func(prices) rank = sig.rank(axis=1, ascending=False, na_option="bottom") n_valid = sig.notna().sum(axis=1) enough = n_valid >= top_n mask = (rank <= top_n) & enough.values.reshape(-1, 1) raw = mask.astype(float) w = raw.div(raw.sum(axis=1).replace(0, np.nan), axis=0).fillna(0.0) rmask = pd.Series(False, index=prices.index) rmask.iloc[list(range(warmup, len(prices), rebal))] = True w[~rmask] = np.nan w = w.ffill().fillna(0.0) w.iloc[:warmup] = 0.0 return w.shift(1).fillna(0.0) def bt(weights: pd.DataFrame, prices: pd.DataFrame, cost: float = 0.001) -> pd.Series: ret = prices.pct_change().fillna(0.0) pr = (weights * ret).sum(axis=1) pr -= weights.diff().abs().sum(axis=1) * cost return (1 + pr).cumprod() * 100000 def stats(eq: pd.Series) -> dict: dr = eq.pct_change().dropna() if len(dr) < 200 or dr.std() == 0: return {"cagr": np.nan, "sharpe": np.nan, "sortino": np.nan, "maxdd": np.nan, "calmar": np.nan} ny = len(dr) / 252 tot = eq.iloc[-1] / eq.iloc[0] - 1 cagr = (1 + tot) ** (1 / ny) - 1 sh = dr.mean() / dr.std() * np.sqrt(252) sd = dr[dr < 0].std() so = dr.mean() / sd * np.sqrt(252) if sd > 0 else 0 rm = eq.cummax() dd = ((eq - rm) / rm).min() cal = cagr / abs(dd) if dd != 0 else 0 return {"cagr": cagr, "sharpe": sh, "sortino": so, "maxdd": dd, "calmar": cal} def yearly(eq: pd.Series) -> dict[int, float]: dr = eq.pct_change().fillna(0) return {y: float((1 + dr[dr.index.year == y]).prod() - 1) for y in sorted(dr.index.year.unique())} def test_factor(name: str, func: FactorFunc, prices: pd.DataFrame, top_n: int = 10) -> dict: w = strat(prices, func, top_n=top_n) eq = bt(w, prices) s = stats(eq) s["name"] = name s["equity"] = eq return s def combo(fws: list[tuple[FactorFunc, float]]) -> FactorFunc: def _c(p): return sum(w * f(p).rank(axis=1, pct=True, na_option="keep") for f, w in fws) return _c def print_results(results: list[dict], title: str): df = pd.DataFrame([{k: v for k, v in r.items() if k != "equity"} for r in results]) df = df.set_index("name").sort_values("cagr", ascending=False) print(f"\n{'='*95}") print(f" {title}") print(f"{'='*95}") print(f" {'Factor':<45} {'CAGR':>7} {'Sharpe':>7} {'Sortino':>8} {'MaxDD':>7} {'Calmar':>7}") print(f" {'-'*85}") for name, row in df.iterrows(): flag = " <<<" if "BASELINE" in str(name) else "" c = row['cagr'] if np.isnan(c): continue print(f" {str(name):<45} {c:>+6.1%} {row['sharpe']:>7.2f} {row['sortino']:>8.2f} " f"{row['maxdd']:>+6.1%} {row['calmar']:>7.2f}{flag}") return df # ===================================================================== # ROUND 1 — Academic & 
# =====================================================================
# ROUND 1 — Academic & Practitioner Hypotheses
# =====================================================================

# --- Momentum family ---
def f_mom_12_1(p): return p.shift(21).pct_change(231)
def f_mom_6_1(p): return p.shift(21).pct_change(105)
def f_mom_3_1(p): return p.shift(21).pct_change(42)
def f_mom_1_0(p): return p.pct_change(21)  # 1-month (reversal in US)

# --- Recovery family ---
def f_rec_63(p): return p / p.rolling(63, min_periods=63).min() - 1
def f_rec_126(p): return p / p.rolling(126, min_periods=126).min() - 1
def f_rec_21(p): return p / p.rolling(21, min_periods=21).min() - 1

# Novy-Marx 2012: intermediate momentum (7-12 month)
def f_mom_intermediate(p): return p.shift(21).pct_change(147)  # ~7 months

# Asness et al.: quality/profitability proxy via return consistency
def f_consistent_returns(p):
    ret = p.pct_change()
    return (ret > 0).astype(float).rolling(252, min_periods=126).mean()

# Da, Liu, Schaumburg 2014: information discreteness.
# Stocks with many small positive days beat stocks with few large positive days.
def f_info_discrete(p):
    ret = p.pct_change()
    n_pos = (ret > 0).astype(float).rolling(60, min_periods=40).sum()
    sum_pos = ret.where(ret > 0, 0).rolling(60, min_periods=40).sum()
    avg_pos = sum_pos / n_pos.replace(0, np.nan)
    # High count of positive days + low average positive move = smooth accumulation
    return n_pos / avg_pos.replace(0, np.nan)

# Accumulation proxy (worked in Round 1)
def f_up_volume_proxy(p):
    ret = p.pct_change()
    return ret.where(ret > 0, 0).rolling(20, min_periods=15).sum()

# George & Hwang 2004: 52-week high ratio
def f_52w_high(p): return p / p.rolling(252, min_periods=126).max()

# Frequency of large up-moves (worked in Round 1)
def f_gap_up_freq(p):
    ret = p.pct_change()
    return (ret > 0.01).astype(float).rolling(60, min_periods=40).mean()

# Bali, Cakici, Whitelaw 2011: MAX effect (lottery demand)
def f_anti_max(p):
    ret = p.pct_change()
    return -ret.rolling(20, min_periods=15).max()

# Ang et al. 2006: idiosyncratic volatility (negative)
def f_neg_ivol(p):
    ret = p.pct_change()
    return -ret.rolling(20, min_periods=15).std()

# Blitz & van Vliet 2007: low-volatility anomaly
def f_low_vol_60(p):
    ret = p.pct_change()
    return -ret.rolling(60, min_periods=40).std()

# Hurst-exponent proxy — lag-1 autocorrelation of returns.
# Stocks with positive autocorrelation are trending.
def f_autocorrelation(p):
    ret = p.pct_change()
    def _ac(x):
        x = x.dropna().values
        if len(x) < 20:
            return np.nan
        return np.corrcoef(x[:-1], x[1:])[0, 1]
    return ret.rolling(60, min_periods=40).apply(_ac, raw=False)

# Short-term reversal (Jegadeesh 1990)
def f_str_5d(p): return -p.pct_change(5)
def f_str_10d(p): return -p.pct_change(10)

# Earnings-drift proxy (worked in Round 1)
def f_earnings_drift(p):
    ret_5d = p.pct_change(5)
    vol = p.pct_change().rolling(60, min_periods=40).std() * np.sqrt(5)
    z = ret_5d / vol.replace(0, np.nan)
    return z.rolling(60, min_periods=20).mean()

# Risk-adjusted momentum (Sharpe-momentum)
def f_sharpe_mom(p):
    ret = p.pct_change()
    mu = ret.rolling(252, min_periods=126).mean()
    sigma = ret.rolling(252, min_periods=126).std()
    return mu / sigma.replace(0, np.nan)

# Trend strength: slope of a log-price regression on time
def f_trend_slope(p):
    log_p = np.log(p.replace(0, np.nan))
    def _slope(x):
        x = x.dropna().values
        if len(x) < 30:
            return np.nan
        t = np.arange(len(x), dtype=float)
        t -= t.mean()
        return (t * (x - x.mean())).sum() / (t * t).sum()
    return log_p.rolling(60, min_periods=30).apply(_slope, raw=False)
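
# The per-window Python callables in f_trend_slope / f_autocorrelation are the
# slow path on wide universes. Below is a vectorized sketch of the same
# full-window OLS slope via numpy's sliding_window_view — an assumed optional
# optimization, not wired into the rounds; `f_trend_slope_fast` is a
# hypothetical name. Unlike f_trend_slope (min_periods=30 with dropna
# compaction), any window containing a NaN yields NaN here.
def f_trend_slope_fast(p: pd.DataFrame, window: int = 60) -> pd.DataFrame:
    log_p = np.log(p.replace(0, np.nan)).values
    out = np.full(log_p.shape, np.nan)
    if log_p.shape[0] < window:
        return pd.DataFrame(out, index=p.index, columns=p.columns)
    t = np.arange(window, dtype=float)
    t -= t.mean()                 # centered time index
    denom = (t * t).sum()
    # windows: shape (n_windows, n_cols, window)
    win = np.lib.stride_tricks.sliding_window_view(log_p, window, axis=0)
    x_c = win - win.mean(axis=-1, keepdims=True)
    out[window - 1:] = (x_c * t).sum(axis=-1) / denom
    return pd.DataFrame(out, index=p.index, columns=p.columns)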
# Acceleration: recent momentum vs. longer-term momentum
def f_mom_accel(p):
    m3 = p.shift(5).pct_change(58)     # ~3mo
    m12 = p.shift(21).pct_change(231)  # ~12mo
    return m3 - m12

# Mean-reversion z-score
def f_mean_rev_z(p):
    ma20 = p.rolling(20, min_periods=20).mean()
    vol = p.pct_change().rolling(60, min_periods=40).std() * p
    return -(p - ma20) / vol.replace(0, np.nan)

# Price relative to moving averages
def f_above_ma200(p): return p / p.rolling(200, min_periods=200).mean() - 1
def f_above_ma50(p): return p / p.rolling(50, min_periods=50).mean() - 1

# Dual MA signal: 50-day MA / 200-day MA
def f_golden_cross(p):
    ma50 = p.rolling(50, min_periods=50).mean()
    ma200 = p.rolling(200, min_periods=200).mean()
    return ma50 / ma200 - 1

# Drawdown recovery rate
def f_dd_recovery_rate(p):
    rm = p.rolling(252, min_periods=126).max()
    dd = p / rm - 1           # negative when in drawdown
    return dd - dd.shift(20)  # positive = recovering from drawdown

# A-share specific: short-term reversal × volatility
def f_reversal_vol(p):
    return -p.pct_change(5) * p.pct_change().rolling(20, min_periods=15).std()

# Recovery + momentum (baseline)
def f_rec_mom(p):
    r1 = f_rec_63(p).rank(axis=1, pct=True, na_option="keep")
    r2 = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
    return 0.5 * r1 + 0.5 * r2

# =====================================================================
# ROUND 2 — Second-order ideas from Round 1 analysis
# =====================================================================
# The key insight: the *quality* of returns matters more than their
# *magnitude*. These factors measure HOW a stock goes up, not just
# that it went up.

# Smoothness-weighted momentum
def f_smooth_momentum(p):
    """Momentum penalized by path volatility. Favors stocks that rise smoothly."""
    mom = p.shift(21).pct_change(231)
    ret = p.pct_change()
    vol = ret.rolling(252, min_periods=126).std()
    return mom / (vol.replace(0, np.nan) ** 0.5)  # sqrt to dampen the penalty
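
# Why the square root above: dividing by vol**0.5 halves the penalty spread.
# With illustrative numbers (assumed, not from the data), a stock with 4x the
# daily vol of a peer is penalized 4x under mom/vol but only 2x under
# mom/sqrt(vol):
#   mom=0.30, vol=0.01  ->  mom/sqrt(vol) = 0.30/0.10 = 3.0
#   mom=0.30, vol=0.04  ->  mom/sqrt(vol) = 0.30/0.20 = 1.5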
Quality signal.""" ret = p.pct_change() return (ret > 0).astype(float).rolling(60, min_periods=40).mean() # Cumulative positive returns vs cumulative negative returns def f_up_down_asymmetry(p): """Ratio of cumulative up-move to cumulative down-move.""" ret = p.pct_change() up = ret.where(ret > 0, 0).rolling(60, min_periods=40).sum() down = (-ret.where(ret < 0, 0)).rolling(60, min_periods=40).sum() return up / down.replace(0, np.nan) # Streak momentum: max consecutive up days in last 40 days def f_max_streak(p): ret = p.pct_change() pos = (ret > 0).astype(float) def _max_streak(x): x = x.dropna().values if len(x) == 0: return 0 best = cur = 0 for v in x: cur = cur + 1 if v > 0.5 else 0 best = max(best, cur) return best return pos.rolling(40, min_periods=20).apply(_max_streak, raw=False) # Overnight proxy: gap between yesterday's close and today's pattern # Since we only have close prices, use close-to-close 1d return decomposition def f_up_capture(p): """Up-market capture ratio over 60 days.""" ret = p.pct_change() mkt = ret.mean(axis=1) up_mkt = mkt > 0 arr = ret.values.copy() arr[~up_mkt.values, :] = np.nan stock_up = pd.DataFrame(arr, index=ret.index, columns=ret.columns) mkt_up_vals = mkt.where(up_mkt, np.nan) stock_avg = stock_up.rolling(60, min_periods=20).mean() mkt_avg = mkt_up_vals.rolling(60, min_periods=20).mean() return stock_avg.div(mkt_avg, axis=0) # Down-market resilience def f_down_resilience(p): """How much LESS a stock falls on down-market days.""" ret = p.pct_change() mkt = ret.mean(axis=1) down_mkt = mkt < 0 arr = ret.values.copy() arr[~down_mkt.values, :] = np.nan down_ret = pd.DataFrame(arr, index=ret.index, columns=ret.columns) return -down_ret.rolling(120, min_periods=30).mean() # Recovery from rolling max with momentum filter def f_rec_mom_filtered(p): """Recovery factor only for stocks with positive 6-month momentum. Filters out dead-cat bounces.""" rec = p / p.rolling(126, min_periods=126).min() - 1 mom = p.shift(21).pct_change(105) return rec.where(mom > 0, np.nan) # Information discreteness v2: using the sign ratio def f_sign_ratio(p): """Ratio of (count of positive days)^2 * avg_size to total return. High ratio = many small ups = institutional flow.""" ret = p.pct_change() n_total = 60 n_pos = (ret > 0).astype(float).rolling(n_total, min_periods=40).sum() total_ret = ret.rolling(n_total, min_periods=40).sum() sign_vol = n_pos / n_total # Stocks where most of the return comes from many small positive days return sign_vol * total_ret.clip(lower=0) # ===================================================================== # ROUND 3 — Interaction & conditional factors # ===================================================================== def f_mom_x_recovery(p): """Momentum × Recovery interaction. The product, not the sum.""" mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep") rec_r = f_rec_63(p).rank(axis=1, pct=True, na_option="keep") return mom_r * rec_r def f_mom_x_upvol(p): """Momentum × Up-volume-proxy interaction.""" mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep") up_r = f_up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep") return mom_r * up_r def f_rec_deep_x_upvol(p): """Deep recovery × Up-volume interaction.""" rec_r = f_rec_126(p).rank(axis=1, pct=True, na_option="keep") up_r = f_up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep") return rec_r * up_r def f_trend_x_mom(p): """Trend strength × Momentum. 
def f_trend_x_mom(p):
    """Trend strength × Momentum. Trending + momentum = double signal."""
    tr_r = f_trend_slope(p).rank(axis=1, pct=True, na_option="keep")
    mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
    return tr_r * mom_r

def f_quality_mom(p):
    """Momentum filtered by consistency. Only persistent winners."""
    mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
    con_r = f_consistent_returns(p).rank(axis=1, pct=True, na_option="keep")
    up_r = f_up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep")
    return 0.4 * mom_r + 0.3 * con_r + 0.3 * up_r

def f_rec_deep_x_gap(p):
    """Deep recovery × gap-up frequency."""
    rec_r = f_rec_126(p).rank(axis=1, pct=True, na_option="keep")
    gap_r = f_gap_up_freq(p).rank(axis=1, pct=True, na_option="keep")
    return rec_r * gap_r

def f_mom_x_gap(p):
    """Momentum × gap-up frequency."""
    mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
    gap_r = f_gap_up_freq(p).rank(axis=1, pct=True, na_option="keep")
    return mom_r * gap_r

# Regime-conditional: momentum with a volatility filter
def f_mom_low_vol_regime(p):
    """Momentum only when market vol is below its rolling median.
    Momentum crashes in high-vol regimes."""
    mom = f_mom_12_1(p)
    mkt_vol = p.pct_change().mean(axis=1).rolling(60).std()
    vol_median = mkt_vol.rolling(252, min_periods=126).median()
    low_vol = mkt_vol <= vol_median
    mask = pd.DataFrame(
        np.tile(low_vol.values[:, None], (1, mom.shape[1])),
        index=mom.index,
        columns=mom.columns,
    )
    return mom.where(mask, 0)

# =====================================================================
# Main loop
# =====================================================================

def run_round(
    name: str,
    factors: list[tuple[str, FactorFunc]],
    prices: pd.DataFrame,
    top_n: int = 10,
) -> list[dict]:
    results = []
    for fname, func in factors:
        r = test_factor(fname, func, prices, top_n=top_n)
        results.append(r)
    print_results(results, name)
    return results


def run_market(market: str):
    config = UNIVERSES[market]
    benchmark = config["benchmark"]
    prices = data_manager.load(market)
    bench = prices[benchmark].dropna() if benchmark in prices.columns else None
    stocks = prices.drop(columns=[benchmark], errors="ignore")

    print(f"\n{'#'*95}")
    print(f" FACTOR DISCOVERY LOOP — {market.upper()} MARKET")
    print(f" {stocks.shape[1]} stocks, {stocks.shape[0]} days, "
          f"{stocks.index[0].date()} → {stocks.index[-1].date()}")
    print(f"{'#'*95}")

    # Benchmark
    if bench is not None:
        eq_bench = bench / bench.iloc[0] * 100000
        bs = stats(eq_bench)
        print(f"\n Benchmark: CAGR {bs['cagr']:+.1%}, Sharpe {bs['sharpe']:.2f}")

    # ================================================================
    # ROUND 1: Academic & practitioner factors
    # ================================================================
    r1_factors = [
        ("BASELINE:rec+mom", f_rec_mom),
        # Momentum family
        ("mom_12_1", f_mom_12_1),
        ("mom_6_1", f_mom_6_1),
        ("mom_3_1", f_mom_3_1),
        ("mom_1_0", f_mom_1_0),
        ("mom_intermediate_7m", f_mom_intermediate),
        ("sharpe_momentum", f_sharpe_mom),
        # Recovery family
        ("recovery_63d", f_rec_63),
        ("recovery_126d", f_rec_126),
        ("recovery_21d", f_rec_21),
        # Trend
        ("trend_slope_60d", f_trend_slope),
        ("golden_cross", f_golden_cross),
        ("above_ma200", f_above_ma200),
        # Volatility
        ("low_vol_60d", f_low_vol_60),
        ("neg_ivol_20d", f_neg_ivol),
        # Reversal
        ("STR_5d", f_str_5d),
        ("STR_10d", f_str_10d),
        # Quality / accumulation
        ("consistent_returns", f_consistent_returns),
        ("up_volume_proxy", f_up_volume_proxy),
        ("gap_up_freq", f_gap_up_freq),
        ("info_discrete", f_info_discrete),
        ("earnings_drift", f_earnings_drift),
        # Other academic
        ("52w_high", f_52w_high),
("anti_max_20d", f_anti_max), ("dd_recovery_rate", f_dd_recovery_rate), ("mom_acceleration", f_mom_accel), ] if market == "cn": r1_factors.append(("reversal_vol_cn", f_reversal_vol)) r1 = run_round("ROUND 1 — Academic & Practitioner Hypotheses", r1_factors, stocks) # Identify top-10 from round 1 r1_sorted = sorted(r1, key=lambda x: x.get("cagr", 0) or 0, reverse=True) r1_top_names = [r["name"] for r in r1_sorted[:10] if r.get("cagr") and r["cagr"] > 0] baseline_cagr = next((r["cagr"] for r in r1 if "BASELINE" in r["name"]), 0) print(f"\n Baseline CAGR: {baseline_cagr:+.1%}") print(f" Top 10: {r1_top_names}") # ================================================================ # ROUND 2: Second-order ideas based on what worked # ================================================================ r2_factors = [ ("BASELINE:rec+mom", f_rec_mom), ("smooth_momentum", f_smooth_momentum), ("pos_ratio_60d", f_pos_ratio_60), ("up_down_asymmetry", f_up_down_asymmetry), ("max_streak_40d", f_max_streak), ("up_capture_60d", f_up_capture), ("down_resilience_120d", f_down_resilience), ("rec_mom_filtered", f_rec_mom_filtered), ("sign_ratio", f_sign_ratio), ("autocorrelation_60d", f_autocorrelation), ("mean_rev_z", f_mean_rev_z), ] r2 = run_round("ROUND 2 — Return Quality & Behavioral Factors", r2_factors, stocks) # ================================================================ # ROUND 3: Interaction & conditional factors # ================================================================ r3_factors = [ ("BASELINE:rec+mom", f_rec_mom), ("mom×recovery", f_mom_x_recovery), ("mom×upvol", f_mom_x_upvol), ("rec_deep×upvol", f_rec_deep_x_upvol), ("trend×mom", f_trend_x_mom), ("quality_mom", f_quality_mom), ("rec_deep×gap", f_rec_deep_x_gap), ("mom×gap", f_mom_x_gap), ("mom_low_vol_regime", f_mom_low_vol_regime), ] r3 = run_round("ROUND 3 — Interaction & Conditional Factors", r3_factors, stocks) # ================================================================ # Collect ALL results from all rounds # ================================================================ all_results = r1 + r2 + r3 # Deduplicate baseline seen = set() unique = [] for r in all_results: if r["name"] not in seen: seen.add(r["name"]) unique.append(r) unique_sorted = sorted(unique, key=lambda x: x.get("cagr", 0) or 0, reverse=True) print(f"\n{'='*95}") print(f" ALL ROUNDS COMBINED — TOP 15 FACTORS — {market.upper()}") print(f"{'='*95}") print(f" {'Factor':<45} {'CAGR':>7} {'Sharpe':>7} {'Sortino':>8} {'MaxDD':>7} {'Calmar':>7}") print(f" {'-'*85}") for r in unique_sorted[:15]: flag = " <<<" if "BASELINE" in r["name"] else "" print(f" {r['name']:<45} {r['cagr']:>+6.1%} {r['sharpe']:>7.2f} {r['sortino']:>8.2f} " f"{r['maxdd']:>+6.1%} {r['calmar']:>7.2f}{flag}") # ================================================================ # ROUND 4: Combine top non-baseline factors # ================================================================ top_funcs = {} func_map_all = dict(r1_factors + r2_factors + r3_factors) non_baseline = [r for r in unique_sorted if "BASELINE" not in r["name"] and r.get("cagr", 0)] for r in non_baseline[:12]: if r["name"] in func_map_all: top_funcs[r["name"]] = func_map_all[r["name"]] top_names = list(top_funcs.keys()) print(f"\n Building combinations from top {len(top_names)} factors: {top_names}") combo_factors = [("BASELINE:rec+mom", f_rec_mom)] # All pairs from top-8 for i in range(min(8, len(top_names))): for j in range(i + 1, min(8, len(top_names))): n1, n2 = top_names[i], top_names[j] combo_factors.append(( 
f"{n1[:20]}+{n2[:20]}", combo([(top_funcs[n1], 0.5), (top_funcs[n2], 0.5)]) )) # Triple combos from top-5 for i in range(min(5, len(top_names))): for j in range(i + 1, min(5, len(top_names))): for k in range(j + 1, min(5, len(top_names))): n1, n2, n3 = top_names[i], top_names[j], top_names[k] combo_factors.append(( f"{n1[:15]}+{n2[:15]}+{n3[:15]}", combo([(top_funcs[n1], 0.33), (top_funcs[n2], 0.33), (top_funcs[n3], 0.34)]) )) r4 = run_round("ROUND 4 — Factor Combinations", combo_factors, stocks) # ================================================================ # ROUND 5: Yearly breakdown of top 5 combos # ================================================================ r4_sorted = sorted(r4, key=lambda x: x.get("cagr", 0) or 0, reverse=True) top5 = r4_sorted[:5] # Make sure baseline is included base = next((r for r in r4 if "BASELINE" in r["name"]), None) if base and base not in top5: top5.append(base) print(f"\n{'='*95}") print(f" ROUND 5 — YEARLY RETURNS OF BEST STRATEGIES — {market.upper()}") print(f"{'='*95}") cols = [(r["name"], r["equity"]) for r in top5] if bench is not None: eq_bench = bench / bench.iloc[0] * 100000 cols.append(("BENCHMARK", eq_bench)) # Header header = f" {'Year':<6}" for name, _ in cols: header += f" | {name[:22]:>22}" print(header) print(" " + "-" * (6 + 25 * len(cols))) all_years = sorted(set(y for _, eq in cols for y in eq.index.year.unique())) for year in all_years: line = f" {year:<6}" for _, eq in cols: dr = eq.pct_change().fillna(0) yr = dr[dr.index.year == year] r = float((1 + yr).prod() - 1) if len(yr) > 0 else 0 line += f" | {r:>+21.1%}" print(line) # Period CAGRs for ny in [3, 5, 10]: cutoff = stocks.index[-1] - pd.DateOffset(years=ny) print(f"\n --- {ny}-year CAGR ---") for name, eq in cols: sl = eq[eq.index >= cutoff] if len(sl) < 50: continue tot = sl.iloc[-1] / sl.iloc[0] - 1 cagr = (1 + tot) ** (1 / ny) - 1 print(f" {name[:50]:<50} {cagr:>+8.1%}") def main(): parser = argparse.ArgumentParser() parser.add_argument("--market", default="us", choices=["us", "cn"]) args = parser.parse_args() run_market(args.market) if __name__ == "__main__": main()