Compare commits: 14ec64c1da...main (39 commits)

Commit SHAs: cdaca4bc2a, f5e8c708f3, c015873ee1, bf6fccfd11, 7853eafe55, 1edce83430, 3abc51e3e3, 7239310be3, 5e1c4a681d, 2015b62104, b2176b0c3e, ae25f2f6b5, a66b039d2d, 88d765713e, 35a91ba6cc, b3d87b3d92, 097131d962, 82a3e63c2b, 69a03f52d9, 9c4a219c68, f6670d9e6d, 18174a9e11, 3d934b3316, 0876c0b6af, f2e14ec200, 507565c556, 26937f035e, 7afc60dfcb, 7e44ece569, 7e8d24c1e9, 2382364a46, 71912b8358, 7f0c5de574, c46727b1ca, 0e94688066, 9e6da727a3, e70922d9af, feb1864a4d, 80493cb6af

.gitignore (vendored): 8 lines changed

@@ -21,6 +21,14 @@ data/universe_*.json
# Trader state — per-machine, regenerated by auto/simulate
data/trader_*.json

# Factor attribution output and cached factors
data/attribution_*/
data/factors/
data/factors_review_tmp/

# External tool artifacts
docs/superpowers/

# IDE / editor
.idea/
.vscode/

@@ -44,7 +44,7 @@ No test suite or linter is configured.

**Backtest engine** (`main.py`): Orchestrates data loading, strategy execution, and visualization. The `backtest()` function is vectorized — it takes a strategy and price DataFrame, applies transaction costs (proportional + optional fixed per-trade fee) via turnover, and returns an equity curve. Supports two execution modes: `close` (classic) and `open-close` (signal on open prices, execute at close).

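As a rough illustration of the turnover-based cost model described above (a minimal sketch, not the repo's actual `backtest()` signature; the cost parameters are placeholders):

```python
# Illustrative sketch only: apply proportional + fixed per-trade costs via turnover.
import pandas as pd

def sketch_backtest(weights: pd.DataFrame, prices: pd.DataFrame,
                    cost_bps: float = 10.0, fixed_fee: float = 0.0) -> pd.Series:
    """Turn daily target weights into a net-of-cost equity curve."""
    returns = prices.pct_change().fillna(0.0)
    gross = (weights * returns).sum(axis=1)          # portfolio return before costs
    turnover = weights.diff().abs().sum(axis=1)      # fraction of the book traded each day
    costs = turnover * cost_bps / 10_000 + (turnover > 0) * fixed_fee
    net = gross - costs
    return (1.0 + net).cumprod()                     # equity curve starting at 1.0
```
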
- **Daily trader** (`trader.py`): Live/forward-testing system with persistent portfolio state in `data/trader_{market}_{strategy}.json`. The `auto` subcommand runs both signal generation and execution in a single invocation — designed for cron. The `simulate` subcommand replays a date range day-by-day with realistic portfolio tracking (fractional shares, cash, commissions). Available strategies: `recovery_mom_top10`, `recovery_mom_top20`, `momentum`, `momentum_quality`, `dual_momentum`, `inverse_vol`, `trend_following`, `buy_and_hold`.
+ **Daily trader** (`trader.py`): Live/forward-testing system with persistent portfolio state in `data/trader_{market}_{strategy}.json`. The `auto` subcommand runs both signal generation and execution in a single invocation — designed for cron. The `simulate` subcommand replays a date range day-by-day with realistic portfolio tracking (fractional shares, cash, commissions). Available strategies: `recovery_mom_top10`, `recovery_mom_top20`, `momentum`, `momentum_quality`, `dual_momentum`, `inverse_vol`, `trend_following`, `buy_and_hold`, plus 32 factor-combo strategies (`fc_{signal}_{freq}` — see `strategies/factor_combo.py`).

**Strategy protocol** (`strategies/base.py`): All strategies inherit from `Strategy` ABC and implement `generate_signals(data) → DataFrame` where the returned DataFrame contains portfolio weights (rows = dates, columns = assets, values sum to ~1.0 per row). Each strategy is responsible for applying its own 1-day lag via `.shift(1)` to avoid lookahead bias — the backtest engine does not shift.

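A minimal sketch of that contract, assuming only the `Strategy` ABC and the `generate_signals` signature described above (the concrete class and signal rule are illustrative):

```python
# Hypothetical strategy showing the weight-DataFrame contract and the 1-day lag.
import pandas as pd
from strategies.base import Strategy

class EqualWeightTopMomentum(Strategy):
    def generate_signals(self, data: pd.DataFrame) -> pd.DataFrame:
        momentum = data.pct_change(126)                      # 6-month return per asset
        top = momentum.rank(axis=1, ascending=False) <= 10   # pick the 10 strongest names
        weights = top.div(top.sum(axis=1), axis=0).fillna(0.0)
        return weights.shift(1).fillna(0.0)                  # 1-day lag: trade on yesterday's signal
```
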
@@ -59,6 +59,7 @@ No test suite or linter is configured.

- `momentum_quality.py` — Momentum + return consistency + low drawdown
- `adaptive_momentum.py` — Momentum weighted by inverse volatility
- `recovery_momentum.py` — Recovery (price/63d low) + 12-1mo momentum composite. Best US performer.
- `factor_combo.py` — Configurable factor-combination strategies with daily/weekly/biweekly/monthly rebalancing. US champions: `rec_mfilt+deep_upvol` (50.7% CAGR monthly), `ma200+mom7m+rec126`, `rec_mfilt+ma200`, `mom7m+rec126`. CN champions: `up_cap+quality_mom` (26.1% CAGR monthly), `down_resil+qual_mom`, `rec63+mom_gap`, `up_cap+mom_gap`. All registered in trader.py as `fc_{signal}_{freq}` (e.g., `fc_rec_mfilt_deep_upvol_monthly`). 32 new strategies total.

**Metrics** (`metrics.py`): Standalone functions for portfolio analytics (Sharpe, Sortino, Calmar, max drawdown, etc.). `summary()` prints a formatted report and returns a dict.

data/sp500_history.json (new file, 1 line)
File diff suppressed because one or more lines are too long
@@ -63,10 +63,11 @@ def _download(tickers: list[str], start: str, end: str | None = None,
     result = {}
     for field in fields:
         if field in raw.columns.get_level_values(0) if isinstance(raw.columns, pd.MultiIndex) else field in raw.columns:
-            if len(tickers) > 1:
-                result[field] = raw[field]
+            selected = raw[field]
+            if isinstance(selected, pd.Series):
+                result[field] = selected.to_frame(name=tickers[0])
             else:
-                result[field] = raw[field].to_frame(name=tickers[0])
+                result[field] = selected
         else:
             result[field] = pd.DataFrame()
     return result

@@ -83,10 +84,11 @@ def _download_period(tickers: list[str], period: str,
     result = {}
     for field in fields:
         if field in raw.columns.get_level_values(0) if isinstance(raw.columns, pd.MultiIndex) else field in raw.columns:
-            if len(tickers) > 1:
-                result[field] = raw[field]
+            selected = raw[field]
+            if isinstance(selected, pd.Series):
+                result[field] = selected.to_frame(name=tickers[0])
             else:
-                result[field] = raw[field].to_frame(name=tickers[0])
+                result[field] = selected
         else:
             result[field] = pd.DataFrame()
     return result

@@ -103,6 +105,66 @@ def _clean(data: pd.DataFrame) -> pd.DataFrame:
    return data


def _clean_market_data(data: pd.DataFrame, field: str) -> pd.DataFrame:
    """Clean market data while preserving volume gaps."""
    good = data.columns[data.notna().mean() > 0.5]
    dropped = set(data.columns) - set(good)
    if dropped:
        print(f"--- Dropped {len(dropped)} tickers with >50% missing data ---")
    data = data[good]
    if field == "volume":
        return data
    return data.ffill().dropna(how="all")


def _merge_market_panel(existing: pd.DataFrame | None, new_data: pd.DataFrame) -> pd.DataFrame:
    """Merge new data into an existing cached panel, preserving old columns and dates."""
    if existing is None or existing.empty:
        merged = new_data.copy()
    elif new_data.empty:
        merged = existing.copy()
    else:
        merged = existing.combine_first(new_data)
        merged.loc[new_data.index, new_data.columns] = new_data
    merged = merged.sort_index()
    merged = merged[~merged.index.duplicated(keep="last")]
    return merged


def update_market_data(market: str, tickers: list[str], fields: list[str]) -> dict[str, pd.DataFrame]:
    """Download, clean, persist, and return market data panels for requested Yahoo fields."""
    field_aliases = {
        "close": "Close",
        "open": "Open",
        "high": "High",
        "low": "Low",
        "volume": "Volume",
    }
    normalized_fields = []
    yahoo_fields = []
    for field in fields:
        normalized = field.lower()
        if normalized not in field_aliases:
            raise ValueError(f"Unsupported market data field: {field}")
        normalized_fields.append(normalized)
        yahoo_fields.append(field_aliases[normalized])

    os.makedirs(DATA_DIR, exist_ok=True)
    start = (datetime.now() - timedelta(days=365 * 10)).strftime("%Y-%m-%d")
    downloaded = _download(tickers, start=start, fields=yahoo_fields)

    cleaned = {}
    for normalized, yahoo_field in zip(normalized_fields, yahoo_fields):
        data = _clean_market_data(downloaded.get(yahoo_field, pd.DataFrame()), normalized)
        existing = load(market, normalized)
        data = _merge_market_panel(existing, data)
        path = _data_path(market, normalized)
        data.to_csv(path)
        print(f"--- Saved {data.shape[0]} days x {data.shape[1]} tickers to {path} ---")
        cleaned[normalized] = data
    return cleaned


def update(market: str, tickers: list[str],
           with_open: bool = False) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]:
    """

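A hypothetical call site for the new helper, assuming these functions live in the repo's data module (e.g. `data_manager.py`) alongside `load()` and `_data_path()`:

```python
# Hypothetical usage sketch; module name and tickers are assumptions for illustration.
import data_manager

panels = data_manager.update_market_data(
    market="us",
    tickers=["AAPL", "MSFT", "SPY"],
    fields=["close", "volume"],   # lower-case aliases, mapped to Yahoo "Close"/"Volume"
)
close_panel = panels["close"]     # DataFrame: rows = dates, columns = tickers
```
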
docs/superpowers/specs/2026-04-07-factor-attribution-design.md (new file, 376 lines)

# Factor Attribution Design

Date: 2026-04-07
Repo: `/Users/gahow/projects/quant`

## Goal

Add a factor attribution module that explains strategy returns using:

- Standard external US factors when available: `MKT-RF`, `SMB`, `HML`, `RMW`, `CMA`, `RF`
- Local price-derived extension factors: `MOM`, `LOWVOL`, `RECOVERY`
- Local proxy fallback factors for markets without standard external data

The module must integrate with the current backtest workflow, reuse existing strategy equity curves, cache downloaded factor data locally, and produce both terminal summaries and exportable tabular outputs.

## Scope

In scope:

- New factor attribution module for research backtests
- US support using external standard factors plus local extension factors
- CN support using local proxy factors only
- CAPM, FF5, and FF5-plus-extension models
- CLI flags in `main.py` to enable attribution and export results
- Tests for parsing, factor construction, and regression behavior

Out of scope for this iteration:

- Intraday attribution
- Portfolio optimizer changes
- Live trader attribution in `trader.py`
- Notebook or plotting UI for attribution results
- External fundamental datasets beyond standard downloadable factor files

## Existing Context

The repo already has:

- A vectorized backtest engine in `main.py`
- Strategy implementations that produce daily target weights
- Performance metrics in `metrics.py`
- Local daily price caches in `data/us.csv`, `data/us_open.csv`, `data/cn.csv`

Current "alpha" in `trader.py simulate` is only total return minus benchmark return. The new module adds regression-based alpha and factor exposure analysis.

## Design Overview

Add a new module `factor_attribution.py` with four responsibilities:

1. Load and cache factor datasets
2. Build local extension and proxy factors from existing price data
3. Run regression models against strategy daily returns
4. Render summary tables and export detailed results

`main.py` remains the orchestration point. It will continue running backtests and benchmark normalization, then optionally invoke attribution on the resulting daily return series.

## Module Structure

### `factor_attribution.py`

Planned top-level responsibilities:

- `load_external_us_factors(...)`
  - Download Ken French daily factor files
  - Parse, normalize, convert percent to decimal
  - Cache to `data/factors/`
  - Fall back to cache when network fetch fails

- `build_extension_factors(price_data, benchmark, market)`
  - Build local daily factor return series for:
    - `MOM`
    - `LOWVOL`
    - `RECOVERY`

- `build_proxy_core_factors(price_data, benchmark, market)`
  - Used mainly for CN or when external factors are unavailable
  - Build daily proxy series for:
    - `MKT`
    - `SMB_PROXY`
    - `HML_PROXY`
    - `RMW_PROXY`
    - `CMA_PROXY`

- `prepare_factor_models(...)`
  - Merge standard factors and local factors
  - Produce factor matrices for:
    - `capm`
    - `ff5`
    - `ff5plus`

- `run_factor_regression(strategy_returns, factor_frame, risk_free_col)`
  - Fit OLS with intercept
  - Return alpha, annualized alpha, loadings, t-stats, p-values, R-squared, adjusted R-squared, residual volatility, date range, and observation count

- `attribute_strategies(results_df, benchmark_series, price_data, market, model_selection)`
  - Convert equity curves to returns
  - Run attribution for each strategy
  - Return structured summary and long-form loadings tables

- `print_attribution_summary(...)`
  - Render compact terminal output

- `export_attribution(...)`
  - Write CSV outputs

## Data Sources

### US Standard Factors

Preferred source:

- Ken French daily factor datasets for:
  - Fama-French 5 Factors daily
  - Momentum daily if separately required

Normalization rules:

- Convert index to pandas `DatetimeIndex`
- Convert values from percent to decimal returns
- Keep `RF` as decimal daily risk-free rate

Cache location:

- `data/factors/ff5_us_daily.csv`
- `data/factors/mom_us_daily.csv`

If the source format changes or download fails:

- Use the latest local cache if present
- Otherwise fall back to local proxy factors and mark the run as `proxy_only`

### Local Price Inputs

Reuse repo price caches:

- US: `data/us.csv`, `data/us_open.csv`
- CN: `data/cn.csv`

Only adjusted close prices are required for attribution factor construction.

## Factor Definitions

### Standard Factors

For US:

- `MKT-RF`, `SMB`, `HML`, `RMW`, `CMA`, `RF` from external factor data

### Local Extension Factors

These are built from the same universe already used by the repo.

#### `MOM`

- Cross-sectional momentum long-short factor
- Rank stocks by 12-1 month return
- Long top quantile, short bottom quantile
- Equal weight within long and short legs
- Factor return is long return minus short return

#### `LOWVOL`

- Cross-sectional low-volatility factor
- Compute rolling volatility from daily returns
- Long lowest-vol quantile, short highest-vol quantile
- Equal weight within legs

#### `RECOVERY`

- Cross-sectional recovery factor
- Rank stocks by distance from rolling 63-day low
- Long strongest recovery names, short weakest recovery names
- Equal weight within legs

### Proxy Core Factors

Used for CN by default and as fallback for US.

#### `MKT`

- Benchmark daily return if benchmark exists
- Otherwise equal-weight universe return

#### `SMB_PROXY`

- Size proxy using inverse price level or market-cap proxy when only price data is available
- First iteration uses inverse price rank as a transparent proxy and explicitly labels it as proxy

#### `HML_PROXY`

- Value proxy using price-to-range or distance-to-trailing-low style signal
- This is not a true book-to-market factor and must be labeled proxy

#### `RMW_PROXY`

- Profitability proxy from return consistency and stability

#### `CMA_PROXY`

- Investment proxy from asset trend smoothness or expansion/contraction behavior inferred from price action

Proxy factors are included for model completeness, but the output must label them clearly as proxies rather than standard academic factors.

## Factor Construction Rules

- All local factors use only information available up to date `t` to explain returns at `t+1`
- No future data leakage
- Factor series are daily return series, not ranks
- Long-short factors should be approximately dollar-neutral
- Missing values are allowed during warmup windows and dropped during model alignment
- Quantile counts should adapt to available universe size

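A minimal sketch of these rules, mirroring the `_long_short_factor` helper added later in this changeset (the quantile width shown here is an assumption):

```python
# Scores known at t are lagged one day and used to form an equal-weight,
# approximately dollar-neutral long-short daily return for t+1.
import pandas as pd

def long_short_factor(scores: pd.DataFrame, returns: pd.DataFrame,
                      quantile: float = 0.3) -> pd.Series:
    lagged = scores.shift(1)                               # only information available before t+1
    high = lagged.ge(lagged.quantile(1 - quantile, axis=1), axis=0)
    low = lagged.le(lagged.quantile(quantile, axis=1), axis=0)
    long_leg = returns.where(high).mean(axis=1)            # equal weight within the long leg
    short_leg = returns.where(low).mean(axis=1)            # equal weight within the short leg
    return long_leg - short_leg                            # daily factor return series, not ranks
```
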
## Regression Models

### CAPM

Model:

- `strategy_excess_return ~ alpha + (MKT-RF)`

### FF5

Model:

- `strategy_excess_return ~ alpha + MKT-RF + SMB + HML + RMW + CMA`

### FF5Plus

Model:

- `strategy_excess_return ~ alpha + MKT-RF + SMB + HML + RMW + CMA + MOM + LOWVOL + RECOVERY`

### Proxy Model

For markets without standard factors:

- `strategy_return ~ alpha + MKT + SMB_PROXY + HML_PROXY + RMW_PROXY + CMA_PROXY + MOM + LOWVOL + RECOVERY`

The module should report which model family was actually used.

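Written out with explicit coefficients (the symbols below are added here for clarity; the spec only fixes the factor set), the FF5Plus regression for strategy `s` on day `t` is:

$$
r_{s,t} - r_{f,t} = \alpha + \beta_{MKT}(MKT_t - r_{f,t}) + \beta_{SMB}\,SMB_t + \beta_{HML}\,HML_t + \beta_{RMW}\,RMW_t + \beta_{CMA}\,CMA_t + \beta_{MOM}\,MOM_t + \beta_{LOWVOL}\,LOWVOL_t + \beta_{REC}\,RECOVERY_t + \varepsilon_{s,t}
$$
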
## Alignment Rules

- Convert all equity curves to daily returns
- Build factor frames at daily frequency
- Join strategy returns and factor returns on date intersection
- For standard factor models, subtract `RF` from strategy returns
- Keep benchmark return separately for active return diagnostics, but not as a replacement for `MKT-RF` in standard factor models

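A minimal alignment sketch under these rules, assuming the factor frame already carries an `RF` column (names are illustrative):

```python
# Convert an equity curve to daily returns, intersect dates with the factor frame,
# and subtract the risk-free rate for standard factor models.
import pandas as pd

def align_for_regression(equity: pd.Series, factors: pd.DataFrame) -> pd.DataFrame:
    strat_ret = equity.pct_change().rename("strategy")               # equity curve -> daily returns
    joined = pd.concat([strat_ret, factors], axis=1, join="inner").dropna()
    joined["strategy_excess"] = joined["strategy"] - joined["RF"]    # excess return for CAPM/FF5/FF5Plus
    return joined
```
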
## Output Schema

### Summary Output

One row per strategy per model with fields including:

- `strategy`
- `market`
- `model`
- `factor_source`
- `proxy_only`
- `start_date`
- `end_date`
- `n_obs`
- `alpha_daily`
- `alpha_ann`
- `alpha_t_stat`
- `alpha_p_value`
- `r_squared`
- `adj_r_squared`
- `residual_vol_ann`

Selected factor loadings should also be flattened into summary columns when available:

- `beta_mkt`
- `beta_smb`
- `beta_hml`
- `beta_rmw`
- `beta_cma`
- `beta_mom`
- `beta_lowvol`
- `beta_recovery`

### Loadings Output

Long-form table:

- `strategy`
- `model`
- `factor`
- `beta`
- `t_stat`
- `p_value`

## CLI Changes

Add arguments to `main.py`:

- `--attribution`
- `--attribution-model {capm,ff5,ff5plus,all}`
- `--attribution-export <dir>`

Behavior:

- If `--attribution` is not set, current behavior is unchanged
- If set, attribution runs after backtest metrics are printed
- If export path is set, write:
  - `summary.csv`
  - `loadings.csv`

## Terminal Reporting

For each strategy and selected model, print a compact line containing:

- annualized alpha
- major factor loadings
- R-squared
- residual volatility

After the numeric table, print a short interpretation section:

- whether alpha remains after adding factors
- which factors explain most of the strategy
- whether the model fit is weak or strong

Interpretation should remain descriptive and avoid overclaiming statistical significance.

## Error Handling

- External factor download failure:
  - Use cache if available
  - Otherwise downgrade to proxy mode
- Missing or short overlap window:
  - Skip that model and report insufficient data
- Singular matrix or severe multicollinearity:
  - Catch and report model failure or unstable fit
- Missing benchmark column:
  - Fall back to equal-weight universe market proxy where possible

## Testing Plan

### Unit Tests

- External factor parser converts dates and percent units correctly
- Cache loader returns cached data on download failure
- Extension factor builders produce expected columns and no future leakage
- Regression on synthetic data recovers approximate known alpha and betas

### Integration Tests

- End-to-end attribution on a small deterministic equity and factor dataset
- CLI export produces expected files and columns

### Regression Tests

- Fixed local US sample produces stable output shape and model naming

## Implementation Notes

- Prefer `numpy.linalg.lstsq` or `scipy` OLS utilities already available in dependencies
- Keep implementation dependency-light
- Keep factor construction functions separate from regression code for testability
- Avoid changing existing strategy behavior

## Risks

- Standard factor downloads may change source file formatting
- Proxy factor definitions for CN will be weaker than true academic factors
- Some strategy returns may be highly collinear with momentum-like factors, reducing interpretability
- Short or overlapping warmup windows can materially reduce sample size

## Success Criteria

- A user can run backtests with `--attribution` and receive factor-based explanations of returns
- US runs use standard external factors when available
- CN runs still produce a clearly labeled proxy attribution report
- Outputs distinguish residual alpha from factor exposure
- The module is easy to extend with new factors later

docs/superpowers/specs/2026-04-17-us-alpha-research-design.md (new file, 376 lines)

# US High-Alpha Research Design

**Date:** 2026-04-17

## Goal

Build a research framework for US `long-only` equity strategies that uses only free or already-accessible data, avoids lookahead and survivorship traps as much as the available data allows, and can rank candidate strategy families over `1/2/3/5/10y` windows. The objective is not to manufacture the single highest backtest CAGR, but to identify strategy families whose alpha survives realistic liquidity filters, transaction costs, and point-in-time constraints.

## Constraints

- Data sources must be free or already accessible from the current project environment.
- Portfolio construction must be `long-only`.
- The US research universe may extend beyond the S&P 500 into a broader US stock pool, but all conclusions must clearly distinguish between:
  - `strict` results from a point-in-time-clean universe.
  - `exploratory` results from a wider free-data universe that is not fully point-in-time-clean.
- All signals must use only information available at the time of decision.
- The framework must explicitly guard against:
  - survivorship bias
  - lookahead bias
  - static industry-label leakage
  - microcap and illiquidity contamination

## Success Criteria

The framework is successful if it produces:

1. A unified research and backtest pipeline for US strategies.
2. A ranked comparison of `3-5` high-value strategy families across `1/2/3/5/10y`.
3. Metrics that go beyond headline CAGR, including:
   - `CAGR`
   - `Sharpe`
   - `Sortino`
   - `MaxDD`
   - `Calmar`
   - `Turnover`
   - `Average positions`
   - `Median ADV usage`
   - `Subperiod stability`
4. Tiered interpretation of results:
   - `Tier A`: realistic and tradable under tighter liquidity assumptions
   - `Tier B`: strong alpha but lower capacity
   - `Tier C`: attractive only under loose assumptions and not suitable as a production candidate

Any strategy that reports near-`50% CAGR` must also explain:

- which market regime contributed most of the return
- whether performance depends on low-liquidity or small-cap tails
- whether results survive after removing the most extreme tail names

## Research Philosophy

This project should prefer honest, repeatable alpha discovery over spectacular but fragile backtests. Under the current constraints, a `10y 50% CAGR` should be treated as an upper-end outcome that may appear in selective windows, not as a baseline expectation. The more realistic goal is to find strategies that are strong over `3/5y`, still meaningfully outperform over `10y`, and remain robust after tightening assumptions.

## Strategy Families

The research effort will focus on four strategy families.

### 1. Earnings Drift Proxy

Target the post-information-repricing phase after major company-specific events. This is conceptually the highest-alpha family, but also the most dependent on event data quality.

Primary implementation order:

- use free historical earnings date data if it is stable enough
- otherwise fall back to price-and-volume-defined event proxies

Core signal ingredients:

- strong post-event excess return over `1-3` days
- abnormal volume
- gap that does not immediately fill
- price holding near short- and medium-term highs after the event

### 2. Breakout After Compression

Target stocks that transition from low-volatility congestion into sustained trend expansion. This is the cleanest strategy family to implement with free daily OHLCV data and is the best first candidate for a strict production-grade pipeline.

Core signal ingredients:

- proximity to `120d` or `252d` highs
- volatility compression over the prior `20-40` trading days
- rising dollar volume
- positive relative strength versus market and industry proxies

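A hedged sketch of how the first two ingredients could be scored from daily closes; the windows, the clipping, and the equal-weight combination are assumptions for illustration, not the production definition:

```python
# Combine proximity to the 252d high with 20d-vs-120d volatility compression.
import pandas as pd

def breakout_compression_score(close: pd.DataFrame) -> pd.DataFrame:
    returns = close.pct_change()
    high_252 = close.rolling(252, min_periods=252).max()
    proximity = close / high_252                                   # 1.0 = sitting at the 252d high
    vol_20 = returns.rolling(20, min_periods=20).std()
    vol_120 = returns.rolling(120, min_periods=120).std()
    compression = 1.0 - (vol_20 / vol_120).clip(upper=2.0) / 2.0   # higher = tighter recent range
    return (proximity + compression) / 2.0                         # naive equal-weight blend
```
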
### 3. Gap-and-Go / High-Volume Continuation

Target the second phase of move continuation after abnormal return and volume shocks rather than blindly chasing the first event day.

Core signal ingredients:

- abnormal `1d` or `3d` return
- abnormal volume versus trailing `60d`
- post-event price holding above the event anchor
- subsequent breakout continuation

This family has high potential upside but is more sensitive to cost assumptions and market regime.

### 4. Regime-Gated Cross-Sectional Alpha

Use broad market and industry-state filters to improve the hit rate of the other strategy families and provide a lower-volatility baseline alpha engine.

Core signal ingredients:

- market risk-on versus risk-off state
- industry ETF leadership
- relative strength
- recovery from drawdowns
- trend quality
- near-`52w` high behavior
- price/volume confirmation

This family is not expected to produce the highest standalone CAGR, but it is expected to improve robustness and reduce participation in hostile environments.

## Prioritization

Recommended implementation order:

1. `Breakout After Compression`
2. `Regime-Gated Cross-Sectional Alpha`
3. `Gap-and-Go / High-Volume Continuation`
4. `Earnings Drift Proxy` only after validating free event-data quality

Rationale:

- `Breakout After Compression` is the most implementable and least ambiguous with free data.
- `Regime-Gated Cross-Sectional Alpha` provides a shared control layer for the rest of the framework.
- `Gap-and-Go` has higher upside but also higher sensitivity to assumptions.
- `Earnings Drift Proxy` is theoretically powerful but should not become the project bottleneck if free event history is incomplete.

## Data Layer

The framework needs a richer data layer than the current `close/open` setup.

### Required price fields

Daily US market data should support at least:

- `open`
- `high`
- `low`
- `close`
- `volume`

This is required to define:

- real breakouts
- gap events
- volatility compression
- abnormal dollar volume

### Required ETF layer

Add stable market and industry ETFs for regime and leadership analysis, at minimum:

- `SPY`
- `QQQ`
- `IWM`
- `MDY`
- `XLF`
- `XLK`
- `XLI`
- `XLV`
- `XLY`
- `XLP`
- `XLE`
- `XLU`
- `XLRE`
- `XLB`
- `SOXX`
- `IGV`
- `SMH`

### Universe modes

The framework must support two explicit modes.

#### Strict mode

Use point-in-time-clean universe membership, initially based on the existing PIT S&P 500 machinery in the repository. This is the baseline for formal, defensible results.

#### Exploratory mode

Use a wider free-data US stock pool to search for stronger alpha patterns. These results are useful for idea generation but must be labeled as exploratory unless later promoted into a point-in-time-clean setup.

## Universe Construction Rules

The tradable universe must be computed daily from lagged information.

### Daily eligibility rules

Each stock may enter the candidate set only if all required conditions hold as of `t-1`:

- enough listing history exists to compute the strategy lookbacks
- enough valid volume observations exist
- minimum lagged price threshold is met
- minimum lagged dollar-volume threshold is met

Representative defaults:

- `close[t-1] > 5`
- `median_dollar_volume_60d[t-1] > $20M` in `strict` mode
- `median_dollar_volume_60d[t-1] > $5M` in `exploratory` mode
- `>= 252` valid trading days before eligibility
- `>= 40` valid volume days in the trailing `60d`

Thresholds should be strategy-specific and tunable in robustness sweeps.

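A sketch of a `strict`-mode eligibility mask built only from lagged inputs, using the representative defaults above (the thresholds are the illustrative values listed, not final):

```python
# Everything is evaluated as of t-1: prices, median dollar volume, history and volume counts.
import pandas as pd

def strict_eligibility_mask(close: pd.DataFrame, volume: pd.DataFrame) -> pd.DataFrame:
    dollar_volume = close * volume
    lagged_close = close.shift(1)
    med_dv_60 = dollar_volume.rolling(60, min_periods=40).median().shift(1)
    history_ok = close.notna().rolling(252, min_periods=252).sum().shift(1) >= 252
    volume_ok = volume.notna().rolling(60, min_periods=1).sum().shift(1) >= 40
    return (lagged_close > 5) & (med_dv_60 > 20e6) & history_ok & volume_ok
```
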
### Industry mapping

Do not use today's static sector labels to explain historical behavior. For historical regime and industry alignment, prefer PIT-safe proxies such as rolling correlation or beta to industry ETFs over `63/126d` windows.

## Anti-Lookahead Rules

The framework must enforce the following rules consistently.

1. Signals computed using `t` daily bars may be traded no earlier than `t+1`.
2. If an event is effectively published after market close, it becomes tradable no earlier than the next trading day after publication.
3. Rolling inputs for liquidity, volatility, and breakout logic must use complete lagged windows with explicit timing semantics.
4. Cross-sectional ranking must happen only within the daily eligible universe.
5. Universe membership, filters, and factor normalization must be applied before portfolio selection, not after.

## Execution Convention

Default execution convention:

- observe data through `t` close
- compute signal after the `t` close
- trade at `t+1`

The framework may compare `t+1 open` and `t+1 close` execution variants if the data path supports both, but the default research baseline should be conservative and consistent.

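A tiny sketch of this convention, assuming daily close prices and per-day target weights (names are illustrative):

```python
# A signal built from data through the close of day t only earns the t -> t+1 return
# after a one-day shift, i.e. it is traded at t+1.
import pandas as pd

def lagged_strategy_returns(signal_weights: pd.DataFrame, close: pd.DataFrame) -> pd.Series:
    next_day_returns = close.pct_change()        # return over t -> t+1, indexed at t+1
    traded_weights = signal_weights.shift(1)     # signal from t is applied on t+1
    return (traded_weights * next_day_returns).sum(axis=1)
```
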
## Backtest and Evaluation Framework

Every strategy family must run through a single pipeline that:

1. loads required market data
2. constructs the daily eligible universe
3. computes regime filters
4. computes strategy scores or event states
5. builds a `long-only` portfolio
6. applies transaction costs
7. reports `1/2/3/5/10y` windows
8. records robustness diagnostics

### Portfolio defaults

Initial baseline settings:

- `long-only`
- concentrated books such as `top 5`, `top 10`, `top 20`
- start with `equal weight`
- add `inverse-vol` weighting only as a secondary comparison

Equal-weight concentrated portfolios should be the first baseline because they are harder to over-engineer than adaptive weighting schemes.

### Required robustness checks

Any strategy candidate that looks strong must automatically be re-run under:

- tighter liquidity thresholds
- fewer and more positions
- higher trading costs
- different rebalance frequencies
- exclusion of the lowest-liquidity or smallest-cap tail

Only strategies that survive these perturbations should be promoted to `Tier A`.

## Repository Changes

The following repository changes are required.

### New modules

#### `research/us_universe.py`

Responsibilities:

- build daily tradable-universe masks
- support `strict` and `exploratory` modes
- enforce lagged eligibility rules

#### `data_manager.py` extension or new `market_data.py`

Responsibilities:

- support daily US `OHLCV`
- support ETF data updates
- preserve existing price-loading workflows where practical

#### `research/regime_filters.py`

Responsibilities:

- market risk-on/risk-off filters
- ETF leadership signals
- breadth and relative-strength helpers

#### `research/event_factors.py`

Responsibilities:

- breakout-compression scores
- gap-continuation scores
- high-volume continuation logic
- earnings-drift proxy logic

#### `research/us_alpha_pipeline.py`

Responsibilities:

- orchestrate end-to-end research runs
- load data
- build universe masks
- run strategy families
- produce windowed rankings
- label output as `strict` or `exploratory`

#### `research/us_alpha_report.py`

Responsibilities:

- format tables and CSV outputs
- summarize results by family and horizon
- support markdown export if needed

## Research Phasing

The implementation should be split into two phases.

### Phase 1

Build the strict, defensible research backbone:

- PIT S&P 500 universe
- OHLCV data support
- ETF regime filters
- `Breakout After Compression`
- `Regime-Gated Cross-Sectional Alpha`
- `Gap-and-Go / High-Volume Continuation`
- unified backtest and reporting pipeline

This phase should produce a clean research system that is difficult to fool with future information.

### Phase 2

Expand into higher-upside exploratory research:

- wider US stock universe
- broader signal scanning
- stronger CAGR search
- explicit exploratory labeling

This phase is for alpha discovery, not for making final claims about unbiased production performance.

## Recommended Output

The finished framework should produce:

- a repeatable research entrypoint for US alpha studies
- CSV outputs for `1/2/3/5/10y` windows
- a ranked table of strategy families
- tier classification for candidates
- notes on where near-`50% CAGR` outcomes come from and whether they remain credible after tightening assumptions

## Non-Goals

This project does not aim to:

- promise stable `10y 50% CAGR`
- claim a fully point-in-time-clean all-US-stock universe from free data alone
- optimize to a single headline metric at the expense of realism
- treat exploratory full-market scans as production-quality evidence

## Key Decision

The core design choice is to build infrastructure that minimizes self-deception first, and only then search for extreme CAGR outcomes. Any other order is likely to produce attractive but unreliable results.

factor_attribution.py (new file, 727 lines)

from __future__ import annotations
|
||||
|
||||
import json
|
||||
import http.client
|
||||
import io
|
||||
import re
|
||||
import socket
|
||||
import ssl
|
||||
import warnings
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from urllib.error import URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy import stats
|
||||
|
||||
KEN_FRENCH_DAILY_FF5_ZIP_URL = (
|
||||
"https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"
|
||||
"F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
|
||||
)
|
||||
|
||||
EXPECTED_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"]
|
||||
CAPM_FACTOR_COLUMNS = ["MKT_RF"]
|
||||
FF5_FACTOR_COLUMNS = ["MKT_RF", "SMB", "HML", "RMW", "CMA"]
|
||||
EXTENSION_FACTOR_COLUMNS = ["MOM", "LOWVOL", "RECOVERY"]
|
||||
FF5PLUS_FACTOR_COLUMNS = FF5_FACTOR_COLUMNS + EXTENSION_FACTOR_COLUMNS
|
||||
PROXY_FACTOR_COLUMNS = [
|
||||
"MKT",
|
||||
"SMB_PROXY",
|
||||
"HML_PROXY",
|
||||
"RMW_PROXY",
|
||||
"CMA_PROXY",
|
||||
] + EXTENSION_FACTOR_COLUMNS
|
||||
TRADING_DAYS_PER_YEAR = 252
|
||||
MISSING_BENCHMARK_SENTINEL = "__missing_benchmark__"
|
||||
SUMMARY_BETA_COLUMN_BY_FACTOR = {
|
||||
"MKT_RF": "beta_mkt",
|
||||
"MKT": "beta_mkt",
|
||||
"SMB": "beta_smb",
|
||||
"SMB_PROXY": "beta_smb",
|
||||
"HML": "beta_hml",
|
||||
"HML_PROXY": "beta_hml",
|
||||
"RMW": "beta_rmw",
|
||||
"RMW_PROXY": "beta_rmw",
|
||||
"CMA": "beta_cma",
|
||||
"CMA_PROXY": "beta_cma",
|
||||
"MOM": "beta_mom",
|
||||
"LOWVOL": "beta_lowvol",
|
||||
"RECOVERY": "beta_recovery",
|
||||
}
|
||||
SUMMARY_COLUMNS = [
|
||||
"strategy",
|
||||
"market",
|
||||
"model",
|
||||
"factor_source",
|
||||
"proxy_only",
|
||||
"beta_semantics",
|
||||
"start_date",
|
||||
"end_date",
|
||||
"n_obs",
|
||||
"alpha_daily",
|
||||
"alpha_ann",
|
||||
"alpha_t_stat",
|
||||
"alpha_p_value",
|
||||
"r_squared",
|
||||
"adj_r_squared",
|
||||
"residual_vol_ann",
|
||||
"beta_mkt",
|
||||
"beta_smb",
|
||||
"beta_hml",
|
||||
"beta_rmw",
|
||||
"beta_cma",
|
||||
"beta_mom",
|
||||
"beta_lowvol",
|
||||
"beta_recovery",
|
||||
]
|
||||
LOADING_COLUMNS = [
|
||||
"strategy",
|
||||
"market",
|
||||
"model",
|
||||
"factor_source",
|
||||
"proxy_only",
|
||||
"factor",
|
||||
"beta",
|
||||
"t_stat",
|
||||
"p_value",
|
||||
]
|
||||
SEMANTIC_BETA_COLUMNS = [
|
||||
"beta_mkt",
|
||||
"beta_smb",
|
||||
"beta_hml",
|
||||
"beta_rmw",
|
||||
"beta_cma",
|
||||
"beta_mom",
|
||||
"beta_lowvol",
|
||||
"beta_recovery",
|
||||
]
|
||||
|
||||
|
||||
class ExternalFactorFormatError(ValueError):
|
||||
pass
|
||||
|
||||
|
||||
class ExternalFactorDownloadError(OSError):
|
||||
pass
|
||||
|
||||
|
||||
def _download_kf_zip_bytes() -> bytes:
|
||||
request = Request(
|
||||
KEN_FRENCH_DAILY_FF5_ZIP_URL,
|
||||
headers={"User-Agent": "quant-factor-attribution/0.1"},
|
||||
)
|
||||
try:
|
||||
with urlopen(request, timeout=30) as response:
|
||||
return response.read()
|
||||
except (
|
||||
URLError,
|
||||
TimeoutError,
|
||||
ConnectionError,
|
||||
socket.timeout,
|
||||
socket.gaierror,
|
||||
ssl.SSLError,
|
||||
http.client.HTTPException,
|
||||
http.client.IncompleteRead,
|
||||
http.client.RemoteDisconnected,
|
||||
) as exc:
|
||||
raise ExternalFactorDownloadError(f"Failed to download external factor data: {exc}") from exc
|
||||
|
||||
|
||||
def _parse_kf_daily_csv(raw_bytes: bytes) -> pd.DataFrame:
|
||||
with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
|
||||
member_names = [
|
||||
name
|
||||
for name in archive.namelist()
|
||||
if not name.endswith("/") and name.lower().endswith((".csv", ".txt"))
|
||||
]
|
||||
if not member_names:
|
||||
raise ExternalFactorFormatError("Ken French archive did not contain a CSV or TXT file")
|
||||
|
||||
try:
|
||||
text = archive.read(member_names[0]).decode("utf-8-sig")
|
||||
except UnicodeDecodeError as exc:
|
||||
raise ExternalFactorFormatError("Ken French factor file was not valid UTF-8 text") from exc
|
||||
|
||||
lines = [line for line in text.splitlines() if line.strip()]
|
||||
try:
|
||||
header_index = next(i for i, line in enumerate(lines) if "Mkt-RF" in line)
|
||||
except StopIteration as exc:
|
||||
raise ExternalFactorFormatError("Ken French factor file was missing the daily factor header") from exc
|
||||
|
||||
table = "\n".join(lines[header_index:])
|
||||
try:
|
||||
factors = pd.read_csv(io.StringIO(table))
|
||||
except pd.errors.ParserError as exc:
|
||||
raise ExternalFactorFormatError("Ken French factor table could not be parsed") from exc
|
||||
|
||||
factors = factors.rename(columns={"Mkt-RF": "MKT_RF"})
|
||||
date_column = factors.columns[0]
|
||||
missing_columns = [column for column in EXPECTED_FACTOR_COLUMNS if column not in factors.columns]
|
||||
if missing_columns:
|
||||
raise ExternalFactorFormatError(
|
||||
f"Ken French factor table was missing columns: {', '.join(missing_columns)}"
|
||||
)
|
||||
|
||||
factors = factors[factors[date_column].astype(str).str.fullmatch(r"\d{8}")]
|
||||
if factors.empty:
|
||||
raise ExternalFactorFormatError("Ken French factor table did not contain daily rows")
|
||||
|
||||
try:
|
||||
factors[date_column] = pd.to_datetime(factors[date_column], format="%Y%m%d")
|
||||
except ValueError as exc:
|
||||
raise ExternalFactorFormatError("Ken French factor table contained invalid dates") from exc
|
||||
|
||||
factors = factors.set_index(date_column)
|
||||
factors.index.name = None
|
||||
try:
|
||||
factors = factors[EXPECTED_FACTOR_COLUMNS].astype(float) / 100.0
|
||||
except ValueError as exc:
|
||||
raise ExternalFactorFormatError("Ken French factor table contained non-numeric values") from exc
|
||||
|
||||
return factors
|
||||
|
||||
|
||||
def _warn_and_load_cached_factors(cache_path: Path, reason: str) -> pd.DataFrame:
|
||||
warnings.warn(
|
||||
f"Using cached data from {cache_path} because {reason}.",
|
||||
UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return pd.read_csv(cache_path, index_col=0, parse_dates=True)
|
||||
|
||||
|
||||
def load_external_us_factors(cache_dir: Path | str = "data/factors") -> pd.DataFrame:
|
||||
cache_path = Path(cache_dir) / "ff5_us_daily.csv"
|
||||
cache_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
raw_bytes = _download_kf_zip_bytes()
|
||||
except ExternalFactorDownloadError as exc:
|
||||
if cache_path.exists():
|
||||
return _warn_and_load_cached_factors(cache_path, f"download failed: {exc}")
|
||||
raise
|
||||
|
||||
try:
|
||||
factors = _parse_kf_daily_csv(raw_bytes)
|
||||
except zipfile.BadZipFile as exc:
|
||||
if cache_path.exists():
|
||||
return _warn_and_load_cached_factors(cache_path, f"the upstream ZIP was invalid: {exc}")
|
||||
raise
|
||||
except ExternalFactorFormatError as exc:
|
||||
if cache_path.exists():
|
||||
return _warn_and_load_cached_factors(
|
||||
cache_path,
|
||||
f"the upstream factor format was invalid: {exc}",
|
||||
)
|
||||
raise
|
||||
|
||||
factors.to_csv(cache_path)
|
||||
return factors
|
||||
|
||||
|
||||
def _select_stock_prices(price_data: pd.DataFrame, benchmark: str) -> pd.DataFrame:
|
||||
stocks = price_data.drop(columns=[benchmark], errors="ignore")
|
||||
return stocks.sort_index().astype(float)
|
||||
|
||||
|
||||
def _long_short_factor(
|
||||
scores: pd.DataFrame,
|
||||
returns: pd.DataFrame,
|
||||
quantile: float = 0.3,
|
||||
) -> pd.Series:
|
||||
lagged_scores = scores.shift(1)
|
||||
high_cutoff = lagged_scores.quantile(1 - quantile, axis=1)
|
||||
low_cutoff = lagged_scores.quantile(quantile, axis=1)
|
||||
|
||||
long_mask = lagged_scores.ge(high_cutoff, axis=0)
|
||||
short_mask = lagged_scores.le(low_cutoff, axis=0)
|
||||
long_returns = returns.where(long_mask).mean(axis=1)
|
||||
short_returns = returns.where(short_mask).mean(axis=1)
|
||||
return (long_returns - short_returns).rename(None)
|
||||
|
||||
|
||||
def build_extension_factors(
|
||||
price_data: pd.DataFrame,
|
||||
benchmark: str,
|
||||
market: str,
|
||||
) -> pd.DataFrame:
|
||||
del market
|
||||
|
||||
stocks = _select_stock_prices(price_data, benchmark)
|
||||
returns = stocks.pct_change()
|
||||
|
||||
momentum_scores = stocks.shift(21).pct_change(231)
|
||||
low_vol_scores = -returns.rolling(60, min_periods=60).std()
|
||||
recovery_scores = stocks / stocks.rolling(63, min_periods=63).min() - 1.0
|
||||
|
||||
return pd.DataFrame(
|
||||
{
|
||||
"MOM": _long_short_factor(momentum_scores, returns),
|
||||
"LOWVOL": _long_short_factor(low_vol_scores, returns),
|
||||
"RECOVERY": _long_short_factor(recovery_scores, returns),
|
||||
},
|
||||
index=price_data.index,
|
||||
)
|
||||
|
||||
|
||||
def _positive_share(values: np.ndarray) -> float:
|
||||
return float(np.mean(values > 0))
|
||||
|
||||
|
||||
def build_proxy_core_factors(
|
||||
price_data: pd.DataFrame,
|
||||
benchmark: str,
|
||||
market: str,
|
||||
) -> pd.DataFrame:
|
||||
del market
|
||||
|
||||
stocks = _select_stock_prices(price_data, benchmark)
|
||||
returns = stocks.pct_change()
|
||||
|
||||
if benchmark in price_data:
|
||||
market_factor = price_data[benchmark].astype(float).pct_change()
|
||||
else:
|
||||
market_factor = returns.mean(axis=1)
|
||||
|
||||
inverse_price_scores = -stocks
|
||||
value_proxy_scores = -(stocks / stocks.rolling(252, min_periods=252).min() - 1.0)
|
||||
profitability_proxy_scores = returns.rolling(63, min_periods=63).apply(_positive_share, raw=True)
|
||||
investment_proxy_scores = -stocks.pct_change(126)
|
||||
|
||||
return pd.DataFrame(
|
||||
{
|
||||
"MKT": market_factor,
|
||||
"SMB_PROXY": _long_short_factor(inverse_price_scores, returns),
|
||||
"HML_PROXY": _long_short_factor(value_proxy_scores, returns),
|
||||
"RMW_PROXY": _long_short_factor(profitability_proxy_scores, returns),
|
||||
"CMA_PROXY": _long_short_factor(investment_proxy_scores, returns),
|
||||
},
|
||||
index=price_data.index,
|
||||
)
|
||||
|
||||
|
||||
def prepare_factor_models(
|
||||
market: str,
|
||||
extension_factors: pd.DataFrame,
|
||||
proxy_factors: pd.DataFrame | None = None,
|
||||
external_factors: pd.DataFrame | None = None,
|
||||
) -> dict[str, object]:
|
||||
market_name = market.lower()
|
||||
if market_name == "us" and external_factors is not None:
|
||||
factor_frame = pd.concat([external_factors, extension_factors], axis=1)
|
||||
return {
|
||||
"factor_frame": factor_frame,
|
||||
"models": {
|
||||
"capm": CAPM_FACTOR_COLUMNS.copy(),
|
||||
"ff5": FF5_FACTOR_COLUMNS.copy(),
|
||||
"ff5plus": FF5PLUS_FACTOR_COLUMNS.copy(),
|
||||
},
|
||||
"risk_free_col": "RF",
|
||||
"factor_source": "external+local",
|
||||
"proxy_only": False,
|
||||
"model_family": "standard",
|
||||
}
|
||||
|
||||
if proxy_factors is None:
|
||||
raise ValueError("proxy_factors are required when external factors are unavailable")
|
||||
|
||||
factor_frame = pd.concat([proxy_factors, extension_factors], axis=1)
|
||||
return {
|
||||
"factor_frame": factor_frame,
|
||||
"models": {"proxy": PROXY_FACTOR_COLUMNS.copy()},
|
||||
"risk_free_col": None,
|
||||
"factor_source": "proxy_only",
|
||||
"proxy_only": True,
|
||||
"model_family": "proxy",
|
||||
}
|
||||
|
||||
|
||||
def run_factor_regression(
|
||||
strategy_returns: pd.Series,
|
||||
factor_frame: pd.DataFrame,
|
||||
factor_cols: list[str],
|
||||
risk_free_col: str | None = None,
|
||||
) -> dict[str, object]:
|
||||
regression_frame = pd.concat(
|
||||
[strategy_returns.rename("strategy"), factor_frame[factor_cols + ([risk_free_col] if risk_free_col else [])]],
|
||||
axis=1,
|
||||
).dropna()
|
||||
|
||||
if regression_frame.empty:
|
||||
raise ValueError("No overlapping strategy and factor observations were available for regression")
|
||||
|
||||
y = regression_frame["strategy"].astype(float)
|
||||
if risk_free_col is not None:
|
||||
y = y - regression_frame[risk_free_col].astype(float)
|
||||
|
||||
x = regression_frame[factor_cols].astype(float).to_numpy()
|
||||
x = np.column_stack([np.ones(len(regression_frame)), x])
|
||||
n_obs = len(regression_frame)
|
||||
param_count = x.shape[1]
|
||||
if n_obs < param_count:
|
||||
raise ValueError(
|
||||
f"Insufficient observations for regression: need at least {param_count} rows, got {n_obs}"
|
||||
)
|
||||
|
||||
coefficients, _, rank, _ = np.linalg.lstsq(x, y.to_numpy(), rcond=None)
|
||||
if rank < param_count:
|
||||
raise ValueError(
|
||||
"Regression design matrix is rank-deficient; coefficients are not uniquely identified"
|
||||
)
|
||||
|
||||
fitted = x @ coefficients
|
||||
residuals = y.to_numpy() - fitted
|
||||
residual_series = pd.Series(residuals, index=regression_frame.index)
|
||||
if len(residual_series) == 1:
|
||||
residual_vol_ann = 0.0
|
||||
else:
|
||||
residual_vol_ann = float(residual_series.std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR))
|
||||
|
||||
dof = n_obs - param_count
|
||||
if dof > 0:
|
||||
residual_variance = float((residuals @ residuals) / dof)
|
||||
covariance = residual_variance * np.linalg.pinv(x.T @ x)
|
||||
standard_errors = np.sqrt(np.diag(covariance))
|
||||
|
||||
with np.errstate(divide="ignore", invalid="ignore"):
|
||||
t_stats = np.divide(
|
||||
coefficients,
|
||||
standard_errors,
|
||||
out=np.full_like(coefficients, np.nan, dtype=float),
|
||||
where=standard_errors > 0,
|
||||
)
|
||||
p_values = 2.0 * stats.t.sf(np.abs(t_stats), df=dof)
|
||||
adj_r_squared_is_defined = True
|
||||
else:
|
||||
t_stats = np.full_like(coefficients, np.nan, dtype=float)
|
||||
p_values = np.full_like(coefficients, np.nan, dtype=float)
|
||||
adj_r_squared_is_defined = False
|
||||
|
||||
ss_total = float(((y - y.mean()) ** 2).sum())
|
||||
ss_residual = float(np.sum(residuals**2))
|
||||
r_squared = 1.0 - ss_residual / ss_total if ss_total else 0.0
|
||||
if adj_r_squared_is_defined:
|
||||
adj_r_squared = 1.0 - (1.0 - r_squared) * (n_obs - 1) / (n_obs - param_count)
|
||||
else:
|
||||
adj_r_squared = float("nan")
|
||||
|
||||
factor_slice = slice(1, None)
|
||||
return {
|
||||
"alpha_daily": float(coefficients[0]),
|
||||
"alpha_ann": float(coefficients[0] * TRADING_DAYS_PER_YEAR),
|
||||
"alpha_t_stat": float(t_stats[0]),
|
||||
"alpha_p_value": float(p_values[0]),
|
||||
"betas": {name: float(value) for name, value in zip(factor_cols, coefficients[factor_slice])},
|
||||
"t_stats": {name: float(value) for name, value in zip(factor_cols, t_stats[factor_slice])},
|
||||
"p_values": {name: float(value) for name, value in zip(factor_cols, p_values[factor_slice])},
|
||||
"r_squared": float(r_squared),
|
||||
"adj_r_squared": float(adj_r_squared),
|
||||
"residual_vol_ann": residual_vol_ann,
|
||||
"start_date": regression_frame.index.min().date().isoformat(),
|
||||
"end_date": regression_frame.index.max().date().isoformat(),
|
||||
"n_obs": n_obs,
|
||||
}
|
||||
|
||||
|
||||
def _empty_attribution_frames() -> tuple[pd.DataFrame, pd.DataFrame]:
|
||||
return (
|
||||
pd.DataFrame(columns=SUMMARY_COLUMNS),
|
||||
pd.DataFrame(columns=LOADING_COLUMNS),
|
||||
)
|
||||
|
||||
|
||||
def _select_model_names(
|
||||
model_selection: str,
|
||||
available_models: dict[str, list[str]],
|
||||
) -> list[str]:
|
||||
if model_selection == "all":
|
||||
return list(available_models)
|
||||
if model_selection in available_models:
|
||||
return [model_selection]
|
||||
return list(available_models)
|
||||
|
||||
|
||||
def _resolve_benchmark_symbol(benchmark: str | None) -> str:
|
||||
if benchmark is None:
|
||||
return MISSING_BENCHMARK_SENTINEL
|
||||
return benchmark
|
||||
|
||||
|
||||
def _beta_semantics_map(proxy_only: bool) -> dict[str, str]:
|
||||
return {
|
||||
"beta_mkt": "MKT" if proxy_only else "MKT_RF",
|
||||
"beta_smb": "SMB_PROXY" if proxy_only else "SMB",
|
||||
"beta_hml": "HML_PROXY" if proxy_only else "HML",
|
||||
"beta_rmw": "RMW_PROXY" if proxy_only else "RMW",
|
||||
"beta_cma": "CMA_PROXY" if proxy_only else "CMA",
|
||||
"beta_mom": "MOM",
|
||||
"beta_lowvol": "LOWVOL",
|
||||
"beta_recovery": "RECOVERY",
|
||||
}
|
||||
|
||||
|
||||
def _resolve_beta_semantics(row: pd.Series) -> dict[str, str]:
|
||||
canonical = _beta_semantics_map(bool(row.get("proxy_only", False)))
|
||||
raw_value = row.get("beta_semantics")
|
||||
if isinstance(raw_value, str) and raw_value:
|
||||
try:
|
||||
parsed = json.loads(raw_value)
|
||||
except json.JSONDecodeError:
|
||||
return canonical
|
||||
else:
|
||||
if isinstance(parsed, dict):
|
||||
parsed_mapping = {str(key): str(value) for key, value in parsed.items()}
|
||||
if set(parsed_mapping) == set(SEMANTIC_BETA_COLUMNS) and all(
|
||||
value.strip() for value in parsed_mapping.values()
|
||||
) and _semantics_have_unique_headers(parsed_mapping):
|
||||
return parsed_mapping
|
||||
return canonical
|
||||
|
||||
|
||||
def _beta_header_name(factor_name: str) -> str:
|
||||
suffix = factor_name.strip().lower()
|
||||
suffix = re.sub(r"[^a-z0-9]+", "_", suffix).strip("_")
|
||||
if suffix == "mkt_rf":
|
||||
suffix = "mkt"
|
||||
return f"beta_{suffix}"
|
||||
|
||||
|
||||
def _semantics_have_unique_headers(semantics: dict[str, str]) -> bool:
|
||||
headers = [_beta_header_name(semantics[column]) for column in SEMANTIC_BETA_COLUMNS]
|
||||
if any(header == "beta_" for header in headers):
|
||||
return False
|
||||
return len(headers) == len(set(headers))
|
||||
|
||||
|
||||
def _section_beta_header_map(semantics: dict[str, str]) -> dict[str, str]:
|
||||
header_map: dict[str, str] = {}
|
||||
for beta_column, factor_name in semantics.items():
|
||||
header_map[beta_column] = _beta_header_name(factor_name)
|
||||
return header_map
|
||||
|
||||
|
||||
def _section_key(row: pd.Series) -> tuple[bool, tuple[tuple[str, str], ...]]:
|
||||
semantics = _resolve_beta_semantics(row)
|
||||
return bool(row.get("proxy_only", False)), tuple((key, semantics[key]) for key in SEMANTIC_BETA_COLUMNS)
|
||||
|
||||
|
||||
def attribute_strategies(
|
||||
results_df: pd.DataFrame,
|
||||
benchmark_label: str,
|
||||
price_data: pd.DataFrame,
|
||||
market: str,
|
||||
model_selection: str = "all",
|
||||
benchmark: str | None = None,
|
||||
external_factors: pd.DataFrame | None = None,
|
||||
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
||||
benchmark_symbol = _resolve_benchmark_symbol(benchmark)
|
||||
|
||||
extension_factors = build_extension_factors(price_data, benchmark=benchmark_symbol, market=market)
|
||||
|
||||
resolved_external_factors = external_factors
|
||||
market_name = market.lower()
|
||||
if market_name == "us" and resolved_external_factors is None:
|
||||
try:
|
||||
resolved_external_factors = load_external_us_factors()
|
||||
except (ExternalFactorDownloadError, ExternalFactorFormatError, zipfile.BadZipFile) as exc:
|
||||
warnings.warn(
|
||||
f"Falling back to proxy factor attribution because external US factors were unavailable: {exc}",
|
||||
UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
resolved_external_factors = None
|
||||
|
||||
proxy_factors = None
|
||||
if market_name != "us" or resolved_external_factors is None:
|
||||
proxy_factors = build_proxy_core_factors(price_data, benchmark=benchmark_symbol, market=market)
|
||||
|
||||
prepared = prepare_factor_models(
|
||||
market=market,
|
||||
extension_factors=extension_factors,
|
||||
proxy_factors=proxy_factors,
|
||||
external_factors=resolved_external_factors,
|
||||
)
|
||||
model_names = _select_model_names(model_selection, prepared["models"])
|
||||
|
||||
strategy_returns = results_df.sort_index().pct_change(fill_method=None)
|
||||
if strategy_returns.empty:
|
||||
return _empty_attribution_frames()
|
||||
|
||||
summary_rows: list[dict[str, object]] = []
|
||||
loading_rows: list[dict[str, object]] = []
|
||||
for strategy_name in strategy_returns.columns:
|
||||
if strategy_name == benchmark_label:
|
||||
continue
|
||||
|
||||
for model_name in model_names:
|
||||
factor_cols = prepared["models"][model_name]
|
||||
try:
|
||||
regression_result = run_factor_regression(
|
||||
strategy_returns=strategy_returns[strategy_name],
|
||||
factor_frame=prepared["factor_frame"],
|
||||
factor_cols=factor_cols,
|
||||
risk_free_col=prepared["risk_free_col"],
|
||||
)
|
||||
except ValueError as exc:
|
||||
warnings.warn(
|
||||
f"Skipping factor attribution for {strategy_name} ({model_name}): {exc}",
|
||||
UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
continue
|
||||
|
||||
summary_row: dict[str, object] = {
|
||||
"strategy": strategy_name,
|
||||
"market": market_name,
|
||||
"model": model_name,
|
||||
"factor_source": prepared["factor_source"],
|
||||
"proxy_only": prepared["proxy_only"],
|
||||
"beta_semantics": json.dumps(_beta_semantics_map(bool(prepared["proxy_only"])), sort_keys=True),
|
||||
"start_date": regression_result["start_date"],
|
||||
"end_date": regression_result["end_date"],
|
||||
"n_obs": regression_result["n_obs"],
|
||||
"alpha_daily": regression_result["alpha_daily"],
|
||||
"alpha_ann": regression_result["alpha_ann"],
|
||||
"alpha_t_stat": regression_result["alpha_t_stat"],
|
||||
"alpha_p_value": regression_result["alpha_p_value"],
|
||||
"r_squared": regression_result["r_squared"],
|
||||
"adj_r_squared": regression_result["adj_r_squared"],
|
||||
"residual_vol_ann": regression_result["residual_vol_ann"],
|
||||
"beta_mkt": np.nan,
|
||||
"beta_smb": np.nan,
|
||||
"beta_hml": np.nan,
|
||||
"beta_rmw": np.nan,
|
||||
"beta_cma": np.nan,
|
||||
"beta_mom": np.nan,
|
||||
"beta_lowvol": np.nan,
|
||||
"beta_recovery": np.nan,
|
||||
}
|
||||
for factor_name, beta in regression_result["betas"].items():
|
||||
summary_column = SUMMARY_BETA_COLUMN_BY_FACTOR.get(factor_name)
|
||||
if summary_column is not None:
|
||||
summary_row[summary_column] = beta
|
||||
loading_rows.append(
|
||||
{
|
||||
"strategy": strategy_name,
|
||||
"market": market_name,
|
||||
"model": model_name,
|
||||
"factor_source": prepared["factor_source"],
|
||||
"proxy_only": prepared["proxy_only"],
|
||||
"factor": factor_name,
|
||||
"beta": beta,
|
||||
"t_stat": regression_result["t_stats"][factor_name],
|
||||
"p_value": regression_result["p_values"][factor_name],
|
||||
}
|
||||
)
|
||||
|
||||
summary_rows.append(summary_row)
|
||||
|
||||
summary_df = pd.DataFrame(summary_rows, columns=SUMMARY_COLUMNS)
|
||||
loadings_df = pd.DataFrame(loading_rows, columns=LOADING_COLUMNS)
|
||||
return summary_df, loadings_df
|
||||
|
||||
|
||||
def export_attribution(
|
||||
summary_df: pd.DataFrame,
|
||||
loadings_df: pd.DataFrame,
|
||||
output_dir: Path | str,
|
||||
) -> None:
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
summary_df.to_csv(output_path / "summary.csv", index=False)
|
||||
loadings_df.to_csv(output_path / "loadings.csv", index=False)
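# Usage sketch (hypothetical variable names and output path; the real CLI wiring may differ):
#   summary_df, loadings_df = attribute_strategies(
#       results_df=equity_curves,      # wide DataFrame: one equity curve per strategy
#       benchmark_label="Benchmark",
#       price_data=prices,
#       market="us",
#       model_selection="all",
#   )
#   export_attribution(summary_df, loadings_df, "data/attribution_us")
#   print_attribution_summary(summary_df)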
|
||||
|
||||
|
||||
def _describe_alpha(alpha_ann: float) -> str:
|
||||
if alpha_ann > 0.02:
|
||||
return "positive"
|
||||
if alpha_ann < -0.02:
|
||||
return "negative"
|
||||
return "close to flat"
|
||||
|
||||
|
||||
def _describe_fit(r_squared: float) -> str:
|
||||
if r_squared >= 0.75:
|
||||
return "strong"
|
||||
if r_squared >= 0.4:
|
||||
return "moderate"
|
||||
return "weak"
|
||||
|
||||
|
||||
def _top_loading_descriptions(row: pd.Series, limit: int = 2) -> str:
|
||||
beta_columns = [column for column in row.index if column.startswith("beta_")]
|
||||
factor_labels = _resolve_beta_semantics(row)
|
||||
present = []
|
||||
for column in beta_columns:
|
||||
value = row.get(column)
|
||||
label = factor_labels.get(column)
|
||||
if label is not None and pd.notna(value):
|
||||
present.append((label, float(value)))
|
||||
|
||||
if not present:
|
||||
return "no material factor loadings were estimated"
|
||||
|
||||
top_loadings = sorted(present, key=lambda item: abs(item[1]), reverse=True)[:limit]
|
||||
return ", ".join(f"{name} {value:.2f}" for name, value in top_loadings)
|
||||
|
||||
|
||||
def _print_attribution_section(summary_df: pd.DataFrame, title: str, semantics: dict[str, str]) -> None:
|
||||
display_columns = [
|
||||
"strategy",
|
||||
"market",
|
||||
"model",
|
||||
"factor_source",
|
||||
"proxy_only",
|
||||
"alpha_ann",
|
||||
"r_squared",
|
||||
"residual_vol_ann",
|
||||
"beta_mkt",
|
||||
"beta_smb",
|
||||
"beta_hml",
|
||||
"beta_rmw",
|
||||
"beta_cma",
|
||||
"beta_mom",
|
||||
"beta_lowvol",
|
||||
"beta_recovery",
|
||||
]
|
||||
table = summary_df.reindex(columns=display_columns).copy()
|
||||
table = table.rename(columns=_section_beta_header_map(semantics))
|
||||
numeric_columns = [
|
||||
column
|
||||
for column in table.columns
|
||||
if column not in {"strategy", "market", "model", "factor_source", "proxy_only"}
|
||||
]
|
||||
table.loc[:, numeric_columns] = table.loc[:, numeric_columns].round(4)
|
||||
|
||||
print(f"\n{title}")
|
||||
print(table.to_string(index=False, na_rep=""))
|
||||
|
||||
|
||||
def print_attribution_summary(summary_df: pd.DataFrame) -> None:
|
||||
if summary_df.empty:
|
||||
print("Factor attribution: no usable regressions were produced.")
|
||||
return
|
||||
|
||||
print("\nFactor attribution")
|
||||
sections: dict[tuple[bool, tuple[tuple[str, str], ...]], list[int]] = {}
|
||||
for index, row in summary_df.iterrows():
|
||||
sections.setdefault(_section_key(row), []).append(index)
|
||||
|
||||
for (is_proxy, semantics_items), row_indexes in sections.items():
|
||||
section_rows = summary_df.loc[row_indexes]
|
||||
title = "Proxy factor attribution" if is_proxy else "Standard factor attribution"
|
||||
_print_attribution_section(
|
||||
section_rows,
|
||||
title=title,
|
||||
semantics=dict(semantics_items),
|
||||
)
|
||||
print("\nInterpretation")
|
||||
for _, row in summary_df.iterrows():
|
||||
print(
|
||||
f"- {row['strategy']} / {row['model']}: estimated annualized alpha is "
|
||||
f"{_describe_alpha(float(row['alpha_ann']))} ({row['alpha_ann']:.2%}); "
|
||||
f"strongest loadings are {_top_loading_descriptions(row)}; "
|
||||
f"model fit looks {_describe_fit(float(row['r_squared']))} (R^2={row['r_squared']:.2f})."
|
||||
)
|
||||
213
factor_backtest.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Backtest best factor combinations with yearly return breakdown.
|
||||
|
||||
US best: momentum + recovery + low_downside_beta + short_term_reversal
|
||||
CN best: momentum + near-52-week-high + turnover reversal (see the CN branch in run())
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import data_manager
|
||||
import metrics
|
||||
from universe import UNIVERSES
|
||||
from factor_research import (
|
||||
factor_momentum_12_1,
|
||||
factor_recovery,
|
||||
factor_short_term_reversal,
|
||||
factor_downside_beta_proxy,
|
||||
factor_lottery_demand,
|
||||
factor_turnover_reversal,
|
||||
factor_52w_high_distance,
|
||||
)
|
||||
|
||||
|
||||
def build_strategy_signals(
|
||||
prices: pd.DataFrame,
|
||||
factor_funcs: list,
|
||||
weights: list[float],
|
||||
top_n: int = 10,
|
||||
rebal_freq: int = 21,
|
||||
) -> pd.DataFrame:
|
||||
"""Build equal-weight top-N strategy from ranked factor combination."""
|
||||
signals_list = [f(prices) for f in factor_funcs]
|
||||
ranked = [s.rank(axis=1, pct=True, na_option="keep") for s in signals_list]
|
||||
composite = sum(w * r for w, r in zip(weights, ranked))
|
||||
|
||||
# Warmup: need at least 252 days
|
||||
warmup = 252
|
||||
|
||||
rank = composite.rank(axis=1, ascending=False, na_option="bottom")
|
||||
n_valid = composite.notna().sum(axis=1)
|
||||
enough = n_valid >= top_n
|
||||
top_mask = (rank <= top_n) & enough.values.reshape(-1, 1)
|
||||
|
||||
raw = top_mask.astype(float)
|
||||
row_sums = raw.sum(axis=1).replace(0, np.nan)
|
||||
signals = raw.div(row_sums, axis=0).fillna(0.0)
|
||||
|
||||
# Monthly rebalance
|
||||
rebal_mask = pd.Series(False, index=prices.index)
|
||||
rebal_indices = list(range(warmup, len(prices), rebal_freq))
|
||||
rebal_mask.iloc[rebal_indices] = True
|
||||
signals[~rebal_mask] = np.nan
|
||||
signals = signals.ffill().fillna(0.0)
|
||||
signals.iloc[:warmup] = 0.0
|
||||
|
||||
return signals.shift(1).fillna(0.0)
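# Example call (factor names taken from run() below; the 50/50 weights are illustrative):
#   sig = build_strategy_signals(
#       stocks,
#       [factor_momentum_12_1, factor_recovery],
#       weights=[0.5, 0.5],
#       top_n=10,
#       rebal_freq=21,
#   )
#   equity = backtest_equity(sig, stocks, cost=0.001)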
|
||||
|
||||
|
||||
def backtest_equity(signals: pd.DataFrame, prices: pd.DataFrame, cost: float = 0.001) -> pd.Series:
|
||||
"""Simple vectorized backtest returning equity curve."""
|
||||
returns = prices.pct_change().fillna(0.0)
|
||||
port_ret = (signals * returns).sum(axis=1)
|
||||
|
||||
# Transaction costs via turnover
|
||||
turnover = signals.diff().abs().sum(axis=1)
|
||||
port_ret -= turnover * cost
|
||||
|
||||
equity = (1 + port_ret).cumprod() * 100000
|
||||
return equity
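# Cost model note: `cost` is charged on one-way turnover (sum of |weight changes| per day),
# so cost=0.001 corresponds to 10 bps per unit of notional traded at each rebalance.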
|
||||
|
||||
|
||||
def yearly_returns(equity: pd.Series) -> pd.DataFrame:
|
||||
"""Compute calendar year returns from equity curve."""
|
||||
daily_ret = equity.pct_change().fillna(0)
|
||||
years = daily_ret.index.year
|
||||
rows = []
|
||||
for year in sorted(years.unique()):
|
||||
mask = years == year
|
||||
yr_ret = (1 + daily_ret[mask]).prod() - 1
|
||||
# Also compute max drawdown for the year
|
||||
eq_yr = equity[mask]
|
||||
running_max = eq_yr.cummax()
|
||||
dd = (eq_yr - running_max) / running_max
|
||||
rows.append({
|
||||
"year": year,
|
||||
"return": yr_ret,
|
||||
"max_dd": dd.min(),
|
||||
"start_val": float(eq_yr.iloc[0]),
|
||||
"end_val": float(eq_yr.iloc[-1]),
|
||||
})
|
||||
return pd.DataFrame(rows).set_index("year")
|
||||
|
||||
|
||||
def run(market: str, years_list: list[int]):
|
||||
config = UNIVERSES[market]
|
||||
benchmark = config["benchmark"]
|
||||
|
||||
print(f"Loading {market.upper()} price data...")
|
||||
prices = data_manager.load(market)
|
||||
bench_prices = prices[benchmark] if benchmark in prices.columns else None
|
||||
stocks = prices.drop(columns=[benchmark], errors="ignore")
|
||||
|
||||
if market == "us":
|
||||
label = "Mom+Recovery+LowDBeta+STR"
|
||||
factor_funcs = [factor_momentum_12_1, factor_recovery, factor_downside_beta_proxy, factor_short_term_reversal]
|
||||
weights = [0.25, 0.25, 0.25, 0.25]
|
||||
baseline_label = "Recovery+Mom (baseline)"
|
||||
baseline_funcs = [factor_momentum_12_1, factor_recovery]
|
||||
baseline_weights = [0.5, 0.5]
|
||||
else:
|
||||
label = "Mom+Near52wHigh+VolReversal"
|
||||
factor_funcs = [factor_momentum_12_1, factor_52w_high_distance, factor_turnover_reversal]
|
||||
weights = [0.40, 0.30, 0.30]
|
||||
baseline_label = "Mom+Recovery (baseline)"
|
||||
baseline_funcs = [factor_momentum_12_1, factor_recovery]
|
||||
baseline_weights = [0.5, 0.5]
|
||||
|
||||
for top_n in [10]:
|
||||
print(f"\n{'='*90}")
|
||||
print(f" {market.upper()} — Top {top_n} — {label}")
|
||||
print(f"{'='*90}")
|
||||
|
||||
# Best combo
|
||||
sig = build_strategy_signals(stocks, factor_funcs, weights, top_n=top_n)
|
||||
eq = backtest_equity(sig, stocks)
|
||||
|
||||
# Baseline
|
||||
sig_base = build_strategy_signals(stocks, baseline_funcs, baseline_weights, top_n=top_n)
|
||||
eq_base = backtest_equity(sig_base, stocks)
|
||||
|
||||
# Benchmark
|
||||
if bench_prices is not None:
|
||||
bp = bench_prices.dropna()
|
||||
eq_bench = bp / bp.iloc[0] * 100000
|
||||
|
||||
for n_years in years_list:
|
||||
cutoff = stocks.index[-1] - pd.DateOffset(years=n_years)
|
||||
eq_slice = eq[eq.index >= cutoff]
|
||||
eq_base_slice = eq_base[eq_base.index >= cutoff]
|
||||
|
||||
if len(eq_slice) < 50:
|
||||
continue
|
||||
|
||||
# Normalize to starting capital
|
||||
eq_norm = eq_slice / eq_slice.iloc[0] * 100000
|
||||
eq_base_norm = eq_base_slice / eq_base_slice.iloc[0] * 100000
|
||||
|
||||
yr = yearly_returns(eq_norm)
|
||||
yr_base = yearly_returns(eq_base_norm)
|
||||
|
||||
if bench_prices is not None:
|
||||
eq_bench_slice = eq_bench[eq_bench.index >= cutoff]
|
||||
eq_bench_norm = eq_bench_slice / eq_bench_slice.iloc[0] * 100000
|
||||
yr_bench = yearly_returns(eq_bench_norm)
|
||||
|
||||
print(f"\n--- Last {n_years} Years (from {eq_slice.index[0].date()}) ---\n")
|
||||
|
||||
# Combined table
|
||||
print(f" {'Year':<6} | {label:>30} | {baseline_label:>25} | {'Benchmark':>12} | {'Alpha vs Bench':>14}")
|
||||
print(f" {'-'*6}-+-{'-'*30}-+-{'-'*25}-+-{'-'*12}-+-{'-'*14}")
|
||||
|
||||
all_years = sorted(yr.index.tolist())
|
||||
total_new = 1.0
|
||||
total_base = 1.0
|
||||
total_bench = 1.0
|
||||
|
||||
for y in all_years:
|
||||
r_new = yr.loc[y, "return"] if y in yr.index else 0
|
||||
dd_new = yr.loc[y, "max_dd"] if y in yr.index else 0
|
||||
r_base = yr_base.loc[y, "return"] if y in yr_base.index else 0
|
||||
r_bench = yr_bench.loc[y, "return"] if bench_prices is not None and y in yr_bench.index else 0
|
||||
alpha = r_new - r_bench
|
||||
|
||||
total_new *= (1 + r_new)
|
||||
total_base *= (1 + r_base)
|
||||
total_bench *= (1 + r_bench)
|
||||
|
||||
print(f" {y:<6} | {r_new:>+14.2%} (dd {dd_new:>+7.2%}) | {r_base:>+25.2%} | {r_bench:>+12.2%} | {alpha:>+14.2%}")
|
||||
|
||||
total_r_new = total_new - 1
|
||||
total_r_base = total_base - 1
|
||||
total_r_bench = total_bench - 1
|
||||
cagr_new = (total_new ** (1 / n_years)) - 1
|
||||
cagr_base = (total_base ** (1 / n_years)) - 1
|
||||
cagr_bench = (total_bench ** (1 / n_years)) - 1
|
||||
|
||||
print(f" {'-'*6}-+-{'-'*30}-+-{'-'*25}-+-{'-'*12}-+-{'-'*14}")
|
||||
print(f" {'Total':<6} | {total_r_new:>+14.2%}{' '*16} | {total_r_base:>+25.2%} | {total_r_bench:>+12.2%} |")
|
||||
print(f" {'CAGR':<6} | {cagr_new:>+14.2%}{' '*16} | {cagr_base:>+25.2%} | {cagr_bench:>+12.2%} |")
|
||||
|
||||
# Full period metrics
|
||||
print(f"\n Full metrics ({label}):")
|
||||
daily_ret = eq_norm.pct_change().dropna()
|
||||
sharpe = daily_ret.mean() / daily_ret.std() * np.sqrt(252) if daily_ret.std() > 0 else 0
|
||||
running_max = eq_norm.cummax()
|
||||
max_dd = ((eq_norm - running_max) / running_max).min()
|
||||
print(f" Sharpe: {sharpe:.2f} | Max Drawdown: {max_dd:.2%} | Win Rate: {(daily_ret > 0).mean():.2%}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--market", default="us", choices=["us", "cn"])
|
||||
args = parser.parse_args()
|
||||
run(args.market, years_list=[3, 5, 10])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
324
factor_deep_analysis.py
Normal file
@@ -0,0 +1,324 @@
|
||||
"""
|
||||
Deep factor analysis — orthogonality, proper correlations, residual alpha.
|
||||
|
||||
For the top factor candidates identified in factor_research.py, this script:
|
||||
1. Computes proper daily cross-sectional rank correlations between factors
|
||||
2. Tests residual IC after neutralizing known factors (momentum, recovery)
|
||||
3. Runs sub-period breakdown (2-year windows)
|
||||
4. Tests factor combinations
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import data_manager
|
||||
from universe import UNIVERSES
|
||||
from factor_research import (
|
||||
factor_momentum_12_1,
|
||||
factor_recovery,
|
||||
factor_inverse_vol,
|
||||
factor_short_term_reversal,
|
||||
factor_idio_vol_change,
|
||||
factor_max_drawdown_recovery,
|
||||
factor_mean_reversion_residual,
|
||||
factor_skewness,
|
||||
factor_high_low_range as factor_range_compression,
|
||||
factor_52w_high_distance as factor_near_52w_high,
|
||||
factor_downside_beta_proxy as factor_low_downside_beta,
|
||||
factor_lottery_demand,
|
||||
factor_turnover_reversal,
|
||||
factor_gap_momentum as factor_smooth_momentum,
|
||||
factor_up_down_vol_ratio,
|
||||
factor_trend_strength,
|
||||
factor_consecutive_up_days,
|
||||
factor_volume_price_divergence,
|
||||
factor_recovery_acceleration,
|
||||
factor_relative_volume_momentum,
|
||||
factor_price_level,
|
||||
factor_liquidity_premium,
|
||||
compute_ic,
|
||||
)
|
||||
|
||||
warnings.filterwarnings("ignore", category=FutureWarning)
|
||||
|
||||
|
||||
def daily_cross_sectional_correlation(
|
||||
sig_a: pd.DataFrame, sig_b: pd.DataFrame
|
||||
) -> pd.Series:
|
||||
"""Daily cross-sectional Spearman correlation between two factor signals."""
|
||||
common_idx = sig_a.index.intersection(sig_b.index)
|
||||
common_cols = sig_a.columns.intersection(sig_b.columns)
|
||||
a = sig_a.loc[common_idx, common_cols]
|
||||
b = sig_b.loc[common_idx, common_cols]
|
||||
|
||||
corrs = {}
|
||||
for date in common_idx:
|
||||
va = a.loc[date].dropna()
|
||||
vb = b.loc[date].dropna()
|
||||
common = va.index.intersection(vb.index)
|
||||
if len(common) < 30:
|
||||
continue
|
||||
c = va[common].corr(vb[common], method="spearman")
|
||||
if np.isfinite(c):
|
||||
corrs[date] = c
|
||||
return pd.Series(corrs)
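# The result is a date-indexed Series of Spearman correlations; days with fewer than 30
# names valid in both signals are skipped so thin cross-sections do not distort the average.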
|
||||
|
||||
|
||||
def proper_factor_correlation_matrix(factors: dict[str, pd.DataFrame]) -> pd.DataFrame:
|
||||
"""Compute average daily cross-sectional Spearman correlations."""
|
||||
names = list(factors.keys())
|
||||
n = len(names)
|
||||
matrix = pd.DataFrame(1.0, index=names, columns=names)
|
||||
|
||||
for i in range(n):
|
||||
for j in range(i + 1, n):
|
||||
corr_series = daily_cross_sectional_correlation(factors[names[i]], factors[names[j]])
|
||||
avg_corr = corr_series.mean() if len(corr_series) > 0 else np.nan
|
||||
matrix.loc[names[i], names[j]] = avg_corr
|
||||
matrix.loc[names[j], names[i]] = avg_corr
|
||||
|
||||
return matrix
|
||||
|
||||
|
||||
def residual_signal(
|
||||
target: pd.DataFrame,
|
||||
controls: list[pd.DataFrame],
|
||||
) -> pd.DataFrame:
|
||||
"""Cross-sectionally orthogonalize target signal against control signals.
|
||||
For each day, regress target ranks on control ranks, return residual."""
|
||||
ranked_target = target.rank(axis=1, pct=True, na_option="keep")
|
||||
ranked_controls = [c.rank(axis=1, pct=True, na_option="keep") for c in controls]
|
||||
|
||||
residuals = pd.DataFrame(index=target.index, columns=target.columns, dtype=float)
|
||||
|
||||
for date in target.index:
|
||||
y = ranked_target.loc[date].dropna()
|
||||
xs = [rc.loc[date].reindex(y.index) for rc in ranked_controls if date in rc.index]
|
||||
if not xs:
|
||||
residuals.loc[date] = y
|
||||
continue
|
||||
|
||||
x_df = pd.concat(xs, axis=1).dropna()
|
||||
common = y.index.intersection(x_df.index)
|
||||
if len(common) < 30:
|
||||
continue
|
||||
|
||||
y_c = y[common].values
|
||||
x_c = x_df.loc[common].values
|
||||
x_c = np.column_stack([np.ones(len(common)), x_c])
|
||||
|
||||
try:
|
||||
coef, _, _, _ = np.linalg.lstsq(x_c, y_c, rcond=None)
|
||||
resid = y_c - x_c @ coef
|
||||
residuals.loc[date, common] = resid
|
||||
except np.linalg.LinAlgError:
|
||||
residuals.loc[date, common] = y[common].values
|
||||
|
||||
return residuals
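# Example (mirrors the usage in run_analysis below): orthogonalize a candidate factor
# against the two known factors, then measure its residual IC.
#   resid = residual_signal(factors["near_52w_high"], [factors["momentum_12_1"], factors["recovery"]])
#   residual_ic = compute_ic(resid, stocks.pct_change(5).shift(-5)).mean()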
|
||||
|
||||
|
||||
def subperiod_ic(signal: pd.DataFrame, prices: pd.DataFrame, horizon: int = 5, window_years: int = 2):
|
||||
"""Compute IC for each rolling sub-period."""
|
||||
fwd_ret = prices.pct_change(horizon).shift(-horizon)
|
||||
ic_series = compute_ic(signal, fwd_ret)
|
||||
if len(ic_series) == 0:
|
||||
return pd.DataFrame()
|
||||
|
||||
window = 252 * window_years
|
||||
results = []
|
||||
start = ic_series.index[0]
|
||||
while start < ic_series.index[-1]:
|
||||
end = start + pd.DateOffset(years=window_years)
|
||||
subset = ic_series[(ic_series.index >= start) & (ic_series.index < end)]
|
||||
if len(subset) > 100:
|
||||
results.append({
|
||||
"period": f"{start.year}-{end.year}",
|
||||
"ic_mean": subset.mean(),
|
||||
"ic_std": subset.std(),
|
||||
"icir": subset.mean() / subset.std() if subset.std() > 0 else 0,
|
||||
"pct_positive": (subset > 0).mean(),
|
||||
"n_days": len(subset),
|
||||
})
|
||||
start = end
|
||||
return pd.DataFrame(results)
|
||||
|
||||
|
||||
def test_factor_combination(
|
||||
factors: dict[str, pd.DataFrame],
|
||||
factor_names: list[str],
|
||||
weights: list[float],
|
||||
prices: pd.DataFrame,
|
||||
label: str,
|
||||
):
|
||||
"""Test a weighted combination of factors."""
|
||||
ranked = [factors[n].rank(axis=1, pct=True, na_option="keep") for n in factor_names]
|
||||
combo = sum(w * r for w, r in zip(weights, ranked))
|
||||
|
||||
fwd_5d = prices.pct_change(5).shift(-5)
|
||||
ic_series = compute_ic(combo, fwd_5d)
|
||||
if len(ic_series) == 0:
|
||||
return None
|
||||
|
||||
return {
|
||||
"combo": label,
|
||||
"ic_5d": ic_series.mean(),
|
||||
"icir_5d": ic_series.mean() / ic_series.std() if ic_series.std() > 0 else 0,
|
||||
"ic_stab": (ic_series.rolling(252).mean().dropna() > 0).mean() if len(ic_series) > 252 else np.nan,
|
||||
}
|
||||
|
||||
|
||||
def run_analysis(market: str):
|
||||
config = UNIVERSES[market]
|
||||
benchmark = config["benchmark"]
|
||||
|
||||
print(f"Loading {market.upper()} price data...")
|
||||
prices = data_manager.load(market)
|
||||
stocks = prices.drop(columns=[benchmark], errors="ignore")
|
||||
print(f"Universe: {stocks.shape[1]} stocks, {stocks.shape[0]} days")
|
||||
|
||||
# Build factors
|
||||
print("Computing factors...")
|
||||
factors = {}
|
||||
factors["momentum_12_1"] = factor_momentum_12_1(stocks)
|
||||
factors["recovery"] = factor_recovery(stocks)
|
||||
factors["inverse_vol"] = factor_inverse_vol(stocks)
|
||||
factors["short_term_reversal"] = factor_short_term_reversal(stocks)
|
||||
factors["drawdown_recovery"] = factor_max_drawdown_recovery(stocks)
|
||||
factors["mean_rev_zscore"] = factor_mean_reversion_residual(stocks)
|
||||
factors["neg_skewness"] = factor_skewness(stocks)
|
||||
factors["near_52w_high"] = factor_near_52w_high(stocks)
|
||||
factors["low_downside_beta"] = factor_low_downside_beta(stocks)
|
||||
factors["smooth_momentum"] = factor_smooth_momentum(stocks)
|
||||
factors["recovery_accel"] = factor_recovery_acceleration(stocks)
|
||||
factors["range_compression"] = factor_range_compression(stocks)
|
||||
|
||||
if market == "cn":
|
||||
factors["anti_lottery"] = factor_lottery_demand(stocks)
|
||||
factors["vol_reversal"] = factor_turnover_reversal(stocks)
|
||||
factors["low_price"] = factor_price_level(stocks)
|
||||
factors["illiquidity"] = factor_liquidity_premium(stocks)
|
||||
|
||||
# ---- 1. Proper Cross-Sectional Correlation Matrix ----
|
||||
print("\n" + "=" * 90)
|
||||
print(f" 1. CROSS-SECTIONAL FACTOR CORRELATIONS — {market.upper()}")
|
||||
print("=" * 90)
|
||||
print("(Average daily Spearman correlation between factor ranks)\n")
|
||||
|
||||
corr = proper_factor_correlation_matrix(factors)
|
||||
print(corr.round(3).to_string())
|
||||
|
||||
# ---- 2. Residual IC after neutralizing known factors ----
|
||||
print("\n" + "=" * 90)
|
||||
print(f" 2. RESIDUAL IC AFTER NEUTRALIZING KNOWN FACTORS — {market.upper()}")
|
||||
print("=" * 90)
|
||||
print("(IC of factor after cross-sectionally regressing out momentum + recovery)\n")
|
||||
|
||||
known = [factors["momentum_12_1"], factors["recovery"]]
|
||||
fwd_5d = stocks.pct_change(5).shift(-5)
|
||||
|
||||
new_candidates = [k for k in factors if k not in ("momentum_12_1", "recovery", "inverse_vol")]
|
||||
rows = []
|
||||
for name in new_candidates:
|
||||
resid = residual_signal(factors[name], known)
|
||||
ic_series = compute_ic(resid, fwd_5d)
|
||||
if len(ic_series) > 0:
|
||||
rows.append({
|
||||
"factor": name,
|
||||
"raw_ic_5d": compute_ic(factors[name], fwd_5d).mean(),
|
||||
"residual_ic_5d": ic_series.mean(),
|
||||
"residual_icir_5d": ic_series.mean() / ic_series.std() if ic_series.std() > 0 else 0,
|
||||
"pct_pos": (ic_series > 0).mean(),
|
||||
})
|
||||
|
||||
resid_df = pd.DataFrame(rows).set_index("factor").sort_values("residual_icir_5d", ascending=False)
|
||||
print(resid_df.round(4).to_string())
|
||||
|
||||
# ---- 3. Sub-Period Stability ----
|
||||
print("\n" + "=" * 90)
|
||||
print(f" 3. SUB-PERIOD IC STABILITY (2-year windows, 5-day horizon) — {market.upper()}")
|
||||
print("=" * 90)
|
||||
|
||||
# Test top factors
|
||||
if market == "us":
|
||||
top_factors = ["low_downside_beta", "drawdown_recovery", "mean_rev_zscore", "short_term_reversal", "momentum_12_1"]
|
||||
else:
|
||||
top_factors = ["momentum_12_1", "anti_lottery", "inverse_vol", "vol_reversal", "near_52w_high"]
|
||||
|
||||
for name in top_factors:
|
||||
if name not in factors:
|
||||
continue
|
||||
print(f"\n {name}:")
|
||||
sp = subperiod_ic(factors[name], stocks, horizon=5)
|
||||
if not sp.empty:
|
||||
print(sp.to_string(index=False))
|
||||
else:
|
||||
print(" (insufficient data)")
|
||||
|
||||
# ---- 4. Factor Combinations ----
|
||||
print("\n" + "=" * 90)
|
||||
print(f" 4. FACTOR COMBINATIONS — {market.upper()}")
|
||||
print("=" * 90)
|
||||
print("(Testing multi-factor composites)\n")
|
||||
|
||||
combos = []
|
||||
if market == "us":
|
||||
tests = [
|
||||
(["momentum_12_1", "low_downside_beta"], [0.5, 0.5], "mom+low_dbeta"),
|
||||
(["momentum_12_1", "drawdown_recovery"], [0.5, 0.5], "mom+dd_recovery"),
|
||||
(["momentum_12_1", "mean_rev_zscore"], [0.5, 0.5], "mom+mean_rev"),
|
||||
(["momentum_12_1", "short_term_reversal"], [0.5, 0.5], "mom+STR"),
|
||||
(["recovery", "low_downside_beta"], [0.5, 0.5], "recovery+low_dbeta"),
|
||||
(["momentum_12_1", "recovery", "low_downside_beta"], [0.33, 0.33, 0.34], "mom+rec+low_dbeta"),
|
||||
(["momentum_12_1", "recovery", "drawdown_recovery"], [0.33, 0.33, 0.34], "mom+rec+dd_rec"),
|
||||
(["momentum_12_1", "recovery", "short_term_reversal"], [0.33, 0.33, 0.34], "mom+rec+STR"),
|
||||
(["momentum_12_1", "recovery", "mean_rev_zscore"], [0.33, 0.33, 0.34], "mom+rec+meanrev"),
|
||||
(["momentum_12_1", "recovery", "low_downside_beta", "short_term_reversal"],
|
||||
[0.25, 0.25, 0.25, 0.25], "mom+rec+dbeta+STR"),
|
||||
(["momentum_12_1", "recovery", "drawdown_recovery", "mean_rev_zscore"],
|
||||
[0.25, 0.25, 0.25, 0.25], "mom+rec+ddrec+meanrev"),
|
||||
]
|
||||
else: # cn
|
||||
tests = [
|
||||
(["momentum_12_1", "anti_lottery"], [0.5, 0.5], "mom+anti_lottery"),
|
||||
(["momentum_12_1", "inverse_vol"], [0.5, 0.5], "mom+inv_vol"),
|
||||
(["momentum_12_1", "vol_reversal"], [0.5, 0.5], "mom+vol_reversal"),
|
||||
(["momentum_12_1", "near_52w_high"], [0.5, 0.5], "mom+near52wh"),
|
||||
(["momentum_12_1", "anti_lottery", "inverse_vol"], [0.33, 0.33, 0.34], "mom+alot+invvol"),
|
||||
(["momentum_12_1", "anti_lottery", "vol_reversal"], [0.33, 0.33, 0.34], "mom+alot+volrev"),
|
||||
(["momentum_12_1", "anti_lottery", "near_52w_high"], [0.33, 0.33, 0.34], "mom+alot+near52w"),
|
||||
(["momentum_12_1", "recovery", "anti_lottery"], [0.33, 0.33, 0.34], "mom+rec+alot"),
|
||||
(["momentum_12_1", "anti_lottery", "inverse_vol", "vol_reversal"],
|
||||
[0.25, 0.25, 0.25, 0.25], "mom+alot+invvol+volrev"),
|
||||
(["momentum_12_1", "anti_lottery", "near_52w_high", "vol_reversal"],
|
||||
[0.25, 0.25, 0.25, 0.25], "mom+alot+52wh+volrev"),
|
||||
]
|
||||
|
||||
# Also test the existing recovery+momentum baseline
|
||||
baseline = test_factor_combination(factors, ["momentum_12_1", "recovery"], [0.5, 0.5], stocks, "BASELINE: mom+recovery")
|
||||
if baseline:
|
||||
combos.append(baseline)
|
||||
|
||||
for names, weights, label in tests:
|
||||
if all(n in factors for n in names):
|
||||
result = test_factor_combination(factors, names, weights, stocks, label)
|
||||
if result:
|
||||
combos.append(result)
|
||||
|
||||
combo_df = pd.DataFrame(combos).set_index("combo").sort_values("icir_5d", ascending=False)
|
||||
print(combo_df.round(4).to_string())
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--market", default="us", choices=["us", "cn"])
|
||||
args = parser.parse_args()
|
||||
run_analysis(args.market)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
150
factor_final_check.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""Final robustness check on champions from the discovery loop."""
|
||||
|
||||
from __future__ import annotations
|
||||
import warnings
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import data_manager
|
||||
from universe import UNIVERSES
|
||||
from factor_loop import (
|
||||
strat, bt, stats, combo, yearly,
|
||||
f_rec_mom, f_rec_126, f_rec_63,
|
||||
f_mom_12_1, f_mom_6_1, f_mom_intermediate,
|
||||
f_above_ma200, f_golden_cross,
|
||||
f_up_volume_proxy, f_gap_up_freq,
|
||||
f_rec_mom_filtered, f_down_resilience,
|
||||
f_up_capture, f_52w_high, f_str_10d,
|
||||
f_earnings_drift, f_reversal_vol,
|
||||
)
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
|
||||
def f_quality_mom(p):
|
||||
mom = f_mom_12_1(p)
|
||||
consist_ret = p.pct_change()
|
||||
consist = (consist_ret > 0).astype(float).rolling(252, min_periods=126).mean()
|
||||
mom_r = mom.rank(axis=1, pct=True, na_option="keep")
|
||||
con_r = consist.rank(axis=1, pct=True, na_option="keep")
|
||||
up_r = f_up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return 0.4 * mom_r + 0.3 * con_r + 0.3 * up_r
|
||||
|
||||
|
||||
def f_mom_x_gap(p):
|
||||
mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
|
||||
gap_r = f_gap_up_freq(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return mom_r * gap_r
|
||||
|
||||
|
||||
def rolling_2yr(eq):
|
||||
dr = eq.pct_change().dropna()
|
||||
results = []
|
||||
for end_i in range(504, len(dr), 63):
|
||||
chunk = dr.iloc[end_i - 504:end_i]
|
||||
tot = (1 + chunk).prod() - 1
|
||||
ann = (1 + tot) ** (252 / len(chunk)) - 1
|
||||
sh = chunk.mean() / chunk.std() * np.sqrt(252) if chunk.std() > 0 else 0
|
||||
results.append({"end": chunk.index[-1].date(), "ann": ann, "sh": sh})
|
||||
return pd.DataFrame(results)
|
||||
|
||||
|
||||
def run_robustness(name, func, prices, label_prefix):
|
||||
print(f"\n {name}:")
|
||||
|
||||
# Top-N sensitivity
|
||||
print(f" Top-N: ", end="")
|
||||
for n in [5, 10, 15, 20]:
|
||||
w = strat(prices, func, top_n=n)
|
||||
eq = bt(w, prices)
|
||||
s = stats(eq)
|
||||
print(f"N={n}: {s['cagr']:+.1%}/{s['sharpe']:.2f} ", end="")
|
||||
print()
|
||||
|
||||
# Rebal sensitivity
|
||||
print(f" Rebal: ", end="")
|
||||
for r in [5, 10, 21, 42]:
|
||||
w = strat(prices, func, top_n=10, rebal=r)
|
||||
eq = bt(w, prices)
|
||||
s = stats(eq)
|
||||
print(f"{r}d: {s['cagr']:+.1%}/{s['sharpe']:.2f} ", end="")
|
||||
print()
|
||||
|
||||
# Cost sensitivity
|
||||
print(f" Cost: ", end="")
|
||||
for c in [0, 0.001, 0.002, 0.005]:
|
||||
w = strat(prices, func, top_n=10)
|
||||
eq = bt(w, prices, cost=c)
|
||||
s = stats(eq)
|
||||
print(f"{c*1e4:.0f}bp: {s['cagr']:+.1%} ", end="")
|
||||
print()
|
||||
|
||||
# Rolling 2-year
|
||||
w = strat(prices, func, top_n=10)
|
||||
eq = bt(w, prices)
|
||||
roll = rolling_2yr(eq)
|
||||
if not roll.empty:
|
||||
pct_pos = (roll["ann"] > 0).mean()
|
||||
print(f" 2yr rolling: mean={roll['ann'].mean():+.1%} min={roll['ann'].min():+.1%} "
|
||||
f"max={roll['ann'].max():+.1%} %pos={pct_pos:.0%} mean_sharpe={roll['sh'].mean():.2f}")
|
||||
|
||||
|
||||
def main():
|
||||
# ============= US =============
|
||||
prices_us = data_manager.load("us")
|
||||
stocks_us = prices_us.drop(columns=["SPY"], errors="ignore")
|
||||
|
||||
print("=" * 95)
|
||||
print(" US FINAL ROBUSTNESS — Champions vs Baseline")
|
||||
print("=" * 95)
|
||||
|
||||
    us_champs = [
        ("BASELINE: rec+mom", f_rec_mom),
        ("rec_mom_filt + rec_deep×upvol",
         combo([
             (f_rec_mom_filtered, 0.5),
             (combo([(f_rec_126, 0.5), (f_up_volume_proxy, 0.5)]), 0.5),
         ])),
        ("above_ma200+mom_7m+rec_126d",
         combo([(f_above_ma200, 0.33), (f_mom_intermediate, 0.33), (f_rec_126, 0.34)])),
        ("rec_mom_filtered+above_ma200",
         combo([(f_rec_mom_filtered, 0.5), (f_above_ma200, 0.5)])),
        ("mom_7m+rec_126d",
         combo([(f_mom_intermediate, 0.5), (f_rec_126, 0.5)])),
    ]
|
||||
|
||||
for name, func in us_champs:
|
||||
run_robustness(name, func, stocks_us, "US")
|
||||
|
||||
# ============= CN =============
|
||||
prices_cn = data_manager.load("cn")
|
||||
stocks_cn = prices_cn.drop(columns=["000300.SS"], errors="ignore")
|
||||
|
||||
print(f"\n{'='*95}")
|
||||
print(" CN FINAL ROBUSTNESS — Champions vs Baseline")
|
||||
print("=" * 95)
|
||||
|
||||
cn_champs = [
|
||||
("BASELINE: rec+mom", f_rec_mom),
|
||||
("up_capture+quality_mom",
|
||||
combo([(f_up_capture, 0.5), (f_quality_mom, 0.5)])),
|
||||
("recovery_63d+mom×gap",
|
||||
combo([(f_rec_63, 0.5), (f_mom_x_gap, 0.5)])),
|
||||
("down_resilience+quality_mom",
|
||||
combo([(f_down_resilience, 0.5), (f_quality_mom, 0.5)])),
|
||||
("up_capture+mom×gap",
|
||||
combo([(f_up_capture, 0.5), (f_mom_x_gap, 0.5)])),
|
||||
]
|
||||
|
||||
for name, func in cn_champs:
|
||||
run_robustness(name, func, stocks_cn, "CN")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
654
factor_loop.py
Normal file
@@ -0,0 +1,654 @@
|
||||
"""
|
||||
Iterative Factor Discovery Loop.
|
||||
|
||||
Round 1: Academic & practitioner hypotheses (30+ factors)
|
||||
Round 2: Data-driven variations on Round 1 winners
|
||||
Round 3: Interaction and conditional factors
|
||||
Round 4: Combinations of the top factors from Rounds 1-3
Round 5: Yearly return breakdown of the best combinations
|
||||
|
||||
Each factor is tested immediately as a top-10 equal-weight strategy
|
||||
with monthly rebalancing and 10bps transaction costs.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import warnings
|
||||
from typing import Callable
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import data_manager
|
||||
from universe import UNIVERSES
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
FactorFunc = Callable[[pd.DataFrame], pd.DataFrame]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Backtest infrastructure
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def strat(
|
||||
prices: pd.DataFrame,
|
||||
signal_func: FactorFunc,
|
||||
top_n: int = 10,
|
||||
rebal: int = 21,
|
||||
warmup: int = 252,
|
||||
) -> pd.DataFrame:
|
||||
sig = signal_func(prices)
|
||||
rank = sig.rank(axis=1, ascending=False, na_option="bottom")
|
||||
n_valid = sig.notna().sum(axis=1)
|
||||
enough = n_valid >= top_n
|
||||
mask = (rank <= top_n) & enough.values.reshape(-1, 1)
|
||||
raw = mask.astype(float)
|
||||
w = raw.div(raw.sum(axis=1).replace(0, np.nan), axis=0).fillna(0.0)
|
||||
rmask = pd.Series(False, index=prices.index)
|
||||
rmask.iloc[list(range(warmup, len(prices), rebal))] = True
|
||||
w[~rmask] = np.nan
|
||||
w = w.ffill().fillna(0.0)
|
||||
w.iloc[:warmup] = 0.0
|
||||
return w.shift(1).fillna(0.0)
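# Note: the final shift(1) lags the weights by one day, so a signal computed on day t's
# close is first traded on day t+1, which avoids lookahead in the vectorized backtest.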
|
||||
|
||||
|
||||
def bt(weights: pd.DataFrame, prices: pd.DataFrame, cost: float = 0.001) -> pd.Series:
|
||||
ret = prices.pct_change().fillna(0.0)
|
||||
pr = (weights * ret).sum(axis=1)
|
||||
pr -= weights.diff().abs().sum(axis=1) * cost
|
||||
return (1 + pr).cumprod() * 100000
|
||||
|
||||
|
||||
def stats(eq: pd.Series) -> dict:
|
||||
dr = eq.pct_change().dropna()
|
||||
if len(dr) < 200 or dr.std() == 0:
|
||||
return {"cagr": np.nan, "sharpe": np.nan, "sortino": np.nan,
|
||||
"maxdd": np.nan, "calmar": np.nan}
|
||||
ny = len(dr) / 252
|
||||
tot = eq.iloc[-1] / eq.iloc[0] - 1
|
||||
cagr = (1 + tot) ** (1 / ny) - 1
|
||||
sh = dr.mean() / dr.std() * np.sqrt(252)
|
||||
sd = dr[dr < 0].std()
|
||||
so = dr.mean() / sd * np.sqrt(252) if sd > 0 else 0
|
||||
rm = eq.cummax()
|
||||
dd = ((eq - rm) / rm).min()
|
||||
cal = cagr / abs(dd) if dd != 0 else 0
|
||||
return {"cagr": cagr, "sharpe": sh, "sortino": so, "maxdd": dd, "calmar": cal}
|
||||
|
||||
|
||||
def yearly(eq: pd.Series) -> dict[int, float]:
|
||||
dr = eq.pct_change().fillna(0)
|
||||
return {y: float((1 + dr[dr.index.year == y]).prod() - 1) for y in sorted(dr.index.year.unique())}
|
||||
|
||||
|
||||
def test_factor(name: str, func: FactorFunc, prices: pd.DataFrame,
|
||||
top_n: int = 10) -> dict:
|
||||
w = strat(prices, func, top_n=top_n)
|
||||
eq = bt(w, prices)
|
||||
s = stats(eq)
|
||||
s["name"] = name
|
||||
s["equity"] = eq
|
||||
return s
|
||||
|
||||
|
||||
def combo(fws: list[tuple[FactorFunc, float]]) -> FactorFunc:
|
||||
def _c(p):
|
||||
return sum(w * f(p).rank(axis=1, pct=True, na_option="keep") for f, w in fws)
|
||||
return _c
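# Example (weights are illustrative, not a recommendation): blend 12-1 momentum with
# 63-day recovery and run it through the same strat/bt pipeline.
#   blended = combo([(f_mom_12_1, 0.6), (f_rec_63, 0.4)])
#   equity = bt(strat(prices, blended, top_n=10, rebal=21), prices, cost=0.001)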
|
||||
|
||||
|
||||
def print_results(results: list[dict], title: str):
|
||||
df = pd.DataFrame([{k: v for k, v in r.items() if k != "equity"} for r in results])
|
||||
df = df.set_index("name").sort_values("cagr", ascending=False)
|
||||
print(f"\n{'='*95}")
|
||||
print(f" {title}")
|
||||
print(f"{'='*95}")
|
||||
print(f" {'Factor':<45} {'CAGR':>7} {'Sharpe':>7} {'Sortino':>8} {'MaxDD':>7} {'Calmar':>7}")
|
||||
print(f" {'-'*85}")
|
||||
for name, row in df.iterrows():
|
||||
flag = " <<<" if "BASELINE" in str(name) else ""
|
||||
c = row['cagr']
|
||||
if np.isnan(c):
|
||||
continue
|
||||
print(f" {str(name):<45} {c:>+6.1%} {row['sharpe']:>7.2f} {row['sortino']:>8.2f} "
|
||||
f"{row['maxdd']:>+6.1%} {row['calmar']:>7.2f}{flag}")
|
||||
return df
|
||||
|
||||
|
||||
# =====================================================================
|
||||
# ROUND 1 — Academic & Practitioner Hypotheses
|
||||
# =====================================================================
|
||||
|
||||
# --- Momentum family ---
|
||||
def f_mom_12_1(p): return p.shift(21).pct_change(231)
|
||||
def f_mom_6_1(p): return p.shift(21).pct_change(105)
|
||||
def f_mom_3_1(p): return p.shift(21).pct_change(42)
|
||||
def f_mom_1_0(p): return p.pct_change(21) # 1-month (reversal in US)
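# Lookback conventions: the "_1" variants shift by 21 trading days to skip the most recent
# month (the short-term reversal window); e.g. f_mom_12_1 is the return over the ~11 months
# ending one month ago (pct_change(231) after shift(21)).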
|
||||
|
||||
# --- Recovery family ---
|
||||
def f_rec_63(p): return p / p.rolling(63, min_periods=63).min() - 1
|
||||
def f_rec_126(p): return p / p.rolling(126, min_periods=126).min() - 1
|
||||
def f_rec_21(p): return p / p.rolling(21, min_periods=21).min() - 1
|
||||
|
||||
# Novy-Marx 2012: intermediate momentum (7-12 month)
|
||||
def f_mom_intermediate(p): return p.shift(21).pct_change(147) # ~7 month
|
||||
|
||||
# Asness et al: quality/profitability proxy via return consistency
|
||||
def f_consistent_returns(p):
|
||||
ret = p.pct_change()
|
||||
return (ret > 0).astype(float).rolling(252, min_periods=126).mean()
|
||||
|
||||
# Da, Liu, Schaumburg 2014: information discreteness
|
||||
# Stocks with many small positive days > stocks with few large positive days
|
||||
def f_info_discrete(p):
|
||||
ret = p.pct_change()
|
||||
n_pos = (ret > 0).astype(float).rolling(60, min_periods=40).sum()
|
||||
sum_pos = ret.where(ret > 0, 0).rolling(60, min_periods=40).sum()
|
||||
avg_pos = sum_pos / n_pos.replace(0, np.nan)
|
||||
# High count of positive days + low average positive = smooth accumulation
|
||||
return n_pos / avg_pos.replace(0, np.nan)
|
||||
|
||||
# Accumulation proxy (worked in Round 1)
|
||||
def f_up_volume_proxy(p):
|
||||
ret = p.pct_change()
|
||||
return ret.where(ret > 0, 0).rolling(20, min_periods=15).sum()
|
||||
|
||||
# George & Hwang 2004: 52-week high ratio
|
||||
def f_52w_high(p):
|
||||
return p / p.rolling(252, min_periods=126).max()
|
||||
|
||||
# Frequency of large up-moves (worked in Round 1)
|
||||
def f_gap_up_freq(p):
|
||||
ret = p.pct_change()
|
||||
return (ret > 0.01).astype(float).rolling(60, min_periods=40).mean()
|
||||
|
||||
# Bali, Cakici, Whitelaw 2011: MAX effect (lottery demand)
|
||||
def f_anti_max(p):
|
||||
ret = p.pct_change()
|
||||
return -ret.rolling(20, min_periods=15).max()
|
||||
|
||||
# Ang et al 2006: idiosyncratic volatility (negative)
|
||||
def f_neg_ivol(p):
|
||||
ret = p.pct_change()
|
||||
return -ret.rolling(20, min_periods=15).std()
|
||||
|
||||
# Blitz & van Vliet 2007: low volatility anomaly
|
||||
def f_low_vol_60(p):
|
||||
ret = p.pct_change()
|
||||
return -ret.rolling(60, min_periods=40).std()
|
||||
|
||||
# Hurst exponent proxy — autocorrelation of returns
|
||||
# Stocks with positive autocorrelation = trending
|
||||
def f_autocorrelation(p):
|
||||
ret = p.pct_change()
|
||||
def _ac(x):
|
||||
x = x.dropna()
|
||||
if len(x) < 20:
|
||||
return np.nan
|
||||
return np.corrcoef(x[:-1], x[1:])[0, 1]
|
||||
return ret.rolling(60, min_periods=40).apply(_ac, raw=False)
|
||||
|
||||
# Short-term reversal (Jegadeesh 1990)
|
||||
def f_str_5d(p): return -p.pct_change(5)
|
||||
def f_str_10d(p): return -p.pct_change(10)
|
||||
|
||||
# Earnings drift proxy (worked in Round 1)
|
||||
def f_earnings_drift(p):
|
||||
ret_5d = p.pct_change(5)
|
||||
vol = p.pct_change().rolling(60, min_periods=40).std() * np.sqrt(5)
|
||||
z = ret_5d / vol.replace(0, np.nan)
|
||||
return z.rolling(60, min_periods=20).mean()
|
||||
|
||||
# Risk-adjusted momentum (Sharpe-momentum)
|
||||
def f_sharpe_mom(p):
|
||||
ret = p.pct_change()
|
||||
mu = ret.rolling(252, min_periods=126).mean()
|
||||
sigma = ret.rolling(252, min_periods=126).std()
|
||||
return mu / sigma.replace(0, np.nan)
|
||||
|
||||
# Trend strength: slope of log-price regression
|
||||
def f_trend_slope(p):
|
||||
log_p = np.log(p.replace(0, np.nan))
|
||||
def _slope(x):
|
||||
x = x.dropna().values
|
||||
if len(x) < 30:
|
||||
return np.nan
|
||||
t = np.arange(len(x), dtype=float)
|
||||
t -= t.mean()
|
||||
return (t * (x - x.mean())).sum() / (t * t).sum()
|
||||
return log_p.rolling(60, min_periods=30).apply(_slope, raw=False)
|
||||
|
||||
# Acceleration: recent momentum vs. longer-term momentum
|
||||
def f_mom_accel(p):
|
||||
m3 = p.shift(5).pct_change(58) # ~3mo
|
||||
m12 = p.shift(21).pct_change(231) # ~12mo
|
||||
return m3 - m12
|
||||
|
||||
# Mean reversion z-score
|
||||
def f_mean_rev_z(p):
|
||||
ma20 = p.rolling(20, min_periods=20).mean()
|
||||
vol = p.pct_change().rolling(60, min_periods=40).std() * p
|
||||
return -(p - ma20) / vol.replace(0, np.nan)
|
||||
|
||||
# Price relative to moving averages
|
||||
def f_above_ma200(p):
|
||||
return p / p.rolling(200, min_periods=200).mean() - 1
|
||||
|
||||
def f_above_ma50(p):
|
||||
return p / p.rolling(50, min_periods=50).mean() - 1
|
||||
|
||||
# Dual MA signal: 50-day MA / 200-day MA
|
||||
def f_golden_cross(p):
|
||||
ma50 = p.rolling(50, min_periods=50).mean()
|
||||
ma200 = p.rolling(200, min_periods=200).mean()
|
||||
return ma50 / ma200 - 1
|
||||
|
||||
# Drawdown recovery rate
|
||||
def f_dd_recovery_rate(p):
|
||||
rm = p.rolling(252, min_periods=126).max()
|
||||
dd = p / rm - 1 # negative when in drawdown
|
||||
return dd - dd.shift(20) # positive = recovering from drawdown
|
||||
|
||||
# A-share specific: short-term reversal x volatility
|
||||
def f_reversal_vol(p):
|
||||
return -p.pct_change(5) * p.pct_change().rolling(20, min_periods=15).std()
|
||||
|
||||
# Recovery + momentum (baseline)
|
||||
def f_rec_mom(p):
|
||||
r1 = f_rec_63(p).rank(axis=1, pct=True, na_option="keep")
|
||||
r2 = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return 0.5 * r1 + 0.5 * r2
|
||||
|
||||
|
||||
# =====================================================================
|
||||
# ROUND 2 — Second-order ideas from Round 1 analysis
|
||||
# =====================================================================
|
||||
|
||||
# The key insight: "quality of returns" matters more than "magnitude of returns"
|
||||
# Factors that measure HOW a stock goes up, not just that it went up.
|
||||
|
||||
# Smoothness-weighted momentum
|
||||
def f_smooth_momentum(p):
|
||||
"""Momentum penalized by path volatility. Stocks that go up smoothly."""
|
||||
mom = p.shift(21).pct_change(231)
|
||||
ret = p.pct_change()
|
||||
vol = ret.rolling(252, min_periods=126).std()
|
||||
return mom / (vol.replace(0, np.nan) ** 0.5) # sqrt to dampen
|
||||
|
||||
# Positive return ratio (like Sharpe numerator)
|
||||
def f_pos_ratio_60(p):
|
||||
"""Fraction of positive return days in 60 days. Quality signal."""
|
||||
ret = p.pct_change()
|
||||
return (ret > 0).astype(float).rolling(60, min_periods=40).mean()
|
||||
|
||||
# Cumulative positive returns vs cumulative negative returns
|
||||
def f_up_down_asymmetry(p):
|
||||
"""Ratio of cumulative up-move to cumulative down-move."""
|
||||
ret = p.pct_change()
|
||||
up = ret.where(ret > 0, 0).rolling(60, min_periods=40).sum()
|
||||
down = (-ret.where(ret < 0, 0)).rolling(60, min_periods=40).sum()
|
||||
return up / down.replace(0, np.nan)
|
||||
|
||||
# Streak momentum: max consecutive up days in last 40 days
|
||||
def f_max_streak(p):
|
||||
ret = p.pct_change()
|
||||
pos = (ret > 0).astype(float)
|
||||
def _max_streak(x):
|
||||
x = x.dropna().values
|
||||
if len(x) == 0:
|
||||
return 0
|
||||
best = cur = 0
|
||||
for v in x:
|
||||
cur = cur + 1 if v > 0.5 else 0
|
||||
best = max(best, cur)
|
||||
return best
|
||||
return pos.rolling(40, min_periods=20).apply(_max_streak, raw=False)
|
||||
|
||||
# Market-conditional behavior from close-to-close returns only (no intraday or overnight
# data is available): how a stock behaves on up-market vs. down-market days.
|
||||
def f_up_capture(p):
|
||||
"""Up-market capture ratio over 60 days."""
|
||||
ret = p.pct_change()
|
||||
mkt = ret.mean(axis=1)
|
||||
up_mkt = mkt > 0
|
||||
arr = ret.values.copy()
|
||||
arr[~up_mkt.values, :] = np.nan
|
||||
stock_up = pd.DataFrame(arr, index=ret.index, columns=ret.columns)
|
||||
mkt_up_vals = mkt.where(up_mkt, np.nan)
|
||||
stock_avg = stock_up.rolling(60, min_periods=20).mean()
|
||||
mkt_avg = mkt_up_vals.rolling(60, min_periods=20).mean()
|
||||
return stock_avg.div(mkt_avg, axis=0)
|
||||
|
||||
# Down-market resilience
|
||||
def f_down_resilience(p):
|
||||
"""How much LESS a stock falls on down-market days."""
|
||||
ret = p.pct_change()
|
||||
mkt = ret.mean(axis=1)
|
||||
down_mkt = mkt < 0
|
||||
arr = ret.values.copy()
|
||||
arr[~down_mkt.values, :] = np.nan
|
||||
down_ret = pd.DataFrame(arr, index=ret.index, columns=ret.columns)
|
||||
return -down_ret.rolling(120, min_periods=30).mean()
|
||||
|
||||
# Recovery from rolling max with momentum filter
|
||||
def f_rec_mom_filtered(p):
|
||||
"""Recovery factor only for stocks with positive 6-month momentum.
|
||||
Filters out dead-cat bounces."""
|
||||
rec = p / p.rolling(126, min_periods=126).min() - 1
|
||||
mom = p.shift(21).pct_change(105)
|
||||
return rec.where(mom > 0, np.nan)
|
||||
|
||||
# Information discreteness v2: using the sign ratio
|
||||
def f_sign_ratio(p):
|
||||
"""Ratio of (count of positive days)^2 * avg_size to total return.
|
||||
High ratio = many small ups = institutional flow."""
|
||||
ret = p.pct_change()
|
||||
n_total = 60
|
||||
n_pos = (ret > 0).astype(float).rolling(n_total, min_periods=40).sum()
|
||||
total_ret = ret.rolling(n_total, min_periods=40).sum()
|
||||
sign_vol = n_pos / n_total
|
||||
# Stocks where most of the return comes from many small positive days
|
||||
return sign_vol * total_ret.clip(lower=0)
|
||||
|
||||
|
||||
# =====================================================================
|
||||
# ROUND 3 — Interaction & conditional factors
|
||||
# =====================================================================
|
||||
|
||||
def f_mom_x_recovery(p):
|
||||
"""Momentum × Recovery interaction. The product, not the sum."""
|
||||
mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
|
||||
rec_r = f_rec_63(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return mom_r * rec_r
|
||||
|
||||
def f_mom_x_upvol(p):
|
||||
"""Momentum × Up-volume-proxy interaction."""
|
||||
mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
|
||||
up_r = f_up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return mom_r * up_r
|
||||
|
||||
def f_rec_deep_x_upvol(p):
|
||||
"""Deep recovery × Up-volume interaction."""
|
||||
rec_r = f_rec_126(p).rank(axis=1, pct=True, na_option="keep")
|
||||
up_r = f_up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return rec_r * up_r
|
||||
|
||||
def f_trend_x_mom(p):
|
||||
"""Trend strength × Momentum. Trending + momentum = double signal."""
|
||||
tr_r = f_trend_slope(p).rank(axis=1, pct=True, na_option="keep")
|
||||
mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return tr_r * mom_r
|
||||
|
||||
def f_quality_mom(p):
|
||||
"""Momentum filtered by consistency. Only persistent winners."""
|
||||
mom = f_mom_12_1(p)
|
||||
consist = f_consistent_returns(p)
|
||||
mom_r = mom.rank(axis=1, pct=True, na_option="keep")
|
||||
con_r = consist.rank(axis=1, pct=True, na_option="keep")
|
||||
return 0.4 * mom_r + 0.3 * con_r + 0.3 * f_up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep")
|
||||
|
||||
def f_rec_deep_x_gap(p):
|
||||
"""Deep recovery × gap-up frequency."""
|
||||
rec_r = f_rec_126(p).rank(axis=1, pct=True, na_option="keep")
|
||||
gap_r = f_gap_up_freq(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return rec_r * gap_r
|
||||
|
||||
def f_mom_x_gap(p):
|
||||
"""Momentum × gap-up frequency."""
|
||||
mom_r = f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
|
||||
gap_r = f_gap_up_freq(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return mom_r * gap_r
|
||||
|
||||
# Regime-conditional: momentum with volatility filter
|
||||
def f_mom_low_vol_regime(p):
|
||||
"""Momentum only when market vol is below median.
|
||||
Momentum crashes in high-vol regimes."""
|
||||
mom = f_mom_12_1(p)
|
||||
mkt_vol = p.pct_change().mean(axis=1).rolling(60).std()
|
||||
vol_median = mkt_vol.rolling(252, min_periods=126).median()
|
||||
low_vol = mkt_vol <= vol_median
|
||||
mask = pd.DataFrame(
|
||||
np.tile(low_vol.values[:, None], (1, mom.shape[1])),
|
||||
index=mom.index, columns=mom.columns,
|
||||
)
|
||||
return mom.where(mask, 0)
|
||||
|
||||
|
||||
# =====================================================================
|
||||
# Main loop
|
||||
# =====================================================================
|
||||
|
||||
def run_round(
|
||||
name: str,
|
||||
factors: list[tuple[str, FactorFunc]],
|
||||
prices: pd.DataFrame,
|
||||
top_n: int = 10,
|
||||
) -> list[dict]:
|
||||
results = []
|
||||
for fname, func in factors:
|
||||
r = test_factor(fname, func, prices, top_n=top_n)
|
||||
results.append(r)
|
||||
print_results(results, name)
|
||||
return results
|
||||
|
||||
|
||||
def run_market(market: str):
|
||||
config = UNIVERSES[market]
|
||||
benchmark = config["benchmark"]
|
||||
prices = data_manager.load(market)
|
||||
bench = prices[benchmark].dropna() if benchmark in prices.columns else None
|
||||
stocks = prices.drop(columns=[benchmark], errors="ignore")
|
||||
print(f"\n{'#'*95}")
|
||||
print(f" FACTOR DISCOVERY LOOP — {market.upper()} MARKET")
|
||||
print(f" {stocks.shape[1]} stocks, {stocks.shape[0]} days, "
|
||||
f"{stocks.index[0].date()} → {stocks.index[-1].date()}")
|
||||
print(f"{'#'*95}")
|
||||
|
||||
# Benchmark
|
||||
if bench is not None:
|
||||
eq_bench = bench / bench.iloc[0] * 100000
|
||||
bs = stats(eq_bench)
|
||||
print(f"\n Benchmark: CAGR {bs['cagr']:+.1%}, Sharpe {bs['sharpe']:.2f}")
|
||||
|
||||
# ================================================================
|
||||
# ROUND 1: Academic & practitioner factors
|
||||
# ================================================================
|
||||
r1_factors = [
|
||||
("BASELINE:rec+mom", f_rec_mom),
|
||||
# Momentum family
|
||||
("mom_12_1", f_mom_12_1),
|
||||
("mom_6_1", f_mom_6_1),
|
||||
("mom_3_1", f_mom_3_1),
|
||||
("mom_1_0", f_mom_1_0),
|
||||
("mom_intermediate_7m", f_mom_intermediate),
|
||||
("sharpe_momentum", f_sharpe_mom),
|
||||
# Recovery family
|
||||
("recovery_63d", f_rec_63),
|
||||
("recovery_126d", f_rec_126),
|
||||
("recovery_21d", f_rec_21),
|
||||
# Trend
|
||||
("trend_slope_60d", f_trend_slope),
|
||||
("golden_cross", f_golden_cross),
|
||||
("above_ma200", f_above_ma200),
|
||||
# Volatility
|
||||
("low_vol_60d", f_low_vol_60),
|
||||
("neg_ivol_20d", f_neg_ivol),
|
||||
# Reversal
|
||||
("STR_5d", f_str_5d),
|
||||
("STR_10d", f_str_10d),
|
||||
# Quality / accumulation
|
||||
("consistent_returns", f_consistent_returns),
|
||||
("up_volume_proxy", f_up_volume_proxy),
|
||||
("gap_up_freq", f_gap_up_freq),
|
||||
("info_discrete", f_info_discrete),
|
||||
("earnings_drift", f_earnings_drift),
|
||||
# Other academic
|
||||
("52w_high", f_52w_high),
|
||||
("anti_max_20d", f_anti_max),
|
||||
("dd_recovery_rate", f_dd_recovery_rate),
|
||||
("mom_acceleration", f_mom_accel),
|
||||
]
|
||||
if market == "cn":
|
||||
r1_factors.append(("reversal_vol_cn", f_reversal_vol))
|
||||
r1 = run_round("ROUND 1 — Academic & Practitioner Hypotheses", r1_factors, stocks)
|
||||
|
||||
# Identify top-10 from round 1
|
||||
r1_sorted = sorted(r1, key=lambda x: x.get("cagr", 0) or 0, reverse=True)
|
||||
r1_top_names = [r["name"] for r in r1_sorted[:10] if r.get("cagr") and r["cagr"] > 0]
|
||||
baseline_cagr = next((r["cagr"] for r in r1 if "BASELINE" in r["name"]), 0)
|
||||
print(f"\n Baseline CAGR: {baseline_cagr:+.1%}")
|
||||
print(f" Top 10: {r1_top_names}")
|
||||
|
||||
# ================================================================
|
||||
# ROUND 2: Second-order ideas based on what worked
|
||||
    # ================================================================
    r2_factors = [
        ("BASELINE:rec+mom", f_rec_mom),
        ("smooth_momentum", f_smooth_momentum),
        ("pos_ratio_60d", f_pos_ratio_60),
        ("up_down_asymmetry", f_up_down_asymmetry),
        ("max_streak_40d", f_max_streak),
        ("up_capture_60d", f_up_capture),
        ("down_resilience_120d", f_down_resilience),
        ("rec_mom_filtered", f_rec_mom_filtered),
        ("sign_ratio", f_sign_ratio),
        ("autocorrelation_60d", f_autocorrelation),
        ("mean_rev_z", f_mean_rev_z),
    ]
    r2 = run_round("ROUND 2 — Return Quality & Behavioral Factors", r2_factors, stocks)

    # ================================================================
    # ROUND 3: Interaction & conditional factors
    # ================================================================
    r3_factors = [
        ("BASELINE:rec+mom", f_rec_mom),
        ("mom×recovery", f_mom_x_recovery),
        ("mom×upvol", f_mom_x_upvol),
        ("rec_deep×upvol", f_rec_deep_x_upvol),
        ("trend×mom", f_trend_x_mom),
        ("quality_mom", f_quality_mom),
        ("rec_deep×gap", f_rec_deep_x_gap),
        ("mom×gap", f_mom_x_gap),
        ("mom_low_vol_regime", f_mom_low_vol_regime),
    ]
    r3 = run_round("ROUND 3 — Interaction & Conditional Factors", r3_factors, stocks)

    # ================================================================
    # Collect ALL results from all rounds
    # ================================================================
    all_results = r1 + r2 + r3
    # Deduplicate baseline
    seen = set()
    unique = []
    for r in all_results:
        if r["name"] not in seen:
            seen.add(r["name"])
            unique.append(r)
    unique_sorted = sorted(unique, key=lambda x: x.get("cagr", 0) or 0, reverse=True)

    print(f"\n{'='*95}")
    print(f" ALL ROUNDS COMBINED — TOP 15 FACTORS — {market.upper()}")
    print(f"{'='*95}")
    print(f" {'Factor':<45} {'CAGR':>7} {'Sharpe':>7} {'Sortino':>8} {'MaxDD':>7} {'Calmar':>7}")
    print(f" {'-'*85}")
    for r in unique_sorted[:15]:
        flag = " <<<" if "BASELINE" in r["name"] else ""
        print(f" {r['name']:<45} {r['cagr']:>+6.1%} {r['sharpe']:>7.2f} {r['sortino']:>8.2f} "
              f"{r['maxdd']:>+6.1%} {r['calmar']:>7.2f}{flag}")

    # ================================================================
    # ROUND 4: Combine top non-baseline factors
    # ================================================================
    top_funcs = {}
    func_map_all = dict(r1_factors + r2_factors + r3_factors)
    non_baseline = [r for r in unique_sorted if "BASELINE" not in r["name"] and r.get("cagr", 0)]
    for r in non_baseline[:12]:
        if r["name"] in func_map_all:
            top_funcs[r["name"]] = func_map_all[r["name"]]

    top_names = list(top_funcs.keys())
    print(f"\n Building combinations from top {len(top_names)} factors: {top_names}")

    combo_factors = [("BASELINE:rec+mom", f_rec_mom)]

    # All pairs from top-8
    for i in range(min(8, len(top_names))):
        for j in range(i + 1, min(8, len(top_names))):
            n1, n2 = top_names[i], top_names[j]
            combo_factors.append((
                f"{n1[:20]}+{n2[:20]}",
                combo([(top_funcs[n1], 0.5), (top_funcs[n2], 0.5)])
            ))

    # Triple combos from top-5
    for i in range(min(5, len(top_names))):
        for j in range(i + 1, min(5, len(top_names))):
            for k in range(j + 1, min(5, len(top_names))):
                n1, n2, n3 = top_names[i], top_names[j], top_names[k]
                combo_factors.append((
                    f"{n1[:15]}+{n2[:15]}+{n3[:15]}",
                    combo([(top_funcs[n1], 0.33), (top_funcs[n2], 0.33), (top_funcs[n3], 0.34)])
                ))

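    # Added note: with a full top list this builds C(8,2) = 28 pair combos and
    # C(5,3) = 10 triple combos, i.e. up to 39 candidate strategies including
    # the baseline; fewer are built when len(top_names) is shorter.
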
    r4 = run_round("ROUND 4 — Factor Combinations", combo_factors, stocks)

    # ================================================================
    # ROUND 5: Yearly breakdown of top 5 combos
    # ================================================================
    r4_sorted = sorted(r4, key=lambda x: x.get("cagr", 0) or 0, reverse=True)
    top5 = r4_sorted[:5]
    # Make sure baseline is included
    base = next((r for r in r4 if "BASELINE" in r["name"]), None)
    if base and base not in top5:
        top5.append(base)

    print(f"\n{'='*95}")
    print(f" ROUND 5 — YEARLY RETURNS OF BEST STRATEGIES — {market.upper()}")
    print(f"{'='*95}")

    cols = [(r["name"], r["equity"]) for r in top5]
    if bench is not None:
        eq_bench = bench / bench.iloc[0] * 100000
        cols.append(("BENCHMARK", eq_bench))

    # Header
    header = f" {'Year':<6}"
    for name, _ in cols:
        header += f" | {name[:22]:>22}"
    print(header)
    print(" " + "-" * (6 + 25 * len(cols)))

    all_years = sorted(set(y for _, eq in cols for y in eq.index.year.unique()))
    for year in all_years:
        line = f" {year:<6}"
        for _, eq in cols:
            dr = eq.pct_change().fillna(0)
            yr = dr[dr.index.year == year]
            r = float((1 + yr).prod() - 1) if len(yr) > 0 else 0
            line += f" | {r:>+21.1%}"
        print(line)

    # Period CAGRs
    for ny in [3, 5, 10]:
        cutoff = stocks.index[-1] - pd.DateOffset(years=ny)
        print(f"\n --- {ny}-year CAGR ---")
        for name, eq in cols:
            sl = eq[eq.index >= cutoff]
            if len(sl) < 50:
                continue
            tot = sl.iloc[-1] / sl.iloc[0] - 1
            cagr = (1 + tot) ** (1 / ny) - 1
            print(f" {name[:50]:<50} {cagr:>+8.1%}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--market", default="us", choices=["us", "cn"])
    args = parser.parse_args()
    run_market(args.market)


if __name__ == "__main__":
    main()
449
factor_real_backtest.py
Normal file
449
factor_real_backtest.py
Normal file
@@ -0,0 +1,449 @@
"""
Factor research v2 — Portfolio-first approach.

Instead of IC → portfolio, we go directly to:
1. Build factor signal
2. Select top-N stocks
3. Run real backtest with transaction costs
4. Measure CAGR, Sharpe, MaxDD, yearly returns

Tests single factors AND combinations. Compares everything against
the baseline recovery+momentum strategy.
"""

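# A minimal orientation sketch (added, not part of the original pipeline),
# assuming data_manager.load() returns a wide close-price DataFrame:
#
#     prices = data_manager.load("us")
#     weights = make_strategy(prices, f_recovery_mom, top_n=10, rebal_freq=21)
#     equity = run_backtest(weights, prices, cost=0.001)
#     print(compute_stats(equity, "rec+mom baseline"))
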
from __future__ import annotations

import argparse
import warnings

import numpy as np
import pandas as pd

import data_manager
import metrics
from universe import UNIVERSES

warnings.filterwarnings("ignore")

# ---------------------------------------------------------------------------
# Factor signals — each returns DataFrame (dates x stocks), higher = better
# ---------------------------------------------------------------------------


def f_momentum_12_1(p: pd.DataFrame) -> pd.DataFrame:
    return p.shift(21).pct_change(231)


def f_recovery(p: pd.DataFrame) -> pd.DataFrame:
    return p / p.rolling(63, min_periods=63).min() - 1


def f_recovery_mom(p: pd.DataFrame) -> pd.DataFrame:
    """The baseline composite: 50/50 recovery + momentum ranks."""
    r1 = f_recovery(p).rank(axis=1, pct=True, na_option="keep")
    r2 = f_momentum_12_1(p).rank(axis=1, pct=True, na_option="keep")
    return 0.5 * r1 + 0.5 * r2


# --- New single factors ---

def f_short_term_reversal(p: pd.DataFrame) -> pd.DataFrame:
    """5-day return reversal."""
    return -p.pct_change(5)


def f_vol_adjusted_mom(p: pd.DataFrame) -> pd.DataFrame:
    """Momentum divided by recent volatility. Sharpe-like signal.
    Hypothesis: risk-adjusted momentum is more persistent."""
    mom = p.shift(21).pct_change(231)
    vol = p.pct_change().rolling(60, min_periods=40).std()
    return mom / vol.replace(0, np.nan)


def f_acceleration(p: pd.DataFrame) -> pd.DataFrame:
    """3-month momentum minus 12-month momentum.
    Hypothesis: accelerating stocks continue accelerating."""
    mom_3m = p.shift(5).pct_change(63 - 5)
    mom_12m = p.shift(21).pct_change(231)
    return mom_3m - mom_12m


def f_breakout(p: pd.DataFrame) -> pd.DataFrame:
    """Price relative to 20-day high. Close to 1 = breaking out.
    Hypothesis: breakouts from consolidation continue."""
    return p / p.rolling(20, min_periods=20).max()


def f_recovery_deep(p: pd.DataFrame) -> pd.DataFrame:
    """Recovery from 126-day (6 month) low instead of 63-day.
    Hypothesis: deeper recovery = stronger signal."""
    return p / p.rolling(126, min_periods=126).min() - 1


def f_recovery_rate(p: pd.DataFrame) -> pd.DataFrame:
    """Speed of recovery: 20-day change in recovery factor.
    Hypothesis: accelerating recovery predicts continuation."""
    recovery = p / p.rolling(63, min_periods=63).min() - 1
    return recovery - recovery.shift(20)


def f_drawdown_bounce(p: pd.DataFrame) -> pd.DataFrame:
    """20-day return from drawdown trough, only for stocks in drawdown.
    Hypothesis: strong bounces from drawdowns persist."""
    rolling_max = p.rolling(252, min_periods=126).max()
    in_drawdown = p < rolling_max * 0.9  # at least 10% below peak
    bounce_20d = p.pct_change(20)
    # Only score stocks that were recently in drawdown
    was_in_drawdown = in_drawdown.rolling(20, min_periods=1).max().astype(bool)
    return bounce_20d.where(was_in_drawdown, np.nan)


def f_consistent_winner(p: pd.DataFrame) -> pd.DataFrame:
    """Fraction of months with positive returns over past 12 months.
    Hypothesis: stocks that win consistently are higher quality momentum."""
    monthly_ret = p.pct_change(21)
    return (monthly_ret > 0).astype(float).rolling(252, min_periods=126).mean()


def f_gap_up_freq(p: pd.DataFrame) -> pd.DataFrame:
    """Fraction of days with >1% gain in past 60 days.
    Hypothesis: frequent large gains = institutional buying."""
    ret = p.pct_change()
    return (ret > 0.01).astype(float).rolling(60, min_periods=40).mean()


def f_low_vol_mom(p: pd.DataFrame) -> pd.DataFrame:
    """Momentum only among low-volatility stocks. Combined rank.
    Hypothesis: low-vol momentum is more persistent."""
    mom = f_momentum_12_1(p).rank(axis=1, pct=True, na_option="keep")
    vol = (-p.pct_change().rolling(60, min_periods=40).std()).rank(axis=1, pct=True, na_option="keep")
    return 0.5 * mom + 0.5 * vol


def f_52w_channel_position(p: pd.DataFrame) -> pd.DataFrame:
    """Position within 252-day high-low channel. 1 = at high, 0 = at low.
    Hypothesis: stocks near highs continue (anchoring + trend)."""
    h = p.rolling(252, min_periods=126).max()
    l = p.rolling(252, min_periods=126).min()
    return (p - l) / (h - l).replace(0, np.nan)


def f_up_volume_proxy(p: pd.DataFrame) -> pd.DataFrame:
    """Proxy for accumulation: sum of returns on up days over 20 days.
    Without volume data, use magnitude of positive returns as proxy."""
    ret = p.pct_change()
    up_ret = ret.where(ret > 0, 0)
    return up_ret.rolling(20, min_periods=15).sum()


def f_relative_strength_ma(p: pd.DataFrame) -> pd.DataFrame:
    """Price above 50-day MA relative to 200-day MA position.
    Dual MA trend strength."""
    ma50 = p.rolling(50, min_periods=50).mean()
    ma200 = p.rolling(200, min_periods=200).mean()
    above_50 = (p / ma50 - 1)
    above_200 = (p / ma200 - 1)
    return 0.5 * above_50 + 0.5 * above_200


def f_earnings_drift_proxy(p: pd.DataFrame) -> pd.DataFrame:
    """Proxy for post-earnings drift using 5-day return spikes.
    Identify large 5-day moves and bet on continuation.
    Hypothesis: large moves driven by information continue."""
    ret_5d = p.pct_change(5)
    vol = p.pct_change().rolling(60, min_periods=40).std() * np.sqrt(5)
    z_score = ret_5d / vol.replace(0, np.nan)
    # Smooth: average z-score over past 60 days to capture multiple events
    return z_score.rolling(60, min_periods=20).mean()


# --- A-share specific ---

def f_reversal_vol_cn(p: pd.DataFrame) -> pd.DataFrame:
    """Short-term reversal amplified by volatility.
    High-vol oversold stocks bounce harder in A-shares."""
    ret_5d = p.pct_change(5)
    vol = p.pct_change().rolling(20, min_periods=15).std()
    # Oversold (negative return) + high vol = positive score
    return -ret_5d * vol


def f_momentum_6_1(p: pd.DataFrame) -> pd.DataFrame:
    """6-1 month momentum. Shorter lookback may work better in A-shares."""
    return p.shift(21).pct_change(105)


def f_recovery_narrow(p: pd.DataFrame) -> pd.DataFrame:
    """Recovery from 21-day low. Faster recovery signal for A-shares."""
    return p / p.rolling(21, min_periods=21).min() - 1


def f_range_breakout_cn(p: pd.DataFrame) -> pd.DataFrame:
    """Breakout from 60-day range. Tuned for A-share volatility."""
    h60 = p.rolling(60, min_periods=40).max()
    l60 = p.rolling(60, min_periods=40).min()
    mid = (h60 + l60) / 2
    rng = (h60 - l60) / mid.replace(0, np.nan)
    position = (p - l60) / (h60 - l60).replace(0, np.nan)
    # Reward stocks breaking out of narrow ranges
    return position / rng.replace(0, np.nan)


# ---------------------------------------------------------------------------
# Strategy builder and backtester
# ---------------------------------------------------------------------------

def make_strategy(
    prices: pd.DataFrame,
    signal_func,
    top_n: int = 10,
    rebal_freq: int = 21,
    warmup: int = 252,
) -> pd.DataFrame:
    """Turn a factor signal into a rebalanced top-N equal-weight strategy."""
    signal = signal_func(prices)

    rank = signal.rank(axis=1, ascending=False, na_option="bottom")
    n_valid = signal.notna().sum(axis=1)
    enough = n_valid >= top_n
    top_mask = (rank <= top_n) & enough.values.reshape(-1, 1)

    raw = top_mask.astype(float)
    row_sums = raw.sum(axis=1).replace(0, np.nan)
    weights = raw.div(row_sums, axis=0).fillna(0.0)

    # Monthly rebalance
    rebal_mask = pd.Series(False, index=prices.index)
    rebal_indices = list(range(warmup, len(prices), rebal_freq))
    rebal_mask.iloc[rebal_indices] = True
    weights[~rebal_mask] = np.nan
    weights = weights.ffill().fillna(0.0)
    weights.iloc[:warmup] = 0.0

    return weights.shift(1).fillna(0.0)

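# Added note on make_strategy: with rebal_freq=21 the target weights change only
# on every 21st trading day after the warmup and are held (ffill) in between;
# the final .shift(1) means a signal formed on day t earns day t+1's return,
# so run_backtest below sees no lookahead.
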
def combo_signal(funcs_and_weights: list[tuple]) -> callable:
    """Create a combined signal function from [(func, weight), ...]."""
    def _combo(p: pd.DataFrame) -> pd.DataFrame:
        ranked = []
        for func, w in funcs_and_weights:
            sig = func(p)
            ranked.append(w * sig.rank(axis=1, pct=True, na_option="keep"))
        return sum(ranked)
    return _combo

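# Example (illustrative): a 50/50 blend of deep recovery and 12-1 momentum,
# usable wherever a single factor function is expected:
#
#     blend = combo_signal([(f_recovery_deep, 0.5), (f_momentum_12_1, 0.5)])
#     weights = make_strategy(prices, blend, top_n=10)
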
def run_backtest(
    weights: pd.DataFrame,
    prices: pd.DataFrame,
    cost: float = 0.001,
) -> pd.Series:
    """Vectorized backtest returning equity curve."""
    returns = prices.pct_change().fillna(0.0)
    port_ret = (weights * returns).sum(axis=1)
    turnover = weights.diff().abs().sum(axis=1)
    port_ret -= turnover * cost
    return (1 + port_ret).cumprod() * 100000


def compute_stats(equity: pd.Series, label: str) -> dict:
    """Compute strategy statistics."""
    daily_ret = equity.pct_change().dropna()
    if len(daily_ret) < 100 or daily_ret.std() == 0:
        return {"name": label, "cagr": np.nan, "sharpe": np.nan, "maxdd": np.nan,
                "total": np.nan, "win_rate": np.nan}

    n_years = len(daily_ret) / 252
    total_ret = equity.iloc[-1] / equity.iloc[0] - 1
    cagr = (1 + total_ret) ** (1 / n_years) - 1
    sharpe = daily_ret.mean() / daily_ret.std() * np.sqrt(252)
    sortino_denom = daily_ret[daily_ret < 0].std()
    sortino = daily_ret.mean() / sortino_denom * np.sqrt(252) if sortino_denom > 0 else 0
    running_max = equity.cummax()
    maxdd = ((equity - running_max) / running_max).min()
    calmar = cagr / abs(maxdd) if maxdd != 0 else 0
    win_rate = (daily_ret > 0).mean()

    return {
        "name": label, "cagr": cagr, "sharpe": sharpe, "sortino": sortino,
        "maxdd": maxdd, "calmar": calmar, "total": total_ret, "win_rate": win_rate,
    }


def yearly_returns(equity: pd.Series) -> dict[int, float]:
    daily_ret = equity.pct_change().fillna(0)
    years = daily_ret.index.year
    result = {}
    for year in sorted(years.unique()):
        mask = years == year
        result[year] = float((1 + daily_ret[mask]).prod() - 1)
    return result


def run(market: str):
    config = UNIVERSES[market]
    benchmark = config["benchmark"]

    print(f"Loading {market.upper()} price data...")
    prices = data_manager.load(market)
    bench = prices[benchmark].dropna() if benchmark in prices.columns else None
    stocks = prices.drop(columns=[benchmark], errors="ignore")
    print(f"Universe: {stocks.shape[1]} stocks, {stocks.shape[0]} days")
    print(f"Period: {stocks.index[0].date()} to {stocks.index[-1].date()}\n")

    # --- Define all strategies to test ---
    strategies: list[tuple[str, callable]] = []

    # Baseline
    strategies.append(("BASELINE: recovery+mom", f_recovery_mom))

    # Single factors
    strategies.append(("momentum_12_1", f_momentum_12_1))
    strategies.append(("recovery", f_recovery))
    strategies.append(("vol_adj_momentum", f_vol_adjusted_mom))
    strategies.append(("acceleration", f_acceleration))
    strategies.append(("breakout_20d", f_breakout))
    strategies.append(("recovery_deep_126d", f_recovery_deep))
    strategies.append(("recovery_rate", f_recovery_rate))
    strategies.append(("drawdown_bounce", f_drawdown_bounce))
    strategies.append(("consistent_winner", f_consistent_winner))
    strategies.append(("gap_up_freq", f_gap_up_freq))
    strategies.append(("low_vol_momentum", f_low_vol_mom))
    strategies.append(("52w_channel_position", f_52w_channel_position))
    strategies.append(("up_volume_proxy", f_up_volume_proxy))
    strategies.append(("relative_strength_ma", f_relative_strength_ma))
    strategies.append(("earnings_drift_proxy", f_earnings_drift_proxy))

    if market == "cn":
        strategies.append(("reversal_vol_cn", f_reversal_vol_cn))
        strategies.append(("momentum_6_1", f_momentum_6_1))
        strategies.append(("recovery_narrow_21d", f_recovery_narrow))
        strategies.append(("range_breakout_cn", f_range_breakout_cn))

    # Run all single-factor backtests
    print("=" * 110)
    print(f" SINGLE FACTOR BACKTESTS — {market.upper()} (Top 10, monthly rebal, 10bps cost)")
    print("=" * 110)

    results = []
    equities = {}
    for name, func in strategies:
        print(f" Running: {name}...")
        w = make_strategy(stocks, func, top_n=10)
        eq = run_backtest(w, stocks)
        equities[name] = eq
        results.append(compute_stats(eq, name))

    # Benchmark
    if bench is not None:
        eq_bench = bench / bench.iloc[0] * 100000
        equities["BENCHMARK"] = eq_bench
        results.append(compute_stats(eq_bench, "BENCHMARK"))

    # Print results table
    df = pd.DataFrame(results).set_index("name")
    df = df.sort_values("cagr", ascending=False)
    print(f"\n{'Strategy':<30} {'CAGR':>8} {'Sharpe':>8} {'Sortino':>8} {'MaxDD':>8} {'Calmar':>8} {'Total':>10}")
    print("-" * 90)
    for name, row in df.iterrows():
        flag = " ***" if name == "BASELINE: recovery+mom" else ""
        print(f"{name:<30} {row['cagr']:>+7.1%} {row['sharpe']:>8.2f} {row['sortino']:>8.2f} "
              f"{row['maxdd']:>+7.1%} {row['calmar']:>8.2f} {row['total']:>+9.0%}{flag}")

    # --- Identify factors that beat or match baseline ---
    baseline_cagr = df.loc["BASELINE: recovery+mom", "cagr"]
    winners = df[df["cagr"] >= baseline_cagr * 0.8].index.tolist()
    winners = [w for w in winners if w not in ("BASELINE: recovery+mom", "BENCHMARK")]
    print(f"\nFactors within 80% of baseline CAGR ({baseline_cagr:.1%}): {winners}")

    # --- Test combinations of top performers ---
    print(f"\n{'='*110}")
    print(f" FACTOR COMBINATIONS — {market.upper()}")
    print(f"{'='*110}")

    # Get top single factors
    single_only = df.drop(["BASELINE: recovery+mom", "BENCHMARK"], errors="ignore")
    top_singles = single_only.nlargest(8, "cagr").index.tolist()
    print(f" Top 8 singles: {top_singles}\n")

    # Map names back to functions
    func_map = dict(strategies)

    combos: list[tuple[str, callable]] = []
    # Baseline is always included
    combos.append(("BASELINE: recovery+mom", f_recovery_mom))

    # Top2 combinations
    for i in range(min(6, len(top_singles))):
        for j in range(i + 1, min(6, len(top_singles))):
            n1, n2 = top_singles[i], top_singles[j]
            label = f"{n1} + {n2}"
            func = combo_signal([(func_map[n1], 0.5), (func_map[n2], 0.5)])
            combos.append((label, func))

    # Recovery+mom + each top single (3-factor)
    for name in top_singles[:6]:
        if name in ("momentum_12_1", "recovery"):
            continue
        label = f"rec+mom + {name}"
        func = combo_signal([
            (f_recovery, 0.33), (f_momentum_12_1, 0.33), (func_map[name], 0.34)
        ])
        combos.append((label, func))

    # Run combo backtests
    combo_results = []
    for name, func in combos:
        print(f" Running: {name}...")
        w = make_strategy(stocks, func, top_n=10)
        eq = run_backtest(w, stocks)
        equities[name] = eq
        combo_results.append(compute_stats(eq, name))

    combo_df = pd.DataFrame(combo_results).set_index("name")
    combo_df = combo_df.sort_values("cagr", ascending=False)

    print(f"\n{'Combo':<55} {'CAGR':>8} {'Sharpe':>8} {'Sortino':>8} {'MaxDD':>8} {'Calmar':>8}")
    print("-" * 105)
    for name, row in combo_df.iterrows():
        flag = " ***" if name == "BASELINE: recovery+mom" else ""
        print(f"{name:<55} {row['cagr']:>+7.1%} {row['sharpe']:>8.2f} {row['sortino']:>8.2f} "
              f"{row['maxdd']:>+7.1%} {row['calmar']:>8.2f}{flag}")

    # --- Yearly breakdown for top 3 combos ---
    top3 = combo_df.nlargest(3, "cagr").index.tolist()
    if "BASELINE: recovery+mom" not in top3:
        top3.append("BASELINE: recovery+mom")

    print(f"\n{'='*110}")
    print(f" YEARLY RETURNS — TOP STRATEGIES vs BASELINE — {market.upper()}")
    print(f"{'='*110}")

    yr_data = {}
    for name in top3:
        yr_data[name] = yearly_returns(equities[name])
    if bench is not None:
        yr_data["BENCHMARK"] = yearly_returns(equities["BENCHMARK"])

    all_years = sorted(set(y for yd in yr_data.values() for y in yd.keys()))

    # Print header
    col_names = top3 + (["BENCHMARK"] if bench is not None else [])
    header = f" {'Year':<6}"
    for c in col_names:
        header += f" | {c[:25]:>25}"
    print(header)
    print(" " + "-" * (6 + 28 * len(col_names)))

    for year in all_years:
        line = f" {year:<6}"
        for c in col_names:
            r = yr_data.get(c, {}).get(year, 0)
            line += f" | {r:>+24.1%}"
        print(line)

    # Compute period summaries
    for n_years in [3, 5, 10]:
        cutoff = stocks.index[-1] - pd.DateOffset(years=n_years)
        print(f"\n --- {n_years}-year CAGR ---")
        for name in col_names:
            eq = equities.get(name)
            if eq is None:
                continue
            eq_slice = eq[eq.index >= cutoff]
            if len(eq_slice) < 50:
                continue
            total = eq_slice.iloc[-1] / eq_slice.iloc[0] - 1
            cagr = (1 + total) ** (1 / n_years) - 1
            print(f" {name[:40]:<40} {cagr:>+8.1%}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--market", default="us", choices=["us", "cn"])
    args = parser.parse_args()
    run(args.market)


if __name__ == "__main__":
    main()
547
factor_research.py
Normal file
547
factor_research.py
Normal file
@@ -0,0 +1,547 @@
"""
Factor Research Script — Professional QR-style factor mining.

Tests candidate alpha factors using:
- Information Coefficient (IC): rank correlation of signal vs forward returns
- IC Information Ratio (ICIR): mean(IC) / std(IC), measures signal consistency
- Quintile long-short spread: monotonicity of returns across signal buckets
- Turnover: daily rank change, proxy for trading cost
- Decay profile: IC at 1d, 5d, 10d, 20d horizons
- Sub-period stability: IC consistency across rolling windows
- Factor correlation matrix: ensures new factors are orthogonal to known ones

Usage:
    uv run python factor_research.py --market us
    uv run python factor_research.py --market cn
"""

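# Added reference: for horizon h, IC_t is the cross-sectional Spearman
# correlation between today's signal and the h-day forward return, and
#
#     ICIR_h = mean(IC_t) / std(IC_t)
#
# compute_ic() and analyze_factor() below estimate both empirically.
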
from __future__ import annotations

import argparse
import warnings

import numpy as np
import pandas as pd

import data_manager
from universe import UNIVERSES

warnings.filterwarnings("ignore", category=FutureWarning)

HORIZONS = [1, 5, 10, 20]


# ---------------------------------------------------------------------------
# Factor definitions — each returns a DataFrame (dates x stocks) of scores
# ---------------------------------------------------------------------------

def _safe_rank(df: pd.DataFrame) -> pd.DataFrame:
    return df.rank(axis=1, pct=True, na_option="keep")


def _rolling_ret(prices: pd.DataFrame, window: int) -> pd.DataFrame:
    return prices.pct_change(window)


# --- Known factors (baselines) ---

def factor_momentum_12_1(prices: pd.DataFrame) -> pd.DataFrame:
    """Classic 12-1 month momentum."""
    return prices.shift(21).pct_change(231)


def factor_recovery(prices: pd.DataFrame) -> pd.DataFrame:
    """Price / 63-day low - 1."""
    return prices / prices.rolling(63, min_periods=63).min() - 1


def factor_inverse_vol(prices: pd.DataFrame) -> pd.DataFrame:
    """Negative 60-day realized volatility (low vol = high score)."""
    return -prices.pct_change().rolling(60, min_periods=60).std()


# --- NEW candidate factors ---

def factor_short_term_reversal(prices: pd.DataFrame) -> pd.DataFrame:
    """5-day return reversal. Hypothesis: short-term mean reversion."""
    return -prices.pct_change(5)


def factor_idio_vol_change(prices: pd.DataFrame) -> pd.DataFrame:
    """Change in idiosyncratic volatility (20d vs 60d).
    Hypothesis: declining vol = stabilizing, predicts positive returns."""
    ret = prices.pct_change()
    vol_20 = ret.rolling(20, min_periods=20).std()
    vol_60 = ret.rolling(60, min_periods=60).std()
    return -(vol_20 / vol_60.replace(0, np.nan) - 1)  # negative = vol declining


def factor_volume_price_divergence(prices: pd.DataFrame, volume: pd.DataFrame | None = None) -> pd.DataFrame:
    """Price up but momentum fading — proxy via acceleration.
    Without volume data, use return acceleration as proxy."""
    ret_5 = prices.pct_change(5)
    ret_20 = prices.pct_change(20)
    return ret_5 - ret_20 / 4  # recent returns outpacing trend


def factor_max_drawdown_recovery(prices: pd.DataFrame) -> pd.DataFrame:
    """How much of the 60-day max drawdown has been recovered.
    Hypothesis: stocks that recover from drawdowns continue recovering."""
    rolling_max = prices.rolling(60, min_periods=60).max()
    drawdown = prices / rolling_max - 1  # negative
    rolling_min_dd = drawdown.rolling(60, min_periods=20).min()  # worst drawdown
    recovery_pct = drawdown / rolling_min_dd.replace(0, np.nan)
    return recovery_pct  # closer to 0 = more recovered


def factor_skewness(prices: pd.DataFrame) -> pd.DataFrame:
    """Negative 20-day return skewness.
    Hypothesis: positively skewed (lottery-like) stocks are overpriced due to
    lottery preference, so low/negative skew scores higher."""
    ret = prices.pct_change()
    return -ret.rolling(20, min_periods=20).skew()


def factor_high_low_range(prices: pd.DataFrame) -> pd.DataFrame:
    """20-day high-low range relative to price.
    Hypothesis: narrow range = consolidation, breakout ahead."""
    high_20 = prices.rolling(20, min_periods=20).max()
    low_20 = prices.rolling(20, min_periods=20).min()
    mid = (high_20 + low_20) / 2
    return -(high_20 - low_20) / mid.replace(0, np.nan)  # negative range = narrow = high score


def factor_mean_reversion_residual(prices: pd.DataFrame) -> pd.DataFrame:
    """Distance from 20-day MA as fraction of 60-day vol.
    Hypothesis: stocks far below MA revert. Z-score style."""
    ma_20 = prices.rolling(20, min_periods=20).mean()
    vol_60 = prices.pct_change().rolling(60, min_periods=60).std() * prices
    return -(prices - ma_20) / vol_60.replace(0, np.nan)  # below MA = high score


def factor_up_down_vol_ratio(prices: pd.DataFrame) -> pd.DataFrame:
    """Ratio of upside to downside semi-deviation (20d).
    Hypothesis: stocks with more upside vol have positive momentum."""
    ret = prices.pct_change()
    up_vol = ret.where(ret > 0, 0).rolling(20, min_periods=15).std()
    down_vol = ret.where(ret < 0, 0).rolling(20, min_periods=15).std()
    return up_vol / down_vol.replace(0, np.nan)


def factor_consecutive_up_days(prices: pd.DataFrame) -> pd.DataFrame:
    """Fraction of positive return days in last 10 days.
    Hypothesis: persistent winners keep winning (short-term)."""
    ret = prices.pct_change()
    return (ret > 0).astype(float).rolling(10, min_periods=10).mean()


def factor_gap_momentum(prices: pd.DataFrame) -> pd.DataFrame:
    """Cumulative overnight-like gaps: close-to-close vs intraday proxy.
    Using 1-day returns smoothed over 20 days minus the raw 20-day return.
    Hypothesis: smooth consistent returns beat volatile ones."""
    ret_1d = prices.pct_change()
    smoothness = ret_1d.rolling(20, min_periods=20).mean() * 20
    raw_20d = prices.pct_change(20)
    return smoothness - raw_20d  # positive = smoother path


def factor_recovery_acceleration(prices: pd.DataFrame) -> pd.DataFrame:
    """Rate of change of recovery factor.
    Hypothesis: accelerating recovery is stronger signal than level."""
    recovery = prices / prices.rolling(63, min_periods=63).min() - 1
    return recovery.pct_change(5)


def factor_trend_strength(prices: pd.DataFrame) -> pd.DataFrame:
    """R-squared of log-price vs time over 60 days.
    Hypothesis: stocks trending linearly (high R2) continue."""
    log_p = np.log(prices.replace(0, np.nan))

    def _r2(series):
        y = series.dropna().values
        if len(y) < 30:
            return np.nan
        x = np.arange(len(y), dtype=float)
        x -= x.mean()
        y_dm = y - y.mean()
        ss_xy = (x * y_dm).sum()
        ss_xx = (x * x).sum()
        ss_yy = (y_dm * y_dm).sum()
        if ss_xx == 0 or ss_yy == 0:
            return np.nan
        r2 = (ss_xy ** 2) / (ss_xx * ss_yy)
        slope = ss_xy / ss_xx
        return r2 if slope > 0 else -r2  # sign by direction

    return log_p.rolling(60, min_periods=30).apply(_r2, raw=False)


def factor_relative_volume_momentum(prices: pd.DataFrame) -> pd.DataFrame:
    """Price momentum weighted by how 'cheap' a stock is relative to 52-week range.
    Hypothesis: momentum in stocks near lows is more likely to persist."""
    mom_20 = prices.pct_change(20)
    high_252 = prices.rolling(252, min_periods=126).max()
    low_252 = prices.rolling(252, min_periods=126).min()
    position_in_range = (prices - low_252) / (high_252 - low_252).replace(0, np.nan)
    return mom_20 * (1 - position_in_range)  # momentum * cheapness


def factor_52w_high_distance(prices: pd.DataFrame) -> pd.DataFrame:
    """Distance from 52-week high.
    Hypothesis: stocks near their highs continue (anchoring bias)."""
    high_252 = prices.rolling(252, min_periods=126).max()
    return prices / high_252  # closer to 1 = near high


def factor_downside_beta_proxy(prices: pd.DataFrame) -> pd.DataFrame:
    """Proxy for downside beta using co-movement on market down days.
    Hypothesis: low downside beta outperforms (asymmetric risk)."""
    ret = prices.pct_change()
    market_ret = ret.mean(axis=1)
    down_days = market_ret < 0

    # Mask non-down-day returns to NaN, then rolling mean
    # Use numpy for correct broadcasting, wider window (120d) so ~54 down
    # days are available, well above min_periods=20
    arr = ret.values.copy()
    arr[~down_days.values, :] = np.nan
    down_ret = pd.DataFrame(arr, index=ret.index, columns=ret.columns)
    avg_down = down_ret.rolling(120, min_periods=20).mean()
    return -avg_down  # negative = less downside = good


# --- A-share specific factors ---

def factor_liquidity_premium(prices: pd.DataFrame) -> pd.DataFrame:
    """Amihud illiquidity proxy (using returns only, no volume).
    Hypothesis: illiquid stocks earn premium in A-shares (retail driven)."""
    ret = prices.pct_change()
    # Use absolute return as illiquidity proxy (higher abs ret = less liquid)
    illiq = ret.abs().rolling(20, min_periods=15).mean()
    return illiq


def factor_lottery_demand(prices: pd.DataFrame) -> pd.DataFrame:
    """Max daily return in past 20 days (negative).
    Hypothesis: lottery stocks (high max return) underperform.
    Strong in A-shares due to retail speculation."""
    ret = prices.pct_change()
    return -ret.rolling(20, min_periods=15).max()


def factor_turnover_reversal(prices: pd.DataFrame) -> pd.DataFrame:
    """Interaction of short-term returns with volatility.
    High vol + negative return = oversold bounce candidate.
    Common A-share alpha source."""
    ret_5 = prices.pct_change(5)
    vol_20 = prices.pct_change().rolling(20, min_periods=15).std()
    return -ret_5 * vol_20  # oversold + high vol = positive


def factor_price_level(prices: pd.DataFrame) -> pd.DataFrame:
    """Negative absolute price level.
    Hypothesis: low-priced stocks attract retail in A-shares (penny stock effect)."""
    return -prices


# ---------------------------------------------------------------------------
# IC and analytics engine
# ---------------------------------------------------------------------------

def compute_ic(
    signal: pd.DataFrame,
    forward_ret: pd.DataFrame,
) -> pd.Series:
    """Cross-sectional rank IC (Spearman) per day."""
    common_idx = signal.index.intersection(forward_ret.index)
    common_cols = signal.columns.intersection(forward_ret.columns)
    sig = signal.loc[common_idx, common_cols]
    fwd = forward_ret.loc[common_idx, common_cols]

    ics = []
    for date in common_idx:
        s = sig.loc[date].dropna()
        f = fwd.loc[date].dropna()
        common = s.index.intersection(f.index)
        if len(common) < 30:
            continue
        ic = s[common].corr(f[common], method="spearman")
        if np.isfinite(ic):
            ics.append((date, ic))

    if not ics:
        return pd.Series(dtype=float)
    return pd.Series(dict(ics))

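# Example (illustrative): the 5-day forward return is the 5-day return shifted
# back by the horizon, so a single factor's IC series can be obtained as
#
#     fwd_5d = prices.pct_change(5).shift(-5)
#     ic = compute_ic(factor_momentum_12_1(prices), fwd_5d)
#     print(ic.mean(), ic.mean() / ic.std())  # IC and ICIR at the 5d horizon
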
def compute_quintile_returns(
    signal: pd.DataFrame,
    forward_ret: pd.DataFrame,
    n_quantiles: int = 5,
) -> pd.DataFrame:
    """Average forward return by signal quintile, per day, then time-averaged."""
    common_idx = signal.index.intersection(forward_ret.index)
    common_cols = signal.columns.intersection(forward_ret.columns)
    sig = signal.loc[common_idx, common_cols]
    fwd = forward_ret.loc[common_idx, common_cols]

    records = []
    for date in common_idx:
        s = sig.loc[date].dropna()
        f = fwd.loc[date].dropna()
        common = s.index.intersection(f.index)
        if len(common) < 50:
            continue
        scores = s[common]
        rets = f[common]
        try:
            quintile = pd.qcut(scores, n_quantiles, labels=False, duplicates="drop")
        except ValueError:
            continue
        for q in range(n_quantiles):
            mask = quintile == q
            if mask.sum() > 0:
                records.append({"date": date, "quintile": q + 1, "return": rets[mask].mean()})

    if not records:
        return pd.DataFrame()
    df = pd.DataFrame(records)
    return df.groupby("quintile")["return"].mean() * 252  # annualize


def compute_turnover(signal: pd.DataFrame) -> float:
    """Average daily rank change (turnover proxy)."""
    ranked = signal.rank(axis=1, pct=True, na_option="keep")
    daily_change = ranked.diff().abs().mean(axis=1)
    return float(daily_change.mean())


def compute_factor_correlation(factors: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Cross-sectional IC correlation between all factor pairs."""
    names = list(factors.keys())
    n = len(names)
    corr_matrix = pd.DataFrame(np.nan, index=names, columns=names)

    # Use time-series of rank-averaged signals
    avg_ranks = {}
    for name, sig in factors.items():
        ranked = sig.rank(axis=1, pct=True, na_option="keep")
        avg_ranks[name] = ranked.mean(axis=1).dropna()

    for i in range(n):
        for j in range(i, n):
            s1 = avg_ranks[names[i]]
            s2 = avg_ranks[names[j]]
            common = s1.index.intersection(s2.index)
            if len(common) > 100:
                c = s1[common].corr(s2[common])
                corr_matrix.loc[names[i], names[j]] = c
                corr_matrix.loc[names[j], names[i]] = c
            if i == j:
                corr_matrix.loc[names[i], names[j]] = 1.0

    return corr_matrix


def analyze_factor(
    name: str,
    signal: pd.DataFrame,
    prices: pd.DataFrame,
    horizons: list[int] | None = None,
) -> dict:
    """Full single-factor analysis."""
    if horizons is None:
        horizons = HORIZONS

    results = {"name": name}

    # Forward returns at each horizon
    for h in horizons:
        fwd_ret = prices.pct_change(h).shift(-h)
        ic_series = compute_ic(signal, fwd_ret)

        if len(ic_series) == 0:
            results[f"ic_{h}d"] = np.nan
            results[f"icir_{h}d"] = np.nan
            continue

        ic_mean = ic_series.mean()
        ic_std = ic_series.std()
        icir = ic_mean / ic_std if ic_std > 0 else 0.0

        results[f"ic_{h}d"] = ic_mean
        results[f"icir_{h}d"] = icir

        if h == 1:
            results["ic_1d_series"] = ic_series

    # Quintile analysis at 5-day horizon
    fwd_5d = prices.pct_change(5).shift(-5)
    quintiles = compute_quintile_returns(signal, fwd_5d)
    if not quintiles.empty:
        results["q5_return"] = float(quintiles.iloc[-1])  # top quintile
        results["q1_return"] = float(quintiles.iloc[0])  # bottom quintile
        results["long_short_ann"] = float(quintiles.iloc[-1] - quintiles.iloc[0])
        results["monotonicity"] = float(quintiles.corr(pd.Series(range(1, len(quintiles) + 1), index=quintiles.index)))
        results["quintile_returns"] = quintiles
    else:
        results["q5_return"] = np.nan
        results["q1_return"] = np.nan
        results["long_short_ann"] = np.nan
        results["monotonicity"] = np.nan

    # Turnover
    results["turnover"] = compute_turnover(signal)

    # Sub-period IC stability (rolling 252-day IC mean)
    if "ic_1d_series" in results and len(results["ic_1d_series"]) > 252:
        rolling_ic = results["ic_1d_series"].rolling(252).mean().dropna()
        results["ic_stability"] = float((rolling_ic > 0).mean())  # fraction of time IC > 0
        results["ic_worst_year"] = float(rolling_ic.min())
        results["ic_best_year"] = float(rolling_ic.max())
    else:
        results["ic_stability"] = np.nan
        results["ic_worst_year"] = np.nan
        results["ic_best_year"] = np.nan

    return results


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def get_all_factors(prices: pd.DataFrame, market: str) -> dict[str, pd.DataFrame]:
    """Build all candidate factor signals."""
    factors = {}

    # Known baselines
    factors["momentum_12_1"] = factor_momentum_12_1(prices)
    factors["recovery"] = factor_recovery(prices)
    factors["inverse_vol"] = factor_inverse_vol(prices)

    # New candidates — universal
    factors["short_term_reversal"] = factor_short_term_reversal(prices)
    factors["idio_vol_change"] = factor_idio_vol_change(prices)
    factors["return_acceleration"] = factor_volume_price_divergence(prices)
    factors["drawdown_recovery"] = factor_max_drawdown_recovery(prices)
    factors["neg_skewness"] = factor_skewness(prices)
    factors["range_compression"] = factor_high_low_range(prices)
    factors["mean_rev_zscore"] = factor_mean_reversion_residual(prices)
    factors["up_down_vol_ratio"] = factor_up_down_vol_ratio(prices)
    factors["win_streak"] = factor_consecutive_up_days(prices)
    factors["smooth_momentum"] = factor_gap_momentum(prices)
    factors["recovery_accel"] = factor_recovery_acceleration(prices)
    factors["trend_r2"] = factor_trend_strength(prices)
    factors["cheap_momentum"] = factor_relative_volume_momentum(prices)
    factors["near_52w_high"] = factor_52w_high_distance(prices)
    factors["low_downside_beta"] = factor_downside_beta_proxy(prices)

    # A-share specific (only added when market == "cn")
    if market == "cn":
        factors["illiquidity"] = factor_liquidity_premium(prices)
        factors["anti_lottery"] = factor_lottery_demand(prices)
        factors["vol_reversal"] = factor_turnover_reversal(prices)
        factors["low_price"] = factor_price_level(prices)

    return factors


def print_summary_table(results: list[dict], market: str) -> None:
    """Print a ranked summary of all factors."""
    rows = []
    for r in results:
        rows.append({
            "Factor": r["name"],
            "IC_1d": r.get("ic_1d", np.nan),
            "ICIR_1d": r.get("icir_1d", np.nan),
            "IC_5d": r.get("ic_5d", np.nan),
            "ICIR_5d": r.get("icir_5d", np.nan),
            "IC_20d": r.get("ic_20d", np.nan),
            "ICIR_20d": r.get("icir_20d", np.nan),
            "LS_5d_ann": r.get("long_short_ann", np.nan),
            "Mono": r.get("monotonicity", np.nan),
            "Turnover": r.get("turnover", np.nan),
            "IC_stab": r.get("ic_stability", np.nan),
            "IC_worst_yr": r.get("ic_worst_year", np.nan),
        })

    df = pd.DataFrame(rows).set_index("Factor")
    df = df.sort_values("ICIR_5d", ascending=False)

    print(f"\n{'='*100}")
    print(f" FACTOR RESEARCH RESULTS — {market.upper()} MARKET")
    print(f"{'='*100}")
    print("\nRanked by 5-day ICIR (most important metric for tradeable alpha):\n")
    print(df.round(4).to_string())

    # Highlight top factors
    print(f"\n{'='*100}")
    print(" TOP FACTORS (ICIR_5d > 0.05 and IC_stability > 0.6)")
    print(f"{'='*100}")
    top = df[(df["ICIR_5d"].abs() > 0.05) & (df["IC_stab"] > 0.6)]
    if top.empty:
        top = df.head(5)
        print(" (No factor met strict threshold; showing top 5 by ICIR_5d)")
    print(top.round(4).to_string())

    # Quintile details for top factors
    print(f"\n{'='*100}")
    print(" QUINTILE RETURN PROFILES (annualized, 5-day forward)")
    print(f"{'='*100}")
    for r in sorted(results, key=lambda x: abs(x.get("icir_5d", 0)), reverse=True)[:8]:
        qr = r.get("quintile_returns")
        if qr is not None and not qr.empty:
            q_str = " ".join(f"Q{int(k)}: {v:+.1%}" for k, v in qr.items())
            ls = r.get("long_short_ann", 0)
            print(f" {r['name']:25s} | {q_str} | L/S: {ls:+.1%}")


def main():
    parser = argparse.ArgumentParser(description="Factor research")
    parser.add_argument("--market", default="us", choices=["us", "cn"])
    parser.add_argument("--years", type=int, default=None, help="Limit to last N years")
    args = parser.parse_args()

    market = args.market
    config = UNIVERSES[market]
    benchmark = config["benchmark"]

    print(f"Loading {market.upper()} price data...")
    prices = data_manager.load(market)

    # Remove benchmark from stock universe
    stocks = prices.drop(columns=[benchmark], errors="ignore")

    if args.years:
        cutoff = stocks.index[-1] - pd.DateOffset(years=args.years)
        stocks = stocks[stocks.index >= cutoff]

    print(f"Universe: {stocks.shape[1]} stocks, {stocks.shape[0]} trading days")
    print(f"Date range: {stocks.index[0].date()} to {stocks.index[-1].date()}")

    # Build all factor signals
    print("\nComputing factor signals...")
    factors = get_all_factors(stocks, market)

    # Analyze each factor
    print("Running factor analysis (this may take a few minutes)...")
    results = []
    for name, signal in factors.items():
        print(f" Analyzing: {name}...")
        r = analyze_factor(name, signal, stocks)
        results.append(r)

    # Print results
    print_summary_table(results, market)

    # Factor correlation matrix
    print(f"\n{'='*100}")
    print(" FACTOR CORRELATION MATRIX (rank-averaged cross-sectional)")
    print(f"{'='*100}")
    corr = compute_factor_correlation(factors)
    # Show only top factors
    top_names = [r["name"] for r in sorted(results, key=lambda x: abs(x.get("icir_5d", 0)), reverse=True)[:10]]
    top_names_present = [n for n in top_names if n in corr.index]
    print(corr.loc[top_names_present, top_names_present].round(2).to_string())


if __name__ == "__main__":
    main()
323
factor_robustness.py
Normal file
323
factor_robustness.py
Normal file
@@ -0,0 +1,323 @@
"""
Robustness checks for winning factor strategies.

Tests:
1. Rolling 2-year window performance (stability)
2. Top-N sensitivity (5, 10, 15, 20)
3. Rebalance frequency sensitivity (5d, 10d, 21d, 42d)
4. Transaction cost sensitivity (0, 10bps, 20bps, 50bps)
5. Drawdown analysis
"""

from __future__ import annotations

import argparse
import warnings

import numpy as np
import pandas as pd

import data_manager
from universe import UNIVERSES
from factor_real_backtest import (
    f_recovery_mom,
    f_momentum_12_1,
    f_recovery,
    f_recovery_deep,
    f_up_volume_proxy,
    f_gap_up_freq,
    f_earnings_drift_proxy,
    f_reversal_vol_cn,
    f_consistent_winner,
    combo_signal,
    make_strategy,
    run_backtest,
    compute_stats,
)

warnings.filterwarnings("ignore")


def rolling_window_performance(equity: pd.Series, window_years: int = 2):
    """Compute rolling window returns."""
    daily_ret = equity.pct_change().dropna()
    window = 252 * window_years
    results = []
    for end_idx in range(window, len(daily_ret), 63):  # step 3 months
        start_idx = end_idx - window
        chunk = daily_ret.iloc[start_idx:end_idx]
        total = (1 + chunk).prod() - 1
        ann = (1 + total) ** (252 / len(chunk)) - 1
        sharpe = chunk.mean() / chunk.std() * np.sqrt(252) if chunk.std() > 0 else 0
        results.append({
            "end_date": chunk.index[-1].date(),
            "ann_return": ann,
            "sharpe": sharpe,
        })
    return pd.DataFrame(results)

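# Example (illustrative): roll a 2-year window over any equity curve produced
# by run_backtest and summarise stability:
#
#     roll = rolling_window_performance(eq, window_years=2)
#     print((roll["ann_return"] > 0).mean())  # share of positive 2-year windows
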
def drawdown_analysis(equity: pd.Series) -> pd.DataFrame:
    """Find top 5 drawdown episodes."""
    running_max = equity.cummax()
    drawdown = (equity - running_max) / running_max

    # Find drawdown episodes
    episodes = []
    in_dd = False
    start = None
    for i in range(len(drawdown)):
        if drawdown.iloc[i] < -0.05 and not in_dd:
            in_dd = True
            start = i
        elif drawdown.iloc[i] >= 0 and in_dd:
            in_dd = False
            trough_idx = drawdown.iloc[start:i].idxmin()
            episodes.append({
                "start": drawdown.index[start].date(),
                "trough": trough_idx.date(),
                "end": drawdown.index[i].date(),
                "depth": drawdown.loc[trough_idx],
                "duration_days": i - start,
            })
    # Handle ongoing drawdown
    if in_dd:
        trough_idx = drawdown.iloc[start:].idxmin()
        episodes.append({
            "start": drawdown.index[start].date(),
            "trough": trough_idx.date(),
            "end": "ongoing",
            "depth": drawdown.loc[trough_idx],
            "duration_days": len(drawdown) - start,
        })

    df = pd.DataFrame(episodes)
    if df.empty:
        return df
    return df.nsmallest(5, "depth")


def run_us(stocks: pd.DataFrame):
    print("=" * 100)
    print(" US ROBUSTNESS — Winner: momentum_12_1 + up_volume_proxy")
    print("=" * 100)

    winner_func = combo_signal([(f_momentum_12_1, 0.5), (f_up_volume_proxy, 0.5)])
    baseline_func = f_recovery_mom

    # 1. Rolling 2-year performance
    print("\n--- 1. Rolling 2-Year Performance ---\n")
    for label, func in [("Winner: mom+upvol", winner_func),
                        ("Baseline: rec+mom", baseline_func)]:
        w = make_strategy(stocks, func, top_n=10)
        eq = run_backtest(w, stocks)
        roll = rolling_window_performance(eq)
        if roll.empty:
            continue
        win_pct = (roll["ann_return"] > 0).mean()
        print(f" {label}:")
        print(f" Mean 2yr ann return: {roll['ann_return'].mean():+.1%}")
        print(f" Min 2yr ann return: {roll['ann_return'].min():+.1%}")
        print(f" Max 2yr ann return: {roll['ann_return'].max():+.1%}")
        print(f" % positive 2yr: {win_pct:.0%}")
        print(f" Mean 2yr Sharpe: {roll['sharpe'].mean():.2f}")
        print()

    # 2. Top-N sensitivity
    print("--- 2. Top-N Sensitivity ---\n")
    header = f" {'Top-N':<8}"
    for label in ["Winner: mom+upvol", "Baseline: rec+mom"]:
        header += f" | {'CAGR':>8} {'Sharpe':>8} {'MaxDD':>8}"
    print(header)
    print(" " + "-" * 70)

    for top_n in [5, 10, 15, 20, 30]:
        line = f" {top_n:<8}"
        for func in [winner_func, baseline_func]:
            w = make_strategy(stocks, func, top_n=top_n)
            eq = run_backtest(w, stocks)
            s = compute_stats(eq, "")
            line += f" | {s['cagr']:>+7.1%} {s['sharpe']:>8.2f} {s['maxdd']:>+7.1%}"
        print(line)

    # 3. Rebalance frequency sensitivity
    print("\n--- 3. Rebalance Frequency Sensitivity ---\n")
    header = f" {'Rebal':<8}"
    for label in ["Winner: mom+upvol", "Baseline: rec+mom"]:
        header += f" | {'CAGR':>8} {'Sharpe':>8} {'MaxDD':>8}"
    print(header)
    print(" " + "-" * 70)

    for rebal in [5, 10, 21, 42, 63]:
        line = f" {rebal}d{'':<5}"
        for func in [winner_func, baseline_func]:
            w = make_strategy(stocks, func, top_n=10, rebal_freq=rebal)
            eq = run_backtest(w, stocks)
            s = compute_stats(eq, "")
            line += f" | {s['cagr']:>+7.1%} {s['sharpe']:>8.2f} {s['maxdd']:>+7.1%}"
        print(line)

    # 4. Transaction cost sensitivity
    print("\n--- 4. Transaction Cost Sensitivity ---\n")
    header = f" {'Cost':<8}"
    for label in ["Winner: mom+upvol", "Baseline: rec+mom"]:
        header += f" | {'CAGR':>8} {'Sharpe':>8}"
    print(header)
    print(" " + "-" * 50)

    for cost in [0, 0.001, 0.002, 0.005]:
        line = f" {cost*10000:.0f}bps{'':<4}"
        for func in [winner_func, baseline_func]:
            w = make_strategy(stocks, func, top_n=10)
            eq = run_backtest(w, stocks, cost=cost)
            s = compute_stats(eq, "")
            line += f" | {s['cagr']:>+7.1%} {s['sharpe']:>8.2f}"
        print(line)

    # 5. Drawdown analysis
    print("\n--- 5. Drawdown Episodes ---\n")
    for label, func in [("Winner: mom+upvol", winner_func),
                        ("Baseline: rec+mom", baseline_func)]:
        w = make_strategy(stocks, func, top_n=10)
        eq = run_backtest(w, stocks)
        dd = drawdown_analysis(eq)
        print(f" {label}:")
        if dd.empty:
            print(" No significant drawdowns")
        else:
            for _, row in dd.iterrows():
                print(f" {row['start']} → {row['trough']} → {row['end']}: "
                      f"{row['depth']:+.1%} ({row['duration_days']}d)")
        print()

    # 6. Also test the runner-up combos
    print("--- 6. Other Strong Combos (Top-10, 21d rebal, 10bps) ---\n")
    other_combos = [
        ("rec_deep+upvol", combo_signal([(f_recovery_deep, 0.5), (f_up_volume_proxy, 0.5)])),
        ("rec_deep+mom", combo_signal([(f_recovery_deep, 0.5), (f_momentum_12_1, 0.5)])),
        ("mom+gap_up", combo_signal([(f_momentum_12_1, 0.5), (f_gap_up_freq, 0.5)])),
        ("rec_deep+upvol+mom", combo_signal([(f_recovery_deep, 0.33), (f_up_volume_proxy, 0.33), (f_momentum_12_1, 0.34)])),
        ("mom+upvol+gap", combo_signal([(f_momentum_12_1, 0.33), (f_up_volume_proxy, 0.33), (f_gap_up_freq, 0.34)])),
    ]
    for label, func in other_combos:
        w = make_strategy(stocks, func, top_n=10)
        eq = run_backtest(w, stocks)
        s = compute_stats(eq, "")
        print(f" {label:<25} CAGR: {s['cagr']:>+7.1%} Sharpe: {s['sharpe']:.2f} MaxDD: {s['maxdd']:>+7.1%} Calmar: {s['calmar']:.2f}")


def run_cn(stocks: pd.DataFrame):
    print("\n" + "=" * 100)
    print(" CN ROBUSTNESS — Winners: reversal_vol + gap_up, earn_drift + reversal_vol")
    print("=" * 100)

    winner1_func = combo_signal([(f_reversal_vol_cn, 0.5), (f_gap_up_freq, 0.5)])
    winner2_func = combo_signal([(f_earnings_drift_proxy, 0.5), (f_reversal_vol_cn, 0.5)])
    baseline_func = f_recovery_mom

    # 1. Rolling 2-year performance
    print("\n--- 1. Rolling 2-Year Performance ---\n")
    for label, func in [("W1: rev_vol+gap_up", winner1_func),
                        ("W2: earn_drift+rev_vol", winner2_func),
                        ("Baseline: rec+mom", baseline_func)]:
        w = make_strategy(stocks, func, top_n=10)
        eq = run_backtest(w, stocks)
        roll = rolling_window_performance(eq)
        if roll.empty:
            continue
        win_pct = (roll["ann_return"] > 0).mean()
        print(f" {label}:")
        print(f" Mean 2yr ann return: {roll['ann_return'].mean():+.1%}")
        print(f" Min 2yr ann return: {roll['ann_return'].min():+.1%}")
        print(f" Max 2yr ann return: {roll['ann_return'].max():+.1%}")
        print(f" % positive 2yr: {win_pct:.0%}")
        print(f" Mean 2yr Sharpe: {roll['sharpe'].mean():.2f}")
        print()

    # 2. Top-N sensitivity
    print("--- 2. Top-N Sensitivity ---\n")
    header = f" {'Top-N':<8}"
    for label in ["W1: rev+gap", "W2: earn+rev", "Baseline"]:
        header += f" | {'CAGR':>8} {'Sharpe':>8} {'MaxDD':>8}"
    print(header)
    print(" " + "-" * 100)

    for top_n in [5, 10, 15, 20]:
        line = f" {top_n:<8}"
        for func in [winner1_func, winner2_func, baseline_func]:
            w = make_strategy(stocks, func, top_n=top_n)
            eq = run_backtest(w, stocks)
            s = compute_stats(eq, "")
            line += f" | {s['cagr']:>+7.1%} {s['sharpe']:>8.2f} {s['maxdd']:>+7.1%}"
        print(line)

    # 3. Rebalance frequency
    print("\n--- 3. Rebalance Frequency ---\n")
    header = f" {'Rebal':<8}"
    for label in ["W1: rev+gap", "W2: earn+rev", "Baseline"]:
        header += f" | {'CAGR':>8} {'Sharpe':>8}"
    print(header)
    print(" " + "-" * 75)

    for rebal in [5, 10, 21, 42]:
        line = f" {rebal}d{'':<5}"
        for func in [winner1_func, winner2_func, baseline_func]:
            w = make_strategy(stocks, func, top_n=10, rebal_freq=rebal)
            eq = run_backtest(w, stocks)
            s = compute_stats(eq, "")
            line += f" | {s['cagr']:>+7.1%} {s['sharpe']:>8.2f}"
        print(line)

    # 4. Transaction cost sensitivity
    print("\n--- 4. Transaction Cost Sensitivity ---\n")
    header = f" {'Cost':<8}"
    for label in ["W1: rev+gap", "W2: earn+rev", "Baseline"]:
        header += f" | {'CAGR':>8} {'Sharpe':>8}"
    print(header)
    print(" " + "-" * 75)

    for cost in [0, 0.001, 0.002, 0.005]:
        line = f" {cost*10000:.0f}bps{'':<4}"
        for func in [winner1_func, winner2_func, baseline_func]:
            w = make_strategy(stocks, func, top_n=10)
            eq = run_backtest(w, stocks, cost=cost)
            s = compute_stats(eq, "")
            line += f" | {s['cagr']:>+7.1%} {s['sharpe']:>8.2f}"
        print(line)

    # 5. Drawdown analysis
    print("\n--- 5. Drawdown Episodes ---\n")
    for label, func in [("W1: rev_vol+gap_up", winner1_func),
                        ("W2: earn_drift+rev_vol", winner2_func),
                        ("Baseline: rec+mom", baseline_func)]:
        w = make_strategy(stocks, func, top_n=10)
        eq = run_backtest(w, stocks)
        dd = drawdown_analysis(eq)
        print(f" {label}:")
        if dd.empty:
            print(" No significant drawdowns")
        else:
            for _, row in dd.iterrows():
                print(f" {row['start']} → {row['trough']} → {row['end']}: "
                      f"{row['depth']:+.1%} ({row['duration_days']}d)")
        print()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--market", default="both", choices=["us", "cn", "both"])
    args = parser.parse_args()

    if args.market in ("us", "both"):
        prices = data_manager.load("us")
        stocks = prices.drop(columns=["SPY"], errors="ignore")
        run_us(stocks)

    if args.market in ("cn", "both"):
        prices = data_manager.load("cn")
        stocks = prices.drop(columns=["000300.SS"], errors="ignore")
        run_cn(stocks)


if __name__ == "__main__":
    main()
259
factor_yearly_fresh.py
Normal file
259
factor_yearly_fresh.py
Normal file
@@ -0,0 +1,259 @@
"""
Rebalancing frequency comparison: daily (1d) vs weekly (5d) vs biweekly (10d) vs monthly (21d).
Shows yearly returns and max drawdown for each frequency, for all champion strategies.
"""

from __future__ import annotations
import warnings
import numpy as np
import pandas as pd
import data_manager
from factor_loop import (
    strat, bt, stats, combo,
    f_rec_mom, f_rec_126, f_rec_63,
    f_mom_12_1, f_mom_6_1, f_mom_intermediate,
    f_above_ma200, f_golden_cross,
    f_up_volume_proxy, f_gap_up_freq,
    f_rec_mom_filtered, f_down_resilience,
    f_up_capture, f_52w_high, f_str_10d,
    f_earnings_drift, f_reversal_vol,
)

warnings.filterwarnings("ignore")

INITIAL = 10_000

REBAL_CONFIGS = [
    ("daily", 1),
    ("weekly", 5),
    ("biweekly", 10),
    ("monthly", 21),
]


def f_quality_mom(p):
    mom = f_mom_12_1(p)
    consist = (p.pct_change() > 0).astype(float).rolling(252, min_periods=126).mean()
    mom_r = mom.rank(axis=1, pct=True, na_option="keep")
    con_r = consist.rank(axis=1, pct=True, na_option="keep")
    up_r = f_up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep")
    return 0.4 * mom_r + 0.3 * con_r + 0.3 * up_r


def f_mom_x_gap(p):
    return (f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep") *
            f_gap_up_freq(p).rank(axis=1, pct=True, na_option="keep"))


def run_equity(func, prices, rebal=21, cost=0.001):
    w = strat(prices, func, top_n=10, rebal=rebal)
    eq = bt(w, prices, cost=cost)
    return eq / eq.iloc[0] * INITIAL


def year_returns(eq: pd.Series) -> dict[int, float]:
    dr = eq.pct_change().fillna(0)
    return {y: float((1 + dr[dr.index.year == y]).prod() - 1)
            for y in sorted(dr.index.year.unique())}


def max_drawdown(eq: pd.Series) -> float:
    rm = eq.cummax()
    dd = (eq - rm) / rm
    return float(dd.min())


def max_drawdown_yearly(eq: pd.Series) -> dict[int, float]:
    result = {}
    for y in sorted(eq.index.year.unique()):
        chunk = eq[eq.index.year == y]
        if len(chunk) < 5:
            continue
        rm = chunk.cummax()
        dd = (chunk - rm) / rm
        result[y] = float(dd.min())
    return result


def cagr(eq: pd.Series) -> float:
    dr = eq.pct_change().dropna()
    if len(dr) < 100:
        return np.nan
    ny = len(dr) / 252
    tot = eq.iloc[-1] / eq.iloc[0] - 1
    return (1 + tot) ** (1 / ny) - 1


def sharpe(eq: pd.Series) -> float:
    dr = eq.pct_change().dropna()
    if len(dr) < 100 or dr.std() == 0:
        return np.nan
    return float(dr.mean() / dr.std() * np.sqrt(252))


def turnover_annual(func, prices, rebal):
    """Estimate annualised turnover (one-way)."""
    w = strat(prices, func, top_n=10, rebal=rebal)
    daily_turn = w.diff().abs().sum(axis=1).mean()
    return daily_turn * 252

def print_by_year(strat_defs, prices, bench_eq, bench_label, market_label, years):
|
||||
"""For each year, print a table: strategies as rows, rebal frequencies as columns."""
|
||||
|
||||
freq_labels = [r for r, _ in REBAL_CONFIGS]
|
||||
|
||||
# Pre-compute all equities and returns
|
||||
all_eqs = {} # {(sname, freq): equity}
|
||||
for sname, func in strat_defs.items():
|
||||
for rlabel, rdays in REBAL_CONFIGS:
|
||||
all_eqs[(sname, rlabel)] = run_equity(func, prices, rebal=rdays)
|
||||
|
||||
all_rets = {} # {(sname, freq): {year: ret}}
|
||||
for key, eq in all_eqs.items():
|
||||
all_rets[key] = year_returns(eq)
|
||||
|
||||
bench_rets = year_returns(bench_eq)
|
||||
snames = list(strat_defs.keys())
|
||||
name_w = max(len(s) for s in snames) + 1
|
||||
|
||||
for year in years:
|
||||
line_w = name_w + 4 + 20 * (len(freq_labels) + 1)
|
||||
print(f"\n{'=' * line_w}")
|
||||
print(f" {market_label} — {year} (fresh $10,000)")
|
||||
print(f"{'=' * line_w}")
|
||||
|
||||
# Header
|
||||
print(f" {'Strategy':<{name_w}}", end="")
|
||||
for f in freq_labels:
|
||||
print(f" {f:>18}", end="")
|
||||
print(f" {bench_label:>18}")
|
||||
print(f" {'-'*name_w}", end="")
|
||||
for _ in range(len(freq_labels) + 1):
|
||||
print(f" {'-'*18}", end="")
|
||||
print()
|
||||
|
||||
for sname in snames:
|
||||
print(f" {sname:<{name_w}}", end="")
|
||||
|
||||
# Find best freq for this strategy this year
|
||||
freq_vals = {}
|
||||
for f in freq_labels:
|
||||
r = all_rets[(sname, f)].get(year)
|
||||
if r is not None and abs(r) > 0.0005:
|
||||
freq_vals[f] = r
|
||||
|
||||
best_f = max(freq_vals, key=freq_vals.get) if freq_vals else None
|
||||
|
||||
for f in freq_labels:
|
||||
r = all_rets[(sname, f)].get(year)
|
||||
if r is not None and abs(r) > 0.0005:
|
||||
v = INITIAL * (1 + r)
|
||||
marker = " ★" if f == best_f else " "
|
||||
print(f" ${v:>9,.0f} {r:>+5.0%}{marker}", end="")
|
||||
else:
|
||||
print(f" {'—':>18}", end="")
|
||||
|
||||
# Benchmark (same for all strategies)
|
||||
br = bench_rets.get(year)
|
||||
if br is not None and abs(br) > 0.0005:
|
||||
print(f" ${INITIAL*(1+br):>9,.0f} {br:>+5.0%} ", end="")
|
||||
else:
|
||||
print(f" {'—':>18}", end="")
|
||||
print()
|
||||
|
||||
# Best strategy per freq
|
||||
print(f" {'-'*name_w}", end="")
|
||||
for _ in range(len(freq_labels) + 1):
|
||||
print(f" {'-'*18}", end="")
|
||||
print()
|
||||
|
||||
print(f" {'BEST':<{name_w}}", end="")
|
||||
for f in freq_labels:
|
||||
best_r = -999
|
||||
best_s = ""
|
||||
for sname in snames:
|
||||
r = all_rets[(sname, f)].get(year)
|
||||
if r is not None and abs(r) > 0.0005 and r > best_r:
|
||||
best_r = r
|
||||
best_s = sname
|
||||
if best_r > -999:
|
||||
print(f" ${INITIAL*(1+best_r):>9,.0f} {best_r:>+5.0%} ", end="")
|
||||
else:
|
||||
print(f" {'—':>18}", end="")
|
||||
# bench
|
||||
br = bench_rets.get(year)
|
||||
if br is not None and abs(br) > 0.0005:
|
||||
print(f" ${INITIAL*(1+br):>9,.0f} {br:>+5.0%} ", end="")
|
||||
else:
|
||||
print(f" {'—':>18}", end="")
|
||||
print()
|
||||
|
||||
|
||||
def main():
|
||||
years = list(range(2015, 2027))
|
||||
|
||||
# ===== US =====
|
||||
print(f"\n{'#'*130}")
|
||||
print(f"{'#'*50} US MARKET {'#'*50}")
|
||||
print(f"{'#'*130}")
|
||||
|
||||
prices_us = data_manager.load("us")
|
||||
bench_us = prices_us["SPY"].dropna()
|
||||
stocks_us = prices_us.drop(columns=["SPY"], errors="ignore")
|
||||
eq_spy = bench_us / bench_us.iloc[0] * INITIAL
|
||||
|
||||
us_strats = {
|
||||
"rec_mfilt+deep×upvol": combo([
|
||||
(f_rec_mom_filtered, 0.5),
|
||||
(combo([(f_rec_126, 0.5), (f_up_volume_proxy, 0.5)]), 0.5),
|
||||
]),
|
||||
"ma200+mom7m+rec126": combo([
|
||||
(f_above_ma200, 0.33), (f_mom_intermediate, 0.33), (f_rec_126, 0.34)
|
||||
]),
|
||||
"rec_mfilt+ma200": combo([
|
||||
(f_rec_mom_filtered, 0.5), (f_above_ma200, 0.5)
|
||||
]),
|
||||
"mom7m+rec126": combo([
|
||||
(f_mom_intermediate, 0.5), (f_rec_126, 0.5)
|
||||
]),
|
||||
"BASELINE:rec+mom": f_rec_mom,
|
||||
}
|
||||
|
||||
print_by_year(us_strats, stocks_us, eq_spy, "SPY", "US", years)
|
||||
|
||||
# ===== CN =====
|
||||
print(f"\n\n{'#'*130}")
|
||||
print(f"{'#'*50} CN MARKET {'#'*50}")
|
||||
print(f"{'#'*130}")
|
||||
|
||||
prices_cn = data_manager.load("cn")
|
||||
bench_cn = prices_cn["000300.SS"].dropna() if "000300.SS" in prices_cn.columns else None
|
||||
stocks_cn = prices_cn.drop(columns=["000300.SS"], errors="ignore")
|
||||
|
||||
cn_strats = {
|
||||
"up_cap+quality_mom": combo([
|
||||
(f_up_capture, 0.5), (f_quality_mom, 0.5)
|
||||
]),
|
||||
"down_resil+qual_mom": combo([
|
||||
(f_down_resilience, 0.5), (f_quality_mom, 0.5)
|
||||
]),
|
||||
"rec63+mom×gap": combo([
|
||||
(f_rec_63, 0.5), (f_mom_x_gap, 0.5)
|
||||
]),
|
||||
"up_cap+mom×gap": combo([
|
||||
(f_up_capture, 0.5), (f_mom_x_gap, 0.5)
|
||||
]),
|
||||
"BASELINE:rec+mom": f_rec_mom,
|
||||
}
|
||||
|
||||
if bench_cn is not None:
|
||||
eq_csi = bench_cn / bench_cn.iloc[0] * INITIAL
|
||||
else:
|
||||
eq_csi = pd.Series(dtype=float)
|
||||
|
||||
print_by_year(cn_strats, stocks_cn, eq_csi, "CSI300", "CN", years)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
219
factor_yearly_report.py
Normal file
219
factor_yearly_report.py
Normal file
@@ -0,0 +1,219 @@
|
||||
"""
|
||||
Yearly ROI report for champion strategies vs SPY, starting from $10,000.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import warnings
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import data_manager
|
||||
from universe import UNIVERSES
|
||||
from factor_loop import (
|
||||
strat, bt, stats, combo,
|
||||
f_rec_mom, f_rec_126, f_rec_63,
|
||||
f_mom_12_1, f_mom_6_1, f_mom_intermediate,
|
||||
f_above_ma200, f_golden_cross,
|
||||
f_up_volume_proxy, f_gap_up_freq,
|
||||
f_rec_mom_filtered, f_down_resilience,
|
||||
f_up_capture, f_52w_high, f_str_10d,
|
||||
f_earnings_drift, f_reversal_vol,
|
||||
)
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
INITIAL = 10_000
|
||||
|
||||
|
||||
def f_quality_mom(p):
|
||||
mom = f_mom_12_1(p)
|
||||
consist = (p.pct_change() > 0).astype(float).rolling(252, min_periods=126).mean()
|
||||
mom_r = mom.rank(axis=1, pct=True, na_option="keep")
|
||||
con_r = consist.rank(axis=1, pct=True, na_option="keep")
|
||||
up_r = f_up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return 0.4 * mom_r + 0.3 * con_r + 0.3 * up_r
|
||||
|
||||
|
||||
def f_mom_x_gap(p):
|
||||
return (f_mom_12_1(p).rank(axis=1, pct=True, na_option="keep") *
|
||||
f_gap_up_freq(p).rank(axis=1, pct=True, na_option="keep"))
|
||||
|
||||
|
||||
def run_equity(func, prices, cost=0.001):
|
||||
w = strat(prices, func, top_n=10)
|
||||
eq = bt(w, prices, cost=cost)
|
||||
return eq / eq.iloc[0] * INITIAL
|
||||
|
||||
|
||||
def yearly_table(equities: dict[str, pd.Series], title: str):
|
||||
print(f"\n{'='*130}")
|
||||
print(f" {title}")
|
||||
print(f" Starting capital: ${INITIAL:,.0f}")
|
||||
print(f"{'='*130}")
|
||||
|
||||
names = list(equities.keys())
|
||||
all_years = sorted(set(y for eq in equities.values() for y in eq.index.year.unique()))
|
||||
|
||||
# Header
|
||||
print(f"\n {'Year':<6}", end="")
|
||||
for n in names:
|
||||
print(f" | {n[:24]:>24}", end="")
|
||||
print()
|
||||
print(f" {'-'*6}", end="")
|
||||
for _ in names:
|
||||
print(f"-+-{'-'*24}", end="")
|
||||
print()
|
||||
|
||||
# Track portfolio values
|
||||
year_end_vals = {n: INITIAL for n in names}
|
||||
|
||||
for year in all_years:
|
||||
print(f" {year:<6}", end="")
|
||||
for n in names:
|
||||
eq = equities[n]
|
||||
yr_data = eq[eq.index.year == year]
|
||||
if len(yr_data) < 2:
|
||||
print(f" | {'—':>24}", end="")
|
||||
continue
|
||||
start_val = yr_data.iloc[0]
|
||||
end_val = yr_data.iloc[-1]
|
||||
ret = end_val / start_val - 1
|
||||
year_end_vals[n] = end_val
|
||||
# Show both return % and portfolio value
|
||||
print(f" | {ret:>+7.1%} ${end_val:>12,.0f}", end="")
|
||||
print()
|
||||
|
||||
# Summary rows
|
||||
print(f" {'-'*6}", end="")
|
||||
for _ in names:
|
||||
print(f"-+-{'-'*24}", end="")
|
||||
print()
|
||||
|
||||
# Total return
|
||||
print(f" {'Total':<6}", end="")
|
||||
for n in names:
|
||||
eq = equities[n]
|
||||
total = eq.iloc[-1] / INITIAL - 1
|
||||
print(f" | {total:>+7.0%} ${eq.iloc[-1]:>12,.0f}", end="")
|
||||
print()
|
||||
|
||||
# CAGR
|
||||
print(f" {'CAGR':<6}", end="")
|
||||
for n in names:
|
||||
eq = equities[n]
|
||||
ny = len(eq) / 252
|
||||
total = eq.iloc[-1] / INITIAL - 1
|
||||
cagr = (1 + total) ** (1 / ny) - 1
|
||||
print(f" | {cagr:>+7.1%} {'':>12}", end="")
|
||||
print()
|
||||
|
||||
# Sharpe
|
||||
print(f" {'Sharpe':<6}", end="")
|
||||
for n in names:
|
||||
eq = equities[n]
|
||||
dr = eq.pct_change().dropna()
|
||||
sh = dr.mean() / dr.std() * np.sqrt(252) if dr.std() > 0 else 0
|
||||
print(f" | {sh:>7.2f} {'':>12}", end="")
|
||||
print()
|
||||
|
||||
# Max DD
|
||||
print(f" {'MaxDD':<6}", end="")
|
||||
for n in names:
|
||||
eq = equities[n]
|
||||
rm = eq.cummax()
|
||||
dd = ((eq - rm) / rm).min()
|
||||
print(f" | {dd:>+7.1%} {'':>12}", end="")
|
||||
print()
|
||||
|
||||
# Best/Worst year
|
||||
print(f" {'Best':<6}", end="")
|
||||
for n in names:
|
||||
eq = equities[n]
|
||||
dr = eq.pct_change().fillna(0)
|
||||
yr_rets = {y: float((1 + dr[dr.index.year == y]).prod() - 1) for y in all_years}
|
||||
# skip warmup year
|
||||
active = {y: r for y, r in yr_rets.items() if abs(r) > 0.001}
|
||||
if active:
|
||||
best_y = max(active, key=active.get)
|
||||
print(f" | {active[best_y]:>+7.1%} ({best_y}) ", end="")
|
||||
else:
|
||||
print(f" | {'—':>24}", end="")
|
||||
print()
|
||||
|
||||
print(f" {'Worst':<6}", end="")
|
||||
for n in names:
|
||||
eq = equities[n]
|
||||
dr = eq.pct_change().fillna(0)
|
||||
yr_rets = {y: float((1 + dr[dr.index.year == y]).prod() - 1) for y in all_years}
|
||||
active = {y: r for y, r in yr_rets.items() if abs(r) > 0.001}
|
||||
if active:
|
||||
worst_y = min(active, key=active.get)
|
||||
print(f" | {active[worst_y]:>+7.1%} ({worst_y}) ", end="")
|
||||
else:
|
||||
print(f" | {'—':>24}", end="")
|
||||
print()
|
||||
|
||||
|
||||
def main():
|
||||
# ===== US =====
|
||||
prices_us = data_manager.load("us")
|
||||
bench_us = prices_us["SPY"].dropna()
|
||||
stocks_us = prices_us.drop(columns=["SPY"], errors="ignore")
|
||||
|
||||
eq_spy = bench_us / bench_us.iloc[0] * INITIAL
|
||||
|
||||
us_strats = {
|
||||
"rec_mfilt+deep×upvol": combo([
|
||||
(f_rec_mom_filtered, 0.5),
|
||||
(combo([(f_rec_126, 0.5), (f_up_volume_proxy, 0.5)]), 0.5),
|
||||
]),
|
||||
"ma200+mom7m+rec126": combo([
|
||||
(f_above_ma200, 0.33), (f_mom_intermediate, 0.33), (f_rec_126, 0.34)
|
||||
]),
|
||||
"rec_mfilt+ma200": combo([
|
||||
(f_rec_mom_filtered, 0.5), (f_above_ma200, 0.5)
|
||||
]),
|
||||
"mom7m+rec126": combo([
|
||||
(f_mom_intermediate, 0.5), (f_rec_126, 0.5)
|
||||
]),
|
||||
"BASELINE:rec+mom": f_rec_mom,
|
||||
}
|
||||
|
||||
us_equities = {}
|
||||
for name, func in us_strats.items():
|
||||
us_equities[name] = run_equity(func, stocks_us)
|
||||
us_equities["SPY (Benchmark)"] = eq_spy
|
||||
|
||||
yearly_table(us_equities, "US MARKET — Champion Strategies vs SPY — $10,000 Starting Capital")
|
||||
|
||||
# ===== CN =====
|
||||
prices_cn = data_manager.load("cn")
|
||||
bench_cn = prices_cn["000300.SS"].dropna() if "000300.SS" in prices_cn.columns else None
|
||||
stocks_cn = prices_cn.drop(columns=["000300.SS"], errors="ignore")
|
||||
|
||||
cn_strats = {
|
||||
"up_cap+quality_mom": combo([
|
||||
(f_up_capture, 0.5), (f_quality_mom, 0.5)
|
||||
]),
|
||||
"down_resil+qual_mom": combo([
|
||||
(f_down_resilience, 0.5), (f_quality_mom, 0.5)
|
||||
]),
|
||||
"rec63+mom×gap": combo([
|
||||
(f_rec_63, 0.5), (f_mom_x_gap, 0.5)
|
||||
]),
|
||||
"up_cap+mom×gap": combo([
|
||||
(f_up_capture, 0.5), (f_mom_x_gap, 0.5)
|
||||
]),
|
||||
"BASELINE:rec+mom": f_rec_mom,
|
||||
}
|
||||
|
||||
cn_equities = {}
|
||||
for name, func in cn_strats.items():
|
||||
cn_equities[name] = run_equity(func, stocks_cn)
|
||||
if bench_cn is not None:
|
||||
cn_equities["CSI300 (Benchmark)"] = bench_cn / bench_cn.iloc[0] * INITIAL
|
||||
|
||||
yearly_table(cn_equities, "CN MARKET — Champion Strategies vs CSI 300 — $10,000 Starting Capital")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
27
main.py
27
main.py
@@ -5,6 +5,7 @@ import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import data_manager
|
||||
import factor_attribution
|
||||
import metrics
|
||||
from strategies.adaptive_momentum import AdaptiveMomentumStrategy
|
||||
from strategies.buy_and_hold import BuyAndHoldStrategy
|
||||
@@ -163,6 +164,18 @@ def main() -> None:
|
||||
help="Execution mode: 'close' (default, signal & execute on close) or "
|
||||
"'open-close' (signal on morning open, execute at close)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--attribution", action="store_true",
|
||||
help="Run factor attribution after performance metrics",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--attribution-model", choices=["capm", "ff5", "ff5plus", "all"], default="all",
|
||||
help="Factor model selection for attribution output",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--attribution-export", default=None,
|
||||
help="Directory to export factor attribution CSVs",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
initial_capital = args.capital if args.capital is not None else 10_000
|
||||
use_open = args.execution == "open-close"
|
||||
@@ -238,6 +251,20 @@ def main() -> None:
|
||||
continue
|
||||
metrics.summary(eq, name=name)
|
||||
|
||||
if args.attribution:
|
||||
summary_df, loadings_df = factor_attribution.attribute_strategies(
|
||||
results_df=results_df,
|
||||
benchmark_label=benchmark_label,
|
||||
benchmark=benchmark,
|
||||
price_data=data,
|
||||
market=args.market,
|
||||
model_selection=args.attribution_model,
|
||||
)
|
||||
factor_attribution.print_attribution_summary(summary_df)
|
||||
if args.attribution_export:
|
||||
factor_attribution.export_attribution(summary_df, loadings_df, args.attribution_export)
|
||||
print(f"Attribution CSVs written to {args.attribution_export}")
|
||||
|
||||
# --- Visualization ---
|
||||
if not args.no_plot:
|
||||
plot_results(results_df.dropna())
|
||||
|
||||
0
research/__init__.py
Normal file
0
research/__init__.py
Normal file
34
research/event_factors.py
Normal file
34
research/event_factors.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
TRAILING_HIGH_WINDOW = 60
|
||||
COMPRESSION_WINDOW = 20
|
||||
VOLUME_WINDOW = 20
|
||||
|
||||
|
||||
def breakout_after_compression_score(
|
||||
close: pd.DataFrame,
|
||||
high: pd.DataFrame,
|
||||
low: pd.DataFrame,
|
||||
volume: pd.DataFrame,
|
||||
) -> pd.DataFrame:
|
||||
"""Score breakout setups and shift the result so it is tradable next day."""
|
||||
close = close.sort_index()
|
||||
high = high.reindex(index=close.index, columns=close.columns).sort_index()
|
||||
low = low.reindex(index=close.index, columns=close.columns).sort_index()
|
||||
volume = volume.reindex(index=close.index, columns=close.columns).sort_index()
|
||||
|
||||
trailing_high = close.rolling(TRAILING_HIGH_WINDOW, min_periods=TRAILING_HIGH_WINDOW).max()
|
||||
proximity_to_high = close / trailing_high.replace(0, np.nan)
|
||||
|
||||
recent_high = high.rolling(COMPRESSION_WINDOW, min_periods=COMPRESSION_WINDOW).max()
|
||||
recent_low = low.rolling(COMPRESSION_WINDOW, min_periods=COMPRESSION_WINDOW).min()
|
||||
recent_mid = (recent_high + recent_low) / 2
|
||||
compressed_range = -((recent_high - recent_low) / recent_mid.replace(0, np.nan))
|
||||
|
||||
median_volume = volume.rolling(VOLUME_WINDOW, min_periods=VOLUME_WINDOW).median()
|
||||
abnormal_volume = volume / median_volume.replace(0, np.nan)
|
||||
|
||||
score = proximity_to_high + compressed_range + abnormal_volume
|
||||
return score.shift(1)
|
||||
152
research/fetch_historical.py
Normal file
152
research/fetch_historical.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""
|
||||
Fetch price history for all tickers that were ever S&P 500 members — including
|
||||
delisted ones — and save to data/us_pit.csv. This is the foundation for a
|
||||
survivorship-bias-free backtest.
|
||||
|
||||
NOTE: Yahoo Finance no longer serves price data for many fully-delisted tickers
|
||||
(bankruptcies, old mergers). Those are silently skipped. The result is still
|
||||
a major improvement over "today's S&P 500 extrapolated 10 years back", but it
|
||||
is NOT a perfect point-in-time dataset — only a dataset where the universe
|
||||
mask is correct at each date. A subset of worst-outcome tickers (e.g., ABK,
|
||||
ACAS) will be missing entirely. This caveat is documented in the run summary.
|
||||
"""
|
||||
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import pandas as pd
|
||||
import yfinance as yf
|
||||
|
||||
import universe_history as uh
|
||||
|
||||
DATA_DIR = "data"
|
||||
OUT_PATH = os.path.join(DATA_DIR, "us_pit.csv")
|
||||
YEARS = 10
|
||||
BATCH_SIZE = 50
|
||||
|
||||
|
||||
def _field_out_paths() -> dict[str, str]:
|
||||
return {
|
||||
"Close": os.path.join(DATA_DIR, "us_pit_close.csv"),
|
||||
"High": os.path.join(DATA_DIR, "us_pit_high.csv"),
|
||||
"Low": os.path.join(DATA_DIR, "us_pit_low.csv"),
|
||||
"Volume": os.path.join(DATA_DIR, "us_pit_volume.csv"),
|
||||
}
|
||||
|
||||
|
||||
def fetch_all_historical(force: bool = False) -> pd.DataFrame:
|
||||
os.makedirs(DATA_DIR, exist_ok=True)
|
||||
intervals = uh.load_sp500_history()
|
||||
tickers = uh.all_tickers_ever(intervals) + ["SPY"]
|
||||
tickers = sorted(set(tickers))
|
||||
|
||||
existing = None
|
||||
if os.path.exists(OUT_PATH) and not force:
|
||||
existing = pd.read_csv(OUT_PATH, index_col=0, parse_dates=True)
|
||||
missing = [t for t in tickers if t not in existing.columns]
|
||||
if not missing:
|
||||
# Just append latest dates
|
||||
last_date = existing.index[-1]
|
||||
if (datetime.now() - last_date.to_pydatetime()).days < 2:
|
||||
print(f"--- us_pit.csv already up to date: {existing.shape} ---")
|
||||
return existing
|
||||
tickers = list(existing.columns)
|
||||
start = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")
|
||||
print(f"--- Appending new dates from {start} for {len(tickers)} tickers ---")
|
||||
new = _download_batched(tickers, start=start)
|
||||
if new is not None and not new.empty:
|
||||
combined = pd.concat([existing, new]).sort_index()
|
||||
combined = combined[~combined.index.duplicated(keep="last")]
|
||||
combined.to_csv(OUT_PATH)
|
||||
print(f"--- Saved {combined.shape} to {OUT_PATH} ---")
|
||||
return combined
|
||||
return existing
|
||||
else:
|
||||
print(f"--- Have {existing.shape[1]} cols; need {len(missing)} more ---")
|
||||
tickers = missing
|
||||
|
||||
start = (datetime.now() - timedelta(days=365 * YEARS)).strftime("%Y-%m-%d")
|
||||
new = _download_batched(tickers, start=start)
|
||||
|
||||
if existing is not None and new is not None and not new.empty:
|
||||
combined = pd.concat([existing, new.reindex(existing.index)], axis=1)
|
||||
# Add any new rows from `new` not in existing
|
||||
new_only_idx = new.index.difference(existing.index)
|
||||
if len(new_only_idx) > 0:
|
||||
combined_new = new.loc[new_only_idx].reindex(columns=combined.columns)
|
||||
combined = pd.concat([combined, combined_new]).sort_index()
|
||||
else:
|
||||
combined = new
|
||||
|
||||
combined.to_csv(OUT_PATH)
|
||||
print(f"--- Saved {combined.shape} to {OUT_PATH} ---")
|
||||
return combined
|
||||
|
||||
|
||||
def fetch_all_historical_ohlcv(force: bool = False) -> dict[str, pd.DataFrame]:
|
||||
os.makedirs(DATA_DIR, exist_ok=True)
|
||||
intervals = uh.load_sp500_history()
|
||||
tickers = uh.all_tickers_ever(intervals) + ["SPY"]
|
||||
tickers = sorted(set(tickers))
|
||||
start = (datetime.now() - timedelta(days=365 * YEARS)).strftime("%Y-%m-%d")
|
||||
panels = _download_batched_fields(tickers, start=start, fields=["Close", "High", "Low", "Volume"])
|
||||
if not panels:
|
||||
raise RuntimeError("No PIT OHLCV data downloaded")
|
||||
|
||||
close = panels["Close"]
|
||||
close.to_csv(OUT_PATH)
|
||||
print(f"--- Saved {close.shape} to {OUT_PATH} ---")
|
||||
result: dict[str, pd.DataFrame] = {"close": close}
|
||||
for field, path in _field_out_paths().items():
|
||||
panel = panels[field]
|
||||
panel.to_csv(path)
|
||||
print(f"--- Saved {panel.shape} to {path} ---")
|
||||
result[field.lower()] = panel
|
||||
return result
|
||||
|
||||
|
||||
def _download_batched(tickers: list[str], start: str) -> pd.DataFrame | None:
|
||||
panels = _download_batched_fields(tickers, start=start, fields=["Close"])
|
||||
if not panels:
|
||||
return None
|
||||
return panels["Close"]
|
||||
|
||||
|
||||
def _download_batched_fields(
|
||||
tickers: list[str],
|
||||
start: str,
|
||||
fields: list[str],
|
||||
) -> dict[str, pd.DataFrame]:
|
||||
frames = {field: [] for field in fields}
|
||||
n = len(tickers)
|
||||
for i in range(0, n, BATCH_SIZE):
|
||||
batch = tickers[i:i + BATCH_SIZE]
|
||||
print(f" [{i}/{n}] fetching {len(batch)} tickers...", flush=True)
|
||||
try:
|
||||
raw = yf.download(batch, start=start, auto_adjust=True,
|
||||
progress=False, threads=True)
|
||||
if raw.empty:
|
||||
continue
|
||||
for field in fields:
|
||||
if isinstance(raw.columns, pd.MultiIndex):
|
||||
panel = raw[field]
|
||||
else:
|
||||
panel = raw[[field]].rename(columns={field: batch[0]})
|
||||
panel = panel.dropna(axis=1, how="all")
|
||||
if not panel.empty:
|
||||
frames[field].append(panel)
|
||||
except Exception as e:
|
||||
print(f" batch failed: {e}")
|
||||
result = {}
|
||||
for field, field_frames in frames.items():
|
||||
if field_frames:
|
||||
panel = pd.concat(field_frames, axis=1).sort_index()
|
||||
panel = panel.loc[:, ~panel.columns.duplicated()]
|
||||
result[field] = panel
|
||||
else:
|
||||
result[field] = pd.DataFrame()
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fetch_all_historical()
|
||||
299
research/optimize.py
Normal file
299
research/optimize.py
Normal file
@@ -0,0 +1,299 @@
|
||||
"""
|
||||
End-to-end optimization study for the US recovery+momentum strategy family,
|
||||
run on a point-in-time (survivorship-bias-mitigated) S&P 500 universe.
|
||||
|
||||
Experiments:
|
||||
E1 — Baseline drift: biased vs point-in-time universe, current top10 params.
|
||||
E2 — Hyperparameter sweep with 2016-2022 train / 2023-2026 test split.
|
||||
E3 — SPY MA200 regime filter (compare base vs filtered).
|
||||
E4 — Weighting schemes: equal vs inverse-vol vs rank.
|
||||
E5 — Ensemble of top-3 uncorrelated configs.
|
||||
|
||||
Usage: uv run python -m research.optimize
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import data_manager
|
||||
import research.pit_backtest as pit
|
||||
from research.strategies_plus import (EnsembleStrategy, RecoveryMomentumPlus,
|
||||
spy_ma200_filter)
|
||||
from strategies.recovery_momentum import RecoveryMomentumStrategy
|
||||
|
||||
DATA_DIR = "data"
|
||||
BIASED_CSV = os.path.join(DATA_DIR, "us.csv")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def slice_period(df: pd.DataFrame, start: str | None, end: str | None) -> pd.DataFrame:
|
||||
out = df
|
||||
if start:
|
||||
out = out[out.index >= start]
|
||||
if end:
|
||||
out = out[out.index <= end]
|
||||
return out
|
||||
|
||||
|
||||
def run_strategy(strategy, prices, benchmark=None, regime_filter=None,
|
||||
fixed_fee: float = 0.0) -> pd.Series:
|
||||
return pit.backtest(
|
||||
strategy=strategy, prices=prices, initial_capital=10_000,
|
||||
transaction_cost=0.001, fixed_fee=fixed_fee,
|
||||
benchmark=benchmark, regime_filter=regime_filter,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Experiment 1: bias drift
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def exp1_bias_drift(pit_prices_masked: pd.DataFrame) -> pd.DataFrame:
|
||||
print("\n" + "=" * 90)
|
||||
print("E1 — Biased universe vs Point-in-time universe (recovery_mom_top10)")
|
||||
print("=" * 90)
|
||||
rows = []
|
||||
|
||||
# Biased: current 503 tickers extrapolated backward
|
||||
biased = pd.read_csv(BIASED_CSV, index_col=0, parse_dates=True)
|
||||
# Use same date range as PIT for a fair comparison
|
||||
common_start = max(biased.index[0], pit_prices_masked.index[0])
|
||||
common_end = min(biased.index[-1], pit_prices_masked.index[-1])
|
||||
biased_window = slice_period(biased, str(common_start.date()), str(common_end.date()))
|
||||
pit_window = slice_period(pit_prices_masked, str(common_start.date()), str(common_end.date()))
|
||||
|
||||
# Drop non-ticker columns (SPY is in PIT but not in the masked tickers)
|
||||
biased_tickers = [c for c in biased_window.columns if c != "SPY"]
|
||||
pit_tickers = [c for c in pit_window.columns if c != "SPY"]
|
||||
|
||||
# Use RecoveryMomentumPlus with identical defaults to recovery_mom_top10.
|
||||
# The original strategy uses na_option="bottom" which misranks NaN-masked
|
||||
# data (non-members appear "top"); the Plus variant uses na_option="keep".
|
||||
strat = RecoveryMomentumPlus(top_n=10) # defaults match RecoveryMomentumStrategy
|
||||
eq_biased = run_strategy(strat, biased_window[biased_tickers])
|
||||
eq_pit = run_strategy(RecoveryMomentumPlus(top_n=10), pit_window[pit_tickers])
|
||||
|
||||
rows.append(pit.summarize(eq_biased, name="recovery_mom_top10 (BIASED)"))
|
||||
rows.append(pit.summarize(eq_pit, name="recovery_mom_top10 (POINT-IN-TIME)"))
|
||||
# Benchmark: SPY buy-and-hold in same window
|
||||
if "SPY" in biased_window.columns:
|
||||
spy_bh = (biased_window["SPY"] / biased_window["SPY"].iloc[0]) * 10_000
|
||||
rows.append(pit.summarize(spy_bh, name="SPY buy-and-hold"))
|
||||
|
||||
for r in rows:
|
||||
print(pit.fmt_row(r))
|
||||
return pd.DataFrame(rows)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Experiment 2: hyperparameter sweep with train/test split
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def exp2_sweep(pit_masked: pd.DataFrame) -> pd.DataFrame:
|
||||
print("\n" + "=" * 90)
|
||||
print("E2 — Hyperparameter sweep (train: 2016-2022, test: 2023-2026)")
|
||||
print("=" * 90)
|
||||
tickers = [c for c in pit_masked.columns if c != "SPY"]
|
||||
prices = pit_masked[tickers]
|
||||
|
||||
train = slice_period(prices, "2016-04-01", "2022-12-31")
|
||||
test = slice_period(prices, "2023-01-01", None)
|
||||
|
||||
grid = []
|
||||
for top_n in [5, 8, 10, 15]:
|
||||
for rec_win in [42, 63, 126]:
|
||||
for rec_w in [0.3, 0.5, 0.7]:
|
||||
for rebal in [10, 21]:
|
||||
grid.append(dict(top_n=top_n, recovery_window=rec_win,
|
||||
rec_weight=rec_w, rebal_freq=rebal))
|
||||
|
||||
results = []
|
||||
for i, cfg in enumerate(grid):
|
||||
strat_train = RecoveryMomentumPlus(**cfg)
|
||||
eq_tr = run_strategy(strat_train, train)
|
||||
sum_tr = pit.summarize(eq_tr, name="train")
|
||||
|
||||
strat_test = RecoveryMomentumPlus(**cfg)
|
||||
eq_te = run_strategy(strat_test, test)
|
||||
sum_te = pit.summarize(eq_te, name="test")
|
||||
|
||||
results.append({
|
||||
**cfg,
|
||||
"train_CAGR": sum_tr["CAGR"],
|
||||
"train_Sharpe": sum_tr["Sharpe"],
|
||||
"train_MaxDD": sum_tr["MaxDD"],
|
||||
"test_CAGR": sum_te["CAGR"],
|
||||
"test_Sharpe": sum_te["Sharpe"],
|
||||
"test_MaxDD": sum_te["MaxDD"],
|
||||
"test_Calmar": sum_te["Calmar"],
|
||||
})
|
||||
if (i + 1) % 10 == 0 or i == len(grid) - 1:
|
||||
print(f" ... {i+1}/{len(grid)} configs evaluated")
|
||||
|
||||
df = pd.DataFrame(results)
|
||||
df = df.sort_values("test_Sharpe", ascending=False)
|
||||
|
||||
# Print top 10 by TEST Sharpe, then top 10 by TRAIN Sharpe to see overfit gap
|
||||
print("\n --- Top 10 by TEST Sharpe (out-of-sample, 2023-2026) ---")
|
||||
disp_cols = ["top_n", "recovery_window", "rec_weight", "rebal_freq",
|
||||
"train_Sharpe", "test_Sharpe", "train_CAGR", "test_CAGR",
|
||||
"test_MaxDD", "test_Calmar"]
|
||||
print(df.head(10)[disp_cols].to_string(index=False,
|
||||
formatters={"train_Sharpe": "{:.2f}".format, "test_Sharpe": "{:.2f}".format,
|
||||
"train_CAGR": "{:.1%}".format, "test_CAGR": "{:.1%}".format,
|
||||
"test_MaxDD": "{:.1%}".format, "test_Calmar": "{:.2f}".format}))
|
||||
|
||||
print("\n --- Top 10 by TRAIN Sharpe (for comparison / overfit check) ---")
|
||||
df_tr = df.sort_values("train_Sharpe", ascending=False)
|
||||
print(df_tr.head(10)[disp_cols].to_string(index=False,
|
||||
formatters={"train_Sharpe": "{:.2f}".format, "test_Sharpe": "{:.2f}".format,
|
||||
"train_CAGR": "{:.1%}".format, "test_CAGR": "{:.1%}".format,
|
||||
"test_MaxDD": "{:.1%}".format, "test_Calmar": "{:.2f}".format}))
|
||||
|
||||
return df
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Experiment 3: regime filter
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def exp3_regime(pit_masked: pd.DataFrame) -> pd.DataFrame:
|
||||
print("\n" + "=" * 90)
|
||||
print("E3 — SPY MA200 regime filter (out-of-sample 2023-2026)")
|
||||
print("=" * 90)
|
||||
tickers = [c for c in pit_masked.columns if c != "SPY"]
|
||||
# Compute MA from FULL history so the filter is warmed up before 2023.
|
||||
spy_full = pit_masked["SPY"].dropna() if "SPY" in pit_masked.columns else None
|
||||
filt_full_200 = spy_ma200_filter(spy_full, ma_window=200) if spy_full is not None else None
|
||||
filt_full_150 = spy_ma200_filter(spy_full, ma_window=150) if spy_full is not None else None
|
||||
|
||||
test = slice_period(pit_masked, "2023-01-01", None)
|
||||
prices = test[tickers]
|
||||
filt = filt_full_200.reindex(test.index).fillna(False).astype(bool) if filt_full_200 is not None else None
|
||||
filt_150 = filt_full_150.reindex(test.index).fillna(False).astype(bool) if filt_full_150 is not None else None
|
||||
rows = []
|
||||
base = RecoveryMomentumPlus(top_n=10)
|
||||
rows.append(pit.summarize(run_strategy(base, prices), name="top10 (no filter)"))
|
||||
rows.append(pit.summarize(run_strategy(RecoveryMomentumPlus(top_n=10), prices,
|
||||
regime_filter=filt),
|
||||
name="top10 + SPY>MA200 filter"))
|
||||
rows.append(pit.summarize(run_strategy(RecoveryMomentumPlus(top_n=10), prices,
|
||||
regime_filter=filt_150),
|
||||
name="top10 + SPY>MA150 filter"))
|
||||
|
||||
for r in rows:
|
||||
print(pit.fmt_row(r))
|
||||
return pd.DataFrame(rows)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Experiment 4: weighting schemes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def exp4_weighting(pit_masked: pd.DataFrame) -> pd.DataFrame:
|
||||
print("\n" + "=" * 90)
|
||||
print("E4 — Weighting schemes (out-of-sample 2023-2026, top_n=10)")
|
||||
print("=" * 90)
|
||||
tickers = [c for c in pit_masked.columns if c != "SPY"]
|
||||
test = slice_period(pit_masked[tickers], "2023-01-01", None)
|
||||
|
||||
rows = []
|
||||
for w in ["equal", "inv_vol", "rank"]:
|
||||
strat = RecoveryMomentumPlus(top_n=10, weighting=w)
|
||||
eq = run_strategy(strat, test)
|
||||
rows.append(pit.summarize(eq, name=f"top10 weighting={w}"))
|
||||
for r in rows:
|
||||
print(pit.fmt_row(r))
|
||||
return pd.DataFrame(rows)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Experiment 5: ensemble
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def exp5_ensemble(pit_masked: pd.DataFrame, sweep_df: pd.DataFrame) -> pd.DataFrame:
|
||||
print("\n" + "=" * 90)
|
||||
print("E5 — Ensemble of 3 uncorrelated top configs (out-of-sample 2023-2026)")
|
||||
print("=" * 90)
|
||||
tickers = [c for c in pit_masked.columns if c != "SPY"]
|
||||
test = slice_period(pit_masked[tickers], "2023-01-01", None)
|
||||
|
||||
# Pick top-20 by test_Sharpe, then greedily keep picks whose equity curves
|
||||
# correlate < 0.9 with already-kept picks.
|
||||
top20 = sweep_df.sort_values("test_Sharpe", ascending=False).head(20)
|
||||
curves = []
|
||||
components = []
|
||||
for _, row in top20.iterrows():
|
||||
cfg = dict(top_n=int(row["top_n"]),
|
||||
recovery_window=int(row["recovery_window"]),
|
||||
rec_weight=float(row["rec_weight"]),
|
||||
rebal_freq=int(row["rebal_freq"]))
|
||||
strat = RecoveryMomentumPlus(**cfg)
|
||||
eq = run_strategy(strat, test)
|
||||
if any(eq.pct_change().corr(c.pct_change()) > 0.9 for c in curves):
|
||||
continue
|
||||
curves.append(eq)
|
||||
components.append((RecoveryMomentumPlus(**cfg), 1.0))
|
||||
if len(components) >= 3:
|
||||
break
|
||||
|
||||
print(f" Selected {len(components)} uncorrelated configs for ensemble:")
|
||||
for strat, _ in components:
|
||||
print(f" top_n={strat.top_n}, rec_win={strat.recovery_window}, "
|
||||
f"rec_w={strat.rec_weight}, rebal={strat.rebal_freq}")
|
||||
|
||||
ens = EnsembleStrategy(components)
|
||||
eq_ens = run_strategy(ens, test)
|
||||
|
||||
rows = [
|
||||
pit.summarize(curves[i], name=f" component {i+1}") for i in range(len(curves))
|
||||
]
|
||||
rows.append(pit.summarize(eq_ens, name="ENSEMBLE (equal-weight)"))
|
||||
# Also ensemble + regime filter (compute MA from full history)
|
||||
if "SPY" in pit_masked.columns:
|
||||
spy_full = pit_masked["SPY"].dropna()
|
||||
filt = spy_ma200_filter(spy_full).reindex(test.index).fillna(False).astype(bool)
|
||||
eq_ens_reg = run_strategy(EnsembleStrategy(components), test, regime_filter=filt)
|
||||
rows.append(pit.summarize(eq_ens_reg, name="ENSEMBLE + SPY>MA200 filter"))
|
||||
|
||||
for r in rows:
|
||||
print(pit.fmt_row(r))
|
||||
return pd.DataFrame(rows)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
|
||||
print("Loading point-in-time price data...")
|
||||
raw = pit.load_pit_prices()
|
||||
print(f" Raw (union) shape: {raw.shape}, {raw.index[0].date()} → {raw.index[-1].date()}")
|
||||
|
||||
masked = pit.pit_universe(raw)
|
||||
# Sanity: how many ticker-days are masked out?
|
||||
total = masked.size
|
||||
valid = masked.notna().sum().sum()
|
||||
print(f" Point-in-time valid ticker-days: {valid:,} / {total:,} ({valid/total*100:.1f}%)")
|
||||
daily_universe = masked.notna().sum(axis=1)
|
||||
print(f" Universe size per day: min={daily_universe.min()}, median={int(daily_universe.median())}, max={daily_universe.max()}")
|
||||
|
||||
e1 = exp1_bias_drift(masked)
|
||||
sweep = exp2_sweep(masked)
|
||||
e3 = exp3_regime(masked)
|
||||
e4 = exp4_weighting(masked)
|
||||
e5 = exp5_ensemble(masked, sweep)
|
||||
|
||||
# Save sweep for inspection
|
||||
out = os.path.join(DATA_DIR, "research_sweep.csv")
|
||||
sweep.to_csv(out, index=False)
|
||||
print(f"\n Full sweep saved to {out}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
125
research/pit_backtest.py
Normal file
125
research/pit_backtest.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""
|
||||
Point-in-time backtest runner.
|
||||
|
||||
Key idea: mask price data to NaN outside S&P 500 membership windows before
|
||||
passing to the strategy. The strategy's signal computations then naturally
|
||||
exclude non-members — no refactoring of strategies required.
|
||||
|
||||
Caveat: a stock joining the index has no signal for ~252 days after joining
|
||||
(rolling windows need non-NaN warm-up). This is conservative but unbiased.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import metrics
|
||||
import universe_history as uh
|
||||
|
||||
DATA_DIR = "data"
|
||||
PIT_CSV = os.path.join(DATA_DIR, "us_pit.csv")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data loading
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_pit_prices() -> pd.DataFrame:
|
||||
"""Load the full historical S&P 500 price matrix (delisted included)."""
|
||||
if not os.path.exists(PIT_CSV):
|
||||
raise FileNotFoundError(
|
||||
f"{PIT_CSV} not found. Run `uv run python -m research.fetch_historical` first."
|
||||
)
|
||||
df = pd.read_csv(PIT_CSV, index_col=0, parse_dates=True)
|
||||
return df.sort_index()
|
||||
|
||||
|
||||
def pit_universe(prices: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Return prices masked to S&P 500 membership at each date (NaN outside)."""
|
||||
intervals = uh.load_sp500_history()
|
||||
return uh.mask_prices(prices, intervals)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Backtest engine (mirrors main.backtest but accepts masked prices)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def backtest(
|
||||
strategy,
|
||||
prices: pd.DataFrame,
|
||||
initial_capital: float = 10_000,
|
||||
transaction_cost: float = 0.001,
|
||||
fixed_fee: float = 0.0,
|
||||
benchmark: pd.Series | None = None,
|
||||
regime_filter: pd.Series | None = None,
|
||||
) -> pd.Series:
|
||||
"""
|
||||
Vectorized backtest with optional regime filter.
|
||||
|
||||
`regime_filter`: boolean series aligned to prices.index. True → be in the
|
||||
market (use strategy weights). False → go to cash. When None, always invested.
|
||||
"""
|
||||
weights = strategy.generate_signals(prices)
|
||||
weights = weights.reindex(prices.index).fillna(0.0)
|
||||
|
||||
if regime_filter is not None:
|
||||
rf = regime_filter.reindex(prices.index).fillna(False).astype(float)
|
||||
weights = weights.mul(rf, axis=0)
|
||||
|
||||
daily_returns = prices.pct_change().fillna(0.0)
|
||||
portfolio_returns = (daily_returns * weights).sum(axis=1)
|
||||
|
||||
turnover = weights.diff().abs().sum(axis=1).fillna(0.0)
|
||||
portfolio_returns -= turnover * transaction_cost
|
||||
|
||||
if fixed_fee > 0:
|
||||
weight_changes = weights.diff().fillna(0.0)
|
||||
n_trades = (weight_changes.abs() > 1e-8).sum(axis=1)
|
||||
equity_running = (1 + portfolio_returns).cumprod() * initial_capital
|
||||
fee_impact = (n_trades * fixed_fee) / equity_running.shift(1).fillna(initial_capital)
|
||||
portfolio_returns -= fee_impact
|
||||
|
||||
equity = (1 + portfolio_returns).cumprod() * initial_capital
|
||||
return equity
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Metrics helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def summarize(equity: pd.Series, name: str = "") -> dict:
|
||||
"""Return a dict of key performance metrics (no printing)."""
|
||||
eq = equity.dropna()
|
||||
if len(eq) < 2:
|
||||
return {"name": name, "error": "insufficient data"}
|
||||
daily = eq.pct_change().dropna()
|
||||
total_return = eq.iloc[-1] / eq.iloc[0] - 1
|
||||
years = (eq.index[-1] - eq.index[0]).days / 365.25
|
||||
cagr = (eq.iloc[-1] / eq.iloc[0]) ** (1 / years) - 1 if years > 0 else 0.0
|
||||
vol = daily.std() * np.sqrt(252)
|
||||
sharpe = (daily.mean() * 252) / vol if vol > 0 else 0.0
|
||||
downside = daily[daily < 0].std() * np.sqrt(252)
|
||||
sortino = (daily.mean() * 252) / downside if downside > 0 else 0.0
|
||||
dd = (eq / eq.cummax() - 1).min()
|
||||
calmar = cagr / abs(dd) if dd < 0 else 0.0
|
||||
return {
|
||||
"name": name,
|
||||
"CAGR": cagr,
|
||||
"Sharpe": sharpe,
|
||||
"Sortino": sortino,
|
||||
"MaxDD": dd,
|
||||
"Calmar": calmar,
|
||||
"TotalRet": total_return,
|
||||
"Vol": vol,
|
||||
}
|
||||
|
||||
|
||||
def fmt_row(r: dict) -> str:
|
||||
return (f" {r['name']:<38s} "
|
||||
f"CAGR={r['CAGR']*100:>6.1f}% "
|
||||
f"Sharpe={r['Sharpe']:>5.2f} "
|
||||
f"Sortino={r['Sortino']:>5.2f} "
|
||||
f"MaxDD={r['MaxDD']*100:>6.1f}% "
|
||||
f"Calmar={r['Calmar']:>5.2f} "
|
||||
f"Total={r['TotalRet']*100:>7.1f}%")
|
||||
23
research/regime_filters.py
Normal file
23
research/regime_filters.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
LONG_MA_WINDOW = 200
|
||||
RS_WINDOW = 63
|
||||
|
||||
|
||||
def build_regime_filter(etf_close: pd.DataFrame, market_col: str = "SPY") -> pd.Series:
|
||||
"""Return a next-day tradable regime flag based on market trend and ETF leadership."""
|
||||
prices = etf_close.sort_index()
|
||||
if market_col not in prices.columns:
|
||||
raise KeyError(f"{market_col} not found in etf_close")
|
||||
|
||||
market = prices[market_col]
|
||||
market_ma = market.rolling(LONG_MA_WINDOW, min_periods=LONG_MA_WINDOW).mean()
|
||||
market_ok = market.gt(market_ma)
|
||||
|
||||
rs = prices.pct_change(RS_WINDOW, fill_method=None)
|
||||
non_market_rs = rs.drop(columns=[market_col], errors="ignore")
|
||||
leader_ok = non_market_rs.gt(rs[market_col], axis=0).any(axis=1)
|
||||
|
||||
regime = (market_ok & leader_ok).astype(bool)
|
||||
return regime.shift(1, fill_value=False)
|
||||
150
research/strategies_plus.py
Normal file
150
research/strategies_plus.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""
|
||||
Optimization variants of RecoveryMomentumStrategy.
|
||||
|
||||
Four dimensions explored:
|
||||
1. Hyperparameters (top_n, recovery_window, mom_lookback, rebal_freq, weights)
|
||||
2. Regime filter: zero-out weights when SPY < MA200
|
||||
3. Weighting scheme: equal / inverse-vol / rank-weighted
|
||||
4. Ensemble: weighted blend of multiple strategies
|
||||
|
||||
All strategies follow the same Strategy protocol (generate_signals → weights DF).
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from strategies.base import Strategy
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Generalized Recovery+Momentum strategy
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class RecoveryMomentumPlus(Strategy):
|
||||
"""
|
||||
Recovery + momentum composite with configurable blend, weighting, and
|
||||
regime filter hooks.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
recovery_window : int
|
||||
Lookback for the recovery factor (price / rolling min - 1).
|
||||
mom_lookback : int
|
||||
Long-horizon momentum window total length.
|
||||
mom_skip : int
|
||||
Short-term reversal skip for momentum.
|
||||
rebal_freq : int
|
||||
Trading-day rebalance interval.
|
||||
top_n : int
|
||||
Number of stocks selected each rebalance.
|
||||
rec_weight : float in [0, 1]
|
||||
Weight of recovery factor in composite rank blend (mom_weight = 1 - rec_weight).
|
||||
weighting : {"equal", "inv_vol", "rank"}
|
||||
Portfolio weighting scheme for the selected top_n.
|
||||
vol_window : int
|
||||
Volatility lookback when weighting="inv_vol".
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
recovery_window: int = 63,
|
||||
mom_lookback: int = 252,
|
||||
mom_skip: int = 21,
|
||||
rebal_freq: int = 21,
|
||||
top_n: int = 10,
|
||||
rec_weight: float = 0.5,
|
||||
weighting: str = "equal",
|
||||
vol_window: int = 60):
|
||||
if weighting not in ("equal", "inv_vol", "rank"):
|
||||
raise ValueError(f"weighting must be equal|inv_vol|rank, got {weighting!r}")
|
||||
self.recovery_window = recovery_window
|
||||
self.mom_lookback = mom_lookback
|
||||
self.mom_skip = mom_skip
|
||||
self.rebal_freq = rebal_freq
|
||||
self.top_n = top_n
|
||||
self.rec_weight = rec_weight
|
||||
self.weighting = weighting
|
||||
self.vol_window = vol_window
|
||||
|
||||
def generate_signals(self, data: pd.DataFrame) -> pd.DataFrame:
|
||||
# Factors
|
||||
recovery = data / data.rolling(self.recovery_window).min() - 1
|
||||
momentum = data.shift(self.mom_skip).pct_change(self.mom_lookback - self.mom_skip)
|
||||
|
||||
rec_rank = recovery.rank(axis=1, pct=True, na_option="keep")
|
||||
mom_rank = momentum.rank(axis=1, pct=True, na_option="keep")
|
||||
composite = self.rec_weight * rec_rank + (1 - self.rec_weight) * mom_rank
|
||||
|
||||
# Top-N selection
|
||||
rank = composite.rank(axis=1, ascending=False, na_option="bottom")
|
||||
n_valid = composite.notna().sum(axis=1)
|
||||
enough = n_valid >= self.top_n
|
||||
top_mask = (rank <= self.top_n) & enough.values.reshape(-1, 1)
|
||||
|
||||
# Weighting within top-N
|
||||
if self.weighting == "equal":
|
||||
raw = top_mask.astype(float)
|
||||
elif self.weighting == "rank":
|
||||
# Higher composite → higher weight within top-N
|
||||
ranked_score = composite.where(top_mask, 0.0)
|
||||
raw = ranked_score
|
||||
elif self.weighting == "inv_vol":
|
||||
# Use inverse realized-volatility as weights within top-N
|
||||
rets = data.pct_change()
|
||||
vol = rets.rolling(self.vol_window).std()
|
||||
inv_vol = 1.0 / vol.replace(0, np.nan)
|
||||
raw = inv_vol.where(top_mask, 0.0).fillna(0.0)
|
||||
|
||||
row_sums = raw.sum(axis=1).replace(0, np.nan)
|
||||
signals = raw.div(row_sums, axis=0).fillna(0.0)
|
||||
|
||||
# Rebalance
|
||||
warmup = max(self.mom_lookback, self.recovery_window, self.vol_window)
|
||||
rebal_mask = pd.Series(False, index=data.index)
|
||||
rebal_indices = list(range(warmup, len(data), self.rebal_freq))
|
||||
rebal_mask.iloc[rebal_indices] = True
|
||||
signals[~rebal_mask] = np.nan
|
||||
signals = signals.ffill().fillna(0.0)
|
||||
signals.iloc[:warmup] = 0.0
|
||||
|
||||
return signals.shift(1).fillna(0.0)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Ensemble
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class EnsembleStrategy(Strategy):
|
||||
"""
|
||||
Weighted blend of several sub-strategies. Each sub-strategy produces a
|
||||
weight matrix; we linearly combine them. The result still sums to (at
|
||||
most) 1 per row since each sub-strategy does.
|
||||
"""
|
||||
|
||||
def __init__(self, components: list[tuple[Strategy, float]]):
|
||||
total = sum(w for _, w in components)
|
||||
self.components = [(s, w / total) for s, w in components]
|
||||
|
||||
def generate_signals(self, data: pd.DataFrame) -> pd.DataFrame:
|
||||
out = None
|
||||
for strat, w in self.components:
|
||||
sig = strat.generate_signals(data).mul(w)
|
||||
if out is None:
|
||||
out = sig
|
||||
else:
|
||||
# Align columns (should be identical since same data passed)
|
||||
out = out.add(sig, fill_value=0.0)
|
||||
return out
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regime filter helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def spy_ma200_filter(spy: pd.Series, ma_window: int = 200) -> pd.Series:
|
||||
"""
|
||||
Boolean Series: True when SPY close > SPY MA(ma_window), shifted by 1 to
|
||||
avoid lookahead. Use as `regime_filter=...` in pit_backtest.backtest().
|
||||
"""
|
||||
ma = spy.rolling(ma_window, min_periods=ma_window).mean()
|
||||
signal = (spy > ma).fillna(False)
|
||||
return signal.shift(1).fillna(False)
|
||||
156
research/us_alpha_pipeline.py
Normal file
156
research/us_alpha_pipeline.py
Normal file
@@ -0,0 +1,156 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import data_manager
|
||||
import universe_history as uh
|
||||
from research.event_factors import breakout_after_compression_score
|
||||
from research.regime_filters import build_regime_filter
|
||||
from research.us_alpha_report import summarize_equity_window
|
||||
from research.us_universe import build_tradable_mask
|
||||
|
||||
|
||||
MIN_PRICE = 5.0
|
||||
MIN_DOLLAR_VOLUME = 20_000_000.0
|
||||
MIN_HISTORY_DAYS = 252
|
||||
MIN_VALID_VOLUME_DAYS = 40
|
||||
LIQUIDITY_WINDOW = 60
|
||||
|
||||
TREND_WINDOW = 126
|
||||
RECOVERY_WINDOW = 63
|
||||
HIGH_PROX_WINDOW = 126
|
||||
ETF_TICKERS = ["SPY", "QQQ", "IWM", "MDY", "XLK", "XLF", "XLI", "XLV"]
|
||||
|
||||
|
||||
def _price_rank_blend_score(close: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Simple price-only cross-sectional blend, shifted for next-day trading."""
|
||||
trend = close.pct_change(TREND_WINDOW, fill_method=None)
|
||||
recovery = close / close.rolling(RECOVERY_WINDOW, min_periods=RECOVERY_WINDOW).min() - 1
|
||||
high_proximity = close / close.rolling(HIGH_PROX_WINDOW, min_periods=HIGH_PROX_WINDOW).max().replace(0, np.nan)
|
||||
|
||||
trend_rank = trend.rank(axis=1, pct=True, na_option="keep")
|
||||
recovery_rank = recovery.rank(axis=1, pct=True, na_option="keep")
|
||||
high_rank = high_proximity.rank(axis=1, pct=True, na_option="keep")
|
||||
return ((trend_rank + recovery_rank + high_rank) / 3.0).shift(1)
|
||||
|
||||
|
||||
def _build_equal_weight_portfolio(
|
||||
score: pd.DataFrame,
|
||||
tradable_mask: pd.DataFrame,
|
||||
regime_filter: pd.Series,
|
||||
top_n: int,
|
||||
) -> pd.DataFrame:
|
||||
"""Build equal-weight top-n long-only weights from aligned scores."""
|
||||
aligned_score = score.reindex(index=tradable_mask.index, columns=tradable_mask.columns)
|
||||
eligible_score = aligned_score.where(tradable_mask)
|
||||
rank = eligible_score.rank(axis=1, ascending=False, na_option="bottom", method="first")
|
||||
selected = (rank <= top_n) & eligible_score.notna()
|
||||
selected = selected & regime_filter.reindex(tradable_mask.index, fill_value=False).to_numpy().reshape(-1, 1)
|
||||
|
||||
raw = selected.astype(float)
|
||||
row_sums = raw.sum(axis=1).replace(0.0, np.nan)
|
||||
return raw.div(row_sums, axis=0).fillna(0.0)
|
||||
|
||||
|
||||
def _equity_curve(close: pd.DataFrame, weights: pd.DataFrame) -> pd.Series:
|
||||
"""Convert daily weights into a simple close-to-close equity curve."""
|
||||
returns = close.pct_change(fill_method=None).fillna(0.0)
|
||||
portfolio_returns = (returns * weights).sum(axis=1)
|
||||
return (1.0 + portfolio_returns).cumprod()
|
||||
|
||||
|
||||
def _read_panel_csv(path: str) -> pd.DataFrame:
|
||||
return pd.read_csv(path, index_col=0, parse_dates=True).sort_index()
|
||||
|
||||
|
||||
def load_saved_pit_market_data(data_dir: str = "data", prefix: str = "us_pit") -> dict[str, pd.DataFrame]:
|
||||
"""Load saved PIT OHLCV panels from disk."""
|
||||
panels = {}
|
||||
for field in ("close", "high", "low", "volume"):
|
||||
panels[field] = _read_panel_csv(f"{data_dir}/{prefix}_{field}.csv")
|
||||
return panels
|
||||
|
||||
|
||||
def load_saved_etf_close(data_dir: str = "data", market: str = "us_etf") -> pd.DataFrame:
|
||||
"""Load saved ETF closes or populate them on demand."""
|
||||
path = f"{data_dir}/{market}.csv"
|
||||
try:
|
||||
return _read_panel_csv(path)
|
||||
except FileNotFoundError:
|
||||
original_data_dir = data_manager.DATA_DIR
|
||||
try:
|
||||
data_manager.DATA_DIR = data_dir
|
||||
return data_manager.update_market_data(market, ETF_TICKERS, ["close"])["close"]
|
||||
finally:
|
||||
data_manager.DATA_DIR = original_data_dir
|
||||
|
||||
|
||||
def run_alpha_pipeline(
|
||||
market_data,
|
||||
etf_close,
|
||||
pit_membership=None,
|
||||
windows=(1, 2, 3, 5, 10),
|
||||
top_n=10,
|
||||
) -> pd.DataFrame:
|
||||
"""Run a lightweight strict US alpha pipeline and summarize trailing windows."""
|
||||
close = market_data["close"].sort_index()
|
||||
high = market_data["high"].reindex(index=close.index, columns=close.columns).sort_index()
|
||||
low = market_data["low"].reindex(index=close.index, columns=close.columns).sort_index()
|
||||
volume = market_data["volume"].reindex(index=close.index, columns=close.columns).sort_index()
|
||||
|
||||
tradable_mask = build_tradable_mask(
|
||||
close=close,
|
||||
volume=volume,
|
||||
pit_membership=pit_membership,
|
||||
min_price=MIN_PRICE,
|
||||
min_dollar_volume=MIN_DOLLAR_VOLUME,
|
||||
min_history_days=MIN_HISTORY_DAYS,
|
||||
min_valid_volume_days=MIN_VALID_VOLUME_DAYS,
|
||||
liquidity_window=LIQUIDITY_WINDOW,
|
||||
)
|
||||
regime_filter = build_regime_filter(etf_close).reindex(close.index, fill_value=False)
|
||||
|
||||
strategy_scores = {
|
||||
"breakout_regime": breakout_after_compression_score(close, high, low, volume),
|
||||
"rank_blend_regime": _price_rank_blend_score(close),
|
||||
}
|
||||
|
||||
summary_rows = []
|
||||
for strategy_name, score in strategy_scores.items():
|
||||
weights = _build_equal_weight_portfolio(score, tradable_mask, regime_filter, top_n)
|
||||
equity = _equity_curve(close, weights)
|
||||
for window_years in windows:
|
||||
summary_rows.append(summarize_equity_window(equity, strategy_name, window_years))
|
||||
|
||||
return pd.DataFrame(summary_rows)
|
||||
|
||||
|
||||
def run_saved_pit_alpha_pipeline(
|
||||
data_dir: str = "data",
|
||||
windows=(1, 2, 3, 5, 10),
|
||||
top_n: int = 10,
|
||||
) -> pd.DataFrame:
|
||||
"""Load saved PIT OHLCV inputs and run the strict alpha pipeline."""
|
||||
market_data = load_saved_pit_market_data(data_dir=data_dir)
|
||||
etf_close = load_saved_etf_close(data_dir=data_dir)
|
||||
intervals = uh.load_sp500_history()
|
||||
pit_membership = uh.membership_mask(
|
||||
market_data["close"].index,
|
||||
intervals=intervals,
|
||||
tickers=list(market_data["close"].columns),
|
||||
)
|
||||
return run_alpha_pipeline(
|
||||
market_data=market_data,
|
||||
etf_close=etf_close,
|
||||
pit_membership=pit_membership,
|
||||
windows=windows,
|
||||
top_n=top_n,
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
summary = run_saved_pit_alpha_pipeline()
|
||||
print(summary.to_string(index=False))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
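A minimal usage sketch for this module, assuming the saved PIT panels (data/us_pit_*.csv) and ETF closes (data/us_etf.csv) are already on disk; the window tuple here is arbitrary:

# Illustrative only: drive the saved-input pipeline from a notebook or REPL.
from research.us_alpha_pipeline import run_saved_pit_alpha_pipeline

summary = run_saved_pit_alpha_pipeline(data_dir="data", windows=(1, 3), top_n=10)
print(summary.to_string(index=False))
# Columns: strategy, window_years, CAGR, Sharpe, MaxDD, TotalRet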
research/us_alpha_report.py (new file, 37 lines)
@@ -0,0 +1,37 @@
import numpy as np
import pandas as pd


TRADING_DAYS_PER_YEAR = 252


def summarize_equity_window(equity: pd.Series, strategy: str, window_years: int | float) -> dict:
    """Summarize a strategy equity curve over a trailing trading-day window."""
    window_days = max(int(window_years * TRADING_DAYS_PER_YEAR), 1)
    clean_equity = equity.dropna()
    if len(clean_equity) < window_days + 1:
        return {
            "strategy": strategy,
            "window_years": window_years,
            "CAGR": np.nan,
            "Sharpe": np.nan,
            "MaxDD": np.nan,
            "TotalRet": np.nan,
        }
    window_equity = clean_equity.tail(window_days + 1)

    daily = window_equity.pct_change(fill_method=None).dropna()
    total_ret = window_equity.iloc[-1] / window_equity.iloc[0] - 1
    years = len(daily) / TRADING_DAYS_PER_YEAR
    cagr = (window_equity.iloc[-1] / window_equity.iloc[0]) ** (1 / years) - 1 if years > 0 else np.nan
    vol = daily.std() * np.sqrt(TRADING_DAYS_PER_YEAR)
    sharpe = (daily.mean() * TRADING_DAYS_PER_YEAR) / vol if vol > 0 else 0.0
    max_dd = (window_equity / window_equity.cummax() - 1).min()
    return {
        "strategy": strategy,
        "window_years": window_years,
        "CAGR": cagr,
        "Sharpe": sharpe,
        "MaxDD": max_dd,
        "TotalRet": total_ret,
    }
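A small sanity-check sketch for summarize_equity_window on a synthetic constant-growth curve (the numbers are purely illustrative):

import numpy as np
import pandas as pd
from research.us_alpha_report import summarize_equity_window

# Two years of business days compounding at about 10%/year with no drawdowns.
dates = pd.date_range("2020-01-01", periods=2 * 252, freq="B")
equity = pd.Series(1.10 ** (np.arange(len(dates)) / 252), index=dates)

row = summarize_equity_window(equity, "toy", window_years=1)
print(round(row["CAGR"], 3), round(row["MaxDD"], 3))  # about 0.1 and 0.0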
research/us_universe.py (new file, 53 lines)
@@ -0,0 +1,53 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def build_tradable_mask(
|
||||
close: pd.DataFrame,
|
||||
volume: pd.DataFrame,
|
||||
pit_membership: pd.DataFrame | None,
|
||||
min_price: float,
|
||||
min_dollar_volume: float,
|
||||
min_history_days: int,
|
||||
min_valid_volume_days: int,
|
||||
liquidity_window: int = 60,
|
||||
) -> pd.DataFrame:
|
||||
"""Build a point-in-time tradable universe mask using only lagged inputs."""
|
||||
close = close.sort_index()
|
||||
volume = volume.reindex(index=close.index, columns=close.columns).sort_index()
|
||||
if pit_membership is None:
|
||||
pit_mask = pd.DataFrame(True, index=close.index, columns=close.columns)
|
||||
else:
|
||||
pit_mask = pit_membership.reindex(
|
||||
index=close.index,
|
||||
columns=close.columns,
|
||||
fill_value=False,
|
||||
)
|
||||
pit_mask = pit_mask.where(pit_mask.notna(), False).astype(bool)
|
||||
|
||||
eligible_close = close.where(pit_mask)
|
||||
eligible_volume = volume.where(pit_mask)
|
||||
|
||||
lagged_close = eligible_close.shift(1)
|
||||
lagged_volume = eligible_volume.shift(1)
|
||||
lagged_dollar_volume = lagged_close * lagged_volume
|
||||
|
||||
price_ok = lagged_close.gt(min_price)
|
||||
liquidity_ok = (
|
||||
lagged_dollar_volume.rolling(window=liquidity_window, min_periods=1).median().gt(min_dollar_volume)
|
||||
)
|
||||
history_ok = (
|
||||
lagged_close.notna()
|
||||
.rolling(window=min_history_days, min_periods=min_history_days)
|
||||
.sum()
|
||||
.ge(min_history_days)
|
||||
)
|
||||
valid_volume_ok = (
|
||||
lagged_dollar_volume.notna()
|
||||
.rolling(window=liquidity_window, min_periods=1)
|
||||
.sum()
|
||||
.ge(min_valid_volume_days)
|
||||
)
|
||||
|
||||
mask = price_ok & liquidity_ok & history_ok & valid_volume_ok
|
||||
mask = mask & pit_mask
|
||||
return mask.astype(bool)
|
||||
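A minimal sketch, mirroring the unit tests below, of the one-day lag in the tradable mask: a stock that first clears the price filter on the second day only becomes tradable on the third, because every input is shifted by one day. The thresholds here are illustrative, not the production constants.

import pandas as pd
from research.us_universe import build_tradable_mask

dates = pd.date_range("2024-01-01", periods=4, freq="D")
close = pd.DataFrame({"AAA": [4.0, 10.0, 10.0, 10.0]}, index=dates)
volume = pd.DataFrame({"AAA": [500.0, 500.0, 500.0, 500.0]}, index=dates)

mask = build_tradable_mask(
    close=close, volume=volume, pit_membership=None,
    min_price=5.0, min_dollar_volume=1_000.0,
    min_history_days=1, min_valid_volume_days=1, liquidity_window=1,
)
print(mask["AAA"].tolist())  # [False, False, True, True]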
strategies/factor_combo.py (new file, 218 lines)
@@ -0,0 +1,218 @@
|
||||
"""
|
||||
Factor combination strategies discovered through iterative factor research.
|
||||
|
||||
US champions:
|
||||
- rec_mfilt+deep×upvol: Recovery (momentum-filtered) + deep recovery × up-volume
|
||||
- ma200+mom7m+rec126: Above MA200 + intermediate momentum + deep recovery
|
||||
- rec_mfilt+ma200: Recovery (momentum-filtered) + above MA200
|
||||
- mom7m+rec126: Intermediate momentum + deep recovery
|
||||
|
||||
CN champions:
|
||||
- up_cap+quality_mom: Up-capture ratio + quality momentum composite
|
||||
- down_resil+qual_mom: Down-resilience + quality momentum composite
|
||||
- rec63+mom×gap: Recovery 63d + momentum × gap-up frequency
|
||||
- up_cap+mom×gap: Up-capture + momentum × gap-up frequency
|
||||
|
||||
Each can run at daily/weekly/biweekly/monthly rebalancing frequency.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from strategies.base import Strategy
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Factor building blocks
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _mom_12_1(p):
|
||||
return p.shift(21).pct_change(231)
|
||||
|
||||
|
||||
def _mom_intermediate(p):
|
||||
return p.shift(21).pct_change(147)
|
||||
|
||||
|
||||
def _rec_63(p):
|
||||
return p / p.rolling(63, min_periods=63).min() - 1
|
||||
|
||||
|
||||
def _rec_126(p):
|
||||
return p / p.rolling(126, min_periods=126).min() - 1
|
||||
|
||||
|
||||
def _above_ma200(p):
|
||||
return p / p.rolling(200, min_periods=200).mean() - 1
|
||||
|
||||
|
||||
def _up_volume_proxy(p):
|
||||
ret = p.pct_change()
|
||||
return ret.where(ret > 0, 0).rolling(20, min_periods=15).sum()
|
||||
|
||||
|
||||
def _gap_up_freq(p):
|
||||
ret = p.pct_change()
|
||||
return (ret > 0.01).astype(float).rolling(60, min_periods=40).mean()
|
||||
|
||||
|
||||
def _consistent_returns(p):
|
||||
ret = p.pct_change()
|
||||
return (ret > 0).astype(float).rolling(252, min_periods=126).mean()
|
||||
|
||||
|
||||
def _rec_mom_filtered(p):
|
||||
rec = p / p.rolling(126, min_periods=126).min() - 1
|
||||
mom = p.shift(21).pct_change(105)
|
||||
return rec.where(mom > 0, np.nan)
|
||||
|
||||
|
||||
def _up_capture(p):
|
||||
ret = p.pct_change()
|
||||
mkt = ret.mean(axis=1)
|
||||
up_mkt = mkt > 0
|
||||
arr = ret.values.copy()
|
||||
arr[~up_mkt.values, :] = np.nan
|
||||
stock_up = pd.DataFrame(arr, index=ret.index, columns=ret.columns)
|
||||
mkt_up_vals = mkt.where(up_mkt, np.nan)
|
||||
stock_avg = stock_up.rolling(60, min_periods=20).mean()
|
||||
mkt_avg = mkt_up_vals.rolling(60, min_periods=20).mean()
|
||||
return stock_avg.div(mkt_avg, axis=0)
|
||||
|
||||
|
||||
def _down_resilience(p):
|
||||
ret = p.pct_change()
|
||||
mkt = ret.mean(axis=1)
|
||||
down_mkt = mkt < 0
|
||||
arr = ret.values.copy()
|
||||
arr[~down_mkt.values, :] = np.nan
|
||||
down_ret = pd.DataFrame(arr, index=ret.index, columns=ret.columns)
|
||||
return -down_ret.rolling(120, min_periods=30).mean()
|
||||
|
||||
|
||||
def _quality_mom(p):
|
||||
mom_r = _mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
|
||||
con_r = _consistent_returns(p).rank(axis=1, pct=True, na_option="keep")
|
||||
up_r = _up_volume_proxy(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return 0.4 * mom_r + 0.3 * con_r + 0.3 * up_r
|
||||
|
||||
|
||||
def _mom_x_gap(p):
|
||||
mom_r = _mom_12_1(p).rank(axis=1, pct=True, na_option="keep")
|
||||
gap_r = _gap_up_freq(p).rank(axis=1, pct=True, na_option="keep")
|
||||
return mom_r * gap_r
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Combo signal constructors (weighted rank sums)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _rank(df):
|
||||
return df.rank(axis=1, pct=True, na_option="keep")
|
||||
|
||||
|
||||
# US combos
|
||||
def signal_rec_mfilt_deep_upvol(p):
|
||||
rec_mfilt_r = _rank(_rec_mom_filtered(p))
|
||||
deep_upvol_r = _rank(_rec_126(p)) * _rank(_up_volume_proxy(p))
|
||||
deep_upvol_rr = _rank(deep_upvol_r)
|
||||
return 0.5 * rec_mfilt_r + 0.5 * deep_upvol_rr
|
||||
|
||||
|
||||
def signal_ma200_mom7m_rec126(p):
|
||||
return (0.33 * _rank(_above_ma200(p))
|
||||
+ 0.33 * _rank(_mom_intermediate(p))
|
||||
+ 0.34 * _rank(_rec_126(p)))
|
||||
|
||||
|
||||
def signal_rec_mfilt_ma200(p):
|
||||
return 0.5 * _rank(_rec_mom_filtered(p)) + 0.5 * _rank(_above_ma200(p))
|
||||
|
||||
|
||||
def signal_mom7m_rec126(p):
|
||||
return 0.5 * _rank(_mom_intermediate(p)) + 0.5 * _rank(_rec_126(p))
|
||||
|
||||
|
||||
# CN combos
|
||||
def signal_up_cap_quality_mom(p):
|
||||
return 0.5 * _rank(_up_capture(p)) + 0.5 * _rank(_quality_mom(p))
|
||||
|
||||
|
||||
def signal_down_resil_qual_mom(p):
|
||||
return 0.5 * _rank(_down_resilience(p)) + 0.5 * _rank(_quality_mom(p))
|
||||
|
||||
|
||||
def signal_rec63_mom_gap(p):
|
||||
return 0.5 * _rank(_rec_63(p)) + 0.5 * _rank(_mom_x_gap(p))
|
||||
|
||||
|
||||
def signal_up_cap_mom_gap(p):
|
||||
return 0.5 * _rank(_up_capture(p)) + 0.5 * _rank(_mom_x_gap(p))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Signal registry: name -> callable(prices) -> DataFrame
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
SIGNAL_REGISTRY = {
|
||||
# US
|
||||
"rec_mfilt+deep_upvol": signal_rec_mfilt_deep_upvol,
|
||||
"ma200+mom7m+rec126": signal_ma200_mom7m_rec126,
|
||||
"rec_mfilt+ma200": signal_rec_mfilt_ma200,
|
||||
"mom7m+rec126": signal_mom7m_rec126,
|
||||
# CN
|
||||
"up_cap+quality_mom": signal_up_cap_quality_mom,
|
||||
"down_resil+qual_mom": signal_down_resil_qual_mom,
|
||||
"rec63+mom_gap": signal_rec63_mom_gap,
|
||||
"up_cap+mom_gap": signal_up_cap_mom_gap,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Strategy class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class FactorComboStrategy(Strategy):
|
||||
"""
|
||||
Generic factor-combination strategy with configurable rebalancing frequency.
|
||||
|
||||
Parameters:
|
||||
signal_name: key into SIGNAL_REGISTRY
|
||||
rebal_freq: rebalancing interval in trading days (1=daily, 5=weekly, 10=biweekly, 21=monthly)
|
||||
top_n: number of stocks to hold
|
||||
"""
|
||||
|
||||
REBAL_LABELS = {1: "daily", 5: "weekly", 10: "biweekly", 21: "monthly"}
|
||||
|
||||
def __init__(self, signal_name: str, rebal_freq: int = 21, top_n: int = 10):
|
||||
if signal_name not in SIGNAL_REGISTRY:
|
||||
raise ValueError(f"Unknown signal: {signal_name}. "
|
||||
f"Available: {list(SIGNAL_REGISTRY.keys())}")
|
||||
self.signal_name = signal_name
|
||||
self.signal_func = SIGNAL_REGISTRY[signal_name]
|
||||
self.rebal_freq = rebal_freq
|
||||
self.top_n = top_n
|
||||
|
||||
def generate_signals(self, data: pd.DataFrame) -> pd.DataFrame:
|
||||
sig = self.signal_func(data)
|
||||
|
||||
# Select top_n by signal rank
|
||||
rank = sig.rank(axis=1, ascending=False, na_option="bottom")
|
||||
n_valid = sig.notna().sum(axis=1)
|
||||
enough = n_valid >= self.top_n
|
||||
top_mask = (rank <= self.top_n) & enough.values.reshape(-1, 1)
|
||||
|
||||
raw = top_mask.astype(float)
|
||||
row_sums = raw.sum(axis=1).replace(0, np.nan)
|
||||
signals = raw.div(row_sums, axis=0).fillna(0.0)
|
||||
|
||||
# Rebalance at configured frequency
|
||||
warmup = 252
|
||||
rebal_mask = pd.Series(False, index=data.index)
|
||||
rebal_indices = list(range(warmup, len(data), self.rebal_freq))
|
||||
rebal_mask.iloc[rebal_indices] = True
|
||||
|
||||
signals[~rebal_mask] = np.nan
|
||||
signals = signals.ffill().fillna(0.0)
|
||||
signals.iloc[:warmup] = 0.0
|
||||
|
||||
return signals.shift(1).fillna(0.0)
|
||||
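A hedged end-to-end sketch of the Strategy protocol with one of the US combos; the random-walk prices, the 5-ticker universe, and top_n=2 are illustrative only:

import numpy as np
import pandas as pd
from strategies.factor_combo import FactorComboStrategy

# Synthetic panel: 320 business days, 5 tickers, gentle upward drift.
rng = np.random.default_rng(0)
dates = pd.date_range("2022-01-03", periods=320, freq="B")
prices = pd.DataFrame(
    np.cumprod(1 + rng.normal(0.0005, 0.01, size=(320, 5)), axis=0) * 100,
    index=dates, columns=list("ABCDE"),
)

strat = FactorComboStrategy("mom7m+rec126", rebal_freq=21, top_n=2)
weights = strat.generate_signals(prices)
print(weights.iloc[-1].round(2).to_dict())        # two tickers at 0.5 each
assert abs(weights.iloc[-1].sum() - 1.0) < 1e-9   # fully invested after the 252-day warm-up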
tests/test_alpha_signals.py (new file, 118 lines)
@@ -0,0 +1,118 @@
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class AlphaSignalTests(unittest.TestCase):
|
||||
def test_build_regime_filter_requires_market_trend_and_non_market_leader(self):
|
||||
from research.regime_filters import build_regime_filter
|
||||
|
||||
dates = pd.date_range("2023-01-01", periods=260, freq="D")
|
||||
spy = pd.Series([100.0 + i for i in range(260)], index=dates)
|
||||
qqq_leader = pd.Series([100.0 + 1.4 * i for i in range(260)], index=dates)
|
||||
xlu = pd.Series([100.0 + 0.2 * i for i in range(260)], index=dates)
|
||||
|
||||
with warnings.catch_warnings(record=True) as caught:
|
||||
warnings.simplefilter("always")
|
||||
bullish = build_regime_filter(pd.DataFrame({"SPY": spy, "QQQ": qqq_leader, "XLU": xlu}))
|
||||
qqq_laggard = pd.Series([100.0 + 0.5 * i for i in range(260)], index=dates)
|
||||
no_leader = build_regime_filter(pd.DataFrame({"SPY": spy, "QQQ": qqq_laggard, "XLU": xlu}))
|
||||
|
||||
self.assertEqual(len(caught), 0)
|
||||
self.assertFalse(bool(bullish.iloc[199]))
|
||||
self.assertTrue(bool(bullish.iloc[-1]))
|
||||
self.assertFalse(bool(no_leader.iloc[-1]))
|
||||
|
||||
def test_build_regime_filter_handles_internal_missing_prices_without_warnings(self):
|
||||
from research.regime_filters import build_regime_filter
|
||||
|
||||
dates = pd.date_range("2023-01-01", periods=260, freq="D")
|
||||
spy = pd.Series([100.0 + i for i in range(260)], index=dates)
|
||||
qqq = pd.Series([100.0 + 1.4 * i for i in range(260)], index=dates)
|
||||
qqq.iloc[120] = np.nan
|
||||
etf_close = pd.DataFrame({"SPY": spy, "QQQ": qqq, "XLU": 100.0}, index=dates)
|
||||
|
||||
with warnings.catch_warnings(record=True) as caught:
|
||||
warnings.simplefilter("always")
|
||||
regime = build_regime_filter(etf_close)
|
||||
|
||||
self.assertEqual(len(caught), 0)
|
||||
self.assertEqual(str(regime.dtype), "bool")
|
||||
|
||||
def test_breakout_after_compression_score_is_shifted_and_rewards_breakout_profile(self):
|
||||
from research.event_factors import breakout_after_compression_score
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=80, freq="D")
|
||||
|
||||
aaa_close = [100.0 + i for i in range(60)] + [159.0 + 0.05 * i for i in range(20)]
|
||||
bbb_close = [100.0 + i for i in range(60)] + [150.0 - i for i in range(20)]
|
||||
close = pd.DataFrame({"AAA": aaa_close, "BBB": bbb_close}, index=dates)
|
||||
|
||||
high = pd.DataFrame(
|
||||
{
|
||||
"AAA": [value + 0.4 for value in aaa_close],
|
||||
"BBB": [value + 4.0 for value in bbb_close],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
low = pd.DataFrame(
|
||||
{
|
||||
"AAA": [value - 0.4 for value in aaa_close],
|
||||
"BBB": [value - 4.0 for value in bbb_close],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
volume = pd.DataFrame(
|
||||
{
|
||||
"AAA": [1_000.0] * 79 + [1_000.0],
|
||||
"BBB": [1_000.0] * 80,
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
volume.loc[dates[-2], "AAA"] = 6_000.0
|
||||
|
||||
shifted_result = breakout_after_compression_score(close, high, low, volume)
|
||||
self.assertGreater(
|
||||
shifted_result.loc[dates[-1], "AAA"],
|
||||
shifted_result.loc[dates[-1], "BBB"],
|
||||
)
|
||||
|
||||
changed_last_day = close.copy()
|
||||
changed_last_day_high = high.copy()
|
||||
changed_last_day_low = low.copy()
|
||||
changed_last_day_volume = volume.copy()
|
||||
changed_last_day.loc[dates[-1], "AAA"] = 120.0
|
||||
changed_last_day_high.loc[dates[-1], "AAA"] = 130.0
|
||||
changed_last_day_low.loc[dates[-1], "AAA"] = 110.0
|
||||
changed_last_day_volume.loc[dates[-1], "AAA"] = 20_000.0
|
||||
|
||||
last_day_changed_result = breakout_after_compression_score(
|
||||
changed_last_day,
|
||||
changed_last_day_high,
|
||||
changed_last_day_low,
|
||||
changed_last_day_volume,
|
||||
)
|
||||
self.assertEqual(
|
||||
shifted_result.loc[dates[-1], "AAA"],
|
||||
last_day_changed_result.loc[dates[-1], "AAA"],
|
||||
)
|
||||
|
||||
def test_breakout_after_compression_score_keeps_float_output_when_denominators_hit_zero(self):
|
||||
from research.event_factors import breakout_after_compression_score
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=70, freq="D")
|
||||
close = pd.DataFrame({"AAA": [10.0] * 70}, index=dates)
|
||||
high = pd.DataFrame({"AAA": [10.0] * 70}, index=dates)
|
||||
low = pd.DataFrame({"AAA": [10.0] * 70}, index=dates)
|
||||
volume = pd.DataFrame({"AAA": [0.0] * 70}, index=dates)
|
||||
|
||||
score = breakout_after_compression_score(close, high, low, volume)
|
||||
|
||||
self.assertEqual(str(score.dtypes["AAA"]), "float64")
|
||||
self.assertTrue(pd.isna(score.iloc[-1]["AAA"]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
tests/test_factor_attribution.py (new file, 1421 lines): diff suppressed because it is too large
tests/test_fetch_historical.py (new file, 46 lines)
@@ -0,0 +1,46 @@
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from research import fetch_historical
|
||||
|
||||
|
||||
class FetchHistoricalTests(unittest.TestCase):
|
||||
def test_fetch_all_historical_ohlcv_writes_field_specific_csvs(self):
|
||||
dates = pd.to_datetime(["2024-01-02", "2024-01-03"])
|
||||
raw = pd.DataFrame(
|
||||
{
|
||||
("Close", "AAA"): [10.0, 11.0],
|
||||
("Close", "BBB"): [20.0, 21.0],
|
||||
("High", "AAA"): [10.5, 11.5],
|
||||
("High", "BBB"): [20.5, 21.5],
|
||||
("Low", "AAA"): [9.5, 10.5],
|
||||
("Low", "BBB"): [19.5, 20.5],
|
||||
("Volume", "AAA"): [1000.0, 1100.0],
|
||||
("Volume", "BBB"): [2000.0, 2100.0],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
raw.columns = pd.MultiIndex.from_tuples(raw.columns)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
with mock.patch.object(fetch_historical, "DATA_DIR", tmpdir):
|
||||
with mock.patch.object(fetch_historical, "OUT_PATH", str(Path(tmpdir) / "us_pit.csv")):
|
||||
with mock.patch("research.fetch_historical.uh.load_sp500_history", return_value={"AAA": [[None, None]], "BBB": [[None, None]]}):
|
||||
with mock.patch("research.fetch_historical.uh.all_tickers_ever", return_value=["AAA", "BBB"]):
|
||||
with mock.patch("research.fetch_historical.yf.download", return_value=raw):
|
||||
panels = fetch_historical.fetch_all_historical_ohlcv(force=True)
|
||||
|
||||
self.assertEqual(set(panels.keys()), {"close", "high", "low", "volume"})
|
||||
self.assertTrue((Path(tmpdir) / "us_pit.csv").exists())
|
||||
self.assertTrue((Path(tmpdir) / "us_pit_close.csv").exists())
|
||||
self.assertTrue((Path(tmpdir) / "us_pit_high.csv").exists())
|
||||
self.assertTrue((Path(tmpdir) / "us_pit_low.csv").exists())
|
||||
self.assertTrue((Path(tmpdir) / "us_pit_volume.csv").exists())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
tests/test_market_data.py (new file, 144 lines)
@@ -0,0 +1,144 @@
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import data_manager
|
||||
|
||||
|
||||
class UpdateMarketDataTests(unittest.TestCase):
|
||||
def test_update_market_data_accepts_lowercase_fields_and_does_not_fill_volume(self):
|
||||
dates = pd.to_datetime(["2024-01-02", "2024-01-03", "2024-01-04"])
|
||||
raw = pd.DataFrame(
|
||||
{
|
||||
("Close", "AAA"): [10.0, 11.0, 12.0],
|
||||
("Close", "BBB"): [20.0, float("nan"), 22.0],
|
||||
("Open", "AAA"): [9.5, 10.5, 11.5],
|
||||
("Open", "BBB"): [19.5, 20.5, 21.5],
|
||||
("High", "AAA"): [10.5, 11.5, 12.5],
|
||||
("High", "BBB"): [20.5, 21.5, 22.5],
|
||||
("Low", "AAA"): [9.0, 10.0, 11.0],
|
||||
("Low", "BBB"): [19.0, 20.0, 21.0],
|
||||
("Volume", "AAA"): [1000, 1100, 1200],
|
||||
("Volume", "BBB"): [2000, float("nan"), 2200],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
raw.columns = pd.MultiIndex.from_tuples(raw.columns)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
with mock.patch.object(data_manager, "DATA_DIR", tmpdir):
|
||||
with mock.patch("data_manager.yf.download", return_value=raw) as mocked_download:
|
||||
panels = data_manager.update_market_data(
|
||||
"us",
|
||||
["AAA", "BBB"],
|
||||
["close", "open", "high", "low", "volume"],
|
||||
)
|
||||
self.assertEqual(set(panels), {"close", "open", "high", "low", "volume"})
|
||||
self.assertEqual(panels["close"].loc[dates[1], "BBB"], 20.0)
|
||||
self.assertTrue(pd.isna(panels["volume"].loc[dates[1], "BBB"]))
|
||||
self.assertTrue((Path(tmpdir) / "us.csv").exists())
|
||||
self.assertTrue((Path(tmpdir) / "us_open.csv").exists())
|
||||
self.assertTrue((Path(tmpdir) / "us_high.csv").exists())
|
||||
self.assertTrue((Path(tmpdir) / "us_low.csv").exists())
|
||||
self.assertTrue((Path(tmpdir) / "us_volume.csv").exists())
|
||||
|
||||
saved_high = pd.read_csv(Path(tmpdir) / "us_high.csv", index_col=0, parse_dates=True)
|
||||
pd.testing.assert_frame_equal(saved_high, panels["high"], check_freq=False)
|
||||
|
||||
self.assertEqual(mocked_download.call_args.args[0], ["AAA", "BBB"])
|
||||
self.assertEqual(mocked_download.call_args.kwargs["auto_adjust"], True)
|
||||
self.assertIn("start", mocked_download.call_args.kwargs)
|
||||
|
||||
def test_update_market_data_rejects_unsupported_fields(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
with mock.patch.object(data_manager, "DATA_DIR", tmpdir):
|
||||
with self.assertRaisesRegex(ValueError, "Unsupported market data field: adjusted_close"):
|
||||
data_manager.update_market_data("us", ["AAA"], ["adjusted_close"])
|
||||
|
||||
def test_update_market_data_preserves_existing_cache_columns_and_dates(self):
|
||||
existing_dates = pd.to_datetime(["2024-01-01", "2024-01-02"])
|
||||
new_dates = pd.to_datetime(["2024-01-02", "2024-01-03"])
|
||||
existing_close = pd.DataFrame(
|
||||
{
|
||||
"AAA": [9.0, 10.0],
|
||||
"CCC": [30.0, 31.0],
|
||||
},
|
||||
index=existing_dates,
|
||||
)
|
||||
downloaded_close = pd.DataFrame({"Close": [10.5, 11.5]}, index=new_dates)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
existing_close.to_csv(Path(tmpdir) / "us.csv")
|
||||
with mock.patch.object(data_manager, "DATA_DIR", tmpdir):
|
||||
with mock.patch("data_manager.yf.download", return_value=downloaded_close):
|
||||
panels = data_manager.update_market_data("us", ["AAA"], ["close"])
|
||||
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"AAA": [9.0, 10.5, 11.5],
|
||||
"CCC": [30.0, 31.0, float("nan")],
|
||||
},
|
||||
index=pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03"]),
|
||||
)
|
||||
saved_close = pd.read_csv(Path(tmpdir) / "us.csv", index_col=0, parse_dates=True)
|
||||
|
||||
pd.testing.assert_frame_equal(panels["close"], expected, check_freq=False)
|
||||
pd.testing.assert_frame_equal(saved_close, expected, check_freq=False)
|
||||
|
||||
def test_update_market_data_volume_merge_can_clear_stale_cached_values(self):
|
||||
existing_dates = pd.to_datetime(["2024-01-01", "2024-01-02"])
|
||||
new_dates = pd.to_datetime(["2024-01-02", "2024-01-03", "2024-01-04"])
|
||||
existing_volume = pd.DataFrame(
|
||||
{
|
||||
"AAA": [1000.0, 9999.0],
|
||||
"CCC": [3000.0, 3100.0],
|
||||
},
|
||||
index=existing_dates,
|
||||
)
|
||||
downloaded_volume = pd.DataFrame({"Volume": [float("nan"), 1200.0, 1300.0]}, index=new_dates)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
existing_volume.to_csv(Path(tmpdir) / "us_volume.csv")
|
||||
with mock.patch.object(data_manager, "DATA_DIR", tmpdir):
|
||||
with mock.patch("data_manager.yf.download", return_value=downloaded_volume):
|
||||
panels = data_manager.update_market_data("us", ["AAA"], ["volume"])
|
||||
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"AAA": [1000.0, float("nan"), 1200.0, 1300.0],
|
||||
"CCC": [3000.0, 3100.0, float("nan"), float("nan")],
|
||||
},
|
||||
index=pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"]),
|
||||
)
|
||||
saved_volume = pd.read_csv(Path(tmpdir) / "us_volume.csv", index_col=0, parse_dates=True)
|
||||
|
||||
pd.testing.assert_frame_equal(panels["volume"], expected, check_freq=False)
|
||||
pd.testing.assert_frame_equal(saved_volume, expected, check_freq=False)
|
||||
|
||||
def test_update_market_data_handles_single_ticker_multiindex_download(self):
|
||||
dates = pd.to_datetime(["2024-01-02", "2024-01-03"])
|
||||
raw = pd.DataFrame(
|
||||
{
|
||||
("Close", "AAA"): [10.0, 11.0],
|
||||
("Volume", "AAA"): [1000.0, 1100.0],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
raw.columns = pd.MultiIndex.from_tuples(raw.columns)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
with mock.patch.object(data_manager, "DATA_DIR", tmpdir):
|
||||
with mock.patch("data_manager.yf.download", return_value=raw):
|
||||
panels = data_manager.update_market_data("us", ["AAA"], ["close", "volume"])
|
||||
|
||||
expected_close = pd.DataFrame({"AAA": [10.0, 11.0]}, index=dates)
|
||||
expected_volume = pd.DataFrame({"AAA": [1000.0, 1100.0]}, index=dates)
|
||||
pd.testing.assert_frame_equal(panels["close"], expected_close, check_freq=False)
|
||||
pd.testing.assert_frame_equal(panels["volume"], expected_volume, check_freq=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
tests/test_us_alpha_pipeline.py (new file, 164 lines)
@@ -0,0 +1,164 @@
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class USAlphaPipelineTests(unittest.TestCase):
|
||||
def test_build_equal_weight_portfolio_caps_holdings_under_ties(self):
|
||||
from research.us_alpha_pipeline import _build_equal_weight_portfolio
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=2, freq="D")
|
||||
score = pd.DataFrame(
|
||||
{
|
||||
"AAA": [0.9, 0.9],
|
||||
"BBB": [0.9, 0.9],
|
||||
"CCC": [0.9, 0.9],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
tradable_mask = pd.DataFrame(True, index=dates, columns=score.columns)
|
||||
regime = pd.Series([True, True], index=dates)
|
||||
|
||||
weights = _build_equal_weight_portfolio(score, tradable_mask, regime, top_n=2)
|
||||
|
||||
self.assertEqual(int((weights.iloc[-1] > 0).sum()), 2)
|
||||
self.assertAlmostEqual(float(weights.iloc[-1].sum()), 1.0)
|
||||
|
||||
def test_equity_curve_uses_prior_day_weights_for_returns(self):
|
||||
from research.us_alpha_pipeline import _equity_curve
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=3, freq="D")
|
||||
close = pd.DataFrame({"AAA": [1.0, 2.0, 4.0]}, index=dates)
|
||||
weights = pd.DataFrame({"AAA": [0.0, 1.0, 0.0]}, index=dates)
|
||||
|
||||
equity = _equity_curve(close, weights)
|
||||
|
||||
self.assertEqual(float(equity.iloc[1]), 2.0)
|
||||
self.assertEqual(float(equity.iloc[2]), 2.0)
|
||||
|
||||
def test_summarize_equity_window_returns_nans_when_history_is_too_short(self):
|
||||
from research.us_alpha_report import summarize_equity_window
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=10, freq="D")
|
||||
equity = pd.Series([1.0 + 0.01 * i for i in range(10)], index=dates)
|
||||
|
||||
summary = summarize_equity_window(equity, "demo", window_years=1)
|
||||
|
||||
self.assertTrue(pd.isna(summary["CAGR"]))
|
||||
self.assertTrue(pd.isna(summary["Sharpe"]))
|
||||
self.assertTrue(pd.isna(summary["MaxDD"]))
|
||||
self.assertTrue(pd.isna(summary["TotalRet"]))
|
||||
|
||||
def test_run_alpha_pipeline_returns_expected_strategy_summary(self):
|
||||
from research.us_alpha_pipeline import run_alpha_pipeline
|
||||
|
||||
dates = pd.date_range("2023-01-01", periods=400, freq="D")
|
||||
|
||||
aaa_close = [50.0 + 0.20 * i for i in range(400)]
|
||||
bbb_close = [55.0 + 0.12 * i for i in range(400)]
|
||||
ccc_close = [60.0 + 0.05 * i for i in range(400)]
|
||||
close = pd.DataFrame(
|
||||
{
|
||||
"AAA": aaa_close,
|
||||
"BBB": bbb_close,
|
||||
"CCC": ccc_close,
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
high = pd.DataFrame(
|
||||
{
|
||||
"AAA": [value + 0.5 for value in aaa_close],
|
||||
"BBB": [value + 1.0 for value in bbb_close],
|
||||
"CCC": [value + 1.5 for value in ccc_close],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
low = pd.DataFrame(
|
||||
{
|
||||
"AAA": [value - 0.5 for value in aaa_close],
|
||||
"BBB": [value - 1.0 for value in bbb_close],
|
||||
"CCC": [value - 1.5 for value in ccc_close],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
volume = pd.DataFrame(
|
||||
{
|
||||
"AAA": [1_500_000.0] * 400,
|
||||
"BBB": [1_400_000.0] * 400,
|
||||
"CCC": [1_300_000.0] * 400,
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
volume.loc[dates[-2], "AAA"] = 4_000_000.0
|
||||
|
||||
etf_close = pd.DataFrame(
|
||||
{
|
||||
"SPY": [300.0 + 0.8 * i for i in range(400)],
|
||||
"QQQ": [280.0 + 1.1 * i for i in range(400)],
|
||||
"XLF": [200.0 + 0.4 * i for i in range(400)],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
|
||||
market_data = {
|
||||
"close": close,
|
||||
"high": high,
|
||||
"low": low,
|
||||
"volume": volume,
|
||||
}
|
||||
|
||||
summary = run_alpha_pipeline(
|
||||
market_data=market_data,
|
||||
etf_close=etf_close,
|
||||
pit_membership=None,
|
||||
windows=(1,),
|
||||
top_n=2,
|
||||
)
|
||||
|
||||
required_columns = {"strategy", "window_years", "CAGR", "Sharpe", "MaxDD", "TotalRet"}
|
||||
self.assertTrue(required_columns.issubset(summary.columns))
|
||||
self.assertEqual(set(summary["strategy"]), {"breakout_regime", "rank_blend_regime"})
|
||||
self.assertEqual(set(summary["window_years"]), {1})
|
||||
self.assertEqual(len(summary), 2)
|
||||
self.assertTrue(summary[["CAGR", "Sharpe", "MaxDD", "TotalRet"]].notna().all().all())
|
||||
|
||||
def test_run_saved_pit_alpha_pipeline_reads_saved_inputs(self):
|
||||
from research.us_alpha_pipeline import run_saved_pit_alpha_pipeline
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=320, freq="D")
|
||||
close = pd.DataFrame(
|
||||
{
|
||||
"AAA": [50.0 + 0.2 * i for i in range(320)],
|
||||
"BBB": [40.0 + 0.1 * i for i in range(320)],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
high = close + 1.0
|
||||
low = close - 1.0
|
||||
volume = pd.DataFrame({"AAA": [2_500_000.0] * 320, "BBB": [2_000_000.0] * 320}, index=dates)
|
||||
etf_close = pd.DataFrame(
|
||||
{"SPY": [300.0 + 0.8 * i for i in range(320)], "QQQ": [280.0 + 1.1 * i for i in range(320)]},
|
||||
index=dates,
|
||||
)
|
||||
|
||||
with self.subTest("saved_inputs"):
|
||||
import tempfile
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
close.to_csv(Path(tmpdir) / "us_pit_close.csv")
|
||||
high.to_csv(Path(tmpdir) / "us_pit_high.csv")
|
||||
low.to_csv(Path(tmpdir) / "us_pit_low.csv")
|
||||
volume.to_csv(Path(tmpdir) / "us_pit_volume.csv")
|
||||
etf_close.to_csv(Path(tmpdir) / "us_etf.csv")
|
||||
|
||||
intervals = {"AAA": [[None, None]], "BBB": [[None, None]]}
|
||||
with mock.patch("research.us_alpha_pipeline.uh.load_sp500_history", return_value=intervals):
|
||||
summary = run_saved_pit_alpha_pipeline(data_dir=tmpdir, windows=(1,), top_n=1)
|
||||
|
||||
self.assertEqual(set(summary["strategy"]), {"breakout_regime", "rank_blend_regime"})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
tests/test_us_universe.py (new file, 213 lines)
@@ -0,0 +1,213 @@
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class BuildTradableMaskTests(unittest.TestCase):
|
||||
def test_build_tradable_mask_uses_only_lagged_price_and_liquidity_inputs(self):
|
||||
from research.us_universe import build_tradable_mask
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=4, freq="D")
|
||||
close = pd.DataFrame({"AAA": [4.0, 10.0, 10.0, 10.0]}, index=dates)
|
||||
volume = pd.DataFrame({"AAA": [float("nan"), 200.0, 200.0, 200.0]}, index=dates)
|
||||
|
||||
mask = build_tradable_mask(
|
||||
close=close,
|
||||
volume=volume,
|
||||
pit_membership=None,
|
||||
min_price=5.0,
|
||||
min_dollar_volume=1000.0,
|
||||
min_history_days=2,
|
||||
min_valid_volume_days=2,
|
||||
liquidity_window=2,
|
||||
)
|
||||
|
||||
expected = pd.DataFrame({"AAA": [False, False, False, True]}, index=dates, dtype=bool)
|
||||
pd.testing.assert_frame_equal(mask, expected)
|
||||
|
||||
def test_build_tradable_mask_uses_only_lagged_history(self):
|
||||
from research.us_universe import build_tradable_mask
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=4, freq="D")
|
||||
close = pd.DataFrame({"AAA": [10.0, float("nan"), 10.0, 10.0]}, index=dates)
|
||||
volume = pd.DataFrame({"AAA": [200.0, 200.0, 200.0, 200.0]}, index=dates)
|
||||
|
||||
mask = build_tradable_mask(
|
||||
close=close,
|
||||
volume=volume,
|
||||
pit_membership=None,
|
||||
min_price=5.0,
|
||||
min_dollar_volume=1_000.0,
|
||||
min_history_days=2,
|
||||
min_valid_volume_days=1,
|
||||
liquidity_window=1,
|
||||
)
|
||||
|
||||
expected = pd.DataFrame({"AAA": [False, False, False, False]}, index=dates, dtype=bool)
|
||||
pd.testing.assert_frame_equal(mask, expected)
|
||||
|
||||
def test_build_tradable_mask_requires_membership_history_before_first_eligible_day(self):
|
||||
from research.us_universe import build_tradable_mask
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=4, freq="D")
|
||||
close = pd.DataFrame({"AAA": [10.0, 10.0, 10.0, 10.0]}, index=dates)
|
||||
volume = pd.DataFrame({"AAA": [200.0, 200.0, 200.0, 200.0]}, index=dates)
|
||||
pit_membership = pd.DataFrame({"AAA": [False, False, True, True]}, index=dates)
|
||||
|
||||
mask = build_tradable_mask(
|
||||
close=close,
|
||||
volume=volume,
|
||||
pit_membership=pit_membership,
|
||||
min_price=5.0,
|
||||
min_dollar_volume=1_000.0,
|
||||
min_history_days=1,
|
||||
min_valid_volume_days=1,
|
||||
liquidity_window=1,
|
||||
)
|
||||
|
||||
expected = pd.DataFrame({"AAA": [False, False, False, True]}, index=dates, dtype=bool)
|
||||
pd.testing.assert_frame_equal(mask, expected)
|
||||
|
||||
def test_build_tradable_mask_aligns_pit_membership_without_truthy_carryover(self):
|
||||
from research.us_universe import build_tradable_mask
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=3, freq="D")
|
||||
close = pd.DataFrame(
|
||||
{
|
||||
"AAA": [10.0, 10.0, 10.0],
|
||||
"BBB": [12.0, 12.0, 12.0],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
volume = pd.DataFrame(
|
||||
{
|
||||
"AAA": [1_000_000.0, 1_000_000.0, 1_000_000.0],
|
||||
"BBB": [1_000_000.0, 1_000_000.0, 1_000_000.0],
|
||||
},
|
||||
index=dates,
|
||||
)
|
||||
pit_membership = pd.DataFrame(
|
||||
{
|
||||
"BBB": [True, True, False],
|
||||
"CCC": [True, True, True],
|
||||
},
|
||||
index=pd.date_range("2024-01-02", periods=3, freq="D"),
|
||||
)
|
||||
|
||||
with warnings.catch_warnings(record=True) as caught:
|
||||
warnings.simplefilter("always")
|
||||
mask = build_tradable_mask(
|
||||
close=close,
|
||||
volume=volume,
|
||||
pit_membership=pit_membership,
|
||||
min_price=5.0,
|
||||
min_dollar_volume=1_000.0,
|
||||
min_history_days=1,
|
||||
min_valid_volume_days=1,
|
||||
liquidity_window=1,
|
||||
)
|
||||
|
||||
self.assertEqual(len(caught), 0)
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"AAA": [False, False, False],
|
||||
"BBB": [False, False, True],
|
||||
},
|
||||
index=dates,
|
||||
dtype=bool,
|
||||
)
|
||||
pd.testing.assert_frame_equal(mask, expected)
|
||||
|
||||
def test_build_tradable_mask_treats_missing_membership_cells_as_false(self):
|
||||
from research.us_universe import build_tradable_mask
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=3, freq="D")
|
||||
close = pd.DataFrame({"AAA": [10.0, 10.0, 10.0]}, index=dates)
|
||||
volume = pd.DataFrame({"AAA": [1_000_000.0, 1_000_000.0, 1_000_000.0]}, index=dates)
|
||||
pit_membership = pd.DataFrame(
|
||||
{"AAA": [True, pd.NA, True]},
|
||||
index=dates,
|
||||
dtype="boolean",
|
||||
)
|
||||
|
||||
mask = build_tradable_mask(
|
||||
close=close,
|
||||
volume=volume,
|
||||
pit_membership=pit_membership,
|
||||
min_price=5.0,
|
||||
min_dollar_volume=1_000.0,
|
||||
min_history_days=1,
|
||||
min_valid_volume_days=1,
|
||||
liquidity_window=1,
|
||||
)
|
||||
|
||||
expected = pd.DataFrame({"AAA": [False, False, False]}, index=dates, dtype=bool)
|
||||
pd.testing.assert_frame_equal(mask, expected)
|
||||
|
||||
def test_build_tradable_mask_uses_strict_thresholds(self):
|
||||
from research.us_universe import build_tradable_mask
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=3, freq="D")
|
||||
close = pd.DataFrame({"AAA": [5.0, 5.0, 5.0]}, index=dates)
|
||||
volume = pd.DataFrame({"AAA": [300.0, 300.0, 300.0]}, index=dates)
|
||||
|
||||
mask = build_tradable_mask(
|
||||
close=close,
|
||||
volume=volume,
|
||||
pit_membership=None,
|
||||
min_price=5.0,
|
||||
min_dollar_volume=1_000.0,
|
||||
min_history_days=1,
|
||||
min_valid_volume_days=1,
|
||||
liquidity_window=1,
|
||||
)
|
||||
|
||||
expected = pd.DataFrame({"AAA": [False, False, False]}, index=dates, dtype=bool)
|
||||
pd.testing.assert_frame_equal(mask, expected)
|
||||
|
||||
def test_build_tradable_mask_uses_strict_dollar_volume_threshold(self):
|
||||
from research.us_universe import build_tradable_mask
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=3, freq="D")
|
||||
close = pd.DataFrame({"AAA": [8.0, 8.0, 8.0]}, index=dates)
|
||||
volume = pd.DataFrame({"AAA": [125.0, 125.0, 125.0]}, index=dates)
|
||||
|
||||
mask = build_tradable_mask(
|
||||
close=close,
|
||||
volume=volume,
|
||||
pit_membership=None,
|
||||
min_price=5.0,
|
||||
min_dollar_volume=1_000.0,
|
||||
min_history_days=1,
|
||||
min_valid_volume_days=1,
|
||||
liquidity_window=1,
|
||||
)
|
||||
|
||||
expected = pd.DataFrame({"AAA": [False, False, False]}, index=dates, dtype=bool)
|
||||
pd.testing.assert_frame_equal(mask, expected)
|
||||
|
||||
def test_build_tradable_mask_requires_valid_dollar_volume_history(self):
|
||||
from research.us_universe import build_tradable_mask
|
||||
|
||||
dates = pd.date_range("2024-01-01", periods=4, freq="D")
|
||||
close = pd.DataFrame({"AAA": [10.0, float("nan"), 10.0, 10.0]}, index=dates)
|
||||
volume = pd.DataFrame({"AAA": [200.0, 200.0, 200.0, 200.0]}, index=dates)
|
||||
|
||||
mask = build_tradable_mask(
|
||||
close=close,
|
||||
volume=volume,
|
||||
pit_membership=None,
|
||||
min_price=5.0,
|
||||
min_dollar_volume=1_000.0,
|
||||
min_history_days=1,
|
||||
min_valid_volume_days=2,
|
||||
liquidity_window=2,
|
||||
)
|
||||
|
||||
expected = pd.DataFrame({"AAA": [False, False, False, False]}, index=dates, dtype=bool)
|
||||
pd.testing.assert_frame_equal(mask, expected)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
trader.py (61 lines changed)
@@ -40,6 +40,7 @@ import yfinance as yf
|
||||
import data_manager
|
||||
from strategies.buy_and_hold import BuyAndHoldStrategy
|
||||
from strategies.dual_momentum import DualMomentumStrategy
|
||||
from strategies.factor_combo import FactorComboStrategy
|
||||
from strategies.inverse_vol import InverseVolatilityStrategy
|
||||
from strategies.momentum import MomentumStrategy
|
||||
from strategies.momentum_quality import MomentumQualityStrategy
|
||||
@@ -47,11 +48,22 @@ from strategies.recovery_momentum import RecoveryMomentumStrategy
|
||||
from strategies.trend_following import TrendFollowingStrategy
|
||||
from universe import UNIVERSES
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-market fixed trading fees (per trade, in the market's local currency)
|
||||
# ---------------------------------------------------------------------------
|
||||
# These are applied automatically by cmd_monitor and cmd_auto; they can still
|
||||
# be overridden by explicitly passing --fixed-fee on the CLI.
|
||||
MARKET_FEES = {
|
||||
"us": 2.0, # USD per trade
|
||||
"cn": 5.0, # CNY per trade (A-share minimum commission)
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Strategy registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
STRATEGY_REGISTRY = {
|
||||
# --- Original strategies ---
|
||||
"recovery_mom_top10": lambda **kw: RecoveryMomentumStrategy(top_n=10),
|
||||
"recovery_mom_top20": lambda **kw: RecoveryMomentumStrategy(top_n=20),
|
||||
"recovery_mom_top50": lambda **kw: RecoveryMomentumStrategy(top_n=50),
|
||||
@@ -61,6 +73,40 @@ STRATEGY_REGISTRY = {
|
||||
"inverse_vol": lambda **kw: InverseVolatilityStrategy(vol_window=20),
|
||||
"trend_following": lambda **kw: TrendFollowingStrategy(top_n=kw.get("top_n", 20)),
|
||||
"buy_and_hold": lambda **kw: BuyAndHoldStrategy(),
|
||||
# --- Factor combo: US champions ---
|
||||
"fc_rec_mfilt_deep_upvol_daily": lambda **kw: FactorComboStrategy("rec_mfilt+deep_upvol", rebal_freq=1),
|
||||
"fc_rec_mfilt_deep_upvol_weekly": lambda **kw: FactorComboStrategy("rec_mfilt+deep_upvol", rebal_freq=5),
|
||||
"fc_rec_mfilt_deep_upvol_biweekly": lambda **kw: FactorComboStrategy("rec_mfilt+deep_upvol", rebal_freq=10),
|
||||
"fc_rec_mfilt_deep_upvol_monthly": lambda **kw: FactorComboStrategy("rec_mfilt+deep_upvol", rebal_freq=21),
|
||||
"fc_ma200_mom7m_rec126_daily": lambda **kw: FactorComboStrategy("ma200+mom7m+rec126", rebal_freq=1),
|
||||
"fc_ma200_mom7m_rec126_weekly": lambda **kw: FactorComboStrategy("ma200+mom7m+rec126", rebal_freq=5),
|
||||
"fc_ma200_mom7m_rec126_biweekly": lambda **kw: FactorComboStrategy("ma200+mom7m+rec126", rebal_freq=10),
|
||||
"fc_ma200_mom7m_rec126_monthly": lambda **kw: FactorComboStrategy("ma200+mom7m+rec126", rebal_freq=21),
|
||||
"fc_rec_mfilt_ma200_daily": lambda **kw: FactorComboStrategy("rec_mfilt+ma200", rebal_freq=1),
|
||||
"fc_rec_mfilt_ma200_weekly": lambda **kw: FactorComboStrategy("rec_mfilt+ma200", rebal_freq=5),
|
||||
"fc_rec_mfilt_ma200_biweekly": lambda **kw: FactorComboStrategy("rec_mfilt+ma200", rebal_freq=10),
|
||||
"fc_rec_mfilt_ma200_monthly": lambda **kw: FactorComboStrategy("rec_mfilt+ma200", rebal_freq=21),
|
||||
"fc_mom7m_rec126_daily": lambda **kw: FactorComboStrategy("mom7m+rec126", rebal_freq=1),
|
||||
"fc_mom7m_rec126_weekly": lambda **kw: FactorComboStrategy("mom7m+rec126", rebal_freq=5),
|
||||
"fc_mom7m_rec126_biweekly": lambda **kw: FactorComboStrategy("mom7m+rec126", rebal_freq=10),
|
||||
"fc_mom7m_rec126_monthly": lambda **kw: FactorComboStrategy("mom7m+rec126", rebal_freq=21),
|
||||
# --- Factor combo: CN champions ---
|
||||
"fc_up_cap_quality_mom_daily": lambda **kw: FactorComboStrategy("up_cap+quality_mom", rebal_freq=1),
|
||||
"fc_up_cap_quality_mom_weekly": lambda **kw: FactorComboStrategy("up_cap+quality_mom", rebal_freq=5),
|
||||
"fc_up_cap_quality_mom_biweekly": lambda **kw: FactorComboStrategy("up_cap+quality_mom", rebal_freq=10),
|
||||
"fc_up_cap_quality_mom_monthly": lambda **kw: FactorComboStrategy("up_cap+quality_mom", rebal_freq=21),
|
||||
"fc_down_resil_qual_mom_daily": lambda **kw: FactorComboStrategy("down_resil+qual_mom", rebal_freq=1),
|
||||
"fc_down_resil_qual_mom_weekly": lambda **kw: FactorComboStrategy("down_resil+qual_mom", rebal_freq=5),
|
||||
"fc_down_resil_qual_mom_biweekly": lambda **kw: FactorComboStrategy("down_resil+qual_mom", rebal_freq=10),
|
||||
"fc_down_resil_qual_mom_monthly": lambda **kw: FactorComboStrategy("down_resil+qual_mom", rebal_freq=21),
|
||||
"fc_rec63_mom_gap_daily": lambda **kw: FactorComboStrategy("rec63+mom_gap", rebal_freq=1),
|
||||
"fc_rec63_mom_gap_weekly": lambda **kw: FactorComboStrategy("rec63+mom_gap", rebal_freq=5),
|
||||
"fc_rec63_mom_gap_biweekly": lambda **kw: FactorComboStrategy("rec63+mom_gap", rebal_freq=10),
|
||||
"fc_rec63_mom_gap_monthly": lambda **kw: FactorComboStrategy("rec63+mom_gap", rebal_freq=21),
|
||||
"fc_up_cap_mom_gap_daily": lambda **kw: FactorComboStrategy("up_cap+mom_gap", rebal_freq=1),
|
||||
"fc_up_cap_mom_gap_weekly": lambda **kw: FactorComboStrategy("up_cap+mom_gap", rebal_freq=5),
|
||||
"fc_up_cap_mom_gap_biweekly": lambda **kw: FactorComboStrategy("up_cap+mom_gap", rebal_freq=10),
|
||||
"fc_up_cap_mom_gap_monthly": lambda **kw: FactorComboStrategy("up_cap+mom_gap", rebal_freq=21),
|
||||
}
|
||||
|
||||
|
||||
@@ -484,6 +530,12 @@ def cmd_evening(args):
|
||||
|
||||
post_value = portfolio_value(state["holdings"], close_prices, state["cash"])
|
||||
state["daily_equity"][trade_date] = round(post_value, 2)
|
||||
|
||||
# Record daily snapshot so daily_log stays complete even on no-trade days
|
||||
eq_vals = list(state["daily_equity"].values())
|
||||
prev_eq = eq_vals[-2] if len(eq_vals) >= 2 else state["initial_capital"]
|
||||
record_daily_snapshot(state, trade_date, close_prices, exec_trades, prev_eq)
|
||||
|
||||
state["pending_trades"] = None
|
||||
state["last_evening"] = trade_date
|
||||
save_state(state, market, strategy_name)
|
||||
@@ -1046,12 +1098,13 @@ def cmd_monitor(args):
|
||||
print(f" MONITOR MODE — {len(markets)} market(s), "
|
||||
f"{len(strategies)} strategies each")
|
||||
print(f" Capital: ${args.capital:,.0f} | "
|
||||
f"Fee: ${args.fixed_fee:.2f}/trade | "
|
||||
f"Integer shares: {args.integer_shares}")
|
||||
for mkt, sched in market_schedules.items():
|
||||
fee = MARKET_FEES.get(mkt, args.fixed_fee)
|
||||
print(f" {sched['label']}:")
|
||||
print(f" Morning: {sched['morn_h']:02d}:{sched['morn_m']:02d} {sched['tz']}")
|
||||
print(f" Evening: {sched['eve_h']:02d}:{sched['eve_m']:02d} {sched['tz']}")
|
||||
print(f" Fixed fee: {fee:.2f}/trade")
|
||||
print(f" Strategies: {', '.join(strategies)}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
@@ -1096,10 +1149,12 @@ def cmd_monitor(args):
|
||||
f"{now_local.strftime('%Y-%m-%d %H:%M:%S %Z')}")
|
||||
print(f"[monitor] {'='*55}")
|
||||
|
||||
market_fee = MARKET_FEES.get(market, args.fixed_fee)
|
||||
for strat_name in strategies:
|
||||
sub_args = copy.copy(args)
|
||||
sub_args.strategy = strat_name
|
||||
sub_args.market = market
|
||||
sub_args.fixed_fee = market_fee
|
||||
|
||||
print(f"\n[monitor] --- {market.upper()}:{strat_name} ---")
|
||||
try:
|
||||
@@ -1253,8 +1308,10 @@ def cmd_auto(args):
|
||||
integer_shares=args.integer_shares
|
||||
)
|
||||
|
||||
# Fall back to per-market fee when the user didn't explicitly override
|
||||
fixed_fee = args.fixed_fee if args.fixed_fee > 0 else MARKET_FEES.get(market, 0.0)
|
||||
execute_trades(state, trades, close_prices,
|
||||
tx_cost=args.tx_cost, fixed_fee=args.fixed_fee,
|
||||
tx_cost=args.tx_cost, fixed_fee=fixed_fee,
|
||||
trade_date=today_str, integer_shares=args.integer_shares)
|
||||
|
||||
post_value = portfolio_value(state["holdings"], close_prices, state["cash"])
|
||||
|
||||
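The per-market fee fallback added to cmd_auto can be restated as the tiny helper below. This is a hypothetical standalone sketch (resolve_fixed_fee does not exist in trader.py), shown only to make the precedence explicit: an explicit --fixed-fee wins, otherwise the market default applies, otherwise zero.

MARKET_FEES = {"us": 2.0, "cn": 5.0}

def resolve_fixed_fee(cli_fixed_fee: float, market: str) -> float:
    # Mirrors: fixed_fee = args.fixed_fee if args.fixed_fee > 0 else MARKET_FEES.get(market, 0.0)
    return cli_fixed_fee if cli_fixed_fee > 0 else MARKET_FEES.get(market, 0.0)

assert resolve_fixed_fee(0.0, "us") == 2.0   # no override, falls back to the US default
assert resolve_fixed_fee(1.5, "cn") == 1.5   # explicit --fixed-fee wins
assert resolve_fixed_fee(0.0, "jp") == 0.0   # unknown market, no fixed fee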
universe_history.py (new file, 230 lines)
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Point-in-time index membership reconstruction — fixes survivorship bias.
|
||||
|
||||
Approach: Wikipedia's "Selected changes to the list of S&P 500 components"
|
||||
table lists every add/remove event (394 rows back to 1976, as of 2026). We
|
||||
start from today's membership and walk the change log *backward*:
|
||||
- An 'Added' ticker on date D was NOT a member before D.
|
||||
- A 'Removed' ticker on date D WAS a member before D.
|
||||
Applied iteratively, this yields the set of members on any historical date.
|
||||
|
||||
The membership info is cached in data/sp500_history.json so Wikipedia is hit
|
||||
at most once per day. The cache stores per-ticker membership intervals:
|
||||
{ "ticker": [[start, end_or_null], ...] }
|
||||
where dates are YYYY-MM-DD strings.
|
||||
"""
|
||||
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import urllib.request
|
||||
from datetime import date, datetime
|
||||
|
||||
import pandas as pd
|
||||
|
||||
CACHE_DIR = "data"
|
||||
_HEADERS = {"User-Agent": "Mozilla/5.0 (quant-backtest)"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fetch + parse Wikipedia
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _fetch_sp500_tables() -> tuple[pd.DataFrame, pd.DataFrame]:
|
||||
"""Return (current_list, changes_log) from the S&P 500 Wikipedia page."""
|
||||
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
|
||||
req = urllib.request.Request(url, headers=_HEADERS)
|
||||
with urllib.request.urlopen(req) as resp:
|
||||
html = resp.read().decode("utf-8")
|
||||
tables = pd.read_html(io.StringIO(html))
|
||||
current = tables[0]
|
||||
changes = tables[1]
|
||||
changes.columns = [
|
||||
"_".join(c).strip() if isinstance(c, tuple) else c
|
||||
for c in changes.columns
|
||||
]
|
||||
changes.columns = [
|
||||
c.replace("Effective Date_Effective Date", "Date") for c in changes.columns
|
||||
]
|
||||
return current, changes
|
||||
|
||||
|
||||
def _normalize_ticker(t: str) -> str:
|
||||
"""Yahoo Finance ticker format: BRK.B → BRK-B."""
|
||||
return str(t).replace(".", "-").strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Membership reconstruction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_sp500_history() -> dict[str, list[list[str | None]]]:
|
||||
"""
|
||||
Reconstruct per-ticker membership intervals.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict: ticker -> list of [start_date, end_date_or_None] pairs.
|
||||
end_date=None means the ticker is still a member as of today.
|
||||
Dates are YYYY-MM-DD strings.
|
||||
|
||||
Algorithm: start from today's set of members, walk the change log from
|
||||
newest to oldest. For each event on date D:
|
||||
- The 'Added' ticker: its current (open) interval starts on D.
|
||||
Close it: [..., D] — it was NOT a member before D.
|
||||
- The 'Removed' ticker: it was a member up to D (exclusive).
|
||||
Open a new interval ending on D (start unknown for now; will be
|
||||
closed by an earlier event or left open-start).
|
||||
|
||||
After the walk, any ticker still "open" (never closed backward) has an
|
||||
interval reaching back before the earliest logged change.
|
||||
"""
|
||||
current, changes = _fetch_sp500_tables()
|
||||
|
||||
current_tickers = {_normalize_ticker(s) for s in current["Symbol"].tolist()}
|
||||
|
||||
# Parse change log
|
||||
changes["dt"] = pd.to_datetime(changes["Date"], errors="coerce")
|
||||
changes = changes.dropna(subset=["dt"]).sort_values("dt", ascending=False)
|
||||
|
||||
# For each ticker, collect intervals [start, end].
|
||||
# We track a "current open interval" per ticker during the backward walk.
|
||||
# intervals[ticker] = list of [start, end] completed intervals (oldest-first).
|
||||
# open_start[ticker] = start date of the currently open (most-recent) interval.
|
||||
intervals: dict[str, list[list[str | None]]] = {}
|
||||
open_end: dict[str, str | None] = {} # end of currently-open interval
|
||||
|
||||
# Initialize: today's members have an open interval ending = None (still in)
|
||||
for t in current_tickers:
|
||||
open_end[t] = None # still a member today
|
||||
intervals[t] = []
|
||||
|
||||
# Track the start date of each open interval as we walk backward.
|
||||
# For a member today, the interval started at the last "Added" event in the
|
||||
# changes log, OR before the log begins if never added.
|
||||
# We'll close the interval when we hit the "Added" event going backward.
|
||||
open_start: dict[str, str | None] = {t: None for t in current_tickers}
|
||||
|
||||
for _, row in changes.iterrows():
|
||||
d = row["dt"].strftime("%Y-%m-%d")
|
||||
added = row.get("Added_Ticker")
|
||||
removed = row.get("Removed_Ticker")
|
||||
|
||||
if pd.notna(added):
|
||||
a = _normalize_ticker(added)
|
||||
# This ticker was added on d → its open interval starts on d.
|
||||
if a in open_end:
|
||||
open_start[a] = d
|
||||
# Finalize the current open interval
|
||||
intervals[a].append([d, open_end[a]])
|
||||
# Pop: no further open interval backward in time for this ticker
|
||||
# (unless 'Removed' opens a new older one below)
|
||||
del open_end[a]
|
||||
|
||||
if pd.notna(removed):
|
||||
r = _normalize_ticker(removed)
|
||||
# This ticker was removed on d → it WAS a member before d.
|
||||
# Open a new interval ending on d (start unknown yet).
|
||||
if r not in open_end:
|
||||
intervals.setdefault(r, [])
|
||||
open_end[r] = d # end of the new older interval
|
||||
|
||||
# Any ticker still with an open interval → start predates the log.
|
||||
# Use the oldest logged date as a conservative "unknown earlier" marker: None.
|
||||
for t, end in open_end.items():
|
||||
intervals.setdefault(t, []).append([None, end])
|
||||
|
||||
# Sort intervals per ticker oldest→newest
|
||||
for t, ivs in intervals.items():
|
||||
ivs.sort(key=lambda iv: (iv[0] or "0000-00-00"))
|
||||
|
||||
return intervals
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cache I/O
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cache_path() -> str:
|
||||
return os.path.join(CACHE_DIR, "sp500_history.json")
|
||||
|
||||
|
||||
def load_sp500_history(force_refresh: bool = False) -> dict[str, list[list[str | None]]]:
|
||||
"""Load cached membership history, or rebuild if stale (>1 day old)."""
|
||||
path = _cache_path()
|
||||
if not force_refresh and os.path.exists(path):
|
||||
try:
|
||||
with open(path) as f:
|
||||
data = json.load(f)
|
||||
if data.get("date") == str(date.today()):
|
||||
return data["intervals"]
|
||||
except Exception:
|
||||
pass
|
||||
print("--- Rebuilding S&P 500 membership history from Wikipedia ---")
|
||||
intervals = build_sp500_history()
|
||||
os.makedirs(CACHE_DIR, exist_ok=True)
|
||||
with open(path, "w") as f:
|
||||
json.dump({"date": str(date.today()), "intervals": intervals}, f)
|
||||
print(f"--- Cached {len(intervals)} tickers' membership intervals ---")
|
||||
return intervals
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Convert intervals → aligned mask DataFrame
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def membership_mask(dates: pd.DatetimeIndex,
|
||||
intervals: dict[str, list[list[str | None]]] | None = None,
|
||||
tickers: list[str] | None = None) -> pd.DataFrame:
|
||||
"""
|
||||
Boolean DataFrame: rows = dates, columns = tickers.
|
||||
True where the ticker was an S&P 500 member on that date.
|
||||
|
||||
If `tickers` is given, restrict columns to that list (useful for aligning
|
||||
with a price DataFrame). Otherwise, include all tickers ever a member.
|
||||
"""
|
||||
if intervals is None:
|
||||
intervals = load_sp500_history()
|
||||
cols = tickers if tickers is not None else sorted(intervals.keys())
|
||||
# Tickers not in `intervals` (e.g. SPY, benchmarks, ETFs) are treated as
|
||||
# always-members so callers can pass the full price matrix through
|
||||
# mask_prices without zeroing out benchmark series.
|
||||
mask = pd.DataFrame(False, index=dates, columns=cols)
|
||||
for t in cols:
|
||||
if t not in intervals:
|
||||
mask[t] = True
|
||||
continue
|
||||
for start, end in intervals[t]:
|
||||
s = pd.Timestamp(start) if start else dates[0]
|
||||
e = pd.Timestamp(end) if end else dates[-1] + pd.Timedelta(days=1)
|
||||
# Interval semantics: member on [start, end). A ticker removed on
|
||||
# date D was no longer a member on D.
|
||||
mask.loc[(mask.index >= s) & (mask.index < e), t] = True
|
||||
return mask
|
||||
|
||||
|
||||
def all_tickers_ever(intervals: dict | None = None) -> list[str]:
|
||||
"""All tickers that were ever S&P 500 members (for price data fetching)."""
|
||||
if intervals is None:
|
||||
intervals = load_sp500_history()
|
||||
return sorted(intervals.keys())
|
||||
|
||||
|
||||
def mask_prices(prices: pd.DataFrame,
|
||||
intervals: dict | None = None) -> pd.DataFrame:
|
||||
"""
|
||||
Return a copy of `prices` with NaN set for (date, ticker) pairs where
|
||||
the ticker was not an S&P 500 member on that date.
|
||||
|
||||
This is the key survivorship-bias fix: strategies compute signals from
|
||||
the masked price data, so they naturally cannot select stocks outside
|
||||
the point-in-time index membership.
|
||||
|
||||
Warm-up note: a newly-added member needs sufficient non-NaN history for
|
||||
its rolling windows to produce a valid signal. For this codebase's
|
||||
~252-day lookbacks, a stock becomes "selectable" roughly 1 year after
|
||||
joining. This is conservative but correct: before that, we have no
|
||||
legitimate signal anyway.
|
||||
"""
|
||||
mask = membership_mask(prices.index, intervals, tickers=list(prices.columns))
|
||||
return prices.where(mask)
|
||||
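A toy sketch of the cached interval format and the [start, end) membership semantics, using hand-written intervals rather than the Wikipedia-derived cache; SPY stands in for a benchmark ticker that membership_mask deliberately treats as an always-member:

import pandas as pd
from universe_history import membership_mask

dates = pd.date_range("2024-01-01", periods=4, freq="D")
intervals = {
    "AAA": [[None, None]],            # member for the whole window
    "BBB": [[None, "2024-01-03"]],    # removed effective 2024-01-03, so not a member on that day
}
mask = membership_mask(dates, intervals=intervals, tickers=["AAA", "BBB", "SPY"])
print(mask.astype(int).to_string())
# AAA: 1 1 1 1, BBB: 1 1 0 0, SPY: 1 1 1 1 (unknown tickers are treated as always-members)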