Compare commits
23 Commits
816765071f
...
c245774d76
| Author | SHA1 | Date | |
|---|---|---|---|
| c245774d76 | |||
| d85572e7b5 | |||
| c0a9235b80 | |||
| c4173b2b3b | |||
| 6d874ecbff | |||
| 403ae2e2b7 | |||
| 861d754f29 | |||
| 76ec19224c | |||
| e67bc86240 | |||
| fd94ab9f3b | |||
| 4607711bb5 | |||
| d23b69219b | |||
| 488fae7e63 | |||
| 426151bc9f | |||
| a9d237bbfd | |||
| 5257fbc1a2 | |||
| b3156a382a | |||
| 76cca89a43 | |||
| 83162e7a64 | |||
| a3523f5601 | |||
| 95c02d7dd9 | |||
| a1b804f879 | |||
| 0c23285f39 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,6 +4,7 @@
|
|||||||
.aituner-tight/
|
.aituner-tight/
|
||||||
.aituner-prefill/
|
.aituner-prefill/
|
||||||
.aituner-compare/
|
.aituner-compare/
|
||||||
|
.aituner-run-configs/
|
||||||
.env
|
.env
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.pyc
|
*.pyc
|
||||||
|
|||||||
@@ -6,6 +6,10 @@
|
|||||||
- Hardware expectation: 8 NVIDIA H20 GPUs.
|
- Hardware expectation: 8 NVIDIA H20 GPUs.
|
||||||
- SSH check: use `ssh dash0` before scheduling or debugging remote runs.
|
- SSH check: use `ssh dash0` before scheduling or debugging remote runs.
|
||||||
- Remote project path: `/home/admin/cpfs/wjh/aituner/aituner`.
|
- Remote project path: `/home/admin/cpfs/wjh/aituner/aituner`.
|
||||||
|
- If remote downloads are slow or fail, start the proxy from the remote `wjh`
|
||||||
|
home directory with `./auto_proxy.sh`, then run downloads in a shell where
|
||||||
|
`proxyOn` from `~/.bashrc` has been applied. If `autossh` is unavailable,
|
||||||
|
`ssh -Nf proxy` provides the same local `127.0.0.1:11235` tunnel.
|
||||||
|
|
||||||
## Local/remote sync workflow
|
## Local/remote sync workflow
|
||||||
|
|
||||||
|
|||||||
@@ -130,9 +130,9 @@
|
|||||||
"min_input_tokens": 0,
|
"min_input_tokens": 0,
|
||||||
"max_input_tokens": 8192
|
"max_input_tokens": 8192
|
||||||
},
|
},
|
||||||
"replay_time_scale": 0.5,
|
"replay_time_scale": 0.8775,
|
||||||
"early_stop_max_lag_s": 45.0,
|
"early_stop_max_lag_s": 45.0,
|
||||||
"early_stop_max_elapsed_s": 320.0,
|
"early_stop_max_elapsed_s": 1000.0,
|
||||||
"adaptive_stop": {
|
"adaptive_stop": {
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"tau": 0.9,
|
"tau": 0.9,
|
||||||
@@ -141,8 +141,7 @@
|
|||||||
"max_checks": 20,
|
"max_checks": 20,
|
||||||
"min_fraction": 0.1,
|
"min_fraction": 0.1,
|
||||||
"boundary_delta": 0.02
|
"boundary_delta": 0.02
|
||||||
},
|
}
|
||||||
"completion_tokens_override": 128
|
|
||||||
},
|
},
|
||||||
"slo": {
|
"slo": {
|
||||||
"target_pass_rate": 0.95,
|
"target_pass_rate": 0.95,
|
||||||
@@ -158,7 +157,7 @@
|
|||||||
},
|
},
|
||||||
"search": {
|
"search": {
|
||||||
"low": 0.0,
|
"low": 0.0,
|
||||||
"high": 0.25,
|
"high": 0.15,
|
||||||
"tolerance": 0.001,
|
"tolerance": 0.001,
|
||||||
"max_probes": 6,
|
"max_probes": 6,
|
||||||
"sample_seed": 20260325,
|
"sample_seed": 20260325,
|
||||||
@@ -169,7 +168,9 @@
|
|||||||
"max_history_trials": 8,
|
"max_history_trials": 8,
|
||||||
"endpoint": {
|
"endpoint": {
|
||||||
"provider": "codex",
|
"provider": "codex",
|
||||||
"model": "gpt-5.4",
|
"model": "gpt-5.5",
|
||||||
|
"base_url": "https://ai.gahow.org/v1",
|
||||||
|
"wire_api": "chat.completions",
|
||||||
"stream": true,
|
"stream": true,
|
||||||
"api_key_env": "OPENAI_API_KEY",
|
"api_key_env": "OPENAI_API_KEY",
|
||||||
"timeout_s": 180
|
"timeout_s": 180
|
||||||
|
|||||||
@@ -130,9 +130,9 @@
|
|||||||
"min_input_tokens": 0,
|
"min_input_tokens": 0,
|
||||||
"max_input_tokens": 8192
|
"max_input_tokens": 8192
|
||||||
},
|
},
|
||||||
"replay_time_scale": 0.5,
|
"replay_time_scale": 0.8775,
|
||||||
"early_stop_max_lag_s": 45.0,
|
"early_stop_max_lag_s": 45.0,
|
||||||
"early_stop_max_elapsed_s": 320.0,
|
"early_stop_max_elapsed_s": 1000.0,
|
||||||
"adaptive_stop": {
|
"adaptive_stop": {
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"tau": 0.9,
|
"tau": 0.9,
|
||||||
@@ -141,8 +141,7 @@
|
|||||||
"max_checks": 20,
|
"max_checks": 20,
|
||||||
"min_fraction": 0.1,
|
"min_fraction": 0.1,
|
||||||
"boundary_delta": 0.02
|
"boundary_delta": 0.02
|
||||||
},
|
}
|
||||||
"completion_tokens_override": 128
|
|
||||||
},
|
},
|
||||||
"slo": {
|
"slo": {
|
||||||
"target_pass_rate": 0.95,
|
"target_pass_rate": 0.95,
|
||||||
@@ -158,7 +157,7 @@
|
|||||||
},
|
},
|
||||||
"search": {
|
"search": {
|
||||||
"low": 0.0,
|
"low": 0.0,
|
||||||
"high": 0.25,
|
"high": 0.15,
|
||||||
"tolerance": 0.001,
|
"tolerance": 0.001,
|
||||||
"max_probes": 6,
|
"max_probes": 6,
|
||||||
"sample_seed": 20260325,
|
"sample_seed": 20260325,
|
||||||
@@ -169,7 +168,9 @@
|
|||||||
"max_history_trials": 8,
|
"max_history_trials": 8,
|
||||||
"endpoint": {
|
"endpoint": {
|
||||||
"provider": "codex",
|
"provider": "codex",
|
||||||
"model": "gpt-5.4",
|
"model": "gpt-5.5",
|
||||||
|
"base_url": "https://ai.gahow.org/v1",
|
||||||
|
"wire_api": "chat.completions",
|
||||||
"stream": true,
|
"stream": true,
|
||||||
"api_key_env": "OPENAI_API_KEY",
|
"api_key_env": "OPENAI_API_KEY",
|
||||||
"timeout_s": 180
|
"timeout_s": 180
|
||||||
|
|||||||
26
configs/examples/tuning_report.example.json
Normal file
26
configs/examples/tuning_report.example.json
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"report_id": "qwen27b-abl12-harness-vs-naive",
|
||||||
|
"output_root": "../../.aituner-reports/qwen27b-abl12-harness-vs-naive",
|
||||||
|
"target_fraction": 0.95,
|
||||||
|
"min_final_ratio": 0.98,
|
||||||
|
"cases": [
|
||||||
|
{
|
||||||
|
"case_id": "qwen27b-chat-0-8k-real-output",
|
||||||
|
"description": "12-trial harness-vs-naive ablation on the 0-8k chat window with real output lengths.",
|
||||||
|
"tags": ["qwen27b", "chat", "0-8k", "h20", "real-output"],
|
||||||
|
"budgets": [1, 2, 3, 4, 6, 8, 12],
|
||||||
|
"arms": [
|
||||||
|
{
|
||||||
|
"name": "harness",
|
||||||
|
"kind": "harness",
|
||||||
|
"study_root": "../../.aituner/abl12-harness/dash0-qwen27b-ablation-harness-on"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "naive",
|
||||||
|
"kind": "naive",
|
||||||
|
"study_root": "../../.aituner/abl12-naive/dash0-qwen27b-ablation-naive-off"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
215
docs/aituner-roadmap.md
Normal file
215
docs/aituner-roadmap.md
Normal file
@@ -0,0 +1,215 @@
|
|||||||
|
# AITuner roadmap
|
||||||
|
|
||||||
|
本文只维护最小 roadmap:paper framing、claim 树、已有证据、最高优先级实验。
|
||||||
|
详细实验流水账放到对应专题文档里。
|
||||||
|
|
||||||
|
## Paper thesis
|
||||||
|
|
||||||
|
AITuner 的核心不是“用 LLM 调参”。更准确的 framing 是:
|
||||||
|
|
||||||
|
```text
|
||||||
|
black-box knob optimization
|
||||||
|
-> grey-box / mechanism-guided experimental optimization
|
||||||
|
```
|
||||||
|
|
||||||
|
也就是说,AITuner 仍然通过真实实验测量目标函数,但它不再把 serving engine 当成
|
||||||
|
完全黑盒的 `config vector -> scalar score`。Harness 将 workload、SLO failure、
|
||||||
|
probe trace、topology constraints 和 failure memory 转换成结构化的 serving
|
||||||
|
mechanism state,并把下一步搜索限制在可解释、可验证的 intervention 上。
|
||||||
|
|
||||||
|
因此 LLM 不是不可替代的核心。LLM 是 planner backend / copilot;核心系统贡献是
|
||||||
|
planner-agnostic 的 tuning substrate:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Harness H = (O, R, G, V, M)
|
||||||
|
|
||||||
|
O: observation schema
|
||||||
|
workload L/C/A profile + probe trace + latency/SLO failure + launch status
|
||||||
|
|
||||||
|
R: regime attribution
|
||||||
|
SLO violation -> prefill-bound / decode-bound / admission-bound / memory-bound / launch-bound
|
||||||
|
|
||||||
|
G: serving intervention grammar
|
||||||
|
regime -> legal intervention families, not raw arbitrary knobs
|
||||||
|
|
||||||
|
V: validator
|
||||||
|
tunable schema + topology constraints + no-repeat + failure memory + stop authority
|
||||||
|
|
||||||
|
M: measurement/scoring protocol
|
||||||
|
SLO-constrained feasible frontier, req/s/GPU, latency quantiles, pass-rate guard
|
||||||
|
```
|
||||||
|
|
||||||
|
Planner 是可替换的:
|
||||||
|
|
||||||
|
```text
|
||||||
|
pi in {LLM, BO, bandit, deterministic heuristic, tree search}
|
||||||
|
```
|
||||||
|
|
||||||
|
AITuner 的强 claim 应该是:同一个 planner 放在 harness-shaped space 里,比放在
|
||||||
|
raw knob space 里更快、更稳、更接近最优;弱模型或非 LLM planner 也能从这个 substrate
|
||||||
|
中获益。
|
||||||
|
|
||||||
|
## Why not pure white-box
|
||||||
|
|
||||||
|
我们不应 claim 完整 white-box optimization。AITuner 并没有针对 vLLM scheduler、
|
||||||
|
kernel、KV cache、通信和排队的解析(闭式)性能模型。更稳妥也更强的表述是 grey-box:
|
||||||
|
|
||||||
|
- objective 仍然由真实测量决定;
|
||||||
|
- action space、constraints、failure attribution 和 intervention semantics 是系统知识驱动;
|
||||||
|
- 每个 trial 是一个 counterfactual experiment,而不是盲目采样一个 knob vector。
|
||||||
|
|
||||||
|
## 关键设计点
|
||||||
|
|
||||||
|
| 设计点 | 更强表述 | 作用 | 需要证明 |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| Observation | mechanism state | 将 workload shape、probe trace、SLO failure、launch/memory failure 结构化 | agent 看到的是可计算状态,不是自然语言日志 |
|
||||||
|
| Bottleneck classifier | SLO violation attribution | 把失败归因到 serving regime,而不是只看哪个指标超阈值 | attribution 和后续有效 intervention 有因果关联 |
|
||||||
|
| Candidate family | serving intervention grammar | 把 raw knobs 提升为 topology / batching / admission / memory interventions | 搜索空间被压缩,但不写死某个 case |
|
||||||
|
| Scoring | counterfactual verdict | 用 SLO frontier 和 req/s/GPU 判断 intervention 是否支持假设 | 最终 winner 由测量决定,不由 LLM 决定 |
|
||||||
|
| Validator / stop | fail-safe control | 禁止非法、重复、已知失败 family;只有 validator 授权 stop | 错误 attribution 最多浪费 trial,不污染 incumbent |
|
||||||
|
|
||||||
|
## Claim roadmap
|
||||||
|
|
||||||
|
| Claim | 当前状态 | 证据文档 | 关键缺口 |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| C1. Harness 将 raw knob search 转成 mechanism-guided intervention search,提升固定预算优化效果 | 已有强信号 | [Qwen27B 2x2](harness-ablation/qwen27b-tight-2x2-model-ablation-20260623.md), [Qwen30B SLO robustness](harness-ablation/qwen30b-slo-robustness-20260624.md) | 补 Qwen235B decode 2x2 aggregate;补 mechanism ablation |
|
||||||
|
| C2. 收益来自 harness-defined substrate,不依赖某个强 LLM | 部分已有 | [Qwen27B 2x2](harness-ablation/qwen27b-tight-2x2-model-ablation-20260623.md) | 做 `BO/heuristic + harness` vs `BO/heuristic + raw knobs` |
|
||||||
|
| C3. Weak planner + harness 可以匹配或超过 strong LLM naive | Qwen27B 已支持;Qwen235B 正在补 | [Qwen27B 2x2](harness-ablation/qwen27b-tight-2x2-model-ablation-20260623.md), [Qwen235B prefill progress](harness-ablation/qwen235b-prefill-2x2-progress-20260623.md) | 完成 Qwen235B decode 2x2;更新 prefill final doc |
|
||||||
|
| C4. Attribution 和 intervention grammar 有机制贡献,不只是 prompt 信息更多 | 设计已有,严格证据不足 | [AITuner summary](aituner-harness-summary.md) | 做 shuffled attribution / no attribution / no grammar / no topology-first / no validator ablation |
|
||||||
|
| C5. AITuner 找到 near-optimal region,而不是只找到一个可行 config | Qwen30B 有解释性信号 | [Qwen30B SLO robustness](harness-ablation/qwen30b-slo-robustness-20260624.md) | 选 1-2 个 case 做局部 grid 或专家配置对照 |
|
||||||
|
| C6. AITuner 能随 SLO tightness 移动到合适 frontier | Qwen30B 已完成 | [Qwen30B SLO robustness](harness-ablation/qwen30b-slo-robustness-20260624.md) | 再选一个非同质 case 做 SLO sweep;同时画 SLO tightness -> frontier/regime transition |
|
||||||
|
| C7. Engine adapter 让 intervention grammar 可迁移到其他 serving engine | 设计上可行,暂不作为主实验 claim | `EngineLaunchSpec` / launch recipe / tunable schema | vLLM 主线完成后,再做 SGLang adapter 和一个低成本验证 case |
|
||||||
|
|
||||||
|
## 最高优先级实验
|
||||||
|
|
||||||
|
### P0. 完成 Qwen235B decode 2x2 并整理 aggregate
|
||||||
|
|
||||||
|
目的:补齐最核心的 `harness on/off x strong/weak planner` 证据,回答:
|
||||||
|
|
||||||
|
```text
|
||||||
|
weak LLM + harness >= strong LLM naive ?
|
||||||
|
```
|
||||||
|
|
||||||
|
预期产出:
|
||||||
|
|
||||||
|
- 2x2 表格:每个 arm 在相同 iter budget 下的 best-so-far req/s/GPU;
|
||||||
|
- convergence curve / normalized AUC;
|
||||||
|
- 每个 arm 的 trial path 和主要 config patches;
|
||||||
|
- 解释 naive 为什么走错,harness 如何通过 regime attribution 走到正确 intervention。
|
||||||
|
|
||||||
|
优先级原因:实验已经在跑,增量成本最低,而且直接支撑 C1/C3。
|
||||||
|
|
||||||
|
### P1. Planner-agnostic substrate 实验
|
||||||
|
|
||||||
|
目的:证明 AITuner 不是 LLM tuner,而是 harness-defined optimization substrate。
|
||||||
|
|
||||||
|
最小实验矩阵:
|
||||||
|
|
||||||
|
| Planner | Raw knob space | Harness intervention space |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| deterministic heuristic | raw heuristic | harness policy |
|
||||||
|
| BO 或 lightweight bandit | raw BO | harness-guided BO |
|
||||||
|
| weak LLM | naive weak LLM | weak LLM + harness |
|
||||||
|
| strong LLM | naive strong LLM | strong LLM + harness |
|
||||||
|
|
||||||
|
如果 BO 实现成本高,先用 deterministic harness policy 做 non-LLM planner baseline:
|
||||||
|
它已经能证明“没有 LLM 也能 work”。随后再补 BO,使论证更强。
|
||||||
|
|
||||||
|
预期图:
|
||||||
|
|
||||||
|
- x-axis: trial budget;
|
||||||
|
- y-axis: best-so-far SLO-constrained req/s/GPU;
|
||||||
|
- line groups: raw knob space vs harness intervention space;
|
||||||
|
- 单独 bar:invalid launch rate、repeated config rate、wasted trial rate。
|
||||||
|
|
||||||
|
优先级原因:这是新 framing 的关键实验。没有它,paper 仍然容易被读成“LLM prompt
|
||||||
|
engineering”。
|
||||||
|
|
||||||
|
### P2. Mechanism ablation
|
||||||
|
|
||||||
|
目的:证明 harness 内部不是普通信息堆叠,而是 attribution、intervention grammar、
|
||||||
|
validator 分别贡献有效机制。
|
||||||
|
|
||||||
|
建议 ablation:
|
||||||
|
|
||||||
|
| Variant | 删除/破坏什么 | 预期证明 |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| full AITuner | 无 | 最好 |
|
||||||
|
| no attribution | 不提供 regime attribution,只给 scalar score 和历史结果 | attribution 对方向选择有贡献 |
|
||||||
|
| shuffled attribution | 故意打乱 regime label,但保留文本长度 | 收益来自语义正确性,不是更多 prompt tokens |
|
||||||
|
| no intervention grammar | 允许任意 tunable knobs,移除 family guidance | action-space shaping 有贡献 |
|
||||||
|
| no topology-first | runtime knobs 可以优先于 topology intervention | topology 是 LLM serving 的一阶决策 |
|
||||||
|
| no validator/failure memory | 允许重复、已知 launch failure family | fail-safe control 减少 GPU burn |
|
||||||
|
|
||||||
|
预期图:
|
||||||
|
|
||||||
|
- mechanism ablation bar:final best、AUC、TTT(trials to target);
|
||||||
|
- waste breakdown:invalid launch、repeat config、wrong-family trial;
|
||||||
|
- case study trace:每个 variant 前 3-5 个 proposal 对比。
|
||||||
|
|
||||||
|
优先级原因:这是回应 novelty 质疑的核心证据。
|
||||||
|
|
||||||
|
### P3. Near-optimum / expert baseline 证据
|
||||||
|
|
||||||
|
目的:证明 AITuner 不是只找到“能收敛但性能差”的 config。
|
||||||
|
|
||||||
|
优先选择一个成本可控 case 做局部 grid:
|
||||||
|
|
||||||
|
```text
|
||||||
|
topology: TP/DP frontier
|
||||||
|
runtime: max-num-seqs, max-num-batched-tokens, gpu-memory-utilization 的小邻域
|
||||||
|
objective: max feasible req/s/GPU under pass_rate >= 0.95
|
||||||
|
```
|
||||||
|
|
||||||
|
预期图:
|
||||||
|
|
||||||
|
- local grid heatmap;
|
||||||
|
- AITuner trial path overlay;
|
||||||
|
- AITuner best vs grid best vs expert config;
|
||||||
|
- near-optimum gap,例如 `AITuner >= 95% of local grid optimum`。
|
||||||
|
|
||||||
|
优先级原因:这是 claim “tune 出最好的 config,而不是差的收敛 config” 的必要证据。
|
||||||
|
|
||||||
|
### P4. 第二个 SLO robustness case
|
||||||
|
|
||||||
|
目的:证明 Qwen30B 的 SLO robustness 不是单 case 现象。
|
||||||
|
|
||||||
|
不要先大规模铺 sweep。先选一个和 Qwen30B 机制不同的 case:
|
||||||
|
|
||||||
|
- 一个 decode-heavy case,观察 TP/DP redistribution 和 concurrency/memory intervention;
|
||||||
|
- 或一个 long-prefill / tight-TTFT case,观察 TP 和 prefill batching intervention。
|
||||||
|
|
||||||
|
预期图:
|
||||||
|
|
||||||
|
- x-axis: SLO tightness;
|
||||||
|
- y-axis: best feasible req/s/GPU;
|
||||||
|
- marker/color: selected intervention regime;
|
||||||
|
- annotation: final TP/DP/MNS/MBT(MNS = `max-num-seqs`,MBT = `max-num-batched-tokens`);
|
||||||
|
- 展示 SLO 放宽时 frontier 的 right shift 或 regime transition。
|
||||||
|
|
||||||
|
优先级原因:重要,但应排在 planner-agnostic 和 mechanism ablation 之后。
|
||||||
|
|
||||||
|
### P5. SGLang / multi-engine adapter validation
|
||||||
|
|
||||||
|
目的:证明 intervention grammar 可以通过 adapter lowering 到不同 serving engine。
|
||||||
|
|
||||||
|
当前暂缓:在 vLLM 主线完成之前,它不作为高优先级实验。等 C1-C5 稳定后再做一个低成本 case:
|
||||||
|
|
||||||
|
```text
|
||||||
|
same workload profile
|
||||||
|
same SLO objective
|
||||||
|
same intervention grammar
|
||||||
|
different engine adapter
|
||||||
|
```
|
||||||
|
|
||||||
|
优先级原因:它能扩展 generality,但不能替代 vLLM 主线的机制证明。
|
||||||
|
|
||||||
|
## 暂不做
|
||||||
|
|
||||||
|
- 暂不把主 claim 写成“LLM 比 BO 更聪明”。新 claim 是 harness substrate 对多种 planner
|
||||||
|
都有用。
|
||||||
|
- 暂不 claim full white-box 或全局最优。当前更稳妥的是 grey-box、near-optimum、
|
||||||
|
fixed-budget utility。
|
||||||
|
- 暂不横向铺大量 SLO sweep。先补机制 ablation、planner-agnostic 和 near-optimum。
|
||||||
|
- 暂不把 multi-engine support 放进主实验 claim。先写成 adapter-based design,等 vLLM
|
||||||
|
证据链完整后再补一个 SGLang validation。
|
||||||
138
docs/harness-ablation/qwen235b-prefill-2x2-progress-20260623.md
Normal file
138
docs/harness-ablation/qwen235b-prefill-2x2-progress-20260623.md
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
# Qwen235B prefill 2x2 progress - 2026-06-23
|
||||||
|
|
||||||
|
Snapshot: 2026-06-23 18:24 CST / 10:24 UTC.
|
||||||
|
|
||||||
|
本文整理当前 dash1/dash2/dash3 上的 Qwen235B prefill 2x2 实验进度。这个
|
||||||
|
case 仍在跑 strong-model arm,因此本文是 progress report,不是最终 aggregate
|
||||||
|
结论。
|
||||||
|
|
||||||
|
## 当前远端状态
|
||||||
|
|
||||||
|
| Host | 当前状态 | 说明 |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| dash1 | running | `aituner-q235b-2x2-gpt55-20260623T010038Z` 仍在跑,当前是 `gpt-5.5 + naive` 的 trial-0004;8 张 H20 被 vLLM 占用。 |
|
||||||
|
| dash2 | idle | 没有 tmux/GPU 任务;最近完成的是 `qwen235b-prefill-jointprobe-harness-dash2-20260622T132010Z` harness-only 验证。 |
|
||||||
|
| dash3 | idle | 没有 tmux/GPU 任务;`gpt-5.4-mini` 2x2 arm 已完成并生成 report。 |
|
||||||
|
|
||||||
|
注意:三台机器共享 `/home/admin/cpfs/wjh/aituner/aituner`,所以 `.aituner` 和
|
||||||
|
`.aituner-reports` 在不同 dash 节点上看到的是同一批产物。
|
||||||
|
|
||||||
|
## 已完成:gpt-5.4-mini 2x2 arm
|
||||||
|
|
||||||
|
Report:
|
||||||
|
|
||||||
|
```text
|
||||||
|
.aituner-reports/qwen235b-prefill-2x2-gpt54mini-dash3-20260623T010038Z/report.md
|
||||||
|
```
|
||||||
|
|
||||||
|
Aggregate:
|
||||||
|
|
||||||
|
| Arm | Kind | Trials | Final req/s/GPU | Final/ref | TTT (trials to target) | AUC | Failed | No feasible |
|
||||||
|
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||||
|
| `harness` | harness | 8 | 0.3217 | 1.0000 | 3 | 0.9483 | 0 | 1 |
|
||||||
|
| `naive` | naive | 8 | - | - | - | 0.0000 | 2 | 8 |
|
||||||
|
|
||||||
|
Interpretation:
|
||||||
|
|
||||||
|
- `gpt-5.4-mini + harness` 找到了 `0.3217 req/s/GPU`,达到该 report 的
|
||||||
|
reference best。
|
||||||
|
- `gpt-5.4-mini + naive` 8 个 trials 都没有找到 feasible config,其中 2 个是
|
||||||
|
engine launch failure。
|
||||||
|
- Report 中 `Harness-vs-naive pass/checks: 0/1` 是 aggregator 对
|
||||||
|
`best_naive_final_per_gpu = null` 的保守处理:因为 naive 没有 feasible best,
|
||||||
|
final ratio 无法计算,所以 pass 记为 false。就实际 tuning 结果而言,这个 arm
|
||||||
|
是 harness dominates naive。
|
||||||
|
|
||||||
|
Harness trajectory:
|
||||||
|
|
||||||
|
| Trial | Patch | req/s/GPU | Pass rate | 说明 |
|
||||||
|
| ---: | --- | ---: | ---: | --- |
|
||||||
|
| 1 | `TP=8, DP=1` | 0.2879 | 0.9522 | 初始 topology 满足 SLO,但未达到最终 best。 |
|
||||||
|
| 2 | `TP=8, max-num-seqs=96` | 0.2879 | 0.9537 | 单独调 `max-num-seqs` 无明显提升。 |
|
||||||
|
| 3 | `TP=8, max-num-batched-tokens=16384, max-num-seqs=96` | 0.3085 | 0.9568 | joint runtime probe 提升。 |
|
||||||
|
| 4 | `TP=8, max-num-seqs=144, max-num-batched-tokens=32768` | 0.2879 | 0.9530 | 过大的 batching/seq 组合回退。 |
|
||||||
|
| 5 | `TP=4, DP=2` | - | - | 无 feasible best,说明 DP-heavy/mixed topology 不解决该 prefill path。 |
|
||||||
|
| 6 | `TP=8, max-num-seqs=96, max-num-batched-tokens=24576` | 0.2708 | 0.9523 | batching 进一步增大后回退。 |
|
||||||
|
| 7 | `TP=4, DP=1, max-num-seqs=96, max-num-batched-tokens=16384` | 0.2338 | 0.9590 | 少用 GPU 的 TP4/DP1 per-GPU 不占优。 |
|
||||||
|
| 8 | `TP=8, DP=1, max-num-seqs=128, max-num-batched-tokens=16384` | 0.3217 | 0.9508 | 当前 best。 |
|
||||||
|
|
||||||
|
这个结果说明:在 Qwen235B prefill case 上,harness 的价值不只是 topology
|
||||||
|
选择,还包括在 TTFT/prefill 方向下做受约束的 runtime joint probe。最终 best 是
|
||||||
|
`TP=8, DP=1, max-num-seqs=128, max-num-batched-tokens=16384`。
|
||||||
|
|
||||||
|
## 正在运行:gpt-5.5 2x2 arm
|
||||||
|
|
||||||
|
Session:
|
||||||
|
|
||||||
|
```text
|
||||||
|
tmux: aituner-q235b-2x2-gpt55-20260623T010038Z
|
||||||
|
driver log: .aituner/qwen235b-prefill-2x2-gpt55-dash1-20260623T010038Z.driver.log
|
||||||
|
```
|
||||||
|
|
||||||
|
Driver timeline:
|
||||||
|
|
||||||
|
```text
|
||||||
|
harness clean pair start 2026-06-23T01:00:40+00:00
|
||||||
|
harness clean pair done 2026-06-23T08:21:13+00:00
|
||||||
|
naive clean pair start 2026-06-23T08:21:13+00:00
|
||||||
|
```
|
||||||
|
|
||||||
|
Harness side has completed all 8 trials:
|
||||||
|
|
||||||
|
| Trial | Patch | req/s/GPU | Pass rate |
|
||||||
|
| ---: | --- | ---: | ---: |
|
||||||
|
| 1 | `TP=8, DP=1` | 0.2879 | 0.9522 |
|
||||||
|
| 2 | `TP=8, max-num-seqs=96` | 0.2879 | 0.9530 |
|
||||||
|
| 3 | `TP=8, max-num-batched-tokens=16384, max-num-seqs=96` | 0.3085 | 0.9561 |
|
||||||
|
| 4 | `TP=8, max-num-batched-tokens=32768, max-num-seqs=144` | 0.2783 | 0.9543 |
|
||||||
|
| 5 | `TP=8, DP=1, max-num-batched-tokens=24576, max-num-seqs=96` | 0.2654 | 0.9513 |
|
||||||
|
| 6 | `TP=4, DP=2, max-num-batched-tokens=16384, max-num-seqs=96` | - | - |
|
||||||
|
| 7 | `TP=8, DP=1, max-num-batched-tokens=16384, max-num-seqs=80` | 0.3156 | 0.9505 |
|
||||||
|
| 8 | `TP=8, max-num-batched-tokens=32768, max-num-seqs=120` | 0.2879 | 0.9508 |
|
||||||
|
|
||||||
|
Current harness best: `trial-0007`, `0.3156 req/s/GPU`.
|
||||||
|
|
||||||
|
Naive side is still running. Current state:
|
||||||
|
|
||||||
|
- Completed/recorded through trial-0003, with current best `0.2879 req/s/GPU`.
|
||||||
|
- trial-0004 is active with `TP=8, DP=1, max-num-batched-tokens=8192,
|
||||||
|
max-num-seqs=128`.
|
||||||
|
- trial-0004 probe history so far:
|
||||||
|
|
||||||
|
| threshold | request rate | req/s/GPU | pass rate | feasible | main failures |
|
||||||
|
| ---: | ---: | ---: | ---: | --- | --- |
|
||||||
|
| 0.0625 | 1.5750 | 0.1969 | 0.9651 | true | TTFT misses and TTFT threshold violations |
|
||||||
|
| 0.09375 | 2.3650 | 0.2956 | 0.7308 | false | `slo_pass_rate_unrecoverable`, TTFT violations |
|
||||||
|
| 0.078125 | 1.9567 | 0.2446 | 0.9591 | true | TTFT misses and TTFT threshold violations |
|
||||||
|
| 0.0859375 | 2.1667 | 0.2708 | 0.9546 | true | TTFT misses and TTFT threshold violations |
|
||||||
|
|
||||||
|
As of the snapshot, vLLM is still processing requests for trial-0004, so the naive
|
||||||
|
side has not produced its final result or report yet.
|
||||||
|
|
||||||
|
## Prior Qwen235B context
|
||||||
|
|
||||||
|
These earlier runs explain why the current 2x2 matters:
|
||||||
|
|
||||||
|
| Run | Result | What it showed |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `qwen235b-prefill-clean-gpt55-dash1-20260621T160712Z` | harness 0.2879, naive 0.3217 | Earlier harness stopped/refined too weakly; naive found better final config. |
|
||||||
|
| `qwen235b-prefill-seqguard-gpt55-dash1-20260622T064445Z` | harness 0.2879, naive 0.2577 | Seq guard prevented the worst early-stop failure but still did not reach the old naive best. |
|
||||||
|
| `qwen235b-prefill-jointprobe-harness-dash2-20260622T132010Z` | harness-only 0.3085 | Joint `max-num-batched-tokens + max-num-seqs` probe improved over seqguard. |
|
||||||
|
| `qwen235b-prefill-2x2-gpt54mini-dash3-20260623T010038Z` | harness 0.3217, naive no feasible | Weak model plus harness now reaches the old best and dominates weak naive. |
|
||||||
|
|
||||||
|
The current evidence points to the harness needing both:
|
||||||
|
|
||||||
|
1. topology discipline: stay on `TP=8, DP=1` for this prefill-heavy 235B setup;
|
||||||
|
2. runtime joint probing: tune `max-num-batched-tokens` and `max-num-seqs` together
|
||||||
|
instead of stopping after the first feasible TP8 result.
|
||||||
|
|
||||||
|
## Open item
|
||||||
|
|
||||||
|
The final Qwen235B 2x2 conclusion is blocked on the still-running
|
||||||
|
`gpt-5.5 + naive` arm on dash1. Once it completes, generate an aggregate report
|
||||||
|
combining:
|
||||||
|
|
||||||
|
- `qwen235b-prefill-2x2-gpt55-dash1-20260623T010038Z`
|
||||||
|
- `qwen235b-prefill-2x2-gpt54mini-dash3-20260623T010038Z`
|
||||||
|
|
||||||
|
and then update this progress report into a final ablation report.
|
||||||
@@ -0,0 +1,366 @@
|
|||||||
|
# Qwen27B tight-SLO 2x2 harness ablation - 2026-06-23
|
||||||
|
|
||||||
|
本文整理以下 aggregate report,并解释 harness 为什么能够让 tuning 更快、更有效:
|
||||||
|
|
||||||
|
```text
|
||||||
|
.aituner-reports/qwen27b-tight-2x2-aggregate-20260623T005838Z/report.md
|
||||||
|
```
|
||||||
|
|
||||||
|
这个实验是一个 2x2 ablation:模型强弱和是否启用 `use_harness` 交叉。
|
||||||
|
核心问题是:harness 是否提供了可复用的搜索结构,而不仅仅是更强 LLM
|
||||||
|
或者更长 prompt 带来的偶然收益。
|
||||||
|
|
||||||
|
## 实验设计
|
||||||
|
|
||||||
|
Case: `qwen27b-tight-slo-2x2-aggregate`。
|
||||||
|
|
||||||
|
实验基座:
|
||||||
|
|
||||||
|
- Served model: `qwen3.5-27b-256k-0223-internal`。
|
||||||
|
- Hardware: H20,最多 8 GPUs。
|
||||||
|
- Trace: `chat_w20260311_1000`,输入长度过滤到 0-8192 tokens,
|
||||||
|
`replay_time_scale=1.0`,`max_concurrency=32`。
|
||||||
|
- SLO: pass rate >= 0.95;TTFT step rule 为 <=4096 input tokens 时 2s,
|
||||||
|
<=32768 input tokens 时 4s,更长输入时 6s;TPOT <= 50 ms。
|
||||||
|
- Search: 在 `sampling_u in [0, 0.0625]` 上二分探测,tolerance 0.001,
|
||||||
|
max 6 probes。
|
||||||
|
- Tunable envs: `VLLM_ENABLE_TORCH_COMPILE`。
|
||||||
|
- Tunable flags: `tensor-parallel-size`, `data-parallel-size`,
|
||||||
|
`expert-parallel-size`, `gpu-memory-utilization`, `block-size`,
|
||||||
|
`max-num-batched-tokens`, `max-num-seqs`, `enable-prefix-caching`,
|
||||||
|
`enable-chunked-prefill`。
|
||||||
|
- Topology constraints: TP 和 DP 均在 `{1,2,4,8}` 中,允许的 TP*DP product 为
|
||||||
|
`{1,2,4,8}`,本 case 中 EP 固定为 1。
|
||||||
|
|
||||||
|
2x2 arms:
|
||||||
|
|
||||||
|
| Arm | Tuner model | Harness | Trial budget used |
|
||||||
|
| --- | --- | --- | ---: |
|
||||||
|
| `gpt55_harness` | `gpt-5.5` | on | 2 |
|
||||||
|
| `gpt55_naive` | `gpt-5.5` | off | 10 |
|
||||||
|
| `gpt54mini_harness` | `gpt-5.4-mini` | on | 2 |
|
||||||
|
| `gpt54mini_naive` | `gpt-5.4-mini` | off | 10 |
|
||||||
|
|
||||||
|
同一个 tuner model 内,主要差异是 `use_harness`。跨模型比较则用来判断:
|
||||||
|
更弱模型加 harness 是否能匹配或超过更强模型的 naive tuning。
|
||||||
|
|
||||||
|
## Aggregate result
|
||||||
|
|
||||||
|
Reference best: `0.4429 req/s/GPU`。
|
||||||
|
Convergence target: reference 的 95%,即 `0.4208 req/s/GPU`。
|
||||||
|
|
||||||
|
| Arm | Kind | Trials | Final req/s/GPU | Final/ref | Trials to target | Normalized AUC | Failed | No feasible |
|
||||||
|
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||||
|
| `gpt55_harness` | harness | 2 | 0.4429 | 1.0000 | 2 | 0.9484 | 0 | 0 |
|
||||||
|
| `gpt55_naive` | naive | 10 | 0.0273 | 0.0616 | - | 0.0588 | 2 | 2 |
|
||||||
|
| `gpt54mini_harness` | harness | 2 | 0.4429 | 1.0000 | 2 | 0.9450 | 0 | 0 |
|
||||||
|
| `gpt54mini_naive` | naive | 10 | 0.0231 | 0.0522 | - | 0.0498 | 1 | 1 |
|
||||||
|
|
||||||
|
Harness-vs-naive 检查全部通过:
|
||||||
|
|
||||||
|
| Harness arm | Final vs best naive | AUC vs best naive | Pass |
|
||||||
|
| --- | ---: | ---: | --- |
|
||||||
|
| `gpt55_harness` | 16.2290x | 16.1296x | true |
|
||||||
|
| `gpt54mini_harness` | 16.2290x | 16.0720x | true |
|
||||||
|
|
||||||
|
最关键的 ablation 信号是:`gpt-5.4-mini + harness` 和
|
||||||
|
`gpt-5.5 + harness` 达到同一个 final throughput,也都是 2 trials 达到 target;
|
||||||
|
而两个 naive arms 用满 10 trials 后,final req/s/GPU 仍不到 harness arms 的 1/16。
|
||||||
|
|
||||||
|
## Agent loop 流程图
|
||||||
|
|
||||||
|
下面是当前 harness 化 agent loop 的抽象流程。LLM 仍然可以参与 proposal,
|
||||||
|
但它拿到的不是裸文本历史,而是结构化 observation、bottleneck diagnosis、
|
||||||
|
candidate actions 和 validator 约束;同时 validator 可以授权 stop,也可以阻止
|
||||||
|
重复失败或不合法配置。
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A[Study spec: trace, SLO, search range, tunable knobs] --> B[Run one engine config]
|
||||||
|
B --> C[Binary-search probes over sampling_u]
|
||||||
|
C --> D[Build observation o_t]
|
||||||
|
D --> E[Bottleneck classifier]
|
||||||
|
E --> F[Candidate family generator]
|
||||||
|
F --> G[Score candidate actions]
|
||||||
|
G --> H[Prompt renderer / planner]
|
||||||
|
H --> I[LLM or deterministic harness proposal]
|
||||||
|
I --> J{Config validator}
|
||||||
|
J -- invalid, repeated, unsafe --> F
|
||||||
|
J -- valid config_patch --> B
|
||||||
|
G --> K{Stop validator}
|
||||||
|
K -- search_high_saturated_by_incumbent --> L[Stop and keep incumbent]
|
||||||
|
K -- useful candidates remain --> H
|
||||||
|
```
|
||||||
|
|
||||||
|
这个 loop 中,harness 的作用不是把 prompt 写得更漂亮,而是把 tuning 变成
|
||||||
|
一个受测量约束的决策过程:
|
||||||
|
|
||||||
|
```text
|
||||||
|
measurement -> diagnosis -> candidate family -> scored action -> validated proposal/stop
|
||||||
|
```
|
||||||
|
|
||||||
|
## 形式化设计:observation
|
||||||
|
|
||||||
|
每个 trial 结束后,AITuner 不只记录一段自然语言总结,而是形成结构化 observation:
|
||||||
|
|
||||||
|
```text
|
||||||
|
o_t = (
|
||||||
|
config_t,
|
||||||
|
probe_history_t,
|
||||||
|
pass_rate_t,
|
||||||
|
latency/SLO_failure_profile_t,
|
||||||
|
request_rate_t,
|
||||||
|
parallel_size_t,
|
||||||
|
launch_status_t,
|
||||||
|
prior_failures_t,
|
||||||
|
incumbent_t
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
本实验里 observation 中最重要的字段是:
|
||||||
|
|
||||||
|
- `config_t`: 当前 trial 的 `flag_patch` 和 `env_patch`,例如 `TP=2, DP=1`。
|
||||||
|
- `probe_history_t`: 在不同 `sampling_u` 下二分探测得到的 feasible/infeasible
|
||||||
|
结果。
|
||||||
|
- `pass_rate_t`: 是否满足 target pass rate 0.95。
|
||||||
|
- `latency/SLO_failure_profile_t`: TTFT 和 TPOT 哪个先触发 SLO pressure。
|
||||||
|
- `request_rate_t`: 当前配置在 SLO 下能承载的 request rate。
|
||||||
|
- `parallel_size_t`: 该配置实际使用的并行规模,用于归一化 per-GPU objective。
|
||||||
|
- `prior_failures_t`: 之前哪些配置 launch failed 或 no feasible,避免重复试错。
|
||||||
|
- `incumbent_t`: 当前最优配置及其 `request_rate_per_gpu`。
|
||||||
|
|
||||||
|
目标函数是:
|
||||||
|
|
||||||
|
```text
|
||||||
|
J(config_t) = request_rate_t / parallel_size_t
|
||||||
|
subject to pass_rate_t >= 0.95
|
||||||
|
```
|
||||||
|
|
||||||
|
也就是说,harness 优化的是满足 SLO 后的 `req/s/GPU`,不是 raw throughput,
|
||||||
|
也不是 LLM 主观认为“更强”的配置。
|
||||||
|
|
||||||
|
## 形式化设计:bottleneck classifier
|
||||||
|
|
||||||
|
`bottleneck classifier` 把 observation 映射成 ranked bottleneck hypotheses:
|
||||||
|
|
||||||
|
```text
|
||||||
|
b_t = ranked_bottleneck(o_t)
|
||||||
|
```
|
||||||
|
|
||||||
|
它判断的不是“哪个 knob 看起来常用”,而是“当前 SLO failure 和 latency profile
|
||||||
|
说明哪个系统环节在限制 objective”。
|
||||||
|
|
||||||
|
常见分类包括:
|
||||||
|
|
||||||
|
| Bottleneck | 典型证据 | 倾向 knob family |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `ttft_prefill` | 长 prompt 下 TTFT 接近或超过 SLO,prefill service time 是瓶颈 | 提高 TP,调整 prefill batching |
|
||||||
|
| `decode_tpot` | TPOT p95/p99 超 SLO,decode token latency 是瓶颈 | 调整 `max-num-seqs`,提高 TP,降低 decode contention |
|
||||||
|
| `admission_queueing` | waiting/arrival lag 增长,服务时间未必单独变差 | 提高 DP,调整 admission/concurrency knobs |
|
||||||
|
| `memory_kv` | KV cache pressure、preemption、OOM、launch failure | 调整 `gpu-memory-utilization`、`block-size`、sequence/token caps |
|
||||||
|
| `topology_comm` | TP 增加降低 latency 但 per-GPU efficiency 下降 | 回退 TP,比较 DP/TP tradeoff |
|
||||||
|
|
||||||
|
本实验里,两个 harness arms 都把 ranked bottleneck 识别为
|
||||||
|
`ttft_prefill`。原因是 workload 有 heavy-tailed long prompts,并且 TTFT SLO 很紧;
|
||||||
|
这意味着单个请求的 prefill service time 是主要限制。DP-only 只能增加 replica,
|
||||||
|
不能缩短一个长 prompt 的 prefill 路径,因此不是第一优先级。
|
||||||
|
|
||||||
|
## 形式化设计:candidate family
|
||||||
|
|
||||||
|
`candidate family generator` 根据 bottleneck 和 topology constraints 生成可比较的
|
||||||
|
action family:
|
||||||
|
|
||||||
|
```text
|
||||||
|
A_t = candidate_knob_families(
|
||||||
|
b_t,
|
||||||
|
topology_constraints,
|
||||||
|
prior_failures_t,
|
||||||
|
incumbent_t
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
在这个 case 中:
|
||||||
|
|
||||||
|
- `b_t = ttft_prefill`。
|
||||||
|
- 允许的 TP frontier 是 `TP=1 -> TP=2 -> TP=4 -> TP=8`。
|
||||||
|
- 允许的 DP frontier 是 `DP=1,2,4,8`,但 DP-only 不直接缓解单请求 prefill
|
||||||
|
latency。
|
||||||
|
- EP 固定为 1,因此不探索 expert parallel。
|
||||||
|
- 之前没有 failed topology,因此相邻 TP probe launch risk 低。
|
||||||
|
|
||||||
|
所以 harness 选择了:
|
||||||
|
|
||||||
|
```text
|
||||||
|
trial-0001: TP=2, DP=1
|
||||||
|
trial-0002: TP=4, DP=1
|
||||||
|
```
|
||||||
|
|
||||||
|
这不是写死“Qwen27B 应该 TP4”。如果 classifier 输出的是
|
||||||
|
`admission_queueing`,candidate family 会更偏向 DP 或 `max-num-seqs`;如果输出是
|
||||||
|
`memory_kv`,则会更偏向 memory/cache/sequence knobs。
|
||||||
|
|
||||||
|
## 形式化设计:scoring
|
||||||
|
|
||||||
|
每个 candidate action 都按同一个抽象打分:
|
||||||
|
|
||||||
|
```text
|
||||||
|
score(a) = expected_bottleneck_relief(a)
|
||||||
|
+ information_gain(a)
|
||||||
|
+ launch_safety(a)
|
||||||
|
- regression_risk(a)
|
||||||
|
- measurement_cost(a)
|
||||||
|
```
|
||||||
|
|
||||||
|
这些项在本实验里的含义是:
|
||||||
|
|
||||||
|
- `expected_bottleneck_relief`: TP2/TP4 预计能降低 long-prefill compute latency,
|
||||||
|
直接作用于 `ttft_prefill`。
|
||||||
|
- `information_gain`: TP frontier probe 可以区分“需要 compute-latency relief”
|
||||||
|
还是“只是 admission/replica 不够”。
|
||||||
|
- `launch_safety`: TP2/TP4 均满足 topology constraints,没有重复 failed signature。
|
||||||
|
- `regression_risk`: TP 增加会带来通信开销,可能损害 per-GPU efficiency,所以必须用
|
||||||
|
`request_rate_per_gpu` 验证。
|
||||||
|
- `measurement_cost`: 每个 GPU trial 成本高;因此高信息量的 topology probe 优先于
|
||||||
|
多个局部 runtime tweak。
|
||||||
|
|
||||||
|
实际结果验证了这个 scoring:
|
||||||
|
|
||||||
|
| Arm | Trial | Patch | req/s/GPU | Pass rate | 解释 |
|
||||||
|
| --- | ---: | --- | ---: | ---: | --- |
|
||||||
|
| `gpt55_harness` | 1 | `TP=2, DP=1` | 0.2142 | 0.9572 | 相邻 TP probe 已满足 SLO,但仍未饱和 search high。 |
|
||||||
|
| `gpt55_harness` | 2 | `TP=4, DP=1` | 0.4429 | 0.9718 | TP frontier 继续缓解 prefill bottleneck,达到 reference best。 |
|
||||||
|
| `gpt54mini_harness` | 1 | `TP=2, DP=1` | 0.1992 | 0.9707 | 弱模型也选择同一机制路径。 |
|
||||||
|
| `gpt54mini_harness` | 2 | `TP=4, DP=1` | 0.4429 | 0.9727 | 弱模型加 harness 匹配强模型加 harness。 |
|
||||||
|
|
||||||
|
## 形式化设计:validator stop
|
||||||
|
|
||||||
|
Stop 不是 LLM 自己说“我觉得差不多了”。Stop 必须通过 `stop validator`:
|
||||||
|
|
||||||
|
```text
|
||||||
|
stop(o_t, incumbent_t, search_state_t, candidate_set_t) -> true/false
|
||||||
|
```
|
||||||
|
|
||||||
|
本实验里 stop 的记录是:
|
||||||
|
|
||||||
|
```text
|
||||||
|
tuning_stop_reason: harness_stop
|
||||||
|
validator_reason: search_high_saturated_by_incumbent
|
||||||
|
diagnosis: The incumbent's highest measured probe is feasible and is within the
|
||||||
|
configured binary-search resolution of search.high.
|
||||||
|
```
|
||||||
|
|
||||||
|
含义是:
|
||||||
|
|
||||||
|
1. 当前 incumbent 的最高测量 probe 已经 feasible。
|
||||||
|
2. 该 feasible probe 距离 `search.high` 已经在 binary-search tolerance 内。
|
||||||
|
3. 在当前搜索区间和 SLO 约束下,继续花 GPU trial 很难提高 measured objective。
|
||||||
|
4. 因此 validator 授权 stop,并保留当前 incumbent。
|
||||||
|
|
||||||
|
这给 harness 带来了 stop discipline:它既不会因为 LLM 过早自信而随便停,也不会在
|
||||||
|
已经 saturate search high 后继续 burn budget。
|
||||||
|
|
||||||
|
## 实际 tune 了哪些 knobs
|
||||||
|
|
||||||
|
Harness winning path 只改了 topology:
|
||||||
|
|
||||||
|
```text
|
||||||
|
base config + tensor-parallel-size=4, data-parallel-size=1
|
||||||
|
```
|
||||||
|
|
||||||
|
它没有在 winning path 中调 scheduler/cache/memory knobs,因为 `ttft_prefill`
|
||||||
|
bottleneck 下,首要动作是缩短单请求 prefill service time。
|
||||||
|
|
||||||
|
Naive arms 则走了另一个方向:
|
||||||
|
|
||||||
|
| Arm | 所有 trials 使用的 topology | 变化过的 runtime knobs | Best req/s/GPU |
|
||||||
|
| --- | --- | --- | ---: |
|
||||||
|
| `gpt55_naive` | `TP=1, DP=8` | `max-num-batched-tokens`, `max-num-seqs`, `block-size`, `gpu-memory-utilization`, prefix caching, chunked prefill | 0.0273 |
|
||||||
|
| `gpt54mini_naive` | `TP=1, DP=8` | `max-num-batched-tokens`, `max-num-seqs`, `block-size`, `gpu-memory-utilization` | 0.0231 |
|
||||||
|
|
||||||
|
`gpt55_naive` 的第一个 proposal 明确选择 `TP=1, DP=8`,理由是模型能单卡放下,
|
||||||
|
因此 horizontal data parallelism 应该最大化 request rate,而 TP 会带来通信开销。
|
||||||
|
之后 naive proposals 一直保留 DP-heavy topology,只围绕 runtime knobs 搜索。
|
||||||
|
两个 naive arms 合计 20 个 trial slots 都没有进入 TP2/TP4 topology frontier。
|
||||||
|
|
||||||
|
## 为什么比 baseline 更好
|
||||||
|
|
||||||
|
Baseline 失败的原因是优化了错误的因果路径。
|
||||||
|
|
||||||
|
对 `ttft_prefill`-bound workload,关键服务时间是单个请求的 prefill latency。
|
||||||
|
DP-heavy topology 可以增加 replica 数,但每个 replica 仍用 TP1 处理长 prompt;
|
||||||
|
它不能显著缩短单请求 prefill path。在 tight TTFT SLO 下,这会导致 feasible
|
||||||
|
`sampling_u` 很低;再除以 GPU 数得到 `req/s/GPU` 后,结果只有
|
||||||
|
`0.023-0.027 req/s/GPU`。
|
||||||
|
|
||||||
|
Harness 的优化路径是:
|
||||||
|
|
||||||
|
```text
|
||||||
|
observed SLO pressure
|
||||||
|
-> classify as ttft_prefill
|
||||||
|
-> choose legal TP frontier probe
|
||||||
|
-> measure feasible req/s/GPU under the same SLO
|
||||||
|
-> stop only when search.high is saturated by incumbent
|
||||||
|
```
|
||||||
|
|
||||||
|
这条路径是可测量、可反驳的。如果 TP4 降低了 latency 但
|
||||||
|
`request_rate_per_gpu` 明显下降,harness 会 reject 这个 hypothesis。如果
|
||||||
|
bottleneck 是 admission/queueing 而不是 TTFT/prefill,同一个 knob-effect model
|
||||||
|
会偏向 DP 或 `max-num-seqs`,而不是 TP frontier。
|
||||||
|
|
||||||
|
因此,这个结果不是“Qwen27B case 里我们 prompt 诱导模型说 TP4”。更准确的结论是:
|
||||||
|
harness 用 SLO-derived bottleneck evidence 把搜索导向了正确的 knob family,
|
||||||
|
再用 per-GPU objective 和 validator stop 验证这个方向。
|
||||||
|
|
||||||
|
## 证据边界
|
||||||
|
|
||||||
|
这份报告强支撑 Qwen27B tight-SLO case 上的 harness 机制,但不能单独当作通用性证明。
|
||||||
|
当前可成立的结论是:
|
||||||
|
|
||||||
|
- 在这个 case 中,harness 同时提升了 final quality、convergence speed、AUC 和
|
||||||
|
stop discipline。
|
||||||
|
- `gpt-5.4-mini + harness` 匹配 `gpt-5.5 + harness`,并显著超过
|
||||||
|
`gpt-5.5 + naive`,说明收益主要来自 harness 的结构化状态和 validator,而不是
|
||||||
|
单纯来自更强模型。
|
||||||
|
- 成功路径用的是通用机制:SLO-derived bottleneck classification、topology
|
||||||
|
constraints、knob-effect scoring、per-GPU objective、validator-authorized stop。
|
||||||
|
- 还需要在其他 bottleneck/case 上继续验证,例如 prefill scheduler pressure、
|
||||||
|
decode TPOT pressure、memory/KV pressure、admission/queueing pressure。
|
||||||
|
|
||||||
|
## 原始 aggregate report 摘录
|
||||||
|
|
||||||
|
```text
|
||||||
|
# qwen27b-tight-2x2-aggregate-20260623T005838Z
|
||||||
|
|
||||||
|
## Aggregate
|
||||||
|
|
||||||
|
- Cases: `1`
|
||||||
|
- Harness-vs-naive pass/checks: `2`/`2`
|
||||||
|
- Winner counts: `{"final_best": {"gpt55_harness": 1}, "fastest_to_target": {"gpt55_harness": 1}, "normalized_auc": {"gpt55_harness": 1}}`
|
||||||
|
|
||||||
|
## By Kind
|
||||||
|
|
||||||
|
| Kind | Arms | Mean final/ref | Mean AUC | Target reached |
|
||||||
|
| --- | ---: | ---: | ---: | ---: |
|
||||||
|
| `harness` | 2 | 1.0000 | 0.9467 | 2 |
|
||||||
|
| `naive` | 2 | 0.0569 | 0.0543 | 0 |
|
||||||
|
|
||||||
|
## Cases
|
||||||
|
|
||||||
|
### qwen27b-tight-slo-2x2-aggregate
|
||||||
|
|
||||||
|
- Reference best req/s/GPU: `0.4429`
|
||||||
|
- Target fraction: `0.95`
|
||||||
|
- Winners: `{"final_best": "gpt55_harness", "fastest_to_target": "gpt55_harness", "normalized_auc": "gpt55_harness"}`
|
||||||
|
|
||||||
|
| Arm | Kind | Trials | Final/GPU | Final/ref | TTT | AUC | Failed | No feasible |
|
||||||
|
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||||
|
| `gpt55_harness` | `harness` | 2 | 0.4429 | 1.0000 | 2 | 0.9484 | 0 | 0 |
|
||||||
|
| `gpt55_naive` | `naive` | 10 | 0.0273 | 0.0616 | - | 0.0588 | 2 | 2 |
|
||||||
|
| `gpt54mini_harness` | `harness` | 2 | 0.4429 | 1.0000 | 2 | 0.9450 | 0 | 0 |
|
||||||
|
| `gpt54mini_naive` | `naive` | 10 | 0.0231 | 0.0522 | - | 0.0498 | 1 | 1 |
|
||||||
|
|
||||||
|
| Harness | Final vs best naive | Target speedup | AUC vs best naive | Pass |
|
||||||
|
| --- | ---: | ---: | ---: | --- |
|
||||||
|
| `gpt55_harness` | 16.2290 | - | 16.1296 | `True` |
|
||||||
|
| `gpt54mini_harness` | 16.2290 | - | 16.0720 | `True` |
|
||||||
|
```
|
||||||
164
docs/harness-ablation/qwen30b-slo-robustness-20260624.md
Normal file
164
docs/harness-ablation/qwen30b-slo-robustness-20260624.md
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
# Qwen30B SLO robustness - 2026-06-24
|
||||||
|
|
||||||
|
本文整理 Qwen30B-A3B community vLLM 0.20 case 在三档 SLO 下的 harness/naive
|
||||||
|
对比,并解释不同 SLO 为什么没有导致完全不同的最终 topology,却改变了可承载负载边界
|
||||||
|
和 bottleneck 判断。
|
||||||
|
|
||||||
|
原始报告位于远端共享 checkout:
|
||||||
|
|
||||||
|
```text
|
||||||
|
.aituner-reports/qwen30b-slo-robust-gpt55-dash1-20260623T163521Z-strict/report.md
|
||||||
|
.aituner-reports/qwen30b-slo-robust-gpt55-dash1-20260623T163521Z-medium/report.md
|
||||||
|
.aituner-reports/qwen30b-slo-robust-gpt55-dash1-20260623T163521Z-loose/report.md
|
||||||
|
```
|
||||||
|
|
||||||
|
## 实验设计
|
||||||
|
|
||||||
|
Case: `qwen30b-a3b-slo-{strict,medium,loose}-gpt55`。
|
||||||
|
|
||||||
|
共同设置:
|
||||||
|
|
||||||
|
- Served model: Qwen30B-A3B community vLLM 0.20。
|
||||||
|
- Hardware: H20,允许 1/2/4/8 GPU topology。
|
||||||
|
- Trace: chat 0-8k,输出长度 128。
|
||||||
|
- Search: `sampling_u in [0, 1.0]`,tolerance 0.001,max 6 probes。
|
||||||
|
- Objective: 在 pass rate >= 0.95 下最大化 `request_rate / used_gpu_count`。
|
||||||
|
- Tuner model: `gpt-5.5`。
|
||||||
|
|
||||||
|
三档 SLO:
|
||||||
|
|
||||||
|
| SLO | TTFT step rule | TPOT |
|
||||||
|
| --- | --- | ---: |
|
||||||
|
| strict | <=4k: 1s, <=32k: 2s, else: 3s | 40 ms |
|
||||||
|
| medium | <=4k: 2s, <=32k: 4s, else: 6s | 50 ms |
|
||||||
|
| loose | <=4k: 4s, <=32k: 8s, else: 12s | 70 ms |
|
||||||
|
|
||||||
|
## 结果摘要
|
||||||
|
|
||||||
|
| SLO | Harness final req/s/GPU | Naive final req/s/GPU | Final speedup | AUC speedup | Harness TTT |
|
||||||
|
| --- | ---: | ---: | ---: | ---: | ---: |
|
||||||
|
| strict | 2.2083 | 0.8000 | 2.7604x | 2.7886x | 1 |
|
||||||
|
| medium | 3.2583 | 0.8000 | 4.0729x | 4.0729x | 1 |
|
||||||
|
| loose | 3.2583 | 1.0458 | 3.1155x | 4.4622x | 1 |
|
||||||
|
|
||||||
|
三个 SLO 下 harness 都在第一个 trial 到达该 SLO 下的 reference best。naive 在 8 个
|
||||||
|
trials 内没有达到 95% reference target。
|
||||||
|
|
||||||
|
## 最终 tune 出来的配置
|
||||||
|
|
||||||
|
三档 SLO 的最终 best topology 都是:
|
||||||
|
|
||||||
|
```text
|
||||||
|
tensor-parallel-size = 2
|
||||||
|
data-parallel-size = 1
|
||||||
|
enable-expert-parallel = false
|
||||||
|
```
|
||||||
|
|
||||||
|
但这不表示 SLO 没有影响。SLO 改变的是同一个 topology 的可行负载上限:
|
||||||
|
|
||||||
|
| SLO | Best config | Best sampling_u | Total req/s | req/s/GPU | Pass rate |
|
||||||
|
| --- | --- | ---: | ---: | ---: | ---: |
|
||||||
|
| strict | `TP=2, DP=1` | 0.484375 | 4.4167 | 2.2083 | 1.0000 |
|
||||||
|
| medium | `TP=2, DP=1` | 0.750000 | 6.5167 | 3.2583 | 1.0000 |
|
||||||
|
| loose | `TP=2, DP=1` | 0.750000 | 6.5167 | 3.2583 | 1.0000 |
|
||||||
|
|
||||||
|
strict 到 medium/loose 的主要变化是 feasible frontier 右移:同一个 `TP=2, DP=1`
|
||||||
|
配置在 strict 下只能稳定承载 `sampling_u=0.484375`,在 medium/loose 下可以承载
|
||||||
|
`sampling_u=0.75`。
|
||||||
|
|
||||||
|
## 为什么 `TP=2, DP=1` 稳定胜出
|
||||||
|
|
||||||
|
AITuner 的 scoring 不是 raw throughput,而是 SLO-constrained per-GPU throughput:
|
||||||
|
|
||||||
|
```text
|
||||||
|
J(c, SLO) = max_u request_rate(c, u) / used_gpu_count(c)
|
||||||
|
subject to pass_rate(c, u, SLO) >= 0.95
|
||||||
|
```
|
||||||
|
|
||||||
|
这解释了为什么 `TP=4` 没有赢。`TP=4` 的单请求 latency 更低、总吞吐可以更高,
|
||||||
|
但它使用两倍 GPU,per-GPU objective 反而下降:
|
||||||
|
|
||||||
|
| SLO | Config | Total req/s | Used GPUs | req/s/GPU | 解释 |
|
||||||
|
| --- | --- | ---: | ---: | ---: | --- |
|
||||||
|
| strict | `TP=2, DP=1` | 4.4167 | 2 | 2.2083 | strict best |
|
||||||
|
| strict | `TP=4, DP=1` | 4.4167 | 4 | 1.1042 | latency 更低,但 GPU efficiency 更差 |
|
||||||
|
| medium/loose | `TP=2, DP=1` | 6.5167 | 2 | 3.2583 | medium/loose best |
|
||||||
|
| medium/loose | `TP=4, DP=1` | 8.3667 | 4 | 2.0917 | raw throughput 更高,但 per-GPU 不划算 |
|
||||||
|
|
||||||
|
因此 harness 学到的不是“越多 GPU 越好”,而是更具体的机制:
|
||||||
|
|
||||||
|
```text
|
||||||
|
TP=1: 单请求 prefill/decode latency 偏高,SLO-constrained load frontier 低。
|
||||||
|
TP=2: 足够缓解 latency,同时 GPU 数量仍低,per-GPU objective 最优。
|
||||||
|
TP=4: 继续降低 latency,但通信和 GPU 数量成本超过收益。
|
||||||
|
```
|
||||||
|
|
||||||
|
## SLO 改变 bottleneck 的方式
|
||||||
|
|
||||||
|
strict 下,`TP=2, DP=1` 在 `sampling_u=0.484375` 可行,但下一档
|
||||||
|
`sampling_u=0.5` 直接进入 queueing collapse:
|
||||||
|
|
||||||
|
| Point | Pass rate | 主要失败原因 |
|
||||||
|
| --- | ---: | --- |
|
||||||
|
| strict, `u=0.484375` | 1.0000 | 无 |
|
||||||
|
| strict, `u=0.5` | 0.0290 | `tpot_ms>40`, `ttft_ms>1000/2000`, `slo_pass_rate_unrecoverable` |
|
||||||
|
|
||||||
|
medium/loose 下,TTFT 和 TPOT 阈值放宽后,同一 topology 能承载更高 arrival intensity。
|
||||||
|
但是在 `u=0.765625` 仍会进入不可恢复的排队区:
|
||||||
|
|
||||||
|
| SLO | Feasible point | Next infeasible point | 主要失败原因 |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| medium | `u=0.75`, pass 1.0000 | `u=0.765625`, pass 0.6900 | `tpot_ms>50`, `slo_pass_rate_unrecoverable` |
|
||||||
|
| loose | `u=0.75`, pass 1.0000 | `u=0.765625`, pass 0.2900 | `tpot_ms>70`, `slo_pass_rate_unrecoverable` |
|
||||||
|
|
||||||
|
这说明 SLO 放宽不是无限提高吞吐。服务系统还有 queueing stability frontier;
|
||||||
|
超过 frontier 后,即使单个请求的 steady-state latency 看起来可控,排队也会让 pass rate
|
||||||
|
迅速崩掉。
|
||||||
|
|
||||||
|
## 其他候选配置的信号
|
||||||
|
|
||||||
|
`TP=1, DP=1` 对 SLO 更敏感:
|
||||||
|
|
||||||
|
| SLO | `TP=1, DP=1` req/s/GPU | 解释 |
|
||||||
|
| --- | ---: | --- |
|
||||||
|
| strict | 2.2000 | 接近 strict best,但略低于 `TP=2` |
|
||||||
|
| medium | 2.2000 | 仍低于 `TP=2` |
|
||||||
|
| loose | 2.8500 | 宽松 SLO 下受益明显,但仍低于 `TP=2` |
|
||||||
|
|
||||||
|
`gpu-memory-utilization=0.92` 在 medium/loose 中与 `TP=2` 打平:
|
||||||
|
|
||||||
|
| SLO | Config | req/s/GPU |
|
||||||
|
| --- | --- | ---: |
|
||||||
|
| medium | `TP=2, gpu-memory-utilization=0.92` | 3.2583 |
|
||||||
|
| loose | `TP=2, gpu-memory-utilization=0.92` | 3.2583 |
|
||||||
|
|
||||||
|
这说明该 workload 的主瓶颈不是 KV memory headroom,而是 topology 和 queueing
|
||||||
|
frontier。
|
||||||
|
|
||||||
|
EP family 在该环境下不稳定:
|
||||||
|
|
||||||
|
```text
|
||||||
|
TP=4, EP=2/4, enable-expert-parallel=true -> engine_launch exit_code=2
|
||||||
|
```
|
||||||
|
|
||||||
|
这些失败 trial 没有进入 best candidate,但它们说明当前 failure memory 还可以继续加强:
|
||||||
|
同一类 EP launch failure 出现后,后续 proposal 应更积极地屏蔽该 family。
|
||||||
|
|
||||||
|
## 对 paper claim 的含义
|
||||||
|
|
||||||
|
这组实验支持的 claim 是:
|
||||||
|
|
||||||
|
1. Harness 对 SLO 变化有稳定收益:strict/medium/loose 三档均显著优于 naive。
|
||||||
|
2. Harness 不是固定写死某个 knob。它通过 SLO-constrained probing 找到 feasible
|
||||||
|
frontier;在本 case 中最终 topology 相同,但可承载负载边界随 SLO 改变。
|
||||||
|
3. Harness 的 value 来自 topology-first candidate family、per-GPU scoring 和
|
||||||
|
validator 对 failed family 的处理,而不是自然语言 prompt 的偶然表达。
|
||||||
|
|
||||||
|
这组实验尚不能单独 claim:
|
||||||
|
|
||||||
|
- 所有模型和 workload 上都 robust。
|
||||||
|
- `TP=2, DP=1` 是全局最优。
|
||||||
|
- EP family 已经被最优处理。
|
||||||
|
|
||||||
|
对应的后续证据应放在 roadmap 中跟踪:局部 grid/near-optimum、跨模型 2x2、跨 workload
|
||||||
|
SLO robustness,以及 failure-memory ablation。
|
||||||
@@ -51,6 +51,13 @@ enabled = true
|
|||||||
sync_remote_path = "~/aituner"
|
sync_remote_path = "~/aituner"
|
||||||
fleet_root = "~/.aituner_gpu_fleet"
|
fleet_root = "~/.aituner_gpu_fleet"
|
||||||
|
|
||||||
|
[[hosts]]
|
||||||
|
name = "dash4"
|
||||||
|
ssh_alias = "dash4"
|
||||||
|
enabled = true
|
||||||
|
sync_remote_path = "~/workspace/aituner"
|
||||||
|
fleet_root = "~/.aituner_gpu_fleet"
|
||||||
|
|
||||||
[[hosts]]
|
[[hosts]]
|
||||||
name = "dash5"
|
name = "dash5"
|
||||||
ssh_alias = "dash5"
|
ssh_alias = "dash5"
|
||||||
|
|||||||
@@ -4,5 +4,5 @@ dash0
|
|||||||
dash1
|
dash1
|
||||||
dash2
|
dash2
|
||||||
dash3
|
dash3
|
||||||
|
dash4
|
||||||
dash5
|
dash5
|
||||||
|
|
||||||
|
|||||||
@@ -10,22 +10,37 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def topo(patch):
|
TOPOLOGY_KEYS = (
|
||||||
|
("tensor-parallel-size", "TP"),
|
||||||
|
("data-parallel-size", "DP"),
|
||||||
|
("expert-parallel-size", "EP"),
|
||||||
|
)
|
||||||
|
|
||||||
|
RUNTIME_KEYS = (
|
||||||
|
"gpu-memory-utilization",
|
||||||
|
"enable-chunked-prefill",
|
||||||
|
"max-num-batched-tokens",
|
||||||
|
"max-num-seqs",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def topo(patch, base_flags=None):
|
||||||
fp = (patch or {}).get("flag_patch", {}) or {}
|
fp = (patch or {}).get("flag_patch", {}) or {}
|
||||||
ep = (patch or {}).get("env_patch", {}) or {}
|
ep = (patch or {}).get("env_patch", {}) or {}
|
||||||
|
effective = dict(base_flags or {})
|
||||||
|
effective.update(fp)
|
||||||
parts = []
|
parts = []
|
||||||
for k, label in (
|
for k, label in TOPOLOGY_KEYS:
|
||||||
("tensor-parallel-size", "TP"),
|
if k in effective:
|
||||||
("data-parallel-size", "DP"),
|
parts.append(f"{label}{effective[k]}")
|
||||||
("expert-parallel-size", "EP"),
|
runtime = {k: effective[k] for k in RUNTIME_KEYS if k in effective}
|
||||||
):
|
runtime.update(
|
||||||
if k in fp:
|
{
|
||||||
parts.append(f"{label}{fp[k]}")
|
k: v
|
||||||
runtime = {
|
for k, v in fp.items()
|
||||||
k: v
|
if k not in {key for key, _ in TOPOLOGY_KEYS} and k not in runtime
|
||||||
for k, v in fp.items()
|
}
|
||||||
if k not in ("tensor-parallel-size", "data-parallel-size", "expert-parallel-size")
|
)
|
||||||
}
|
|
||||||
runtime.update({f"env:{k}": v for k, v in ep.items()})
|
runtime.update({f"env:{k}": v for k, v in ep.items()})
|
||||||
base = "+".join(parts) if parts else "baseline-topo"
|
base = "+".join(parts) if parts else "baseline-topo"
|
||||||
if runtime:
|
if runtime:
|
||||||
@@ -36,6 +51,11 @@ def topo(patch):
|
|||||||
def main():
|
def main():
|
||||||
store = Path(sys.argv[1])
|
store = Path(sys.argv[1])
|
||||||
state = json.load(open(store / "state.json"))
|
state = json.load(open(store / "state.json"))
|
||||||
|
snapshot_path = store / "study_spec.snapshot.json"
|
||||||
|
base_flags = {}
|
||||||
|
if snapshot_path.exists():
|
||||||
|
snapshot = json.load(open(snapshot_path))
|
||||||
|
base_flags = ((snapshot.get("engine") or {}).get("base_flags") or {})
|
||||||
print(f"study_id: {state.get('study_id')}")
|
print(f"study_id: {state.get('study_id')}")
|
||||||
print(f"best_trial: {state.get('best_trial_id')} best_per_gpu: {state.get('best_request_rate_per_gpu')}")
|
print(f"best_trial: {state.get('best_trial_id')} best_per_gpu: {state.get('best_request_rate_per_gpu')}")
|
||||||
print(f"stop_reason: {state.get('tuning_stop_reason')!r}")
|
print(f"stop_reason: {state.get('tuning_stop_reason')!r}")
|
||||||
@@ -53,7 +73,7 @@ def main():
|
|||||||
pgs = f"{pg:.4f}" if isinstance(pg, (int, float)) else str(pg)
|
pgs = f"{pg:.4f}" if isinstance(pg, (int, float)) else str(pg)
|
||||||
incs = f"{incumbent:.4f}" if isinstance(incumbent, (int, float)) else str(incumbent)
|
incs = f"{incumbent:.4f}" if isinstance(incumbent, (int, float)) else str(incumbent)
|
||||||
print(
|
print(
|
||||||
f"{i:<5}{t.get('trial_id',''):<11}{str(t.get('status','')):<14}{pgs:<10}{incs:<11}{topo(t.get('config_patch'))}"
|
f"{i:<5}{t.get('trial_id',''):<11}{str(t.get('status','')):<14}{pgs:<10}{incs:<11}{topo(t.get('config_patch'), base_flags)}"
|
||||||
)
|
)
|
||||||
# also dump proposals dir to see what was *proposed* (incl. vetoed/failed)
|
# also dump proposals dir to see what was *proposed* (incl. vetoed/failed)
|
||||||
pdir = store / "proposals"
|
pdir = store / "proposals"
|
||||||
@@ -64,7 +84,7 @@ def main():
|
|||||||
pr = json.load(open(p))
|
pr = json.load(open(p))
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
print(f" {p.stem}: should_stop={pr.get('should_stop')} | {topo(pr.get('config_patch'))}")
|
print(f" {p.stem}: should_stop={pr.get('should_stop')} | {topo(pr.get('config_patch'), base_flags)}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
99
scripts/calibrate_time_scale.py
Normal file
99
scripts/calibrate_time_scale.py
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Criterion-A time_scale calibration.
|
||||||
|
|
||||||
|
Binary-search the smallest replay_time_scale whose A-family L-C-A similarity to the
|
||||||
|
real (scale=1.0) arrival process stays >= tau. Uniform time scaling distorts only
|
||||||
|
the A axis (rate + fano; interarrival CV is scale-invariant), so this bounds the
|
||||||
|
arrival-axis distortion introduced by compression using the same similarity metric
|
||||||
|
Stop-A uses. Pure trace metadata -> deterministic, no GPU needed.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
PYTHONPATH=src python3 scripts/calibrate_time_scale.py \
|
||||||
|
--trace trace_windows/traces/chat_w20260311_1000.jsonl \
|
||||||
|
--gpu-count 8 --min-input 0 --max-input 8192 --tau 0.9
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from aituner.lca import _family_similarity, build_workload_profile
|
||||||
|
from aituner.trace import TraceRequest, WindowRecord
|
||||||
|
|
||||||
|
|
||||||
|
def load_rows(path: Path, lo: int, hi: int) -> list[dict]:
    """Read a JSONL trace and keep the rows whose input length is in range.

    Args:
        path: JSONL file with one JSON object per non-blank line.
        lo: Smallest ``input_length`` to keep (inclusive).
        hi: Largest ``input_length`` to keep (inclusive).

    Returns:
        The decoded rows, in file order, whose ``int(row["input_length"])``
        lies in ``[lo, hi]``.
    """
    # Decode the whole file first, then filter, so a malformed line is
    # reported before any row is inspected for its length.
    decoded = []
    with path.open(encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue  # tolerate blank / whitespace-only lines
            decoded.append(json.loads(raw_line))

    kept = []
    for record in decoded:
        if lo <= int(record["input_length"]) <= hi:
            kept.append(record)
    return kept
|
||||||
|
|
||||||
|
|
||||||
|
def build_requests(rows: list[dict]) -> tuple[list[TraceRequest], float, float]:
    """Turn decoded trace rows into ``TraceRequest`` objects.

    Each row contributes one request: ``chat_id`` (or the row index when it is
    absent) becomes the row id, ``timestamp`` the arrival time, ``sampling_u``
    defaults to 0.0, and ``input_length`` / ``output_length`` become the token
    hints. ``hash_ids`` is carried in the metadata only when it is a list.

    Args:
        rows: Decoded trace rows, already filtered by input length.

    Returns:
        ``(requests, earliest_arrival_s, latest_arrival_s)``.

    Raises:
        ValueError: If ``rows`` is empty, since no arrival span can be computed.
    """
    if not rows:
        # Without this guard the min()/max() below fail with the opaque
        # "min() arg is an empty sequence"; say what actually went wrong.
        raise ValueError(
            "no trace rows to calibrate on; check the trace path and the "
            "--min-input/--max-input bounds"
        )

    reqs = []
    for i, r in enumerate(rows):
        hash_ids = r.get("hash_ids")
        reqs.append(
            TraceRequest(
                row_id=str(r.get("chat_id", i)),
                arrival_s=float(r["timestamp"]),
                sampling_u=float(r.get("sampling_u", 0.0)),
                body={},
                prompt_tokens_hint=int(r["input_length"]),
                completion_tokens_hint=int(r["output_length"]),
                metadata={"hash_ids": hash_ids if isinstance(hash_ids, list) else None},
            )
        )
    amin = min(x.arrival_s for x in reqs)
    amax = max(x.arrival_s for x in reqs)
    return reqs, amin, amax
|
||||||
|
|
||||||
|
|
||||||
|
def profile_at(reqs, amin, amax, gpu_count, scale):
    """Build the workload profile of the trace replayed at ``scale``.

    Every request is copied with its arrival time shifted so that ``amin``
    maps to 0 and then multiplied by ``scale``; all other fields are passed
    through unchanged. The profile is computed over a synthetic window that
    runs from 0 to ``(amax - amin) * scale``, with a hard-coded
    ``block_size`` of 64, trace type ``"chat"`` and ``length_mode="total"``.

    Args:
        reqs: Requests as returned by ``build_requests``.
        amin: Earliest raw arrival time in ``reqs``.
        amax: Latest raw arrival time in ``reqs``.
        gpu_count: Forwarded to ``build_workload_profile``.
        scale: Uniform time-scale factor applied to the arrival axis.

    Returns:
        Whatever ``build_workload_profile`` returns for the rescaled trace.
    """
    rescaled = []
    for req in reqs:
        rescaled.append(
            TraceRequest(
                req.row_id,
                (req.arrival_s - amin) * scale,
                req.sampling_u,
                req.body,
                req.prompt_tokens_hint,
                req.completion_tokens_hint,
                req.metadata,
            )
        )

    window = WindowRecord(
        window_id="w",
        trace_path="",
        trace_type="chat",
        window_start=0.0,
        window_end=(amax - amin) * scale,
        source_payload={"block_size": 64},
    )
    return build_workload_profile(
        rescaled, window, gpu_count=gpu_count, length_mode="total"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Print an A-similarity sweep, then bisect for the smallest usable scale.

    The sweep prints one row per fixed scale: the A-family similarity to the
    scale=1.0 profile, ``expm1`` of profile vector entries 7 and 9 (labelled
    ``rate/gpu`` and ``fano`` -- presumably stored as log1p values; confirm
    against ``build_workload_profile``), and the compressed arrival span.
    The bisection then runs 40 halvings on [0.05, 1.0], keeping ``high`` at a
    scale whose similarity is still >= ``--tau``.

    Returns:
        0 on completion (used as the process exit status).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--trace", type=Path, required=True)
    for flag, kind, default in (
        ("--gpu-count", int, 8),
        ("--min-input", int, 0),
        ("--max-input", int, 8192),
        ("--tau", float, 0.9),
    ):
        parser.add_argument(flag, type=kind, default=default)
    args = parser.parse_args()

    reqs, amin, amax = build_requests(
        load_rows(args.trace, args.min_input, args.max_input)
    )
    raw_span = amax - amin
    print(f"n={len(reqs)} raw arrival span={raw_span:.1f}s")
    base = profile_at(reqs, amin, amax, args.gpu_count, 1.0)

    def similarity_at(scale):
        """Return (profile, A-family similarity to the unscaled profile)."""
        profile = profile_at(reqs, amin, amax, args.gpu_count, scale)
        return profile, _family_similarity(base.vector, profile.vector)["A"]

    print(f"{'scale':>6} {'simA':>7} {'rate/gpu':>9} {'fano':>8} {'span_s':>8}")
    for scale in (1.0, 0.95, 0.9, 0.85, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2):
        profile, sim = similarity_at(scale)
        rate = math.expm1(profile.vector[7])
        fano = math.expm1(profile.vector[9])
        print(f"{scale:6.2f} {sim:7.3f} {rate:9.3f} {fano:8.2f} {raw_span * scale:8.1f}")

    # Invariant: `high` always satisfies sim >= tau (scale 1.0 compares the
    # profile with itself), `low` is the largest scale seen to violate it.
    low, high = 0.05, 1.0
    for _ in range(40):
        mid = (low + high) / 2
        _, sim = similarity_at(mid)
        if sim >= args.tau:
            high = mid
        else:
            low = mid
    print(f"\nsmallest scale with simA>={args.tau}: {high:.4f} (arrival span {raw_span * high:.0f}s)")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
raise SystemExit(main())
|
||||||
31
scripts/run_ablation_pair_d1.sh
Normal file
31
scripts/run_ablation_pair_d1.sh
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
#!/usr/bin/env bash
# 12-iteration harness-vs-naive ablation with both arms on dash1, so the pair
# is clean and carries no host confound. Substrate: real output_length (no
# completion override), replay_time_scale=0.8775 (criterion-A, sim_A>=0.90),
# Stop-A on (LCA offered window), per-probe Stop-A-consistent drain deadline.
# The harness arm stops early; the naive arm runs the full budget.
# Run from the repo root on dash1.
set -u

# Pull the codex token out of auth.json. Called right before each arm because
# a token captured once at launch goes stale during a long run -- that is what
# 401'd naive runs 2/3.
read_key() {
  OPENAI_API_KEY=$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])')
  export OPENAI_API_KEY
}

# Run one arm: refresh the key, then tune with a 12-trial budget, sending all
# output to "<store>.log".
#   $1 banner label, $2 study spec, $3 store root
run_arm() {
  local banner="$1" spec="$2" store="$3"
  read_key
  echo "=== ${banner} (12-iter) start $(date -Is) ==="
  PYTHONPATH=src python3 -m aituner.cli study tune \
    --spec "${spec}" \
    --store-root "${store}" --max-trials 12 --skip-baseline > "${store}.log" 2>&1
  echo "=== ${banner} (12-iter) done $(date -Is) ==="
}

# codex config.toml points at a dash0-local proxy (127.0.0.1:11235); on dash1
# the LLM endpoint is reachable directly, so force a direct connection.
for proxy_var in http_proxy https_proxy all_proxy HTTP_PROXY HTTPS_PROXY ALL_PROXY; do
  export "${proxy_var}="
done
export no_proxy='*'

# Start from a clean slate: drop both stores and the completion marker.
mkdir -p .aituner
rm -rf .aituner/abl12-harness .aituner/abl12-naive .aituner/ABLATION12_DONE

run_arm "harness ON" configs/examples/dash0_qwen27b_ablation_harness_on.json .aituner/abl12-harness

run_arm "naive OFF" configs/examples/dash0_qwen27b_ablation_naive_off.json .aituner/abl12-naive

touch .aituner/ABLATION12_DONE
|
||||||
81
scripts/run_clean_ablation_pair_d1.sh
Normal file
81
scripts/run_clean_ablation_pair_d1.sh
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
#!/usr/bin/env bash
# Clean same-policy harness-vs-naive ablation on dash1.
#
# This is intended as the first robustness gate for harness evaluation:
# both arms use the same study substrate and the same configured LLM endpoint;
# the only intended difference is llm.use_harness.
#
# Optional env:
#   AITUNER_RUN_ID   run label (defaults to a UTC-timestamped label)
#   MAX_TRIALS       per-arm trial budget (defaults to 12)
#   OPENAI_API_KEY   if set by the caller it is used as-is; otherwise the key
#                    is read from ~/.codex/auth.json before every arm
set -euo pipefail

RUN_LABEL="${AITUNER_RUN_ID:-qwen27b-clean-pair-$(date -u +%Y%m%dT%H%M%SZ)}"
MAX_TRIALS="${MAX_TRIALS:-12}"
ROOT="$(pwd)"
HARNESS_STORE=".aituner/${RUN_LABEL}-harness"
NAIVE_STORE=".aituner/${RUN_LABEL}-naive"
REPORT_ROOT=".aituner-reports/${RUN_LABEL}"
SPEC_PATH=".aituner-reports/${RUN_LABEL}.spec.json"

# Make OPENAI_API_KEY available for the next arm.
#
# A key supplied by the caller is left untouched. A key that came from
# auth.json is re-read on EVERY call: the previous "only if empty" guard made
# the second call a no-op, so the naive arm ran with the token captured before
# the harness arm, which can be stale after a long run.
read_key() {
  # First call only: remember where the key comes from.
  if [ -z "${AITUNER_KEY_FROM_AUTH_JSON:-}" ]; then
    if [ -n "${OPENAI_API_KEY:-}" ]; then
      AITUNER_KEY_FROM_AUTH_JSON=no
    else
      AITUNER_KEY_FROM_AUTH_JSON=yes
    fi
  fi
  if [ "${AITUNER_KEY_FROM_AUTH_JSON}" = yes ]; then
    # Assign first, export second, so a failing read aborts under `set -e`.
    OPENAI_API_KEY="$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])')"
    export OPENAI_API_KEY
  fi
}

# Clear every proxy variable and exclude all hosts from proxying.
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'
mkdir -p .aituner .aituner-reports
# Start from a clean slate for this run label.
rm -rf "${HARNESS_STORE}" "${NAIVE_STORE}" "${REPORT_ROOT}" "${SPEC_PATH}"

read_key
echo "=== harness ON clean pair start $(date -Is) label=${RUN_LABEL} ==="
PYTHONPATH=src python3 -m aituner.cli study tune \
  --spec configs/examples/dash0_qwen27b_ablation_harness_on.json \
  --store-root "${HARNESS_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
  > ".aituner/${RUN_LABEL}-harness.log" 2>&1
echo "=== harness ON clean pair done $(date -Is) ==="

read_key
echo "=== naive OFF clean pair start $(date -Is) label=${RUN_LABEL} ==="
PYTHONPATH=src python3 -m aituner.cli study tune \
  --spec configs/examples/dash0_qwen27b_ablation_naive_off.json \
  --store-root "${NAIVE_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
  > ".aituner/${RUN_LABEL}-naive.log" 2>&1
echo "=== naive OFF clean pair done $(date -Is) ==="

# Write the report spec. The heredoc is unquoted on purpose: the shell expands
# ${ROOT}, ${RUN_LABEL}, ... into the Python source, and "\\n" reaches Python
# as "\n".
python3 - <<PY
import json
from pathlib import Path

root = Path("${ROOT}")
run_label = "${RUN_LABEL}"
spec = {
    "report_id": run_label,
    "output_root": str(root / "${REPORT_ROOT}"),
    "target_fraction": 0.95,
    "min_final_ratio": 0.98,
    "cases": [
        {
            "case_id": "qwen27b-chat-0-8k-clean-gpt55",
            "description": "Clean same-policy gpt-5.5 harness-vs-naive pair on dash1.",
            "tags": ["qwen27b", "chat", "0-8k", "h20", "clean-pair", "gpt-5.5"],
            "budgets": [1, 2, 3, 4, 6, 8, 12],
            "arms": [
                {
                    "name": "harness",
                    "kind": "harness",
                    "study_root": str(root / "${HARNESS_STORE}" / "dash0-qwen27b-ablation-harness-on"),
                },
                {
                    "name": "naive",
                    "kind": "naive",
                    "study_root": str(root / "${NAIVE_STORE}" / "dash0-qwen27b-ablation-naive-off"),
                },
            ],
        }
    ],
}
Path("${SPEC_PATH}").write_text(json.dumps(spec, indent=2) + "\\n", encoding="utf-8")
PY

PYTHONPATH=src python3 scripts/tuning_report.py --spec "${SPEC_PATH}"
touch ".aituner/${RUN_LABEL}.DONE"
echo "=== clean pair report ready ${REPORT_ROOT} $(date -Is) ==="
|
||||||
177
scripts/run_clean_pair_from_specs.sh
Executable file
177
scripts/run_clean_pair_from_specs.sh
Executable file
@@ -0,0 +1,177 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Run a clean same-policy harness-vs-naive pair from one or two base specs.
|
||||||
|
#
|
||||||
|
# Required env:
|
||||||
|
# RUN_LABEL
|
||||||
|
# CASE_ID
|
||||||
|
# HARNESS_BASE_SPEC
|
||||||
|
#
|
||||||
|
# Optional env:
|
||||||
|
# NAIVE_BASE_SPEC defaults to HARNESS_BASE_SPEC
|
||||||
|
# MAX_TRIALS defaults to 12
|
||||||
|
# CASE_DESCRIPTION
|
||||||
|
# CASE_TAGS_JSON JSON list, defaults to []
|
||||||
|
# BUDGETS_JSON JSON list, defaults to [1,2,3,4,6,8,MAX_TRIALS]
|
||||||
|
# COMMON_SPEC_PATCH_FILE JSON deep-merged into both generated specs
|
||||||
|
# HARNESS_SPEC_PATCH_FILE JSON deep-merged into harness generated spec
|
||||||
|
# NAIVE_SPEC_PATCH_FILE JSON deep-merged into naive generated spec
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
RUN_LABEL="${RUN_LABEL:?RUN_LABEL is required}"
|
||||||
|
CASE_ID="${CASE_ID:?CASE_ID is required}"
|
||||||
|
HARNESS_BASE_SPEC="${HARNESS_BASE_SPEC:?HARNESS_BASE_SPEC is required}"
|
||||||
|
NAIVE_BASE_SPEC="${NAIVE_BASE_SPEC:-${HARNESS_BASE_SPEC}}"
|
||||||
|
MAX_TRIALS="${MAX_TRIALS:-12}"
|
||||||
|
CASE_DESCRIPTION="${CASE_DESCRIPTION:-Clean same-policy harness-vs-naive pair.}"
|
||||||
|
CASE_TAGS_JSON="${CASE_TAGS_JSON:-[]}"
|
||||||
|
BUDGETS_JSON="${BUDGETS_JSON:-}"
|
||||||
|
|
||||||
|
ROOT="$(pwd)"
|
||||||
|
RUN_CONFIG_ROOT=".aituner-run-configs/${RUN_LABEL}"
|
||||||
|
HARNESS_SPEC="${RUN_CONFIG_ROOT}/harness.json"
|
||||||
|
NAIVE_SPEC="${RUN_CONFIG_ROOT}/naive.json"
|
||||||
|
HARNESS_STORE=".aituner/${RUN_LABEL}-harness"
|
||||||
|
NAIVE_STORE=".aituner/${RUN_LABEL}-naive"
|
||||||
|
REPORT_ROOT=".aituner-reports/${RUN_LABEL}"
|
||||||
|
REPORT_SPEC=".aituner-reports/${RUN_LABEL}.spec.json"
|
||||||
|
export RUN_LABEL CASE_ID HARNESS_BASE_SPEC NAIVE_BASE_SPEC MAX_TRIALS CASE_DESCRIPTION
|
||||||
|
export CASE_TAGS_JSON BUDGETS_JSON ROOT RUN_CONFIG_ROOT HARNESS_SPEC NAIVE_SPEC
|
||||||
|
export HARNESS_STORE NAIVE_STORE REPORT_ROOT REPORT_SPEC
|
||||||
|
|
||||||
|
# Ensure OPENAI_API_KEY is exported for the tuning runs. A non-empty value that
# is already present in the environment is kept; otherwise the key is read from
# the "OPENAI_API_KEY" field of ~/.codex/auth.json.
read_key() {
  if [ -n "${OPENAI_API_KEY:-}" ]; then
    return 0
  fi
  # Mark the variable for export first and assign on its own line, so the exit
  # status of the command substitution is the status of the assignment.
  export OPENAI_API_KEY
  OPENAI_API_KEY="$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])')"
}
|
||||||
|
|
||||||
|
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'
|
||||||
|
mkdir -p "${RUN_CONFIG_ROOT}" .aituner .aituner-reports
|
||||||
|
rm -rf "${HARNESS_STORE}" "${NAIVE_STORE}" "${REPORT_ROOT}" "${REPORT_SPEC}"
|
||||||
|
|
||||||
|
python3 - <<'PY'
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def deep_merge(base: dict[str, Any], patch: dict[str, Any]) -> dict[str, Any]:
    """Return a new dict with ``patch`` layered recursively on top of ``base``.

    When a key holds a dict on both sides the two dicts are merged
    recursively; in every other case the value from ``patch`` replaces the
    value from ``base``. ``base`` itself is not modified (its top level is
    copied before any key is written).
    """
    result = dict(base)
    for name, incoming in patch.items():
        existing = result.get(name)
        both_are_dicts = isinstance(incoming, dict) and isinstance(existing, dict)
        result[name] = deep_merge(existing, incoming) if both_are_dicts else incoming
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def load_patch(env_name: str) -> dict[str, Any]:
    """Load the JSON patch file named by the environment variable ``env_name``.

    Returns an empty dict when the variable is unset or empty. Otherwise the
    file is read as UTF-8 JSON and must decode to an object.

    Raises:
        SystemExit: if the decoded JSON value is not a dict.
    """
    location = os.environ.get(env_name)
    if location:
        data = json.loads(Path(location).read_text(encoding="utf-8"))
        if isinstance(data, dict):
            return data
        raise SystemExit(f"{env_name} must point to a JSON object")
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def generated_spec(base_path: str, *, use_harness: bool, suffix: str, arm_patch: dict[str, Any]) -> dict[str, Any]:
    """Build the generated spec for one arm of the harness-vs-naive pair.

    The base spec at ``base_path`` is read as UTF-8 JSON, then the patch named
    by COMMON_SPEC_PATCH_FILE and finally ``arm_patch`` are deep-merged on top
    of it, in that order. The resulting ``study_id`` (or the CASE_ID
    environment variable when the spec has no truthy one) gets ``-<suffix>``
    appended, and ``llm.use_harness`` is forced to ``use_harness``.

    Raises:
        SystemExit: if the base spec does not decode to a JSON object.
    """
    document = json.loads(Path(base_path).read_text(encoding="utf-8"))
    if not isinstance(document, dict):
        raise SystemExit(f"{base_path} must contain a JSON object")

    # Layering order matters: the arm-specific patch wins over the common one.
    with_common = deep_merge(document, load_patch("COMMON_SPEC_PATCH_FILE"))
    result = deep_merge(with_common, arm_patch)

    stem = result.get("study_id") or os.environ["CASE_ID"]
    result["study_id"] = f"{stem!s}-{suffix}"

    # Copy the llm section before editing so a shared nested dict is not mutated.
    llm_section = dict(result.get("llm") or {})
    llm_section["use_harness"] = use_harness
    result["llm"] = llm_section
    return result
|
||||||
|
|
||||||
|
|
||||||
|
run_config_root = Path(os.environ["RUN_CONFIG_ROOT"])
|
||||||
|
harness = generated_spec(
|
||||||
|
os.environ["HARNESS_BASE_SPEC"],
|
||||||
|
use_harness=True,
|
||||||
|
suffix="harness",
|
||||||
|
arm_patch=load_patch("HARNESS_SPEC_PATCH_FILE"),
|
||||||
|
)
|
||||||
|
naive = generated_spec(
|
||||||
|
os.environ["NAIVE_BASE_SPEC"],
|
||||||
|
use_harness=False,
|
||||||
|
suffix="naive",
|
||||||
|
arm_patch=load_patch("NAIVE_SPEC_PATCH_FILE"),
|
||||||
|
)
|
||||||
|
(run_config_root / "harness.json").write_text(json.dumps(harness, indent=2) + "\n", encoding="utf-8")
|
||||||
|
(run_config_root / "naive.json").write_text(json.dumps(naive, indent=2) + "\n", encoding="utf-8")
|
||||||
|
print(json.dumps({"harness_study_id": harness["study_id"], "naive_study_id": naive["study_id"]}, ensure_ascii=False))
|
||||||
|
PY
|
||||||
|
|
||||||
|
read_key
|
||||||
|
echo "=== harness clean pair start $(date -Is) label=${RUN_LABEL} ==="
|
||||||
|
PYTHONPATH=src python3 -m aituner.cli study tune \
|
||||||
|
--spec "${HARNESS_SPEC}" \
|
||||||
|
--store-root "${HARNESS_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
|
||||||
|
> ".aituner/${RUN_LABEL}-harness.log" 2>&1
|
||||||
|
echo "=== harness clean pair done $(date -Is) ==="
|
||||||
|
|
||||||
|
read_key
|
||||||
|
echo "=== naive clean pair start $(date -Is) label=${RUN_LABEL} ==="
|
||||||
|
PYTHONPATH=src python3 -m aituner.cli study tune \
|
||||||
|
--spec "${NAIVE_SPEC}" \
|
||||||
|
--store-root "${NAIVE_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
|
||||||
|
> ".aituner/${RUN_LABEL}-naive.log" 2>&1
|
||||||
|
echo "=== naive clean pair done $(date -Is) ==="
|
||||||
|
|
||||||
|
python3 - <<'PY'
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
root = Path(os.environ["ROOT"])
|
||||||
|
run_label = os.environ["RUN_LABEL"]
|
||||||
|
harness = json.loads(Path(os.environ["HARNESS_SPEC"]).read_text(encoding="utf-8"))
|
||||||
|
naive = json.loads(Path(os.environ["NAIVE_SPEC"]).read_text(encoding="utf-8"))
|
||||||
|
max_trials = int(os.environ["MAX_TRIALS"])
|
||||||
|
budgets_text = os.environ.get("BUDGETS_JSON") or ""
|
||||||
|
if budgets_text:
|
||||||
|
budgets = json.loads(budgets_text)
|
||||||
|
else:
|
||||||
|
budgets = [1, 2, 3, 4, 6, 8, max_trials]
|
||||||
|
budgets = sorted({int(item) for item in budgets if int(item) > 0})
|
||||||
|
tags = json.loads(os.environ.get("CASE_TAGS_JSON") or "[]")
|
||||||
|
spec = {
|
||||||
|
"report_id": run_label,
|
||||||
|
"output_root": str(root / os.environ["REPORT_ROOT"]),
|
||||||
|
"target_fraction": 0.95,
|
||||||
|
"min_final_ratio": 0.98,
|
||||||
|
"cases": [
|
||||||
|
{
|
||||||
|
"case_id": os.environ["CASE_ID"],
|
||||||
|
"description": os.environ["CASE_DESCRIPTION"],
|
||||||
|
"tags": tags,
|
||||||
|
"budgets": budgets,
|
||||||
|
"arms": [
|
||||||
|
{
|
||||||
|
"name": "harness",
|
||||||
|
"kind": "harness",
|
||||||
|
"study_root": str(
|
||||||
|
root / os.environ["HARNESS_STORE"] / harness["study_id"]
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "naive",
|
||||||
|
"kind": "naive",
|
||||||
|
"study_root": str(root / os.environ["NAIVE_STORE"] / naive["study_id"]),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
Path(os.environ["REPORT_SPEC"]).write_text(json.dumps(spec, indent=2) + "\n", encoding="utf-8")
|
||||||
|
PY
|
||||||
|
|
||||||
|
PYTHONPATH=src python3 scripts/tuning_report.py --spec "${REPORT_SPEC}"
|
||||||
|
touch ".aituner/${RUN_LABEL}.DONE"
|
||||||
|
echo "=== clean pair report ready ${REPORT_ROOT} $(date -Is) ==="
|
||||||
16
scripts/run_harness_only_d1.sh
Normal file
16
scripts/run_harness_only_d1.sh
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env bash
# Harness-only re-run on gpt-5.5 to EMPIRICALLY verify the gpu-memory-utilization fix:
# success = the harness recovers ~0.87/GPU (climbs gpu-mem-util to ~0.94) and then stops,
# matching the naive-discovered ground truth. Run from the repo root on dash1.
set -u

# Read the API key from the "OPENAI_API_KEY" field of ~/.codex/auth.json and
# export it. The assignment is kept separate from `export` so that a failed or
# empty read is detected instead of silently exporting an empty key.
read_key() {
  local key
  if ! key="$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])')" || [ -z "${key}" ]; then
    echo "error: could not read OPENAI_API_KEY from ~/.codex/auth.json" >&2
    return 1
  fi
  export OPENAI_API_KEY="${key}"
}

export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'

# Validate the key BEFORE the destructive cleanup below, so a broken auth file
# does not wipe the previous run's store and log for nothing.
read_key || exit 1

mkdir -p .aituner
rm -rf .aituner/abl12-harness .aituner/abl12-harness.log .aituner/HARNESS_ONLY_DONE

echo "=== harness ON (gpt-5.5, gpu-mem-util fix) start $(date -Is) ==="
PYTHONPATH=src python3 -m aituner.cli study tune \
  --spec configs/examples/dash0_qwen27b_ablation_harness_on.json \
  --store-root .aituner/abl12-harness --max-trials 12 --skip-baseline > .aituner/abl12-harness.log 2>&1
echo "=== harness ON done $(date -Is) ==="

# The marker is written whether or not the tune succeeded (no `set -e`);
# inspect .aituner/abl12-harness.log for the outcome.
touch .aituner/HARNESS_ONLY_DONE
|
||||||
26
scripts/run_naive_repeats_d1.sh
Normal file
26
scripts/run_naive_repeats_d1.sh
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env bash
# Fig-18 naive nondeterminism: after the main pair (ABLATION12_DONE) finishes, run
# 2 more naive arms (runs 2 and 3) on the SAME substrate. The naive LLM (gpt-5.4,
# use_harness=false) is nondeterministic, so the run-to-run spread (fail / slow /
# lucky) is the result. Harness arm stays a single deterministic curve. Run from
# the repo root on dash1; survives disconnect via setsid/nohup at launch.
set -u

# Read the API key up front and fail fast if it is missing. Assigning first and
# exporting afterwards keeps the exit status of the read visible; with
# `export VAR=$(...)` a failed read would go unnoticed and both repeats would
# run with an empty key after the wait below.
if ! OPENAI_API_KEY="$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])')" || [ -z "${OPENAI_API_KEY}" ]; then
  echo "error: could not read OPENAI_API_KEY from ~/.codex/auth.json" >&2
  exit 1
fi
export OPENAI_API_KEY
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'

# Wait for the main harness+naive(run1) pair to complete so we never contend for GPUs.
echo "=== waiting for ABLATION12_DONE $(date -Is) ==="
while [ ! -f .aituner/ABLATION12_DONE ]; do sleep 120; done
echo "=== main pair done, starting naive repeats $(date -Is) ==="

for r in 2 3; do
  # Start each repeat from an empty store and a fresh log.
  rm -rf ".aituner/abl12-naive${r}" ".aituner/abl12-naive${r}.log"
  echo "=== naive run ${r} start $(date -Is) ==="
  PYTHONPATH=src python3 -m aituner.cli study tune \
    --spec configs/examples/dash0_qwen27b_ablation_naive_off.json \
    --store-root ".aituner/abl12-naive${r}" --max-trials 12 --skip-baseline > ".aituner/abl12-naive${r}.log" 2>&1
  echo "=== naive run ${r} done $(date -Is) ==="
done

touch .aituner/NAIVE_REPEATS_DONE
echo "=== all naive repeats done $(date -Is) ==="
|
||||||
36
scripts/tuning_report.py
Normal file
36
scripts/tuning_report.py
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from aituner.tuning_report import run_tuning_report
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Run the tuning report for the ``--spec`` file and print a JSON digest.

    The digest contains the report id and root plus four fields copied from
    the summary's ``aggregate`` section. Returns 0 as the process exit code.
    """
    cli = argparse.ArgumentParser(
        description="Summarize anytime tuning progress across harness/naive study stores."
    )
    cli.add_argument("--spec", required=True, help="Path to a tuning report JSON spec.")
    options = cli.parse_args()

    report = run_tuning_report(Path(options.spec))

    digest = {
        "report_id": report["report_id"],
        "report_root": report["report_root"],
    }
    # Copy the aggregate counters in the order they are printed.
    for field in (
        "case_count",
        "harness_vs_naive_pass_count",
        "harness_vs_naive_check_count",
        "winner_counts",
    ):
        digest[field] = report["aggregate"][field]

    print(json.dumps(digest, ensure_ascii=False, indent=2))
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
raise SystemExit(main())
|
||||||
@@ -367,20 +367,41 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
|
|||||||
proposal_source_label = "harness"
|
proposal_source_label = "harness"
|
||||||
else:
|
else:
|
||||||
proposal_source_label = str(proposal_source) if proposal_source else "llm"
|
proposal_source_label = str(proposal_source) if proposal_source else "llm"
|
||||||
|
stop_authorized_by = (
|
||||||
|
"validator"
|
||||||
|
if (is_harness_stop or authorized)
|
||||||
|
else "file_proposal"
|
||||||
|
if proposal_source is not None
|
||||||
|
else "llm_after_veto_budget"
|
||||||
|
)
|
||||||
|
stop_reason = (
|
||||||
|
"harness_stop"
|
||||||
|
if is_harness_stop
|
||||||
|
else "proposal_file_stop"
|
||||||
|
if proposal_source is not None
|
||||||
|
else "llm_stop"
|
||||||
|
)
|
||||||
|
stop_details = {
|
||||||
|
"proposal_name": proposal_name,
|
||||||
|
"proposal_source": proposal_source_label,
|
||||||
|
"stop_authorized_by": stop_authorized_by,
|
||||||
|
}
|
||||||
|
if stop_authority:
|
||||||
|
stop_details["validator_reason"] = stop_authority.get("reason")
|
||||||
|
state.tuning_stop_reason = stop_reason
|
||||||
|
state.tuning_stop_diagnosis = proposal.diagnosis
|
||||||
|
state.tuning_stop_details = stop_details
|
||||||
|
store.save_state(state)
|
||||||
executed.append(
|
executed.append(
|
||||||
{
|
{
|
||||||
"trial_id": None,
|
"trial_id": None,
|
||||||
"proposal_name": proposal_name,
|
"proposal_name": proposal_name,
|
||||||
"proposal_source": proposal_source_label,
|
"proposal_source": proposal_source_label,
|
||||||
"stopped": True,
|
"stopped": True,
|
||||||
"stop_authorized_by": (
|
"reason": state.tuning_stop_reason,
|
||||||
"validator"
|
"stop_authorized_by": stop_authorized_by,
|
||||||
if (is_harness_stop or authorized)
|
|
||||||
else "file_proposal"
|
|
||||||
if proposal_source is not None
|
|
||||||
else "llm_after_veto_budget"
|
|
||||||
),
|
|
||||||
"diagnosis": proposal.diagnosis,
|
"diagnosis": proposal.diagnosis,
|
||||||
|
"details": stop_details,
|
||||||
"state_best_trial_id": state.best_trial_id,
|
"state_best_trial_id": state.best_trial_id,
|
||||||
"state_best_request_rate": state.best_request_rate,
|
"state_best_request_rate": state.best_request_rate,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,6 +24,13 @@ _RUNTIME_KEYS = {
|
|||||||
_STRONG_INCUMBENT_MIN_GAIN = 1.8
|
_STRONG_INCUMBENT_MIN_GAIN = 1.8
|
||||||
_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
|
_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
|
||||||
_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
|
_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
|
||||||
|
# Decode-bound throughput is frequently KV-cache limited, so more gpu-memory-utilization
|
||||||
|
# yields more KV blocks and more concurrent decode. Hill-climb in small steps toward a
|
||||||
|
# safe ceiling and let measurement find the real peak: a too-high target regresses or
|
||||||
|
# fails to launch and is rejected by the incumbent guard, and its tested signature then
|
||||||
|
# blocks re-proposal so the climb terminates.
|
||||||
|
_GMU_STEP = 0.02
|
||||||
|
_GMU_SAFE_CEILING = 0.97
|
||||||
|
|
||||||
|
|
||||||
def build_harness_context(
|
def build_harness_context(
|
||||||
@@ -383,14 +390,17 @@ def _knob_harnesses(
|
|||||||
"knob_family": "gpu-memory-utilization",
|
"knob_family": "gpu-memory-utilization",
|
||||||
"use_when": [
|
"use_when": [
|
||||||
"The engine launches cleanly but memory headroom limits batching.",
|
"The engine launches cleanly but memory headroom limits batching.",
|
||||||
|
"A decode-bound incumbent (decode_tpot) is KV-cache limited and could sustain more concurrent decode with more KV blocks.",
|
||||||
],
|
],
|
||||||
"procedure": [
|
"procedure": [
|
||||||
"Make small adjustments only after topology and batching knobs are stable.",
|
"Make small adjustments only after topology and batching knobs are stable.",
|
||||||
|
"Raise gpu-memory-utilization one small step at a time and keep the step only if request_rate_per_gpu improves and the engine still launches.",
|
||||||
],
|
],
|
||||||
"guards": [
|
"guards": [
|
||||||
"Treat launch OOM as hard negative evidence and back off immediately.",
|
"Treat launch OOM as hard negative evidence and back off immediately.",
|
||||||
|
"Do not exceed a safe utilization ceiling; stop climbing once a higher target regresses or fails to launch.",
|
||||||
],
|
],
|
||||||
"active_now": False,
|
"active_now": active_bottleneck in {"decode_tpot", "admission_or_queueing"},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return harnesses
|
return harnesses
|
||||||
@@ -597,6 +607,15 @@ def stateful_history_limit() -> int:
|
|||||||
return 8
|
return 8
|
||||||
|
|
||||||
|
|
||||||
|
def _state_completed_trials_with_rates(state: StudyState) -> list[TrialSummary]:
    """Return the trials in ``state.trials`` that completed with a numeric rate.

    A trial is kept when its ``status`` is ``"completed"`` and its
    ``best_request_rate_per_gpu`` is an ``int`` or ``float``. The order of
    ``state.trials`` is preserved.
    """
    selected: list[TrialSummary] = []
    for candidate in state.trials:
        if candidate.status != "completed":
            continue
        if not isinstance(candidate.best_request_rate_per_gpu, (int, float)):
            continue
        selected.append(candidate)
    return selected
|
||||||
|
|
||||||
|
|
||||||
def _load_result(trial: TrialSummary) -> dict[str, Any] | None:
|
def _load_result(trial: TrialSummary) -> dict[str, Any] | None:
|
||||||
if not trial.result_path:
|
if not trial.result_path:
|
||||||
return None
|
return None
|
||||||
@@ -1084,6 +1103,7 @@ def _candidate_actions(
|
|||||||
anchor,
|
anchor,
|
||||||
top_bottleneck,
|
top_bottleneck,
|
||||||
bottleneck_hypotheses,
|
bottleneck_hypotheses,
|
||||||
|
recent_diagnostics,
|
||||||
tested_signatures,
|
tested_signatures,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -1177,13 +1197,31 @@ def _runtime_candidate_actions(
|
|||||||
anchor: dict[str, Any],
|
anchor: dict[str, Any],
|
||||||
top_bottleneck: str,
|
top_bottleneck: str,
|
||||||
bottleneck_hypotheses: list[dict[str, Any]],
|
bottleneck_hypotheses: list[dict[str, Any]],
|
||||||
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
tested_signatures: set[str],
|
tested_signatures: set[str],
|
||||||
) -> list[dict[str, Any]]:
|
) -> list[dict[str, Any]]:
|
||||||
tunable = set(study.engine.tunable_flags)
|
tunable = set(study.engine.tunable_flags)
|
||||||
anchor_flags = _effective_flags_for_item(study, anchor)
|
anchor_flags = _effective_flags_for_item(study, anchor)
|
||||||
topology_patch = _preserve_topology_patch(study, anchor_flags)
|
topology_patch = _preserve_topology_patch(study, anchor_flags)
|
||||||
|
runtime_base_patch = {**topology_patch, **_preserve_runtime_patch(study, anchor_flags)}
|
||||||
actions: list[dict[str, Any]] = []
|
actions: list[dict[str, Any]] = []
|
||||||
|
|
||||||
|
cur_tp = _parse_int_like(anchor_flags.get("tensor-parallel-size"), default=1)
|
||||||
|
cur_dp = _parse_int_like(anchor_flags.get("data-parallel-size"), default=1)
|
||||||
|
# Topology-before-runtime: gpu-mem-util / raising max-num-seqs are micro-tuning that is
|
||||||
|
# only justified once no untested TP increase remains. At an intermediate TP (e.g. TP2
|
||||||
|
# while TP4 is still reachable and untried) a latency bottleneck must still be answered
|
||||||
|
# by climbing TP, not a runtime tweak -- otherwise runtime tuning preempts the frontier.
|
||||||
|
_next_tp = _next_allowed_tp(study, current_tp=cur_tp, current_dp=cur_dp)
|
||||||
|
tp_frontier_open = (
|
||||||
|
_next_tp is not None
|
||||||
|
and _config_signature(
|
||||||
|
{"env_patch": {}, "flag_patch": {"tensor-parallel-size": _next_tp}}
|
||||||
|
)
|
||||||
|
not in tested_signatures
|
||||||
|
)
|
||||||
|
topology_settled = not tp_frontier_open
|
||||||
|
|
||||||
if "max-num-batched-tokens" in tunable:
|
if "max-num-batched-tokens" in tunable:
|
||||||
current_mbt = _parse_int_like(anchor_flags.get("max-num-batched-tokens"), default=0)
|
current_mbt = _parse_int_like(anchor_flags.get("max-num-batched-tokens"), default=0)
|
||||||
mbt_targets: list[tuple[str, int]] = []
|
mbt_targets: list[tuple[str, int]] = []
|
||||||
@@ -1198,7 +1236,7 @@ def _runtime_candidate_actions(
|
|||||||
elif top_bottleneck == "decode_tpot" and current_mbt > 8192:
|
elif top_bottleneck == "decode_tpot" and current_mbt > 8192:
|
||||||
mbt_targets.append(("lower_mbt", max(8192, current_mbt // 2)))
|
mbt_targets.append(("lower_mbt", max(8192, current_mbt // 2)))
|
||||||
for action_id, target in mbt_targets:
|
for action_id, target in mbt_targets:
|
||||||
patch = {**topology_patch, "max-num-batched-tokens": target}
|
patch = {**runtime_base_patch, "max-num-batched-tokens": target}
|
||||||
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
if signature in tested_signatures:
|
if signature in tested_signatures:
|
||||||
continue
|
continue
|
||||||
@@ -1222,18 +1260,48 @@ def _runtime_candidate_actions(
|
|||||||
|
|
||||||
if "max-num-seqs" in tunable:
|
if "max-num-seqs" in tunable:
|
||||||
current_mns = _parse_int_like(anchor_flags.get("max-num-seqs"), default=0)
|
current_mns = _parse_int_like(anchor_flags.get("max-num-seqs"), default=0)
|
||||||
|
max_num_seqs_tested = any(
|
||||||
|
"max-num-seqs" in (
|
||||||
|
((item.get("config_patch") or {}).get("flag_patch") or {})
|
||||||
|
if isinstance(item.get("config_patch"), dict)
|
||||||
|
else {}
|
||||||
|
)
|
||||||
|
for item in recent_diagnostics
|
||||||
|
)
|
||||||
mns_targets: list[tuple[str, int]] = []
|
mns_targets: list[tuple[str, int]] = []
|
||||||
if top_bottleneck == "admission_or_queueing":
|
if top_bottleneck == "admission_or_queueing":
|
||||||
target = max(8, int(current_mns * 1.5)) if current_mns > 0 else 64
|
target = max(8, int(current_mns * 1.5)) if current_mns > 0 else 64
|
||||||
mns_targets.append(("raise_max_num_seqs", _round_up_to_multiple(target, 8)))
|
mns_targets.append(("raise_max_num_seqs", _round_up_to_multiple(target, 8)))
|
||||||
elif top_bottleneck == "decode_tpot" and current_mns > 8:
|
elif top_bottleneck == "decode_tpot":
|
||||||
mns_targets.append(("lower_max_num_seqs", max(8, current_mns // 2)))
|
if current_mns > 8:
|
||||||
|
mns_targets.append(("lower_max_num_seqs", max(8, current_mns // 2)))
|
||||||
|
# Decode concurrency can also be too low: once topology is settled, raising
|
||||||
|
# max-num-seqs exploits decode parallelism when the incumbent has SLO headroom.
|
||||||
|
# The incumbent guard keeps it only if per-GPU rate improves.
|
||||||
|
if topology_settled:
|
||||||
|
raise_target = _round_up_to_multiple(
|
||||||
|
max(16, int(current_mns * 1.5)) if current_mns > 0 else 48, 8
|
||||||
|
)
|
||||||
|
mns_targets.append(("raise_max_num_seqs", raise_target))
|
||||||
|
elif top_bottleneck == "ttft_prefill" and topology_settled and not max_num_seqs_tested:
|
||||||
|
# Prefill-heavy TTFT can still be admission/concurrency limited after TP and
|
||||||
|
# max-num-batched-tokens probes settle. Try a modest same-topology seq cap
|
||||||
|
# increase before letting convergence guards declare the incumbent final.
|
||||||
|
target = _round_up_to_multiple(
|
||||||
|
max(16, int(current_mns * 1.5)) if current_mns > 0 else 64, 8
|
||||||
|
)
|
||||||
|
mns_targets.append(("raise_max_num_seqs", target))
|
||||||
for action_id, target in mns_targets:
|
for action_id, target in mns_targets:
|
||||||
patch = {**topology_patch, "max-num-seqs": target}
|
patch = {**runtime_base_patch, "max-num-seqs": target}
|
||||||
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
if signature in tested_signatures:
|
if signature in tested_signatures:
|
||||||
continue
|
continue
|
||||||
relief = 0.25 if top_bottleneck in {"decode_tpot", "admission_or_queueing"} else 0.08
|
if top_bottleneck in {"decode_tpot", "admission_or_queueing"}:
|
||||||
|
relief = 0.25
|
||||||
|
elif top_bottleneck == "ttft_prefill":
|
||||||
|
relief = 0.3
|
||||||
|
else:
|
||||||
|
relief = 0.08
|
||||||
actions.append(
|
actions.append(
|
||||||
_runtime_action(
|
_runtime_action(
|
||||||
action_id=action_id,
|
action_id=action_id,
|
||||||
@@ -1241,20 +1309,71 @@ def _runtime_candidate_actions(
|
|||||||
score=relief + _information_gain(bottleneck_hypotheses, "runtime"),
|
score=relief + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||||
patch=patch,
|
patch=patch,
|
||||||
hypothesis=(
|
hypothesis=(
|
||||||
"Adjust max-num-seqs to test whether concurrency pressure is the "
|
"Adjust max-num-seqs to test whether concurrency/admission pressure "
|
||||||
"limiting factor under the configured SLO."
|
"is the limiting factor under the configured SLO."
|
||||||
),
|
),
|
||||||
expected_effects=[
|
expected_effects=[
|
||||||
"change decode/admission concurrency on the incumbent topology",
|
"change prefill/decode admission concurrency on the incumbent topology",
|
||||||
"confirm if TPOT or queueing pressure is caused by sequence concurrency",
|
"confirm if latency or queueing pressure is caused by sequence concurrency",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if (
|
||||||
|
top_bottleneck == "ttft_prefill"
|
||||||
|
and topology_settled
|
||||||
|
and "max-num-batched-tokens" in tunable
|
||||||
|
and "max-num-seqs" in tunable
|
||||||
|
and max_num_seqs_tested
|
||||||
|
):
|
||||||
|
current_mbt = _parse_int_like(anchor_flags.get("max-num-batched-tokens"), default=0)
|
||||||
|
current_mns = _parse_int_like(anchor_flags.get("max-num-seqs"), default=0)
|
||||||
|
if current_mbt > 0:
|
||||||
|
window_target = _initial_mbt_from_window(window_summary)
|
||||||
|
step_target = _next_mbt_step(current_mbt) or current_mbt
|
||||||
|
mbt_target = min(
|
||||||
|
32768,
|
||||||
|
max(
|
||||||
|
step_target,
|
||||||
|
min(window_target, _round_up_to_multiple(current_mbt * 2, 1024)),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
mbt_target = _initial_mbt_from_window(window_summary)
|
||||||
|
mns_target = _round_up_to_multiple(
|
||||||
|
max(16, int(current_mns * 1.5)) if current_mns > 0 else 64, 8
|
||||||
|
)
|
||||||
|
if mbt_target > 0 and (mbt_target != current_mbt or mns_target != current_mns):
|
||||||
|
patch = {
|
||||||
|
**runtime_base_patch,
|
||||||
|
"max-num-batched-tokens": mbt_target,
|
||||||
|
"max-num-seqs": mns_target,
|
||||||
|
}
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature not in tested_signatures:
|
||||||
|
actions.append(
|
||||||
|
_runtime_action(
|
||||||
|
action_id="raise_mbt_and_max_num_seqs",
|
||||||
|
knob_family="prefill-runtime-interaction",
|
||||||
|
score=0.38
|
||||||
|
+ _information_gain(bottleneck_hypotheses, "runtime"),
|
||||||
|
patch=patch,
|
||||||
|
hypothesis=(
|
||||||
|
"Jointly raise max-num-batched-tokens and max-num-seqs to test "
|
||||||
|
"whether prefill batching headroom and admission concurrency only "
|
||||||
|
"help when adjusted together."
|
||||||
|
),
|
||||||
|
expected_effects=[
|
||||||
|
"preserve the incumbent topology while changing coupled prefill runtime limits",
|
||||||
|
"confirm whether separate MBT or sequence-cap probes masked an interaction",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if "enable-chunked-prefill" in tunable and top_bottleneck == "ttft_prefill":
|
if "enable-chunked-prefill" in tunable and top_bottleneck == "ttft_prefill":
|
||||||
current = bool(anchor_flags.get("enable-chunked-prefill", False))
|
current = bool(anchor_flags.get("enable-chunked-prefill", False))
|
||||||
if not current:
|
if not current:
|
||||||
patch = {**topology_patch, "enable-chunked-prefill": True}
|
patch = {**runtime_base_patch, "enable-chunked-prefill": True}
|
||||||
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
if signature not in tested_signatures:
|
if signature not in tested_signatures:
|
||||||
actions.append(
|
actions.append(
|
||||||
@@ -1273,6 +1392,37 @@ def _runtime_candidate_actions(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if (
|
||||||
|
"gpu-memory-utilization" in tunable
|
||||||
|
and topology_settled
|
||||||
|
and top_bottleneck in {"decode_tpot", "admission_or_queueing"}
|
||||||
|
):
|
||||||
|
current_gmu = _parse_float_like(
|
||||||
|
anchor_flags.get("gpu-memory-utilization"), default=0.9
|
||||||
|
)
|
||||||
|
if 0.0 < current_gmu < _GMU_SAFE_CEILING:
|
||||||
|
target = round(min(_GMU_SAFE_CEILING, current_gmu + _GMU_STEP), 4)
|
||||||
|
if target > current_gmu:
|
||||||
|
patch = {**runtime_base_patch, "gpu-memory-utilization": target}
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature not in tested_signatures:
|
||||||
|
actions.append(
|
||||||
|
_runtime_action(
|
||||||
|
action_id="raise_gpu_memory_utilization",
|
||||||
|
knob_family="gpu-memory-utilization",
|
||||||
|
score=0.4 + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||||
|
patch=patch,
|
||||||
|
hypothesis=(
|
||||||
|
"Raise gpu-memory-utilization to add KV-cache headroom so the "
|
||||||
|
"decode-bound incumbent can sustain more concurrent decode."
|
||||||
|
),
|
||||||
|
expected_effects=[
|
||||||
|
"add KV-cache blocks for higher decode concurrency on the incumbent topology",
|
||||||
|
"reject if the higher memory target regresses request_rate_per_gpu or fails to launch",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
return actions
|
return actions
|
||||||
|
|
||||||
|
|
||||||
@@ -1422,6 +1572,18 @@ def _preserve_topology_patch(study: StudySpec, flags: dict[str, Any]) -> dict[st
|
|||||||
return patch
|
return patch
|
||||||
|
|
||||||
|
|
||||||
|
def _preserve_runtime_patch(study: StudySpec, flags: dict[str, Any]) -> dict[str, Any]:
    """Collect the runtime flags in ``flags`` that differ from the study's base flags.

    Only keys from ``_RUNTIME_KEYS`` that are tunable for the study and present
    in ``flags`` are considered; a key is included when its value in ``flags``
    is not equal to the value in ``study.engine.base_flags`` (a key missing
    from the base flags compares as ``None``).
    """
    allowed = set(study.engine.tunable_flags)
    defaults = study.engine.base_flags
    return {
        name: flags[name]
        for name in _RUNTIME_KEYS
        if name in allowed
        and name in flags
        and flags.get(name) != defaults.get(name)
    }
|
||||||
|
|
||||||
|
|
||||||
def _normalized_topology_flags(flags: dict[str, Any]) -> dict[str, Any]:
|
def _normalized_topology_flags(flags: dict[str, Any]) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
"tensor-parallel-size": _parse_int_like(
|
"tensor-parallel-size": _parse_int_like(
|
||||||
@@ -1696,11 +1858,15 @@ def _runtime_refinement_proposal(
|
|||||||
best_flags = best_patch.get("flag_patch")
|
best_flags = best_patch.get("flag_patch")
|
||||||
if not isinstance(best_flags, dict):
|
if not isinstance(best_flags, dict):
|
||||||
best_flags = {}
|
best_flags = {}
|
||||||
best_tp = _parse_int_like(best_flags.get("tensor-parallel-size"), default=1)
|
best_effective_flags = _effective_flags_for_item(study, best)
|
||||||
|
best_tp = _parse_int_like(best_effective_flags.get("tensor-parallel-size"), default=1)
|
||||||
if best_tp <= 1:
|
if best_tp <= 1:
|
||||||
return default
|
return default
|
||||||
tunable = set(study.engine.tunable_flags)
|
tunable = set(study.engine.tunable_flags)
|
||||||
flag_patch: dict[str, Any] = {"tensor-parallel-size": best_tp}
|
flag_patch = {
|
||||||
|
**_preserve_topology_patch(study, best_effective_flags),
|
||||||
|
**_preserve_runtime_patch(study, best_effective_flags),
|
||||||
|
}
|
||||||
if "enable-chunked-prefill" in tunable:
|
if "enable-chunked-prefill" in tunable:
|
||||||
flag_patch["enable-chunked-prefill"] = True
|
flag_patch["enable-chunked-prefill"] = True
|
||||||
if "max-num-batched-tokens" not in tunable:
|
if "max-num-batched-tokens" not in tunable:
|
||||||
@@ -1735,7 +1901,7 @@ def _runtime_refinement_proposal(
|
|||||||
"config_patch": {"env_patch": {}, "flag_patch": flag_patch},
|
"config_patch": {"env_patch": {}, "flag_patch": flag_patch},
|
||||||
"expected_effects": [
|
"expected_effects": [
|
||||||
"preserve the incumbent topology",
|
"preserve the incumbent topology",
|
||||||
"increase batching headroom without also raising memory pressure",
|
"increase batching headroom without dropping measured runtime gains",
|
||||||
],
|
],
|
||||||
"incumbent_trial_id": best.get("trial_id"),
|
"incumbent_trial_id": best.get("trial_id"),
|
||||||
}
|
}
|
||||||
@@ -1877,15 +2043,19 @@ def _validation_exhausted_guard(
|
|||||||
}
|
}
|
||||||
if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
|
if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
|
||||||
return default
|
return default
|
||||||
completed = [
|
state_completed = _state_completed_trials_with_rates(state)
|
||||||
item
|
if state_completed:
|
||||||
for item in recent_diagnostics
|
baseline_rate = float(state_completed[0].best_request_rate_per_gpu)
|
||||||
if item.get("status") == "completed"
|
else:
|
||||||
and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
|
completed = [
|
||||||
]
|
item
|
||||||
if not completed:
|
for item in recent_diagnostics
|
||||||
return default
|
if item.get("status") == "completed"
|
||||||
baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
|
and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
|
||||||
|
]
|
||||||
|
if not completed:
|
||||||
|
return default
|
||||||
|
baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
|
||||||
incumbent_rate = _as_float(state.best_request_rate_per_gpu)
|
incumbent_rate = _as_float(state.best_request_rate_per_gpu)
|
||||||
if baseline_rate <= 0 or incumbent_rate <= 0:
|
if baseline_rate <= 0 or incumbent_rate <= 0:
|
||||||
return default
|
return default
|
||||||
@@ -1923,12 +2093,18 @@ def _validation_exhausted_guard(
|
|||||||
"incumbent_gain_vs_baseline": gain,
|
"incumbent_gain_vs_baseline": gain,
|
||||||
"validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
|
"validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
|
||||||
}
|
}
|
||||||
if any(isinstance(item.get("best_request_rate_per_gpu"), (int, float)) for item in after_best):
|
improving_trials = [
|
||||||
|
item
|
||||||
|
for item in after_best
|
||||||
|
if isinstance(item.get("best_request_rate_per_gpu"), (int, float))
|
||||||
|
and float(item["best_request_rate_per_gpu"]) > incumbent_rate
|
||||||
|
]
|
||||||
|
if improving_trials:
|
||||||
return {
|
return {
|
||||||
**default,
|
**default,
|
||||||
"reason": "post_incumbent_validation_found_feasible_candidate",
|
"reason": "post_incumbent_validation_found_improving_candidate",
|
||||||
"incumbent_gain_vs_baseline": gain,
|
"incumbent_gain_vs_baseline": gain,
|
||||||
"validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
|
"validation_trial_ids": [str(item.get("trial_id")) for item in improving_trials],
|
||||||
}
|
}
|
||||||
|
|
||||||
families: set[str] = set()
|
families: set[str] = set()
|
||||||
@@ -1954,7 +2130,7 @@ def _validation_exhausted_guard(
|
|||||||
"reason": "post_incumbent_validation_exhausted",
|
"reason": "post_incumbent_validation_exhausted",
|
||||||
"summary": (
|
"summary": (
|
||||||
"A strong incumbent was followed by validation probes across nearby "
|
"A strong incumbent was followed by validation probes across nearby "
|
||||||
"topology/runtime families, and none produced a feasible candidate."
|
"topology/runtime families, and none improved request_rate_per_gpu."
|
||||||
),
|
),
|
||||||
"incumbent_trial_id": state.best_trial_id,
|
"incumbent_trial_id": state.best_trial_id,
|
||||||
"incumbent_gain_vs_baseline": gain,
|
"incumbent_gain_vs_baseline": gain,
|
||||||
@@ -1995,16 +2171,11 @@ def _strong_incumbent_guard(
|
|||||||
}
|
}
|
||||||
if state.best_trial_id is None or state.best_request_rate_per_gpu is None:
|
if state.best_trial_id is None or state.best_request_rate_per_gpu is None:
|
||||||
return default
|
return default
|
||||||
completed = [
|
completed = _state_completed_trials_with_rates(state)
|
||||||
item
|
|
||||||
for item in recent_diagnostics
|
|
||||||
if item.get("status") == "completed"
|
|
||||||
and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
|
|
||||||
]
|
|
||||||
if len(completed) < 2:
|
if len(completed) < 2:
|
||||||
return default
|
return default
|
||||||
baseline = completed[0]
|
baseline = completed[0]
|
||||||
baseline_rate = float(baseline["best_request_rate_per_gpu"])
|
baseline_rate = float(baseline.best_request_rate_per_gpu)
|
||||||
incumbent_rate = float(state.best_request_rate_per_gpu)
|
incumbent_rate = float(state.best_request_rate_per_gpu)
|
||||||
if baseline_rate <= 0:
|
if baseline_rate <= 0:
|
||||||
return default
|
return default
|
||||||
@@ -2014,7 +2185,7 @@ def _strong_incumbent_guard(
|
|||||||
return {
|
return {
|
||||||
"guard_active": True,
|
"guard_active": True,
|
||||||
"reason": "incumbent_exceeds_baseline_by_1_8x_and_latest_trial_is_best_enter_validation_phase",
|
"reason": "incumbent_exceeds_baseline_by_1_8x_and_latest_trial_is_best_enter_validation_phase",
|
||||||
"baseline_trial_id": baseline.get("trial_id"),
|
"baseline_trial_id": baseline.trial_id,
|
||||||
"baseline_request_rate_per_gpu": baseline_rate,
|
"baseline_request_rate_per_gpu": baseline_rate,
|
||||||
"incumbent_gain_vs_baseline": gain,
|
"incumbent_gain_vs_baseline": gain,
|
||||||
"recommended_next_action": (
|
"recommended_next_action": (
|
||||||
@@ -2024,7 +2195,7 @@ def _strong_incumbent_guard(
|
|||||||
}
|
}
|
||||||
return {
|
return {
|
||||||
**default,
|
**default,
|
||||||
"baseline_trial_id": baseline.get("trial_id"),
|
"baseline_trial_id": baseline.trial_id,
|
||||||
"baseline_request_rate_per_gpu": baseline_rate,
|
"baseline_request_rate_per_gpu": baseline_rate,
|
||||||
"incumbent_gain_vs_baseline": gain,
|
"incumbent_gain_vs_baseline": gain,
|
||||||
"reason": "need_more_evidence_before_strong_incumbent_stop",
|
"reason": "need_more_evidence_before_strong_incumbent_stop",
|
||||||
@@ -2252,6 +2423,19 @@ def _parse_int_like(value: Any, *, default: int) -> int:
|
|||||||
return default
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_float_like(value: Any, *, default: float) -> float:
|
||||||
|
if value is None or isinstance(value, bool):
|
||||||
|
return default
|
||||||
|
if isinstance(value, (int, float)):
|
||||||
|
return float(value)
|
||||||
|
if isinstance(value, str) and value.strip():
|
||||||
|
try:
|
||||||
|
return float(value.strip())
|
||||||
|
except ValueError:
|
||||||
|
return default
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
def _config_signature(config_patch: Any) -> str:
|
def _config_signature(config_patch: Any) -> str:
|
||||||
if not isinstance(config_patch, dict):
|
if not isinstance(config_patch, dict):
|
||||||
config_patch = {}
|
config_patch = {}
|
||||||
|
|||||||
581
src/aituner/tuning_report.py
Normal file
581
src/aituner/tuning_report.py
Normal file
@@ -0,0 +1,581 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .spec import SpecError, load_structured_file
|
||||||
|
from .store import StudyStore
|
||||||
|
|
||||||
|
|
||||||
|
# Trial-count checkpoints used for a case that does not list its own "budgets".
DEFAULT_BUDGETS = [1, 2, 3, 4, 6, 8, 12]
# Fallback for "target_fraction": the target rate is reference best * this fraction.
DEFAULT_TARGET_FRACTION = 0.95
# Fallback for "min_final_ratio": minimum harness final / best naive final to pass.
DEFAULT_MIN_FINAL_RATIO = 0.98
|
||||||
|
|
||||||
|
|
||||||
|
def run_tuning_report(spec_path: Path) -> dict[str, Any]:
    """Build the tuning report described by ``spec_path`` and write it to disk.

    The spec is loaded and validated, every case is summarized, and the result
    is written as ``summary.json`` plus a Markdown ``report.md`` under the
    resolved report root (created when missing).

    Returns:
        The summary dictionary that was written to ``summary.json``.
    """
    resolved_spec = spec_path.resolve()
    report_spec = _load_report_spec(resolved_spec)

    out_dir = _resolve_output_root(report_spec, spec_path=resolved_spec)
    out_dir.mkdir(parents=True, exist_ok=True)

    case_summaries = []
    for case_spec in report_spec["cases"]:
        case_summaries.append(_summarize_case(case_spec, spec_path=resolved_spec))

    summary = {
        "report_id": report_spec["report_id"],
        "report_root": str(out_dir),
        "target_fraction": report_spec["target_fraction"],
        "min_final_ratio": report_spec["min_final_ratio"],
        "cases": case_summaries,
        "aggregate": _aggregate_cases(case_summaries),
    }

    StudyStore.write_json(out_dir / "summary.json", summary)
    markdown = _render_report(summary)
    (out_dir / "report.md").write_text(markdown, encoding="utf-8")
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
def _load_report_spec(path: Path) -> dict[str, Any]:
    """Load the report spec at ``path`` and return its validated form.

    Returns:
        A dict with ``report_id``, ``output_root`` (``None`` when blank),
        ``target_fraction``, ``min_final_ratio`` and the validated ``cases``.

    Raises:
        SpecError: if ``report_id`` is blank, ``cases`` is not a non-empty
            list, or either ratio is non-numeric or not positive.
    """
    payload = dict(load_structured_file(path))

    report_id = str(payload.get("report_id") or "").strip()
    if report_id == "":
        raise SpecError("report_id must be a non-empty string.")

    raw_cases = payload.get("cases")
    if not (isinstance(raw_cases, list) and raw_cases):
        raise SpecError("cases must be a non-empty list.")

    # Report-level ratios double as defaults for cases that do not override them.
    target_fraction = _as_float(
        payload.get("target_fraction"), default=DEFAULT_TARGET_FRACTION
    )
    if target_fraction <= 0:
        raise SpecError("target_fraction must be positive.")

    min_final_ratio = _as_float(
        payload.get("min_final_ratio"), default=DEFAULT_MIN_FINAL_RATIO
    )
    if min_final_ratio <= 0:
        raise SpecError("min_final_ratio must be positive.")

    cases = []
    for position, raw_case in enumerate(raw_cases):
        cases.append(
            _load_case(
                raw_case,
                idx=position,
                default_target_fraction=target_fraction,
                default_min_final_ratio=min_final_ratio,
            )
        )

    output_root = str(payload.get("output_root") or "").strip()
    return {
        "report_id": report_id,
        "output_root": output_root or None,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "cases": cases,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _load_case(
    raw: Any,
    *,
    idx: int,
    default_target_fraction: float,
    default_min_final_ratio: float,
) -> dict[str, Any]:
    """Validate one entry of the spec's ``cases`` list.

    Args:
        raw: The raw case object from the spec.
        idx: Position of the case in ``cases``; only used in error messages.
        default_target_fraction: Used when the case has no ``target_fraction``.
        default_min_final_ratio: Used when the case has no ``min_final_ratio``.

    Returns:
        A dict with ``case_id``, ``description``, ``tags``, sorted unique
        ``budgets``, ``target_fraction``, ``min_final_ratio`` and ``arms``.

    Raises:
        SpecError: if the case is not an object, ``case_id`` is blank,
            ``arms``/``budgets`` are not non-empty lists, arm names repeat, a
            budget is not a positive integer, or a ratio is non-numeric or
            not positive.
    """
    context = f"cases[{idx}]"
    if not isinstance(raw, dict):
        raise SpecError(f"{context} must be an object.")

    case_id = str(raw.get("case_id") or "").strip()
    if not case_id:
        raise SpecError(f"{context}.case_id must be a non-empty string.")

    raw_arms = raw.get("arms")
    if not isinstance(raw_arms, list) or not raw_arms:
        raise SpecError(f"{context}.arms must be a non-empty list.")
    arms = [
        _load_arm(item, context=f"{context}.arms[{arm_idx}]")
        for arm_idx, item in enumerate(raw_arms)
    ]
    names = [item["name"] for item in arms]
    if len(names) != len(set(names)):
        raise SpecError(f"{context}.arms names must be unique.")

    raw_budgets = raw.get("budgets", DEFAULT_BUDGETS)
    if not isinstance(raw_budgets, list) or not raw_budgets:
        raise SpecError(f"{context}.budgets must be a non-empty list.")
    # The set drops duplicates; sorting gives a stable ascending order.
    budgets = sorted(
        {_positive_int(item, context=f"{context}.budgets") for item in raw_budgets}
    )

    # Per-case overrides get the same positivity check as the report-level
    # values; a non-positive override would otherwise flow into the target
    # rate and the pass threshold unchecked.
    target_fraction = _as_float(
        raw.get("target_fraction"), default=default_target_fraction
    )
    if target_fraction <= 0:
        raise SpecError(f"{context}.target_fraction must be positive.")
    min_final_ratio = _as_float(
        raw.get("min_final_ratio"), default=default_min_final_ratio
    )
    if min_final_ratio <= 0:
        raise SpecError(f"{context}.min_final_ratio must be positive.")

    # A non-list "tags" value is ignored rather than rejected.
    raw_tags = raw.get("tags", [])
    tags = (
        [str(item).strip() for item in raw_tags if str(item).strip()]
        if isinstance(raw_tags, list)
        else []
    )

    return {
        "case_id": case_id,
        "description": str(raw.get("description") or "").strip(),
        "tags": tags,
        "budgets": budgets,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "arms": arms,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _load_arm(raw: Any, *, context: str) -> dict[str, Any]:
|
||||||
|
if not isinstance(raw, dict):
|
||||||
|
raise SpecError(f"{context} must be an object.")
|
||||||
|
name = str(raw.get("name") or "").strip()
|
||||||
|
if not name:
|
||||||
|
raise SpecError(f"{context}.name must be a non-empty string.")
|
||||||
|
kind = str(raw.get("kind") or name).strip()
|
||||||
|
study_root = str(raw.get("study_root") or "").strip()
|
||||||
|
if not study_root:
|
||||||
|
raise SpecError(f"{context}.study_root must be a non-empty string.")
|
||||||
|
return {
|
||||||
|
"name": name,
|
||||||
|
"kind": kind,
|
||||||
|
"study_root": study_root,
|
||||||
|
"label": str(raw.get("label") or "").strip() or name,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_output_root(spec: dict[str, Any], *, spec_path: Path) -> Path:
|
||||||
|
raw = spec.get("output_root")
|
||||||
|
if raw:
|
||||||
|
return _resolve_path(str(raw), base_dir=spec_path.parent)
|
||||||
|
return (Path(".aituner-reports") / str(spec["report_id"])).resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def _summarize_case(case: dict[str, Any], *, spec_path: Path) -> dict[str, Any]:
    """Summarize every arm of one case and compare the arms with each other.

    The reference rate is the best final rate over all arms.  The AUC horizon
    (``max_budget``) is the largest of the configured budgets and the arms'
    trial counts.
    """
    arm_summaries = []
    for arm_spec in case["arms"]:
        arm_summaries.append(
            _summarize_arm(arm_spec, budgets=case["budgets"], spec_path=spec_path)
        )

    reference = _reference_best(arm_summaries)
    trial_counts = [entry["trial_count"] for entry in arm_summaries]
    max_budget = max(case["budgets"] + trial_counts)

    # Adds the reference-relative metrics to each arm summary in place.
    for entry in arm_summaries:
        _add_reference_metrics(
            entry,
            reference=reference,
            max_budget=max_budget,
            target_fraction=case["target_fraction"],
        )

    winners = _case_winners(arm_summaries)
    comparison = _harness_vs_naive(
        arm_summaries, min_final_ratio=case["min_final_ratio"]
    )
    warnings = _case_warnings(case, arm_summaries, comparison)

    return {
        "case_id": case["case_id"],
        "description": case["description"],
        "tags": case["tags"],
        "budgets": case["budgets"],
        "target_fraction": case["target_fraction"],
        "min_final_ratio": case["min_final_ratio"],
        "reference_best_per_gpu": reference,
        "max_budget": max_budget,
        "arms": arm_summaries,
        "winners": winners,
        "harness_vs_naive": comparison,
        "warnings": warnings,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _summarize_arm(arm: dict[str, Any], *, budgets: list[int], spec_path: Path) -> dict[str, Any]:
    """Read an arm's ``state.json`` and summarize its trial history.

    Args:
        arm: Validated arm spec (``name``, ``kind``, ``label``, ``study_root``).
        budgets: Trial counts at which the running best is reported.
        spec_path: Path of the report spec; relative study roots are resolved
            against its directory.

    Returns:
        A dict of counters, the running-best curve, the best value at each
        budget, and the stop reason/diagnosis recorded in the state.

    Raises:
        SpecError: if the study root cannot be resolved or ``state.json`` does
            not contain a JSON object.
    """
    study_root = _resolve_study_root(arm["study_root"], base_dir=spec_path.parent)
    state = json.loads((study_root / "state.json").read_text(encoding="utf-8"))
    if not isinstance(state, dict):
        raise SpecError(f"state.json must contain an object: {study_root}")

    trials = state.get("trials") if isinstance(state.get("trials"), list) else []
    # The curve builder tolerates non-dict trial entries, so the counters must
    # too: such entries count as trials with no feasible rate, not a crash.
    trial_dicts = [item for item in trials if isinstance(item, dict)]
    feasible_count = sum(
        1
        for item in trial_dicts
        if isinstance(item.get("best_request_rate_per_gpu"), (int, float))
    )

    curve = _running_best_curve(trials)
    final_best = curve[-1] if curve else None
    best_trial_index = _first_index_at_value(curve, final_best)
    return {
        "name": arm["name"],
        "kind": arm["kind"],
        "label": arm["label"],
        "study_root": str(study_root),
        "study_id": state.get("study_id"),
        "trial_count": len(trials),
        "completed_count": sum(1 for item in trial_dicts if item.get("status") == "completed"),
        "failed_count": sum(1 for item in trial_dicts if item.get("status") == "failed"),
        "no_feasible_count": len(trials) - feasible_count,
        "best_trial_id": state.get("best_trial_id"),
        "best_trial_index": best_trial_index,
        "final_best_per_gpu": final_best,
        "state_best_per_gpu": state.get("best_request_rate_per_gpu"),
        "best_at_budget": {str(budget): _value_at_budget(curve, budget) for budget in budgets},
        "running_best_per_gpu": curve,
        "stop_reason": str(state.get("tuning_stop_reason") or ""),
        "stop_diagnosis": str(state.get("tuning_stop_diagnosis") or ""),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _add_reference_metrics(
    arm: dict[str, Any],
    *,
    reference: float | None,
    max_budget: int,
    target_fraction: float,
) -> None:
    """Add reference-relative metrics to ``arm`` in place.

    Sets ``final_ratio_to_reference``, ``target_per_gpu``, ``trials_to_target``
    and ``normalized_auc``.  A missing or zero ``reference`` leaves the ratio
    and the target as ``None``.
    """
    final_best = arm.get("final_best_per_gpu")
    if reference and isinstance(final_best, (int, float)):
        ratio = float(final_best) / reference
    else:
        ratio = None
    arm["final_ratio_to_reference"] = ratio

    if reference:
        target = reference * target_fraction
    else:
        target = None
    arm["target_per_gpu"] = target

    curve = arm["running_best_per_gpu"]
    arm["trials_to_target"] = _trials_to_target(curve, target)
    arm["normalized_auc"] = _normalized_auc(
        curve, reference=reference, max_budget=max_budget
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _harness_vs_naive(arms: list[dict[str, Any]], *, min_final_ratio: float) -> list[dict[str, Any]]:
    """Compare each ``harness`` arm against the best of the ``naive`` arms.

    Returns one row per harness arm, or an empty list when either kind is
    absent.  A row passes when its final ratio reaches ``min_final_ratio`` and
    its trials-to-target speedup is either unavailable or at least 1.0.
    """
    naive_arms: list[dict[str, Any]] = []
    harness_arms: list[dict[str, Any]] = []
    for arm in arms:
        if arm["kind"] == "naive":
            naive_arms.append(arm)
        elif arm["kind"] == "harness":
            harness_arms.append(arm)
    if not (naive_arms and harness_arms):
        return []

    naive_final = _max_optional(arm.get("final_best_per_gpu") for arm in naive_arms)
    naive_ttt = _min_optional(arm.get("trials_to_target") for arm in naive_arms)
    naive_auc = _max_optional(arm.get("normalized_auc") for arm in naive_arms)

    results = []
    for harness in harness_arms:
        final = harness.get("final_best_per_gpu")
        auc = harness.get("normalized_auc")

        if naive_final and isinstance(final, (int, float)):
            final_ratio = float(final) / naive_final
        else:
            final_ratio = None

        if naive_auc and isinstance(auc, (int, float)):
            auc_ratio = float(auc) / naive_auc
        else:
            auc_ratio = None

        speedup = _speedup(naive_ttt, harness.get("trials_to_target"))
        meets_final = final_ratio is not None and final_ratio >= min_final_ratio
        # No speedup figure (None) is not held against the harness.
        meets_speed = speedup is None or speedup >= 1.0

        results.append(
            {
                "harness": harness["name"],
                "best_naive_final_per_gpu": naive_final,
                "best_naive_trials_to_target": naive_ttt,
                "best_naive_normalized_auc": naive_auc,
                "final_ratio_vs_best_naive": final_ratio,
                "target_trial_speedup_vs_best_naive": speedup,
                "auc_ratio_vs_best_naive": auc_ratio,
                "passes_min_final_ratio": meets_final,
                "passes_speed": meets_speed,
                "passes": meets_final and meets_speed,
            }
        )
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _case_winners(arms: list[dict[str, Any]]) -> dict[str, str | None]:
    """Name the winning arm per metric, or ``None`` where no arm has a value.

    The highest value wins for final best and normalized AUC; the fewest
    trials wins for trials-to-target.
    """
    winners: dict[str, str | None] = {}
    winners["final_best"] = _argmax(arms, "final_best_per_gpu")
    winners["fastest_to_target"] = _argmin(arms, "trials_to_target")
    winners["normalized_auc"] = _argmax(arms, "normalized_auc")
    return winners
|
||||||
|
|
||||||
|
|
||||||
|
def _aggregate_cases(cases: list[dict[str, Any]]) -> dict[str, Any]:
    """Roll case summaries up into report-wide statistics.

    Produces per-metric winner counts keyed by arm name, the harness-vs-naive
    pass/check totals, and per-kind arm counts, mean final ratio, mean
    normalized AUC and the number of arms that reached their target.
    """
    winner_counts: dict[str, dict[str, int]] = {
        "final_best": {},
        "fastest_to_target": {},
        "normalized_auc": {},
    }
    passes = 0
    checks = 0

    # Per-kind accumulators; dict insertion order keeps first-seen kind order.
    arm_counts: dict[str, int] = {}
    reached_counts: dict[str, int] = {}
    ratio_samples: dict[str, list[float]] = {}
    auc_samples: dict[str, list[float]] = {}

    for case in cases:
        for metric, tally in winner_counts.items():
            winner = case["winners"].get(metric)
            if winner:
                tally[winner] = tally.get(winner, 0) + 1

        for row in case["harness_vs_naive"]:
            checks += 1
            if row["passes"]:
                passes += 1

        for arm in case["arms"]:
            kind = arm["kind"]
            if kind not in arm_counts:
                arm_counts[kind] = 0
                reached_counts[kind] = 0
                ratio_samples[kind] = []
                auc_samples[kind] = []
            arm_counts[kind] += 1

            ratio = arm.get("final_ratio_to_reference")
            if isinstance(ratio, (int, float)):
                ratio_samples[kind].append(float(ratio))

            auc = arm.get("normalized_auc")
            if isinstance(auc, (int, float)):
                auc_samples[kind].append(float(auc))

            if isinstance(arm.get("trials_to_target"), int):
                reached_counts[kind] += 1

    by_kind: dict[str, dict[str, Any]] = {}
    for kind, count in arm_counts.items():
        by_kind[kind] = {
            "arm_count": count,
            "mean_final_ratio_to_reference": _mean(ratio_samples[kind]),
            "mean_normalized_auc": _mean(auc_samples[kind]),
            "target_reached_count": reached_counts[kind],
        }

    return {
        "case_count": len(cases),
        "by_kind": by_kind,
        "winner_counts": winner_counts,
        "harness_vs_naive_pass_count": passes,
        "harness_vs_naive_check_count": checks,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _case_warnings(
|
||||||
|
case: dict[str, Any],
|
||||||
|
arms: list[dict[str, Any]],
|
||||||
|
comparison: list[dict[str, Any]],
|
||||||
|
) -> list[str]:
|
||||||
|
warnings = []
|
||||||
|
kinds = {arm["kind"] for arm in arms}
|
||||||
|
if "harness" not in kinds or "naive" not in kinds:
|
||||||
|
warnings.append("case does not include both harness and naive arms")
|
||||||
|
if len(case["tags"]) < 2:
|
||||||
|
warnings.append("case has few tags; add workload/model/SLO tags to support generalization claims")
|
||||||
|
if not comparison:
|
||||||
|
return warnings
|
||||||
|
for row in comparison:
|
||||||
|
if not row["passes_min_final_ratio"]:
|
||||||
|
warnings.append(
|
||||||
|
f"{row['harness']} final best is below min_final_ratio versus best naive"
|
||||||
|
)
|
||||||
|
if not row["passes_speed"]:
|
||||||
|
warnings.append(
|
||||||
|
f"{row['harness']} reaches target later than best naive"
|
||||||
|
)
|
||||||
|
return warnings
|
||||||
|
|
||||||
|
|
||||||
|
def _running_best_curve(trials: list[Any]) -> list[float | None]:
|
||||||
|
curve: list[float | None] = []
|
||||||
|
incumbent: float | None = None
|
||||||
|
for trial in trials:
|
||||||
|
rate = trial.get("best_request_rate_per_gpu") if isinstance(trial, dict) else None
|
||||||
|
if isinstance(rate, (int, float)) and (incumbent is None or float(rate) > incumbent):
|
||||||
|
incumbent = float(rate)
|
||||||
|
curve.append(incumbent)
|
||||||
|
return curve
|
||||||
|
|
||||||
|
|
||||||
|
def _value_at_budget(curve: list[float | None], budget: int) -> float | None:
|
||||||
|
if not curve:
|
||||||
|
return None
|
||||||
|
index = min(max(budget, 1), len(curve)) - 1
|
||||||
|
return curve[index]
|
||||||
|
|
||||||
|
|
||||||
|
def _trials_to_target(curve: list[float | None], target: float | None) -> int | None:
|
||||||
|
if target is None:
|
||||||
|
return None
|
||||||
|
for idx, value in enumerate(curve, start=1):
|
||||||
|
if isinstance(value, (int, float)) and value >= target:
|
||||||
|
return idx
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _normalized_auc(
    curve: list[float | None],
    *,
    reference: float | None,
    max_budget: int,
) -> float | None:
    """Return the mean running best over budgets ``1..max_budget``, divided by ``reference``.

    Budgets without a numeric value contribute zero; budgets past the end of
    the curve take the value ``_value_at_budget`` returns for them.  Returns
    ``None`` when ``reference`` is missing or zero, or ``max_budget`` is not
    positive.
    """
    if max_budget <= 0 or not reference:
        return None
    area = 0.0
    for step in range(1, max_budget + 1):
        point = _value_at_budget(curve, step)
        if isinstance(point, (int, float)):
            area += float(point)
        else:
            area += 0.0
    return area / (reference * max_budget)
|
||||||
|
|
||||||
|
|
||||||
|
def _reference_best(arms: list[dict[str, Any]]) -> float | None:
|
||||||
|
return _max_optional(arm.get("final_best_per_gpu") for arm in arms)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_study_root(raw_path: str, *, base_dir: Path) -> Path:
    """Locate the study directory that holds ``state.json``.

    ``raw_path`` may point at the study directory itself or at a parent that
    has exactly one immediate child containing ``state.json``.

    Raises:
        SpecError: if no ``state.json`` is found, or more than one child
            directory contains one.
    """
    root = _resolve_path(raw_path, base_dir=base_dir)
    if (root / "state.json").exists():
        return root

    candidates = sorted(root.glob("*/state.json"))
    if not candidates:
        raise SpecError(f"study_root does not contain state.json: {root}")
    if len(candidates) > 1:
        raise SpecError(f"study_root is ambiguous; point to a specific study directory: {root}")
    return candidates[0].parent
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_path(raw_path: str, *, base_dir: Path) -> Path:
|
||||||
|
path = Path(raw_path)
|
||||||
|
if not path.is_absolute():
|
||||||
|
path = (base_dir / path).resolve()
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def _as_float(value: Any, *, default: float) -> float:
|
||||||
|
if value is None:
|
||||||
|
return default
|
||||||
|
if isinstance(value, bool) or not isinstance(value, (int, float)):
|
||||||
|
raise SpecError(f"Expected numeric value, got {value!r}.")
|
||||||
|
return float(value)
|
||||||
|
|
||||||
|
|
||||||
|
def _positive_int(value: Any, *, context: str) -> int:
|
||||||
|
if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
|
||||||
|
raise SpecError(f"{context} must contain positive integers.")
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def _first_index_at_value(curve: list[float | None], value: float | None) -> int | None:
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
for idx, item in enumerate(curve, start=1):
|
||||||
|
if item == value:
|
||||||
|
return idx
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _argmax(rows: list[dict[str, Any]], key: str) -> str | None:
|
||||||
|
scored = [
|
||||||
|
(str(row["name"]), float(row[key]))
|
||||||
|
for row in rows
|
||||||
|
if isinstance(row.get(key), (int, float))
|
||||||
|
]
|
||||||
|
if not scored:
|
||||||
|
return None
|
||||||
|
scored.sort(key=lambda item: item[1], reverse=True)
|
||||||
|
return scored[0][0]
|
||||||
|
|
||||||
|
|
||||||
|
def _argmin(rows: list[dict[str, Any]], key: str) -> str | None:
|
||||||
|
scored = [
|
||||||
|
(str(row["name"]), int(row[key]))
|
||||||
|
for row in rows
|
||||||
|
if isinstance(row.get(key), int)
|
||||||
|
]
|
||||||
|
if not scored:
|
||||||
|
return None
|
||||||
|
scored.sort(key=lambda item: item[1])
|
||||||
|
return scored[0][0]
|
||||||
|
|
||||||
|
|
||||||
|
def _max_optional(values: Any) -> float | None:
|
||||||
|
scored = [float(item) for item in values if isinstance(item, (int, float))]
|
||||||
|
return max(scored) if scored else None
|
||||||
|
|
||||||
|
|
||||||
|
def _min_optional(values: Any) -> int | None:
|
||||||
|
scored = [int(item) for item in values if isinstance(item, int)]
|
||||||
|
return min(scored) if scored else None
|
||||||
|
|
||||||
|
|
||||||
|
def _mean(values: list[float]) -> float | None:
|
||||||
|
return sum(values) / len(values) if values else None
|
||||||
|
|
||||||
|
|
||||||
|
def _speedup(naive_trials: int | None, harness_trials: int | None) -> float | None:
|
||||||
|
if harness_trials is None:
|
||||||
|
return 0.0 if naive_trials is not None else None
|
||||||
|
if naive_trials is None:
|
||||||
|
return None
|
||||||
|
if harness_trials <= 0:
|
||||||
|
return None
|
||||||
|
return float(naive_trials) / float(harness_trials)
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt(value: Any) -> str:
|
||||||
|
if isinstance(value, float):
|
||||||
|
return f"{value:.4f}"
|
||||||
|
if value is None:
|
||||||
|
return "-"
|
||||||
|
return str(value)
|
||||||
|
|
||||||
|
|
||||||
|
def _render_report(summary: dict[str, Any]) -> str:
    """Render the report summary as a Markdown document.

    The document has an aggregate section, a per-kind table, and one section
    per case.  Each case section holds an arm table and, when the case has
    harness-vs-naive rows, a comparison table.
    """

    def table_row(cells: list[str]) -> str:
        # One Markdown table row built from already-formatted cells.
        return "| " + " | ".join(cells) + " |"

    aggregate = summary["aggregate"]
    out: list[str] = [
        f"# {summary['report_id']}",
        "",
        "## Aggregate",
        "",
        f"- Cases: `{aggregate['case_count']}`",
        f"- Harness-vs-naive pass/checks: `{aggregate['harness_vs_naive_pass_count']}`/`{aggregate['harness_vs_naive_check_count']}`",
        f"- Winner counts: `{json.dumps(aggregate['winner_counts'], ensure_ascii=False)}`",
        "",
        "## By Kind",
        "",
        "| Kind | Arms | Mean final/ref | Mean AUC | Target reached |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]

    for kind, stats in sorted(aggregate["by_kind"].items()):
        out.append(
            table_row(
                [
                    f"`{kind}`",
                    str(stats["arm_count"]),
                    _fmt(stats["mean_final_ratio_to_reference"]),
                    _fmt(stats["mean_normalized_auc"]),
                    str(stats["target_reached_count"]),
                ]
            )
        )

    out += ["", "## Cases", ""]

    for case in summary["cases"]:
        out.append(f"### {case['case_id']}")
        out.append("")
        out.append(f"- Reference best req/s/GPU: `{_fmt(case['reference_best_per_gpu'])}`")
        out.append(f"- Target fraction: `{case['target_fraction']}`")
        out.append(f"- Winners: `{json.dumps(case['winners'], ensure_ascii=False)}`")
        if case["warnings"]:
            out.append(f"- Warnings: `{json.dumps(case['warnings'], ensure_ascii=False)}`")

        out.append("")
        out.append("| Arm | Kind | Trials | Final/GPU | Final/ref | TTT | AUC | Failed | No feasible |")
        out.append("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |")
        for arm in case["arms"]:
            out.append(
                table_row(
                    [
                        f"`{arm['name']}`",
                        f"`{arm['kind']}`",
                        str(arm["trial_count"]),
                        _fmt(arm["final_best_per_gpu"]),
                        _fmt(arm["final_ratio_to_reference"]),
                        _fmt(arm["trials_to_target"]),
                        _fmt(arm["normalized_auc"]),
                        str(arm["failed_count"]),
                        str(arm["no_feasible_count"]),
                    ]
                )
            )

        if case["harness_vs_naive"]:
            out.append("")
            out.append("| Harness | Final vs best naive | Target speedup | AUC vs best naive | Pass |")
            out.append("| --- | ---: | ---: | ---: | --- |")
            for row in case["harness_vs_naive"]:
                out.append(
                    table_row(
                        [
                            f"`{row['harness']}`",
                            _fmt(row["final_ratio_vs_best_naive"]),
                            _fmt(row["target_trial_speedup_vs_best_naive"]),
                            _fmt(row["auc_ratio_vs_best_naive"]),
                            f"`{row['passes']}`",
                        ]
                    )
                )

        # Blank line closes each case section.
        out.append("")

    return "\n".join(out)
|
||||||
@@ -18,7 +18,7 @@ from .engine import build_launch_recipe
|
|||||||
from .http_client import HttpClientError, stream_chat_completion, wait_for_server
|
from .http_client import HttpClientError, stream_chat_completion, wait_for_server
|
||||||
from .lca import find_convergence_prefix, resolve_length_mode
|
from .lca import find_convergence_prefix, resolve_length_mode
|
||||||
from .search import ThresholdProbe, binary_search_max_feasible
|
from .search import ThresholdProbe, binary_search_max_feasible
|
||||||
from .slo import RequestOutcome, evaluate_request, summarize_evaluations
|
from .slo import RequestOutcome, _rule_threshold_ms, evaluate_request, summarize_evaluations
|
||||||
from .spec import ConfigPatch, SamplingSearchSpec, TrialSpec, load_study_spec, to_jsonable
|
from .spec import ConfigPatch, SamplingSearchSpec, TrialSpec, load_study_spec, to_jsonable
|
||||||
from .trace import TraceRequest, load_trace_requests, select_requests_for_threshold
|
from .trace import TraceRequest, load_trace_requests, select_requests_for_threshold
|
||||||
|
|
||||||
@@ -254,6 +254,34 @@ def _ignore_sigterm_if_main() -> None:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _probe_drain_deadline(
    reqs: list[TraceRequest], slo: Any, *, ceiling: float | None
) -> float | None:
    """Compute the per-probe drain deadline in seconds.

    The deadline is the latest arrival among ``reqs``, plus the TTFT threshold
    (0 when the SLO has no TTFT rule), plus the p99 completion-token hint
    times the TPOT threshold, plus a fixed 30 second margin.  Both thresholds
    are looked up at the p99 prompt-token hint and are in milliseconds.

    ``ceiling`` is returned unchanged when there are no requests or the SLO
    has no TPOT rule; otherwise a truthy ``ceiling`` caps the deadline.
    """
    if not reqs or slo.tpot_rule is None:
        return ceiling

    latest_arrival = max(float(req.arrival_s or 0.0) for req in reqs)
    prompt_hints = sorted(int(req.prompt_tokens_hint or 0) for req in reqs)
    completion_hints = sorted(int(req.completion_tokens_hint or 0) for req in reqs)

    def nearest_rank_p99(ordered: list[int]) -> int:
        # Element at index floor(0.99 * n), clamped to the last element.
        if not ordered:
            return 0
        return ordered[min(len(ordered) - 1, int(0.99 * len(ordered)))]

    p99_in = nearest_rank_p99(prompt_hints)
    p99_out = nearest_rank_p99(completion_hints)

    tpot_ms = _rule_threshold_ms(slo.tpot_rule, p99_in)
    if slo.ttft_rule is not None:
        ttft_ms = _rule_threshold_ms(slo.ttft_rule, p99_in)
    else:
        ttft_ms = 0.0

    margin_s = 30.0
    deadline = latest_arrival + (ttft_ms + p99_out * tpot_ms) / 1000.0 + margin_s

    # NOTE(review): a ceiling of 0 is falsy and is treated here as "no cap",
    # whereas the early return above passes it through unchanged -- confirm
    # that this is the intended meaning of a zero ceiling.
    if ceiling:
        return min(float(ceiling), deadline)
    return deadline
|
||||||
|
|
||||||
|
|
||||||
def _adaptive_replay_set(
|
def _adaptive_replay_set(
|
||||||
selected: list[TraceRequest],
|
selected: list[TraceRequest],
|
||||||
*,
|
*,
|
||||||
@@ -640,7 +668,9 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
|||||||
max_concurrency=study.trace.max_concurrency,
|
max_concurrency=study.trace.max_concurrency,
|
||||||
target_pass_rate=study.slo.target_pass_rate,
|
target_pass_rate=study.slo.target_pass_rate,
|
||||||
max_lag_s=study.trace.early_stop_max_lag_s,
|
max_lag_s=study.trace.early_stop_max_lag_s,
|
||||||
max_elapsed_s=study.trace.early_stop_max_elapsed_s,
|
max_elapsed_s=_probe_drain_deadline(
|
||||||
|
reqs, study.slo, ceiling=study.trace.early_stop_max_elapsed_s
|
||||||
|
),
|
||||||
evaluate_outcome=lambda outcome: evaluate_request(outcome, study.slo),
|
evaluate_outcome=lambda outcome: evaluate_request(outcome, study.slo),
|
||||||
drain_inflight_on_early_stop=not restart_after_early_stop,
|
drain_inflight_on_early_stop=not restart_after_early_stop,
|
||||||
)
|
)
|
||||||
@@ -751,20 +781,28 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
|||||||
best = primary_search.best_feasible_payload
|
best = primary_search.best_feasible_payload
|
||||||
best_source = "primary_search"
|
best_source = "primary_search"
|
||||||
fallback_search = None
|
fallback_search = None
|
||||||
|
skipped_lower_range_fallback = False
|
||||||
|
lower_range_fallback_skip_reason = ""
|
||||||
original_search_low = float(study.search.low)
|
original_search_low = float(study.search.low)
|
||||||
inherited_search_floor = float(trial.search.low)
|
inherited_search_floor = float(trial.search.low)
|
||||||
if best is None and inherited_search_floor > original_search_low:
|
if best is None and inherited_search_floor > original_search_low:
|
||||||
fallback_search = binary_search_max_feasible(
|
if trial.search.inherit_incumbent_floor:
|
||||||
low=original_search_low,
|
skipped_lower_range_fallback = True
|
||||||
high=inherited_search_floor,
|
lower_range_fallback_skip_reason = (
|
||||||
tolerance=trial.search.tolerance,
|
"primary_search_above_incumbent_floor_all_infeasible"
|
||||||
max_probes=trial.search.max_probes,
|
)
|
||||||
evaluator=evaluator,
|
else:
|
||||||
)
|
fallback_search = binary_search_max_feasible(
|
||||||
if fallback_search.best_feasible_payload is not None:
|
low=original_search_low,
|
||||||
search_for_best = fallback_search
|
high=inherited_search_floor,
|
||||||
best = fallback_search.best_feasible_payload
|
tolerance=trial.search.tolerance,
|
||||||
best_source = "lower_range_fallback"
|
max_probes=trial.search.max_probes,
|
||||||
|
evaluator=evaluator,
|
||||||
|
)
|
||||||
|
if fallback_search.best_feasible_payload is not None:
|
||||||
|
search_for_best = fallback_search
|
||||||
|
best = fallback_search.best_feasible_payload
|
||||||
|
best_source = "lower_range_fallback"
|
||||||
|
|
||||||
def serialize_probe(probe: ThresholdProbe[ProbePayload]) -> dict[str, Any]:
|
def serialize_probe(probe: ThresholdProbe[ProbePayload]) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
@@ -796,7 +834,7 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
|||||||
"best_request_count": best.request_count if best is not None else None,
|
"best_request_count": best.request_count if best is not None else None,
|
||||||
"probes": [serialize_probe(probe) for probe in all_probes],
|
"probes": [serialize_probe(probe) for probe in all_probes],
|
||||||
}
|
}
|
||||||
if fallback_search is not None:
|
if fallback_search is not None or skipped_lower_range_fallback:
|
||||||
result["primary_search"] = {
|
result["primary_search"] = {
|
||||||
"low": inherited_search_floor,
|
"low": inherited_search_floor,
|
||||||
"high": trial.search.high,
|
"high": trial.search.high,
|
||||||
@@ -808,6 +846,16 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
|||||||
else None,
|
else None,
|
||||||
"probes": [serialize_probe(probe) for probe in primary_search.probes],
|
"probes": [serialize_probe(probe) for probe in primary_search.probes],
|
||||||
}
|
}
|
||||||
|
if skipped_lower_range_fallback:
|
||||||
|
result["lower_range_fallback"] = {
|
||||||
|
"triggered": False,
|
||||||
|
"skipped": True,
|
||||||
|
"reason": lower_range_fallback_skip_reason,
|
||||||
|
"low": original_search_low,
|
||||||
|
"high": inherited_search_floor,
|
||||||
|
"probes": [],
|
||||||
|
}
|
||||||
|
if fallback_search is not None:
|
||||||
result["lower_range_fallback"] = {
|
result["lower_range_fallback"] = {
|
||||||
"triggered": True,
|
"triggered": True,
|
||||||
"low": original_search_low,
|
"low": original_search_low,
|
||||||
|
|||||||
@@ -55,6 +55,7 @@ from aituner.store import StudyStore
|
|||||||
from aituner.trace import load_trace_requests, summarize_window
|
from aituner.trace import load_trace_requests, summarize_window
|
||||||
from aituner.worker import (
|
from aituner.worker import (
|
||||||
_adaptive_replay_set,
|
_adaptive_replay_set,
|
||||||
|
_probe_drain_deadline,
|
||||||
_install_sigterm_as_keyboardinterrupt,
|
_install_sigterm_as_keyboardinterrupt,
|
||||||
_restore_sigterm,
|
_restore_sigterm,
|
||||||
_should_extend_on_boundary,
|
_should_extend_on_boundary,
|
||||||
@@ -535,6 +536,38 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_probe_drain_deadline_tracks_admitted_set_and_caps_at_ceiling(self) -> None:
    """Check the computed drain deadline, the ceiling cap, and the empty-set fallback."""
    slo = SloSpec.from_dict(
        {
            "target_pass_rate": 0.95,
            "ttft_rule": {"kind": "linear_ms", "intercept_ms": 4000, "per_token_ms": 0.125},
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
        }
    )

    def req(arrival_s: float, in_tok: int, out_tok: int) -> TraceRequest:
        # Minimal request: only arrival time and the two token hints vary.
        return TraceRequest(
            row_id="r",
            arrival_s=arrival_s,
            sampling_u=0.1,
            body={},
            prompt_tokens_hint=in_tok,
            completion_tokens_hint=out_tok,
            metadata={},
        )

    # 100 requests arriving every 5s (last arrival 495s), p99 in=8000 / out=2000.
    reqs = [req(float(i * 5), 8000, 2000) for i in range(100)]
    # deadline = last_arrival + (ttft_ms + p99_out*tpot_ms)/1000 + margin
    #          = 495 + (5000 + 2000*50)/1000 + 30 = 495 + 105 + 30 = 630
    # (ttft_ms = 5000 is presumably 4000 + 0.125 * 8000 under the linear_ms rule.)
    self.assertAlmostEqual(
        _probe_drain_deadline(reqs, slo, ceiling=1000.0), 630.0, places=3
    )
    # Ceiling caps a deadline that would otherwise exceed it.
    self.assertEqual(_probe_drain_deadline(reqs, slo, ceiling=400.0), 400.0)
    # No requests -> fall back to the ceiling. (The no-TPOT-rule fallback is
    # not exercised by this test.)
    self.assertEqual(_probe_drain_deadline([], slo, ceiling=400.0), 400.0)
|
||||||
|
|
||||||
def test_linear_ms_ttft_rule_scales_with_input_length(self) -> None:
|
def test_linear_ms_ttft_rule_scales_with_input_length(self) -> None:
|
||||||
slo = SloSpec.from_dict(
|
slo = SloSpec.from_dict(
|
||||||
{
|
{
|
||||||
@@ -965,6 +998,189 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
self.assertIsNotNone(proposal)
|
self.assertIsNotNone(proposal)
|
||||||
self.assertTrue(proposal.should_stop)
|
self.assertTrue(proposal.should_stop)
|
||||||
|
|
||||||
|
def test_harness_stop_after_non_improving_feasible_validation_is_exhausted(self) -> None:
    """Expect a harness stop once post-incumbent trials are feasible but slower.

    trial-0002 is the incumbent at 2.4 req/s; trial-0003 (2.0) and trial-0004
    (2.1) completed after it without beating it. The context must then report
    ``should_stop`` with reason ``post_incumbent_validation_exhausted``.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        study_path = _write_study_assets(tmp_path)
        study = load_study_spec(study_path)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_sampling_u=0.02,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=[
                # Baseline: empty patch, 0.8 req/s.
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=0.8,
                    best_request_rate_per_gpu=0.1,
                    config_patch={"env_patch": {}, "flag_patch": {}},
                ),
                # Incumbent: TP2 x DP4, 2.4 req/s.
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.4,
                    best_request_rate_per_gpu=0.3,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 2,
                            "data-parallel-size": 4,
                        },
                    },
                ),
                # Later topology variant, feasible but below the incumbent.
                TrialSummary(
                    trial_id="trial-0003",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.0,
                    best_request_rate_per_gpu=0.25,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 1,
                            "data-parallel-size": 8,
                        },
                    },
                ),
                # Later runtime variant, feasible but below the incumbent.
                TrialSummary(
                    trial_id="trial-0004",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.1,
                    best_request_rate_per_gpu=0.2625,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {"max-num-seqs": 160},
                    },
                ),
            ],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        self.assertTrue(context["harness_stop"]["should_stop"])
        self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
|
||||||
|
|
||||||
|
def test_harness_validation_uses_full_state_baseline_when_history_window_moves(self) -> None:
    """Expect the incumbent gain to be measured against the very first trial.

    The state holds twelve trials: trial-0001 is the 0.8 req/s baseline,
    trial-0006 the 2.4 req/s incumbent, and trials 0007-0012 carry no
    ``best_request_rate`` (trial-0011 failed). The stop evidence must still
    report a gain above 2.9 (presumably 2.4 / 0.8 = 3.0), i.e. relative to
    trial-0001.
    NOTE(review): the "history window" in the test name presumably refers to a
    bounded recent-trials view inside build_harness_context -- confirm there.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        study_path = _write_study_assets(
            tmp_path,
            engine_overrides={"tunable_flags": ["max-num-seqs"]},
        )
        study = load_study_spec(study_path)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0006",
            best_parallel_size=8,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=[
                # Baseline with an empty patch.
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=0.8,
                    best_request_rate_per_gpu=0.1,
                    config_patch={"env_patch": {}, "flag_patch": {}},
                ),
                # Trials 0002-0006: max-num-seqs raised in steps of 8, rate rising.
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=0.88,
                    best_request_rate_per_gpu=0.11,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 16}},
                ),
                TrialSummary(
                    trial_id="trial-0003",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=0.96,
                    best_request_rate_per_gpu=0.12,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 24}},
                ),
                TrialSummary(
                    trial_id="trial-0004",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=1.04,
                    best_request_rate_per_gpu=0.13,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 32}},
                ),
                TrialSummary(
                    trial_id="trial-0005",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.24,
                    best_request_rate_per_gpu=0.28,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 40}},
                ),
                # Incumbent.
                TrialSummary(
                    trial_id="trial-0006",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.4,
                    best_request_rate_per_gpu=0.3,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 48}},
                ),
                # Trials 0007-0012: no best_request_rate recorded.
                TrialSummary(
                    trial_id="trial-0007",
                    status="completed",
                    parallel_size=8,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 56}},
                ),
                TrialSummary(
                    trial_id="trial-0008",
                    status="completed",
                    parallel_size=8,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
                ),
                TrialSummary(
                    trial_id="trial-0009",
                    status="completed",
                    parallel_size=8,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 72}},
                ),
                TrialSummary(
                    trial_id="trial-0010",
                    status="completed",
                    parallel_size=8,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 80}},
                ),
                TrialSummary(
                    trial_id="trial-0011",
                    status="failed",
                    parallel_size=8,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 88}},
                ),
                TrialSummary(
                    trial_id="trial-0012",
                    status="completed",
                    parallel_size=8,
                    config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 96}},
                ),
            ],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        self.assertTrue(context["harness_stop"]["should_stop"])
        self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
        # Gain must be computed against trial-0001 (0.8 req/s), not a later trial.
        self.assertGreater(
            context["harness_stop"]["evidence"]["incumbent_gain_vs_baseline"],
            2.9,
        )
|
||||||
|
|
||||||
def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
|
def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
tmp_path = Path(tmp)
|
tmp_path = Path(tmp)
|
||||||
@@ -1285,6 +1501,305 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_harness_runtime_refinement_preserves_incumbent_runtime_knobs(self) -> None:
    """Expect the guided proposal to keep every flag of the incumbent patch.

    The incumbent (trial-0002) runs TP4 with ``gpu-memory-utilization`` 0.92 and
    ``max-num-seqs`` 48. The proposed patch must contain those three values
    unchanged plus ``enable-chunked-prefill`` and ``max-num-batched-tokens``.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        study_path = _write_study_assets(
            tmp_path,
            engine_overrides={
                "tunable_flags": [
                    "tensor-parallel-size",
                    "gpu-memory-utilization",
                    "max-num-seqs",
                    "enable-chunked-prefill",
                    "max-num-batched-tokens",
                ],
                "topology_constraints": {
                    "allowed_tensor_parallel_sizes": [1, 2, 4],
                    "allowed_tp_dp_products": [1, 2, 4],
                },
            },
        )
        study = load_study_spec(study_path)
        # On-disk result for the incumbent: a single feasible probe, no failures.
        result_path = tmp_path / "trial-0002.json"
        result_path.write_text(
            json.dumps(
                {
                    "status": "completed",
                    "best_sampling_u": 0.098,
                    "best_request_rate": 3.3,
                    "best_pass_rate": 0.97,
                    "probes": [
                        {
                            "threshold": 0.098,
                            "feasible": True,
                            "payload": {
                                "request_count": 100,
                                "pass_rate": 0.97,
                                "request_rate": 3.3,
                                "early_stopped": False,
                                "early_stop_reason": "",
                                "latency_summary": {"failed_reason_counts": {}},
                            },
                        }
                    ],
                }
            ),
            encoding="utf-8",
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=3.3,
            best_request_rate_per_gpu=0.825,
            trials=[
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    best_request_rate=2.5,
                    best_request_rate_per_gpu=0.625,
                    config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
                ),
                # Incumbent: same TP as trial-0001 plus two runtime knobs.
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    best_request_rate=3.3,
                    best_request_rate_per_gpu=0.825,
                    result_path=str(result_path),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 4,
                            "gpu-memory-utilization": 0.92,
                            "max-num-seqs": 48,
                        },
                    },
                ),
            ],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p99": 8100},
            state=state,
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        # First three entries equal the incumbent patch; the last two are new.
        self.assertEqual(
            proposal.config_patch.flag_patch,
            {
                "tensor-parallel-size": 4,
                "gpu-memory-utilization": 0.92,
                "max-num-seqs": 48,
                "enable-chunked-prefill": True,
                "max-num-batched-tokens": 16384,
            },
        )
|
||||||
|
|
||||||
|
def test_harness_raises_gpu_mem_util_on_settled_decode_bound_incumbent(self) -> None:
    """Regression for the coverage gap that let the naive baseline beat the harness:
    a settled TP incumbent that is decode_tpot-bound must get a gpu-memory-utilization
    raise (KV-cache headroom) before the harness is allowed to stop."""
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        study_path = _write_study_assets(
            tmp_path,
            slo_overrides={
                "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
                "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
            },
            engine_overrides={
                "tunable_flags": [
                    "tensor-parallel-size",
                    "gpu-memory-utilization",
                ],
                # TP4 is the largest allowed size, so the incumbent has no TP step left.
                "topology_constraints": {
                    "allowed_tensor_parallel_sizes": [1, 2, 4],
                    "allowed_data_parallel_sizes": [1],
                    "allowed_tp_dp_products": [1, 2, 4],
                },
            },
        )
        study = load_study_spec(study_path)
        # Incumbent result: one feasible probe, then an infeasible probe whose
        # recorded failures are all TPOT violations.
        result_path = tmp_path / "trial-0002.json"
        result_path.write_text(
            json.dumps(
                {
                    "status": "completed",
                    "best_sampling_u": 0.074,
                    "best_request_rate": 2.6,
                    "best_pass_rate": 0.97,
                    "probes": [
                        {
                            "threshold": 0.074,
                            "feasible": True,
                            "payload": {
                                "request_count": 300,
                                "pass_rate": 0.97,
                                "request_rate": 2.6,
                                "latency_summary": {"failed_reason_counts": {}},
                            },
                        },
                        {
                            "threshold": 0.09,
                            "feasible": False,
                            "payload": {
                                "request_count": 300,
                                "pass_rate": 0.6,
                                "request_rate": 3.2,
                                "early_stop_reason": "slo_pass_rate_unrecoverable",
                                "latency_summary": {
                                    "failed_reason_counts": {"tpot_ms>50.0": 90}
                                },
                            },
                        },
                    ],
                }
            ),
            encoding="utf-8",
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=2.6,
            best_request_rate_per_gpu=0.65,
            trials=[
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    best_request_rate=1.1,
                    best_request_rate_per_gpu=0.275,
                    config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
                ),
                # Incumbent: TP4 with gpu-memory-utilization 0.9.
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    best_request_rate=2.6,
                    best_request_rate_per_gpu=0.65,
                    result_path=str(result_path),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 4,
                            "gpu-memory-utilization": 0.9,
                        },
                    },
                ),
            ],
        )
        context = build_harness_context(
            study=study, window_summary={"prompt_tokens_p95": 1500}, state=state
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertFalse(proposal.should_stop)
        # TP4 preserved; gpu-memory-utilization hill-climbed one step (0.9 -> 0.92).
        self.assertEqual(
            proposal.config_patch.flag_patch.get("tensor-parallel-size"), 4
        )
        self.assertEqual(
            proposal.config_patch.flag_patch.get("gpu-memory-utilization"), 0.92
        )
        # And the harness must NOT authorize a stop while that knob is untried.
        self.assertIsNone(build_harness_stop_proposal(context))
|
||||||
|
|
||||||
|
def test_harness_climbs_tp_before_gpu_mem_util_micro_tuning(self) -> None:
    """gpu-memory-utilization must not preempt an untried TP increase: at a TP2 incumbent
    with TP4 still reachable, the harness must climb TP, not micro-tune runtime."""
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        study_path = _write_study_assets(
            tmp_path,
            slo_overrides={
                "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
                "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
            },
            engine_overrides={
                "tunable_flags": ["tensor-parallel-size", "gpu-memory-utilization"],
                # TP4 is allowed but no trial below has used it yet.
                "topology_constraints": {
                    "allowed_tensor_parallel_sizes": [1, 2, 4],
                    "allowed_data_parallel_sizes": [1],
                    "allowed_tp_dp_products": [1, 2, 4],
                },
            },
        )
        study = load_study_spec(study_path)
        # Incumbent result: one feasible probe, then an infeasible probe whose
        # recorded failures are all TPOT violations.
        result_path = tmp_path / "trial-0002.json"
        result_path.write_text(
            json.dumps(
                {
                    "status": "completed",
                    "best_sampling_u": 0.03,
                    "best_request_rate": 1.1,
                    "best_pass_rate": 0.97,
                    "probes": [
                        {
                            "threshold": 0.03,
                            "feasible": True,
                            "payload": {
                                "request_count": 300,
                                "pass_rate": 0.97,
                                "request_rate": 1.1,
                                "latency_summary": {"failed_reason_counts": {}},
                            },
                        },
                        {
                            "threshold": 0.05,
                            "feasible": False,
                            "payload": {
                                "request_count": 300,
                                "pass_rate": 0.6,
                                "request_rate": 1.6,
                                "early_stop_reason": "slo_pass_rate_unrecoverable",
                                "latency_summary": {
                                    "failed_reason_counts": {"tpot_ms>50.0": 90}
                                },
                            },
                        },
                    ],
                }
            ),
            encoding="utf-8",
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=1.1,
            best_request_rate_per_gpu=0.55,
            trials=[
                # Empty-patch baseline.
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    best_request_rate=0.6,
                    best_request_rate_per_gpu=0.6,
                    config_patch={"env_patch": {}, "flag_patch": {}},
                ),
                # Incumbent: TP2 with gpu-memory-utilization 0.9.
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    best_request_rate=1.1,
                    best_request_rate_per_gpu=0.55,
                    result_path=str(result_path),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 2,
                            "gpu-memory-utilization": 0.9,
                        },
                    },
                ),
            ],
        )
        context = build_harness_context(
            study=study, window_summary={"prompt_tokens_p95": 1500}, state=state
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        # Must climb TP (to 4), and must NOT micro-tune gpu-memory-utilization yet.
        self.assertEqual(
            proposal.config_patch.flag_patch.get("tensor-parallel-size"), 4
        )
        self.assertNotIn("gpu-memory-utilization", proposal.config_patch.flag_patch)
|
||||||
|
|
||||||
def test_harness_validates_unmeasured_tp_frontier_before_runtime_refinement(self) -> None:
|
def test_harness_validates_unmeasured_tp_frontier_before_runtime_refinement(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
tmp_path = Path(tmp)
|
tmp_path = Path(tmp)
|
||||||
@@ -1543,6 +2058,282 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
{"max-num-seqs": 32},
|
{"max-num-seqs": 32},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_prefill_convergence_stop_waits_for_sequence_concurrency_probe(self) -> None:
    """Expect no stop while ``max-num-seqs`` is still untried on the incumbent.

    The TP8 incumbent (trial-0001) is followed by two ``max-num-batched-tokens``
    variants at the same rate and one infeasible TP4 x DP2 trial; none of them
    touches ``max-num-seqs``. The harness must therefore keep going, and the
    planned next action must set ``max-num-seqs`` to 96 while keeping TP8.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        study_path = _write_study_assets(
            tmp_path,
            engine_overrides={
                "base_flags": {
                    "host": "127.0.0.1",
                    "port": 8000,
                    "tensor-parallel-size": 4,
                    "data-parallel-size": 1,
                    "max-num-batched-tokens": 8192,
                    "max-num-seqs": 64,
                    "enable-chunked-prefill": True,
                },
                "tunable_flags": [
                    "tensor-parallel-size",
                    "data-parallel-size",
                    "max-num-batched-tokens",
                    "max-num-seqs",
                    "enable-chunked-prefill",
                ],
                "topology_constraints": {
                    "allowed_tensor_parallel_sizes": [4, 8],
                    "allowed_data_parallel_sizes": [1, 2],
                    "allowed_tp_dp_products": [4, 8],
                },
            },
        )

        def write_result(name: str, best_rate: float | None, pass_rate: float) -> Path:
            # Write a one-probe result file; best_rate=None marks the trial as
            # infeasible (no best_* values, probe flagged as early-stopped).
            # Every file records TTFT violations as the only failure reason.
            path = tmp_path / f"{name}.json"
            payload = {
                "status": "completed",
                "best_sampling_u": 0.091796875 if best_rate is not None else None,
                "best_request_rate": best_rate,
                "best_pass_rate": pass_rate if best_rate is not None else None,
                "probes": [
                    {
                        "threshold": 0.09375,
                        "feasible": best_rate is not None,
                        "payload": {
                            "request_rate": best_rate,
                            "pass_rate": pass_rate,
                            "early_stop_reason": (
                                "" if best_rate is not None else "slo_pass_rate_unrecoverable"
                            ),
                            "latency_summary": {
                                "failed_reason_counts": {"ttft_ms>4000.0": 32}
                            },
                        },
                    }
                ],
            }
            path.write_text(json.dumps(payload), encoding="utf-8")
            return path

        study = load_study_spec(study_path)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_sampling_u=0.091796875,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            trials=[
                # Incumbent: TP8 x DP1.
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.303,
                    best_request_rate_per_gpu=0.288,
                    best_pass_rate=0.952,
                    result_path=str(write_result("trial-0001", 2.303, 0.952)),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 8,
                            "data-parallel-size": 1,
                        },
                    },
                ),
                # Larger batched-token budget: same rate as the incumbent.
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.303,
                    best_request_rate_per_gpu=0.288,
                    best_pass_rate=0.953,
                    result_path=str(write_result("trial-0002", 2.303, 0.953)),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 8,
                            "max-num-batched-tokens": 32768,
                        },
                    },
                ),
                # TP4 x DP2: no feasible rate found.
                TrialSummary(
                    trial_id="trial-0003",
                    status="completed",
                    parallel_size=8,
                    result_path=str(write_result("trial-0003", None, 0.0)),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 4,
                            "data-parallel-size": 2,
                        },
                    },
                ),
                # Intermediate batched-token budget: again the same rate.
                TrialSummary(
                    trial_id="trial-0004",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.303,
                    best_request_rate_per_gpu=0.288,
                    best_pass_rate=0.954,
                    result_path=str(write_result("trial-0004", 2.303, 0.954)),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 8,
                            "data-parallel-size": 1,
                            "max-num-batched-tokens": 12288,
                        },
                    },
                ),
            ],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000},
            state=state,
        )
        self.assertFalse(context["harness_stop"]["should_stop"])
        self.assertEqual(
            context["harness_stop"]["reason"],
            "experiment_plan_has_high_value_candidate",
        )
        # The planned action raises max-num-seqs (base value 64) and keeps TP8.
        action = context["experiment_plan"]["next_action"]
        self.assertEqual(action["knob_family"], "max-num-seqs")
        self.assertEqual(action["config_patch"]["flag_patch"]["max-num-seqs"], 96)
        self.assertEqual(action["config_patch"]["flag_patch"]["tensor-parallel-size"], 8)
|
||||||
|
|
||||||
|
def test_prefill_sequence_probe_followed_by_joint_runtime_probe(self) -> None:
    """Expect a joint runtime probe after separate single-knob probes.

    The study state holds three completed TP=8 trials that all report the
    same request rate (2.303): one with only the topology patch, one that
    adds ``max-num-seqs=96`` and one that adds
    ``max-num-batched-tokens=12288``. Every result file records
    ``ttft_ms>4000.0`` failures. The harness must keep going and propose a
    ``prefill-runtime-interaction`` action that sets both knobs together.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        study_path = _write_study_assets(
            tmp_path,
            engine_overrides={
                "base_flags": {
                    "host": "127.0.0.1",
                    "port": 8000,
                    "tensor-parallel-size": 4,
                    "data-parallel-size": 1,
                    "max-num-batched-tokens": 8192,
                    "max-num-seqs": 64,
                    "enable-chunked-prefill": True,
                },
                "tunable_flags": [
                    "tensor-parallel-size",
                    "data-parallel-size",
                    "max-num-batched-tokens",
                    "max-num-seqs",
                    "enable-chunked-prefill",
                ],
                "topology_constraints": {
                    "allowed_tensor_parallel_sizes": [4, 8],
                    "allowed_data_parallel_sizes": [1, 2],
                    "allowed_tp_dp_products": [4, 8],
                },
            },
        )

        def write_result(name: str) -> Path:
            """Write an identical completed-trial result file named *name*."""
            path = tmp_path / f"{name}.json"
            payload = {
                "status": "completed",
                "best_sampling_u": 0.091796875,
                "best_request_rate": 2.303,
                "best_pass_rate": 0.951,
                "probes": [
                    {
                        "threshold": 0.09375,
                        "feasible": True,
                        "payload": {
                            "request_rate": 2.303,
                            "pass_rate": 0.951,
                            "latency_summary": {
                                "failed_reason_counts": {"ttft_ms>4000.0": 32}
                            },
                        },
                    }
                ],
            }
            path.write_text(json.dumps(payload), encoding="utf-8")
            return path

        study = load_study_spec(study_path)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_sampling_u=0.091796875,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            trials=[
                # Topology-only trial; this is the recorded incumbent.
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.303,
                    best_request_rate_per_gpu=0.288,
                    best_pass_rate=0.952,
                    result_path=str(write_result("trial-0001")),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 8,
                            "data-parallel-size": 1,
                        },
                    },
                ),
                # Sequence-count probe: same rate as the incumbent.
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.303,
                    best_request_rate_per_gpu=0.288,
                    best_pass_rate=0.950,
                    result_path=str(write_result("trial-0002")),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 8,
                            "max-num-seqs": 96,
                        },
                    },
                ),
                # Batched-token probe: same rate as the incumbent.
                TrialSummary(
                    trial_id="trial-0003",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.303,
                    best_request_rate_per_gpu=0.288,
                    best_pass_rate=0.950,
                    result_path=str(write_result("trial-0003")),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 8,
                            "data-parallel-size": 1,
                            "max-num-batched-tokens": 12288,
                        },
                    },
                ),
            ],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000},
            state=state,
        )
        self.assertFalse(context["harness_stop"]["should_stop"])
        self.assertEqual(
            context["harness_stop"]["reason"],
            "experiment_plan_has_high_value_candidate",
        )
        action = context["experiment_plan"]["next_action"]
        flag_patch = action["config_patch"]["flag_patch"]
        self.assertEqual(action["knob_family"], "prefill-runtime-interaction")
        self.assertEqual(flag_patch["tensor-parallel-size"], 8)
        self.assertEqual(flag_patch["max-num-batched-tokens"], 16384)
        self.assertEqual(flag_patch["max-num-seqs"], 96)
|
||||||
|
|
||||||
def test_slo_unrecoverable_does_not_mask_latency_bottleneck(self) -> None:
|
def test_slo_unrecoverable_does_not_mask_latency_bottleneck(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
tmp_path = Path(tmp)
|
tmp_path = Path(tmp)
|
||||||
@@ -3273,6 +4064,94 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
[0.25, 0.375],
|
[0.25, 0.375],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_run_trial_skips_fallback_below_incumbent_floor(self) -> None:
    """Expect no lower-range fallback below an inherited incumbent floor.

    The study enables ``inherit_incumbent_floor`` with ``max_probes=2`` and
    the incumbent sits at ``best_sampling_u=0.5``. The replay stub reports
    a 10000 ms TTFT and 1000 ms TPOT for every request, and the test
    expects both primary probes (0.75, then 0.625) to yield no feasible
    rate. The fallback must then be reported as skipped, not triggered.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        study_path = _write_study_assets(tmp_path)
        payload = json.loads(study_path.read_text(encoding="utf-8"))
        payload["search"]["max_probes"] = 2
        payload["search"]["inherit_incumbent_floor"] = True
        study_path.write_text(json.dumps(payload), encoding="utf-8")
        study = load_study_spec(study_path)
        store = StudyStore(tmp_path / ".aituner" / "studies")
        store.init_study(spec_path=study_path, study=study)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=1,
            best_sampling_u=0.5,
            best_request_rate=2.0,
            best_request_rate_per_gpu=2.0,
            next_trial_index=2,
            best_by_parallel_size={
                "1": {
                    "trial_id": "trial-0001",
                    "parallel_size": 1,
                    "best_sampling_u": 0.5,
                    "best_request_rate": 2.0,
                    "best_request_rate_per_gpu": 2.0,
                }
            },
            trials=[],
        )
        proposal = Proposal.from_dict(
            {
                "observation": "runtime patch",
                "diagnosis": "primary range all infeasible",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 2}},
                "expected_effects": ["measure"],
            }
        )
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
        # The materialized search range starts at the incumbent's sampling_u.
        self.assertEqual(trial.search.low, 0.5)
        self.assertTrue(trial.search.inherit_incumbent_floor)

        def fake_replay(requests, **kwargs):
            """Return one successful but very slow outcome per request."""
            return (
                [
                    RequestOutcome(
                        request_id=request.row_id,
                        success=True,
                        ttft_ms=10000.0,
                        tpot_ms=1000.0,
                        prompt_tokens=request.prompt_tokens_hint,
                        completion_tokens=request.completion_tokens_hint,
                    )
                    for request in requests
                ],
                False,
                "",
            )

        # Stub out server start-up, teardown and replay so that run_trial
        # exercises only the search logic.
        process = mock.Mock()
        process.poll.return_value = 0
        with mock.patch("aituner.worker.subprocess.Popen", return_value=process):
            with mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None):
                with mock.patch("aituner.worker._terminate_process_tree", return_value=None):
                    with mock.patch("aituner.worker._replay_requests", side_effect=fake_replay):
                        result = run_trial(Path(trial.artifact_dir) / "trial_spec.json")

        self.assertEqual(result["status"], "completed")
        self.assertIsNone(result["best_request_rate"])
        self.assertEqual(result["best_source"], "primary_search")
        self.assertEqual(result["primary_search"]["low"], 0.5)
        self.assertIsNone(result["primary_search"]["best_request_rate"])
        self.assertEqual(
            [probe["threshold"] for probe in result["primary_search"]["probes"]],
            [0.75, 0.625],
        )
        self.assertEqual(result["lower_range_fallback"]["triggered"], False)
        self.assertEqual(result["lower_range_fallback"]["skipped"], True)
        self.assertEqual(result["lower_range_fallback"]["probes"], [])
        self.assertEqual(
            result["lower_range_fallback"]["reason"],
            "primary_search_above_incumbent_floor_all_infeasible",
        )
        self.assertEqual(
            result["all_infeasible_diagnostics"]["threshold"],
            0.625,
        )
|
||||||
|
|
||||||
def test_materialize_trial_does_not_mutate_input_state_trials(self) -> None:
|
def test_materialize_trial_does_not_mutate_input_state_trials(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
tmp_path = Path(tmp)
|
tmp_path = Path(tmp)
|
||||||
@@ -4242,6 +5121,18 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
self.assertTrue(proposal_path.exists())
|
self.assertTrue(proposal_path.exists())
|
||||||
proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
|
proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
|
||||||
self.assertTrue(proposal["should_stop"])
|
self.assertTrue(proposal["should_stop"])
|
||||||
|
state = store.load_state(study.study_id)
|
||||||
|
self.assertEqual(state.tuning_stop_reason, "harness_stop")
|
||||||
|
self.assertEqual(
|
||||||
|
state.tuning_stop_details["proposal_name"],
|
||||||
|
"harness-stop-0005",
|
||||||
|
)
|
||||||
|
self.assertEqual(state.tuning_stop_details["proposal_source"], "harness")
|
||||||
|
self.assertEqual(
|
||||||
|
state.tuning_stop_details["stop_authorized_by"],
|
||||||
|
"validator",
|
||||||
|
)
|
||||||
|
self.assertTrue(state.tuning_stop_diagnosis)
|
||||||
|
|
||||||
def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
|
def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
|||||||
109
tests/test_tuning_report.py
Normal file
109
tests/test_tuning_report.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from aituner.tuning_report import run_tuning_report
|
||||||
|
|
||||||
|
|
||||||
|
def _write_state(root: Path, *, study_id: str, rates: list[float | None]) -> None:
|
||||||
|
root.mkdir(parents=True)
|
||||||
|
trials = []
|
||||||
|
best_rate = None
|
||||||
|
best_trial_id = None
|
||||||
|
for idx, rate in enumerate(rates, start=1):
|
||||||
|
trial_id = f"trial-{idx:04d}"
|
||||||
|
trials.append(
|
||||||
|
{
|
||||||
|
"trial_id": trial_id,
|
||||||
|
"status": "completed" if rate is not None else "failed",
|
||||||
|
"parallel_size": 1,
|
||||||
|
"best_request_rate": rate,
|
||||||
|
"best_request_rate_per_gpu": rate,
|
||||||
|
"config_patch": {"env_patch": {}, "flag_patch": {}},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if rate is not None and (best_rate is None or rate > best_rate):
|
||||||
|
best_rate = rate
|
||||||
|
best_trial_id = trial_id
|
||||||
|
payload = {
|
||||||
|
"study_id": study_id,
|
||||||
|
"best_trial_id": best_trial_id,
|
||||||
|
"best_request_rate": best_rate,
|
||||||
|
"best_request_rate_per_gpu": best_rate,
|
||||||
|
"next_trial_index": len(rates) + 1,
|
||||||
|
"trials": trials,
|
||||||
|
}
|
||||||
|
(root / "state.json").write_text(json.dumps(payload), encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
class TuningReportTests(unittest.TestCase):
    """Tests for ``run_tuning_report`` over on-disk study state fixtures."""

    def test_tuning_report_scores_harness_vs_naive_anytime_progress(self) -> None:
        """Compare a two-trial harness arm with a four-trial naive arm.

        Both arms end at a best rate of 0.9, but the harness arm gets there
        on trial 2 while the naive arm needs trial 4 and has one failed
        trial. The harness ``study_root`` is the parent ``studies``
        directory, whereas the naive ``study_root`` is the study directory
        itself. The report must name the harness the winner, give the
        expected per-budget bests and trials-to-target, and write
        ``summary.json`` and ``report.md`` under the output root.
        """
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            _write_state(
                tmp_path / "studies" / "harness-study",
                study_id="harness-study",
                rates=[0.4, 0.9],
            )
            _write_state(
                tmp_path / "naive-study",
                study_id="naive-study",
                rates=[0.4, None, 0.7, 0.9],
            )
            spec_path = tmp_path / "report.json"
            spec_path.write_text(
                json.dumps(
                    {
                        "report_id": "report-1",
                        "output_root": str(tmp_path / "out"),
                        "target_fraction": 0.8,
                        "cases": [
                            {
                                "case_id": "case-1",
                                "tags": ["model-a", "chat"],
                                "budgets": [1, 2, 4],
                                "arms": [
                                    {
                                        "name": "harness",
                                        "kind": "harness",
                                        "study_root": str(tmp_path / "studies"),
                                    },
                                    {
                                        "name": "naive",
                                        "kind": "naive",
                                        "study_root": str(tmp_path / "naive-study"),
                                    },
                                ],
                            }
                        ],
                    }
                ),
                encoding="utf-8",
            )

            summary = run_tuning_report(spec_path)

            case = summary["cases"][0]
            self.assertEqual(case["reference_best_per_gpu"], 0.9)
            self.assertEqual(case["winners"]["final_best"], "harness")
            self.assertEqual(case["winners"]["fastest_to_target"], "harness")
            # Arms are read back in the order the spec lists them.
            harness = case["arms"][0]
            naive = case["arms"][1]
            self.assertEqual(harness["best_at_budget"]["2"], 0.9)
            self.assertEqual(naive["best_at_budget"]["2"], 0.4)
            self.assertEqual(case["target_fraction"], 0.8)
            self.assertEqual(harness["trials_to_target"], 2)
            self.assertEqual(naive["trials_to_target"], 4)
            self.assertEqual(naive["failed_count"], 1)
            comparison = case["harness_vs_naive"][0]
            self.assertTrue(comparison["passes"])
            self.assertEqual(comparison["target_trial_speedup_vs_best_naive"], 2.0)
            self.assertTrue((tmp_path / "out" / "summary.json").exists())
            self.assertTrue((tmp_path / "out" / "report.md").exists())
|
||||||
|
|
||||||
|
|
||||||
|
# Run this module's tests when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
Reference in New Issue
Block a user