Compare commits
23 Commits
816765071f
...
c245774d76
| Author | SHA1 | Date | |
|---|---|---|---|
| c245774d76 | |||
| d85572e7b5 | |||
| c0a9235b80 | |||
| c4173b2b3b | |||
| 6d874ecbff | |||
| 403ae2e2b7 | |||
| 861d754f29 | |||
| 76ec19224c | |||
| e67bc86240 | |||
| fd94ab9f3b | |||
| 4607711bb5 | |||
| d23b69219b | |||
| 488fae7e63 | |||
| 426151bc9f | |||
| a9d237bbfd | |||
| 5257fbc1a2 | |||
| b3156a382a | |||
| 76cca89a43 | |||
| 83162e7a64 | |||
| a3523f5601 | |||
| 95c02d7dd9 | |||
| a1b804f879 | |||
| 0c23285f39 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,6 +4,7 @@
|
||||
.aituner-tight/
|
||||
.aituner-prefill/
|
||||
.aituner-compare/
|
||||
.aituner-run-configs/
|
||||
.env
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
||||
@@ -6,6 +6,10 @@
|
||||
- Hardware expectation: 8 NVIDIA H20 GPUs.
|
||||
- SSH check: use `ssh dash0` before scheduling or debugging remote runs.
|
||||
- Remote project path: `/home/admin/cpfs/wjh/aituner/aituner`.
|
||||
- If remote downloads are slow or fail, start the proxy from the remote `wjh`
|
||||
home directory with `./auto_proxy.sh`, then run downloads in a shell where
|
||||
`proxyOn` from `~/.bashrc` has been applied. If `autossh` is unavailable,
|
||||
`ssh -Nf proxy` provides the same local `127.0.0.1:11235` tunnel.
|
||||
|
||||
## Local/remote sync workflow
|
||||
|
||||
|
||||
@@ -130,9 +130,9 @@
|
||||
"min_input_tokens": 0,
|
||||
"max_input_tokens": 8192
|
||||
},
|
||||
"replay_time_scale": 0.5,
|
||||
"replay_time_scale": 0.8775,
|
||||
"early_stop_max_lag_s": 45.0,
|
||||
"early_stop_max_elapsed_s": 320.0,
|
||||
"early_stop_max_elapsed_s": 1000.0,
|
||||
"adaptive_stop": {
|
||||
"enabled": true,
|
||||
"tau": 0.9,
|
||||
@@ -141,8 +141,7 @@
|
||||
"max_checks": 20,
|
||||
"min_fraction": 0.1,
|
||||
"boundary_delta": 0.02
|
||||
},
|
||||
"completion_tokens_override": 128
|
||||
}
|
||||
},
|
||||
"slo": {
|
||||
"target_pass_rate": 0.95,
|
||||
@@ -158,7 +157,7 @@
|
||||
},
|
||||
"search": {
|
||||
"low": 0.0,
|
||||
"high": 0.25,
|
||||
"high": 0.15,
|
||||
"tolerance": 0.001,
|
||||
"max_probes": 6,
|
||||
"sample_seed": 20260325,
|
||||
@@ -169,7 +168,9 @@
|
||||
"max_history_trials": 8,
|
||||
"endpoint": {
|
||||
"provider": "codex",
|
||||
"model": "gpt-5.4",
|
||||
"model": "gpt-5.5",
|
||||
"base_url": "https://ai.gahow.org/v1",
|
||||
"wire_api": "chat.completions",
|
||||
"stream": true,
|
||||
"api_key_env": "OPENAI_API_KEY",
|
||||
"timeout_s": 180
|
||||
|
||||
@@ -130,9 +130,9 @@
|
||||
"min_input_tokens": 0,
|
||||
"max_input_tokens": 8192
|
||||
},
|
||||
"replay_time_scale": 0.5,
|
||||
"replay_time_scale": 0.8775,
|
||||
"early_stop_max_lag_s": 45.0,
|
||||
"early_stop_max_elapsed_s": 320.0,
|
||||
"early_stop_max_elapsed_s": 1000.0,
|
||||
"adaptive_stop": {
|
||||
"enabled": true,
|
||||
"tau": 0.9,
|
||||
@@ -141,8 +141,7 @@
|
||||
"max_checks": 20,
|
||||
"min_fraction": 0.1,
|
||||
"boundary_delta": 0.02
|
||||
},
|
||||
"completion_tokens_override": 128
|
||||
}
|
||||
},
|
||||
"slo": {
|
||||
"target_pass_rate": 0.95,
|
||||
@@ -158,7 +157,7 @@
|
||||
},
|
||||
"search": {
|
||||
"low": 0.0,
|
||||
"high": 0.25,
|
||||
"high": 0.15,
|
||||
"tolerance": 0.001,
|
||||
"max_probes": 6,
|
||||
"sample_seed": 20260325,
|
||||
@@ -169,7 +168,9 @@
|
||||
"max_history_trials": 8,
|
||||
"endpoint": {
|
||||
"provider": "codex",
|
||||
"model": "gpt-5.4",
|
||||
"model": "gpt-5.5",
|
||||
"base_url": "https://ai.gahow.org/v1",
|
||||
"wire_api": "chat.completions",
|
||||
"stream": true,
|
||||
"api_key_env": "OPENAI_API_KEY",
|
||||
"timeout_s": 180
|
||||
|
||||
26
configs/examples/tuning_report.example.json
Normal file
26
configs/examples/tuning_report.example.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"report_id": "qwen27b-abl12-harness-vs-naive",
|
||||
"output_root": "../../.aituner-reports/qwen27b-abl12-harness-vs-naive",
|
||||
"target_fraction": 0.95,
|
||||
"min_final_ratio": 0.98,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "qwen27b-chat-0-8k-real-output",
|
||||
"description": "12-trial harness-vs-naive ablation on the 0-8k chat window with real output lengths.",
|
||||
"tags": ["qwen27b", "chat", "0-8k", "h20", "real-output"],
|
||||
"budgets": [1, 2, 3, 4, 6, 8, 12],
|
||||
"arms": [
|
||||
{
|
||||
"name": "harness",
|
||||
"kind": "harness",
|
||||
"study_root": "../../.aituner/abl12-harness/dash0-qwen27b-ablation-harness-on"
|
||||
},
|
||||
{
|
||||
"name": "naive",
|
||||
"kind": "naive",
|
||||
"study_root": "../../.aituner/abl12-naive/dash0-qwen27b-ablation-naive-off"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
215
docs/aituner-roadmap.md
Normal file
215
docs/aituner-roadmap.md
Normal file
@@ -0,0 +1,215 @@
|
||||
# AITuner roadmap
|
||||
|
||||
本文只维护最小 roadmap:paper framing、claim 树、已有证据、最高优先级实验。
|
||||
详细实验流水账放到对应专题文档里。
|
||||
|
||||
## Paper thesis
|
||||
|
||||
AITuner 的核心不是“用 LLM 调参”。更准确的 framing 是:
|
||||
|
||||
```text
|
||||
black-box knob optimization
|
||||
-> grey-box / mechanism-guided experimental optimization
|
||||
```
|
||||
|
||||
也就是说,AITuner 仍然通过真实实验测量目标函数,但它不再把 serving engine 当成
|
||||
完全黑盒的 `config vector -> scalar score`。Harness 将 workload、SLO failure、
|
||||
probe trace、topology constraints 和 failure memory 转换成结构化的 serving
|
||||
mechanism state,并把下一步搜索限制在可解释、可验证的 intervention 上。
|
||||
|
||||
因此 LLM 不是不可替代的核心。LLM 是 planner backend / copilot;核心系统贡献是
|
||||
planner-agnostic 的 tuning substrate:
|
||||
|
||||
```text
|
||||
Harness H = (O, R, G, V, M)
|
||||
|
||||
O: observation schema
|
||||
workload L/C/A profile + probe trace + latency/SLO failure + launch status
|
||||
|
||||
R: regime attribution
|
||||
SLO violation -> prefill-bound / decode-bound / admission-bound / memory-bound / launch-bound
|
||||
|
||||
G: serving intervention grammar
|
||||
regime -> legal intervention families, not raw arbitrary knobs
|
||||
|
||||
V: validator
|
||||
tunable schema + topology constraints + no-repeat + failure memory + stop authority
|
||||
|
||||
M: measurement/scoring protocol
|
||||
SLO-constrained feasible frontier, req/s/GPU, latency quantiles, pass-rate guard
|
||||
```
|
||||
|
||||
Planner 是可替换的:
|
||||
|
||||
```text
|
||||
pi in {LLM, BO, bandit, deterministic heuristic, tree search}
|
||||
```
|
||||
|
||||
AITuner 的强 claim 应该是:同一个 planner 放在 harness-shaped space 里,比放在
|
||||
raw knob space 里更快、更稳、更接近最优;弱模型或非 LLM planner 也能从这个 substrate
|
||||
中获益。
|
||||
|
||||
## Why not pure white-box
|
||||
|
||||
我们不应 claim 完整 white-box optimization。AITuner 没有解析 vLLM scheduler、
|
||||
kernel、KV cache、通信和排队的闭式性能模型。更稳妥也更强的表述是 grey-box:
|
||||
|
||||
- objective 仍然由真实测量决定;
|
||||
- action space、constraints、failure attribution 和 intervention semantics 是系统知识驱动;
|
||||
- 每个 trial 是一个 counterfactual experiment,而不是盲目采样一个 knob vector。
|
||||
|
||||
## 关键设计点
|
||||
|
||||
| 设计点 | 更强表述 | 作用 | 需要证明 |
|
||||
| --- | --- | --- | --- |
|
||||
| Observation | mechanism state | 将 workload shape、probe trace、SLO failure、launch/memory failure 结构化 | agent 看到的是可计算状态,不是自然语言日志 |
|
||||
| Bottleneck classifier | SLO violation attribution | 把失败归因到 serving regime,而不是只看哪个指标超阈值 | attribution 和后续有效 intervention 有因果关联 |
|
||||
| Candidate family | serving intervention grammar | 把 raw knobs 提升为 topology / batching / admission / memory interventions | 搜索空间被压缩,但不写死某个 case |
|
||||
| Scoring | counterfactual verdict | 用 SLO frontier 和 req/s/GPU 判断 intervention 是否支持假设 | 最终 winner 由测量决定,不由 LLM 决定 |
|
||||
| Validator / stop | fail-safe control | 禁止非法、重复、已知失败 family;只有 validator 授权 stop | 错误 attribution 最多浪费 trial,不污染 incumbent |
|
||||
|
||||
## Claim roadmap
|
||||
|
||||
| Claim | 当前状态 | 证据文档 | 关键缺口 |
|
||||
| --- | --- | --- | --- |
|
||||
| C1. Harness 将 raw knob search 转成 mechanism-guided intervention search,提升固定预算优化效果 | 已有强信号 | [Qwen27B 2x2](harness-ablation/qwen27b-tight-2x2-model-ablation-20260623.md), [Qwen30B SLO robustness](harness-ablation/qwen30b-slo-robustness-20260624.md) | 补 Qwen235B decode 2x2 aggregate;补 mechanism ablation |
|
||||
| C2. 收益来自 harness-defined substrate,不依赖某个强 LLM | 部分已有 | [Qwen27B 2x2](harness-ablation/qwen27b-tight-2x2-model-ablation-20260623.md) | 做 `BO/heuristic + harness` vs `BO/heuristic + raw knobs` |
|
||||
| C3. Weak planner + harness 可以匹配或超过 strong LLM naive | Qwen27B 已支持;Qwen235B 正在补 | [Qwen27B 2x2](harness-ablation/qwen27b-tight-2x2-model-ablation-20260623.md), [Qwen235B prefill progress](harness-ablation/qwen235b-prefill-2x2-progress-20260623.md) | 完成 Qwen235B decode 2x2;更新 prefill final doc |
|
||||
| C4. Attribution 和 intervention grammar 有机制贡献,不只是 prompt 信息更多 | 设计已有,严格证据不足 | [AITuner summary](aituner-harness-summary.md) | 做 shuffled attribution / no attribution / no grammar / no topology-first / no validator ablation |
|
||||
| C5. AITuner 找到 near-optimal region,而不是只找到一个可行 config | Qwen30B 有解释性信号 | [Qwen30B SLO robustness](harness-ablation/qwen30b-slo-robustness-20260624.md) | 选 1-2 个 case 做局部 grid 或专家配置对照 |
|
||||
| C6. AITuner 能随 SLO tightness 移动到合适 frontier | Qwen30B 已完成 | [Qwen30B SLO robustness](harness-ablation/qwen30b-slo-robustness-20260624.md) | 再选一个非同质 case 做 SLO sweep;同时画 SLO tightness -> frontier/regime transition |
|
||||
| C7. Engine adapter 让 intervention grammar 可迁移到其他 serving engine | 设计上可行,暂不作为主实验 claim | `EngineLaunchSpec` / launch recipe / tunable schema | vLLM 主线完成后,再做 SGLang adapter 和一个低成本验证 case |
|
||||
|
||||
## 最高优先级实验
|
||||
|
||||
### P0. 完成 Qwen235B decode 2x2 并整理 aggregate
|
||||
|
||||
目的:补齐最核心的 `harness on/off x strong/weak planner` 证据,回答:
|
||||
|
||||
```text
|
||||
weak LLM + harness >= strong LLM naive ?
|
||||
```
|
||||
|
||||
预期产出:
|
||||
|
||||
- 2x2 表格:每个 arm 在相同 iter budget 下的 best-so-far req/s/GPU;
|
||||
- convergence curve / normalized AUC;
|
||||
- 每个 arm 的 trial path 和主要 config patches;
|
||||
- 解释 naive 为什么走错,harness 如何通过 regime attribution 走到正确 intervention。
|
||||
|
||||
优先级原因:实验已经在跑,增量成本最低,而且直接支撑 C1/C3。
|
||||
|
||||
### P1. Planner-agnostic substrate 实验
|
||||
|
||||
目的:证明 AITuner 不是 LLM tuner,而是 harness-defined optimization substrate。
|
||||
|
||||
最小实验矩阵:
|
||||
|
||||
| Planner | Raw knob space | Harness intervention space |
|
||||
| --- | --- | --- |
|
||||
| deterministic heuristic | raw heuristic | harness policy |
|
||||
| BO 或 lightweight bandit | raw BO | harness-guided BO |
|
||||
| weak LLM | naive weak LLM | weak LLM + harness |
|
||||
| strong LLM | naive strong LLM | strong LLM + harness |
|
||||
|
||||
如果 BO 实现成本高,先用 deterministic harness policy 做 non-LLM planner baseline:
|
||||
它已经能证明“没有 LLM 也能 work”。随后再补 BO,使论证更强。
|
||||
|
||||
预期图:
|
||||
|
||||
- x-axis: trial budget;
|
||||
- y-axis: best-so-far SLO-constrained req/s/GPU;
|
||||
- line groups: raw knob space vs harness intervention space;
|
||||
- 单独 bar:invalid launch rate、repeated config rate、wasted trial rate。
|
||||
|
||||
优先级原因:这是新 framing 的关键实验。没有它,paper 仍然容易被读成“LLM prompt
|
||||
engineering”。
|
||||
|
||||
### P2. Mechanism ablation
|
||||
|
||||
目的:证明 harness 内部不是普通信息堆叠,而是 attribution、intervention grammar、
|
||||
validator 分别贡献有效机制。
|
||||
|
||||
建议 ablation:
|
||||
|
||||
| Variant | 删除/破坏什么 | 预期证明 |
|
||||
| --- | --- | --- |
|
||||
| full AITuner | 无 | 最好 |
|
||||
| no attribution | 不提供 regime attribution,只给 scalar score 和历史结果 | attribution 对方向选择有贡献 |
|
||||
| shuffled attribution | 故意打乱 regime label,但保留文本长度 | 收益来自语义正确性,不是更多 prompt tokens |
|
||||
| no intervention grammar | 允许任意 tunable knobs,移除 family guidance | action-space shaping 有贡献 |
|
||||
| no topology-first | runtime knobs 可以优先于 topology intervention | topology 是 LLM serving 的一阶决策 |
|
||||
| no validator/failure memory | 允许重复、已知 launch failure family | fail-safe control 减少 GPU burn |
|
||||
|
||||
预期图:
|
||||
|
||||
- mechanism ablation bar:final best、AUC、TTT(trials to target);
|
||||
- waste breakdown:invalid launch、repeat config、wrong-family trial;
|
||||
- case study trace:每个 variant 前 3-5 个 proposal 对比。
|
||||
|
||||
优先级原因:这是回应 novelty 质疑的核心证据。
|
||||
|
||||
### P3. Near-optimum / expert baseline 证据
|
||||
|
||||
目的:证明 AITuner 不是只找到“能收敛但性能差”的 config。
|
||||
|
||||
优先选择一个成本可控 case 做局部 grid:
|
||||
|
||||
```text
|
||||
topology: TP/DP frontier
|
||||
runtime: max-num-seqs, max-num-batched-tokens, gpu-memory-utilization 的小邻域
|
||||
objective: max feasible req/s/GPU under pass_rate >= 0.95
|
||||
```
|
||||
|
||||
预期图:
|
||||
|
||||
- local grid heatmap;
|
||||
- AITuner trial path overlay;
|
||||
- AITuner best vs grid best vs expert config;
|
||||
- near-optimum gap,例如 `AITuner >= 95% of local grid optimum`。
|
||||
|
||||
优先级原因:这是 claim “tune 出最好的 config,而不是差的收敛 config” 的必要证据。
|
||||
|
||||
### P4. 第二个 SLO robustness case
|
||||
|
||||
目的:证明 Qwen30B 的 SLO robustness 不是单 case 现象。
|
||||
|
||||
不要先大规模铺 sweep。先选一个和 Qwen30B 机制不同的 case:
|
||||
|
||||
- 一个 decode-heavy case,观察 TP/DP redistribution 和 concurrency/memory intervention;
|
||||
- 或一个 long-prefill / tight-TTFT case,观察 TP 和 prefill batching intervention。
|
||||
|
||||
预期图:
|
||||
|
||||
- x-axis: SLO tightness;
|
||||
- y-axis: best feasible req/s/GPU;
|
||||
- marker/color: selected intervention regime;
|
||||
- annotation: final TP/DP/MNS/MBT;
|
||||
- 展示 SLO 放宽时 frontier 的 right shift 或 regime transition。
|
||||
|
||||
优先级原因:重要,但应排在 planner-agnostic 和 mechanism ablation 之后。
|
||||
|
||||
### P5. SGLang / multi-engine adapter validation
|
||||
|
||||
目的:证明 intervention grammar 可以通过 adapter lowering 到不同 serving engine。
|
||||
|
||||
当前暂缓:在 vLLM 主线证据完成之前,不作为高优先级实验。等 C1-C5 稳定后再做一个低成本 case:
|
||||
|
||||
```text
|
||||
same workload profile
|
||||
same SLO objective
|
||||
same intervention grammar
|
||||
different engine adapter
|
||||
```
|
||||
|
||||
优先级原因:它能扩展 generality,但不能替代 vLLM 主线的机制证明。
|
||||
|
||||
## 暂不做
|
||||
|
||||
- 暂不把主 claim 写成“LLM 比 BO 更聪明”。新 claim 是 harness substrate 对多种 planner
|
||||
都有用。
|
||||
- 暂不 claim full white-box 或全局最优。当前更稳妥的是 grey-box、near-optimum、
|
||||
fixed-budget utility。
|
||||
- 暂不横向铺大量 SLO sweep。先补机制 ablation、planner-agnostic 和 near-optimum。
|
||||
- 暂不把 multi-engine support 放进主实验 claim。先写成 adapter-based design,等 vLLM
|
||||
证据链完整后再补一个 SGLang validation。
|
||||
138
docs/harness-ablation/qwen235b-prefill-2x2-progress-20260623.md
Normal file
138
docs/harness-ablation/qwen235b-prefill-2x2-progress-20260623.md
Normal file
@@ -0,0 +1,138 @@
|
||||
# Qwen235B prefill 2x2 progress - 2026-06-23
|
||||
|
||||
Snapshot: 2026-06-23 18:24 CST / 10:24 UTC.
|
||||
|
||||
本文整理当前 dash1/dash2/dash3 上的 Qwen235B prefill 2x2 实验进度。这个
|
||||
case 仍在跑 strong-model arm,因此本文是 progress report,不是最终 aggregate
|
||||
结论。
|
||||
|
||||
## 当前远端状态
|
||||
|
||||
| Host | 当前状态 | 说明 |
|
||||
| --- | --- | --- |
|
||||
| dash1 | running | `aituner-q235b-2x2-gpt55-20260623T010038Z` 仍在跑,当前是 `gpt-5.5 + naive` 的 trial-0004;8 张 H20 被 vLLM 占用。 |
|
||||
| dash2 | idle | 没有 tmux/GPU 任务;最近完成的是 `qwen235b-prefill-jointprobe-harness-dash2-20260622T132010Z` harness-only 验证。 |
|
||||
| dash3 | idle | 没有 tmux/GPU 任务;`gpt-5.4-mini` 2x2 arm 已完成并生成 report。 |
|
||||
|
||||
注意:三台机器共享 `/home/admin/cpfs/wjh/aituner/aituner`,所以 `.aituner` 和
|
||||
`.aituner-reports` 在不同 dash 节点上看到的是同一批产物。
|
||||
|
||||
## 已完成:gpt-5.4-mini 2x2 arm
|
||||
|
||||
Report:
|
||||
|
||||
```text
|
||||
.aituner-reports/qwen235b-prefill-2x2-gpt54mini-dash3-20260623T010038Z/report.md
|
||||
```
|
||||
|
||||
Aggregate:
|
||||
|
||||
| Arm | Kind | Trials | Final req/s/GPU | Final/ref | TTT (trials to target) | AUC | Failed | No feasible |
|
||||
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||
| `harness` | harness | 8 | 0.3217 | 1.0000 | 3 | 0.9483 | 0 | 1 |
|
||||
| `naive` | naive | 8 | - | - | - | 0.0000 | 2 | 8 |
|
||||
|
||||
Interpretation:
|
||||
|
||||
- `gpt-5.4-mini + harness` 找到了 `0.3217 req/s/GPU`,达到该 report 的
|
||||
reference best。
|
||||
- `gpt-5.4-mini + naive` 8 个 trials 都没有找到 feasible config,其中 2 个是
|
||||
engine launch failure。
|
||||
- Report 中 `Harness-vs-naive pass/checks: 0/1` 是 aggregator 对
|
||||
`best_naive_final_per_gpu = null` 的保守处理:因为 naive 没有 feasible best,
|
||||
final ratio 无法计算,所以 pass 记为 false。就实际 tuning 结果而言,这个 arm
|
||||
是 harness dominates naive。
|
||||
|
||||
Harness trajectory:
|
||||
|
||||
| Trial | Patch | req/s/GPU | Pass rate | 说明 |
|
||||
| ---: | --- | ---: | ---: | --- |
|
||||
| 1 | `TP=8, DP=1` | 0.2879 | 0.9522 | 初始 topology 满足 SLO,但未达到最终 best。 |
|
||||
| 2 | `TP=8, max-num-seqs=96` | 0.2879 | 0.9537 | 单独调 `max-num-seqs` 无明显提升。 |
|
||||
| 3 | `TP=8, max-num-batched-tokens=16384, max-num-seqs=96` | 0.3085 | 0.9568 | joint runtime probe 提升。 |
|
||||
| 4 | `TP=8, max-num-seqs=144, max-num-batched-tokens=32768` | 0.2879 | 0.9530 | 过大的 batching/seq 组合回退。 |
|
||||
| 5 | `TP=4, DP=2` | - | - | 无 feasible best,说明 DP-heavy/mixed topology 不解决该 prefill path。 |
|
||||
| 6 | `TP=8, max-num-seqs=96, max-num-batched-tokens=24576` | 0.2708 | 0.9523 | batching 进一步增大后回退。 |
|
||||
| 7 | `TP=4, DP=1, max-num-seqs=96, max-num-batched-tokens=16384` | 0.2338 | 0.9590 | 少用 GPU 的 TP4/DP1 per-GPU 不占优。 |
|
||||
| 8 | `TP=8, DP=1, max-num-seqs=128, max-num-batched-tokens=16384` | 0.3217 | 0.9508 | 当前 best。 |
|
||||
|
||||
这个结果说明:在 Qwen235B prefill case 上,harness 的价值不只是 topology
|
||||
选择,还包括在 TTFT/prefill 方向下做受约束的 runtime joint probe。最终 best 是
|
||||
`TP=8, DP=1, max-num-seqs=128, max-num-batched-tokens=16384`。
|
||||
|
||||
## 正在运行:gpt-5.5 2x2 arm
|
||||
|
||||
Session:
|
||||
|
||||
```text
|
||||
tmux: aituner-q235b-2x2-gpt55-20260623T010038Z
|
||||
driver log: .aituner/qwen235b-prefill-2x2-gpt55-dash1-20260623T010038Z.driver.log
|
||||
```
|
||||
|
||||
Driver timeline:
|
||||
|
||||
```text
|
||||
harness clean pair start 2026-06-23T01:00:40+00:00
|
||||
harness clean pair done 2026-06-23T08:21:13+00:00
|
||||
naive clean pair start 2026-06-23T08:21:13+00:00
|
||||
```
|
||||
|
||||
Harness side has completed all 8 trials:
|
||||
|
||||
| Trial | Patch | req/s/GPU | Pass rate |
|
||||
| ---: | --- | ---: | ---: |
|
||||
| 1 | `TP=8, DP=1` | 0.2879 | 0.9522 |
|
||||
| 2 | `TP=8, max-num-seqs=96` | 0.2879 | 0.9530 |
|
||||
| 3 | `TP=8, max-num-batched-tokens=16384, max-num-seqs=96` | 0.3085 | 0.9561 |
|
||||
| 4 | `TP=8, max-num-batched-tokens=32768, max-num-seqs=144` | 0.2783 | 0.9543 |
|
||||
| 5 | `TP=8, DP=1, max-num-batched-tokens=24576, max-num-seqs=96` | 0.2654 | 0.9513 |
|
||||
| 6 | `TP=4, DP=2, max-num-batched-tokens=16384, max-num-seqs=96` | - | - |
|
||||
| 7 | `TP=8, DP=1, max-num-batched-tokens=16384, max-num-seqs=80` | 0.3156 | 0.9505 |
|
||||
| 8 | `TP=8, max-num-batched-tokens=32768, max-num-seqs=120` | 0.2879 | 0.9508 |
|
||||
|
||||
Current harness best: `trial-0007`, `0.3156 req/s/GPU`.
|
||||
|
||||
Naive side is still running. Current state:
|
||||
|
||||
- Completed/recorded through trial-0003, with current best `0.2879 req/s/GPU`.
|
||||
- trial-0004 is active with `TP=8, DP=1, max-num-batched-tokens=8192,
|
||||
max-num-seqs=128`.
|
||||
- trial-0004 probe history so far:
|
||||
|
||||
| threshold | request rate | req/s/GPU | pass rate | feasible | main failures |
|
||||
| ---: | ---: | ---: | ---: | --- | --- |
|
||||
| 0.0625 | 1.5750 | 0.1969 | 0.9651 | true | TTFT misses and TTFT threshold violations |
|
||||
| 0.09375 | 2.3650 | 0.2956 | 0.7308 | false | `slo_pass_rate_unrecoverable`, TTFT violations |
|
||||
| 0.078125 | 1.9567 | 0.2446 | 0.9591 | true | TTFT misses and TTFT threshold violations |
|
||||
| 0.0859375 | 2.1667 | 0.2708 | 0.9546 | true | TTFT misses and TTFT threshold violations |
|
||||
|
||||
As of the snapshot, vLLM is still processing requests for trial-0004, so the naive
|
||||
side has not produced its final result or report yet.
|
||||
|
||||
## Prior Qwen235B context
|
||||
|
||||
These earlier runs explain why the current 2x2 matters:
|
||||
|
||||
| Run | Result | What it showed |
|
||||
| --- | --- | --- |
|
||||
| `qwen235b-prefill-clean-gpt55-dash1-20260621T160712Z` | harness 0.2879, naive 0.3217 | Earlier harness stopped too early or refined too weakly; naive found a better final config. |
|
||||
| `qwen235b-prefill-seqguard-gpt55-dash1-20260622T064445Z` | harness 0.2879, naive 0.2577 | Seq guard prevented the worst early-stop failure but still did not reach the old naive best. |
|
||||
| `qwen235b-prefill-jointprobe-harness-dash2-20260622T132010Z` | harness-only 0.3085 | Joint `max-num-batched-tokens + max-num-seqs` probe improved over seqguard. |
|
||||
| `qwen235b-prefill-2x2-gpt54mini-dash3-20260623T010038Z` | harness 0.3217, naive no feasible | Weak model plus harness now reaches the old best and dominates weak naive. |
|
||||
|
||||
The current evidence points to the harness needing both:
|
||||
|
||||
1. topology discipline: stay on `TP=8, DP=1` for this prefill-heavy 235B setup;
|
||||
2. runtime joint probing: tune `max-num-batched-tokens` and `max-num-seqs` together
|
||||
instead of stopping after the first feasible TP8 result.
|
||||
|
||||
## Open item
|
||||
|
||||
The final Qwen235B 2x2 conclusion is blocked on the still-running
|
||||
`gpt-5.5 + naive` arm on dash1. Once it completes, generate an aggregate report
|
||||
combining:
|
||||
|
||||
- `qwen235b-prefill-2x2-gpt55-dash1-20260623T010038Z`
|
||||
- `qwen235b-prefill-2x2-gpt54mini-dash3-20260623T010038Z`
|
||||
|
||||
and then turn this progress report into a final ablation report.
|
||||
@@ -0,0 +1,366 @@
|
||||
# Qwen27B tight-SLO 2x2 harness ablation - 2026-06-23
|
||||
|
||||
本文整理以下 aggregate report,并解释 harness 为什么能够让 tuning 更快、更有效:
|
||||
|
||||
```text
|
||||
.aituner-reports/qwen27b-tight-2x2-aggregate-20260623T005838Z/report.md
|
||||
```
|
||||
|
||||
这个实验是一个 2x2 ablation:模型强弱和是否启用 `use_harness` 交叉。
|
||||
核心问题是:harness 是否提供了可复用的搜索结构,而不仅仅是更强 LLM
|
||||
或者更长 prompt 带来的偶然收益。
|
||||
|
||||
## 实验设计
|
||||
|
||||
Case: `qwen27b-tight-slo-2x2-aggregate`。
|
||||
|
||||
实验基座:
|
||||
|
||||
- Served model: `qwen3.5-27b-256k-0223-internal`。
|
||||
- Hardware: H20,最多 8 GPUs。
|
||||
- Trace: `chat_w20260311_1000`,输入长度过滤到 0-8192 tokens,
|
||||
`replay_time_scale=1.0`,`max_concurrency=32`。
|
||||
- SLO: pass rate >= 0.95;TTFT step rule 为 <=4096 input tokens 时 2s,
|
||||
<=32768 input tokens 时 4s,更长输入时 6s;TPOT <= 50 ms。
|
||||
- Search: 在 `sampling_u in [0, 0.0625]` 上二分探测,tolerance 0.001,
|
||||
max 6 probes。
|
||||
- Tunable envs: `VLLM_ENABLE_TORCH_COMPILE`。
|
||||
- Tunable flags: `tensor-parallel-size`, `data-parallel-size`,
|
||||
`expert-parallel-size`, `gpu-memory-utilization`, `block-size`,
|
||||
`max-num-batched-tokens`, `max-num-seqs`, `enable-prefix-caching`,
|
||||
`enable-chunked-prefill`。
|
||||
- Topology constraints: TP 和 DP 均在 `{1,2,4,8}` 中,允许的 TP*DP product 为
|
||||
`{1,2,4,8}`,本 case 中 EP 固定为 1。
|
||||
|
||||
2x2 arms:
|
||||
|
||||
| Arm | Tuner model | Harness | Trial budget used |
|
||||
| --- | --- | --- | ---: |
|
||||
| `gpt55_harness` | `gpt-5.5` | on | 2 |
|
||||
| `gpt55_naive` | `gpt-5.5` | off | 10 |
|
||||
| `gpt54mini_harness` | `gpt-5.4-mini` | on | 2 |
|
||||
| `gpt54mini_naive` | `gpt-5.4-mini` | off | 10 |
|
||||
|
||||
同一个 tuner model 内,主要差异是 `use_harness`。跨模型比较则用来判断:
|
||||
更弱模型加 harness 是否能匹配或超过更强模型的 naive tuning。
|
||||
|
||||
## Aggregate result
|
||||
|
||||
Reference best: `0.4429 req/s/GPU`。
|
||||
Convergence target: reference 的 95%,即 `0.4208 req/s/GPU`。
|
||||
|
||||
| Arm | Kind | Trials | Final req/s/GPU | Final/ref | Trials to target | Normalized AUC | Failed | No feasible |
|
||||
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||
| `gpt55_harness` | harness | 2 | 0.4429 | 1.0000 | 2 | 0.9484 | 0 | 0 |
|
||||
| `gpt55_naive` | naive | 10 | 0.0273 | 0.0616 | - | 0.0588 | 2 | 2 |
|
||||
| `gpt54mini_harness` | harness | 2 | 0.4429 | 1.0000 | 2 | 0.9450 | 0 | 0 |
|
||||
| `gpt54mini_naive` | naive | 10 | 0.0231 | 0.0522 | - | 0.0498 | 1 | 1 |
|
||||
|
||||
Harness-vs-naive 检查全部通过:
|
||||
|
||||
| Harness arm | Final vs best naive | AUC vs best naive | Pass |
|
||||
| --- | ---: | ---: | --- |
|
||||
| `gpt55_harness` | 16.2290x | 16.1296x | true |
|
||||
| `gpt54mini_harness` | 16.2290x | 16.0720x | true |
|
||||
|
||||
最关键的 ablation 信号是:`gpt-5.4-mini + harness` 和
|
||||
`gpt-5.5 + harness` 达到同一个 final throughput,也都是 2 trials 达到 target;
|
||||
而两个 naive arms 用满 10 trials 后,final req/s/GPU 仍不到 harness arms 的 1/16。
|
||||
|
||||
## Agent loop 流程图
|
||||
|
||||
下面是当前 harness 化 agent loop 的抽象流程。LLM 仍然可以参与 proposal,
|
||||
但它拿到的不是裸文本历史,而是结构化 observation、bottleneck diagnosis、
|
||||
candidate actions 和 validator 约束;同时 validator 可以授权 stop,也可以阻止
|
||||
重复失败或不合法配置。
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Study spec: trace, SLO, search range, tunable knobs] --> B[Run one engine config]
|
||||
B --> C[Binary-search probes over sampling_u]
|
||||
C --> D[Build observation o_t]
|
||||
D --> E[Bottleneck classifier]
|
||||
E --> F[Candidate family generator]
|
||||
F --> G[Score candidate actions]
|
||||
G --> H[Prompt renderer / planner]
|
||||
H --> I[LLM or deterministic harness proposal]
|
||||
I --> J{Config validator}
|
||||
J -- invalid, repeated, unsafe --> F
|
||||
J -- valid config_patch --> B
|
||||
G --> K{Stop validator}
|
||||
K -- search_high_saturated_by_incumbent --> L[Stop and keep incumbent]
|
||||
K -- useful candidates remain --> H
|
||||
```
|
||||
|
||||
这个 loop 中,harness 的作用不是把 prompt 写得更漂亮,而是把 tuning 变成
|
||||
一个受测量约束的决策过程:
|
||||
|
||||
```text
|
||||
measurement -> diagnosis -> candidate family -> scored action -> validated proposal/stop
|
||||
```
|
||||
|
||||
## 形式化设计:observation
|
||||
|
||||
每个 trial 结束后,AITuner 不只记录一段自然语言总结,而是形成结构化 observation:
|
||||
|
||||
```text
|
||||
o_t = (
|
||||
config_t,
|
||||
probe_history_t,
|
||||
pass_rate_t,
|
||||
latency/SLO_failure_profile_t,
|
||||
request_rate_t,
|
||||
parallel_size_t,
|
||||
launch_status_t,
|
||||
prior_failures_t,
|
||||
incumbent_t
|
||||
)
|
||||
```
|
||||
|
||||
本实验里 observation 中最重要的字段是:
|
||||
|
||||
- `config_t`: 当前 trial 的 `flag_patch` 和 `env_patch`,例如 `TP=2, DP=1`。
|
||||
- `probe_history_t`: 在不同 `sampling_u` 下二分探测得到的 feasible/infeasible
|
||||
结果。
|
||||
- `pass_rate_t`: 是否满足 target pass rate 0.95。
|
||||
- `latency/SLO_failure_profile_t`: TTFT 和 TPOT 哪个先触发 SLO pressure。
|
||||
- `request_rate_t`: 当前配置在 SLO 下能承载的 request rate。
|
||||
- `parallel_size_t`: 该配置实际使用的并行规模,用于归一化 per-GPU objective。
|
||||
- `prior_failures_t`: 之前哪些配置 launch failed 或 no feasible,避免重复试错。
|
||||
- `incumbent_t`: 当前最优配置及其 `request_rate_per_gpu`。
|
||||
|
||||
目标函数是:
|
||||
|
||||
```text
|
||||
J(config_t) = request_rate_t / parallel_size_t
|
||||
subject to pass_rate_t >= 0.95
|
||||
```
|
||||
|
||||
也就是说,harness 优化的是满足 SLO 后的 `req/s/GPU`,不是 raw throughput,
|
||||
也不是 LLM 主观认为“更强”的配置。
|
||||
|
||||
## 形式化设计:bottleneck classifier
|
||||
|
||||
`bottleneck classifier` 把 observation 映射成 ranked bottleneck hypotheses:
|
||||
|
||||
```text
|
||||
b_t = ranked_bottleneck(o_t)
|
||||
```
|
||||
|
||||
它判断的不是“哪个 knob 看起来常用”,而是“当前 SLO failure 和 latency profile
|
||||
说明哪个系统环节在限制 objective”。
|
||||
|
||||
常见分类包括:
|
||||
|
||||
| Bottleneck | 典型证据 | 倾向 knob family |
|
||||
| --- | --- | --- |
|
||||
| `ttft_prefill` | 长 prompt 下 TTFT 接近或超过 SLO,prefill service time 是瓶颈 | 提高 TP,调整 prefill batching |
|
||||
| `decode_tpot` | TPOT p95/p99 超 SLO,decode token latency 是瓶颈 | 调整 `max-num-seqs`,提高 TP,降低 decode contention |
|
||||
| `admission_queueing` | waiting/arrival lag 增长,服务时间未必单独变差 | 提高 DP,调整 admission/concurrency knobs |
|
||||
| `memory_kv` | KV cache pressure、preemption、OOM、launch failure | 调整 `gpu-memory-utilization`、`block-size`、sequence/token caps |
|
||||
| `topology_comm` | TP 增加降低 latency 但 per-GPU efficiency 下降 | 回退 TP,比较 DP/TP tradeoff |
|
||||
|
||||
本实验里,两个 harness arms 都把 ranked bottleneck 识别为
|
||||
`ttft_prefill`。原因是 workload 有 heavy-tailed long prompts,并且 TTFT SLO 很紧;
|
||||
这意味着单个请求的 prefill service time 是主要限制。DP-only 只能增加 replica,
|
||||
不能缩短一个长 prompt 的 prefill 路径,因此不是第一优先级。
|
||||
|
||||
## 形式化设计:candidate family
|
||||
|
||||
`candidate family generator` 根据 bottleneck 和 topology constraints 生成可比较的
|
||||
action family:
|
||||
|
||||
```text
|
||||
A_t = candidate_knob_families(
|
||||
b_t,
|
||||
topology_constraints,
|
||||
prior_failures_t,
|
||||
incumbent_t
|
||||
)
|
||||
```
|
||||
|
||||
在这个 case 中:
|
||||
|
||||
- `b_t = ttft_prefill`。
|
||||
- 允许的 TP frontier 是 `TP=1 -> TP=2 -> TP=4 -> TP=8`。
|
||||
- 允许的 DP frontier 是 `DP=1,2,4,8`,但 DP-only 不直接缓解单请求 prefill
|
||||
latency。
|
||||
- EP 固定为 1,因此不探索 expert parallel。
|
||||
- 之前没有 failed topology,因此相邻 TP probe launch risk 低。
|
||||
|
||||
所以 harness 选择了:
|
||||
|
||||
```text
|
||||
trial-0001: TP=2, DP=1
|
||||
trial-0002: TP=4, DP=1
|
||||
```
|
||||
|
||||
这不是写死“Qwen27B 应该 TP4”。如果 classifier 输出的是
|
||||
`admission_queueing`,candidate family 会更偏向 DP 或 `max-num-seqs`;如果输出是
|
||||
`memory_kv`,则会更偏向 memory/cache/sequence knobs。
|
||||
|
||||
## 形式化设计:scoring
|
||||
|
||||
每个 candidate action 都按同一个抽象打分:
|
||||
|
||||
```text
|
||||
score(a) = expected_bottleneck_relief(a)
|
||||
+ information_gain(a)
|
||||
+ launch_safety(a)
|
||||
- regression_risk(a)
|
||||
- measurement_cost(a)
|
||||
```
|
||||
|
||||
这些项在本实验里的含义是:
|
||||
|
||||
- `expected_bottleneck_relief`: TP2/TP4 预计能降低 long-prefill compute latency,
|
||||
直接作用于 `ttft_prefill`。
|
||||
- `information_gain`: TP frontier probe 可以区分“需要 compute-latency relief”
|
||||
还是“只是 admission/replica 不够”。
|
||||
- `launch_safety`: TP2/TP4 均满足 topology constraints,没有重复 failed signature。
|
||||
- `regression_risk`: TP 增加会带来通信开销,可能损害 per-GPU efficiency,所以必须用
|
||||
`request_rate_per_gpu` 验证。
|
||||
- `measurement_cost`: 每个 GPU trial 成本高;因此高信息量的 topology probe 优先于
|
||||
多个局部 runtime tweak。
|
||||
|
||||
实际结果验证了这个 scoring:
|
||||
|
||||
| Arm | Trial | Patch | req/s/GPU | Pass rate | 解释 |
|
||||
| --- | ---: | --- | ---: | ---: | --- |
|
||||
| `gpt55_harness` | 1 | `TP=2, DP=1` | 0.2142 | 0.9572 | 相邻 TP probe 已满足 SLO,但仍未饱和 search high。 |
|
||||
| `gpt55_harness` | 2 | `TP=4, DP=1` | 0.4429 | 0.9718 | TP frontier 继续缓解 prefill bottleneck,达到 reference best。 |
|
||||
| `gpt54mini_harness` | 1 | `TP=2, DP=1` | 0.1992 | 0.9707 | 弱模型也选择同一机制路径。 |
|
||||
| `gpt54mini_harness` | 2 | `TP=4, DP=1` | 0.4429 | 0.9727 | 弱模型加 harness 匹配强模型加 harness。 |
|
||||
|
||||
## 形式化设计:validator stop
|
||||
|
||||
Stop 不是 LLM 自己说“我觉得差不多了”。Stop 必须通过 `stop validator`:
|
||||
|
||||
```text
|
||||
stop(o_t, incumbent_t, search_state_t, candidate_set_t) -> true/false
|
||||
```
|
||||
|
||||
本实验里 stop 的记录是:
|
||||
|
||||
```text
|
||||
tuning_stop_reason: harness_stop
|
||||
validator_reason: search_high_saturated_by_incumbent
|
||||
diagnosis: The incumbent's highest measured probe is feasible and is within the
|
||||
configured binary-search resolution of search.high.
|
||||
```
|
||||
|
||||
含义是:
|
||||
|
||||
1. 当前 incumbent 的最高测量 probe 已经 feasible。
|
||||
2. 该 feasible probe 距离 `search.high` 已经在 binary-search tolerance 内。
|
||||
3. 在当前搜索区间和 SLO 约束下,继续花 GPU trial 很难提高 measured objective。
|
||||
4. 因此 validator 授权 stop,并保留当前 incumbent。
|
||||
|
||||
这给 harness 带来了 stop discipline:它既不会因为 LLM 过早自信而随便停,也不会在
|
||||
已经 saturate search high 后继续 burn budget。
|
||||
|
||||
## 实际 tune 了哪些 knobs
|
||||
|
||||
Harness winning path 只改了 topology:
|
||||
|
||||
```text
|
||||
base config + tensor-parallel-size=4, data-parallel-size=1
|
||||
```
|
||||
|
||||
它没有在 winning path 中调 scheduler/cache/memory knobs,因为 `ttft_prefill`
|
||||
bottleneck 下,首要动作是缩短单请求 prefill service time。
|
||||
|
||||
Naive arms 则走了另一个方向:
|
||||
|
||||
| Arm | 所有 trials 使用的 topology | 变化过的 runtime knobs | Best req/s/GPU |
|
||||
| --- | --- | --- | ---: |
|
||||
| `gpt55_naive` | `TP=1, DP=8` | `max-num-batched-tokens`, `max-num-seqs`, `block-size`, `gpu-memory-utilization`, prefix caching, chunked prefill | 0.0273 |
|
||||
| `gpt54mini_naive` | `TP=1, DP=8` | `max-num-batched-tokens`, `max-num-seqs`, `block-size`, `gpu-memory-utilization` | 0.0231 |
|
||||
|
||||
`gpt55_naive` 的第一个 proposal 明确选择 `TP=1, DP=8`,理由是模型能单卡放下,
|
||||
因此 horizontal data parallelism 应该最大化 request rate,而 TP 会带来通信开销。
|
||||
之后 naive proposals 一直保留 DP-heavy topology,只围绕 runtime knobs 搜索。
|
||||
两个 naive arms 合计 20 个 trial slots 都没有进入 TP2/TP4 topology frontier。
|
||||
|
||||
## 为什么比 baseline 更好
|
||||
|
||||
Baseline 失败的原因是优化了错误的因果路径。
|
||||
|
||||
对 `ttft_prefill`-bound workload,关键服务时间是单个请求的 prefill latency。
|
||||
DP-heavy topology 可以增加 replica 数,但每个 replica 仍用 TP1 处理长 prompt;
|
||||
它不能显著缩短单请求 prefill path。在 tight TTFT SLO 下,这会导致 feasible
|
||||
`sampling_u` 很低;再除以 GPU 数得到 `req/s/GPU` 后,结果只有
|
||||
`0.023-0.027 req/s/GPU`。
|
||||
|
||||
Harness 的优化路径是:
|
||||
|
||||
```text
|
||||
observed SLO pressure
|
||||
-> classify as ttft_prefill
|
||||
-> choose legal TP frontier probe
|
||||
-> measure feasible req/s/GPU under the same SLO
|
||||
-> stop only when search.high is saturated by incumbent
|
||||
```
|
||||
|
||||
这条路径是可测量、可反驳的。如果 TP4 降低了 latency 但
|
||||
`request_rate_per_gpu` 明显下降,harness 会 reject 这个 hypothesis。如果
|
||||
bottleneck 是 admission/queueing 而不是 TTFT/prefill,同一个 knob-effect model
|
||||
会偏向 DP 或 `max-num-seqs`,而不是 TP frontier。
|
||||
|
||||
因此,这个结果不是“Qwen27B case 里我们 prompt 诱导模型说 TP4”。更准确的结论是:
|
||||
harness 用 SLO-derived bottleneck evidence 把搜索导向了正确的 knob family,
|
||||
再用 per-GPU objective 和 validator stop 验证这个方向。
|
||||
|
||||
## 证据边界
|
||||
|
||||
这份报告强支撑 Qwen27B tight-SLO case 上的 harness 机制,但不能单独当作通用性证明。
|
||||
当前可成立的结论是:
|
||||
|
||||
- 在这个 case 中,harness 同时提升了 final quality、convergence speed、AUC 和
|
||||
stop discipline。
|
||||
- `gpt-5.4-mini + harness` 匹配 `gpt-5.5 + harness`,并显著超过
|
||||
`gpt-5.5 + naive`,说明收益主要来自 harness 的结构化状态和 validator,而不是
|
||||
单纯来自更强模型。
|
||||
- 成功路径用的是通用机制:SLO-derived bottleneck classification、topology
|
||||
constraints、knob-effect scoring、per-GPU objective、validator-authorized stop。
|
||||
- 还需要在其他 bottleneck/case 上继续验证,例如 prefill scheduler pressure、
|
||||
decode TPOT pressure、memory/KV pressure、admission/queueing pressure。
|
||||
|
||||
## 原始 aggregate report 摘录
|
||||
|
||||
```text
|
||||
# qwen27b-tight-2x2-aggregate-20260623T005838Z
|
||||
|
||||
## Aggregate
|
||||
|
||||
- Cases: `1`
|
||||
- Harness-vs-naive pass/checks: `2`/`2`
|
||||
- Winner counts: `{"final_best": {"gpt55_harness": 1}, "fastest_to_target": {"gpt55_harness": 1}, "normalized_auc": {"gpt55_harness": 1}}`
|
||||
|
||||
## By Kind
|
||||
|
||||
| Kind | Arms | Mean final/ref | Mean AUC | Target reached |
|
||||
| --- | ---: | ---: | ---: | ---: |
|
||||
| `harness` | 2 | 1.0000 | 0.9467 | 2 |
|
||||
| `naive` | 2 | 0.0569 | 0.0543 | 0 |
|
||||
|
||||
## Cases
|
||||
|
||||
### qwen27b-tight-slo-2x2-aggregate
|
||||
|
||||
- Reference best req/s/GPU: `0.4429`
|
||||
- Target fraction: `0.95`
|
||||
- Winners: `{"final_best": "gpt55_harness", "fastest_to_target": "gpt55_harness", "normalized_auc": "gpt55_harness"}`
|
||||
|
||||
| Arm | Kind | Trials | Final/GPU | Final/ref | TTT | AUC | Failed | No feasible |
|
||||
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||
| `gpt55_harness` | `harness` | 2 | 0.4429 | 1.0000 | 2 | 0.9484 | 0 | 0 |
|
||||
| `gpt55_naive` | `naive` | 10 | 0.0273 | 0.0616 | - | 0.0588 | 2 | 2 |
|
||||
| `gpt54mini_harness` | `harness` | 2 | 0.4429 | 1.0000 | 2 | 0.9450 | 0 | 0 |
|
||||
| `gpt54mini_naive` | `naive` | 10 | 0.0231 | 0.0522 | - | 0.0498 | 1 | 1 |
|
||||
|
||||
| Harness | Final vs best naive | Target speedup | AUC vs best naive | Pass |
|
||||
| --- | ---: | ---: | ---: | --- |
|
||||
| `gpt55_harness` | 16.2290 | - | 16.1296 | `True` |
|
||||
| `gpt54mini_harness` | 16.2290 | - | 16.0720 | `True` |
|
||||
```
|
||||
164
docs/harness-ablation/qwen30b-slo-robustness-20260624.md
Normal file
164
docs/harness-ablation/qwen30b-slo-robustness-20260624.md
Normal file
@@ -0,0 +1,164 @@
|
||||
# Qwen30B SLO robustness - 2026-06-24
|
||||
|
||||
本文整理 Qwen30B-A3B community vLLM 0.20 case 在三档 SLO 下的 harness/naive
|
||||
对比,并解释不同 SLO 为什么没有导致完全不同的最终 topology,却改变了可承载负载边界
|
||||
和 bottleneck 判断。
|
||||
|
||||
原始报告位于远端共享 checkout:
|
||||
|
||||
```text
|
||||
.aituner-reports/qwen30b-slo-robust-gpt55-dash1-20260623T163521Z-strict/report.md
|
||||
.aituner-reports/qwen30b-slo-robust-gpt55-dash1-20260623T163521Z-medium/report.md
|
||||
.aituner-reports/qwen30b-slo-robust-gpt55-dash1-20260623T163521Z-loose/report.md
|
||||
```
|
||||
|
||||
## 实验设计
|
||||
|
||||
Case: `qwen30b-a3b-slo-{strict,medium,loose}-gpt55`。
|
||||
|
||||
共同设置:
|
||||
|
||||
- Served model: Qwen30B-A3B community vLLM 0.20。
|
||||
- Hardware: H20,允许 1/2/4/8 GPU topology。
|
||||
- Trace: chat 0-8k,输出长度 128。
|
||||
- Search: `sampling_u in [0, 1.0]`,tolerance 0.001,max 6 probes。
|
||||
- Objective: 在 pass rate >= 0.95 下最大化 `request_rate / used_gpu_count`。
|
||||
- Tuner model: `gpt-5.5`。
|
||||
|
||||
三档 SLO:
|
||||
|
||||
| SLO | TTFT step rule | TPOT |
|
||||
| --- | --- | ---: |
|
||||
| strict | <=4k: 1s, <=32k: 2s, else: 3s | 40 ms |
|
||||
| medium | <=4k: 2s, <=32k: 4s, else: 6s | 50 ms |
|
||||
| loose | <=4k: 4s, <=32k: 8s, else: 12s | 70 ms |
|
||||
|
||||
## 结果摘要
|
||||
|
||||
| SLO | Harness final req/s/GPU | Naive final req/s/GPU | Final speedup | AUC speedup | Harness TTT |
|
||||
| --- | ---: | ---: | ---: | ---: | ---: |
|
||||
| strict | 2.2083 | 0.8000 | 2.7604x | 2.7886x | 1 |
|
||||
| medium | 3.2583 | 0.8000 | 4.0729x | 4.0729x | 1 |
|
||||
| loose | 3.2583 | 1.0458 | 3.1155x | 4.4622x | 1 |
|
||||
|
||||
三个 SLO 下 harness 都在第一个 trial 到达该 SLO 下的 reference best。naive 在 8 个
|
||||
trials 内没有达到 95% reference target。
|
||||
|
||||
## 最终 tune 出来的配置
|
||||
|
||||
三档 SLO 的最终 best topology 都是:
|
||||
|
||||
```text
|
||||
tensor-parallel-size = 2
|
||||
data-parallel-size = 1
|
||||
enable-expert-parallel = false
|
||||
```
|
||||
|
||||
但这不表示 SLO 没有影响。SLO 改变的是同一个 topology 的可行负载上限:
|
||||
|
||||
| SLO | Best config | Best sampling_u | Total req/s | req/s/GPU | Pass rate |
|
||||
| --- | --- | ---: | ---: | ---: | ---: |
|
||||
| strict | `TP=2, DP=1` | 0.484375 | 4.4167 | 2.2083 | 1.0000 |
|
||||
| medium | `TP=2, DP=1` | 0.750000 | 6.5167 | 3.2583 | 1.0000 |
|
||||
| loose | `TP=2, DP=1` | 0.750000 | 6.5167 | 3.2583 | 1.0000 |
|
||||
|
||||
strict 到 medium/loose 的主要变化是 feasible frontier 右移:同一个 `TP=2, DP=1`
|
||||
配置在 strict 下只能稳定承载 `sampling_u=0.484375`,在 medium/loose 下可以承载
|
||||
`sampling_u=0.75`。
|
||||
|
||||
## 为什么 `TP=2, DP=1` 稳定胜出
|
||||
|
||||
AITuner 的 scoring 不是 raw throughput,而是 SLO-constrained per-GPU throughput:
|
||||
|
||||
```text
|
||||
J(c, SLO) = max_u request_rate(c, u) / used_gpu_count(c)
|
||||
subject to pass_rate(c, u, SLO) >= 0.95
|
||||
```
|
||||
|
||||
这解释了为什么 `TP=4` 没有赢。`TP=4` 的单请求 latency 更低、总吞吐可以更高,
|
||||
但它使用两倍 GPU,per-GPU objective 反而下降:
|
||||
|
||||
| SLO | Config | Total req/s | Used GPUs | req/s/GPU | 解释 |
|
||||
| --- | --- | ---: | ---: | ---: | --- |
|
||||
| strict | `TP=2, DP=1` | 4.4167 | 2 | 2.2083 | strict best |
|
||||
| strict | `TP=4, DP=1` | 4.4167 | 4 | 1.1042 | latency 更低,但 GPU efficiency 更差 |
|
||||
| medium/loose | `TP=2, DP=1` | 6.5167 | 2 | 3.2583 | medium/loose best |
|
||||
| medium/loose | `TP=4, DP=1` | 8.3667 | 4 | 2.0917 | raw throughput 更高,但 per-GPU 不划算 |
|
||||
|
||||
因此 harness 学到的不是“越多 GPU 越好”,而是更具体的机制:
|
||||
|
||||
```text
|
||||
TP=1: 单请求 prefill/decode latency 偏高,SLO-constrained load frontier 低。
|
||||
TP=2: 足够缓解 latency,同时 GPU 数量仍低,per-GPU objective 最优。
|
||||
TP=4: 继续降低 latency,但通信和 GPU 数量成本超过收益。
|
||||
```
|
||||
|
||||
## SLO 改变 bottleneck 的方式
|
||||
|
||||
strict 下,`TP=2, DP=1` 在 `sampling_u=0.484375` 可行,但下一档
|
||||
`sampling_u=0.5` 直接进入 queueing collapse:
|
||||
|
||||
| Point | Pass rate | 主要失败原因 |
|
||||
| --- | ---: | --- |
|
||||
| strict, `u=0.484375` | 1.0000 | 无 |
|
||||
| strict, `u=0.5` | 0.0290 | `tpot_ms>40`, `ttft_ms>1000/2000`, `slo_pass_rate_unrecoverable` |
|
||||
|
||||
medium/loose 下,TTFT 和 TPOT 阈值都放宽后,同一 topology 能承载更高 arrival intensity。
|
||||
但是在 `u=0.765625` 仍会进入不可恢复的排队区:
|
||||
|
||||
| SLO | Feasible point | Next infeasible point | 主要失败原因 |
|
||||
| --- | --- | --- | --- |
|
||||
| medium | `u=0.75`, pass 1.0000 | `u=0.765625`, pass 0.6900 | `tpot_ms>50`, `slo_pass_rate_unrecoverable` |
|
||||
| loose | `u=0.75`, pass 1.0000 | `u=0.765625`, pass 0.2900 | `tpot_ms>70`, `slo_pass_rate_unrecoverable` |
|
||||
|
||||
这说明 SLO 放宽不是无限提高吞吐。服务系统还有 queueing stability frontier;
|
||||
超过 frontier 后,即使单个请求的 steady-state latency 看起来可控,排队也会让 pass rate
|
||||
迅速崩掉。
|
||||
|
||||
## 其他候选配置的信号
|
||||
|
||||
`TP=1, DP=1` 对 SLO 更敏感:
|
||||
|
||||
| SLO | `TP=1, DP=1` req/s/GPU | 解释 |
|
||||
| --- | ---: | --- |
|
||||
| strict | 2.2000 | 接近 strict best,但略低于 `TP=2` |
|
||||
| medium | 2.2000 | 仍低于 `TP=2` |
|
||||
| loose | 2.8500 | 宽松 SLO 下受益明显,但仍低于 `TP=2` |
|
||||
|
||||
`gpu-memory-utilization=0.92` 在 medium/loose 中与 `TP=2` 打平:
|
||||
|
||||
| SLO | Config | req/s/GPU |
|
||||
| --- | --- | ---: |
|
||||
| medium | `TP=2, gpu-memory-utilization=0.92` | 3.2583 |
|
||||
| loose | `TP=2, gpu-memory-utilization=0.92` | 3.2583 |
|
||||
|
||||
这说明该 workload 的主瓶颈不是 KV memory headroom,而是 topology 和 queueing
|
||||
frontier。
|
||||
|
||||
EP family 在该环境下不稳定:
|
||||
|
||||
```text
|
||||
TP=4, EP=2/4, enable-expert-parallel=true -> engine_launch exit_code=2
|
||||
```
|
||||
|
||||
这些失败 trial 没有进入 best candidate,但它们说明当前 failure memory 还可以继续加强:
|
||||
同一类 EP launch failure 出现后,后续 proposal 应更积极地屏蔽该 family。
|
||||
|
||||
## 对 paper claim 的含义
|
||||
|
||||
这组实验支持的 claim 是:
|
||||
|
||||
1. Harness 对 SLO 变化有稳定收益:strict/medium/loose 三档均显著优于 naive。
|
||||
2. Harness 不是固定写死某个 knob。它通过 SLO-constrained probing 找到 feasible
|
||||
frontier;在本 case 中最终 topology 相同,但可承载负载边界随 SLO 改变。
|
||||
3. Harness 的 value 来自 topology-first candidate family、per-GPU scoring 和
|
||||
validator 对 failed family 的处理,而不是自然语言 prompt 的偶然表达。
|
||||
|
||||
这组实验尚不能单独 claim:
|
||||
|
||||
- 所有模型和 workload 上都 robust。
|
||||
- `TP=2, DP=1` 是全局最优。
|
||||
- EP family 已经被最优处理。
|
||||
|
||||
对应的后续证据应放在 roadmap 中跟踪:局部 grid/near-optimum、跨模型 2x2、跨 workload
|
||||
SLO robustness,以及 failure-memory ablation。
|
||||
@@ -51,6 +51,13 @@ enabled = true
|
||||
sync_remote_path = "~/aituner"
|
||||
fleet_root = "~/.aituner_gpu_fleet"
|
||||
|
||||
[[hosts]]
|
||||
name = "dash4"
|
||||
ssh_alias = "dash4"
|
||||
enabled = true
|
||||
sync_remote_path = "~/workspace/aituner"
|
||||
fleet_root = "~/.aituner_gpu_fleet"
|
||||
|
||||
[[hosts]]
|
||||
name = "dash5"
|
||||
ssh_alias = "dash5"
|
||||
|
||||
@@ -4,5 +4,5 @@ dash0
|
||||
dash1
|
||||
dash2
|
||||
dash3
|
||||
dash4
|
||||
dash5
|
||||
|
||||
|
||||
@@ -10,22 +10,37 @@ import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def topo(patch):
|
||||
fp = (patch or {}).get("flag_patch", {}) or {}
|
||||
ep = (patch or {}).get("env_patch", {}) or {}
|
||||
parts = []
|
||||
for k, label in (
|
||||
TOPOLOGY_KEYS = (
|
||||
("tensor-parallel-size", "TP"),
|
||||
("data-parallel-size", "DP"),
|
||||
("expert-parallel-size", "EP"),
|
||||
):
|
||||
if k in fp:
|
||||
parts.append(f"{label}{fp[k]}")
|
||||
runtime = {
|
||||
)
|
||||
|
||||
RUNTIME_KEYS = (
|
||||
"gpu-memory-utilization",
|
||||
"enable-chunked-prefill",
|
||||
"max-num-batched-tokens",
|
||||
"max-num-seqs",
|
||||
)
|
||||
|
||||
|
||||
def topo(patch, base_flags=None):
|
||||
fp = (patch or {}).get("flag_patch", {}) or {}
|
||||
ep = (patch or {}).get("env_patch", {}) or {}
|
||||
effective = dict(base_flags or {})
|
||||
effective.update(fp)
|
||||
parts = []
|
||||
for k, label in TOPOLOGY_KEYS:
|
||||
if k in effective:
|
||||
parts.append(f"{label}{effective[k]}")
|
||||
runtime = {k: effective[k] for k in RUNTIME_KEYS if k in effective}
|
||||
runtime.update(
|
||||
{
|
||||
k: v
|
||||
for k, v in fp.items()
|
||||
if k not in ("tensor-parallel-size", "data-parallel-size", "expert-parallel-size")
|
||||
if k not in {key for key, _ in TOPOLOGY_KEYS} and k not in runtime
|
||||
}
|
||||
)
|
||||
runtime.update({f"env:{k}": v for k, v in ep.items()})
|
||||
base = "+".join(parts) if parts else "baseline-topo"
|
||||
if runtime:
|
||||
@@ -36,6 +51,11 @@ def topo(patch):
|
||||
def main():
|
||||
store = Path(sys.argv[1])
|
||||
state = json.load(open(store / "state.json"))
|
||||
snapshot_path = store / "study_spec.snapshot.json"
|
||||
base_flags = {}
|
||||
if snapshot_path.exists():
|
||||
snapshot = json.load(open(snapshot_path))
|
||||
base_flags = ((snapshot.get("engine") or {}).get("base_flags") or {})
|
||||
print(f"study_id: {state.get('study_id')}")
|
||||
print(f"best_trial: {state.get('best_trial_id')} best_per_gpu: {state.get('best_request_rate_per_gpu')}")
|
||||
print(f"stop_reason: {state.get('tuning_stop_reason')!r}")
|
||||
@@ -53,7 +73,7 @@ def main():
|
||||
pgs = f"{pg:.4f}" if isinstance(pg, (int, float)) else str(pg)
|
||||
incs = f"{incumbent:.4f}" if isinstance(incumbent, (int, float)) else str(incumbent)
|
||||
print(
|
||||
f"{i:<5}{t.get('trial_id',''):<11}{str(t.get('status','')):<14}{pgs:<10}{incs:<11}{topo(t.get('config_patch'))}"
|
||||
f"{i:<5}{t.get('trial_id',''):<11}{str(t.get('status','')):<14}{pgs:<10}{incs:<11}{topo(t.get('config_patch'), base_flags)}"
|
||||
)
|
||||
# also dump proposals dir to see what was *proposed* (incl. vetoed/failed)
|
||||
pdir = store / "proposals"
|
||||
@@ -64,7 +84,7 @@ def main():
|
||||
pr = json.load(open(p))
|
||||
except Exception:
|
||||
continue
|
||||
print(f" {p.stem}: should_stop={pr.get('should_stop')} | {topo(pr.get('config_patch'))}")
|
||||
print(f" {p.stem}: should_stop={pr.get('should_stop')} | {topo(pr.get('config_patch'), base_flags)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
99
scripts/calibrate_time_scale.py
Normal file
99
scripts/calibrate_time_scale.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Criterion-A time_scale calibration.
|
||||
|
||||
Binary-search the smallest replay_time_scale whose A-family L-C-A similarity to the
|
||||
real (scale=1.0) arrival process stays >= tau. Uniform time scaling distorts only
|
||||
the A axis (rate + fano; interarrival CV is scale-invariant), so this bounds the
|
||||
arrival-axis distortion introduced by compression using the same similarity metric
|
||||
Stop-A uses. Pure trace metadata -> deterministic, no GPU needed.
|
||||
|
||||
Usage:
|
||||
PYTHONPATH=src python3 scripts/calibrate_time_scale.py \
|
||||
--trace trace_windows/traces/chat_w20260311_1000.jsonl \
|
||||
--gpu-count 8 --min-input 0 --max-input 8192 --tau 0.9
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
|
||||
from aituner.lca import _family_similarity, build_workload_profile
|
||||
from aituner.trace import TraceRequest, WindowRecord
|
||||
|
||||
|
||||
def load_rows(path: Path, lo: int, hi: int) -> list[dict]:
    """Load the JSONL trace at *path*, keeping rows with ``lo <= input_length <= hi``.

    Blank lines are skipped.  Rows are filtered while the file is streamed, so
    rows outside the length window are never accumulated in memory.

    Raises:
        KeyError: if a row has no ``input_length`` field.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    kept: list[dict] = []
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            row = json.loads(line)
            # input_length may arrive as a string; compare it as an integer.
            if lo <= int(row["input_length"]) <= hi:
                kept.append(row)
    return kept
|
||||
|
||||
|
||||
def build_requests(rows: list[dict]) -> tuple[list[TraceRequest], float, float]:
    """Convert raw trace rows into ``TraceRequest`` objects.

    Returns ``(requests, first_arrival_s, last_arrival_s)``, where the two floats
    are the smallest and largest ``timestamp`` found in *rows*.

    Raises:
        ValueError: if *rows* is empty (for example because the input-length
            filter rejected every row), since there is no arrival span to
            calibrate against.
    """
    if not rows:
        # Without this guard min()/max() below fail with an opaque
        # "arg is an empty sequence" message.
        raise ValueError(
            "no trace rows to calibrate: the trace is empty or the "
            "input-length filter removed every row"
        )
    reqs = []
    for i, r in enumerate(rows):
        hash_ids = r.get("hash_ids")
        reqs.append(
            TraceRequest(
                # Fall back to the row index when the trace carries no chat_id.
                row_id=str(r.get("chat_id", i)),
                arrival_s=float(r["timestamp"]),
                sampling_u=float(r.get("sampling_u", 0.0)),
                body={},
                prompt_tokens_hint=int(r["input_length"]),
                completion_tokens_hint=int(r["output_length"]),
                # Only a real list is forwarded; any other value becomes None.
                metadata={"hash_ids": hash_ids if isinstance(hash_ids, list) else None},
            )
        )
    arrivals = [x.arrival_s for x in reqs]
    return reqs, min(arrivals), max(arrivals)
|
||||
|
||||
|
||||
def profile_at(reqs, amin, amax, gpu_count, scale):
    """Build the workload profile of *reqs* after uniform time scaling by *scale*.

    Arrivals are re-based so that a request arriving at *amin* lands at 0, then
    multiplied by *scale*; the window covers ``[0, (amax - amin) * scale]``.
    """
    scaled_requests = []
    for req in reqs:
        scaled_arrival = (req.arrival_s - amin) * scale
        scaled_requests.append(
            TraceRequest(
                req.row_id,
                scaled_arrival,
                req.sampling_u,
                req.body,
                req.prompt_tokens_hint,
                req.completion_tokens_hint,
                req.metadata,
            )
        )
    window = WindowRecord(
        window_id="w",
        trace_path="",
        trace_type="chat",
        window_start=0.0,
        window_end=(amax - amin) * scale,
        source_payload={"block_size": 64},
    )
    return build_workload_profile(
        scaled_requests, window, gpu_count=gpu_count, length_mode="total"
    )
|
||||
|
||||
|
||||
def main() -> int:
    """Print an A-similarity sweep, then bisect for the smallest scale with simA >= tau."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--trace", type=Path, required=True)
    parser.add_argument("--gpu-count", type=int, default=8)
    parser.add_argument("--min-input", type=int, default=0)
    parser.add_argument("--max-input", type=int, default=8192)
    parser.add_argument("--tau", type=float, default=0.9)
    args = parser.parse_args()

    trace_rows = load_rows(args.trace, args.min_input, args.max_input)
    requests, first_s, last_s = build_requests(trace_rows)
    raw_span_s = last_s - first_s
    print(f"n={len(requests)} raw arrival span={raw_span_s:.1f}s")

    # The unscaled (scale=1.0) profile is the reference every candidate is compared to.
    reference = profile_at(requests, first_s, last_s, args.gpu_count, 1.0)

    def scaled_profile(scale):
        return profile_at(requests, first_s, last_s, args.gpu_count, scale)

    def sim_a(profile):
        return _family_similarity(reference.vector, profile.vector)["A"]

    print(f"{'scale':>6} {'simA':>7} {'rate/gpu':>9} {'fano':>8} {'span_s':>8}")
    sweep_scales = (1.0, 0.95, 0.9, 0.85, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2)
    for scale in sweep_scales:
        profile = scaled_profile(scale)
        similarity = sim_a(profile)
        # vector[7] / vector[9] are printed under the rate/gpu and fano columns;
        # presumably stored log1p-encoded, hence expm1 -- confirm against aituner.lca.
        rate_per_gpu = math.expm1(profile.vector[7])
        fano = math.expm1(profile.vector[9])
        print(f"{scale:6.2f} {similarity:7.3f} {rate_per_gpu:9.3f} {fano:8.2f} {raw_span_s * scale:8.1f}")

    # Bisection: `upper` always holds a scale that met tau, `lower` one that did not.
    lower, upper = 0.05, 1.0
    for _ in range(40):
        midpoint = (lower + upper) / 2
        if sim_a(scaled_profile(midpoint)) >= args.tau:
            upper = midpoint
        else:
            lower = midpoint
    print(f"\nsmallest scale with simA>={args.tau}: {upper:.4f} (arrival span {raw_span_s * upper:.0f}s)")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
31
scripts/run_ablation_pair_d1.sh
Normal file
31
scripts/run_ablation_pair_d1.sh
Normal file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
|
||||
# 12-iteration harness-vs-naive ablation, both arms on dash1 (clean paired run,
|
||||
# no host confound). Substrate: real output_length (no completion override),
|
||||
# replay_time_scale=0.8775 (criterion-A, sim_A>=0.90), Stop-A on (LCA offered
|
||||
# window), per-probe Stop-A-consistent drain deadline. Harness stops early; naive
|
||||
# runs the full budget. Run from the repo root on dash1.
|
||||
set -u
|
||||
# Re-read the codex token from auth.json right before each arm (capturing it once at
|
||||
# launch goes stale during a long run -- that is what 401'd naive runs 2/3).
|
||||
read_key() { export OPENAI_API_KEY=$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])'); }
|
||||
# codex config.toml points at a dash0-local proxy (127.0.0.1:11235); on dash1 the
|
||||
# LLM endpoint is reachable directly, so force a direct connection.
|
||||
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'
|
||||
mkdir -p .aituner
|
||||
rm -rf .aituner/abl12-harness .aituner/abl12-naive .aituner/ABLATION12_DONE
|
||||
|
||||
read_key
|
||||
echo "=== harness ON (12-iter) start $(date -Is) ==="
|
||||
PYTHONPATH=src python3 -m aituner.cli study tune \
|
||||
--spec configs/examples/dash0_qwen27b_ablation_harness_on.json \
|
||||
--store-root .aituner/abl12-harness --max-trials 12 --skip-baseline > .aituner/abl12-harness.log 2>&1
|
||||
echo "=== harness ON (12-iter) done $(date -Is) ==="
|
||||
|
||||
read_key
|
||||
echo "=== naive OFF (12-iter) start $(date -Is) ==="
|
||||
PYTHONPATH=src python3 -m aituner.cli study tune \
|
||||
--spec configs/examples/dash0_qwen27b_ablation_naive_off.json \
|
||||
--store-root .aituner/abl12-naive --max-trials 12 --skip-baseline > .aituner/abl12-naive.log 2>&1
|
||||
echo "=== naive OFF (12-iter) done $(date -Is) ==="
|
||||
|
||||
touch .aituner/ABLATION12_DONE
|
||||
81
scripts/run_clean_ablation_pair_d1.sh
Normal file
81
scripts/run_clean_ablation_pair_d1.sh
Normal file
@@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env bash
|
||||
# Clean same-policy harness-vs-naive ablation on dash1.
|
||||
#
|
||||
# This is intended as the first robustness gate for harness evaluation:
|
||||
# both arms use the same study substrate and the same configured LLM endpoint;
|
||||
# the only intended difference is llm.use_harness.
|
||||
set -euo pipefail
|
||||
|
||||
RUN_LABEL="${AITUNER_RUN_ID:-qwen27b-clean-pair-$(date -u +%Y%m%dT%H%M%SZ)}"
|
||||
MAX_TRIALS="${MAX_TRIALS:-12}"
|
||||
ROOT="$(pwd)"
|
||||
HARNESS_STORE=".aituner/${RUN_LABEL}-harness"
|
||||
NAIVE_STORE=".aituner/${RUN_LABEL}-naive"
|
||||
REPORT_ROOT=".aituner-reports/${RUN_LABEL}"
|
||||
SPEC_PATH=".aituner-reports/${RUN_LABEL}.spec.json"
|
||||
|
||||
# Export OPENAI_API_KEY for the arm that is about to start.
#
# A key supplied through the caller's environment is honoured and never replaced.
# Otherwise the key is re-read from ~/.codex/auth.json on EVERY call, so a token
# rotated while the first arm was running is picked up before the second arm.
read_key() {
  if [ -z "${_AITUNER_KEY_ORIGIN:-}" ]; then
    # First call only: remember where the key comes from.
    if [ -n "${OPENAI_API_KEY:-}" ]; then
      _AITUNER_KEY_ORIGIN="env"
    else
      _AITUNER_KEY_ORIGIN="auth_file"
    fi
  fi
  if [ "${_AITUNER_KEY_ORIGIN}" = "auth_file" ]; then
    # Assign first, export second, so a failing python3 is not masked by `export`.
    OPENAI_API_KEY="$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])')"
    export OPENAI_API_KEY
  fi
}
|
||||
|
||||
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'
|
||||
mkdir -p .aituner .aituner-reports
|
||||
rm -rf "${HARNESS_STORE}" "${NAIVE_STORE}" "${REPORT_ROOT}" "${SPEC_PATH}"
|
||||
|
||||
read_key
|
||||
echo "=== harness ON clean pair start $(date -Is) label=${RUN_LABEL} ==="
|
||||
PYTHONPATH=src python3 -m aituner.cli study tune \
|
||||
--spec configs/examples/dash0_qwen27b_ablation_harness_on.json \
|
||||
--store-root "${HARNESS_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
|
||||
> ".aituner/${RUN_LABEL}-harness.log" 2>&1
|
||||
echo "=== harness ON clean pair done $(date -Is) ==="
|
||||
|
||||
read_key
|
||||
echo "=== naive OFF clean pair start $(date -Is) label=${RUN_LABEL} ==="
|
||||
PYTHONPATH=src python3 -m aituner.cli study tune \
|
||||
--spec configs/examples/dash0_qwen27b_ablation_naive_off.json \
|
||||
--store-root "${NAIVE_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
|
||||
> ".aituner/${RUN_LABEL}-naive.log" 2>&1
|
||||
echo "=== naive OFF clean pair done $(date -Is) ==="
|
||||
|
||||
python3 - <<PY
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
root = Path("${ROOT}")
|
||||
run_label = "${RUN_LABEL}"
|
||||
spec = {
|
||||
"report_id": run_label,
|
||||
"output_root": str(root / "${REPORT_ROOT}"),
|
||||
"target_fraction": 0.95,
|
||||
"min_final_ratio": 0.98,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "qwen27b-chat-0-8k-clean-gpt55",
|
||||
"description": "Clean same-policy gpt-5.5 harness-vs-naive pair on dash1.",
|
||||
"tags": ["qwen27b", "chat", "0-8k", "h20", "clean-pair", "gpt-5.5"],
|
||||
"budgets": [1, 2, 3, 4, 6, 8, 12],
|
||||
"arms": [
|
||||
{
|
||||
"name": "harness",
|
||||
"kind": "harness",
|
||||
"study_root": str(root / "${HARNESS_STORE}" / "dash0-qwen27b-ablation-harness-on"),
|
||||
},
|
||||
{
|
||||
"name": "naive",
|
||||
"kind": "naive",
|
||||
"study_root": str(root / "${NAIVE_STORE}" / "dash0-qwen27b-ablation-naive-off"),
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
Path("${SPEC_PATH}").write_text(json.dumps(spec, indent=2) + "\\n", encoding="utf-8")
|
||||
PY
|
||||
|
||||
PYTHONPATH=src python3 scripts/tuning_report.py --spec "${SPEC_PATH}"
|
||||
touch ".aituner/${RUN_LABEL}.DONE"
|
||||
echo "=== clean pair report ready ${REPORT_ROOT} $(date -Is) ==="
|
||||
177
scripts/run_clean_pair_from_specs.sh
Executable file
177
scripts/run_clean_pair_from_specs.sh
Executable file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env bash
|
||||
# Run a clean same-policy harness-vs-naive pair from one or two base specs.
|
||||
#
|
||||
# Required env:
|
||||
# RUN_LABEL
|
||||
# CASE_ID
|
||||
# HARNESS_BASE_SPEC
|
||||
#
|
||||
# Optional env:
|
||||
# NAIVE_BASE_SPEC defaults to HARNESS_BASE_SPEC
|
||||
# MAX_TRIALS defaults to 12
|
||||
# CASE_DESCRIPTION
|
||||
# CASE_TAGS_JSON JSON list, defaults to []
|
||||
# BUDGETS_JSON JSON list, defaults to [1,2,3,4,6,8,MAX_TRIALS]
|
||||
# COMMON_SPEC_PATCH_FILE JSON deep-merged into both generated specs
|
||||
# HARNESS_SPEC_PATCH_FILE JSON deep-merged into harness generated spec
|
||||
# NAIVE_SPEC_PATCH_FILE JSON deep-merged into naive generated spec
|
||||
set -euo pipefail
|
||||
|
||||
RUN_LABEL="${RUN_LABEL:?RUN_LABEL is required}"
|
||||
CASE_ID="${CASE_ID:?CASE_ID is required}"
|
||||
HARNESS_BASE_SPEC="${HARNESS_BASE_SPEC:?HARNESS_BASE_SPEC is required}"
|
||||
NAIVE_BASE_SPEC="${NAIVE_BASE_SPEC:-${HARNESS_BASE_SPEC}}"
|
||||
MAX_TRIALS="${MAX_TRIALS:-12}"
|
||||
CASE_DESCRIPTION="${CASE_DESCRIPTION:-Clean same-policy harness-vs-naive pair.}"
|
||||
CASE_TAGS_JSON="${CASE_TAGS_JSON:-[]}"
|
||||
BUDGETS_JSON="${BUDGETS_JSON:-}"
|
||||
|
||||
ROOT="$(pwd)"
|
||||
RUN_CONFIG_ROOT=".aituner-run-configs/${RUN_LABEL}"
|
||||
HARNESS_SPEC="${RUN_CONFIG_ROOT}/harness.json"
|
||||
NAIVE_SPEC="${RUN_CONFIG_ROOT}/naive.json"
|
||||
HARNESS_STORE=".aituner/${RUN_LABEL}-harness"
|
||||
NAIVE_STORE=".aituner/${RUN_LABEL}-naive"
|
||||
REPORT_ROOT=".aituner-reports/${RUN_LABEL}"
|
||||
REPORT_SPEC=".aituner-reports/${RUN_LABEL}.spec.json"
|
||||
export RUN_LABEL CASE_ID HARNESS_BASE_SPEC NAIVE_BASE_SPEC MAX_TRIALS CASE_DESCRIPTION
|
||||
export CASE_TAGS_JSON BUDGETS_JSON ROOT RUN_CONFIG_ROOT HARNESS_SPEC NAIVE_SPEC
|
||||
export HARNESS_STORE NAIVE_STORE REPORT_ROOT REPORT_SPEC
|
||||
|
||||
# Export OPENAI_API_KEY for the arm that is about to start.
#
# A key supplied through the caller's environment is honoured and never replaced.
# Otherwise the key is re-read from ~/.codex/auth.json on EVERY call, so a token
# rotated while the first arm was running is picked up before the second arm.
read_key() {
  if [ -z "${_AITUNER_KEY_ORIGIN:-}" ]; then
    # First call only: remember where the key comes from.
    if [ -n "${OPENAI_API_KEY:-}" ]; then
      _AITUNER_KEY_ORIGIN="env"
    else
      _AITUNER_KEY_ORIGIN="auth_file"
    fi
  fi
  if [ "${_AITUNER_KEY_ORIGIN}" = "auth_file" ]; then
    # Assign first, export second, so a failing python3 is not masked by `export`.
    OPENAI_API_KEY="$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])')"
    export OPENAI_API_KEY
  fi
}
|
||||
|
||||
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'
|
||||
mkdir -p "${RUN_CONFIG_ROOT}" .aituner .aituner-reports
|
||||
rm -rf "${HARNESS_STORE}" "${NAIVE_STORE}" "${REPORT_ROOT}" "${REPORT_SPEC}"
|
||||
|
||||
python3 - <<'PY'
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def deep_merge(base: dict[str, Any], patch: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *base* with *patch* layered on top.

    Where both sides hold a dict under the same key, the two are merged
    recursively; in every other case the patch value replaces the base value.
    Neither argument is modified.
    """
    result = {**base}
    for name, incoming in patch.items():
        current = result.get(name)
        if isinstance(incoming, dict) and isinstance(current, dict):
            incoming = deep_merge(current, incoming)
        result[name] = incoming
    return result
|
||||
|
||||
|
||||
def load_patch(env_name: str) -> dict[str, Any]:
    """Load the JSON object whose file path is stored in environment variable *env_name*.

    An unset or empty variable means "no patch" and yields ``{}``.  Exits with an
    error message when the file's top-level JSON value is not an object.
    """
    patch_file = os.environ.get(env_name)
    if patch_file:
        loaded = json.loads(Path(patch_file).read_text(encoding="utf-8"))
        if isinstance(loaded, dict):
            return loaded
        raise SystemExit(f"{env_name} must point to a JSON object")
    return {}
|
||||
|
||||
|
||||
def generated_spec(base_path: str, *, use_harness: bool, suffix: str, arm_patch: dict[str, Any]) -> dict[str, Any]:
    """Derive one arm's study spec from the base spec at *base_path*.

    Layers the common patch (``COMMON_SPEC_PATCH_FILE``) and then *arm_patch* on
    the base spec, appends ``-<suffix>`` to the study id (falling back to the
    ``CASE_ID`` environment variable when the spec has none), and forces
    ``llm.use_harness`` to *use_harness*.
    """
    loaded = json.loads(Path(base_path).read_text(encoding="utf-8"))
    if not isinstance(loaded, dict):
        raise SystemExit(f"{base_path} must contain a JSON object")

    # Precedence, lowest to highest: base spec, common patch, per-arm patch.
    with_common = deep_merge(loaded, load_patch("COMMON_SPEC_PATCH_FILE"))
    result = deep_merge(with_common, arm_patch)

    base_study_id = str(result.get("study_id") or os.environ["CASE_ID"])
    result["study_id"] = base_study_id + f"-{suffix}"

    # Copy the llm section before editing it so no patch dict is mutated.
    llm_section = dict(result.get("llm") or {})
    llm_section["use_harness"] = use_harness
    result["llm"] = llm_section
    return result
|
||||
|
||||
|
||||
run_config_root = Path(os.environ["RUN_CONFIG_ROOT"])
|
||||
harness = generated_spec(
|
||||
os.environ["HARNESS_BASE_SPEC"],
|
||||
use_harness=True,
|
||||
suffix="harness",
|
||||
arm_patch=load_patch("HARNESS_SPEC_PATCH_FILE"),
|
||||
)
|
||||
naive = generated_spec(
|
||||
os.environ["NAIVE_BASE_SPEC"],
|
||||
use_harness=False,
|
||||
suffix="naive",
|
||||
arm_patch=load_patch("NAIVE_SPEC_PATCH_FILE"),
|
||||
)
|
||||
(run_config_root / "harness.json").write_text(json.dumps(harness, indent=2) + "\n", encoding="utf-8")
|
||||
(run_config_root / "naive.json").write_text(json.dumps(naive, indent=2) + "\n", encoding="utf-8")
|
||||
print(json.dumps({"harness_study_id": harness["study_id"], "naive_study_id": naive["study_id"]}, ensure_ascii=False))
|
||||
PY
|
||||
|
||||
read_key
|
||||
echo "=== harness clean pair start $(date -Is) label=${RUN_LABEL} ==="
|
||||
PYTHONPATH=src python3 -m aituner.cli study tune \
|
||||
--spec "${HARNESS_SPEC}" \
|
||||
--store-root "${HARNESS_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
|
||||
> ".aituner/${RUN_LABEL}-harness.log" 2>&1
|
||||
echo "=== harness clean pair done $(date -Is) ==="
|
||||
|
||||
read_key
|
||||
echo "=== naive clean pair start $(date -Is) label=${RUN_LABEL} ==="
|
||||
PYTHONPATH=src python3 -m aituner.cli study tune \
|
||||
--spec "${NAIVE_SPEC}" \
|
||||
--store-root "${NAIVE_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
|
||||
> ".aituner/${RUN_LABEL}-naive.log" 2>&1
|
||||
echo "=== naive clean pair done $(date -Is) ==="
|
||||
|
||||
python3 - <<'PY'
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
root = Path(os.environ["ROOT"])
|
||||
run_label = os.environ["RUN_LABEL"]
|
||||
harness = json.loads(Path(os.environ["HARNESS_SPEC"]).read_text(encoding="utf-8"))
|
||||
naive = json.loads(Path(os.environ["NAIVE_SPEC"]).read_text(encoding="utf-8"))
|
||||
max_trials = int(os.environ["MAX_TRIALS"])
|
||||
budgets_text = os.environ.get("BUDGETS_JSON") or ""
|
||||
if budgets_text:
|
||||
budgets = json.loads(budgets_text)
|
||||
else:
|
||||
budgets = [1, 2, 3, 4, 6, 8, max_trials]
|
||||
budgets = sorted({int(item) for item in budgets if int(item) > 0})
|
||||
tags = json.loads(os.environ.get("CASE_TAGS_JSON") or "[]")
|
||||
spec = {
|
||||
"report_id": run_label,
|
||||
"output_root": str(root / os.environ["REPORT_ROOT"]),
|
||||
"target_fraction": 0.95,
|
||||
"min_final_ratio": 0.98,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": os.environ["CASE_ID"],
|
||||
"description": os.environ["CASE_DESCRIPTION"],
|
||||
"tags": tags,
|
||||
"budgets": budgets,
|
||||
"arms": [
|
||||
{
|
||||
"name": "harness",
|
||||
"kind": "harness",
|
||||
"study_root": str(
|
||||
root / os.environ["HARNESS_STORE"] / harness["study_id"]
|
||||
),
|
||||
},
|
||||
{
|
||||
"name": "naive",
|
||||
"kind": "naive",
|
||||
"study_root": str(root / os.environ["NAIVE_STORE"] / naive["study_id"]),
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
Path(os.environ["REPORT_SPEC"]).write_text(json.dumps(spec, indent=2) + "\n", encoding="utf-8")
|
||||
PY
|
||||
|
||||
PYTHONPATH=src python3 scripts/tuning_report.py --spec "${REPORT_SPEC}"
|
||||
touch ".aituner/${RUN_LABEL}.DONE"
|
||||
echo "=== clean pair report ready ${REPORT_ROOT} $(date -Is) ==="
|
||||
16
scripts/run_harness_only_d1.sh
Normal file
16
scripts/run_harness_only_d1.sh
Normal file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
|
||||
# Harness-only re-run on gpt-5.5 to EMPIRICALLY verify the gpu-memory-utilization fix:
|
||||
# success = the harness recovers ~0.87/GPU (climbs gpu-mem-util to ~0.94) and then stops,
|
||||
# matching the naive-discovered ground truth. Run from the repo root on dash1.
|
||||
set -u
|
||||
read_key() { export OPENAI_API_KEY=$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])'); }
|
||||
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'
|
||||
mkdir -p .aituner
|
||||
rm -rf .aituner/abl12-harness .aituner/abl12-harness.log .aituner/HARNESS_ONLY_DONE
|
||||
read_key
|
||||
echo "=== harness ON (gpt-5.5, gpu-mem-util fix) start $(date -Is) ==="
|
||||
PYTHONPATH=src python3 -m aituner.cli study tune \
|
||||
--spec configs/examples/dash0_qwen27b_ablation_harness_on.json \
|
||||
--store-root .aituner/abl12-harness --max-trials 12 --skip-baseline > .aituner/abl12-harness.log 2>&1
|
||||
echo "=== harness ON done $(date -Is) ==="
|
||||
touch .aituner/HARNESS_ONLY_DONE
|
||||
26
scripts/run_naive_repeats_d1.sh
Normal file
26
scripts/run_naive_repeats_d1.sh
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env bash
# Fig-18 naive nondeterminism: after the main pair (ABLATION12_DONE) finishes, run
# 2 more naive arms (runs 2 and 3) on the SAME substrate. The naive LLM (gpt-5.4,
# use_harness=false) is nondeterministic, so the run-to-run spread (fail / slow /
# lucky) is the result. Harness arm stays a single deterministic curve. Run from
# the repo root on dash1; survives disconnect via setsid/nohup at launch.
set -u
# Re-read the codex token from auth.json right before each run. This script first
# waits for the main pair and then runs two long studies, so a token captured once
# at launch can be stale by the time it is actually used.
read_key() { export OPENAI_API_KEY=$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])'); }
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'

# Wait for the main harness+naive(run1) pair to complete so we never contend for GPUs.
echo "=== waiting for ABLATION12_DONE $(date -Is) ==="
while [ ! -f .aituner/ABLATION12_DONE ]; do sleep 120; done
echo "=== main pair done, starting naive repeats $(date -Is) ==="

for r in 2 3; do
  rm -rf ".aituner/abl12-naive${r}" ".aituner/abl12-naive${r}.log"
  read_key
  echo "=== naive run ${r} start $(date -Is) ==="
  PYTHONPATH=src python3 -m aituner.cli study tune \
    --spec configs/examples/dash0_qwen27b_ablation_naive_off.json \
    --store-root ".aituner/abl12-naive${r}" --max-trials 12 --skip-baseline > ".aituner/abl12-naive${r}.log" 2>&1
  echo "=== naive run ${r} done $(date -Is) ==="
done

touch .aituner/NAIVE_REPEATS_DONE
echo "=== all naive repeats done $(date -Is) ==="
|
||||
36
scripts/tuning_report.py
Normal file
36
scripts/tuning_report.py
Normal file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from aituner.tuning_report import run_tuning_report
|
||||
|
||||
|
||||
def main() -> int:
    """Run the tuning report named by ``--spec`` and print a compact JSON digest."""
    parser = argparse.ArgumentParser(
        description="Summarize anytime tuning progress across harness/naive study stores."
    )
    parser.add_argument("--spec", required=True, help="Path to a tuning report JSON spec.")
    cli_args = parser.parse_args()

    summary = run_tuning_report(Path(cli_args.spec))

    # Top-level identifiers first, then the selected aggregate counters, so the
    # printed key order stays stable.
    digest = {
        "report_id": summary["report_id"],
        "report_root": summary["report_root"],
    }
    aggregate_keys = (
        "case_count",
        "harness_vs_naive_pass_count",
        "harness_vs_naive_check_count",
        "winner_counts",
    )
    for key in aggregate_keys:
        digest[key] = summary["aggregate"][key]

    print(json.dumps(digest, ensure_ascii=False, indent=2))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -367,20 +367,41 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
|
||||
proposal_source_label = "harness"
|
||||
else:
|
||||
proposal_source_label = str(proposal_source) if proposal_source else "llm"
|
||||
stop_authorized_by = (
|
||||
"validator"
|
||||
if (is_harness_stop or authorized)
|
||||
else "file_proposal"
|
||||
if proposal_source is not None
|
||||
else "llm_after_veto_budget"
|
||||
)
|
||||
stop_reason = (
|
||||
"harness_stop"
|
||||
if is_harness_stop
|
||||
else "proposal_file_stop"
|
||||
if proposal_source is not None
|
||||
else "llm_stop"
|
||||
)
|
||||
stop_details = {
|
||||
"proposal_name": proposal_name,
|
||||
"proposal_source": proposal_source_label,
|
||||
"stop_authorized_by": stop_authorized_by,
|
||||
}
|
||||
if stop_authority:
|
||||
stop_details["validator_reason"] = stop_authority.get("reason")
|
||||
state.tuning_stop_reason = stop_reason
|
||||
state.tuning_stop_diagnosis = proposal.diagnosis
|
||||
state.tuning_stop_details = stop_details
|
||||
store.save_state(state)
|
||||
executed.append(
|
||||
{
|
||||
"trial_id": None,
|
||||
"proposal_name": proposal_name,
|
||||
"proposal_source": proposal_source_label,
|
||||
"stopped": True,
|
||||
"stop_authorized_by": (
|
||||
"validator"
|
||||
if (is_harness_stop or authorized)
|
||||
else "file_proposal"
|
||||
if proposal_source is not None
|
||||
else "llm_after_veto_budget"
|
||||
),
|
||||
"reason": state.tuning_stop_reason,
|
||||
"stop_authorized_by": stop_authorized_by,
|
||||
"diagnosis": proposal.diagnosis,
|
||||
"details": stop_details,
|
||||
"state_best_trial_id": state.best_trial_id,
|
||||
"state_best_request_rate": state.best_request_rate,
|
||||
}
|
||||
|
||||
@@ -24,6 +24,13 @@ _RUNTIME_KEYS = {
|
||||
_STRONG_INCUMBENT_MIN_GAIN = 1.8
|
||||
_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
|
||||
_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
|
||||
# Decode-bound throughput is frequently KV-cache limited, so more gpu-memory-utilization
|
||||
# yields more KV blocks and more concurrent decode. Hill-climb in small steps toward a
|
||||
# safe ceiling and let measurement find the real peak: a too-high target regresses or
|
||||
# fails to launch and is rejected by the incumbent guard, and its tested signature then
|
||||
# blocks re-proposal so the climb terminates.
|
||||
_GMU_STEP = 0.02
|
||||
_GMU_SAFE_CEILING = 0.97
|
||||
|
||||
|
||||
def build_harness_context(
|
||||
@@ -383,14 +390,17 @@ def _knob_harnesses(
|
||||
"knob_family": "gpu-memory-utilization",
|
||||
"use_when": [
|
||||
"The engine launches cleanly but memory headroom limits batching.",
|
||||
"A decode-bound incumbent (decode_tpot) is KV-cache limited and could sustain more concurrent decode with more KV blocks.",
|
||||
],
|
||||
"procedure": [
|
||||
"Make small adjustments only after topology and batching knobs are stable.",
|
||||
"Raise gpu-memory-utilization one small step at a time and keep the step only if request_rate_per_gpu improves and the engine still launches.",
|
||||
],
|
||||
"guards": [
|
||||
"Treat launch OOM as hard negative evidence and back off immediately.",
|
||||
"Do not exceed a safe utilization ceiling; stop climbing once a higher target regresses or fails to launch.",
|
||||
],
|
||||
"active_now": False,
|
||||
"active_now": active_bottleneck in {"decode_tpot", "admission_or_queueing"},
|
||||
}
|
||||
)
|
||||
return harnesses
|
||||
@@ -597,6 +607,15 @@ def stateful_history_limit() -> int:
|
||||
return 8
|
||||
|
||||
|
||||
def _state_completed_trials_with_rates(state: StudyState) -> list[TrialSummary]:
|
||||
return [
|
||||
trial
|
||||
for trial in state.trials
|
||||
if trial.status == "completed"
|
||||
and isinstance(trial.best_request_rate_per_gpu, (int, float))
|
||||
]
|
||||
|
||||
|
||||
def _load_result(trial: TrialSummary) -> dict[str, Any] | None:
|
||||
if not trial.result_path:
|
||||
return None
|
||||
@@ -1084,6 +1103,7 @@ def _candidate_actions(
|
||||
anchor,
|
||||
top_bottleneck,
|
||||
bottleneck_hypotheses,
|
||||
recent_diagnostics,
|
||||
tested_signatures,
|
||||
)
|
||||
)
|
||||
@@ -1177,13 +1197,31 @@ def _runtime_candidate_actions(
|
||||
anchor: dict[str, Any],
|
||||
top_bottleneck: str,
|
||||
bottleneck_hypotheses: list[dict[str, Any]],
|
||||
recent_diagnostics: list[dict[str, Any]],
|
||||
tested_signatures: set[str],
|
||||
) -> list[dict[str, Any]]:
|
||||
tunable = set(study.engine.tunable_flags)
|
||||
anchor_flags = _effective_flags_for_item(study, anchor)
|
||||
topology_patch = _preserve_topology_patch(study, anchor_flags)
|
||||
runtime_base_patch = {**topology_patch, **_preserve_runtime_patch(study, anchor_flags)}
|
||||
actions: list[dict[str, Any]] = []
|
||||
|
||||
cur_tp = _parse_int_like(anchor_flags.get("tensor-parallel-size"), default=1)
|
||||
cur_dp = _parse_int_like(anchor_flags.get("data-parallel-size"), default=1)
|
||||
# Topology-before-runtime: gpu-mem-util / raising max-num-seqs are micro-tuning that is
|
||||
# only justified once no untested TP increase remains. At an intermediate TP (e.g. TP2
|
||||
# while TP4 is still reachable and untried) a latency bottleneck must still be answered
|
||||
# by climbing TP, not a runtime tweak -- otherwise runtime tuning preempts the frontier.
|
||||
_next_tp = _next_allowed_tp(study, current_tp=cur_tp, current_dp=cur_dp)
|
||||
tp_frontier_open = (
|
||||
_next_tp is not None
|
||||
and _config_signature(
|
||||
{"env_patch": {}, "flag_patch": {"tensor-parallel-size": _next_tp}}
|
||||
)
|
||||
not in tested_signatures
|
||||
)
|
||||
topology_settled = not tp_frontier_open
|
||||
|
||||
if "max-num-batched-tokens" in tunable:
|
||||
current_mbt = _parse_int_like(anchor_flags.get("max-num-batched-tokens"), default=0)
|
||||
mbt_targets: list[tuple[str, int]] = []
|
||||
@@ -1198,7 +1236,7 @@ def _runtime_candidate_actions(
|
||||
elif top_bottleneck == "decode_tpot" and current_mbt > 8192:
|
||||
mbt_targets.append(("lower_mbt", max(8192, current_mbt // 2)))
|
||||
for action_id, target in mbt_targets:
|
||||
patch = {**topology_patch, "max-num-batched-tokens": target}
|
||||
patch = {**runtime_base_patch, "max-num-batched-tokens": target}
|
||||
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||
if signature in tested_signatures:
|
||||
continue
|
||||
@@ -1222,18 +1260,48 @@ def _runtime_candidate_actions(
|
||||
|
||||
if "max-num-seqs" in tunable:
|
||||
current_mns = _parse_int_like(anchor_flags.get("max-num-seqs"), default=0)
|
||||
max_num_seqs_tested = any(
|
||||
"max-num-seqs" in (
|
||||
((item.get("config_patch") or {}).get("flag_patch") or {})
|
||||
if isinstance(item.get("config_patch"), dict)
|
||||
else {}
|
||||
)
|
||||
for item in recent_diagnostics
|
||||
)
|
||||
mns_targets: list[tuple[str, int]] = []
|
||||
if top_bottleneck == "admission_or_queueing":
|
||||
target = max(8, int(current_mns * 1.5)) if current_mns > 0 else 64
|
||||
mns_targets.append(("raise_max_num_seqs", _round_up_to_multiple(target, 8)))
|
||||
elif top_bottleneck == "decode_tpot" and current_mns > 8:
|
||||
elif top_bottleneck == "decode_tpot":
|
||||
if current_mns > 8:
|
||||
mns_targets.append(("lower_max_num_seqs", max(8, current_mns // 2)))
|
||||
# Decode concurrency can also be too low: once topology is settled, raising
|
||||
# max-num-seqs exploits decode parallelism when the incumbent has SLO headroom.
|
||||
# The incumbent guard keeps it only if per-GPU rate improves.
|
||||
if topology_settled:
|
||||
raise_target = _round_up_to_multiple(
|
||||
max(16, int(current_mns * 1.5)) if current_mns > 0 else 48, 8
|
||||
)
|
||||
mns_targets.append(("raise_max_num_seqs", raise_target))
|
||||
elif top_bottleneck == "ttft_prefill" and topology_settled and not max_num_seqs_tested:
|
||||
# Prefill-heavy TTFT can still be admission/concurrency limited after TP and
|
||||
# max-num-batched-tokens probes settle. Try a modest same-topology seq cap
|
||||
# increase before letting convergence guards declare the incumbent final.
|
||||
target = _round_up_to_multiple(
|
||||
max(16, int(current_mns * 1.5)) if current_mns > 0 else 64, 8
|
||||
)
|
||||
mns_targets.append(("raise_max_num_seqs", target))
|
||||
for action_id, target in mns_targets:
|
||||
patch = {**topology_patch, "max-num-seqs": target}
|
||||
patch = {**runtime_base_patch, "max-num-seqs": target}
|
||||
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||
if signature in tested_signatures:
|
||||
continue
|
||||
relief = 0.25 if top_bottleneck in {"decode_tpot", "admission_or_queueing"} else 0.08
|
||||
if top_bottleneck in {"decode_tpot", "admission_or_queueing"}:
|
||||
relief = 0.25
|
||||
elif top_bottleneck == "ttft_prefill":
|
||||
relief = 0.3
|
||||
else:
|
||||
relief = 0.08
|
||||
actions.append(
|
||||
_runtime_action(
|
||||
action_id=action_id,
|
||||
@@ -1241,12 +1309,63 @@ def _runtime_candidate_actions(
|
||||
score=relief + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||
patch=patch,
|
||||
hypothesis=(
|
||||
"Adjust max-num-seqs to test whether concurrency pressure is the "
|
||||
"limiting factor under the configured SLO."
|
||||
"Adjust max-num-seqs to test whether concurrency/admission pressure "
|
||||
"is the limiting factor under the configured SLO."
|
||||
),
|
||||
expected_effects=[
|
||||
"change decode/admission concurrency on the incumbent topology",
|
||||
"confirm if TPOT or queueing pressure is caused by sequence concurrency",
|
||||
"change prefill/decode admission concurrency on the incumbent topology",
|
||||
"confirm if latency or queueing pressure is caused by sequence concurrency",
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
if (
|
||||
top_bottleneck == "ttft_prefill"
|
||||
and topology_settled
|
||||
and "max-num-batched-tokens" in tunable
|
||||
and "max-num-seqs" in tunable
|
||||
and max_num_seqs_tested
|
||||
):
|
||||
current_mbt = _parse_int_like(anchor_flags.get("max-num-batched-tokens"), default=0)
|
||||
current_mns = _parse_int_like(anchor_flags.get("max-num-seqs"), default=0)
|
||||
if current_mbt > 0:
|
||||
window_target = _initial_mbt_from_window(window_summary)
|
||||
step_target = _next_mbt_step(current_mbt) or current_mbt
|
||||
mbt_target = min(
|
||||
32768,
|
||||
max(
|
||||
step_target,
|
||||
min(window_target, _round_up_to_multiple(current_mbt * 2, 1024)),
|
||||
),
|
||||
)
|
||||
else:
|
||||
mbt_target = _initial_mbt_from_window(window_summary)
|
||||
mns_target = _round_up_to_multiple(
|
||||
max(16, int(current_mns * 1.5)) if current_mns > 0 else 64, 8
|
||||
)
|
||||
if mbt_target > 0 and (mbt_target != current_mbt or mns_target != current_mns):
|
||||
patch = {
|
||||
**runtime_base_patch,
|
||||
"max-num-batched-tokens": mbt_target,
|
||||
"max-num-seqs": mns_target,
|
||||
}
|
||||
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||
if signature not in tested_signatures:
|
||||
actions.append(
|
||||
_runtime_action(
|
||||
action_id="raise_mbt_and_max_num_seqs",
|
||||
knob_family="prefill-runtime-interaction",
|
||||
score=0.38
|
||||
+ _information_gain(bottleneck_hypotheses, "runtime"),
|
||||
patch=patch,
|
||||
hypothesis=(
|
||||
"Jointly raise max-num-batched-tokens and max-num-seqs to test "
|
||||
"whether prefill batching headroom and admission concurrency only "
|
||||
"help when adjusted together."
|
||||
),
|
||||
expected_effects=[
|
||||
"preserve the incumbent topology while changing coupled prefill runtime limits",
|
||||
"confirm whether separate MBT or sequence-cap probes masked an interaction",
|
||||
],
|
||||
)
|
||||
)
|
||||
@@ -1254,7 +1373,7 @@ def _runtime_candidate_actions(
|
||||
if "enable-chunked-prefill" in tunable and top_bottleneck == "ttft_prefill":
|
||||
current = bool(anchor_flags.get("enable-chunked-prefill", False))
|
||||
if not current:
|
||||
patch = {**topology_patch, "enable-chunked-prefill": True}
|
||||
patch = {**runtime_base_patch, "enable-chunked-prefill": True}
|
||||
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||
if signature not in tested_signatures:
|
||||
actions.append(
|
||||
@@ -1273,6 +1392,37 @@ def _runtime_candidate_actions(
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
if (
|
||||
"gpu-memory-utilization" in tunable
|
||||
and topology_settled
|
||||
and top_bottleneck in {"decode_tpot", "admission_or_queueing"}
|
||||
):
|
||||
current_gmu = _parse_float_like(
|
||||
anchor_flags.get("gpu-memory-utilization"), default=0.9
|
||||
)
|
||||
if 0.0 < current_gmu < _GMU_SAFE_CEILING:
|
||||
target = round(min(_GMU_SAFE_CEILING, current_gmu + _GMU_STEP), 4)
|
||||
if target > current_gmu:
|
||||
patch = {**runtime_base_patch, "gpu-memory-utilization": target}
|
||||
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||
if signature not in tested_signatures:
|
||||
actions.append(
|
||||
_runtime_action(
|
||||
action_id="raise_gpu_memory_utilization",
|
||||
knob_family="gpu-memory-utilization",
|
||||
score=0.4 + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||
patch=patch,
|
||||
hypothesis=(
|
||||
"Raise gpu-memory-utilization to add KV-cache headroom so the "
|
||||
"decode-bound incumbent can sustain more concurrent decode."
|
||||
),
|
||||
expected_effects=[
|
||||
"add KV-cache blocks for higher decode concurrency on the incumbent topology",
|
||||
"reject if the higher memory target regresses request_rate_per_gpu or fails to launch",
|
||||
],
|
||||
)
|
||||
)
|
||||
return actions
|
||||
|
||||
|
||||
@@ -1422,6 +1572,18 @@ def _preserve_topology_patch(study: StudySpec, flags: dict[str, Any]) -> dict[st
|
||||
return patch
|
||||
|
||||
|
||||
def _preserve_runtime_patch(study: StudySpec, flags: dict[str, Any]) -> dict[str, Any]:
    """Collect the runtime flags in ``flags`` that differ from the study's base flags.

    Only keys listed in ``_RUNTIME_KEYS`` are considered, and only when the key
    is one of ``study.engine.tunable_flags`` and is present in ``flags``. A key
    is carried into the patch when its value in ``flags`` is not equal to the
    value (or absence) recorded in ``study.engine.base_flags``.
    """
    tunable_flags = set(study.engine.tunable_flags)
    base_flags = study.engine.base_flags
    return {
        key: flags[key]
        for key in _RUNTIME_KEYS
        if key in tunable_flags
        and key in flags
        and flags[key] != base_flags.get(key)
    }
|
||||
|
||||
|
||||
def _normalized_topology_flags(flags: dict[str, Any]) -> dict[str, Any]:
|
||||
return {
|
||||
"tensor-parallel-size": _parse_int_like(
|
||||
@@ -1696,11 +1858,15 @@ def _runtime_refinement_proposal(
|
||||
best_flags = best_patch.get("flag_patch")
|
||||
if not isinstance(best_flags, dict):
|
||||
best_flags = {}
|
||||
best_tp = _parse_int_like(best_flags.get("tensor-parallel-size"), default=1)
|
||||
best_effective_flags = _effective_flags_for_item(study, best)
|
||||
best_tp = _parse_int_like(best_effective_flags.get("tensor-parallel-size"), default=1)
|
||||
if best_tp <= 1:
|
||||
return default
|
||||
tunable = set(study.engine.tunable_flags)
|
||||
flag_patch: dict[str, Any] = {"tensor-parallel-size": best_tp}
|
||||
flag_patch = {
|
||||
**_preserve_topology_patch(study, best_effective_flags),
|
||||
**_preserve_runtime_patch(study, best_effective_flags),
|
||||
}
|
||||
if "enable-chunked-prefill" in tunable:
|
||||
flag_patch["enable-chunked-prefill"] = True
|
||||
if "max-num-batched-tokens" not in tunable:
|
||||
@@ -1735,7 +1901,7 @@ def _runtime_refinement_proposal(
|
||||
"config_patch": {"env_patch": {}, "flag_patch": flag_patch},
|
||||
"expected_effects": [
|
||||
"preserve the incumbent topology",
|
||||
"increase batching headroom without also raising memory pressure",
|
||||
"increase batching headroom without dropping measured runtime gains",
|
||||
],
|
||||
"incumbent_trial_id": best.get("trial_id"),
|
||||
}
|
||||
@@ -1877,6 +2043,10 @@ def _validation_exhausted_guard(
|
||||
}
|
||||
if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
|
||||
return default
|
||||
state_completed = _state_completed_trials_with_rates(state)
|
||||
if state_completed:
|
||||
baseline_rate = float(state_completed[0].best_request_rate_per_gpu)
|
||||
else:
|
||||
completed = [
|
||||
item
|
||||
for item in recent_diagnostics
|
||||
@@ -1923,12 +2093,18 @@ def _validation_exhausted_guard(
|
||||
"incumbent_gain_vs_baseline": gain,
|
||||
"validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
|
||||
}
|
||||
if any(isinstance(item.get("best_request_rate_per_gpu"), (int, float)) for item in after_best):
|
||||
improving_trials = [
|
||||
item
|
||||
for item in after_best
|
||||
if isinstance(item.get("best_request_rate_per_gpu"), (int, float))
|
||||
and float(item["best_request_rate_per_gpu"]) > incumbent_rate
|
||||
]
|
||||
if improving_trials:
|
||||
return {
|
||||
**default,
|
||||
"reason": "post_incumbent_validation_found_feasible_candidate",
|
||||
"reason": "post_incumbent_validation_found_improving_candidate",
|
||||
"incumbent_gain_vs_baseline": gain,
|
||||
"validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
|
||||
"validation_trial_ids": [str(item.get("trial_id")) for item in improving_trials],
|
||||
}
|
||||
|
||||
families: set[str] = set()
|
||||
@@ -1954,7 +2130,7 @@ def _validation_exhausted_guard(
|
||||
"reason": "post_incumbent_validation_exhausted",
|
||||
"summary": (
|
||||
"A strong incumbent was followed by validation probes across nearby "
|
||||
"topology/runtime families, and none produced a feasible candidate."
|
||||
"topology/runtime families, and none improved request_rate_per_gpu."
|
||||
),
|
||||
"incumbent_trial_id": state.best_trial_id,
|
||||
"incumbent_gain_vs_baseline": gain,
|
||||
@@ -1995,16 +2171,11 @@ def _strong_incumbent_guard(
|
||||
}
|
||||
if state.best_trial_id is None or state.best_request_rate_per_gpu is None:
|
||||
return default
|
||||
completed = [
|
||||
item
|
||||
for item in recent_diagnostics
|
||||
if item.get("status") == "completed"
|
||||
and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
|
||||
]
|
||||
completed = _state_completed_trials_with_rates(state)
|
||||
if len(completed) < 2:
|
||||
return default
|
||||
baseline = completed[0]
|
||||
baseline_rate = float(baseline["best_request_rate_per_gpu"])
|
||||
baseline_rate = float(baseline.best_request_rate_per_gpu)
|
||||
incumbent_rate = float(state.best_request_rate_per_gpu)
|
||||
if baseline_rate <= 0:
|
||||
return default
|
||||
@@ -2014,7 +2185,7 @@ def _strong_incumbent_guard(
|
||||
return {
|
||||
"guard_active": True,
|
||||
"reason": "incumbent_exceeds_baseline_by_1_8x_and_latest_trial_is_best_enter_validation_phase",
|
||||
"baseline_trial_id": baseline.get("trial_id"),
|
||||
"baseline_trial_id": baseline.trial_id,
|
||||
"baseline_request_rate_per_gpu": baseline_rate,
|
||||
"incumbent_gain_vs_baseline": gain,
|
||||
"recommended_next_action": (
|
||||
@@ -2024,7 +2195,7 @@ def _strong_incumbent_guard(
|
||||
}
|
||||
return {
|
||||
**default,
|
||||
"baseline_trial_id": baseline.get("trial_id"),
|
||||
"baseline_trial_id": baseline.trial_id,
|
||||
"baseline_request_rate_per_gpu": baseline_rate,
|
||||
"incumbent_gain_vs_baseline": gain,
|
||||
"reason": "need_more_evidence_before_strong_incumbent_stop",
|
||||
@@ -2252,6 +2423,19 @@ def _parse_int_like(value: Any, *, default: int) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def _parse_float_like(value: Any, *, default: float) -> float:
|
||||
if value is None or isinstance(value, bool):
|
||||
return default
|
||||
if isinstance(value, (int, float)):
|
||||
return float(value)
|
||||
if isinstance(value, str) and value.strip():
|
||||
try:
|
||||
return float(value.strip())
|
||||
except ValueError:
|
||||
return default
|
||||
return default
|
||||
|
||||
|
||||
def _config_signature(config_patch: Any) -> str:
|
||||
if not isinstance(config_patch, dict):
|
||||
config_patch = {}
|
||||
|
||||
581
src/aituner/tuning_report.py
Normal file
581
src/aituner/tuning_report.py
Normal file
@@ -0,0 +1,581 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .spec import SpecError, load_structured_file
|
||||
from .store import StudyStore
|
||||
|
||||
|
||||
DEFAULT_BUDGETS = [1, 2, 3, 4, 6, 8, 12]
|
||||
DEFAULT_TARGET_FRACTION = 0.95
|
||||
DEFAULT_MIN_FINAL_RATIO = 0.98
|
||||
|
||||
|
||||
def run_tuning_report(spec_path: Path) -> dict[str, Any]:
    """Build the tuning report described by ``spec_path`` and write it to disk.

    The spec is loaded and validated, every case is summarized, and the
    resulting summary is written as ``summary.json`` and ``report.md`` under
    the resolved report root (created if missing).

    Returns:
        The summary dict that was written to ``summary.json``.
    """
    resolved_spec_path = spec_path.resolve()
    report_spec = _load_report_spec(resolved_spec_path)
    output_dir = _resolve_output_root(report_spec, spec_path=resolved_spec_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    case_summaries: list[dict[str, Any]] = []
    for case_spec in report_spec["cases"]:
        case_summaries.append(_summarize_case(case_spec, spec_path=resolved_spec_path))

    summary: dict[str, Any] = {
        "report_id": report_spec["report_id"],
        "report_root": str(output_dir),
        "target_fraction": report_spec["target_fraction"],
        "min_final_ratio": report_spec["min_final_ratio"],
        "cases": case_summaries,
        "aggregate": _aggregate_cases(case_summaries),
    }

    StudyStore.write_json(output_dir / "summary.json", summary)
    markdown = _render_report(summary)
    (output_dir / "report.md").write_text(markdown, encoding="utf-8")
    return summary
|
||||
|
||||
|
||||
def _load_report_spec(path: Path) -> dict[str, Any]:
    """Load and validate the report specification stored at ``path``.

    Returns:
        A normalized dict with ``report_id``, ``output_root`` (``None`` when
        absent or blank), ``target_fraction``, ``min_final_ratio`` and the
        list of cases produced by ``_load_case``.

    Raises:
        SpecError: if ``report_id`` is empty, ``cases`` is not a non-empty
            list, or either threshold is not positive.
    """
    document = dict(load_structured_file(path))

    report_id = str(document.get("report_id") or "").strip()
    if report_id == "":
        raise SpecError("report_id must be a non-empty string.")

    case_entries = document.get("cases")
    if not (isinstance(case_entries, list) and case_entries):
        raise SpecError("cases must be a non-empty list.")

    target_fraction = _as_float(
        document.get("target_fraction"), default=DEFAULT_TARGET_FRACTION
    )
    if target_fraction <= 0:
        raise SpecError("target_fraction must be positive.")

    min_final_ratio = _as_float(
        document.get("min_final_ratio"), default=DEFAULT_MIN_FINAL_RATIO
    )
    if min_final_ratio <= 0:
        raise SpecError("min_final_ratio must be positive.")

    # Report-level thresholds act as defaults for cases that do not override them.
    loaded_cases: list[dict[str, Any]] = []
    for position, entry in enumerate(case_entries):
        loaded_cases.append(
            _load_case(
                entry,
                idx=position,
                default_target_fraction=target_fraction,
                default_min_final_ratio=min_final_ratio,
            )
        )

    output_root = str(document.get("output_root") or "").strip()
    return {
        "report_id": report_id,
        "output_root": output_root or None,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "cases": loaded_cases,
    }
|
||||
|
||||
|
||||
def _load_case(
    raw: Any,
    *,
    idx: int,
    default_target_fraction: float,
    default_min_final_ratio: float,
) -> dict[str, Any]:
    """Validate one entry of the report's ``cases`` list and normalize it.

    Args:
        raw: The case entry as loaded from the spec file.
        idx: Position of the entry in ``cases``; used only in error messages.
        default_target_fraction: Used when the case has no ``target_fraction``.
        default_min_final_ratio: Used when the case has no ``min_final_ratio``.

    Returns:
        A dict with ``case_id``, ``description``, ``tags``, sorted unique
        ``budgets``, ``target_fraction``, ``min_final_ratio`` and ``arms``.

    Raises:
        SpecError: if the entry is not an object, ``case_id`` is empty,
            ``arms`` or ``budgets`` is not a non-empty list, arm names
            repeat, or a per-case threshold is not positive.
    """
    if not isinstance(raw, dict):
        raise SpecError(f"cases[{idx}] must be an object.")
    case_id = str(raw.get("case_id") or "").strip()
    if not case_id:
        raise SpecError(f"cases[{idx}].case_id must be a non-empty string.")

    raw_arms = raw.get("arms")
    if not isinstance(raw_arms, list) or not raw_arms:
        raise SpecError(f"cases[{idx}].arms must be a non-empty list.")
    arms = [
        _load_arm(item, context=f"cases[{idx}].arms[{arm_idx}]")
        for arm_idx, item in enumerate(raw_arms)
    ]
    names = [item["name"] for item in arms]
    if len(names) != len(set(names)):
        raise SpecError(f"cases[{idx}].arms names must be unique.")

    raw_budgets = raw.get("budgets", DEFAULT_BUDGETS)
    if not isinstance(raw_budgets, list) or not raw_budgets:
        raise SpecError(f"cases[{idx}].budgets must be a non-empty list.")
    budgets = sorted(
        {_positive_int(item, context=f"cases[{idx}].budgets") for item in raw_budgets}
    )

    # Per-case overrides get the same positivity check as the report-level
    # defaults, so a zero or negative override cannot silently produce a
    # meaningless target or pass threshold.
    target_fraction = _as_float(raw.get("target_fraction"), default=default_target_fraction)
    if target_fraction <= 0:
        raise SpecError(f"cases[{idx}].target_fraction must be positive.")
    min_final_ratio = _as_float(raw.get("min_final_ratio"), default=default_min_final_ratio)
    if min_final_ratio <= 0:
        raise SpecError(f"cases[{idx}].min_final_ratio must be positive.")

    # A non-list ``tags`` value is ignored; blank tags are dropped.
    raw_tags = raw.get("tags", [])
    tags = (
        [str(item).strip() for item in raw_tags if str(item).strip()]
        if isinstance(raw_tags, list)
        else []
    )

    return {
        "case_id": case_id,
        "description": str(raw.get("description") or "").strip(),
        "tags": tags,
        "budgets": budgets,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "arms": arms,
    }
|
||||
|
||||
|
||||
def _load_arm(raw: Any, *, context: str) -> dict[str, Any]:
|
||||
if not isinstance(raw, dict):
|
||||
raise SpecError(f"{context} must be an object.")
|
||||
name = str(raw.get("name") or "").strip()
|
||||
if not name:
|
||||
raise SpecError(f"{context}.name must be a non-empty string.")
|
||||
kind = str(raw.get("kind") or name).strip()
|
||||
study_root = str(raw.get("study_root") or "").strip()
|
||||
if not study_root:
|
||||
raise SpecError(f"{context}.study_root must be a non-empty string.")
|
||||
return {
|
||||
"name": name,
|
||||
"kind": kind,
|
||||
"study_root": study_root,
|
||||
"label": str(raw.get("label") or "").strip() or name,
|
||||
}
|
||||
|
||||
|
||||
def _resolve_output_root(spec: dict[str, Any], *, spec_path: Path) -> Path:
|
||||
raw = spec.get("output_root")
|
||||
if raw:
|
||||
return _resolve_path(str(raw), base_dir=spec_path.parent)
|
||||
return (Path(".aituner-reports") / str(spec["report_id"])).resolve()
|
||||
|
||||
|
||||
def _summarize_case(case: dict[str, Any], *, spec_path: Path) -> dict[str, Any]:
    """Summarize every arm of ``case`` and compare them against each other.

    Each arm is summarized from its study directory, then annotated with
    metrics relative to the best final value across the case's arms. The
    returned dict carries the case metadata, the arm summaries, the winners,
    the harness-versus-naive comparison rows and any warnings.
    """
    arm_summaries = [
        _summarize_arm(arm_spec, budgets=case["budgets"], spec_path=spec_path)
        for arm_spec in case["arms"]
    ]
    reference = _reference_best(arm_summaries)

    # The AUC horizon covers both the requested budgets and the longest study.
    trial_counts = [summary["trial_count"] for summary in arm_summaries]
    max_budget = max([*case["budgets"], *trial_counts])

    for summary in arm_summaries:
        _add_reference_metrics(
            summary,
            reference=reference,
            max_budget=max_budget,
            target_fraction=case["target_fraction"],
        )

    winners = _case_winners(arm_summaries)
    comparison = _harness_vs_naive(arm_summaries, min_final_ratio=case["min_final_ratio"])

    result: dict[str, Any] = {
        key: case[key]
        for key in (
            "case_id",
            "description",
            "tags",
            "budgets",
            "target_fraction",
            "min_final_ratio",
        )
    }
    result["reference_best_per_gpu"] = reference
    result["max_budget"] = max_budget
    result["arms"] = arm_summaries
    result["winners"] = winners
    result["harness_vs_naive"] = comparison
    result["warnings"] = _case_warnings(case, arm_summaries, comparison)
    return result
|
||||
|
||||
|
||||
def _summarize_arm(arm: dict[str, Any], *, budgets: list[int], spec_path: Path) -> dict[str, Any]:
    """Summarize one arm from the ``state.json`` of its study directory.

    Args:
        arm: Normalized arm entry (``name``, ``kind``, ``label``, ``study_root``).
        budgets: Trial budgets at which the running best value is reported.
        spec_path: Path of the report spec; relative study roots are resolved
            against its parent directory.

    Returns:
        A dict with the arm identity, trial counts, the running-best curve,
        the best value at each budget, and the recorded stop reason.

    Raises:
        SpecError: if the study root cannot be resolved, or ``state.json``
            does not contain a JSON object.
    """
    study_root = _resolve_study_root(arm["study_root"], base_dir=spec_path.parent)
    state_path = study_root / "state.json"
    state = json.loads(state_path.read_text(encoding="utf-8"))
    if not isinstance(state, dict):
        raise SpecError(f"state.json must contain a JSON object: {state_path}")

    raw_trials = state.get("trials")
    trials = raw_trials if isinstance(raw_trials, list) else []
    # Malformed (non-object) trial entries keep their slot, so trial indices
    # and counts stay aligned with the curve, but they are read as empty
    # records. This matches how _running_best_curve treats them.
    records = [item if isinstance(item, dict) else {} for item in trials]

    curve = _running_best_curve(trials)
    final_best = curve[-1] if curve else None
    best_trial_index = _first_index_at_value(curve, final_best)
    return {
        "name": arm["name"],
        "kind": arm["kind"],
        "label": arm["label"],
        "study_root": str(study_root),
        "study_id": state.get("study_id"),
        "trial_count": len(trials),
        "completed_count": sum(1 for item in records if item.get("status") == "completed"),
        "failed_count": sum(1 for item in records if item.get("status") == "failed"),
        "no_feasible_count": sum(
            1
            for item in records
            if not isinstance(item.get("best_request_rate_per_gpu"), (int, float))
        ),
        "best_trial_id": state.get("best_trial_id"),
        "best_trial_index": best_trial_index,
        "final_best_per_gpu": final_best,
        "state_best_per_gpu": state.get("best_request_rate_per_gpu"),
        "best_at_budget": {str(budget): _value_at_budget(curve, budget) for budget in budgets},
        "running_best_per_gpu": curve,
        "stop_reason": str(state.get("tuning_stop_reason") or ""),
        "stop_diagnosis": str(state.get("tuning_stop_diagnosis") or ""),
    }
|
||||
|
||||
|
||||
def _add_reference_metrics(
    arm: dict[str, Any],
    *,
    reference: float | None,
    max_budget: int,
    target_fraction: float,
) -> None:
    """Annotate ``arm`` in place with metrics relative to ``reference``.

    Adds ``final_ratio_to_reference``, ``target_per_gpu``,
    ``trials_to_target`` and ``normalized_auc``. The ratio and the target are
    ``None`` when ``reference`` is missing or zero.
    """
    final_best = arm.get("final_best_per_gpu")
    if reference and isinstance(final_best, (int, float)):
        ratio = float(final_best) / reference
    else:
        ratio = None
    arm["final_ratio_to_reference"] = ratio

    if reference:
        target = reference * target_fraction
    else:
        target = None
    arm["target_per_gpu"] = target

    running_best = arm["running_best_per_gpu"]
    arm["trials_to_target"] = _trials_to_target(running_best, target)
    arm["normalized_auc"] = _normalized_auc(
        running_best,
        reference=reference,
        max_budget=max_budget,
    )
|
||||
|
||||
|
||||
def _harness_vs_naive(arms: list[dict[str, Any]], *, min_final_ratio: float) -> list[dict[str, Any]]:
|
||||
naive = [arm for arm in arms if arm["kind"] == "naive"]
|
||||
harnesses = [arm for arm in arms if arm["kind"] == "harness"]
|
||||
if not naive or not harnesses:
|
||||
return []
|
||||
best_naive_final = _max_optional(arm.get("final_best_per_gpu") for arm in naive)
|
||||
best_naive_ttt = _min_optional(arm.get("trials_to_target") for arm in naive)
|
||||
best_naive_auc = _max_optional(arm.get("normalized_auc") for arm in naive)
|
||||
rows = []
|
||||
for harness in harnesses:
|
||||
final = harness.get("final_best_per_gpu")
|
||||
ttt = harness.get("trials_to_target")
|
||||
auc = harness.get("normalized_auc")
|
||||
final_ratio = (
|
||||
float(final) / best_naive_final
|
||||
if best_naive_final and isinstance(final, (int, float))
|
||||
else None
|
||||
)
|
||||
auc_ratio = (
|
||||
float(auc) / best_naive_auc
|
||||
if best_naive_auc and isinstance(auc, (int, float))
|
||||
else None
|
||||
)
|
||||
speedup = _speedup(best_naive_ttt, ttt)
|
||||
pass_final = final_ratio is not None and final_ratio >= min_final_ratio
|
||||
pass_speed = speedup is None or speedup >= 1.0
|
||||
rows.append(
|
||||
{
|
||||
"harness": harness["name"],
|
||||
"best_naive_final_per_gpu": best_naive_final,
|
||||
"best_naive_trials_to_target": best_naive_ttt,
|
||||
"best_naive_normalized_auc": best_naive_auc,
|
||||
"final_ratio_vs_best_naive": final_ratio,
|
||||
"target_trial_speedup_vs_best_naive": speedup,
|
||||
"auc_ratio_vs_best_naive": auc_ratio,
|
||||
"passes_min_final_ratio": pass_final,
|
||||
"passes_speed": pass_speed,
|
||||
"passes": pass_final and pass_speed,
|
||||
}
|
||||
)
|
||||
return rows
|
||||
|
||||
|
||||
def _case_winners(arms: list[dict[str, Any]]) -> dict[str, str | None]:
    """Name the winning arm of a case for each headline metric.

    The winners are chosen by ``_argmax`` on ``final_best_per_gpu`` and
    ``normalized_auc`` and by ``_argmin`` on ``trials_to_target``.
    """
    winners: dict[str, str | None] = {}
    winners["final_best"] = _argmax(arms, "final_best_per_gpu")
    winners["fastest_to_target"] = _argmin(arms, "trials_to_target")
    winners["normalized_auc"] = _argmax(arms, "normalized_auc")
    return winners
|
||||
|
||||
|
||||
def _aggregate_cases(cases: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
by_kind: dict[str, dict[str, Any]] = {}
|
||||
final_wins: dict[str, int] = {}
|
||||
speed_wins: dict[str, int] = {}
|
||||
auc_wins: dict[str, int] = {}
|
||||
harness_passes = 0
|
||||
harness_checks = 0
|
||||
for case in cases:
|
||||
for winner_key, target in (
|
||||
("final_best", final_wins),
|
||||
("fastest_to_target", speed_wins),
|
||||
("normalized_auc", auc_wins),
|
||||
):
|
||||
winner = case["winners"].get(winner_key)
|
||||
if winner:
|
||||
target[winner] = target.get(winner, 0) + 1
|
||||
for row in case["harness_vs_naive"]:
|
||||
harness_checks += 1
|
||||
if row["passes"]:
|
||||
harness_passes += 1
|
||||
for arm in case["arms"]:
|
||||
bucket = by_kind.setdefault(
|
||||
arm["kind"],
|
||||
{
|
||||
"arm_count": 0,
|
||||
"mean_final_ratio_to_reference": None,
|
||||
"mean_normalized_auc": None,
|
||||
"target_reached_count": 0,
|
||||
"_final_ratios": [],
|
||||
"_aucs": [],
|
||||
},
|
||||
)
|
||||
bucket["arm_count"] += 1
|
||||
if isinstance(arm.get("final_ratio_to_reference"), (int, float)):
|
||||
bucket["_final_ratios"].append(float(arm["final_ratio_to_reference"]))
|
||||
if isinstance(arm.get("normalized_auc"), (int, float)):
|
||||
bucket["_aucs"].append(float(arm["normalized_auc"]))
|
||||
if isinstance(arm.get("trials_to_target"), int):
|
||||
bucket["target_reached_count"] += 1
|
||||
for bucket in by_kind.values():
|
||||
ratios = bucket.pop("_final_ratios")
|
||||
aucs = bucket.pop("_aucs")
|
||||
bucket["mean_final_ratio_to_reference"] = _mean(ratios)
|
||||
bucket["mean_normalized_auc"] = _mean(aucs)
|
||||
return {
|
||||
"case_count": len(cases),
|
||||
"by_kind": by_kind,
|
||||
"winner_counts": {
|
||||
"final_best": final_wins,
|
||||
"fastest_to_target": speed_wins,
|
||||
"normalized_auc": auc_wins,
|
||||
},
|
||||
"harness_vs_naive_pass_count": harness_passes,
|
||||
"harness_vs_naive_check_count": harness_checks,
|
||||
}
|
||||
|
||||
|
||||
def _case_warnings(
|
||||
case: dict[str, Any],
|
||||
arms: list[dict[str, Any]],
|
||||
comparison: list[dict[str, Any]],
|
||||
) -> list[str]:
|
||||
warnings = []
|
||||
kinds = {arm["kind"] for arm in arms}
|
||||
if "harness" not in kinds or "naive" not in kinds:
|
||||
warnings.append("case does not include both harness and naive arms")
|
||||
if len(case["tags"]) < 2:
|
||||
warnings.append("case has few tags; add workload/model/SLO tags to support generalization claims")
|
||||
if not comparison:
|
||||
return warnings
|
||||
for row in comparison:
|
||||
if not row["passes_min_final_ratio"]:
|
||||
warnings.append(
|
||||
f"{row['harness']} final best is below min_final_ratio versus best naive"
|
||||
)
|
||||
if not row["passes_speed"]:
|
||||
warnings.append(
|
||||
f"{row['harness']} reaches target later than best naive"
|
||||
)
|
||||
return warnings
|
||||
|
||||
|
||||
def _running_best_curve(trials: list[Any]) -> list[float | None]:
|
||||
curve: list[float | None] = []
|
||||
incumbent: float | None = None
|
||||
for trial in trials:
|
||||
rate = trial.get("best_request_rate_per_gpu") if isinstance(trial, dict) else None
|
||||
if isinstance(rate, (int, float)) and (incumbent is None or float(rate) > incumbent):
|
||||
incumbent = float(rate)
|
||||
curve.append(incumbent)
|
||||
return curve
|
||||
|
||||
|
||||
def _value_at_budget(curve: list[float | None], budget: int) -> float | None:
|
||||
if not curve:
|
||||
return None
|
||||
index = min(max(budget, 1), len(curve)) - 1
|
||||
return curve[index]
|
||||
|
||||
|
||||
def _trials_to_target(curve: list[float | None], target: float | None) -> int | None:
|
||||
if target is None:
|
||||
return None
|
||||
for idx, value in enumerate(curve, start=1):
|
||||
if isinstance(value, (int, float)) and value >= target:
|
||||
return idx
|
||||
return None
|
||||
|
||||
|
||||
def _normalized_auc(
    curve: list[float | None],
    *,
    reference: float | None,
    max_budget: int,
) -> float | None:
    """Return the area under ``curve`` over budgets ``1..max_budget``, normalized.

    The value at each budget comes from ``_value_at_budget``; non-numeric
    values contribute nothing. The sum is divided by
    ``reference * max_budget``. Returns ``None`` when ``reference`` is falsy
    (``None`` or zero) or ``max_budget`` is not positive.
    """
    if not reference or max_budget <= 0:
        return None
    area = 0.0
    for step in range(1, max_budget + 1):
        level = _value_at_budget(curve, step)
        if isinstance(level, (int, float)):
            area += float(level)
    return area / (reference * max_budget)
|
||||
|
||||
|
||||
def _reference_best(arms: list[dict[str, Any]]) -> float | None:
    """Return the largest numeric ``final_best_per_gpu`` across ``arms``.

    Delegates to ``_max_optional``, so the result is ``None`` when no arm
    has a numeric value.
    """
    final_bests = (arm.get("final_best_per_gpu") for arm in arms)
    return _max_optional(final_bests)
|
||||
|
||||
|
||||
def _resolve_study_root(raw_path: str, *, base_dir: Path) -> Path:
    """Locate the directory that holds a study's ``state.json``.

    ``raw_path`` is resolved via ``_resolve_path``. The resolved directory is
    returned when it contains ``state.json`` itself; otherwise exactly one
    immediate child directory must contain it.

    Raises:
        SpecError: If no ``state.json`` is found, or more than one child
            directory contains one.
    """
    root = _resolve_path(raw_path, base_dir=base_dir)
    if (root / "state.json").exists():
        return root
    candidates = sorted(root.glob("*/state.json"))
    if not candidates:
        raise SpecError(f"study_root does not contain state.json: {root}")
    if len(candidates) > 1:
        raise SpecError(f"study_root is ambiguous; point to a specific study directory: {root}")
    return candidates[0].parent
|
||||
|
||||
|
||||
def _resolve_path(raw_path: str, *, base_dir: Path) -> Path:
|
||||
path = Path(raw_path)
|
||||
if not path.is_absolute():
|
||||
path = (base_dir / path).resolve()
|
||||
return path
|
||||
|
||||
|
||||
def _as_float(value: Any, *, default: float) -> float:
|
||||
if value is None:
|
||||
return default
|
||||
if isinstance(value, bool) or not isinstance(value, (int, float)):
|
||||
raise SpecError(f"Expected numeric value, got {value!r}.")
|
||||
return float(value)
|
||||
|
||||
|
||||
def _positive_int(value: Any, *, context: str) -> int:
|
||||
if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
|
||||
raise SpecError(f"{context} must contain positive integers.")
|
||||
return value
|
||||
|
||||
|
||||
def _first_index_at_value(curve: list[float | None], value: float | None) -> int | None:
|
||||
if value is None:
|
||||
return None
|
||||
for idx, item in enumerate(curve, start=1):
|
||||
if item == value:
|
||||
return idx
|
||||
return None
|
||||
|
||||
|
||||
def _argmax(rows: list[dict[str, Any]], key: str) -> str | None:
|
||||
scored = [
|
||||
(str(row["name"]), float(row[key]))
|
||||
for row in rows
|
||||
if isinstance(row.get(key), (int, float))
|
||||
]
|
||||
if not scored:
|
||||
return None
|
||||
scored.sort(key=lambda item: item[1], reverse=True)
|
||||
return scored[0][0]
|
||||
|
||||
|
||||
def _argmin(rows: list[dict[str, Any]], key: str) -> str | None:
|
||||
scored = [
|
||||
(str(row["name"]), int(row[key]))
|
||||
for row in rows
|
||||
if isinstance(row.get(key), int)
|
||||
]
|
||||
if not scored:
|
||||
return None
|
||||
scored.sort(key=lambda item: item[1])
|
||||
return scored[0][0]
|
||||
|
||||
|
||||
def _max_optional(values: Any) -> float | None:
|
||||
scored = [float(item) for item in values if isinstance(item, (int, float))]
|
||||
return max(scored) if scored else None
|
||||
|
||||
|
||||
def _min_optional(values: Any) -> int | None:
|
||||
scored = [int(item) for item in values if isinstance(item, int)]
|
||||
return min(scored) if scored else None
|
||||
|
||||
|
||||
def _mean(values: list[float]) -> float | None:
|
||||
return sum(values) / len(values) if values else None
|
||||
|
||||
|
||||
def _speedup(naive_trials: int | None, harness_trials: int | None) -> float | None:
|
||||
if harness_trials is None:
|
||||
return 0.0 if naive_trials is not None else None
|
||||
if naive_trials is None:
|
||||
return None
|
||||
if harness_trials <= 0:
|
||||
return None
|
||||
return float(naive_trials) / float(harness_trials)
|
||||
|
||||
|
||||
def _fmt(value: Any) -> str:
|
||||
if isinstance(value, float):
|
||||
return f"{value:.4f}"
|
||||
if value is None:
|
||||
return "-"
|
||||
return str(value)
|
||||
|
||||
|
||||
def _render_report(summary: dict[str, Any]) -> str:
    """Render a benchmark summary as a Markdown report.

    Sections, in order: a title from ``summary["report_id"]``, an
    "Aggregate" bullet list, a "By Kind" table (kinds sorted by name), and
    one "Cases" subsection per entry in ``summary["cases"]`` with an arm
    table and, when present, a harness-vs-naive table.

    Returns:
        The report lines joined with newlines.
    """

    def as_row(cells: list[str]) -> str:
        # One Markdown table row from already-formatted cell strings.
        return "| " + " | ".join(cells) + " |"

    out = [f"# {summary['report_id']}", "", "## Aggregate", ""]
    aggregate = summary["aggregate"]
    out.append(f"- Cases: `{aggregate['case_count']}`")
    out.append(
        f"- Harness-vs-naive pass/checks: `{aggregate['harness_vs_naive_pass_count']}`/`{aggregate['harness_vs_naive_check_count']}`"
    )
    out.append(
        f"- Winner counts: `{json.dumps(aggregate['winner_counts'], ensure_ascii=False)}`"
    )
    out.extend(
        [
            "",
            "## By Kind",
            "",
            "| Kind | Arms | Mean final/ref | Mean AUC | Target reached |",
            "| --- | ---: | ---: | ---: | ---: |",
        ]
    )
    for kind, payload in sorted(aggregate["by_kind"].items()):
        out.append(
            as_row(
                [
                    f"`{kind}`",
                    str(payload["arm_count"]),
                    _fmt(payload["mean_final_ratio_to_reference"]),
                    _fmt(payload["mean_normalized_auc"]),
                    str(payload["target_reached_count"]),
                ]
            )
        )

    out.extend(["", "## Cases", ""])
    for case in summary["cases"]:
        out.append(f"### {case['case_id']}")
        out.append("")
        out.append(f"- Reference best req/s/GPU: `{_fmt(case['reference_best_per_gpu'])}`")
        out.append(f"- Target fraction: `{case['target_fraction']}`")
        out.append(f"- Winners: `{json.dumps(case['winners'], ensure_ascii=False)}`")
        if case["warnings"]:
            out.append(f"- Warnings: `{json.dumps(case['warnings'], ensure_ascii=False)}`")

        out.append("")
        out.append("| Arm | Kind | Trials | Final/GPU | Final/ref | TTT | AUC | Failed | No feasible |")
        out.append("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |")
        for arm in case["arms"]:
            out.append(
                as_row(
                    [
                        f"`{arm['name']}`",
                        f"`{arm['kind']}`",
                        str(arm["trial_count"]),
                        _fmt(arm["final_best_per_gpu"]),
                        _fmt(arm["final_ratio_to_reference"]),
                        _fmt(arm["trials_to_target"]),
                        _fmt(arm["normalized_auc"]),
                        str(arm["failed_count"]),
                        str(arm["no_feasible_count"]),
                    ]
                )
            )

        if case["harness_vs_naive"]:
            out.append("")
            out.append("| Harness | Final vs best naive | Target speedup | AUC vs best naive | Pass |")
            out.append("| --- | ---: | ---: | ---: | --- |")
            for row in case["harness_vs_naive"]:
                out.append(
                    as_row(
                        [
                            f"`{row['harness']}`",
                            _fmt(row["final_ratio_vs_best_naive"]),
                            _fmt(row["target_trial_speedup_vs_best_naive"]),
                            _fmt(row["auc_ratio_vs_best_naive"]),
                            f"`{row['passes']}`",
                        ]
                    )
                )
        # NOTE(review): the reviewed copy had lost its indentation; this blank
        # separator is assumed to be emitted once per case -- confirm.
        out.append("")
    return "\n".join(out)
|
||||
@@ -18,7 +18,7 @@ from .engine import build_launch_recipe
|
||||
from .http_client import HttpClientError, stream_chat_completion, wait_for_server
|
||||
from .lca import find_convergence_prefix, resolve_length_mode
|
||||
from .search import ThresholdProbe, binary_search_max_feasible
|
||||
from .slo import RequestOutcome, evaluate_request, summarize_evaluations
|
||||
from .slo import RequestOutcome, _rule_threshold_ms, evaluate_request, summarize_evaluations
|
||||
from .spec import ConfigPatch, SamplingSearchSpec, TrialSpec, load_study_spec, to_jsonable
|
||||
from .trace import TraceRequest, load_trace_requests, select_requests_for_threshold
|
||||
|
||||
@@ -254,6 +254,34 @@ def _ignore_sigterm_if_main() -> None:
|
||||
pass
|
||||
|
||||
|
||||
def _probe_drain_deadline(
    reqs: list[TraceRequest], slo: Any, *, ceiling: float | None
) -> float | None:
    """Compute a per-probe drain deadline in seconds from the admitted requests.

    The deadline is the latest ``arrival_s`` among ``reqs`` plus the TTFT
    threshold, plus the p99 ``completion_tokens_hint`` times the TPOT
    threshold, plus a fixed 30 s margin. Both thresholds are evaluated via
    ``_rule_threshold_ms`` at the p99 ``prompt_tokens_hint`` and converted
    from milliseconds. Missing arrival times and token hints count as 0.

    Args:
        reqs: Requests admitted for this probe.
        slo: Object exposing ``tpot_rule`` and ``ttft_rule`` (either may be
            ``None``; a missing TTFT rule contributes 0 ms).
        ceiling: Upper bound for the result. A falsy ceiling (``None`` or 0)
            leaves the computed deadline uncapped.

    Returns:
        ``ceiling`` unchanged when ``reqs`` is empty or there is no TPOT rule;
        otherwise the computed deadline, capped at ``ceiling`` when one is set.
    """
    if not reqs or slo.tpot_rule is None:
        return ceiling

    def percentile_99(ordered: list[int]) -> int:
        # Index-based p99 over an ascending list, clamped to the last item.
        if not ordered:
            return 0
        return ordered[min(len(ordered) - 1, int(0.99 * len(ordered)))]

    final_arrival_s = max(float(r.arrival_s or 0.0) for r in reqs)
    prompt_lengths = sorted(int(r.prompt_tokens_hint or 0) for r in reqs)
    output_lengths = sorted(int(r.completion_tokens_hint or 0) for r in reqs)
    p99_prompt = percentile_99(prompt_lengths)
    p99_output = percentile_99(output_lengths)

    tpot_budget_ms = _rule_threshold_ms(slo.tpot_rule, p99_prompt)
    if slo.ttft_rule is not None:
        ttft_budget_ms = _rule_threshold_ms(slo.ttft_rule, p99_prompt)
    else:
        ttft_budget_ms = 0.0

    slack_s = 30.0
    drain_s = (ttft_budget_ms + p99_output * tpot_budget_ms) / 1000.0
    deadline = final_arrival_s + drain_s + slack_s
    if not ceiling:
        return deadline
    return min(float(ceiling), deadline)
|
||||
|
||||
|
||||
def _adaptive_replay_set(
|
||||
selected: list[TraceRequest],
|
||||
*,
|
||||
@@ -640,7 +668,9 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
max_concurrency=study.trace.max_concurrency,
|
||||
target_pass_rate=study.slo.target_pass_rate,
|
||||
max_lag_s=study.trace.early_stop_max_lag_s,
|
||||
max_elapsed_s=study.trace.early_stop_max_elapsed_s,
|
||||
max_elapsed_s=_probe_drain_deadline(
|
||||
reqs, study.slo, ceiling=study.trace.early_stop_max_elapsed_s
|
||||
),
|
||||
evaluate_outcome=lambda outcome: evaluate_request(outcome, study.slo),
|
||||
drain_inflight_on_early_stop=not restart_after_early_stop,
|
||||
)
|
||||
@@ -751,9 +781,17 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
best = primary_search.best_feasible_payload
|
||||
best_source = "primary_search"
|
||||
fallback_search = None
|
||||
skipped_lower_range_fallback = False
|
||||
lower_range_fallback_skip_reason = ""
|
||||
original_search_low = float(study.search.low)
|
||||
inherited_search_floor = float(trial.search.low)
|
||||
if best is None and inherited_search_floor > original_search_low:
|
||||
if trial.search.inherit_incumbent_floor:
|
||||
skipped_lower_range_fallback = True
|
||||
lower_range_fallback_skip_reason = (
|
||||
"primary_search_above_incumbent_floor_all_infeasible"
|
||||
)
|
||||
else:
|
||||
fallback_search = binary_search_max_feasible(
|
||||
low=original_search_low,
|
||||
high=inherited_search_floor,
|
||||
@@ -796,7 +834,7 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
"best_request_count": best.request_count if best is not None else None,
|
||||
"probes": [serialize_probe(probe) for probe in all_probes],
|
||||
}
|
||||
if fallback_search is not None:
|
||||
if fallback_search is not None or skipped_lower_range_fallback:
|
||||
result["primary_search"] = {
|
||||
"low": inherited_search_floor,
|
||||
"high": trial.search.high,
|
||||
@@ -808,6 +846,16 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
else None,
|
||||
"probes": [serialize_probe(probe) for probe in primary_search.probes],
|
||||
}
|
||||
if skipped_lower_range_fallback:
|
||||
result["lower_range_fallback"] = {
|
||||
"triggered": False,
|
||||
"skipped": True,
|
||||
"reason": lower_range_fallback_skip_reason,
|
||||
"low": original_search_low,
|
||||
"high": inherited_search_floor,
|
||||
"probes": [],
|
||||
}
|
||||
if fallback_search is not None:
|
||||
result["lower_range_fallback"] = {
|
||||
"triggered": True,
|
||||
"low": original_search_low,
|
||||
|
||||
@@ -55,6 +55,7 @@ from aituner.store import StudyStore
|
||||
from aituner.trace import load_trace_requests, summarize_window
|
||||
from aituner.worker import (
|
||||
_adaptive_replay_set,
|
||||
_probe_drain_deadline,
|
||||
_install_sigterm_as_keyboardinterrupt,
|
||||
_restore_sigterm,
|
||||
_should_extend_on_boundary,
|
||||
@@ -535,6 +536,38 @@ class CoreFlowTests(unittest.TestCase):
|
||||
)
|
||||
)
|
||||
|
||||
def test_probe_drain_deadline_tracks_admitted_set_and_caps_at_ceiling(self) -> None:
    """The drain deadline follows the admitted requests and honors the ceiling."""
    slo_spec = {
        "target_pass_rate": 0.95,
        "ttft_rule": {"kind": "linear_ms", "intercept_ms": 4000, "per_token_ms": 0.125},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    slo = SloSpec.from_dict(slo_spec)

    def make_request(arrival_s: float, in_tok: int, out_tok: int) -> TraceRequest:
        return TraceRequest(
            row_id="r",
            arrival_s=arrival_s,
            sampling_u=0.1,
            body={},
            prompt_tokens_hint=in_tok,
            completion_tokens_hint=out_tok,
            metadata={},
        )

    # 100 requests spaced 5s apart: last arrival 495s, p99 in=8000 / out=2000.
    admitted = [make_request(float(i * 5), 8000, 2000) for i in range(100)]

    # deadline = last_arrival + (ttft_ms + p99_out * tpot_ms) / 1000 + margin
    #          = 495 + (5000 + 2000 * 50) / 1000 + 30 = 495 + 105 + 30 = 630
    uncapped = _probe_drain_deadline(admitted, slo, ceiling=1000.0)
    self.assertAlmostEqual(uncapped, 630.0, places=3)

    # A ceiling below the computed deadline wins.
    capped = _probe_drain_deadline(admitted, slo, ceiling=400.0)
    self.assertEqual(capped, 400.0)

    # With no requests the function falls back to the ceiling.
    empty = _probe_drain_deadline([], slo, ceiling=400.0)
    self.assertEqual(empty, 400.0)
|
||||
|
||||
def test_linear_ms_ttft_rule_scales_with_input_length(self) -> None:
|
||||
slo = SloSpec.from_dict(
|
||||
{
|
||||
@@ -965,6 +998,189 @@ class CoreFlowTests(unittest.TestCase):
|
||||
self.assertIsNotNone(proposal)
|
||||
self.assertTrue(proposal.should_stop)
|
||||
|
||||
def test_harness_stop_after_non_improving_feasible_validation_is_exhausted(self) -> None:
    """Two feasible but lower trials after the incumbent lead to a stop verdict.

    trial-0002 is the incumbent (0.3 req/s/GPU); trial-0003 and trial-0004
    complete with lower rates, and the harness is expected to report
    ``post_incumbent_validation_exhausted``.
    """

    def completed_trial(
        trial_id: str, rate: float, rate_per_gpu: float, flag_patch: dict
    ) -> TrialSummary:
        return TrialSummary(
            trial_id=trial_id,
            status="completed",
            parallel_size=8,
            best_request_rate=rate,
            best_request_rate_per_gpu=rate_per_gpu,
            config_patch={"env_patch": {}, "flag_patch": flag_patch},
        )

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study = load_study_spec(_write_study_assets(workdir))
        history = [
            completed_trial("trial-0001", 0.8, 0.1, {}),
            completed_trial(
                "trial-0002",
                2.4,
                0.3,
                {"tensor-parallel-size": 2, "data-parallel-size": 4},
            ),
            completed_trial(
                "trial-0003",
                2.0,
                0.25,
                {"tensor-parallel-size": 1, "data-parallel-size": 8},
            ),
            completed_trial("trial-0004", 2.1, 0.2625, {"max-num-seqs": 160}),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_sampling_u=0.02,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=history,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        verdict = context["harness_stop"]
        self.assertTrue(verdict["should_stop"])
        self.assertEqual(verdict["reason"], "post_incumbent_validation_exhausted")
|
||||
|
||||
def test_harness_validation_uses_full_state_baseline_when_history_window_moves(self) -> None:
    """A twelve-trial history still reports the gain against trial-0001.

    The incumbent is trial-0006 (0.3 req/s/GPU) and the first trial scored
    0.1, so ``incumbent_gain_vs_baseline`` must exceed 2.9. Trials 7-12 carry
    no measured rates; trial-0011 failed.
    """

    def trial(
        number: int,
        seqs: int | None,
        *,
        status: str = "completed",
        rates: tuple[float, float] | None = None,
    ) -> TrialSummary:
        # Rate fields are only passed when measured, so that unmeasured
        # trials keep TrialSummary's own defaults.
        measured = {}
        if rates is not None:
            measured["best_request_rate"] = rates[0]
            measured["best_request_rate_per_gpu"] = rates[1]
        flag_patch = {} if seqs is None else {"max-num-seqs": seqs}
        return TrialSummary(
            trial_id=f"trial-{number:04d}",
            status=status,
            parallel_size=8,
            config_patch={"env_patch": {}, "flag_patch": flag_patch},
            **measured,
        )

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study_path = _write_study_assets(
            workdir,
            engine_overrides={"tunable_flags": ["max-num-seqs"]},
        )
        study = load_study_spec(study_path)
        history = [
            trial(1, None, rates=(0.8, 0.1)),
            trial(2, 16, rates=(0.88, 0.11)),
            trial(3, 24, rates=(0.96, 0.12)),
            trial(4, 32, rates=(1.04, 0.13)),
            trial(5, 40, rates=(2.24, 0.28)),
            trial(6, 48, rates=(2.4, 0.3)),
            trial(7, 56),
            trial(8, 64),
            trial(9, 72),
            trial(10, 80),
            trial(11, 88, status="failed"),
            trial(12, 96),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0006",
            best_parallel_size=8,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=history,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        verdict = context["harness_stop"]
        self.assertTrue(verdict["should_stop"])
        self.assertEqual(verdict["reason"], "post_incumbent_validation_exhausted")
        self.assertGreater(verdict["evidence"]["incumbent_gain_vs_baseline"], 2.9)
|
||||
|
||||
def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
@@ -1285,6 +1501,305 @@ class CoreFlowTests(unittest.TestCase):
|
||||
},
|
||||
)
|
||||
|
||||
def test_harness_runtime_refinement_preserves_incumbent_runtime_knobs(self) -> None:
    """The guided proposal keeps the incumbent's flags and adds chunked prefill.

    The incumbent (trial-0002) runs TP4 with gpu-memory-utilization 0.92 and
    max-num-seqs 48; the expected proposal carries those three flags plus
    ``enable-chunked-prefill`` and ``max-num-batched-tokens``.
    """
    engine_overrides = {
        "tunable_flags": [
            "tensor-parallel-size",
            "gpu-memory-utilization",
            "max-num-seqs",
            "enable-chunked-prefill",
            "max-num-batched-tokens",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    feasible_probe = {
        "threshold": 0.098,
        "feasible": True,
        "payload": {
            "request_count": 100,
            "pass_rate": 0.97,
            "request_rate": 3.3,
            "early_stopped": False,
            "early_stop_reason": "",
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    incumbent_result = {
        "status": "completed",
        "best_sampling_u": 0.098,
        "best_request_rate": 3.3,
        "best_pass_rate": 0.97,
        "probes": [feasible_probe],
    }
    incumbent_flags = {
        "tensor-parallel-size": 4,
        "gpu-memory-utilization": 0.92,
        "max-num-seqs": 48,
    }

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study_path = _write_study_assets(workdir, engine_overrides=engine_overrides)
        study = load_study_spec(study_path)
        result_path = workdir / "trial-0002.json"
        result_path.write_text(json.dumps(incumbent_result), encoding="utf-8")

        baseline_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=2.5,
            best_request_rate_per_gpu=0.625,
            config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
        )
        incumbent_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            best_request_rate=3.3,
            best_request_rate_per_gpu=0.825,
            result_path=str(result_path),
            config_patch={"env_patch": {}, "flag_patch": incumbent_flags},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=3.3,
            best_request_rate_per_gpu=0.825,
            trials=[baseline_trial, incumbent_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p99": 8100},
            state=state,
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        expected_flags = {
            "tensor-parallel-size": 4,
            "gpu-memory-utilization": 0.92,
            "max-num-seqs": 48,
            "enable-chunked-prefill": True,
            "max-num-batched-tokens": 16384,
        }
        self.assertEqual(proposal.config_patch.flag_patch, expected_flags)
|
||||
|
||||
def test_harness_raises_gpu_mem_util_on_settled_decode_bound_incumbent(self) -> None:
    """Regression: a TP4 incumbent failing on TPOT gets a gpu-memory-utilization raise.

    The incumbent's infeasible probe fails on ``tpot_ms>50.0``. The harness
    must propose gpu-memory-utilization 0.92 (up from 0.9) while keeping TP4,
    and must not return a stop proposal while that knob is untried.
    """
    slo_overrides = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine_overrides = {
        "tunable_flags": [
            "tensor-parallel-size",
            "gpu-memory-utilization",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    passing_probe = {
        "threshold": 0.074,
        "feasible": True,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.97,
            "request_rate": 2.6,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    failing_probe = {
        "threshold": 0.09,
        "feasible": False,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.6,
            "request_rate": 3.2,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"tpot_ms>50.0": 90}},
        },
    }
    incumbent_result = {
        "status": "completed",
        "best_sampling_u": 0.074,
        "best_request_rate": 2.6,
        "best_pass_rate": 0.97,
        "probes": [passing_probe, failing_probe],
    }

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study_path = _write_study_assets(
            workdir,
            slo_overrides=slo_overrides,
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(study_path)
        result_path = workdir / "trial-0002.json"
        result_path.write_text(json.dumps(incumbent_result), encoding="utf-8")

        earlier_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=1.1,
            best_request_rate_per_gpu=0.275,
            config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
        )
        incumbent_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            best_request_rate=2.6,
            best_request_rate_per_gpu=0.65,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 4,
                    "gpu-memory-utilization": 0.9,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=2.6,
            best_request_rate_per_gpu=0.65,
            trials=[earlier_trial, incumbent_trial],
        )
        context = build_harness_context(
            study=study, window_summary={"prompt_tokens_p95": 1500}, state=state
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertFalse(proposal.should_stop)

        # TP4 is kept; gpu-memory-utilization moves one step (0.9 -> 0.92).
        proposed_flags = proposal.config_patch.flag_patch
        self.assertEqual(proposed_flags.get("tensor-parallel-size"), 4)
        self.assertEqual(proposed_flags.get("gpu-memory-utilization"), 0.92)

        # No stop may be authorized while that knob is still untried.
        self.assertIsNone(build_harness_stop_proposal(context))
|
||||
|
||||
def test_harness_climbs_tp_before_gpu_mem_util_micro_tuning(self) -> None:
    """An untried TP increase takes priority over gpu-memory-utilization tuning.

    The incumbent runs TP2 and TP4 is still allowed, so the proposal must set
    ``tensor-parallel-size`` to 4 and leave ``gpu-memory-utilization`` out.
    """
    slo_overrides = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine_overrides = {
        "tunable_flags": ["tensor-parallel-size", "gpu-memory-utilization"],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    passing_probe = {
        "threshold": 0.03,
        "feasible": True,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.97,
            "request_rate": 1.1,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    failing_probe = {
        "threshold": 0.05,
        "feasible": False,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.6,
            "request_rate": 1.6,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"tpot_ms>50.0": 90}},
        },
    }
    incumbent_result = {
        "status": "completed",
        "best_sampling_u": 0.03,
        "best_request_rate": 1.1,
        "best_pass_rate": 0.97,
        "probes": [passing_probe, failing_probe],
    }

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study_path = _write_study_assets(
            workdir,
            slo_overrides=slo_overrides,
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(study_path)
        result_path = workdir / "trial-0002.json"
        result_path.write_text(json.dumps(incumbent_result), encoding="utf-8")

        baseline_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=0.6,
            best_request_rate_per_gpu=0.6,
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        incumbent_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            best_request_rate=1.1,
            best_request_rate_per_gpu=0.55,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "gpu-memory-utilization": 0.9,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=1.1,
            best_request_rate_per_gpu=0.55,
            trials=[baseline_trial, incumbent_trial],
        )
        context = build_harness_context(
            study=study, window_summary={"prompt_tokens_p95": 1500}, state=state
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)

        # The proposal climbs TP to 4 and does not touch gpu-memory-utilization.
        proposed_flags = proposal.config_patch.flag_patch
        self.assertEqual(proposed_flags.get("tensor-parallel-size"), 4)
        self.assertNotIn("gpu-memory-utilization", proposed_flags)
|
||||
|
||||
def test_harness_validates_unmeasured_tp_frontier_before_runtime_refinement(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
@@ -1543,6 +2058,282 @@ class CoreFlowTests(unittest.TestCase):
|
||||
{"max-num-seqs": 32},
|
||||
)
|
||||
|
||||
def test_prefill_convergence_stop_waits_for_sequence_concurrency_probe(self) -> None:
    """The harness must not stop before a max-num-seqs probe has been planned.

    Three feasible TP8 trials share the same rate with TTFT failures, and one
    TP4xDP2 trial is infeasible. The expected next action raises
    ``max-num-seqs`` to 96 while keeping ``tensor-parallel-size`` at 8.
    """
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 4,
            "data-parallel-size": 1,
            "max-num-batched-tokens": 8192,
            "max-num-seqs": 64,
            "enable-chunked-prefill": True,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "max-num-batched-tokens",
            "max-num-seqs",
            "enable-chunked-prefill",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [4, 8],
            "allowed_data_parallel_sizes": [1, 2],
            "allowed_tp_dp_products": [4, 8],
        },
    }

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study_path = _write_study_assets(workdir, engine_overrides=engine_overrides)

        def write_result(name: str, best_rate: float | None, pass_rate: float) -> Path:
            # A missing best_rate marks the trial as having no feasible probe.
            feasible = best_rate is not None
            probe = {
                "threshold": 0.09375,
                "feasible": feasible,
                "payload": {
                    "request_rate": best_rate,
                    "pass_rate": pass_rate,
                    "early_stop_reason": "" if feasible else "slo_pass_rate_unrecoverable",
                    "latency_summary": {
                        "failed_reason_counts": {"ttft_ms>4000.0": 32}
                    },
                },
            }
            document = {
                "status": "completed",
                "best_sampling_u": 0.091796875 if feasible else None,
                "best_request_rate": best_rate,
                "best_pass_rate": pass_rate if feasible else None,
                "probes": [probe],
            }
            target = workdir / f"{name}.json"
            target.write_text(json.dumps(document), encoding="utf-8")
            return target

        study = load_study_spec(study_path)
        first_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            best_pass_rate=0.952,
            result_path=str(write_result("trial-0001", 2.303, 0.952)),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 8,
                    "data-parallel-size": 1,
                },
            },
        )
        second_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=8,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            best_pass_rate=0.953,
            result_path=str(write_result("trial-0002", 2.303, 0.953)),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 8,
                    "max-num-batched-tokens": 32768,
                },
            },
        )
        infeasible_trial = TrialSummary(
            trial_id="trial-0003",
            status="completed",
            parallel_size=8,
            result_path=str(write_result("trial-0003", None, 0.0)),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 4,
                    "data-parallel-size": 2,
                },
            },
        )
        fourth_trial = TrialSummary(
            trial_id="trial-0004",
            status="completed",
            parallel_size=8,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            best_pass_rate=0.954,
            result_path=str(write_result("trial-0004", 2.303, 0.954)),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 8,
                    "data-parallel-size": 1,
                    "max-num-batched-tokens": 12288,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_sampling_u=0.091796875,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            trials=[first_trial, second_trial, infeasible_trial, fourth_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000},
            state=state,
        )
        verdict = context["harness_stop"]
        self.assertFalse(verdict["should_stop"])
        self.assertEqual(verdict["reason"], "experiment_plan_has_high_value_candidate")

        action = context["experiment_plan"]["next_action"]
        self.assertEqual(action["knob_family"], "max-num-seqs")
        planned_flags = action["config_patch"]["flag_patch"]
        self.assertEqual(planned_flags["max-num-seqs"], 96)
        self.assertEqual(planned_flags["tensor-parallel-size"], 8)
|
||||
|
||||
def test_prefill_sequence_probe_followed_by_joint_runtime_probe(self) -> None:
    """Check that separate sequence and prefill probes lead to a joint probe.

    The recorded history holds three completed trials that all report the
    same request rate at parallel size 8: a TP=8/DP=1 run, a
    ``max-num-seqs`` 96 probe, and a ``max-num-batched-tokens`` 12288
    probe. Every result file lists TTFT failures. The harness is expected
    to keep going and to plan a ``prefill-runtime-interaction`` action that
    sets both knobs in one patch.
    """
    # Values shared by the study state, the trial summaries and the result files.
    request_rate = 2.303
    rate_per_gpu = 0.288
    sampling_u = 0.091796875

    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 4,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 8192,
                "max-num-seqs": 64,
                "enable-chunked-prefill": True,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [4, 8],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [4, 8],
            },
        }
        study_path = _write_study_assets(tmp_path, engine_overrides=engine_overrides)

        def write_result(name: str) -> Path:
            """Write a one-probe result file named after the trial and return its path."""
            probe = {
                "threshold": 0.09375,
                "feasible": True,
                "payload": {
                    "request_rate": request_rate,
                    "pass_rate": 0.951,
                    "latency_summary": {
                        "failed_reason_counts": {"ttft_ms>4000.0": 32}
                    },
                },
            }
            document = {
                "status": "completed",
                "best_sampling_u": sampling_u,
                "best_request_rate": request_rate,
                "best_pass_rate": 0.951,
                "probes": [probe],
            }
            target = tmp_path / f"{name}.json"
            target.write_text(json.dumps(document), encoding="utf-8")
            return target

        def completed_trial(trial_id, pass_rate, flag_patch):
            """Build a completed trial summary backed by a freshly written result file."""
            return TrialSummary(
                trial_id=trial_id,
                status="completed",
                parallel_size=8,
                best_request_rate=request_rate,
                best_request_rate_per_gpu=rate_per_gpu,
                best_pass_rate=pass_rate,
                result_path=str(write_result(trial_id)),
                config_patch={"env_patch": {}, "flag_patch": flag_patch},
            )

        study = load_study_spec(study_path)
        history = [
            completed_trial(
                "trial-0001",
                0.952,
                {"tensor-parallel-size": 8, "data-parallel-size": 1},
            ),
            completed_trial(
                "trial-0002",
                0.95,
                {"tensor-parallel-size": 8, "max-num-seqs": 96},
            ),
            completed_trial(
                "trial-0003",
                0.95,
                {
                    "tensor-parallel-size": 8,
                    "data-parallel-size": 1,
                    "max-num-batched-tokens": 12288,
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_sampling_u=sampling_u,
            best_request_rate=request_rate,
            best_request_rate_per_gpu=rate_per_gpu,
            trials=history,
        )

        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000},
            state=state,
        )

        stop_info = context["harness_stop"]
        self.assertFalse(stop_info["should_stop"])
        self.assertEqual(
            stop_info["reason"],
            "experiment_plan_has_high_value_candidate",
        )

        action = context["experiment_plan"]["next_action"]
        flag_patch = action["config_patch"]["flag_patch"]
        self.assertEqual(action["knob_family"], "prefill-runtime-interaction")
        self.assertEqual(flag_patch["tensor-parallel-size"], 8)
        self.assertEqual(flag_patch["max-num-batched-tokens"], 16384)
        self.assertEqual(flag_patch["max-num-seqs"], 96)
|
||||
|
||||
def test_slo_unrecoverable_does_not_mask_latency_bottleneck(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
@@ -3273,6 +4064,94 @@ class CoreFlowTests(unittest.TestCase):
|
||||
[0.25, 0.375],
|
||||
)
|
||||
|
||||
def test_run_trial_skips_fallback_below_incumbent_floor(self) -> None:
    """Verify the lower-range fallback is skipped when the incumbent floor is inherited.

    The study spec is rewritten to allow two probes and to set
    ``inherit_incumbent_floor``; the incumbent sits at ``best_sampling_u``
    0.5. Replay is faked so that every request reports 10000 ms TTFT and
    1000 ms TPOT. The test expects probes at 0.75 and 0.625, no best
    request rate, and a fallback section marked skipped and not triggered.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        study_path = _write_study_assets(tmp_path)

        # Patch the generated spec in place before loading it.
        spec = json.loads(study_path.read_text(encoding="utf-8"))
        spec["search"]["max_probes"] = 2
        spec["search"]["inherit_incumbent_floor"] = True
        study_path.write_text(json.dumps(spec), encoding="utf-8")

        study = load_study_spec(study_path)
        store = StudyStore(tmp_path / ".aituner" / "studies")
        store.init_study(spec_path=study_path, study=study)

        incumbent = {
            "trial_id": "trial-0001",
            "parallel_size": 1,
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_request_rate_per_gpu": 2.0,
        }
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=1,
            best_sampling_u=0.5,
            best_request_rate=2.0,
            best_request_rate_per_gpu=2.0,
            next_trial_index=2,
            best_by_parallel_size={"1": incumbent},
            trials=[],
        )
        proposal_fields = {
            "observation": "runtime patch",
            "diagnosis": "primary range all infeasible",
            "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 2}},
            "expected_effects": ["measure"],
        }
        proposal = Proposal.from_dict(proposal_fields)

        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
        self.assertEqual(trial.search.low, 0.5)
        self.assertTrue(trial.search.inherit_incumbent_floor)

        def fake_replay(requests, **kwargs):
            """Return one successful outcome per request with fixed 10000 ms TTFT and 1000 ms TPOT."""
            outcomes = [
                RequestOutcome(
                    request_id=item.row_id,
                    success=True,
                    ttft_ms=10000.0,
                    tpot_ms=1000.0,
                    prompt_tokens=item.prompt_tokens_hint,
                    completion_tokens=item.completion_tokens_hint,
                )
                for item in requests
            ]
            return outcomes, False, ""

        process = mock.Mock()
        process.poll.return_value = 0
        spec_file = Path(trial.artifact_dir) / "trial_spec.json"

        # The patches are entered in the same order as the original nesting.
        with mock.patch("aituner.worker.subprocess.Popen", return_value=process), \
                mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None), \
                mock.patch("aituner.worker._terminate_process_tree", return_value=None), \
                mock.patch("aituner.worker._replay_requests", side_effect=fake_replay):
            result = run_trial(spec_file)

        self.assertEqual(result["status"], "completed")
        self.assertIsNone(result["best_request_rate"])
        self.assertEqual(result["best_source"], "primary_search")

        primary = result["primary_search"]
        self.assertEqual(primary["low"], 0.5)
        self.assertIsNone(primary["best_request_rate"])
        probed_thresholds = [probe["threshold"] for probe in primary["probes"]]
        self.assertEqual(probed_thresholds, [0.75, 0.625])

        fallback = result["lower_range_fallback"]
        self.assertEqual(fallback["triggered"], False)
        self.assertEqual(fallback["skipped"], True)
        self.assertEqual(fallback["probes"], [])
        self.assertEqual(
            fallback["reason"],
            "primary_search_above_incumbent_floor_all_infeasible",
        )
        self.assertEqual(result["all_infeasible_diagnostics"]["threshold"], 0.625)
|
||||
|
||||
def test_materialize_trial_does_not_mutate_input_state_trials(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
@@ -4242,6 +5121,18 @@ class CoreFlowTests(unittest.TestCase):
|
||||
self.assertTrue(proposal_path.exists())
|
||||
proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
|
||||
self.assertTrue(proposal["should_stop"])
|
||||
state = store.load_state(study.study_id)
|
||||
self.assertEqual(state.tuning_stop_reason, "harness_stop")
|
||||
self.assertEqual(
|
||||
state.tuning_stop_details["proposal_name"],
|
||||
"harness-stop-0005",
|
||||
)
|
||||
self.assertEqual(state.tuning_stop_details["proposal_source"], "harness")
|
||||
self.assertEqual(
|
||||
state.tuning_stop_details["stop_authorized_by"],
|
||||
"validator",
|
||||
)
|
||||
self.assertTrue(state.tuning_stop_diagnosis)
|
||||
|
||||
def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
|
||||
109
tests/test_tuning_report.py
Normal file
109
tests/test_tuning_report.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from aituner.tuning_report import run_tuning_report
|
||||
|
||||
|
||||
def _write_state(root: Path, *, study_id: str, rates: list[float | None]) -> None:
    """Create ``root`` and write a ``state.json`` describing one trial per rate.

    Trials are numbered from 1 (``trial-0001``, ``trial-0002``, ...). A rate
    of ``None`` yields a ``"failed"`` trial; any other rate yields a
    ``"completed"`` trial whose per-GPU rate equals the rate. The study-level
    best fields point at the first trial holding the highest rate, or are
    ``None`` when every rate is ``None``.

    Args:
        root: Directory to create (parents included); it must not exist yet.
        study_id: Value stored under ``"study_id"``.
        rates: Request rate per trial, in trial order.
    """
    root.mkdir(parents=True)

    numbered = [
        (f"trial-{position:04d}", value)
        for position, value in enumerate(rates, start=1)
    ]
    trials = [
        {
            "trial_id": name,
            "status": "failed" if value is None else "completed",
            "parallel_size": 1,
            "best_request_rate": value,
            "best_request_rate_per_gpu": value,
            "config_patch": {"env_patch": {}, "flag_patch": {}},
        }
        for name, value in numbered
    ]

    # max() keeps the first of equal maxima, like a strict ">" scan would.
    scored = [entry for entry in numbered if entry[1] is not None]
    if scored:
        best_trial_id, best_rate = max(scored, key=lambda entry: entry[1])
    else:
        best_trial_id = best_rate = None

    state = {
        "study_id": study_id,
        "best_trial_id": best_trial_id,
        "best_request_rate": best_rate,
        "best_request_rate_per_gpu": best_rate,
        "next_trial_index": len(rates) + 1,
        "trials": trials,
    }
    (root / "state.json").write_text(json.dumps(state), encoding="utf-8")
|
||||
|
||||
|
||||
class TuningReportTests(unittest.TestCase):
    """Tests for ``run_tuning_report`` using state files written to a temp dir."""

    def test_tuning_report_scores_harness_vs_naive_anytime_progress(self) -> None:
        """Compare a two-trial harness arm with a four-trial naive arm.

        Both arms end at a best rate of 0.9, but the harness arm gets there
        on its second trial while the naive arm needs four trials and has
        one failed trial. The report is expected to name the harness as
        winner and to write ``summary.json`` and ``report.md``.
        """
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            harness_root = tmp_path / "studies"
            naive_root = tmp_path / "naive-study"
            output_root = tmp_path / "out"

            # The harness arm points at the parent of its study directory,
            # whereas the naive arm points at the study directory itself.
            _write_state(
                harness_root / "harness-study",
                study_id="harness-study",
                rates=[0.4, 0.9],
            )
            _write_state(
                naive_root,
                study_id="naive-study",
                rates=[0.4, None, 0.7, 0.9],
            )

            arms = [
                {
                    "name": "harness",
                    "kind": "harness",
                    "study_root": str(harness_root),
                },
                {
                    "name": "naive",
                    "kind": "naive",
                    "study_root": str(naive_root),
                },
            ]
            report_spec = {
                "report_id": "report-1",
                "output_root": str(output_root),
                "target_fraction": 0.8,
                "cases": [
                    {
                        "case_id": "case-1",
                        "tags": ["model-a", "chat"],
                        "budgets": [1, 2, 4],
                        "arms": arms,
                    }
                ],
            }
            spec_path = tmp_path / "report.json"
            spec_path.write_text(json.dumps(report_spec), encoding="utf-8")

            summary = run_tuning_report(spec_path)

            case = summary["cases"][0]
            self.assertEqual(case["reference_best_per_gpu"], 0.9)
            winners = case["winners"]
            self.assertEqual(winners["final_best"], "harness")
            self.assertEqual(case["winners"]["fastest_to_target"], "harness")

            harness = case["arms"][0]
            naive = case["arms"][1]
            self.assertEqual(harness["best_at_budget"]["2"], 0.9)
            self.assertEqual(naive["best_at_budget"]["2"], 0.4)
            self.assertEqual(case["target_fraction"], 0.8)
            self.assertEqual(harness["trials_to_target"], 2)
            self.assertEqual(naive["trials_to_target"], 4)
            self.assertEqual(naive["failed_count"], 1)

            comparison = case["harness_vs_naive"][0]
            self.assertTrue(comparison["passes"])
            self.assertEqual(comparison["target_trial_speedup_vs_best_naive"], 2.0)

            # Both report artifacts must exist under the configured output root.
            self.assertTrue((tmp_path / "out" / "summary.json").exists())
            self.assertTrue((tmp_path / "out" / "report.md").exists())
|
||||
|
||||
|
||||
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user