From 92eb1860068eecb346aeff6c3cbba5a93dec7cd9 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Fri, 26 Jun 2026 16:44:24 +0800 Subject: [PATCH] Add bad-start harness recovery planning --- docs/aituner-roadmap.md | 25 ++- .../no-llm-harness-mechanism-20260625.md | 110 ++++++++++ src/aituner/harness.py | 81 ++++++- tests/test_core_flow.py | 206 ++++++++++++++++++ 4 files changed, 420 insertions(+), 2 deletions(-) diff --git a/docs/aituner-roadmap.md b/docs/aituner-roadmap.md index 694dd49..719feb0 100644 --- a/docs/aituner-roadmap.md +++ b/docs/aituner-roadmap.md @@ -79,10 +79,33 @@ kernel、KV cache、通信和排队的闭式性能模型。更稳妥也更强的 | C5. AITuner 找到 near-optimal region,而不是只找到一个可行 config | Qwen30B 有解释性信号 | [Qwen30B SLO robustness](harness-ablation/qwen30b-slo-robustness-20260624.md) | 选 1-2 个 case 做局部 grid 或专家配置对照 | | C6. AITuner 能随 SLO tightness 移动到合适 frontier | Qwen30B 已完成 | [Qwen30B SLO robustness](harness-ablation/qwen30b-slo-robustness-20260624.md) | 再选一个非同质 case 做 SLO sweep;同时画 SLO tightness -> frontier/regime transition | | C7. Engine adapter 让 intervention grammar 可迁移到其他 serving engine | 设计上可行,暂不作为主实验 claim | `EngineLaunchSpec` / launch recipe / tunable schema | vLLM 主线完成后,再做 SGLang adapter 和一个低成本验证 case | +| C8. Harness 对坏初始点有恢复能力,不只依赖可信 base config | planner 规则和本地回归测试已补;真机待跑 | [No-LLM harness mechanism](harness-ablation/no-llm-harness-mechanism-20260625.md) | 从 `TP=8, max-num-seqs=8, gmu=0.5` 等坏起点做 no-LLM 真机 recovery run | ## 最高优先级实验 -### P0. 完成 Qwen235B decode 2x2 并整理 aggregate +### P0a. Bad-start recovery confirmation + +目的:回答 harness 是否只能从可信 base config 起步,还是能从明显不合理的初始 config +恢复到正确方向。 + +最小实验矩阵: + +| Case | 初始配置 | 证明点 | +| --- | --- | --- | +| bad-topology | `TP=8, DP=1` | 高 TP 起点会先做相邻低 TP bracket | +| bad-runtime | `TP=2, gmu=0.5, max-num-seqs=8` | 低 runtime headroom 会跳回 nominal floor | +| combined-bad | `TP=8, gmu=0.5, max-num-seqs=8` | topology recovery 和 runtime recovery 能串联 | + +预期图: + +- x-axis: trial index; +- y-axis: best-so-far SLO-constrained req/s/GPU; +- line groups: trusted-start vs bad-start cases; +- annotation: proposal family sequence,例如 `TP downshift`, `gmu floor jump`, `gmu climb`。 + +启动条件:先确认 dash fleet 有空闲 8xH20 机器;用户确认后再开跑。 + +### P0b. 完成 Qwen235B decode 2x2 并整理 aggregate 目的:补齐最核心的 `harness on/off x strong/weak planner` 证据,回答: diff --git a/docs/harness-ablation/no-llm-harness-mechanism-20260625.md b/docs/harness-ablation/no-llm-harness-mechanism-20260625.md index 9f02df8..50f728c 100644 --- a/docs/harness-ablation/no-llm-harness-mechanism-20260625.md +++ b/docs/harness-ablation/no-llm-harness-mechanism-20260625.md @@ -208,6 +208,16 @@ proposal;否则进入 stop validator 或 LLM fallback。 Candidate 只是一个 hypothesis;是否接受由真实 trial 的 SLO-constrained `request_rate_per_gpu` 决定。 +5. Bad-start recovery 需要先 bracket,再微调。 + 如果 no-LLM run 从一个很高 TP 的初始点开始,且同 DP 下更高 TP frontier 已经不存在 + 或已测过,harness 会优先验证相邻低 TP,而不是把当前高 TP 当作 topology 已收敛。 + 这避免了 `TP=8` 这类坏初始点直接进入 `gpu-memory-utilization` 微调。 + +6. Pathological runtime 起点需要跳回正常工作区间。 + `gpu-memory-utilization` 的常规策略是在 settled topology 上小步 hill-climb; + 但如果初始值明显低于正常工作区间,例如 `0.5`,harness 会先跳到 nominal floor + `0.9`,再按 `0.02` 步长向 safe ceiling `0.97` 验证。 + ## Validator stop: 为什么不会过早停止 Harness stop 不是“找到一个不错配置就停”。当前 stop validator 包含几个条件: @@ -356,6 +366,106 @@ No-LLM Qwen30B run 证明了 deterministic harness 可以完整闭环,但 pape - 再选 decode-heavy 或 long-prefill case; - 验证不同 workload/SLO 下 candidate family 会发生合理切换。 +5. Bad-start recovery + - 从非可信初始配置开始,例如 `TP=8, max-num-seqs=8, gmu=0.5`; + - 证明 harness 不是只能从“已经比较合理”的 base config 出发; + - 观察它是否能先恢复 topology,再恢复 runtime headroom,并最终回到同一 near-optimal + region。 + +## Bad-start recovery 审计 - 2026-06-26 + +用户提出的问题是:如果我们不是从可信 base config 开始,而是从一个恶意或不合理的 +配置开始,例如: + +```text +TP=8, DP=1, max-num-seqs=8, gpu-memory-utilization=0.5 +``` + +no-LLM harness 是否仍能自动找到正确方向? + +目前结论要分开说: + +1. **旧 planner 不能直接 claim 任意坏起点可恢复。** + 本地合成审计显示,旧逻辑会把 `TP=8` 误当作 topology frontier 已收敛,并把下一步 + proposal 设为 `gpu-memory-utilization=0.52`。这会在坏 topology 和坏 runtime 上 + 做很慢的小步爬坡,不能作为 robust evidence。 + +2. **已补 planner 机制。** + 当前 harness 增加了两个 no-LLM deterministic recovery rules: + - `bad_start_topology_bracket`:当当前 anchor 在高 TP,且没有未测的更高 TP frontier 时, + 先测相邻低 TP,例如 `TP=8 -> TP=4`; + - `gmu_nominal_floor`:当 settled topology 上的 `gpu-memory-utilization < 0.9` 时, + 先跳到 `0.9`,再做常规 `0.92/0.94/.../0.97` hill-climb。 + +3. **已加本地回归测试,但还没做真机证明。** + 已通过的 planner tests: + - `test_harness_brackets_down_from_bad_high_tp_start_before_runtime_tuning` + - `test_harness_jumps_low_gpu_mem_util_to_nominal_floor_after_topology_settles` + - 以及已有 topology-first / gmu-climb 相关回归测试。 + +因此,当前状态是:planner 侧已经能给出正确方向;paper 级别还需要真机 bad-start +recovery run 来确认真实 vLLM 测量下是否稳定收敛。 + +## 准备中的真机实验 + +实验目的不是再证明默认起点能 work,而是证明: + +```text +same workload + same SLO + same no-LLM harness +不同初始 config + -> 是否收敛到同一 near-optimal region + -> 是否保持可解释 trial path +``` + +Base spec 使用已验证的 Qwen30B community vLLM 0.20 harness setup: + +```text +configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json +``` + +运行时需要设置: + +```json +{ + "llm": { + "use_harness": true, + "endpoint": null + } +} +``` + +建议最小矩阵: + +| Case | Base flags 变化 | 要验证的机制 | 预期 trial path | +| --- | --- | --- | --- | +| trusted-start-control | 保持现有可信 base | 对照已有 stopfix run | `TP=2 -> TP=4 -> TP=2+gmu climb -> stop` | +| bad-topology | `TP=8, DP=1` | 高 TP 起点是否会向下 bracket | `TP8 baseline -> TP4 -> TP2/或同等 better topology -> runtime` | +| bad-runtime | `TP=2, DP=1, gmu=0.5, max-num-seqs=8` | 低 KV headroom 是否跳回正常区间 | `gmu 0.5 baseline -> gmu 0.9 -> 0.92/...` | +| combined-bad | `TP=8, DP=1, gmu=0.5, max-num-seqs=8` | topology recovery 和 runtime recovery 能否串起来 | `TP8 -> TP4 -> TP2/nearby -> gmu 0.9 -> climb -> stop` | + +成功判据: + +- 不配置 LLM endpoint;所有 proposal 来自 harness; +- 不重复相同 config signature; +- high-TP 起点必须先出现相邻低 TP probe,而不是先做 `gmu=0.52`; +- low-gmu 起点必须先跳到 `0.9`,而不是 `0.52`; +- 在 12 个 measured trials 内达到 reference stopfix best 的 `>=95%`: + +```text +reference best = 3.4333 req/s/GPU +95% threshold = 3.2616 req/s/GPU +``` + +- 最终 stop 必须是 validator 授权,例如 `harness_stop`,而不是因为没有 proposal source + 失败退出。 + +如果真机结果失败,需要保留失败路径并分析是哪类机制不足: + +- topology bracket 找到低 TP,但 runtime 仍无法恢复; +- `max-num-seqs=8` 导致 admission 太差,需要 admission recovery floor; +- baseline 自身全不可行,当前 harness 缺少 completed incumbent,不能进入正常 guided loop; +- vLLM launch/OOM 造成 failure memory 覆盖了可恢复路径。 + ## 一句话总结 No-LLM harness 能自动找到配置,是因为它已经实现了一个面向 serving 机制的实验 planner: diff --git a/src/aituner/harness.py b/src/aituner/harness.py index 3c7a404..190d747 100644 --- a/src/aituner/harness.py +++ b/src/aituner/harness.py @@ -29,7 +29,10 @@ _VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3 # safe ceiling and let measurement find the real peak: a too-high target regresses or # fails to launch and is rejected by the incumbent guard, and its tested signature then # blocks re-proposal so the climb terminates. +# Pathological starts below the nominal floor jump back into the normal operating range +# before this small-step climb begins. _GMU_STEP = 0.02 +_GMU_NOMINAL_FLOOR = 0.9 _GMU_SAFE_CEILING = 0.97 @@ -1147,6 +1150,23 @@ def _topology_candidate_actions( current_ep=current_ep, current_enable_ep=current_enable_ep, ) + adjacent_lower_tp = None + if ( + study.trace.request_mode != "decode_only" + and not _anchor_has_topology_patch(anchor) + and not _has_unmeasured_higher_tp_candidate( + study, + legal, + current_tp=current_tp, + current_dp=current_dp, + tested_signatures=tested_signatures, + ) + ): + adjacent_lower_tp = _adjacent_lower_tp_candidate( + legal, + current_tp=current_tp, + current_dp=current_dp, + ) actions: list[dict[str, Any]] = [] for point in legal: if point["tensor-parallel-size"] == current_tp and point["data-parallel-size"] == current_dp: @@ -1164,6 +1184,14 @@ def _topology_candidate_actions( candidate_tp=point["tensor-parallel-size"], candidate_dp=point["data-parallel-size"], ) + if ( + adjacent_lower_tp is not None + and current_tp > 2 + and point["tensor-parallel-size"] == adjacent_lower_tp + and point["data-parallel-size"] == current_dp + ): + score = max(score, 0.74) + factors["bad_start_topology_bracket"] = 0.74 if score <= 0: continue action_id = _topology_action_id(current_tp, current_dp, point) @@ -1457,7 +1485,11 @@ def _next_gpu_memory_utilization_target( elif item.get("status") == "failed": failed_gmus.append(gmu) climb_from = max(successful_gmus) - target = round(min(_GMU_SAFE_CEILING, climb_from + _GMU_STEP), 4) + if climb_from < _GMU_NOMINAL_FLOOR: + target = min(_GMU_SAFE_CEILING, _GMU_NOMINAL_FLOOR) + else: + target = min(_GMU_SAFE_CEILING, climb_from + _GMU_STEP) + target = round(target, 4) if target <= climb_from: return None if any(failed <= target + EPSILON for failed in failed_gmus): @@ -1490,6 +1522,53 @@ def _runtime_action( } +def _anchor_has_topology_patch(anchor: dict[str, Any]) -> bool: + patch = anchor.get("config_patch") + if not isinstance(patch, dict): + return False + flag_patch = patch.get("flag_patch") + if not isinstance(flag_patch, dict): + return False + return any(key in flag_patch for key in _TOPOLOGY_KEYS) + + +def _has_unmeasured_higher_tp_candidate( + study: StudySpec, + legal: list[dict[str, Any]], + *, + current_tp: int, + current_dp: int, + tested_signatures: set[str], +) -> bool: + for point in legal: + if ( + point["data-parallel-size"] != current_dp + or point["tensor-parallel-size"] <= current_tp + ): + continue + signature = _config_signature( + {"env_patch": {}, "flag_patch": _topology_patch(study, point)} + ) + if signature not in tested_signatures: + return True + return False + + +def _adjacent_lower_tp_candidate( + legal: list[dict[str, Any]], + *, + current_tp: int, + current_dp: int, +) -> int | None: + lower_tps = { + int(point["tensor-parallel-size"]) + for point in legal + if point["data-parallel-size"] == current_dp + and point["tensor-parallel-size"] < current_tp + } + return max(lower_tps) if lower_tps else None + + def _legal_topology_points( study: StudySpec, *, diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py index bbc4d5a..8d8794a 100644 --- a/tests/test_core_flow.py +++ b/tests/test_core_flow.py @@ -1904,6 +1904,212 @@ class CoreFlowTests(unittest.TestCase): ) self.assertNotIn("gpu-memory-utilization", proposal.config_patch.flag_patch) + def test_harness_brackets_down_from_bad_high_tp_start_before_runtime_tuning(self) -> None: + """A no-LLM run that starts at the max TP should validate the adjacent lower + topology before spending trials on runtime micro-tuning.""" + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets( + tmp_path, + slo_overrides={ + "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000}, + "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, + }, + engine_overrides={ + "base_flags": { + "host": "127.0.0.1", + "port": 8000, + "tensor-parallel-size": 8, + "data-parallel-size": 1, + "gpu-memory-utilization": 0.5, + "max-num-seqs": 8, + }, + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "gpu-memory-utilization", + "max-num-seqs", + ], + "topology_constraints": { + "allowed_tensor_parallel_sizes": [1, 2, 4, 8], + "allowed_data_parallel_sizes": [1], + "allowed_tp_dp_products": [1, 2, 4, 8], + }, + }, + ) + study = load_study_spec(study_path) + result_path = tmp_path / "trial-0001.json" + result_path.write_text( + json.dumps( + { + "status": "completed", + "best_sampling_u": 0.05, + "best_request_rate": 8.0, + "best_pass_rate": 0.96, + "probes": [ + { + "threshold": 0.05, + "feasible": True, + "payload": { + "request_count": 300, + "pass_rate": 0.96, + "request_rate": 8.0, + "latency_summary": {"failed_reason_counts": {}}, + }, + }, + { + "threshold": 0.08, + "feasible": False, + "payload": { + "request_count": 300, + "pass_rate": 0.5, + "request_rate": 10.0, + "early_stop_reason": "slo_pass_rate_unrecoverable", + "latency_summary": { + "failed_reason_counts": {"ttft_ms>4000.0": 120} + }, + }, + }, + ], + } + ), + encoding="utf-8", + ) + state = StudyState( + study_id=study.study_id, + best_trial_id="trial-0001", + best_request_rate=8.0, + best_request_rate_per_gpu=1.0, + trials=[ + TrialSummary( + trial_id="trial-0001", + status="completed", + best_request_rate=8.0, + best_request_rate_per_gpu=1.0, + result_path=str(result_path), + config_patch={"env_patch": {}, "flag_patch": {}}, + ), + ], + ) + context = build_harness_context( + study=study, + window_summary={"prompt_tokens_p95": 6500}, + state=state, + ) + proposal = build_harness_guided_proposal(context) + self.assertIsNotNone(proposal) + self.assertEqual( + proposal.config_patch.flag_patch.get("tensor-parallel-size"), 4 + ) + self.assertNotIn("gpu-memory-utilization", proposal.config_patch.flag_patch) + self.assertNotIn("max-num-seqs", proposal.config_patch.flag_patch) + + def test_harness_jumps_low_gpu_mem_util_to_nominal_floor_after_topology_settles(self) -> None: + """A pathological gmu=0.5 start should jump to the normal operating floor + after topology is bracketed instead of wasting many 0.02 hill-climb trials.""" + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets( + tmp_path, + slo_overrides={ + "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000}, + "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, + }, + engine_overrides={ + "base_flags": { + "host": "127.0.0.1", + "port": 8000, + "tensor-parallel-size": 2, + "data-parallel-size": 1, + "gpu-memory-utilization": 0.5, + }, + "tunable_flags": [ + "tensor-parallel-size", + "gpu-memory-utilization", + ], + "topology_constraints": { + "allowed_tensor_parallel_sizes": [1, 2, 4], + "allowed_data_parallel_sizes": [1], + "allowed_tp_dp_products": [1, 2, 4], + }, + }, + ) + study = load_study_spec(study_path) + result_path = tmp_path / "trial-0001.json" + result_path.write_text( + json.dumps( + { + "status": "completed", + "best_sampling_u": 0.07, + "best_request_rate": 2.4, + "best_pass_rate": 0.97, + "probes": [ + { + "threshold": 0.07, + "feasible": True, + "payload": { + "request_count": 300, + "pass_rate": 0.97, + "request_rate": 2.4, + "latency_summary": {"failed_reason_counts": {}}, + }, + }, + { + "threshold": 0.1, + "feasible": False, + "payload": { + "request_count": 300, + "pass_rate": 0.55, + "request_rate": 3.1, + "early_stop_reason": "slo_pass_rate_unrecoverable", + "latency_summary": { + "failed_reason_counts": {"tpot_ms>50.0": 90} + }, + }, + }, + ], + } + ), + encoding="utf-8", + ) + state = StudyState( + study_id=study.study_id, + best_trial_id="trial-0001", + best_request_rate=2.4, + best_request_rate_per_gpu=1.2, + trials=[ + TrialSummary( + trial_id="trial-0001", + status="completed", + best_request_rate=2.4, + best_request_rate_per_gpu=1.2, + result_path=str(result_path), + config_patch={"env_patch": {}, "flag_patch": {}}, + ), + TrialSummary( + trial_id="trial-0002", + status="completed", + best_request_rate=2.2, + best_request_rate_per_gpu=0.55, + config_patch={ + "env_patch": {}, + "flag_patch": {"tensor-parallel-size": 4}, + }, + ), + ], + ) + context = build_harness_context( + study=study, + window_summary={"prompt_tokens_p95": 1500}, + state=state, + ) + proposal = build_harness_guided_proposal(context) + self.assertIsNotNone(proposal) + self.assertEqual( + proposal.config_patch.flag_patch.get("gpu-memory-utilization"), 0.9 + ) + self.assertNotIn("tensor-parallel-size", proposal.config_patch.flag_patch) + def test_harness_continues_gpu_mem_util_after_tied_same_topology_probe(self) -> None: """After adjacent topology validation, gpu-memory-utilization should hill-climb on the incumbent topology even if an earlier gmu step tied the incumbent and