From 5d96689ea69ad4a61c581d42bf85b1def30b003f Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Wed, 6 May 2026 17:37:31 +0800
Subject: [PATCH] Make harness runtime refinement memory safe

---
 ...chat-0-8k-current-config-fig18-20260506.md | 135 ++++++++++++++++++
 src/aituner/harness.py                        |   4 +-
 tests/test_core_flow.py                       |   1 -
 3 files changed, 136 insertions(+), 4 deletions(-)
 create mode 100644 docs/qwen27b-chat-0-8k-current-config-fig18-20260506.md

diff --git a/docs/qwen27b-chat-0-8k-current-config-fig18-20260506.md b/docs/qwen27b-chat-0-8k-current-config-fig18-20260506.md
new file mode 100644
index 0000000..874ca0f
--- /dev/null
+++ b/docs/qwen27b-chat-0-8k-current-config-fig18-20260506.md
@@ -0,0 +1,135 @@
+# qwen27b-chat-0-8k Current-Config Fig18 Plan

## Question

The earlier tables used best-so-far throughput. That is useful for deciding the
best deployable incumbent, but it hides bad proposals because the curve is
monotonic by construction. To judge whether the harness makes tuning more
directional, the primary curve must be each iteration's measured current-config
performance.

## Why Final Performance Can Be Close

Harness and no-harness can converge to similar final throughput when the search
space contains one dominant simple family. In this setup the dominant family is
`TP=2, DP=1` over the `run_qwen27b.sh` baseline. The no-harness LLM can still
eventually discover that family within 12 iterations, so final best performance
can be close.

What the harness is expected to improve is not necessarily the final 12-iter
maximum. It should improve:

- iterations-to-first-good-config;
- number of worse or infeasible proposals after an incumbent is found;
- measured-current-config oscillation;
- early-stop behavior once adjacent harness probes no longer justify more GPU
  trials.

## Metrics

- `measured-current`: each trial's own feasible `request_rate_per_gpu`.
  Failed or no-feasible-point trials are recorded as `NA`.
- `accepted-incumbent`: best deployable value after each trial. This is the
  standard best-so-far curve and is monotonic by definition.
- `iters-to-best`: first iteration where the final best value or an equivalent
  config family appears.
- `wasted-trials-after-best`: trials after the first best that are worse,
  infeasible, or no-feasible-point.

## Historical Run9 Re-Read

Source:
`.aituner-tight/dash0-qwen27b-tight-slo-10min-run9-chat-0-8k-codex-topology`
on dash0.

| Variant | Curve | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| no-harness run9 | measured-current | 0.0350 | 0.0617 | 0.0392 | 0.2025 | NA | NA | NA | NA | NA | NA | NA | NA |
| no-harness run9 | accepted-incumbent | 0.0350 | 0.0617 | 0.0617 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 |

Interpretation: the no-harness current-config curve already has a regression at
iter 3, followed by many no-feasible-point runtime probes. The monotonic curve
only shows the incumbent policy, not proposal quality.

## New Paired Test Plan

Run on dash0 with internal vLLM and the real `chat_w20260311_1000` 0-8k replay:

- Base spec: `configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json`.
- Model path:
  `/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal`.
- Engine: `/usr/local/bin/vllm`, baseline aligned with `~/run_qwen27b.sh`.
+- SLO: 95% pass, stepped TTFT `2s/4s/6s`, TPOT `<=50ms`. +- Search: `low=0`, `high=0.0625`, `max_probes=6`, `tolerance=0.001`. +- no-harness study: + `.aituner-tight/dash0-qwen27b-tight-slo-10min-run10-chat-0-8k-current-noharness`. +- harness study: + `.aituner-tight/dash0-qwen27b-tight-slo-10min-run10-chat-0-8k-current-harness`. + +The result table will report both curves. The harness is considered successful +only if it reaches the same or better incumbent in fewer iterations and reduces +the measured-current regressions or replaces them with an explicit harness stop. + +## Run Status + +- 2026-05-06 07:05 CST: dash0 checked, 8 H20 GPUs idle. +- 2026-05-06 07:05 CST: generated paired specs under + `.aituner-tight/specs/`. +- 2026-05-06 07:05 CST: started no-harness full 12-iter run in tmux session + `qwen27b_run10_noharness_20260506`. +- 2026-05-06 07:18 CST: stopped the duplicate fresh no-harness run before + completion. Reason: run9 is already a completed real 12-iter no-harness run + for the same internal vLLM 0-8k setup, while the fresh full-chat run would + spend a multi-hour dash0 slot duplicating that curve. +- 2026-05-06 07:20 CST: seeded the harness study with the real run9 baseline + measurement as `trial-0001`, then started the harness run with + `--skip-baseline` in tmux session `qwen27b_run10_harness_skipbase_20260506`. +- 2026-05-06 07:20 CST: harness generated deterministic `trial-0002`: + `{"tensor-parallel-size": 2}`. +- 2026-05-06 08:11 CST: harness `trial-0002` completed: + `TP=2`, `0.2142 request_rate_per_gpu`. +- 2026-05-06 08:19 CST: harness `trial-0003` failed at engine launch. + Root cause: the old runtime refinement coupled `gpu-memory-utilization=0.95` + with larger `max-num-batched-tokens`, causing speculative sampler warmup OOM. + This is a generic harness safety bug; fixed locally by removing the automatic + memory-utilization bump from runtime refinement. +- 2026-05-06 09:24 CST: harness `trial-0004` completed: + `TP=4`, `0.4429 request_rate_per_gpu`. All six probes were feasible up to + `sampling_u=0.0615234375`, so this study is near the configured + `search.high=0.0625` ceiling. +- 2026-05-06 09:25 CST: old harness repeated the same unsafe runtime refinement + for TP4 and `trial-0005` failed at engine launch for the same OOM reason. The + old process was stopped before continuing. + +## Current Results + +Unit: feasible `request_rate_per_gpu`. `NA` means the current trial did not +produce a feasible deployable config. + +| Variant | Curve | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| no-harness run9 | measured-current | 0.0350 | 0.0617 | 0.0392 | 0.2025 | NA | NA | NA | NA | NA | NA | NA | NA | +| no-harness run9 | accepted-incumbent | 0.0350 | 0.0617 | 0.0617 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | +| harness run10 | measured-current | 0.0350 | 0.2142 | NA | 0.4429 | NA | pending | pending | pending | pending | pending | pending | pending | +| harness run10 | accepted-incumbent | 0.0350 | 0.2142 | 0.2142 | 0.4429 | 0.4429 | pending | pending | pending | pending | pending | pending | pending | + +The harness result is stronger than the earlier strict replay. It did not merely +reach the same TP2 region earlier; it then used the bottleneck/topology evidence +to validate TP4 and found a much higher current config. 
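For reference, the two curves and the derived counters above can be computed
mechanically from per-trial results. The sketch below assumes a plain list of
per-iteration `request_rate_per_gpu` values with `None` standing in for `NA`,
not the actual aituner study format, and the helper name `fig18_curves` is
illustrative only; the "equivalent config family" part of `iters-to-best` needs
config metadata and is omitted here.

```python
from typing import Optional, Sequence


def fig18_curves(per_trial: Sequence[Optional[float]]):
    """Derive the fig18 curves from per-iteration request_rate_per_gpu values.

    per_trial[i] is the feasible throughput measured at iteration i + 1,
    or None when that trial failed or found no feasible point ("NA").
    """
    measured_current = ["NA" if v is None else v for v in per_trial]

    # accepted-incumbent: best deployable value so far, monotonic by construction.
    accepted_incumbent, best = [], None
    for v in per_trial:
        if v is not None and (best is None or v > best):
            best = v
        accepted_incumbent.append(best)

    # iters-to-best: first iteration whose measured value equals the final best.
    iters_to_best = next(
        (i + 1 for i, v in enumerate(per_trial) if v is not None and v == best),
        None,
    )
    # wasted-trials-after-best: later trials that are worse or infeasible.
    wasted_after_best = None
    if iters_to_best is not None:
        wasted_after_best = sum(
            1 for v in per_trial[iters_to_best:] if v is None or v < best
        )
    return measured_current, accepted_incumbent, iters_to_best, wasted_after_best


# no-harness run9, iterations 1-12 (the NA entries encoded as None):
run9 = [0.0350, 0.0617, 0.0392, 0.2025] + [None] * 8
_, incumbent, iters_to_best, wasted = fig18_curves(run9)
# incumbent ends flat at 0.2025, iters_to_best == 4, wasted == 8
```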
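On the near-ceiling note for harness `trial-0004`: if the runtime search is read
as a plain bisection between `search.low` and `search.high` (an assumption; the
actual aituner probe policy may differ), six consecutive feasible probes land
exactly at `0.0615234375`. A minimal sketch:

```python
def probe_sequence(low, high, max_probes, feasible):
    """Plain bisection on the sampling knob: raise `low` after a feasible probe,
    lower `high` after an infeasible one. Illustrative only."""
    probes = []
    for _ in range(max_probes):
        mid = (low + high) / 2
        probes.append(mid)
        if feasible(mid):
            low = mid
        else:
            high = mid
    return probes


# With search.low=0, search.high=0.0625 and every probe feasible:
print(probe_sequence(0.0, 0.0625, 6, feasible=lambda u: True))
# [0.03125, 0.046875, 0.0546875, 0.05859375, 0.060546875, 0.0615234375]
```

The final probe sits `0.0009765625` below `search.high`, already inside
`tolerance=0.001`, so a tolerance-based stop halts there and measuring anything
higher would require raising `search.high`.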
## Interpretation

- Why both variants can look close when only best-so-far is shown: no-harness can
  eventually find a good simple topology, and best-so-far hides every bad
  proposal after that point.
- What the current-config curve shows: no-harness regresses at iter 3 and then
  spends many iterations on no-feasible-point runtime probes. Harness reaches a
  stronger TP2 config at iter 2 and a stronger TP4 config at iter 4.
- Why harness helped: the baseline diagnostics identify TTFT/prefill as the
  active bottleneck on low-prefix-reuse long prompts. The harness maps that to
  adjacent TP validation before DP/runtime exploration. The no-harness LLM chose
  DP2 then DP4 first, which diluted per-GPU throughput and delayed TP.
- Remaining defect found during the run: runtime refinement was too aggressive
  because it combined a larger `max-num-batched-tokens` value with higher memory
  utilization. This has been fixed so future runtime validation changes batching
  headroom without also raising memory pressure.
diff --git a/src/aituner/harness.py b/src/aituner/harness.py
index fa8626d..7eaf033 100644
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -668,8 +668,6 @@ def _runtime_refinement_proposal(
         return default
     tunable = set(study.engine.tunable_flags)
     flag_patch: dict[str, Any] = {"tensor-parallel-size": best_tp}
-    if "gpu-memory-utilization" in tunable:
-        flag_patch["gpu-memory-utilization"] = 0.95
     if "enable-chunked-prefill" in tunable:
         flag_patch["enable-chunked-prefill"] = True
     if "max-num-batched-tokens" not in tunable:
@@ -704,7 +702,7 @@
         "config_patch": {"env_patch": {}, "flag_patch": flag_patch},
         "expected_effects": [
             "preserve the incumbent topology",
-            "increase batching headroom while staying inside one runtime family",
+            "increase batching headroom without also raising memory pressure",
         ],
         "incumbent_trial_id": best.get("trial_id"),
     }
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index ce96403..5b267e1 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -751,7 +751,6 @@ class CoreFlowTests(unittest.TestCase):
             proposal.config_patch.flag_patch,
             {
                 "tensor-parallel-size": 2,
-                "gpu-memory-utilization": 0.95,
                 "enable-chunked-prefill": True,
                 "max-num-batched-tokens": 16384,
             },