diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json index ba654e5..1e6fa48 100644 --- a/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json +++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json @@ -1,5 +1,5 @@ { - "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness", + "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-probe512-harness", "hardware": { "gpu_count": 8, "gpu_model": "H20", @@ -71,7 +71,7 @@ "min_input_tokens": 0, "max_input_tokens": 8192 }, - "max_requests_per_probe": 2048, + "max_requests_per_probe": 512, "replay_time_scale": 1.0, "early_stop_max_lag_s": 120.0, "early_stop_max_elapsed_s": 900.0 diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json index 21051f9..8008fb8 100644 --- a/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json +++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json @@ -1,5 +1,5 @@ { - "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness", + "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-probe512-noharness", "hardware": { "gpu_count": 8, "gpu_model": "H20", @@ -71,7 +71,7 @@ "min_input_tokens": 0, "max_input_tokens": 8192 }, - "max_requests_per_probe": 2048, + "max_requests_per_probe": 512, "replay_time_scale": 1.0, "early_stop_max_lag_s": 120.0, "early_stop_max_elapsed_s": 900.0 diff --git a/docs/aituner-harness-summary.md b/docs/aituner-harness-summary.md index 9b12707..2b2680c 100644 --- a/docs/aituner-harness-summary.md +++ b/docs/aituner-harness-summary.md @@ -52,6 +52,7 @@ The speedup comes from reducing wasted proposal families, not from changing the 3. All-infeasible plateau detection - When recent all-infeasible trials at the same sampling threshold stop improving pass rate and p95 TTFT, the harness blocks repeating the same primary knob family. - This prevents continuing a direction such as DP-only scale-out after DP4 and DP8 plateau. + - Plateau alone does not trigger deterministic early stop; it forces either a different justified family or a later validation/convergence stop. 4. Cleaner early-stop handling - Early-stopped probes no longer leave in-flight requests polluting the next probe. diff --git a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md index 3834249..3349f82 100644 --- a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md +++ b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md @@ -38,14 +38,14 @@ The experiment reuses the 0-8k chat window that has already been used for qwen27 | window | `chat_w20260311_1000` | | source rows | 32606 | | input filter | 0 to 8192 tokens | -| max requests per probe | 2048 | +| max requests per probe | 512 | | target pass rate | 0.95 | | TTFT SLO | 2s up to 4k, 4s up to 32k, 6s above | | TPOT SLO | 50ms | | search high | 0.125 sampling_u | | max probes per trial | 6 | -The `max_requests_per_probe=2048` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and binary-search threshold probe. +The `max_requests_per_probe=512` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and binary-search threshold probe. A trace-only count check gives 31 to 65 selected requests across the six binary-search thresholds, avoiding the invalid low-cap case where early thresholds can select zero requests. ## Harness Update Under Test @@ -59,6 +59,7 @@ This run tests a stricter early-stop harness: - those validation trials did not produce a feasible per-GPU improvement, - the validation covered topology and runtime families, or accumulated at least three post-incumbent validation attempts. - If the stop guard fires, `study tune` writes `harness-stop-XXXX` and exits without spending another GPU trial or asking the LLM for another proposal. +- A single-family all-infeasible plateau is not enough to stop deterministically. It only blocks repeating that family; the LLM must either justify a different family or later satisfy the validation/convergence stop rule. This is a generic harness rule, not a testcase-specific threshold. It does not depend on qwen27b, qwen235b, qwen30b, a fixed TP/DP value, or a hardcoded SLO number. @@ -87,8 +88,8 @@ Pending dash0 runs: | Variant | tmux session | Log | Study root | | --- | --- | --- | --- | -| no-harness | `qwen30b_vllm020_noharness_20260502` | `logs/qwen30b_vllm020_noharness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness` | -| harness | `qwen30b_vllm020_harness_20260502` | `logs/qwen30b_vllm020_harness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness` | +| no-harness | `qwen30b_vllm020_noharness_probe512_20260502` | `logs/qwen30b_vllm020_noharness_probe512_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-probe512-noharness` | +| harness | `qwen30b_vllm020_harness_probe512_20260502` | `logs/qwen30b_vllm020_harness_probe512_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-probe512-harness` | The harness run should be judged by best-so-far `request_rate_per_gpu` per tuning iteration, plus whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation exposes whether the early-stop harness saves iterations without hiding a later better point. diff --git a/src/aituner/harness.py b/src/aituner/harness.py index d1bd40d..22c43a6 100644 --- a/src/aituner/harness.py +++ b/src/aituner/harness.py @@ -452,6 +452,7 @@ def _convergence_guard( ): reason = "strong_incumbent_requires_validation_probes" return { + "deterministic_stop": should_stop, "should_stop_if_no_harness_can_justify_a_new_adjacent_probe": ( should_stop or bool(infeasible_progress["stop_if_next_probe_repeats_family"]) @@ -474,7 +475,7 @@ def _harness_stop_decision( recent_diagnostics: list[dict[str, Any]], ) -> dict[str, Any]: guard = _convergence_guard(state, recent_diagnostics) - if guard["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]: + if guard["deterministic_stop"]: return { "should_stop": True, "reason": guard["reason"], diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py index f33125b..580978d 100644 --- a/tests/test_core_flow.py +++ b/tests/test_core_flow.py @@ -364,6 +364,9 @@ class CoreFlowTests(unittest.TestCase): "should_stop_if_no_harness_can_justify_a_new_adjacent_probe" ] ) + self.assertFalse(context["convergence_guard"]["deterministic_stop"]) + self.assertFalse(context["harness_stop"]["should_stop"]) + self.assertIsNone(build_harness_stop_proposal(context)) def test_harness_strong_incumbent_guard_after_large_gain(self) -> None: with tempfile.TemporaryDirectory() as tmp: