Stop after strong incumbent harness gains

This commit is contained in:
2026-04-26 01:29:05 +08:00
parent a53445868e
commit 29d0548e06
2 changed files with 110 additions and 1 deletion

View File

@@ -135,6 +135,7 @@ def _knob_harnesses(
"guards": [
"Keep MBT changes within a conservative trust region.",
"Do not raise MBT after OOM or launch failures involving memory-related knobs.",
"Do not raise MBT when the incumbent MBT already covers prompt p99 unless same-topology history proves prefill fragmentation is the bottleneck.",
],
"active_now": active_bottleneck == "ttft_prefill",
}
@@ -308,6 +309,7 @@ def _convergence_guard(
recent_diagnostics: list[dict[str, Any]],
) -> dict[str, Any]:
infeasible_progress = _infeasible_progress_guard(recent_diagnostics)
strong_incumbent = _strong_incumbent_guard(state, recent_diagnostics)
completed = [
item
for item in recent_diagnostics
@@ -329,12 +331,21 @@ def _convergence_guard(
reason = "need_more_evidence_before_stop"
if not should_stop and infeasible_progress["plateau_detected"]:
reason = str(infeasible_progress["reason"])
if (
not should_stop
and not infeasible_progress["plateau_detected"]
and strong_incumbent["guard_active"]
):
reason = str(strong_incumbent["reason"])
return {
"should_stop_if_no_harness_can_justify_a_new_adjacent_probe": (
should_stop or bool(infeasible_progress["stop_if_next_probe_repeats_family"])
should_stop
or bool(infeasible_progress["stop_if_next_probe_repeats_family"])
or bool(strong_incumbent["guard_active"])
),
"reason": reason,
"infeasible_progress": infeasible_progress,
"strong_incumbent": strong_incumbent,
"incumbent": {
"trial_id": state.best_trial_id,
"parallel_size": state.best_parallel_size,
@@ -345,6 +356,51 @@ def _convergence_guard(
}
def _strong_incumbent_guard(
state: StudyState,
recent_diagnostics: list[dict[str, Any]],
) -> dict[str, Any]:
default = {
"guard_active": False,
"reason": "no_strong_incumbent_yet",
"baseline_trial_id": None,
"baseline_request_rate_per_gpu": None,
"incumbent_gain_vs_baseline": None,
}
if state.best_trial_id is None or state.best_request_rate_per_gpu is None:
return default
completed = [
item
for item in recent_diagnostics
if item.get("status") == "completed"
and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
]
if len(completed) < 2:
return default
baseline = completed[0]
baseline_rate = float(baseline["best_request_rate_per_gpu"])
incumbent_rate = float(state.best_request_rate_per_gpu)
if baseline_rate <= 0:
return default
gain = incumbent_rate / baseline_rate
latest = recent_diagnostics[-1] if recent_diagnostics else {}
if state.best_trial_id == latest.get("trial_id") and gain >= 3.0:
return {
"guard_active": True,
"reason": "incumbent_exceeds_baseline_by_3x_and_latest_trial_is_best",
"baseline_trial_id": baseline.get("trial_id"),
"baseline_request_rate_per_gpu": baseline_rate,
"incumbent_gain_vs_baseline": gain,
}
return {
**default,
"baseline_trial_id": baseline.get("trial_id"),
"baseline_request_rate_per_gpu": baseline_rate,
"incumbent_gain_vs_baseline": gain,
"reason": "need_more_evidence_before_strong_incumbent_stop",
}
def _infeasible_progress_guard(recent_diagnostics: list[dict[str, Any]]) -> dict[str, Any]:
points = [
point
@@ -474,6 +530,7 @@ def _proposal_rules() -> list[str]:
"First decide the active bottleneck from recent_trial_diagnostics.",
"Pick at most one primary knob family from knob_harnesses unless the history proves a coupled change is needed.",
"Use adjacent legal values around the incumbent; avoid broad exploratory jumps.",
"When strong_incumbent.guard_active is true, do not propose runtime-only tweaks unless the relevant harness guard is positively satisfied by same-topology evidence.",
"If infeasible_progress blocks the last primary knob family, do not continue that family; switch families with direct bottleneck evidence or set should_stop=true.",
"If a proposed config is likely to reduce request_rate_per_gpu under the active guard, set should_stop=true instead of exploring.",
"Never repeat an already tested config signature.",

View File

@@ -365,6 +365,58 @@ class CoreFlowTests(unittest.TestCase):
]
)
def test_harness_strong_incumbent_guard_after_large_gain(self) -> None:
    """Strong-incumbent guard fires when the latest completed trial is the
    study best and beats the first trial's per-GPU rate by at least 3x,
    and the overall convergence guard then recommends stopping."""
    with tempfile.TemporaryDirectory() as scratch:
        study = load_study_spec(_write_study_assets(Path(scratch)))
        baseline_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=1,
            best_request_rate=0.035,
            best_request_rate_per_gpu=0.035,
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        # 0.21 / 0.035 = 6x gain, well past the 3x guard threshold.
        incumbent_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=2,
            best_request_rate=0.42,
            best_request_rate_per_gpu=0.21,
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "data-parallel-size": 1,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate_per_gpu=0.21,
            trials=[baseline_trial, incumbent_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={
                "prompt_tokens_p95": 7628,
                "prompt_tokens_p99": 8102,
                "prompt_tail_ratio_p95_p50": 3.83,
            },
            state=state,
        )
        guard = context["convergence_guard"]["strong_incumbent"]
        self.assertTrue(guard["guard_active"])
        self.assertGreaterEqual(guard["incumbent_gain_vs_baseline"], 3.0)
        should_stop = context["convergence_guard"][
            "should_stop_if_no_harness_can_justify_a_new_adjacent_probe"
        ]
        self.assertTrue(should_stop)
def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)