Make strong incumbent trigger validation phase

2026-04-28 20:54:05 +08:00
parent 68cdaf56a8
commit 38ff4380e5
4 changed files with 53 additions and 8 deletions
--- a/docs/qwen235b-thinking-decode/harness-20260428.md
+++ b/docs/qwen235b-thinking-decode/harness-20260428.md
@@ -69,10 +69,19 @@ Per-GPU throughput table:
 | before harness req/s/GPU | 0.0158 | 0.0306 | infeasible | launch fail | infeasible | infeasible | infeasible | infeasible | 0.0352 | infeasible | infeasible | infeasible |
 | harness req/s/GPU | 0.0158 | 0.0471 | infeasible | not run | not run | not run | not run | not run | not run | not run | not run | not run |

-Decision: the harness accelerated convergence on qwen235b decode-only. The before-harness run first reached its best observed throughput at iter 9 with 0.2817 request/s. The harness run exceeded that value at iter 2 with 0.3767 request/s, a 1.34x improvement over the before-harness 12-iter best and a 2.97x improvement over the baseline config.
+Decision: the harness accelerated convergence on qwen235b decode-only, but this is not a proof of global optimality after one proposal. The before-harness run first reached its best observed throughput at iter 9 with 0.2817 request/s. The harness run exceeded that value at iter 2 with 0.3767 request/s, a 1.34x improvement over the before-harness 12-iter best and a 2.97x improvement over the baseline config.

 The harness did not stop cleanly after finding the strong incumbent. It spent one additional trial on `TP1/DP8/EP8`, which found no feasible point, and then the next LLM proposal failed schema validation before trial materialization. So the performance convergence goal is met, but the tuning loop should be hardened so a strong incumbent causes deterministic stop or a schema-repair retry rather than relying only on prompt instructions.

+Important interpretation: `trial-0002` should be called the current best observed config, not "proven best". The harness got there quickly because the decode-only harness biases the first proposal toward the most relevant adjacent topology redistribution, `TP4/DP2/EP8 -> TP2/DP4/EP8`, instead of spending trials on prefill-oriented runtime knobs. Later iterations are still needed to validate local optimality by testing nearby topologies and same-topology runtime knobs.
+
+Follow-up implementation after this result:
+
+- `strong_incumbent.guard_active` no longer directly contributes to `should_stop_if_no_harness_can_justify_a_new_adjacent_probe`.
+- A strong incumbent now means "enter validation phase": run adjacent topology or same-topology runtime probes that could falsify the incumbent.
+- The proposal rules now explicitly say not to stop solely because a strong incumbent appeared.
+- Proposal parsing now accepts structured `observation`/`diagnosis` by converting them to text, so a usable validation proposal is not dropped only because the LLM used an object instead of a string.
+
 ## Follow-up Fix

 The seeded prompt exposed a generic diagnosis issue: if the best feasible probe had no latency failures, the harness could miss the prior infeasible probe that showed the real bottleneck at higher load. The harness now scans the probe sequence backward and uses the nearest non-trivial bottleneck before falling back to the best feasible probe. This keeps decode-only runs focused on `decode_tpot` after a feasible low-load point, without adding testcase thresholds.
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -404,12 +404,11 @@ def _convergence_guard(
        and not infeasible_progress["plateau_detected"]
        and strong_incumbent["guard_active"]
    ):
-        reason = str(strong_incumbent["reason"])
+        reason = "strong_incumbent_requires_validation_probes"
    return {
        "should_stop_if_no_harness_can_justify_a_new_adjacent_probe": (
            should_stop
            or bool(infeasible_progress["stop_if_next_probe_repeats_family"])
-            or bool(strong_incumbent["guard_active"])
        ),
        "reason": reason,
        "infeasible_progress": infeasible_progress,
@@ -455,10 +454,14 @@ def _strong_incumbent_guard(
    if state.best_trial_id == latest.get("trial_id") and gain >= 1.8:
        return {
            "guard_active": True,
-            "reason": "incumbent_exceeds_baseline_by_1_8x_and_latest_trial_is_best",
+            "reason": "incumbent_exceeds_baseline_by_1_8x_and_latest_trial_is_best_enter_validation_phase",
            "baseline_trial_id": baseline.get("trial_id"),
            "baseline_request_rate_per_gpu": baseline_rate,
            "incumbent_gain_vs_baseline": gain,
+            "recommended_next_action": (
+                "validate_the_incumbent_with_adjacent_topology_or_same_topology_runtime_probes; "
+                "do_not_claim_global_optimum_from_the_first_large_gain"
+            ),
        }
    return {
        **default,
@@ -466,6 +469,7 @@ def _strong_incumbent_guard(
        "baseline_request_rate_per_gpu": baseline_rate,
        "incumbent_gain_vs_baseline": gain,
        "reason": "need_more_evidence_before_strong_incumbent_stop",
+        "recommended_next_action": "continue_harness_guided_search",
    }


@@ -600,7 +604,9 @@ def _proposal_rules() -> list[str]:
        "For decode_tpot bottlenecks, prefer legal TP/DP topology redistribution before runtime-only knobs, then tune max-num-seqs or max-num-batched-tokens only from observed SLO headroom/failures.",
        "Pick at most one primary knob family from knob_harnesses unless the history proves a coupled change is needed.",
        "Use adjacent legal values around the incumbent; avoid broad exploratory jumps.",
-        "When strong_incumbent.guard_active is true, do not propose runtime-only tweaks unless the relevant harness guard is positively satisfied by same-topology evidence.",
+        "When strong_incumbent.guard_active is true, treat the run as a validation phase, not as proof of optimum: probe only adjacent topology or same-topology runtime knobs that can falsify the incumbent.",
+        "Do not stop solely because a strong incumbent appeared; stop only after nearby validation probes are exhausted, blocked by infeasible evidence, or repeatedly remain near the incumbent.",
+        "During validation, prefer probes that answer whether the incumbent is locally optimal over probes that merely chase another large gain.",
        "Do not enable expert parallel for TTFT-prefill bottlenecks unless expert-parallel is the active harness and there is direct positive EP evidence.",
        "If infeasible_progress blocks the last primary knob family, do not continue that family; switch families with direct bottleneck evidence or set should_stop=true.",
        "If a proposed config is likely to reduce request_rate_per_gpu under the active guard, set should_stop=true instead of exploring.",
--- a/src/aituner/spec.py
+++ b/src/aituner/spec.py
@@ -63,6 +63,16 @@ def _coerce_str_list(value: Any, *, context: str) -> list[str]:
    return result


+def _coerce_text(value: Any, *, context: str) -> str:
+    if isinstance(value, str) and value.strip():
+        return value.strip()
+    if isinstance(value, Mapping) or isinstance(value, list):
+        text = json.dumps(value, ensure_ascii=False, sort_keys=True)
+        if text:
+            return text
+    raise SpecError(f"{context} must be a non-empty string.")
+
+
 def _coerce_int_list(value: Any, *, context: str) -> list[int]:
    if value is None:
        return []
@@ -674,8 +684,8 @@ class Proposal:
                expected_effects, context="proposal.expected_effects"
            )
        return cls(
-            observation=_require_str(data.get("observation"), context="proposal.observation"),
-            diagnosis=_require_str(data.get("diagnosis"), context="proposal.diagnosis"),
+            observation=_coerce_text(data.get("observation"), context="proposal.observation"),
+            diagnosis=_coerce_text(data.get("diagnosis"), context="proposal.diagnosis"),
            config_patch=ConfigPatch.from_dict(
                _require_mapping(data.get("config_patch"), context="proposal.config_patch")
            ),
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -411,11 +411,16 @@ class CoreFlowTests(unittest.TestCase):
            guard = context["convergence_guard"]["strong_incumbent"]
            self.assertTrue(guard["guard_active"])
            self.assertGreaterEqual(guard["incumbent_gain_vs_baseline"], 3.0)
-            self.assertTrue(
+            self.assertFalse(
                context["convergence_guard"][
                    "should_stop_if_no_harness_can_justify_a_new_adjacent_probe"
                ]
            )
+            self.assertEqual(
+                context["convergence_guard"]["reason"],
+                "strong_incumbent_requires_validation_probes",
+            )
+            self.assertIn("validate", guard["recommended_next_action"])

    def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
@@ -2418,6 +2423,21 @@ class CoreFlowTests(unittest.TestCase):
            ["throughput: higher", "ttft: lower"],
        )

+    def test_proposal_observation_accepts_object(self) -> None:
+        proposal = Proposal.from_dict(
+            {
+                "observation": {
+                    "incumbent_trial": "trial-0002",
+                    "boundary_signal": "tpot cliff",
+                },
+                "diagnosis": "validate incumbent",
+                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}},
+                "expected_effects": ["more TPOT headroom"],
+            }
+        )
+        self.assertIn('"incumbent_trial": "trial-0002"', proposal.observation)
+        self.assertEqual(proposal.diagnosis, "validate incumbent")
+
    def test_proposal_accepts_should_stop(self) -> None:
        proposal = Proposal.from_dict(
            {