Add generic decode-only harness guidance

2026-04-28 06:46:18 +08:00
parent 71902b9fc2
commit 39aa47fbf1
3 changed files with 124 additions and 7 deletions

View File

@@ -0,0 +1,33 @@
# Qwen235B Thinking Decode-Only Harness Run, 2026-04-28
## Goal
Run decode-only tuning for qwen235b thinking with the same harness-guided workflow used for the prefill-only test, while keeping the harness generic. The harness must rely on workload mode, configured SLOs, legal topology constraints, and measured trial history rather than on testcase-specific throughput thresholds.
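
All decode-specific behavior is driven by the study spec, not by any qwen235b rule. A minimal sketch of the relevant fields, reusing the override names from the new unit test (the single-dict grouping and the 20 ms threshold here are illustrative):

```python
# Illustrative study-spec fragment: everything the harness keys on is configuration.
decode_only_study = {
    "trace": {"request_mode": "decode_only"},  # workload mode
    "slo": {
        "ttft_rule": None,  # no TTFT SLO configured for this study
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},  # TPOT SLO drives tuning
    },
    "engine": {
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_tp_dp_products": [8],  # TP x DP must fill the 8-GPU node
        },
    },
}
```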
## Baseline Reference
The before-harness comparison run is `dash0-qwen235b-decode-thinking-run5-tpot40-topology`:
| Iter | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| before harness request/s | 0.1267 | 0.2450 | infeasible | launch fail | infeasible | infeasible | infeasible | infeasible | 0.2817 | infeasible | infeasible | infeasible |
Before harness, the best feasible config appeared at iter 9 with 0.2817 request/s.
## Harness Change
The decode-only harness now defaults to `decode_tpot` when `trace.request_mode=decode_only` and a TPOT SLO is configured. This avoids treating long decode-only prompt hints as a TTFT-prefill workload.
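
A condensed sketch of the new default selection (the real `_workload_default_bottleneck` also receives the window summary, which still drives non-decode-only workloads; see the diff below):

```python
# Condensed sketch of the new default-bottleneck rule for decode-only studies.
def default_bottleneck(study) -> str:
    if study.trace.request_mode == "decode_only":
        if study.slo.tpot_rule is not None:
            return "decode_tpot"  # a TPOT SLO is configured: token latency leads
        if study.slo.ttft_rule is not None:
            return "ttft_prefill"  # only a TTFT SLO is configured
        return "admission_or_queueing"  # no latency SLO: admission/queueing leads
    ...  # prefill/mixed workloads keep the window-statistics heuristic
```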
Active decode harness families are generic:
- `tensor-parallel-size`: legal TP/DP redistribution, judged by configured SLO pass rate and request_rate_per_gpu.
- `data-parallel-size`: legal replica topology changes for decode/admission bottlenecks.
- `max-num-seqs`: concurrency adjustment from observed TPOT failures or SLO headroom.
- `max-num-batched-tokens`: decode batching adjustment after topology is stable.
- `expert-parallel`: preserve known-valid EP topology, but change EP size only with EP-specific evidence.
No qwen235b-specific threshold or testcase-specific rule was added; every family uses the same generic record shape, sketched below.
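
Each family is the same generic record the proposer consumes, and activation is derived from the diagnosed bottleneck rather than hard-coded per model. A minimal sketch using the data-parallel-size family (strings abbreviated from the diff below):

```python
# Minimal sketch of a generic harness entry; no model-specific thresholds.
active_bottleneck = "decode_tpot"  # decided first from recent trial diagnostics
harness = {
    "knob_family": "data-parallel-size",
    "use_when": ["The active bottleneck is decode_tpot or admission/queueing."],
    "procedure": ["Change DP only via a legal adjacent TP/DP topology move."],
    "guards": ["Keep TP/DP/EP inside topology_constraints."],
    "active_now": active_bottleneck in {"decode_tpot", "admission_or_queueing"},
}
```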
## Current Run
Pending. The next run will execute on dash0 with 8x H20 GPUs and store results under `.aituner/harness-qwen235b-decode-20260428`.
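
Under the topology constraints exercised by the new test (TP and DP in {1, 2, 4, 8}, TP x DP equal to the 8-GPU count), the legal decode topologies the harness may redistribute across can be enumerated directly:

```python
# Enumerate the legal (TP, DP) pairs for an 8-GPU node under the test's constraints.
allowed = [1, 2, 4, 8]
gpu_count = 8
legal = [(tp, dp) for tp in allowed for dp in allowed if tp * dp == gpu_count]
print(legal)  # [(1, 8), (2, 4), (4, 2), (8, 1)]
```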

View File

@@ -77,7 +77,7 @@ def _knob_harnesses(
     tunable = set(study.engine.tunable_flags)
     latest = recent_diagnostics[-1] if recent_diagnostics else {}
     active_bottleneck = latest.get("active_bottleneck") or _workload_default_bottleneck(
-        window_summary
+        study, window_summary
     )
     harnesses: list[dict[str, Any]] = []
     if "tensor-parallel-size" in tunable:
@@ -87,18 +87,41 @@
                 "use_when": [
                     "TTFT failures dominate, especially on long prompt windows.",
                     "The L profile has a heavy tail and prefill service time is the likely bottleneck.",
+                    "For decode-only studies, TP/DP redistribution can be the primary way to trade per-request decode latency against GPU-parallel serving replicas.",
                 ],
                 "procedure": [
                     "Probe only adjacent legal TP choices around the incumbent topology.",
                     "Prefer TP up when it lowers long-prefill latency and the projected request rate remains left of the high-TP queueing knee.",
+                    "For decode_tpot bottlenecks, compare legal TP/DP redistributions using request_rate_per_gpu and SLO pass rate; prefer smaller TP with larger DP only when the model fits and observed TPOT remains within the configured SLO.",
                     "Prefer TP down or keep moderate TP when communication overhead or concurrency loss becomes visible.",
                 ],
                 "guards": [
                     "Do not jump across multiple TP values without a launch-safe reason.",
-                    "Do not raise TP for a short-prompt/cache-heavy window if TTFT is already passing and TPOT or queueing is the active bottleneck.",
+                    "Do not raise TP for a short-prompt/cache-heavy window if TTFT is already passing and TPOT or queueing is the active bottleneck, unless decode-only TPOT evidence points to under-parallelized decoding.",
                     "Keep TP/DP/EP inside topology_constraints.",
                 ],
-                "active_now": active_bottleneck == "ttft_prefill",
+                "active_now": active_bottleneck in {"ttft_prefill", "decode_tpot"},
+            }
+        )
+    if "data-parallel-size" in tunable:
+        harnesses.append(
+            {
+                "knob_family": "data-parallel-size",
+                "use_when": [
+                    "Decode-only or cache-heavy workloads need more independent replicas after TTFT is no longer the limiting objective.",
+                    "The active bottleneck is decode_tpot or admission/queueing and topology_constraints allow a legal TP/DP redistribution.",
+                ],
+                "procedure": [
+                    "Change DP only as part of a legal adjacent TP/DP topology move when the product constraints require coupling.",
+                    "Evaluate the candidate by request_rate_per_gpu, configured TPOT/TTFT pass rate, and launch stability; do not optimize raw request rate alone.",
+                    "Use previous same-topology or adjacent-topology history to decide whether DP improved admission without making token latency infeasible.",
+                ],
+                "guards": [
+                    "Do not reduce TP so far that decode_tpot failures dominate the configured SLO.",
+                    "Do not repeat a topology whose launch failure implicated TP/DP/EP.",
+                    "Keep TP/DP/EP inside topology_constraints.",
+                ],
+                "active_now": active_bottleneck in {"decode_tpot", "admission_or_queueing"},
             }
         )
     if "max-num-seqs" in tunable:
@@ -108,16 +131,18 @@
                 "use_when": [
                     "Prefix-cache reuse is high, requests are short-to-moderate after cache hits, and queueing/admission is limiting throughput.",
                     "TTFT is mostly passing but offered load stalls below the target.",
+                    "For decode_tpot bottlenecks, decode concurrency is too high or too low relative to the configured TPOT SLO and observed pass-rate headroom.",
                 ],
                 "procedure": [
                     "Increase max-num-seqs one step at a time to exploit cache-created parallelism.",
-                    "Decrease it if p95 TTFT worsens, prefill queueing appears, or memory pressure causes launch/runtime failures.",
+                    "For decode_tpot, lower max-num-seqs when TPOT failures dominate; raise it only when the incumbent has SLO headroom and admission/queueing limits throughput.",
+                    "Decrease it if p95 TTFT worsens, prefill queueing appears, TPOT failures dominate, or memory pressure causes launch/runtime failures.",
                 ],
                 "guards": [
                     "Avoid large max-num-seqs increases on low-cache or heavy-tail windows.",
                     "Do not combine a max-num-seqs jump with a TP jump unless the history clearly isolates both bottlenecks.",
                 ],
-                "active_now": active_bottleneck == "admission_or_queueing",
+                "active_now": active_bottleneck in {"decode_tpot", "admission_or_queueing"},
             }
         )
     if "max-num-batched-tokens" in tunable:
@@ -127,17 +152,19 @@
                 "use_when": [
                     "Prefill batching is too small for the L profile or TTFT is hurt by excessive chunking overhead.",
                     "GPU work appears fragmented and the incumbent is stable but under-utilized.",
+                    "For decode-only workloads, decode batching pressure affects TPOT pass rate after topology is stable.",
                 ],
                 "procedure": [
                     "Raise MBT for long prompts when memory headroom and SLO permit.",
                     "Lower MBT if long requests monopolize batches and short-request TTFT regresses.",
+                    "For decode_tpot, lower MBT when token-latency failures dominate and raise it only when SLO headroom and under-utilization are both observed.",
                 ],
                 "guards": [
                     "Keep MBT changes within a conservative trust region.",
                     "Do not raise MBT after OOM or launch failures involving memory-related knobs.",
                     "Do not raise MBT when the incumbent MBT already covers prompt p99 unless same-topology history proves prefill fragmentation is the bottleneck.",
                 ],
-                "active_now": active_bottleneck == "ttft_prefill",
+                "active_now": active_bottleneck in {"ttft_prefill", "decode_tpot"},
             }
         )
     if "enable-chunked-prefill" in tunable:
@@ -162,14 +189,17 @@
                 "knob_family": "expert-parallel",
                 "use_when": [
                     "Only when history or a capability profile identifies expert communication or MoE dispatch as the active bottleneck.",
+                    "For MoE decode-only studies, retain an already valid EP topology when it is part of the baseline constraints or prior best, but change EP only with direct adjacent evidence.",
                 ],
                 "procedure": [
                     "Keep expert parallel disabled for pure TTFT/prefill tuning unless there is direct positive evidence for EP on this stack.",
+                    "For decode_tpot, preserve the current effective EP setting unless the nearby history shows EP-specific token-latency or launch behavior is the limiting factor.",
                     "If EP is tested, change only EP-related knobs and treat launch/runtime failure as hard negative evidence.",
                 ],
                 "guards": [
                     "Do not introduce EP as the first follow-up after a successful TP increase.",
                     "Do not use EP to address generic TTFT-prefill bottlenecks; TP and batching harnesses are the relevant families.",
+                    "Do not change EP size for a decode-only run just to search broader topology space; require EP-specific evidence or a topology constraint that forces the current EP.",
                     "Do not enable EP after any launch failure involving expert-parallel knobs.",
                 ],
                 "active_now": False,
@@ -547,6 +577,8 @@ def _relative_delta(new: float | None, old: float | None) -> float | None:
 def _proposal_rules() -> list[str]:
     return [
         "First decide the active bottleneck from recent_trial_diagnostics.",
+        "For decode_only studies, ignore TTFT unless a TTFT SLO is explicitly configured; prioritize TPOT pass rate and request_rate_per_gpu.",
+        "For decode_tpot bottlenecks, prefer legal TP/DP topology redistribution before runtime-only knobs, then tune max-num-seqs or max-num-batched-tokens only from observed SLO headroom/failures.",
         "Pick at most one primary knob family from knob_harnesses unless the history proves a coupled change is needed.",
         "Use adjacent legal values around the incumbent; avoid broad exploratory jumps.",
         "When strong_incumbent.guard_active is true, do not propose runtime-only tweaks unless the relevant harness guard is positively satisfied by same-topology evidence.",
@@ -558,7 +590,13 @@ def _proposal_rules() -> list[str]:
     ]
 
 
-def _workload_default_bottleneck(window_summary: dict[str, Any]) -> str:
+def _workload_default_bottleneck(study: StudySpec, window_summary: dict[str, Any]) -> str:
+    if study.trace.request_mode == "decode_only":
+        if study.slo.tpot_rule is not None:
+            return "decode_tpot"
+        if study.slo.ttft_rule is not None:
+            return "ttft_prefill"
+        return "admission_or_queueing"
     tail_ratio = _as_float(window_summary.get("prompt_tail_ratio_p95_p50"))
     prompt_p95 = _as_float(window_summary.get("prompt_tokens_p95"))
     prefix_cache = window_summary.get("prefix_cache")

View File

@@ -486,6 +486,52 @@ class CoreFlowTests(unittest.TestCase):
         self.assertIn("There is no TTFT SLO for this study.", prompt)
         self.assertIn("decode-only", prompt)
 
+    def test_decode_only_harness_defaults_to_decode_tpot(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={"request_mode": "decode_only"},
+                slo_overrides={
+                    "ttft_rule": None,
+                    "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
+                },
+                engine_overrides={
+                    "tunable_flags": [
+                        "tensor-parallel-size",
+                        "data-parallel-size",
+                        "max-num-seqs",
+                        "max-num-batched-tokens",
+                    ],
+                    "topology_constraints": {
+                        "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
+                        "allowed_data_parallel_sizes": [1, 2, 4, 8],
+                        "allowed_tp_dp_products": [8],
+                        "require_tp_dp_product_equals_gpu_count": True,
+                    },
+                },
+            )
+            study = load_study_spec(study_path)
+            window, requests = load_trace_requests(study, study_spec_path=study_path)
+            context = build_harness_context(
+                study=study,
+                window_summary=summarize_window(requests, window),
+                state=StudyState(study_id=study.study_id),
+            )
+            active = {
+                harness["knob_family"]
+                for harness in context["knob_harnesses"]
+                if harness["active_now"]
+            }
+            self.assertIn("tensor-parallel-size", active)
+            self.assertIn("data-parallel-size", active)
+            self.assertIn("max-num-seqs", active)
+            self.assertIn("max-num-batched-tokens", active)
+            self.assertIn(
+                "For decode_only studies, ignore TTFT",
+                "\n".join(context["proposal_rules"]),
+            )
+
     def test_load_study_spec_rejects_mismatched_served_model_name(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)