Add generic decode-only harness guidance
This commit is contained in:
33 docs/qwen235b-thinking-decode/harness-20260428.md (new file)
@@ -0,0 +1,33 @@
|
||||
# Qwen235B Thinking Decode-Only Harness Run, 2026-04-28
|
||||
|
||||
## Goal
|
||||
|
||||
Run the qwen235b thinking decode-only tuning with the same harness-guided workflow used for the prefill-only test, while keeping the harness generic. The harness must use workload mode, configured SLOs, legal topology constraints, and measured trial history rather than testcase-specific throughput thresholds.
|
||||
|
||||
## Baseline Reference
|
||||
|
||||
The before-harness comparison run is `dash0-qwen235b-decode-thinking-run5-tpot40-topology`:
|
||||
|
||||
| Iter | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|
||||
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
||||
| before harness request/s | 0.1267 | 0.2450 | infeasible | launch fail | infeasible | infeasible | infeasible | infeasible | 0.2817 | infeasible | infeasible | infeasible |
|
||||
|
||||
Before the harness change, the best feasible configuration appeared at iteration 9, reaching 0.2817 request/s.
|
||||
|
||||
## Harness Change
|
||||
|
||||
The decode-only harness now defaults to `decode_tpot` when `trace.request_mode=decode_only` and a TPOT SLO is configured. This avoids treating long decode-only prompt hints as a TTFT-prefill workload.
|
||||
|
||||
Active decode harness families are generic:
|
||||
|
||||
- `tensor-parallel-size`: legal TP/DP redistribution, judged by configured SLO pass rate and request_rate_per_gpu.
|
||||
- `data-parallel-size`: legal replica topology changes for decode/admission bottlenecks.
|
||||
- `max-num-seqs`: concurrency adjustment from observed TPOT failures or SLO headroom.
|
||||
- `max-num-batched-tokens`: decode batching adjustment after topology is stable.
|
||||
- `expert-parallel`: preserve known-valid EP topology, but change EP size only with EP-specific evidence.
|
||||
|
||||
No qwen235b-specific threshold or testcase-specific rule was added.
|
||||
|
||||
## Current Run
|
||||
|
||||
Pending. The next run will use dash0, 8x H20, and store results under `.aituner/harness-qwen235b-decode-20260428`.
|
||||
@@ -77,7 +77,7 @@ def _knob_harnesses(
|
||||
tunable = set(study.engine.tunable_flags)
|
||||
latest = recent_diagnostics[-1] if recent_diagnostics else {}
|
||||
active_bottleneck = latest.get("active_bottleneck") or _workload_default_bottleneck(
|
||||
window_summary
|
||||
study, window_summary
|
||||
)
|
||||
harnesses: list[dict[str, Any]] = []
|
||||
if "tensor-parallel-size" in tunable:
|
||||
@@ -87,18 +87,41 @@ def _knob_harnesses(
|
||||
"use_when": [
|
||||
"TTFT failures dominate, especially on long prompt windows.",
|
||||
"The L profile has a heavy tail and prefill service time is the likely bottleneck.",
|
||||
"For decode-only studies, TP/DP redistribution can be the primary way to trade per-request decode latency against GPU-parallel serving replicas.",
|
||||
],
|
||||
"procedure": [
|
||||
"Probe only adjacent legal TP choices around the incumbent topology.",
|
||||
"Prefer TP up when it lowers long-prefill latency and the projected request rate remains left of the high-TP queueing knee.",
|
||||
"For decode_tpot bottlenecks, compare legal TP/DP redistributions using request_rate_per_gpu and SLO pass rate; prefer smaller TP with larger DP only when the model fits and observed TPOT remains within the configured SLO.",
|
||||
"Prefer TP down or keep moderate TP when communication overhead or concurrency loss becomes visible.",
|
||||
],
|
||||
"guards": [
|
||||
"Do not jump across multiple TP values without a launch-safe reason.",
|
||||
"Do not raise TP for a short-prompt/cache-heavy window if TTFT is already passing and TPOT or queueing is the active bottleneck.",
|
||||
"Do not raise TP for a short-prompt/cache-heavy window if TTFT is already passing and TPOT or queueing is the active bottleneck, unless decode-only TPOT evidence points to under-parallelized decoding.",
|
||||
"Keep TP/DP/EP inside topology_constraints.",
|
||||
],
|
||||
"active_now": active_bottleneck == "ttft_prefill",
|
||||
"active_now": active_bottleneck in {"ttft_prefill", "decode_tpot"},
|
||||
}
|
||||
)
|
||||
if "data-parallel-size" in tunable:
|
||||
harnesses.append(
|
||||
{
|
||||
"knob_family": "data-parallel-size",
|
||||
"use_when": [
|
||||
"Decode-only or cache-heavy workloads need more independent replicas after TTFT is no longer the limiting objective.",
|
||||
"The active bottleneck is decode_tpot or admission/queueing and topology_constraints allow a legal TP/DP redistribution.",
|
||||
],
|
||||
"procedure": [
|
||||
"Change DP only as part of a legal adjacent TP/DP topology move when the product constraints require coupling.",
|
||||
"Evaluate the candidate by request_rate_per_gpu, configured TPOT/TTFT pass rate, and launch stability; do not optimize raw request rate alone.",
|
||||
"Use previous same-topology or adjacent-topology history to decide whether DP improved admission without making token latency infeasible.",
|
||||
],
|
||||
"guards": [
|
||||
"Do not reduce TP so far that decode_tpot failures dominate the configured SLO.",
|
||||
"Do not repeat a topology whose launch failure implicated TP/DP/EP.",
|
||||
"Keep TP/DP/EP inside topology_constraints.",
|
||||
],
|
||||
"active_now": active_bottleneck in {"decode_tpot", "admission_or_queueing"},
|
||||
}
|
||||
)
|
||||
if "max-num-seqs" in tunable:
|
||||
@@ -108,16 +131,18 @@ def _knob_harnesses(
|
||||
"use_when": [
|
||||
"Prefix-cache reuse is high, requests are short-to-moderate after cache hits, and queueing/admission is limiting throughput.",
|
||||
"TTFT is mostly passing but offered load stalls below the target.",
|
||||
"For decode_tpot bottlenecks, decode concurrency is too high or too low relative to the configured TPOT SLO and observed pass-rate headroom.",
|
||||
],
|
||||
"procedure": [
|
||||
"Increase max-num-seqs one step at a time to exploit cache-created parallelism.",
|
||||
"Decrease it if p95 TTFT worsens, prefill queueing appears, or memory pressure causes launch/runtime failures.",
|
||||
"For decode_tpot, lower max-num-seqs when TPOT failures dominate; raise it only when the incumbent has SLO headroom and admission/queueing limits throughput.",
|
||||
"Decrease it if p95 TTFT worsens, prefill queueing appears, TPOT failures dominate, or memory pressure causes launch/runtime failures.",
|
||||
],
|
||||
"guards": [
|
||||
"Avoid large max-num-seqs increases on low-cache or heavy-tail windows.",
|
||||
"Do not combine a max-num-seqs jump with a TP jump unless the history clearly isolates both bottlenecks.",
|
||||
],
|
||||
"active_now": active_bottleneck == "admission_or_queueing",
|
||||
"active_now": active_bottleneck in {"decode_tpot", "admission_or_queueing"},
|
||||
}
|
||||
)
|
||||
if "max-num-batched-tokens" in tunable:
|
||||
@@ -127,17 +152,19 @@ def _knob_harnesses(
|
||||
"use_when": [
|
||||
"Prefill batching is too small for the L profile or TTFT is hurt by excessive chunking overhead.",
|
||||
"GPU work appears fragmented and the incumbent is stable but under-utilized.",
|
||||
"For decode-only workloads, decode batching pressure affects TPOT pass rate after topology is stable.",
|
||||
],
|
||||
"procedure": [
|
||||
"Raise MBT for long prompts when memory headroom and SLO permit.",
|
||||
"Lower MBT if long requests monopolize batches and short-request TTFT regresses.",
|
||||
"For decode_tpot, lower MBT when token-latency failures dominate and raise it only when SLO headroom and under-utilization are both observed.",
|
||||
],
|
||||
"guards": [
|
||||
"Keep MBT changes within a conservative trust region.",
|
||||
"Do not raise MBT after OOM or launch failures involving memory-related knobs.",
|
||||
"Do not raise MBT when the incumbent MBT already covers prompt p99 unless same-topology history proves prefill fragmentation is the bottleneck.",
|
||||
],
|
||||
"active_now": active_bottleneck == "ttft_prefill",
|
||||
"active_now": active_bottleneck in {"ttft_prefill", "decode_tpot"},
|
||||
}
|
||||
)
|
||||
if "enable-chunked-prefill" in tunable:
|
||||
@@ -162,14 +189,17 @@ def _knob_harnesses(
|
||||
"knob_family": "expert-parallel",
|
||||
"use_when": [
|
||||
"Only when history or a capability profile identifies expert communication or MoE dispatch as the active bottleneck.",
|
||||
"For MoE decode-only studies, retain an already valid EP topology when it is part of the baseline constraints or prior best, but change EP only with direct adjacent evidence.",
|
||||
],
|
||||
"procedure": [
|
||||
"Keep expert parallel disabled for pure TTFT/prefill tuning unless there is direct positive evidence for EP on this stack.",
|
||||
"For decode_tpot, preserve the current effective EP setting unless the nearby history shows EP-specific token-latency or launch behavior is the limiting factor.",
|
||||
"If EP is tested, change only EP-related knobs and treat launch/runtime failure as hard negative evidence.",
|
||||
],
|
||||
"guards": [
|
||||
"Do not introduce EP as the first follow-up after a successful TP increase.",
|
||||
"Do not use EP to address generic TTFT-prefill bottlenecks; TP and batching harnesses are the relevant families.",
|
||||
"Do not change EP size for a decode-only run just to search broader topology space; require EP-specific evidence or a topology constraint that forces the current EP.",
|
||||
"Do not enable EP after any launch failure involving expert-parallel knobs.",
|
||||
],
|
||||
"active_now": False,
|
||||
@@ -547,6 +577,8 @@ def _relative_delta(new: float | None, old: float | None) -> float | None:
|
||||
def _proposal_rules() -> list[str]:
|
||||
return [
|
||||
"First decide the active bottleneck from recent_trial_diagnostics.",
|
||||
"For decode_only studies, ignore TTFT unless a TTFT SLO is explicitly configured; prioritize TPOT pass rate and request_rate_per_gpu.",
|
||||
"For decode_tpot bottlenecks, prefer legal TP/DP topology redistribution before runtime-only knobs, then tune max-num-seqs or max-num-batched-tokens only from observed SLO headroom/failures.",
|
||||
"Pick at most one primary knob family from knob_harnesses unless the history proves a coupled change is needed.",
|
||||
"Use adjacent legal values around the incumbent; avoid broad exploratory jumps.",
|
||||
"When strong_incumbent.guard_active is true, do not propose runtime-only tweaks unless the relevant harness guard is positively satisfied by same-topology evidence.",
|
||||
@@ -558,7 +590,13 @@ def _proposal_rules() -> list[str]:
|
||||
]
|
||||
|
||||
|
||||
def _workload_default_bottleneck(window_summary: dict[str, Any]) -> str:
|
||||
def _workload_default_bottleneck(study: StudySpec, window_summary: dict[str, Any]) -> str:
|
||||
if study.trace.request_mode == "decode_only":
|
||||
if study.slo.tpot_rule is not None:
|
||||
return "decode_tpot"
|
||||
if study.slo.ttft_rule is not None:
|
||||
return "ttft_prefill"
|
||||
return "admission_or_queueing"
|
||||
tail_ratio = _as_float(window_summary.get("prompt_tail_ratio_p95_p50"))
|
||||
prompt_p95 = _as_float(window_summary.get("prompt_tokens_p95"))
|
||||
prefix_cache = window_summary.get("prefix_cache")
|
||||
|
||||
@@ -486,6 +486,52 @@ class CoreFlowTests(unittest.TestCase):
|
||||
self.assertIn("There is no TTFT SLO for this study.", prompt)
|
||||
self.assertIn("decode-only", prompt)
|
||||
|
||||
def test_decode_only_harness_defaults_to_decode_tpot(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
study_path = _write_study_assets(
|
||||
tmp_path,
|
||||
trace_overrides={"request_mode": "decode_only"},
|
||||
slo_overrides={
|
||||
"ttft_rule": None,
|
||||
"tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
|
||||
},
|
||||
engine_overrides={
|
||||
"tunable_flags": [
|
||||
"tensor-parallel-size",
|
||||
"data-parallel-size",
|
||||
"max-num-seqs",
|
||||
"max-num-batched-tokens",
|
||||
],
|
||||
"topology_constraints": {
|
||||
"allowed_tensor_parallel_sizes": [1, 2, 4, 8],
|
||||
"allowed_data_parallel_sizes": [1, 2, 4, 8],
|
||||
"allowed_tp_dp_products": [8],
|
||||
"require_tp_dp_product_equals_gpu_count": True,
|
||||
},
|
||||
},
|
||||
)
|
||||
study = load_study_spec(study_path)
|
||||
window, requests = load_trace_requests(study, study_spec_path=study_path)
|
||||
context = build_harness_context(
|
||||
study=study,
|
||||
window_summary=summarize_window(requests, window),
|
||||
state=StudyState(study_id=study.study_id),
|
||||
)
|
||||
active = {
|
||||
harness["knob_family"]
|
||||
for harness in context["knob_harnesses"]
|
||||
if harness["active_now"]
|
||||
}
|
||||
self.assertIn("tensor-parallel-size", active)
|
||||
self.assertIn("data-parallel-size", active)
|
||||
self.assertIn("max-num-seqs", active)
|
||||
self.assertIn("max-num-batched-tokens", active)
|
||||
self.assertIn(
|
||||
"For decode_only studies, ignore TTFT",
|
||||
"\n".join(context["proposal_rules"]),
|
||||
)
|
||||
|
||||
def test_load_study_spec_rejects_mismatched_served_model_name(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
|
||||
Reference in New Issue
Block a user