Use probe sequence bottlenecks in harness

This commit is contained in:
2026-04-28 06:57:45 +08:00
parent 39aa47fbf1
commit a9943e0240
3 changed files with 115 additions and 2 deletions

View File

@@ -30,4 +30,23 @@ No qwen235b-specific threshold or testcase-specific rule was added.
## Current Run
Pending. The next run will use dash0, 8x H20, and store results under `.aituner/harness-qwen235b-decode-20260428`.
Started on dash0, 8x H20.
- Remote spec: `.aituner/harness-qwen235b-decode-20260428/dash0_qwen235b_decode_thinking_harness_20260428.json`
- Remote store: `.aituner/harness-qwen235b-decode-20260428/dash0-qwen235b-decode-thinking-harness-20260428`
- Remote tmux: `aituner_qwen235b_decode_harness_20260428`
- Remote log: `logs/qwen235b_decode_harness_20260428.log`
- Code commit: `39aa47f`
- Verification: local and dash0 both passed `PYTHONPATH=src python3 -m unittest discover -s tests`.
The first attempt started a duplicate `trial-0001` baseline. Because the identical baseline was already measured in run5 and the decode probe can run for many minutes, that duplicate run was stopped and GPUs were freed.
The active run is now seeded from the real run5 baseline and continues from `trial-0002`:
- Remote spec: `.aituner/harness-qwen235b-decode-20260428-seeded/dash0_qwen235b_decode_thinking_harness_seeded_20260428.json`
- Remote store: `.aituner/harness-qwen235b-decode-20260428-seeded/dash0-qwen235b-decode-thinking-harness-seeded-20260428`
- Seeded `trial-0001`: 0.1267 request/s, 0.0158 request/s/GPU, pass rate 0.9868.
## Follow-up Fix
The seeded prompt exposed a generic diagnosis issue: if the best feasible probe had no latency failures, the harness could miss the prior infeasible probe that showed the real bottleneck at higher load. The harness now scans the probe sequence backward and uses the nearest non-trivial bottleneck before falling back to the best feasible probe. This keeps decode-only runs focused on `decode_tpot` after a feasible low-load point, without adding testcase thresholds.

View File

@@ -249,7 +249,10 @@ def _recent_trial_diagnostics(state: StudyState) -> list[dict[str, Any]]:
"best_feasible_probe": _compact_probe(best_probe),
"last_probe": _compact_probe(last_probe),
}
item["active_bottleneck"] = _active_bottleneck(best_probe or last_probe)
item["active_bottleneck"] = _active_bottleneck_from_probe_sequence(
probes,
fallback=best_probe or last_probe,
)
elif result.get("all_infeasible_diagnostics"):
diag = result["all_infeasible_diagnostics"]
item["probe_summary"] = {"all_infeasible": diag}
@@ -345,6 +348,20 @@ def _active_bottleneck(probe: dict[str, Any] | None) -> str:
return "admission_or_queueing"
def _active_bottleneck_from_probe_sequence(
    probes: list[Any],
    *,
    fallback: dict[str, Any] | None,
) -> str:
    """Return the most recent non-trivial bottleneck diagnosed across *probes*.

    Walks the probe sequence from newest to oldest, diagnosing each dict
    entry with ``_active_bottleneck``; the first diagnosis that is neither
    "unknown" nor "none_obvious" wins. If every probe is trivial (or not a
    dict), the diagnosis of *fallback* is returned instead.
    """
    trivial = ("unknown", "none_obvious")
    # Newest-first scan: a later infeasible probe reveals the bottleneck
    # that a clean best-feasible probe would otherwise hide.
    dict_probes = (entry for entry in reversed(probes) if isinstance(entry, dict))
    for entry in dict_probes:
        diagnosis = _active_bottleneck(entry)
        if diagnosis not in trivial:
            return diagnosis
    return _active_bottleneck(fallback)
def _failure_bottleneck(trial: TrialSummary) -> str:
if trial.failure_stage == "engine_launch":
return "launch_or_memory"

View File

@@ -532,6 +532,83 @@ class CoreFlowTests(unittest.TestCase):
"\n".join(context["proposal_rules"]),
)
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
    """A clean feasible probe must not mask the bottleneck that an earlier
    infeasible probe exposed at higher load: the harness should report the
    prior probe's decode_tpot diagnosis and activate the matching knobs."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study_path = _write_study_assets(
            workdir,
            trace_overrides={"request_mode": "decode_only"},
            slo_overrides={
                "ttft_rule": None,
                "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
            },
            engine_overrides={
                "tunable_flags": [
                    "tensor-parallel-size",
                    "data-parallel-size",
                    "max-num-seqs",
                ]
            },
        )
        # Higher-load probe that failed purely on decode TPOT.
        infeasible_probe = {
            "threshold": 0.1,
            "feasible": False,
            "payload": {
                "request_rate": 2.0,
                "pass_rate": 0.1,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"tpot_ms>20.0": 20}
                },
            },
        }
        # Later low-load probe that passed with no latency failures.
        feasible_probe = {
            "threshold": 0.01,
            "feasible": True,
            "payload": {
                "request_rate": 1.0,
                "pass_rate": 1.0,
                "latency_summary": {"failed_reason_counts": {}},
            },
        }
        result_path = workdir / "trial-0001-result.json"
        result_path.write_text(
            json.dumps(
                {
                    "status": "completed",
                    "best_request_rate": 1.0,
                    "best_pass_rate": 1.0,
                    "probes": [infeasible_probe, feasible_probe],
                }
            ),
            encoding="utf-8",
        )
        study = load_study_spec(study_path)
        state = StudyState(
            study_id=study.study_id,
            trials=[
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    result_path=str(result_path),
                )
            ],
        )
        context = build_harness_context(
            study=study,
            window_summary={},
            state=state,
        )
        diagnostics = context["recent_trial_diagnostics"]
        self.assertEqual(diagnostics[0]["active_bottleneck"], "decode_tpot")
        active_families = {
            harness["knob_family"]
            for harness in context["knob_harnesses"]
            if harness["active_now"]
        }
        self.assertIn("data-parallel-size", active_families)
        self.assertIn("max-num-seqs", active_families)
def test_load_study_spec_rejects_mismatched_served_model_name(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)