Use probe sequence bottlenecks in harness
This commit is contained in:
@@ -30,4 +30,23 @@ No qwen235b-specific threshold or testcase-specific rule was added.
|
|||||||
|
|
||||||
## Current Run
|
|
||||||
Pending. The next run will use dash0, 8x H20, and store results under `.aituner/harness-qwen235b-decode-20260428`.
|
Started on dash0, 8x H20.
|
||||||
|
|
||||||
|
- Remote spec: `.aituner/harness-qwen235b-decode-20260428/dash0_qwen235b_decode_thinking_harness_20260428.json`
|
||||||
|
- Remote store: `.aituner/harness-qwen235b-decode-20260428/dash0-qwen235b-decode-thinking-harness-20260428`
|
||||||
|
- Remote tmux: `aituner_qwen235b_decode_harness_20260428`
|
||||||
|
- Remote log: `logs/qwen235b_decode_harness_20260428.log`
|
||||||
|
- Code commit: `39aa47f`
|
||||||
|
- Verification: local and dash0 both passed `PYTHONPATH=src python3 -m unittest discover -s tests`.
|
||||||
|
|
||||||
|
The first attempt started a duplicate `trial-0001` baseline. Because the identical baseline was already measured in run5 and the decode probe can run for many minutes, that duplicate run was stopped and GPUs were freed.
|
||||||
|
|
||||||
|
The active run is now seeded from the real run5 baseline and continues from `trial-0002`:
|
||||||
|
|
||||||
|
- Remote spec: `.aituner/harness-qwen235b-decode-20260428-seeded/dash0_qwen235b_decode_thinking_harness_seeded_20260428.json`
|
||||||
|
- Remote store: `.aituner/harness-qwen235b-decode-20260428-seeded/dash0-qwen235b-decode-thinking-harness-seeded-20260428`
|
||||||
|
- Seeded `trial-0001`: 0.1267 request/s, 0.0158 request/s/GPU, pass rate 0.9868.
|
||||||
|
|
||||||
|
## Follow-up Fix
|
||||||
|
|
||||||
|
The seeded prompt exposed a generic diagnosis issue: if the best feasible probe had no latency failures, the harness could miss the prior infeasible probe that showed the real bottleneck at higher load. The harness now scans the probe sequence backward and uses the nearest non-trivial bottleneck before falling back to the best feasible probe. This keeps decode-only runs focused on `decode_tpot` after a feasible low-load point, without adding testcase thresholds.
|
||||||
|
|||||||
@@ -249,7 +249,10 @@ def _recent_trial_diagnostics(state: StudyState) -> list[dict[str, Any]]:
|
|||||||
"best_feasible_probe": _compact_probe(best_probe),
|
"best_feasible_probe": _compact_probe(best_probe),
|
||||||
"last_probe": _compact_probe(last_probe),
|
"last_probe": _compact_probe(last_probe),
|
||||||
}
|
}
|
||||||
item["active_bottleneck"] = _active_bottleneck(best_probe or last_probe)
|
item["active_bottleneck"] = _active_bottleneck_from_probe_sequence(
|
||||||
|
probes,
|
||||||
|
fallback=best_probe or last_probe,
|
||||||
|
)
|
||||||
elif result.get("all_infeasible_diagnostics"):
|
elif result.get("all_infeasible_diagnostics"):
|
||||||
diag = result["all_infeasible_diagnostics"]
|
diag = result["all_infeasible_diagnostics"]
|
||||||
item["probe_summary"] = {"all_infeasible": diag}
|
item["probe_summary"] = {"all_infeasible": diag}
|
||||||
@@ -345,6 +348,20 @@ def _active_bottleneck(probe: dict[str, Any] | None) -> str:
|
|||||||
return "admission_or_queueing"
|
return "admission_or_queueing"
|
||||||
|
|
||||||
|
|
||||||
|
def _active_bottleneck_from_probe_sequence(
    probes: list[Any],
    *,
    fallback: dict[str, Any] | None,
) -> str:
    """Return the most recent informative bottleneck label in *probes*.

    Scans the probe sequence from newest to oldest, classifying each
    dict-shaped probe with ``_active_bottleneck``; the first label that
    is neither ``"unknown"`` nor ``"none_obvious"`` wins. When no probe
    yields an informative label, the classification of *fallback* is
    returned instead.
    """
    # Non-dict entries carry no diagnostics, so filter them up front.
    candidates = (probe for probe in reversed(probes) if isinstance(probe, dict))
    for candidate in candidates:
        label = _active_bottleneck(candidate)
        if label != "unknown" and label != "none_obvious":
            return label
    return _active_bottleneck(fallback)
|
||||||
|
|
||||||
|
|
||||||
def _failure_bottleneck(trial: TrialSummary) -> str:
|
def _failure_bottleneck(trial: TrialSummary) -> str:
|
||||||
if trial.failure_stage == "engine_launch":
|
if trial.failure_stage == "engine_launch":
|
||||||
return "launch_or_memory"
|
return "launch_or_memory"
|
||||||
|
|||||||
@@ -532,6 +532,83 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
"\n".join(context["proposal_rules"]),
|
"\n".join(context["proposal_rules"]),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
    """An earlier infeasible probe, not the best feasible one, should set the bottleneck."""
    # Probe that failed at higher load with clear TPOT violations.
    infeasible_probe = {
        "threshold": 0.1,
        "feasible": False,
        "payload": {
            "request_rate": 2.0,
            "pass_rate": 0.1,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {
                "failed_reason_counts": {"tpot_ms>20.0": 20}
            },
        },
    }
    # Later probe that passed cleanly at low load (no latency failures).
    feasible_probe = {
        "threshold": 0.01,
        "feasible": True,
        "payload": {
            "request_rate": 1.0,
            "pass_rate": 1.0,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_request_rate": 1.0,
        "best_pass_rate": 1.0,
        "probes": [infeasible_probe, feasible_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(
            root,
            trace_overrides={"request_mode": "decode_only"},
            slo_overrides={
                "ttft_rule": None,
                "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
            },
            engine_overrides={
                "tunable_flags": [
                    "tensor-parallel-size",
                    "data-parallel-size",
                    "max-num-seqs",
                ]
            },
        )
        trial_result_path = root / "trial-0001-result.json"
        trial_result_path.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(spec_path)
        trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            result_path=str(trial_result_path),
        )
        context = build_harness_context(
            study=study,
            window_summary={},
            state=StudyState(study_id=study.study_id, trials=[trial]),
        )
        # The nearest non-trivial bottleneck (decode TPOT) should win over
        # the feasible probe's empty failure counts.
        self.assertEqual(
            context["recent_trial_diagnostics"][0]["active_bottleneck"],
            "decode_tpot",
        )
        active_families = set()
        for harness in context["knob_harnesses"]:
            if harness["active_now"]:
                active_families.add(harness["knob_family"])
        self.assertIn("data-parallel-size", active_families)
        self.assertIn("max-num-seqs", active_families)
|
||||||
|
|
||||||
def test_load_study_spec_rejects_mismatched_served_model_name(self) -> None:
|
def test_load_study_spec_rejects_mismatched_served_model_name(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
tmp_path = Path(tmp)
|
tmp_path = Path(tmp)
|
||||||
|
|||||||
Reference in New Issue
Block a user