diff --git a/src/aituner/harness.py b/src/aituner/harness.py
index 0d11480..3107c58 100644
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -607,6 +607,7 @@ def _active_bottleneck(probe: dict[str, Any] | None) -> str:
         if not str(k).startswith("ttft")
         and not str(k).startswith("tpot")
         and not str(k).startswith("probe_elapsed_s>")
+        and str(k) != "slo_pass_rate_unrecoverable"
     )
     if ttft_count == 0 and tpot_count == 0 and request_failed_count == 0:
         return "none_obvious"
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 627033b..4cedaf9 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1084,6 +1084,97 @@ class CoreFlowTests(unittest.TestCase):
                 {"max-num-seqs": 32},
             )
 
+    def test_slo_unrecoverable_does_not_mask_latency_bottleneck(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                slo_overrides={
+                    "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
+                    "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 25},
+                },
+                engine_overrides={
+                    "tunable_flags": [
+                        "tensor-parallel-size",
+                        "max-num-seqs",
+                    ],
+                    "topology_constraints": {
+                        "allowed_tensor_parallel_sizes": [1, 2, 4],
+                        "allowed_tp_dp_products": [1, 2, 4],
+                    },
+                },
+            )
+            result_path = tmp_path / "trial-0001.json"
+            result_path.write_text(
+                json.dumps(
+                    {
+                        "status": "completed",
+                        "best_request_rate": 0.065,
+                        "best_request_rate_per_gpu": 0.065,
+                        "best_pass_rate": 1.0,
+                        "probes": [
+                            {
+                                "threshold": 0.015625,
+                                "feasible": False,
+                                "payload": {
+                                    "request_count": 290,
+                                    "pass_rate": 0.041,
+                                    "request_rate": 0.483,
+                                    "early_stop_reason": "slo_pass_rate_unrecoverable",
+                                    "latency_summary": {
+                                        "failed_reason_counts": {
+                                            "ttft_ms>4000.0": 2,
+                                            "tpot_ms>25.0": 14,
+                                            "slo_pass_rate_unrecoverable": 263,
+                                        }
+                                    },
+                                },
+                            },
+                            {
+                                "threshold": 0.001953125,
+                                "feasible": True,
+                                "payload": {
+                                    "request_count": 39,
+                                    "pass_rate": 1.0,
+                                    "request_rate": 0.065,
+                                    "latency_summary": {"failed_reason_counts": {}},
+                                },
+                            },
+                        ],
+                    }
+                ),
+                encoding="utf-8",
+            )
+            study = load_study_spec(study_path)
+            context = build_harness_context(
+                study=study,
+                window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
+                state=StudyState(
+                    study_id=study.study_id,
+                    best_trial_id="trial-0001",
+                    best_request_rate=0.065,
+                    best_request_rate_per_gpu=0.065,
+                    trials=[
+                        TrialSummary(
+                            trial_id="trial-0001",
+                            status="completed",
+                            best_request_rate=0.065,
+                            best_request_rate_per_gpu=0.065,
+                            best_pass_rate=1.0,
+                            result_path=str(result_path),
+                            config_patch={"env_patch": {}, "flag_patch": {}},
+                        )
+                    ],
+                ),
+            )
+            self.assertNotEqual(
+                context["bottleneck_hypotheses"][0]["name"],
+                "admission_or_queueing",
+            )
+            proposal = build_harness_guided_proposal(context)
+            self.assertIsNotNone(proposal)
+            self.assertEqual(proposal.config_patch.flag_patch, {"tensor-parallel-size": 2})
+
     def test_harness_stop_blocked_until_slo_driven_topology_frontier_is_measured(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
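
The sketch below illustrates the counting step the harness hunk patches: before the fix, the bookkeeping key "slo_pass_rate_unrecoverable" (requests skipped after an early stop, 263 of them in the test fixture) was tallied as a genuine request failure and outvoted the 16 real latency violations. Only the counting loop and the "none_obvious" return are visible in the hunk; the classify() helper, its "request_failures" and "latency_bound" labels, and the final comparison are illustrative assumptions, not the actual _active_bottleneck implementation.

# Minimal sketch of the failure-reason counting that the patch adjusts.
# Assumed/hypothetical: the classify() name, the "request_failures" and
# "latency_bound" labels, and the final comparison; the filter conditions
# and the "none_obvious" return come from the hunk itself.
from typing import Any


def classify(failed_reason_counts: dict[str, Any]) -> str:
    ttft_count = sum(
        v for k, v in failed_reason_counts.items() if str(k).startswith("ttft")
    )
    tpot_count = sum(
        v for k, v in failed_reason_counts.items() if str(k).startswith("tpot")
    )
    # Bookkeeping keys must not count as genuine request failures:
    # "probe_elapsed_s>" marks probe-budget timeouts, and (the fix)
    # "slo_pass_rate_unrecoverable" marks requests skipped once the probe
    # early-stops because the SLO pass rate can no longer recover.
    request_failed_count = sum(
        v
        for k, v in failed_reason_counts.items()
        if not str(k).startswith("ttft")
        and not str(k).startswith("tpot")
        and not str(k).startswith("probe_elapsed_s>")
        and str(k) != "slo_pass_rate_unrecoverable"
    )
    if ttft_count == 0 and tpot_count == 0 and request_failed_count == 0:
        return "none_obvious"
    if request_failed_count > ttft_count + tpot_count:
        return "request_failures"
    return "latency_bound"


# With the counts from the new test, the 263 skipped requests no longer
# mask the 2 TTFT + 14 TPOT violations:
counts = {"ttft_ms>4000.0": 2, "tpot_ms>25.0": 14, "slo_pass_rate_unrecoverable": 263}
print(classify(counts))  # -> "latency_bound"; before the fix the 263 entries
                         #    landed in request_failed_count instead

This is also what the new test asserts end to end: the top bottleneck hypothesis is no longer "admission_or_queueing", so the guided proposal attacks latency by raising tensor-parallel-size to 2 rather than tuning admission.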