diff --git a/docs/qwen27b-chat-0-8k-tpot40-baseline-infeasible-20260507.md b/docs/qwen27b-chat-0-8k-tpot40-baseline-infeasible-20260507.md
index 28cff6d..e879d50 100644
--- a/docs/qwen27b-chat-0-8k-tpot40-baseline-infeasible-20260507.md
+++ b/docs/qwen27b-chat-0-8k-tpot40-baseline-infeasible-20260507.md
@@ -19,6 +19,7 @@ Commit: `f212673 Stop tuning when baseline is infeasible`
 Changed behavior:
 
 - `study tune` now persists `tuning_stop_reason` and `tuning_stop_diagnosis` in `state.json`.
+- `study tune` also persists `tuning_stop_details`, including the lowest sampled probe's TTFT/TPOT mean, p50, p95, and p99.
 - After the automatic baseline trial is ingested, AITuner checks the worker result:
   - `status == completed`
   - `best_request_rate is None`
@@ -106,6 +107,15 @@ Baseline probe curve:
 | 0.001953125 | 0.065000 | 0.205128 | false | `slo_pass_rate_unrecoverable` |
 | 0.0009765625 | 0.035000 | 0.142857 | false | `slo_pass_rate_unrecoverable` |
 
+Lowest request rate latency summary:
+
+| Variant | Request rate | Pass rate | TTFT mean | TTFT p50 | TTFT p95 | TTFT p99 | TPOT mean | TPOT p50 | TPOT p95 | TPOT p99 |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
+| no-harness | 0.035000 | 0.142857 | 1288.953ms | 446.586ms | 3011.814ms | 3011.814ms | 12.661ms | 13.141ms | 15.097ms | 15.097ms |
+| harness | 0.035000 | 0.142857 | 1268.090ms | 445.274ms | 2889.080ms | 2889.080ms | 12.658ms | 13.170ms | 15.102ms | 15.102ms |
+
+This shows that the TPOT threshold of `40ms` is not the binding constraint at the lowest sampled rate. The observed TPOT p99 is about `15.1ms`; failures are driven by TTFT and by the unrecoverable-pass-rate early stop after too many requests have already failed or been skipped.
+
 Final diagnosis written by AITuner:
 
 ```text
diff --git a/src/aituner/cli.py b/src/aituner/cli.py
index 84cc325..0ce4d77 100644
--- a/src/aituner/cli.py
+++ b/src/aituner/cli.py
@@ -23,7 +23,32 @@ def _is_empty_config_patch(proposal: Proposal) -> bool:
     return not proposal.config_patch.env_patch and not proposal.config_patch.flag_patch
 
 
-def _baseline_all_infeasible_diagnosis(result: dict[str, object]) -> str | None:
+def _latency_percentiles(summary: object, metric: str) -> dict[str, float]:
+    if not isinstance(summary, dict):
+        return {}
+    payload = summary.get(metric)
+    if not isinstance(payload, dict):
+        return {}
+    selected: dict[str, float] = {}
+    for key in ("mean", "p50", "p95", "p99"):
+        value = payload.get(key)
+        if isinstance(value, (int, float)):
+            selected[key] = float(value)
+    return selected
+
+
+def _format_latency_percentiles(metric: str, values: dict[str, float]) -> str:
+    if not values:
+        return ""
+    ordered = ", ".join(
+        f"{key}={values[key]:.3f}"
+        for key in ("mean", "p50", "p95", "p99")
+        if key in values
+    )
+    return f"{metric}({ordered})"
+
+
+def _baseline_all_infeasible_stop(result: dict[str, object]) -> tuple[str, dict[str, object]] | None:
     if result.get("status") != "completed":
         return None
     if isinstance(result.get("best_request_rate"), (int, float)):
@@ -41,6 +66,20 @@ def _baseline_all_infeasible_diagnosis(result: dict[str, object]) -> str | None:
     lowest_threshold = diagnostics.get("threshold")
     pass_rate = diagnostics.get("pass_rate")
     early_stop_reason = str(diagnostics.get("early_stop_reason") or "").strip()
+    latency_summary = diagnostics.get("latency_summary")
+    ttft = _latency_percentiles(latency_summary, "ttft_ms")
+    tpot = _latency_percentiles(latency_summary, "tpot_ms")
+    details: dict[str, object] = {
+        "lowest_sampled_request_rate": lowest_rate,
+        "lowest_sampling_u": lowest_threshold,
+        "lowest_probe_pass_rate": pass_rate,
+        "early_stop_reason": early_stop_reason,
+        "lowest_probe_latency_ms": {
+            "ttft": ttft,
+            "tpot": tpot,
+        },
+        "lowest_probe_latency_summary": latency_summary if isinstance(latency_summary, dict) else {},
+    }
     pieces = [
         "Baseline configuration has no feasible probe under the current SLO.",
         "Stopping tuning because even the lowest sampled request rate did not meet the target pass rate.",
@@ -53,7 +92,13 @@ def _baseline_all_infeasible_diagnosis(result: dict[str, object]) -> str | None:
         pieces.append(f"lowest_probe_pass_rate={float(pass_rate):.6g}")
     if early_stop_reason:
         pieces.append(f"early_stop_reason={early_stop_reason}")
-    return " ".join(pieces)
+    for item in (
+        _format_latency_percentiles("lowest_probe_ttft_ms", ttft),
+        _format_latency_percentiles("lowest_probe_tpot_ms", tpot),
+    ):
+        if item:
+            pieces.append(item)
+    return " ".join(pieces), details
 
 
 def _study_source_path(study_root: Path) -> Path:
@@ -170,6 +215,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
                 "stopped": True,
                 "reason": state.tuning_stop_reason,
                 "diagnosis": state.tuning_stop_diagnosis,
+                "details": state.tuning_stop_details,
                 "state_best_trial_id": state.best_trial_id,
                 "state_best_request_rate": state.best_request_rate,
             }
@@ -305,10 +351,12 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
                 }
             )
         if is_auto_baseline:
-            diagnosis = _baseline_all_infeasible_diagnosis(result)
-            if diagnosis is not None:
+            stop = _baseline_all_infeasible_stop(result)
+            if stop is not None:
+                diagnosis, details = stop
                 state.tuning_stop_reason = "baseline_all_infeasible"
                 state.tuning_stop_diagnosis = diagnosis
+                state.tuning_stop_details = details
                 store.save_state(state)
                 executed.append(
                     {
@@ -316,6 +364,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
                         "stopped": True,
                         "reason": state.tuning_stop_reason,
                         "diagnosis": diagnosis,
+                        "details": details,
                         "state_best_trial_id": state.best_trial_id,
                         "state_best_request_rate": state.best_request_rate,
                     }
@@ -332,6 +381,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
             "best_request_rate": final_state.best_request_rate,
             "tuning_stop_reason": final_state.tuning_stop_reason,
             "tuning_stop_diagnosis": final_state.tuning_stop_diagnosis,
+            "tuning_stop_details": final_state.tuning_stop_details,
         },
         ensure_ascii=False,
     )
diff --git a/src/aituner/spec.py b/src/aituner/spec.py
index 3ae1120..c743977 100644
--- a/src/aituner/spec.py
+++ b/src/aituner/spec.py
@@ -766,6 +766,7 @@ class StudyState:
     next_trial_index: int = 1
     tuning_stop_reason: str = ""
    tuning_stop_diagnosis: str = ""
+    tuning_stop_details: dict[str, Any] = field(default_factory=dict)
     best_by_parallel_size: dict[str, dict[str, Any]] = field(default_factory=dict)
     trials: list[TrialSummary] = field(default_factory=list)
 
diff --git a/src/aituner/store.py b/src/aituner/store.py
index 356923e..7456431 100644
--- a/src/aituner/store.py
+++ b/src/aituner/store.py
@@ -47,6 +47,7 @@ class StudyStore:
             next_trial_index=int(payload.get("next_trial_index", 1)),
             tuning_stop_reason=str(payload.get("tuning_stop_reason") or ""),
             tuning_stop_diagnosis=str(payload.get("tuning_stop_diagnosis") or ""),
+            tuning_stop_details=dict(payload.get("tuning_stop_details") or {}),
             best_by_parallel_size={
                 str(key): value
                 for key, value in (payload.get("best_by_parallel_size") or {}).items()
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 01248f4..813a361 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -3040,6 +3040,22 @@ class CoreFlowTests(unittest.TestCase):
                 "request_rate": 1.0,
                 "pass_rate": 0.5,
                 "early_stop_reason": "slo_pass_rate_unrecoverable",
+                "latency_summary": {
+                    "ttft_ms": {
+                        "count": 2,
+                        "mean": 1200.0,
+                        "p50": 1100.0,
+                        "p95": 1900.0,
+                        "p99": 1980.0,
+                    },
+                    "tpot_ms": {
+                        "count": 2,
+                        "mean": 35.0,
+                        "p50": 32.0,
+                        "p95": 48.0,
+                        "p99": 49.0,
+                    },
+                },
             },
         }
         (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8")
@@ -3068,6 +3084,15 @@ class CoreFlowTests(unittest.TestCase):
         self.assertEqual(len(state.trials), 1)
         self.assertEqual(state.tuning_stop_reason, "baseline_all_infeasible")
         self.assertIn("lowest_sampled_request_rate=1", state.tuning_stop_diagnosis)
+        self.assertIn("lowest_probe_ttft_ms", state.tuning_stop_diagnosis)
+        self.assertEqual(
+            state.tuning_stop_details["lowest_probe_latency_ms"]["ttft"]["p95"],
+            1900.0,
+        )
+        self.assertEqual(
+            state.tuning_stop_details["lowest_probe_latency_ms"]["tpot"]["p99"],
+            49.0,
+        )
         with mock.patch("aituner.cli.run_trial") as run_trial_mock:
             with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock: