Report latency stats for infeasible baseline

2026-05-08 11:10:34 +08:00
parent eb137a0b62
commit adc4351e5d
5 changed files with 91 additions and 4 deletions

View File

@@ -19,6 +19,7 @@ Commit: `f212673 Stop tuning when baseline is infeasible`
 Changed behavior:
 - `study tune` now persists `tuning_stop_reason` and `tuning_stop_diagnosis` in `state.json`.
+- `study tune` also persists `tuning_stop_details`, including the lowest sampled probe's TTFT/TPOT mean, p50, p95, and p99.
 - After the automatic baseline trial is ingested, AITuner checks the worker result:
   - `status == completed`
   - `best_request_rate is None`
@@ -106,6 +107,15 @@ Baseline probe curve:
 | 0.001953125 | 0.065000 | 0.205128 | false | `slo_pass_rate_unrecoverable` |
 | 0.0009765625 | 0.035000 | 0.142857 | false | `slo_pass_rate_unrecoverable` |
+
+Lowest request rate latency summary:
+
+| Variant | request rate | pass rate | TTFT mean | TTFT p50 | TTFT p95 | TTFT p99 | TPOT mean | TPOT p50 | TPOT p95 | TPOT p99 |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
+| no-harness | 0.035000 | 0.142857 | 1288.953ms | 446.586ms | 3011.814ms | 3011.814ms | 12.661ms | 13.141ms | 15.097ms | 15.097ms |
+| harness | 0.035000 | 0.142857 | 1268.090ms | 445.274ms | 2889.080ms | 2889.080ms | 12.658ms | 13.170ms | 15.102ms | 15.102ms |
+
+This shows that the TPOT threshold of `40ms` is not the binding constraint at the lowest sampled rate. The observed TPOT p99 is about `15.1ms`; failures are driven by TTFT and by the unrecoverable-pass-rate early stop after too many requests have already failed or been skipped.
 Final diagnosis written by AITuner:
 ```text
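
Reviewer note: for reference, a minimal sketch of the `state.json` fields this commit persists. This is not output from a real run; the shape follows the code hunks below, and the numeric values mirror the unit-test fixture later in this diff. `lowest_sampling_u` is shown as `null` because the fixture does not set a `threshold`, and the diagnosis string is truncated.

```json
{
  "tuning_stop_reason": "baseline_all_infeasible",
  "tuning_stop_diagnosis": "Baseline configuration has no feasible probe under the current SLO. ...",
  "tuning_stop_details": {
    "lowest_sampled_request_rate": 1.0,
    "lowest_sampling_u": null,
    "lowest_probe_pass_rate": 0.5,
    "early_stop_reason": "slo_pass_rate_unrecoverable",
    "lowest_probe_latency_ms": {
      "ttft": {"mean": 1200.0, "p50": 1100.0, "p95": 1900.0, "p99": 1980.0},
      "tpot": {"mean": 35.0, "p50": 32.0, "p95": 48.0, "p99": 49.0}
    },
    "lowest_probe_latency_summary": {
      "ttft_ms": {"count": 2, "mean": 1200.0, "p50": 1100.0, "p95": 1900.0, "p99": 1980.0},
      "tpot_ms": {"count": 2, "mean": 35.0, "p50": 32.0, "p95": 48.0, "p99": 49.0}
    }
  }
}
```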

View File

@@ -23,7 +23,32 @@ def _is_empty_config_patch(proposal: Proposal) -> bool:
 return not proposal.config_patch.env_patch and not proposal.config_patch.flag_patch


-def _baseline_all_infeasible_diagnosis(result: dict[str, object]) -> str | None:
+def _latency_percentiles(summary: object, metric: str) -> dict[str, float]:
+    if not isinstance(summary, dict):
+        return {}
+    payload = summary.get(metric)
+    if not isinstance(payload, dict):
+        return {}
+    selected: dict[str, float] = {}
+    for key in ("mean", "p50", "p95", "p99"):
+        value = payload.get(key)
+        if isinstance(value, (int, float)):
+            selected[key] = float(value)
+    return selected
+
+
+def _format_latency_percentiles(metric: str, values: dict[str, float]) -> str:
+    if not values:
+        return ""
+    ordered = ", ".join(
+        f"{key}={values[key]:.3f}"
+        for key in ("mean", "p50", "p95", "p99")
+        if key in values
+    )
+    return f"{metric}({ordered})"
+
+
+def _baseline_all_infeasible_stop(result: dict[str, object]) -> tuple[str, dict[str, object]] | None:
     if result.get("status") != "completed":
         return None
     if isinstance(result.get("best_request_rate"), (int, float)):
@@ -41,6 +66,20 @@ def _baseline_all_infeasible_diagnosis(result: dict[str, object]) -> str | None:
     lowest_threshold = diagnostics.get("threshold")
     pass_rate = diagnostics.get("pass_rate")
     early_stop_reason = str(diagnostics.get("early_stop_reason") or "").strip()
+    latency_summary = diagnostics.get("latency_summary")
+    ttft = _latency_percentiles(latency_summary, "ttft_ms")
+    tpot = _latency_percentiles(latency_summary, "tpot_ms")
+    details: dict[str, object] = {
+        "lowest_sampled_request_rate": lowest_rate,
+        "lowest_sampling_u": lowest_threshold,
+        "lowest_probe_pass_rate": pass_rate,
+        "early_stop_reason": early_stop_reason,
+        "lowest_probe_latency_ms": {
+            "ttft": ttft,
+            "tpot": tpot,
+        },
+        "lowest_probe_latency_summary": latency_summary if isinstance(latency_summary, dict) else {},
+    }
     pieces = [
         "Baseline configuration has no feasible probe under the current SLO.",
         "Stopping tuning because even the lowest sampled request rate did not meet the target pass rate.",
@@ -53,7 +92,13 @@ def _baseline_all_infeasible_diagnosis(result: dict[str, object]) -> str | None:
pieces.append(f"lowest_probe_pass_rate={float(pass_rate):.6g}") pieces.append(f"lowest_probe_pass_rate={float(pass_rate):.6g}")
if early_stop_reason: if early_stop_reason:
pieces.append(f"early_stop_reason={early_stop_reason}") pieces.append(f"early_stop_reason={early_stop_reason}")
return " ".join(pieces) for item in (
_format_latency_percentiles("lowest_probe_ttft_ms", ttft),
_format_latency_percentiles("lowest_probe_tpot_ms", tpot),
):
if item:
pieces.append(item)
return " ".join(pieces), details
def _study_source_path(study_root: Path) -> Path: def _study_source_path(study_root: Path) -> Path:
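
Reviewer note: a quick sanity check (not part of the commit) of how the two new helpers behave, assuming they are in scope and fed a latency summary shaped like the test fixture further down. Non-percentile keys such as `count` are filtered out, and a missing metric degrades to an empty string so nothing is appended to the diagnosis.

```python
latency_summary = {
    "ttft_ms": {"count": 2, "mean": 1200.0, "p50": 1100.0, "p95": 1900.0, "p99": 1980.0},
    "tpot_ms": {"count": 2, "mean": 35.0, "p50": 32.0, "p95": 48.0, "p99": 49.0},
}

ttft = _latency_percentiles(latency_summary, "ttft_ms")
# {'mean': 1200.0, 'p50': 1100.0, 'p95': 1900.0, 'p99': 1980.0} -- 'count' is dropped

print(_format_latency_percentiles("lowest_probe_ttft_ms", ttft))
# lowest_probe_ttft_ms(mean=1200.000, p50=1100.000, p95=1900.000, p99=1980.000)

print(repr(_format_latency_percentiles("lowest_probe_tpot_ms", {})))
# '' -- empty dict yields an empty string, so the diagnosis gains no fragment
```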
@@ -170,6 +215,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
"stopped": True, "stopped": True,
"reason": state.tuning_stop_reason, "reason": state.tuning_stop_reason,
"diagnosis": state.tuning_stop_diagnosis, "diagnosis": state.tuning_stop_diagnosis,
"details": state.tuning_stop_details,
"state_best_trial_id": state.best_trial_id, "state_best_trial_id": state.best_trial_id,
"state_best_request_rate": state.best_request_rate, "state_best_request_rate": state.best_request_rate,
} }
@@ -305,10 +351,12 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
             }
         )
         if is_auto_baseline:
-            diagnosis = _baseline_all_infeasible_diagnosis(result)
-            if diagnosis is not None:
+            stop = _baseline_all_infeasible_stop(result)
+            if stop is not None:
+                diagnosis, details = stop
                 state.tuning_stop_reason = "baseline_all_infeasible"
                 state.tuning_stop_diagnosis = diagnosis
+                state.tuning_stop_details = details
                 store.save_state(state)
                 executed.append(
                     {
@@ -316,6 +364,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
"stopped": True, "stopped": True,
"reason": state.tuning_stop_reason, "reason": state.tuning_stop_reason,
"diagnosis": diagnosis, "diagnosis": diagnosis,
"details": details,
"state_best_trial_id": state.best_trial_id, "state_best_trial_id": state.best_trial_id,
"state_best_request_rate": state.best_request_rate, "state_best_request_rate": state.best_request_rate,
} }
@@ -332,6 +381,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
"best_request_rate": final_state.best_request_rate, "best_request_rate": final_state.best_request_rate,
"tuning_stop_reason": final_state.tuning_stop_reason, "tuning_stop_reason": final_state.tuning_stop_reason,
"tuning_stop_diagnosis": final_state.tuning_stop_diagnosis, "tuning_stop_diagnosis": final_state.tuning_stop_diagnosis,
"tuning_stop_details": final_state.tuning_stop_details,
}, },
ensure_ascii=False, ensure_ascii=False,
) )

View File

@@ -766,6 +766,7 @@ class StudyState:
     next_trial_index: int = 1
     tuning_stop_reason: str = ""
     tuning_stop_diagnosis: str = ""
+    tuning_stop_details: dict[str, Any] = field(default_factory=dict)
     best_by_parallel_size: dict[str, dict[str, Any]] = field(default_factory=dict)
     trials: list[TrialSummary] = field(default_factory=list)
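
Reviewer note on the `field(default_factory=dict)` default: dataclasses reject bare mutable defaults, and the factory guarantees each `StudyState` instance gets its own dict. A generic illustration (hypothetical `Example` class, not repo code):

```python
from dataclasses import dataclass, field
from typing import Any

@dataclass
class Example:
    # `details: dict[str, Any] = {}` would raise ValueError at class
    # definition time; default_factory builds a fresh dict per instance.
    details: dict[str, Any] = field(default_factory=dict)

a, b = Example(), Example()
a.details["k"] = 1
assert b.details == {}  # no shared state between instances
```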

View File

@@ -47,6 +47,7 @@ class StudyStore:
             next_trial_index=int(payload.get("next_trial_index", 1)),
             tuning_stop_reason=str(payload.get("tuning_stop_reason") or ""),
             tuning_stop_diagnosis=str(payload.get("tuning_stop_diagnosis") or ""),
+            tuning_stop_details=dict(payload.get("tuning_stop_details") or {}),
             best_by_parallel_size={
                 str(key): value
                 for key, value in (payload.get("best_by_parallel_size") or {}).items()
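
Reviewer note: the `or {}` in the loader keeps older state files readable, since both a missing key and an explicit `null` collapse to an empty dict. A minimal illustration with hypothetical payloads (not repo code):

```python
# state.json written before this commit: the key is absent entirely.
old_payload: dict[str, object] = {"tuning_stop_reason": ""}
assert dict(old_payload.get("tuning_stop_details") or {}) == {}

# state.json with an explicit null (json.load turns null into None).
null_payload: dict[str, object] = {"tuning_stop_details": None}
assert dict(null_payload.get("tuning_stop_details") or {}) == {}
```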

View File

@@ -3040,6 +3040,22 @@ class CoreFlowTests(unittest.TestCase):
"request_rate": 1.0, "request_rate": 1.0,
"pass_rate": 0.5, "pass_rate": 0.5,
"early_stop_reason": "slo_pass_rate_unrecoverable", "early_stop_reason": "slo_pass_rate_unrecoverable",
"latency_summary": {
"ttft_ms": {
"count": 2,
"mean": 1200.0,
"p50": 1100.0,
"p95": 1900.0,
"p99": 1980.0,
},
"tpot_ms": {
"count": 2,
"mean": 35.0,
"p50": 32.0,
"p95": 48.0,
"p99": 49.0,
},
},
}, },
} }
(trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8") (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8")
@@ -3068,6 +3084,15 @@ class CoreFlowTests(unittest.TestCase):
         self.assertEqual(len(state.trials), 1)
         self.assertEqual(state.tuning_stop_reason, "baseline_all_infeasible")
         self.assertIn("lowest_sampled_request_rate=1", state.tuning_stop_diagnosis)
+        self.assertIn("lowest_probe_ttft_ms", state.tuning_stop_diagnosis)
+        self.assertEqual(
+            state.tuning_stop_details["lowest_probe_latency_ms"]["ttft"]["p95"],
+            1900.0,
+        )
+        self.assertEqual(
+            state.tuning_stop_details["lowest_probe_latency_ms"]["tpot"]["p99"],
+            49.0,
+        )

         with mock.patch("aituner.cli.run_trial") as run_trial_mock:
             with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock: