Report latency stats for infeasible baseline
This commit is contained in:
@@ -19,6 +19,7 @@ Commit: `f212673 Stop tuning when baseline is infeasible`
|
|||||||
Changed behavior:
|
Changed behavior:
|
||||||
|
|
||||||
- `study tune` now persists `tuning_stop_reason` and `tuning_stop_diagnosis` in `state.json`.
|
- `study tune` now persists `tuning_stop_reason` and `tuning_stop_diagnosis` in `state.json`.
|
||||||
|
- `study tune` also persists `tuning_stop_details`, including the lowest sampled probe's TTFT/TPOT mean, p50, p95, and p99.
|
||||||
- After the automatic baseline trial is ingested, AITuner checks the worker result:
|
- After the automatic baseline trial is ingested, AITuner checks the worker result:
|
||||||
- `status == completed`
|
- `status == completed`
|
||||||
- `best_request_rate is None`
|
- `best_request_rate is None`
|
||||||
@@ -106,6 +107,15 @@ Baseline probe curve:
|
|||||||
| 0.001953125 | 0.065000 | 0.205128 | false | `slo_pass_rate_unrecoverable` |
|
| 0.001953125 | 0.065000 | 0.205128 | false | `slo_pass_rate_unrecoverable` |
|
||||||
| 0.0009765625 | 0.035000 | 0.142857 | false | `slo_pass_rate_unrecoverable` |
|
| 0.0009765625 | 0.035000 | 0.142857 | false | `slo_pass_rate_unrecoverable` |
|
||||||
|
|
||||||
|
Latency summary at the lowest sampled request rate:
|
||||||
|
|
||||||
|
| Variant | request rate | pass rate | TTFT mean | TTFT p50 | TTFT p95 | TTFT p99 | TPOT mean | TPOT p50 | TPOT p95 | TPOT p99 |
|
||||||
|
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||||
|
| no-harness | 0.035000 | 0.142857 | 1288.953ms | 446.586ms | 3011.814ms | 3011.814ms | 12.661ms | 13.141ms | 15.097ms | 15.097ms |
|
||||||
|
| harness | 0.035000 | 0.142857 | 1268.090ms | 445.274ms | 2889.080ms | 2889.080ms | 12.658ms | 13.170ms | 15.102ms | 15.102ms |
|
||||||
|
|
||||||
|
This shows that the TPOT threshold of `40ms` is not the binding constraint at the lowest sampled rate. The observed TPOT p99 is about `15.1ms`; failures are driven by TTFT and by the unrecoverable-pass-rate early stop after too many requests have already failed or been skipped.
|
||||||
|
|
||||||
Final diagnosis written by AITuner:
|
Final diagnosis written by AITuner:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
|
|||||||
@@ -23,7 +23,32 @@ def _is_empty_config_patch(proposal: Proposal) -> bool:
|
|||||||
return not proposal.config_patch.env_patch and not proposal.config_patch.flag_patch
|
return not proposal.config_patch.env_patch and not proposal.config_patch.flag_patch
|
||||||
|
|
||||||
|
|
||||||
def _baseline_all_infeasible_diagnosis(result: dict[str, object]) -> str | None:
|
def _latency_percentiles(summary: object, metric: str) -> dict[str, float]:
|
||||||
|
if not isinstance(summary, dict):
|
||||||
|
return {}
|
||||||
|
payload = summary.get(metric)
|
||||||
|
if not isinstance(payload, dict):
|
||||||
|
return {}
|
||||||
|
selected: dict[str, float] = {}
|
||||||
|
for key in ("mean", "p50", "p95", "p99"):
|
||||||
|
value = payload.get(key)
|
||||||
|
if isinstance(value, (int, float)):
|
||||||
|
selected[key] = float(value)
|
||||||
|
return selected
|
||||||
|
|
||||||
|
|
||||||
|
def _format_latency_percentiles(metric: str, values: dict[str, float]) -> str:
|
||||||
|
if not values:
|
||||||
|
return ""
|
||||||
|
ordered = ", ".join(
|
||||||
|
f"{key}={values[key]:.3f}"
|
||||||
|
for key in ("mean", "p50", "p95", "p99")
|
||||||
|
if key in values
|
||||||
|
)
|
||||||
|
return f"{metric}({ordered})"
|
||||||
|
|
||||||
|
|
||||||
|
def _baseline_all_infeasible_stop(result: dict[str, object]) -> tuple[str, dict[str, object]] | None:
|
||||||
if result.get("status") != "completed":
|
if result.get("status") != "completed":
|
||||||
return None
|
return None
|
||||||
if isinstance(result.get("best_request_rate"), (int, float)):
|
if isinstance(result.get("best_request_rate"), (int, float)):
|
||||||
@@ -41,6 +66,20 @@ def _baseline_all_infeasible_diagnosis(result: dict[str, object]) -> str | None:
|
|||||||
lowest_threshold = diagnostics.get("threshold")
|
lowest_threshold = diagnostics.get("threshold")
|
||||||
pass_rate = diagnostics.get("pass_rate")
|
pass_rate = diagnostics.get("pass_rate")
|
||||||
early_stop_reason = str(diagnostics.get("early_stop_reason") or "").strip()
|
early_stop_reason = str(diagnostics.get("early_stop_reason") or "").strip()
|
||||||
|
latency_summary = diagnostics.get("latency_summary")
|
||||||
|
ttft = _latency_percentiles(latency_summary, "ttft_ms")
|
||||||
|
tpot = _latency_percentiles(latency_summary, "tpot_ms")
|
||||||
|
details: dict[str, object] = {
|
||||||
|
"lowest_sampled_request_rate": lowest_rate,
|
||||||
|
"lowest_sampling_u": lowest_threshold,
|
||||||
|
"lowest_probe_pass_rate": pass_rate,
|
||||||
|
"early_stop_reason": early_stop_reason,
|
||||||
|
"lowest_probe_latency_ms": {
|
||||||
|
"ttft": ttft,
|
||||||
|
"tpot": tpot,
|
||||||
|
},
|
||||||
|
"lowest_probe_latency_summary": latency_summary if isinstance(latency_summary, dict) else {},
|
||||||
|
}
|
||||||
pieces = [
|
pieces = [
|
||||||
"Baseline configuration has no feasible probe under the current SLO.",
|
"Baseline configuration has no feasible probe under the current SLO.",
|
||||||
"Stopping tuning because even the lowest sampled request rate did not meet the target pass rate.",
|
"Stopping tuning because even the lowest sampled request rate did not meet the target pass rate.",
|
||||||
@@ -53,7 +92,13 @@ def _baseline_all_infeasible_diagnosis(result: dict[str, object]) -> str | None:
|
|||||||
pieces.append(f"lowest_probe_pass_rate={float(pass_rate):.6g}")
|
pieces.append(f"lowest_probe_pass_rate={float(pass_rate):.6g}")
|
||||||
if early_stop_reason:
|
if early_stop_reason:
|
||||||
pieces.append(f"early_stop_reason={early_stop_reason}")
|
pieces.append(f"early_stop_reason={early_stop_reason}")
|
||||||
return " ".join(pieces)
|
for item in (
|
||||||
|
_format_latency_percentiles("lowest_probe_ttft_ms", ttft),
|
||||||
|
_format_latency_percentiles("lowest_probe_tpot_ms", tpot),
|
||||||
|
):
|
||||||
|
if item:
|
||||||
|
pieces.append(item)
|
||||||
|
return " ".join(pieces), details
|
||||||
|
|
||||||
|
|
||||||
def _study_source_path(study_root: Path) -> Path:
|
def _study_source_path(study_root: Path) -> Path:
|
||||||
@@ -170,6 +215,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
|
|||||||
"stopped": True,
|
"stopped": True,
|
||||||
"reason": state.tuning_stop_reason,
|
"reason": state.tuning_stop_reason,
|
||||||
"diagnosis": state.tuning_stop_diagnosis,
|
"diagnosis": state.tuning_stop_diagnosis,
|
||||||
|
"details": state.tuning_stop_details,
|
||||||
"state_best_trial_id": state.best_trial_id,
|
"state_best_trial_id": state.best_trial_id,
|
||||||
"state_best_request_rate": state.best_request_rate,
|
"state_best_request_rate": state.best_request_rate,
|
||||||
}
|
}
|
||||||
@@ -305,10 +351,12 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
if is_auto_baseline:
|
if is_auto_baseline:
|
||||||
diagnosis = _baseline_all_infeasible_diagnosis(result)
|
stop = _baseline_all_infeasible_stop(result)
|
||||||
if diagnosis is not None:
|
if stop is not None:
|
||||||
|
diagnosis, details = stop
|
||||||
state.tuning_stop_reason = "baseline_all_infeasible"
|
state.tuning_stop_reason = "baseline_all_infeasible"
|
||||||
state.tuning_stop_diagnosis = diagnosis
|
state.tuning_stop_diagnosis = diagnosis
|
||||||
|
state.tuning_stop_details = details
|
||||||
store.save_state(state)
|
store.save_state(state)
|
||||||
executed.append(
|
executed.append(
|
||||||
{
|
{
|
||||||
@@ -316,6 +364,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
|
|||||||
"stopped": True,
|
"stopped": True,
|
||||||
"reason": state.tuning_stop_reason,
|
"reason": state.tuning_stop_reason,
|
||||||
"diagnosis": diagnosis,
|
"diagnosis": diagnosis,
|
||||||
|
"details": details,
|
||||||
"state_best_trial_id": state.best_trial_id,
|
"state_best_trial_id": state.best_trial_id,
|
||||||
"state_best_request_rate": state.best_request_rate,
|
"state_best_request_rate": state.best_request_rate,
|
||||||
}
|
}
|
||||||
@@ -332,6 +381,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
|
|||||||
"best_request_rate": final_state.best_request_rate,
|
"best_request_rate": final_state.best_request_rate,
|
||||||
"tuning_stop_reason": final_state.tuning_stop_reason,
|
"tuning_stop_reason": final_state.tuning_stop_reason,
|
||||||
"tuning_stop_diagnosis": final_state.tuning_stop_diagnosis,
|
"tuning_stop_diagnosis": final_state.tuning_stop_diagnosis,
|
||||||
|
"tuning_stop_details": final_state.tuning_stop_details,
|
||||||
},
|
},
|
||||||
ensure_ascii=False,
|
ensure_ascii=False,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -766,6 +766,7 @@ class StudyState:
|
|||||||
next_trial_index: int = 1
|
next_trial_index: int = 1
|
||||||
tuning_stop_reason: str = ""
|
tuning_stop_reason: str = ""
|
||||||
tuning_stop_diagnosis: str = ""
|
tuning_stop_diagnosis: str = ""
|
||||||
|
tuning_stop_details: dict[str, Any] = field(default_factory=dict)
|
||||||
best_by_parallel_size: dict[str, dict[str, Any]] = field(default_factory=dict)
|
best_by_parallel_size: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||||
trials: list[TrialSummary] = field(default_factory=list)
|
trials: list[TrialSummary] = field(default_factory=list)
|
||||||
|
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ class StudyStore:
|
|||||||
next_trial_index=int(payload.get("next_trial_index", 1)),
|
next_trial_index=int(payload.get("next_trial_index", 1)),
|
||||||
tuning_stop_reason=str(payload.get("tuning_stop_reason") or ""),
|
tuning_stop_reason=str(payload.get("tuning_stop_reason") or ""),
|
||||||
tuning_stop_diagnosis=str(payload.get("tuning_stop_diagnosis") or ""),
|
tuning_stop_diagnosis=str(payload.get("tuning_stop_diagnosis") or ""),
|
||||||
|
tuning_stop_details=dict(payload.get("tuning_stop_details") or {}),
|
||||||
best_by_parallel_size={
|
best_by_parallel_size={
|
||||||
str(key): value
|
str(key): value
|
||||||
for key, value in (payload.get("best_by_parallel_size") or {}).items()
|
for key, value in (payload.get("best_by_parallel_size") or {}).items()
|
||||||
|
|||||||
@@ -3040,6 +3040,22 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
"request_rate": 1.0,
|
"request_rate": 1.0,
|
||||||
"pass_rate": 0.5,
|
"pass_rate": 0.5,
|
||||||
"early_stop_reason": "slo_pass_rate_unrecoverable",
|
"early_stop_reason": "slo_pass_rate_unrecoverable",
|
||||||
|
"latency_summary": {
|
||||||
|
"ttft_ms": {
|
||||||
|
"count": 2,
|
||||||
|
"mean": 1200.0,
|
||||||
|
"p50": 1100.0,
|
||||||
|
"p95": 1900.0,
|
||||||
|
"p99": 1980.0,
|
||||||
|
},
|
||||||
|
"tpot_ms": {
|
||||||
|
"count": 2,
|
||||||
|
"mean": 35.0,
|
||||||
|
"p50": 32.0,
|
||||||
|
"p95": 48.0,
|
||||||
|
"p99": 49.0,
|
||||||
|
},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
(trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8")
|
(trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8")
|
||||||
@@ -3068,6 +3084,15 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
self.assertEqual(len(state.trials), 1)
|
self.assertEqual(len(state.trials), 1)
|
||||||
self.assertEqual(state.tuning_stop_reason, "baseline_all_infeasible")
|
self.assertEqual(state.tuning_stop_reason, "baseline_all_infeasible")
|
||||||
self.assertIn("lowest_sampled_request_rate=1", state.tuning_stop_diagnosis)
|
self.assertIn("lowest_sampled_request_rate=1", state.tuning_stop_diagnosis)
|
||||||
|
self.assertIn("lowest_probe_ttft_ms", state.tuning_stop_diagnosis)
|
||||||
|
self.assertEqual(
|
||||||
|
state.tuning_stop_details["lowest_probe_latency_ms"]["ttft"]["p95"],
|
||||||
|
1900.0,
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
state.tuning_stop_details["lowest_probe_latency_ms"]["tpot"]["p99"],
|
||||||
|
49.0,
|
||||||
|
)
|
||||||
|
|
||||||
with mock.patch("aituner.cli.run_trial") as run_trial_mock:
|
with mock.patch("aituner.cli.run_trial") as run_trial_mock:
|
||||||
with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock:
|
with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock:
|
||||||
|
|||||||
Reference in New Issue
Block a user