From 14259fcec9bda7a74d475e127d0fc06fa350f0f1 Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Sun, 10 May 2026 14:30:34 +0800
Subject: [PATCH] Measure lower-range performance for infeasible trials

---
 ...qwen235b-thinking-prefill-ttft-20260510.md |  4 +-
 ...wen27b-chat-0-8k-ttft4s-tpot25-20260510.md |  4 +-
 src/aituner/worker.py                         | 87 ++++++++++++++-----
 tests/test_core_flow.py                       | 84 ++++++++++++++++++
 4 files changed, 157 insertions(+), 22 deletions(-)

diff --git a/docs/harness-ablation/qwen235b-thinking-prefill-ttft-20260510.md b/docs/harness-ablation/qwen235b-thinking-prefill-ttft-20260510.md
index c7fd2f5..ead8cfa 100644
--- a/docs/harness-ablation/qwen235b-thinking-prefill-ttft-20260510.md
+++ b/docs/harness-ablation/qwen235b-thinking-prefill-ttft-20260510.md
@@ -26,7 +26,9 @@ Both runs were launched through `python3 -m aituner.cli study tune`; no proposal
 
 The table below is the raw per-iteration performance for a Fig18-style plot. Use this table as `perf[i]`; do not replace missing points with `max(perf[:i+1])`.
 
-Metric: `best_request_rate_per_gpu` from that trial's own `result.json`. `NA` means the proposed config did not produce a feasible point under the SLO, either because the engine/probe failed or because every sampled probe was infeasible.
+Metric: `best_request_rate_per_gpu` from that trial's own `result.json`. `NA` means the proposed config did not produce a feasible point in the measured search range, either because the engine/probe failed or because every sampled probe was infeasible.
+
+Important caveat: these runs were produced before the lower-range fallback fix. For same-parallel-size runtime patches, AITuner inherited the incumbent `sampling_u` as the new search floor. If the config was infeasible above that floor, the old worker wrote `NA` without searching below the floor. Therefore the `NA` entries below are not complete Fig18-quality raw performance points; they mean "no feasible point above the inherited floor." A rerun with the fixed worker is required to fill in their true lower-load performance.
 
 | Variant | iter1 | iter2 | iter3 | iter4 | iter5 | iter6 | iter7 | iter8 | iter9 | iter10 | iter11 | iter12 |
 | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
diff --git a/docs/harness-ablation/qwen27b-chat-0-8k-ttft4s-tpot25-20260510.md b/docs/harness-ablation/qwen27b-chat-0-8k-ttft4s-tpot25-20260510.md
index 7471f5e..4797cf6 100644
--- a/docs/harness-ablation/qwen27b-chat-0-8k-ttft4s-tpot25-20260510.md
+++ b/docs/harness-ablation/qwen27b-chat-0-8k-ttft4s-tpot25-20260510.md
@@ -26,7 +26,9 @@ The previous no-harness run was affected by the `dash0` migration and had many e
 
 The table below is the raw per-iteration performance for a Fig18-style plot. Use this table as `perf[i]`; do not replace missing points with `max(perf[:i+1])`.
 
-Metric: `best_request_rate_per_gpu` from that trial's own `result.json`. `NA` means the proposed config did not produce a feasible point under the SLO. `stop` means the harness stopped before launching another GPU trial.
+Metric: `best_request_rate_per_gpu` from that trial's own `result.json`. `NA` means the proposed config did not produce a feasible point in the measured search range. `stop` means the harness stopped before launching another GPU trial.
+
+Important caveat: these runs were produced before the lower-range fallback fix. For same-parallel-size runtime patches, AITuner inherited the incumbent `sampling_u` as the new search floor.
If the config was infeasible above that floor, the old worker wrote `NA` without searching below the floor. Therefore the `NA` entries below are not complete Fig18-quality raw performance points; they mean "no feasible point above the inherited floor." A rerun with the fixed worker is required to fill in their true lower-load performance.
 
 | Variant | iter1 | iter2 | iter3 | iter4 | iter5 | iter6 | iter7 | iter8 | iter9 | iter10 | iter11 | iter12 |
 | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
diff --git a/src/aituner/worker.py b/src/aituner/worker.py
index 4de67ec..aa96202 100644
--- a/src/aituner/worker.py
+++ b/src/aituner/worker.py
@@ -539,41 +539,88 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
         payload=payload,
     )
 
-    search = binary_search_max_feasible(
+    primary_search = binary_search_max_feasible(
         low=trial.search.low,
         high=trial.search.high,
         tolerance=trial.search.tolerance,
         max_probes=trial.search.max_probes,
         evaluator=evaluator,
     )
-    best = search.best_feasible_payload
+    search_for_best = primary_search
+    best = primary_search.best_feasible_payload
+    best_source = "primary_search"
+    fallback_search = None
+    original_search_low = float(study.search.low)
+    inherited_search_floor = float(trial.search.low)
+    if best is None and inherited_search_floor > original_search_low:
+        fallback_search = binary_search_max_feasible(
+            low=original_search_low,
+            high=inherited_search_floor,
+            tolerance=trial.search.tolerance,
+            max_probes=trial.search.max_probes,
+            evaluator=evaluator,
+        )
+        if fallback_search.best_feasible_payload is not None:
+            search_for_best = fallback_search
+            best = fallback_search.best_feasible_payload
+            best_source = "lower_range_fallback"
+
+    def serialize_probe(probe: ThresholdProbe[ProbePayload]) -> dict[str, Any]:
+        return {
+            "threshold": probe.threshold,
+            "feasible": probe.feasible,
+            "payload": {
+                "request_count": probe.payload.request_count,
+                "pass_rate": probe.payload.pass_rate,
+                "request_rate": probe.payload.request_rate,
+                "early_stopped": probe.payload.early_stopped,
+                "early_stop_reason": probe.payload.early_stop_reason,
+                "latency_summary": probe.payload.latency_summary,
+            },
+        }
+
+    all_probes = [
+        *primary_search.probes,
+        *(fallback_search.probes if fallback_search is not None else []),
+    ]
 
     result = {
         "study_id": trial.study_id,
         "trial_id": trial.trial_id,
         "status": "completed",
         "config_patch": to_jsonable(trial.config_patch),
-        "best_sampling_u": search.best_threshold if best is not None else None,
+        "best_source": best_source,
+        "best_sampling_u": search_for_best.best_threshold if best is not None else None,
         "best_request_rate": best.request_rate if best is not None else None,
         "best_pass_rate": best.pass_rate if best is not None else None,
         "best_request_count": best.request_count if best is not None else None,
-        "probes": [
-            {
-                "threshold": probe.threshold,
-                "feasible": probe.feasible,
-                "payload": {
-                    "request_count": probe.payload.request_count,
-                    "pass_rate": probe.payload.pass_rate,
-                    "request_rate": probe.payload.request_rate,
-                    "early_stopped": probe.payload.early_stopped,
-                    "early_stop_reason": probe.payload.early_stop_reason,
-                    "latency_summary": probe.payload.latency_summary,
-                },
-            }
-            for probe in search.probes
-        ],
+        "probes": [serialize_probe(probe) for probe in all_probes],
     }
-    if best is None and search.probes:
-        last_probe = search.probes[-1]
+    if fallback_search is not None:
+        result["primary_search"] = {
+            "low": inherited_search_floor,
+            "high": trial.search.high,
+
"best_sampling_u": primary_search.best_threshold + if primary_search.best_feasible_payload is not None + else None, + "best_request_rate": primary_search.best_feasible_payload.request_rate + if primary_search.best_feasible_payload is not None + else None, + "probes": [serialize_probe(probe) for probe in primary_search.probes], + } + result["lower_range_fallback"] = { + "triggered": True, + "low": original_search_low, + "high": inherited_search_floor, + "best_sampling_u": fallback_search.best_threshold + if fallback_search.best_feasible_payload is not None + else None, + "best_request_rate": fallback_search.best_feasible_payload.request_rate + if fallback_search.best_feasible_payload is not None + else None, + "probes": [serialize_probe(probe) for probe in fallback_search.probes], + } + if best is None and all_probes: + last_probe = all_probes[-1] result["all_infeasible_diagnostics"] = { "threshold": last_probe.threshold, "request_count": last_probe.payload.request_count, diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py index 528eea8..10ba8fd 100644 --- a/tests/test_core_flow.py +++ b/tests/test_core_flow.py @@ -2062,6 +2062,90 @@ class CoreFlowTests(unittest.TestCase): self.assertEqual(rows[0]["outcomes"][0]["request_id"], "r1") self.assertEqual(rows[0]["outcomes"][0]["sampling_u"], 0.1) + def test_run_trial_falls_back_below_inherited_search_floor(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets(tmp_path) + payload = json.loads(study_path.read_text(encoding="utf-8")) + payload["search"]["max_probes"] = 2 + study_path.write_text(json.dumps(payload), encoding="utf-8") + study = load_study_spec(study_path) + store = StudyStore(tmp_path / ".aituner" / "studies") + store.init_study(spec_path=study_path, study=study) + state = StudyState( + study_id=study.study_id, + best_trial_id="trial-0001", + best_parallel_size=1, + best_sampling_u=0.5, + best_request_rate=2.0, + best_request_rate_per_gpu=2.0, + next_trial_index=2, + best_by_parallel_size={ + "1": { + "trial_id": "trial-0001", + "parallel_size": 1, + "best_sampling_u": 0.5, + "best_request_rate": 2.0, + "best_request_rate_per_gpu": 2.0, + } + }, + trials=[], + ) + proposal = Proposal.from_dict( + { + "observation": "runtime patch", + "diagnosis": "measure even if worse than incumbent", + "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 2}}, + "expected_effects": ["measure"], + } + ) + trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) + self.assertEqual(trial.search.low, 0.5) + + def fake_replay(requests, **kwargs): + passing = len(requests) <= 1 + return ( + [ + RequestOutcome( + request_id=request.row_id, + success=True, + ttft_ms=10.0 if passing else 10000.0, + tpot_ms=5.0 if passing else 1000.0, + prompt_tokens=request.prompt_tokens_hint, + completion_tokens=request.completion_tokens_hint, + ) + for request in requests + ], + False, + "", + ) + + process = mock.Mock() + process.poll.return_value = 0 + with mock.patch("aituner.worker.subprocess.Popen", return_value=process): + with mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None): + with mock.patch("aituner.worker._terminate_process_tree", return_value=None): + with mock.patch("aituner.worker._replay_requests", side_effect=fake_replay): + result = run_trial(Path(trial.artifact_dir) / "trial_spec.json") + + self.assertEqual(result["status"], "completed") + self.assertEqual(result["best_source"], "lower_range_fallback") + 
self.assertEqual(result["best_sampling_u"], 0.375) + self.assertEqual(result["best_request_rate"], 0.1) + self.assertEqual(result["primary_search"]["low"], 0.5) + self.assertIsNone(result["primary_search"]["best_request_rate"]) + self.assertEqual(result["lower_range_fallback"]["low"], 0.0) + self.assertEqual(result["lower_range_fallback"]["high"], 0.5) + self.assertEqual(result["lower_range_fallback"]["best_request_rate"], 0.1) + self.assertEqual( + [probe["threshold"] for probe in result["primary_search"]["probes"]], + [0.75, 0.625], + ) + self.assertEqual( + [probe["threshold"] for probe in result["lower_range_fallback"]["probes"]], + [0.25, 0.375], + ) + def test_materialize_trial_does_not_mutate_input_state_trials(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp)
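
A minimal consumer-side sketch (not part of the patch): how a Fig18-style plotting script might read the new per-trial fields when rebuilding the raw `perf[i]` series described in the two docs above. The `trial-*/result.json` directory layout and the `load_perf_series` helper are illustrative assumptions; only the `best_request_rate` and `best_source` field names come from `run_trial`'s output.

```python
# Sketch under an assumed layout <study_dir>/trial-*/result.json; the field
# names ("best_request_rate", "best_source") match the run_trial output above.
import json
from pathlib import Path
from typing import Any


def load_perf_series(study_dir: Path) -> list[dict[str, Any]]:
    series: list[dict[str, Any]] = []
    for trial_dir in sorted(study_dir.glob("trial-*")):
        result_path = trial_dir / "result.json"
        if not result_path.exists():
            # Engine/probe failure before a result was written: a true NA.
            series.append({"trial": trial_dir.name, "rate": None, "source": None})
            continue
        result = json.loads(result_path.read_text(encoding="utf-8"))
        series.append(
            {
                "trial": trial_dir.name,
                # None means every probe in the measured range was infeasible;
                # keep it as NA rather than substituting max(perf[:i+1]).
                "rate": result.get("best_request_rate"),
                # "lower_range_fallback" marks a best point found only below
                # the inherited sampling_u floor; still a valid raw point.
                "source": result.get("best_source"),
            }
        )
    return series
```

With the fallback in place, an `NA` from a completed trial now means "infeasible everywhere in the measured range, including below the inherited floor," so new runs no longer need the caveat added to the two docs above.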