Add trace length bucket tuning support

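The headline change is an optional trace.input_length_filter block in the study spec. As the tests below pin down, trace rows whose prompt token count falls outside the configured bounds are dropped before replay, the loader raises SpecError when min_input_tokens exceeds max_input_tokens, and the active filter is echoed into the LLM prompt. A minimal sketch of the filtering semantics implied by the tests (the helper name and the inclusive bounds are assumptions; the shipped logic lives in aituner.trace and aituner.spec):

    # Hypothetical sketch, not the shipped implementation: keep a trace row
    # only when its prompt length lies inside the configured bounds.
    def keep_row(prompt_tokens: int, flt: dict[str, int] | None) -> bool:
        if flt is None:  # no filter configured: keep everything
            return True
        # Bounds assumed inclusive; the fixture keeps rows with 1000 and
        # 5000 prompt tokens under {"min_input_tokens": 0, "max_input_tokens": 8192}.
        return flt["min_input_tokens"] <= prompt_tokens <= flt["max_input_tokens"]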
2026-04-07 11:03:16 +08:00
parent e9b5e9b957
commit 46ed688ace
12 changed files with 922 additions and 14 deletions
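Alongside the bucket work, trial materialization now seeds the sampling_u binary search from the incumbent: test_materialize_trial_uses_incumbent_sampling_u_as_search_floor expects search.low == state.best_sampling_u and search.high == 1.0. A sketch of that floor selection under assumed names (search_bounds and DEFAULT_LOW are hypothetical):

    # Hypothetical sketch: start the feasibility search at the incumbent's
    # sampling_u so a new trial never probes below a point already proven.
    DEFAULT_LOW = 0.0

    def search_bounds(best_sampling_u: float | None) -> tuple[float, float]:
        low = best_sampling_u if best_sampling_u is not None else DEFAULT_LOW
        return low, 1.0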


@@ -13,7 +13,7 @@ from aituner.job import append_job, build_trial_job
 from aituner.llm import build_prompt, parse_proposal_text
 from aituner.search import ThresholdProbe, binary_search_max_feasible
 from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
-from aituner.spec import Proposal, StudyState, TrialSummary, load_study_spec
+from aituner.spec import Proposal, SpecError, StudyState, TrialSummary, load_study_spec
 from aituner.store import StudyStore
 from aituner.trace import load_trace_requests, summarize_window
 from aituner.worker import (
@@ -25,7 +25,9 @@ from aituner.worker import (
 from aituner.trace import TraceRequest
 
 
-def _write_study_assets(tmp_path: Path) -> Path:
+def _write_study_assets(
+    tmp_path: Path, *, trace_overrides: dict[str, object] | None = None
+) -> Path:
     trace_dir = tmp_path / "trace_windows" / "traces"
     trace_dir.mkdir(parents=True)
     trace_path = trace_dir / "chat_w1.jsonl"
@@ -81,6 +83,16 @@ def _write_study_assets(tmp_path: Path) -> Path:
     )
     study_path = tmp_path / "study.json"
 
+    trace_payload: dict[str, object] = {
+        "windows_path": str(windows_path),
+        "window_id": "chat_w1",
+        "u_field": "sampling_u",
+        "timestamp_field": "timestamp",
+        "max_concurrency": 4,
+    }
+    if trace_overrides:
+        trace_payload.update(trace_overrides)
+
     study_payload = {
         "study_id": "study-1",
         "hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
@@ -105,13 +117,7 @@ def _write_study_assets(tmp_path: Path) -> Path:
             "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
             "python_executable": "python3"
         },
-        "trace": {
-            "windows_path": str(windows_path),
-            "window_id": "chat_w1",
-            "u_field": "sampling_u",
-            "timestamp_field": "timestamp",
-            "max_concurrency": 4
-        },
+        "trace": trace_payload,
         "slo": {
             "target_pass_rate": 0.95,
             "ttft_rule": {
@@ -161,9 +167,53 @@ class CoreFlowTests(unittest.TestCase):
         )
         self.assertIn("allowed_flag_keys", prompt)
         self.assertIn("study-1", prompt)
+        self.assertIn('"current_best"', prompt)
+        self.assertIn("queueing_knee_by_bucket", prompt)
         self.assertTrue(study_root.exists())
 
+    def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={
+                    "input_length_filter": {
+                        "min_input_tokens": 0,
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            study = load_study_spec(study_path)
+            window, requests = load_trace_requests(study, study_spec_path=study_path)
+            summary = summarize_window(requests, window)
+            self.assertEqual(len(requests), 2)
+            self.assertEqual([item.prompt_tokens_hint for item in requests], [1000, 5000])
+            self.assertEqual(summary["request_count"], 2)
+            self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
+            prompt = build_prompt(
+                study=study,
+                window_summary=summary,
+                state=StudyState(study_id=study.study_id),
+                capability_profile=None,
+            )
+            self.assertIn('"input_length_filter"', prompt)
+            self.assertIn('"max_input_tokens": 8192', prompt)
+
+    def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={
+                    "input_length_filter": {
+                        "min_input_tokens": 8193,
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
+                load_study_spec(study_path)
+
     def test_prompt_includes_failed_trial_context(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -652,8 +702,36 @@ class CoreFlowTests(unittest.TestCase):
             )
             next_state = store.ingest_trial_results(study.study_id)
             self.assertEqual(next_state.best_trial_id, trial.trial_id)
+            self.assertEqual(next_state.best_sampling_u, 0.75)
             self.assertEqual(next_state.best_request_rate, 12.5)
 
+    def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0001",
+                best_sampling_u=0.375,
+                best_request_rate=3.0,
+                next_trial_index=2,
+                trials=[],
+            )
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "Obs",
+                    "diagnosis": "Diag",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
+                    "expected_effects": ["raise rate"],
+                }
+            )
+            trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
+            self.assertEqual(trial.search.low, 0.375)
+            self.assertEqual(trial.search.high, 1.0)
+
     def test_ingest_trial_results_records_failure_reason(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -764,6 +842,7 @@ class CoreFlowTests(unittest.TestCase):
             store = StudyStore(store_root)
             state = store.load_state("study-1")
             self.assertEqual(state.best_trial_id, "trial-0002")
+            self.assertEqual(state.best_sampling_u, 0.75)
             self.assertEqual(state.best_request_rate, 2.0)
             self.assertEqual(state.next_trial_index, 3)
@@ -795,6 +874,20 @@ class CoreFlowTests(unittest.TestCase):
             ["throughput: higher", "ttft: lower"],
         )
 
+    def test_parse_proposal_text_accepts_wrapped_json(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            proposal = parse_proposal_text(
+                """Here is the proposal:
+```json
+{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
+```""",
+                study,
+            )
+            self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32)
+
     def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
         requests = [
             TraceRequest(
@@ -929,6 +1022,71 @@ class CoreFlowTests(unittest.TestCase):
         self.assertEqual(len(replayed), 2)
         self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")
 
+    def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:
+        requests = [
+            TraceRequest(
+                row_id="r0",
+                arrival_s=0.0,
+                sampling_u=0.1,
+                body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
+                prompt_tokens_hint=8,
+                completion_tokens_hint=4,
+            )
+        ]
+
+        class FakeFuture:
+            def result(self, timeout=None):
+                raise AssertionError("future should not be awaited after elapsed early stop")
+
+            def cancel(self):
+                return True
+
+        submitted = []
+
+        class FakeExecutor:
+            def __init__(self, max_workers):
+                self.max_workers = max_workers
+
+            def submit(self, fn, request, **kwargs):
+                submitted.append(request.row_id)
+                return FakeFuture()
+
+            def shutdown(self, wait=False, cancel_futures=True):
+                return None
+
+        wait_timeouts: list[float] = []
+
+        def fake_wait(futures, timeout=None, return_when=None):
+            wait_timeouts.append(timeout)
+            return set(), set(futures)
+
+        def fake_evaluate(outcome: RequestOutcome):
+            return type("Eval", (), {"passed": outcome.success})()
+
+        monotonic_values = iter([0.0, 0.0, 0.4, 1.2])
+        with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor):
+            with mock.patch("aituner.worker.wait", side_effect=fake_wait):
+                with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)):
+                    replayed, early_stopped, reason = _replay_requests(
+                        requests,
+                        base_url="http://127.0.0.1:8000",
+                        timeout_s=30.0,
+                        max_concurrency=1,
+                        target_pass_rate=0.95,
+                        max_lag_s=None,
+                        max_elapsed_s=1.0,
+                        evaluate_outcome=fake_evaluate,
+                    )
+
+        self.assertEqual(submitted, ["r0"])
+        self.assertTrue(early_stopped)
+        self.assertEqual(reason, "probe_elapsed_s>1.0")
+        self.assertEqual(len(replayed), 1)
+        self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")
+        self.assertTrue(wait_timeouts)
+        self.assertLessEqual(wait_timeouts[0], 0.5)
+
     def test_latency_summary_reports_quantiles_and_slo(self) -> None:
         study = load_study_spec(_write_study_assets(Path(tempfile.mkdtemp())))
         outcomes = [