Add auto search high measurement policy

2026-06-26 20:05:22 +08:00
parent 95ad124a1b
commit 1dd3eaebaa
5 changed files with 415 additions and 27 deletions
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -51,7 +51,7 @@ from aituner.spec import (
    TrialSummary,
    load_study_spec,
 )
-from aituner.store import StudyStore
+from aituner.store import StudyStore, resolve_auto_high_search
 from aituner.trace import load_trace_requests, summarize_window
 from aituner.worker import (
    _adaptive_replay_set,
@@ -79,6 +79,7 @@ def _write_study_assets(
    trace_overrides: dict[str, object] | None = None,
    slo_overrides: dict[str, object] | None = None,
    engine_overrides: dict[str, object] | None = None,
+    search_overrides: dict[str, object] | None = None,
 ) -> Path:
    trace_dir = tmp_path / "trace_windows" / "traces"
    trace_dir.mkdir(parents=True)
@@ -196,6 +197,8 @@ def _write_study_assets(
        study_payload["slo"].update(slo_overrides)
    if engine_overrides:
        study_payload["engine"].update(engine_overrides)
+    if search_overrides:
+        study_payload["search"].update(search_overrides)
    study_path.write_text(json.dumps(study_payload), encoding="utf-8")
    return study_path

@@ -260,6 +263,76 @@ class CoreFlowTests(unittest.TestCase):
            self.assertIn("knob_harnesses", prompt)
            self.assertTrue(study_root.exists())

+    def test_search_auto_high_schema_is_backward_compatible(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:  # spec that only overrides search.high
+            study_path = _write_study_assets(
+                Path(tmp),
+                search_overrides={"high": 0.4},  # the override itself adds no "auto_high" entry
+            )
+            study = load_study_spec(study_path)
+            self.assertFalse(study.search.auto_high.enabled)  # auto_high must load as disabled
+            updated, evidence = resolve_auto_high_search(
+                search=study.search,
+                sampling_us=[0.1, 0.9],  # largest sample (0.9) is above the configured high
+            )
+            self.assertEqual(updated.high, 0.4)  # configured high is returned unchanged
+            self.assertEqual(evidence["reason"], "auto_high_disabled")  # evidence says why
+
+    def test_search_auto_high_caps_at_policy_and_trace(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:  # three scenarios against max_sampling_u=0.8
+            study_path = _write_study_assets(
+                Path(tmp),
+                search_overrides={
+                    "high": 0.2,  # starts below every ceiling used in this test
+                    "auto_high": {
+                        "enabled": True,
+                        "max_sampling_u": 0.8,  # policy ceiling
+                        "require_human_confirmation_beyond_trace": True,
+                    },
+                },
+            )
+            study = load_study_spec(study_path)
+            capped_by_policy, policy_evidence = resolve_auto_high_search(
+                search=study.search,
+                sampling_us=[0.1, 0.9],  # largest sample 0.9 is above the 0.8 policy ceiling
+            )
+            self.assertEqual(capped_by_policy.high, 0.8)  # raised from 0.2, stops at 0.8
+            self.assertEqual(
+                policy_evidence["reason"],
+                "search_high_raised_to_trace_ceiling",
+            )
+
+            capped_by_trace, trace_evidence = resolve_auto_high_search(
+                search=study.search,
+                sampling_us=[0.1, 0.7],  # largest sample 0.7 is below the 0.8 policy ceiling
+            )
+            self.assertEqual(capped_by_trace.high, 0.7)  # the trace is the tighter limit here
+            self.assertEqual(trace_evidence["effective_ceiling"], 0.7)
+
+            high_search = study.search.__class__.from_dict(  # same spec class, rebuilt by hand
+                {
+                    "low": 0.0,
+                    "high": 0.95,  # starts above the 0.8 policy ceiling
+                    "tolerance": study.search.tolerance,  # remaining fields copied from the loaded spec
+                    "max_probes": study.search.max_probes,
+                    "sample_seed": study.search.sample_seed,
+                    "auto_high": {
+                        "enabled": True,
+                        "max_sampling_u": 0.8,
+                        "require_human_confirmation_beyond_trace": True,
+                    },
+                }
+            )
+            lowered, lowered_evidence = resolve_auto_high_search(
+                search=high_search,
+                sampling_us=[0.1, 0.9],
+            )
+            self.assertEqual(lowered.high, 0.8)  # lowered from 0.95 to 0.8
+            self.assertEqual(
+                lowered_evidence["reason"],
+                "search_high_lowered_to_trace_ceiling",
+            )
+
    def test_lca_workload_profile_uses_standard_10d_features(self) -> None:
        window = WindowRecord(
            window_id="w1",
@@ -1381,11 +1454,17 @@ class CoreFlowTests(unittest.TestCase):
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
-            self.assertTrue(context["harness_stop"]["should_stop"])
-            self.assertEqual(context["harness_stop"]["reason"], "search_high_saturated_by_incumbent")
+            self.assertFalse(context["harness_stop"]["should_stop"])
+            self.assertEqual(
+                context["harness_stop"]["reason"],
+                "search_high_saturation_requires_parallel_size_evidence",
+            )
+            self.assertEqual(
+                context["harness_stop"]["evidence"]["objective"],
+                "request_rate_per_gpu",
+            )
            proposal = build_harness_stop_proposal(context)
-            self.assertIsNotNone(proposal)
-            self.assertTrue(proposal.should_stop)
+            self.assertIsNone(proposal)

    def test_harness_stop_allows_feasible_high_probe_with_some_failures(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
@@ -1446,8 +1525,11 @@ class CoreFlowTests(unittest.TestCase):
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
-            self.assertTrue(context["harness_stop"]["should_stop"])
-            self.assertEqual(context["harness_stop"]["reason"], "search_high_saturated_by_incumbent")
+            self.assertFalse(context["harness_stop"]["should_stop"])
+            self.assertEqual(
+                context["harness_stop"]["reason"],
+                "search_high_saturation_requires_parallel_size_evidence",
+            )

    def test_harness_guided_first_tp_probe_for_latency_bottleneck(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
@@ -4498,7 +4580,9 @@ class CoreFlowTests(unittest.TestCase):
                with mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None):
                    with mock.patch("aituner.worker._terminate_process_tree", return_value=None):
                        with mock.patch("aituner.worker._replay_requests", side_effect=fake_replay):
-                            result = run_trial(Path(trial.artifact_dir) / "trial_spec.json")
+                            result = run_trial(
+                                Path(trial.artifact_dir) / "trial_spec.json"
+                            )

            self.assertEqual(result["status"], "completed")
            details_path = Path(trial.artifact_dir) / "probe_details.jsonl"
@@ -4512,6 +4596,60 @@ class CoreFlowTests(unittest.TestCase):
            self.assertEqual(rows[0]["outcomes"][0]["request_id"], "r1")
            self.assertEqual(rows[0]["outcomes"][0]["sampling_u"], 0.1)

+    def test_run_trial_marks_full_trace_saturation_as_measurement_ceiling_insufficient(
+        self,
+    ) -> None:
+        with tempfile.TemporaryDirectory() as tmp:  # run_trial with server and replay mocked out
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)  # default assets, no overrides
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = store.load_state(study.study_id)
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "baseline",
+                    "diagnosis": "baseline",
+                    "config_patch": {"env_patch": {}, "flag_patch": {}},  # empty patch
+                    "expected_effects": ["measure"],
+                }
+            )
+            trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
+
+            def fake_replay(requests, **kwargs):  # replaces aituner.worker._replay_requests below
+                return (
+                    [
+                        RequestOutcome(
+                            request_id=request.row_id,
+                            success=True,  # every replayed request succeeds
+                            ttft_ms=10.0,
+                            tpot_ms=5.0,
+                            prompt_tokens=request.prompt_tokens_hint,
+                            completion_tokens=request.completion_tokens_hint,
+                        )
+                        for request in requests
+                    ],
+                    False,  # NOTE(review): presumably an early-stop flag plus its reason — confirm
+                    "",
+                )
+
+            process = mock.Mock()  # handed back by the patched subprocess.Popen
+            process.poll.return_value = 0
+            with mock.patch("aituner.worker.subprocess.Popen", return_value=process):
+                with mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None):
+                    with mock.patch("aituner.worker._terminate_process_tree", return_value=None):
+                        with mock.patch(
+                            "aituner.worker._replay_requests",
+                            side_effect=fake_replay,
+                        ):
+                            result = run_trial(Path(trial.artifact_dir) / "trial_spec.json")
+
+            self.assertEqual(result["status"], "completed")
+            self.assertEqual(result["best_request_count"], 3)  # NOTE(review): presumably the fixture trace size — confirm
+            self.assertTrue(result["measurement"]["measurement_ceiling_insufficient"])
+            self.assertEqual(result["measurement"]["reason"], "measurement_ceiling_insufficient")
+            self.assertIn("auto_high_resolution", result["measurement"])  # resolution evidence is attached
+
    def test_run_trial_falls_back_below_inherited_search_floor(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)