Add harness early stop ablation

This commit is contained in:
2026-05-02 08:08:14 +08:00
parent 6d3459c82d
commit 1a3d628268
9 changed files with 837 additions and 29 deletions

View File

@@ -13,7 +13,7 @@ from aituner.compare import load_compare_spec, run_compare
from aituner.engine import build_launch_recipe
from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy
from aituner.job import append_job, build_trial_job
from aituner.harness import build_harness_context
from aituner.harness import build_harness_context, build_harness_stop_proposal
from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal
from aituner.search import ThresholdProbe, binary_search_max_feasible
from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
@@ -422,6 +422,119 @@ class CoreFlowTests(unittest.TestCase):
)
self.assertIn("validate", guard["recommended_next_action"])
def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None:
    """Harness flags a stop once every post-incumbent validation trial has completed.

    The incumbent (trial-0002) is followed by two completed validation
    trials, so the harness should report `should_stop` with the
    `post_incumbent_validation_exhausted` reason and produce a stop proposal.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        spec_path = _write_study_assets(workdir)
        study = load_study_spec(spec_path)
        # One pre-incumbent trial, the incumbent, then two validation trials.
        completed_trials = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=0.8,
                best_request_rate_per_gpu=0.1,
                config_patch={"env_patch": {}, "flag_patch": {}},
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=8,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "data-parallel-size": 4,
                    },
                },
            ),
            TrialSummary(
                trial_id="trial-0003",
                status="completed",
                parallel_size=8,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 1,
                        "data-parallel-size": 8,
                    },
                },
            ),
            TrialSummary(
                trial_id="trial-0004",
                status="completed",
                parallel_size=8,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"max-num-seqs": 160},
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_sampling_u=0.02,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=completed_trials,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        stop_info = context["harness_stop"]
        self.assertTrue(stop_info["should_stop"])
        self.assertEqual(stop_info["reason"], "post_incumbent_validation_exhausted")
        proposal = build_harness_stop_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertTrue(proposal.should_stop)
def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
    """Harness keeps exploring right after a strong incumbent appears.

    With only the incumbent (trial-0002) and one earlier trial completed —
    no post-incumbent validation trials yet — the harness must not stop,
    and no stop proposal is produced.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        spec_path = _write_study_assets(workdir)
        study = load_study_spec(spec_path)
        history = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=0.8,
                best_request_rate_per_gpu=0.1,
                config_patch={"env_patch": {}, "flag_patch": {}},
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=8,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "data-parallel-size": 4,
                    },
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=history,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        self.assertFalse(context["harness_stop"]["should_stop"])
        self.assertIsNone(build_harness_stop_proposal(context))
def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -564,6 +677,26 @@ class CoreFlowTests(unittest.TestCase):
"\n".join(context["proposal_rules"]),
)
def test_prompt_can_disable_harness_for_ablation(self) -> None:
    """Setting llm.use_harness=false removes harness hints from the prompt.

    Rewrites the study spec on disk with the harness disabled, rebuilds the
    prompt, and checks the prompt carries the ablation wording instead of
    harness-specific sections such as "paper_alignment".
    """
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        spec_path = _write_study_assets(root)
        # Flip the harness switch directly in the serialized spec.
        spec_payload = json.loads(spec_path.read_text(encoding="utf-8"))
        spec_payload["llm"]["use_harness"] = False
        spec_path.write_text(json.dumps(spec_payload), encoding="utf-8")
        study = load_study_spec(spec_path)
        window, requests = load_trace_requests(study, study_spec_path=spec_path)
        prompt = build_prompt(
            study=study,
            window_summary=summarize_window(requests, window),
            state=StudyState(study_id=study.study_id),
            capability_profile=None,
        )
        self.assertFalse(study.llm.use_harness)
        self.assertIn("Disabled by llm.use_harness=false", prompt)
        self.assertNotIn('"paper_alignment"', prompt)
        self.assertIn("without harness hints", prompt)
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -2299,6 +2432,98 @@ class CoreFlowTests(unittest.TestCase):
state = store.load_state("study-1")
self.assertEqual(state.next_trial_index, 1)
def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
    """`study tune` honors a harness stop decision before consulting the LLM.

    Seeds the store with a state whose post-incumbent validation trials are
    all complete, then runs the CLI: the LLM and trial runner must never be
    invoked, and a harness-stop proposal file must be written instead.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        spec_path = _write_study_assets(workdir)
        study = load_study_spec(spec_path)
        store = StudyStore(workdir / "store")
        store.init_study(spec_path=spec_path, study=study)
        # State mirrors the exhausted-validation scenario: incumbent plus
        # two completed validation trials, next trial index at 5.
        seeded_state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_sampling_u=0.02,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            next_trial_index=5,
            trials=[
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=0.8,
                    best_request_rate_per_gpu=0.1,
                    config_patch={"env_patch": {}, "flag_patch": {}},
                ),
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.4,
                    best_request_rate_per_gpu=0.3,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 2,
                            "data-parallel-size": 4,
                        },
                    },
                ),
                TrialSummary(
                    trial_id="trial-0003",
                    status="completed",
                    parallel_size=8,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 1,
                            "data-parallel-size": 8,
                        },
                    },
                ),
                TrialSummary(
                    trial_id="trial-0004",
                    status="completed",
                    parallel_size=8,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {"max-num-seqs": 160},
                    },
                ),
            ],
        )
        store.save_state(seeded_state)
        cli_args = [
            "study",
            "tune",
            "--spec",
            str(spec_path),
            "--store-root",
            str(workdir / "store"),
            "--max-trials",
            "1",
        ]
        with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock, mock.patch(
            "aituner.cli.run_trial"
        ) as run_trial_mock:
            exit_code = cli_main(cli_args)
        self.assertEqual(exit_code, 0)
        llm_mock.assert_not_called()
        run_trial_mock.assert_not_called()
        proposal_path = (
            store.study_root(study.study_id)
            / "proposals"
            / "harness-stop-0005.json"
        )
        self.assertTrue(proposal_path.exists())
        proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
        self.assertTrue(proposal["should_stop"])
def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)