Evaluate baseline before LLM tuning

2026-04-25 17:14:05 +08:00
parent 2d7ebe50ee
commit 6c04b9dbbc
3 changed files with 97 additions and 2 deletions
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1774,6 +1774,68 @@ class CoreFlowTests(unittest.TestCase):
            state = store.load_state("study-1")
            self.assertEqual(state.next_trial_index, 1)

+    def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:  # `study tune`: trial 1 is unpatched, trial 2 uses the LLM patch
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            payload = json.loads(study_path.read_text(encoding="utf-8"))
+            payload["llm"]["endpoint"] = {  # rewrite the study spec with an explicit custom LLM endpoint
+                "provider": "custom",
+                "base_url": "http://llm.example/v1",
+                "wire_api": "chat.completions",
+                "model": "test-model",
+                "api_key_env": "OPENAI_API_KEY",
+            }
+            study_path.write_text(json.dumps(payload), encoding="utf-8")
+            store_root = tmp_path / "store"
+
+            def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:  # stand-in for aituner.cli.run_trial (patched below)
+                payload = json.loads(trial_spec_path.read_text(encoding="utf-8"))  # local name shadows the outer study payload
+                trial_root = Path(payload["artifact_dir"])
+                result = {  # fixed "completed" result echoing the ids found in the trial spec
+                    "study_id": payload["study_id"],
+                    "trial_id": payload["trial_id"],
+                    "status": "completed",
+                    "best_sampling_u": 0.25,
+                    "best_request_rate": 1.0,
+                    "best_pass_rate": 1.0,
+                    "best_request_count": 2,
+                    "probes": [],
+                }
+                (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8")  # persist next to the trial artifacts
+                return result
+
+            llm_payload = json.dumps(  # canned JSON string returned by the patched call_llm_for_proposal
+                {
+                    "observation": "baseline done",
+                    "diagnosis": "try more batching",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
+                    "expected_effects": ["higher throughput"],
+                    "why_not_previous_failures": "",
+                    "should_stop": False,
+                }
+            )
+            with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial):  # no real trial execution
+                with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload):  # no real LLM request
+                    exit_code = cli_main(
+                        [
+                            "study",
+                            "tune",
+                            "--spec",
+                            str(study_path),
+                            "--store-root",
+                            str(store_root),
+                            "--max-trials",
+                            "2",
+                        ]
+                    )
+            self.assertEqual(exit_code, 0)
+            store = StudyStore(store_root)  # reload persisted state from disk rather than inspecting in-memory objects
+            state = store.load_state("study-1")
+            self.assertEqual(state.next_trial_index, 3)  # two trials recorded; presumably a 1-based index -- confirm
+            self.assertEqual(state.trials[0].config_patch, {"env_patch": {}, "flag_patch": {}})  # first trial: empty patch (baseline)
+            self.assertEqual(state.trials[1].config_patch["flag_patch"], {"max-num-seqs": 64})  # second trial: flag patch from llm_payload
+
+
    def test_load_compare_spec_requires_window_selection(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)