From 6c04b9dbbc55ae497424b94cc9f1d2224874e736 Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Sat, 25 Apr 2026 17:14:05 +0800
Subject: [PATCH] Evaluate baseline before LLM tuning

---
 docs/harness-tuning-progress.md |  6 ++++
 src/aituner/cli.py              | 31 +++++++++++++++--
 tests/test_core_flow.py         | 62 +++++++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/docs/harness-tuning-progress.md b/docs/harness-tuning-progress.md
index d489de4..bc76205 100644
--- a/docs/harness-tuning-progress.md
+++ b/docs/harness-tuning-progress.md
@@ -76,6 +76,12 @@ Improve AITuner convergence for the `dash0` internal vLLM + Qwen3.5-27B 0-8k cha
 - Important implementation issue found: after an early-stopped probe, the worker returned while in-flight HTTP requests could continue occupying the engine, stalling/polluting the next binary-search probe.
 - Action: stopped the run and freed GPUs. Updating `worker._replay_requests` to drain in-flight requests after early stop before the next probe starts.
 
+### 2026-04-25 17:00-17:12 CST
+
+- r2 confirmed that draining avoids immediate cross-probe pollution, but the first LLM trial still started from a speculative TP=2 edit without a measured incumbent.
+- This is not aligned with the paper's agentic loop, which evaluates the initial configuration first and then searches from measured feedback.
+- Action: update `study tune` so LLM-driven studies automatically materialize a baseline empty-patch trial first, unless `--skip-baseline` is passed. This should reduce early bad proposals because the first LLM edit will see real baseline bottleneck diagnostics and an incumbent request_rate_per_gpu.
+
 Remaining next steps:
 
 1. Start a real harness-guided Qwen3.5-27B 0-8k chat tuning run from `configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json`.
diff --git a/src/aituner/cli.py b/src/aituner/cli.py
index e0fddaa..ccd08bb 100644
--- a/src/aituner/cli.py
+++ b/src/aituner/cli.py
@@ -113,7 +113,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
     study_root = store.init_study(spec_path=spec_path, study=study)
     capability_profile = load_capability_profile(study, study_spec_path=spec_path)
     proposal_files = [Path(item).resolve() for item in (args.proposal_file or [])]
-    max_trials = args.max_trials or (len(proposal_files) if proposal_files else 1)
+    max_trials = args.max_trials or (len(proposal_files) if proposal_files else 2)
     if max_trials <= 0:
         raise SpecError("max_trials must be positive")
     if proposal_files and max_trials > len(proposal_files):
@@ -134,7 +134,29 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
         prompt_name = f"prompt-{state.next_trial_index:04d}"
         store.write_prompt(study.study_id, prompt_name, prompt)
 
-        if proposal_files:
+        if (
+            not proposal_files
+            and not args.skip_baseline
+            and state.next_trial_index == 1
+            and not state.trials
+        ):
+            proposal_source = None
+            proposal_name = "baseline-0001"
+            proposal_text = json.dumps(
+                {
+                    "observation": "Evaluate the study's initial engine configuration before LLM-guided edits.",
+                    "diagnosis": "Baseline trial aligned with the AITuner evaluate-then-search loop.",
+                    "config_patch": {"env_patch": {}, "flag_patch": {}},
+                    "expected_effects": [
+                        "establish incumbent performance",
+                        "provide bottleneck evidence for harness-guided proposals",
+                    ],
+                    "why_not_previous_failures": "No config changes are applied.",
+                    "should_stop": False,
+                },
+                ensure_ascii=False,
+            )
+        elif proposal_files:
             proposal_source = proposal_files[idx]
             proposal_text = proposal_source.read_text(encoding="utf-8")
             proposal_name = proposal_source.stem
@@ -264,6 +286,11 @@ def build_parser() -> argparse.ArgumentParser:
     tune.add_argument("--store-root")
     tune.add_argument("--proposal-file", action="append")
    tune.add_argument("--max-trials", type=int)
+    tune.add_argument(
+        "--skip-baseline",
+        action="store_true",
+        help="Do not automatically evaluate the initial config before LLM proposals.",
+    )
     tune.set_defaults(func=cmd_study_tune)
 
     worker = subparsers.add_parser("worker")
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 0f1c591..351efd3 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1774,6 +1774,68 @@ class CoreFlowTests(unittest.TestCase):
         state = store.load_state("study-1")
         self.assertEqual(state.next_trial_index, 1)
 
+    def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            payload = json.loads(study_path.read_text(encoding="utf-8"))
+            payload["llm"]["endpoint"] = {
+                "provider": "custom",
+                "base_url": "http://llm.example/v1",
+                "wire_api": "chat.completions",
+                "model": "test-model",
+                "api_key_env": "OPENAI_API_KEY",
+            }
+            study_path.write_text(json.dumps(payload), encoding="utf-8")
+            store_root = tmp_path / "store"
+
+            def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
+                payload = json.loads(trial_spec_path.read_text(encoding="utf-8"))
+                trial_root = Path(payload["artifact_dir"])
+                result = {
+                    "study_id": payload["study_id"],
+                    "trial_id": payload["trial_id"],
+                    "status": "completed",
+                    "best_sampling_u": 0.25,
+                    "best_request_rate": 1.0,
+                    "best_pass_rate": 1.0,
+                    "best_request_count": 2,
+                    "probes": [],
+                }
+                (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8")
+                return result
+
+            llm_payload = json.dumps(
+                {
+                    "observation": "baseline done",
+                    "diagnosis": "try more batching",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
+                    "expected_effects": ["higher throughput"],
+                    "why_not_previous_failures": "",
+                    "should_stop": False,
+                }
+            )
+            with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial):
+                with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload):
+                    exit_code = cli_main(
+                        [
+                            "study",
+                            "tune",
+                            "--spec",
+                            str(study_path),
+                            "--store-root",
+                            str(store_root),
+                            "--max-trials",
+                            "2",
+                        ]
+                    )
+            self.assertEqual(exit_code, 0)
+            store = StudyStore(store_root)
+            state = store.load_state("study-1")
+            self.assertEqual(state.next_trial_index, 3)
+            self.assertEqual(state.trials[0].config_patch, {"env_patch": {}, "flag_patch": {}})
+            self.assertEqual(state.trials[1].config_patch["flag_patch"], {"max-num-seqs": 64})
+
     def test_load_compare_spec_requires_window_selection(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
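
Usage note (not part of the diff above): a minimal sketch of how the baseline-first flow added by this patch would be exercised. It assumes `cli_main` is the argv-style entry point that `tests/test_core_flow.py` calls, that it is importable as `aituner.cli.main` (a guess), and that the spec path and store root below are placeholders.

    # Sketch only: import path, spec path, and store root are assumptions;
    # the flags come from this patch.
    from aituner.cli import main as cli_main

    # Trial 0001 becomes an automatic empty-patch baseline; trial 0002 is the first
    # LLM-guided proposal, prompted with the measured baseline bottleneck data.
    cli_main([
        "study", "tune",
        "--spec", "configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json",
        "--store-root", "/tmp/aituner-store",
        "--max-trials", "2",
    ])

    # Opt out of the automatic baseline; the first trial then comes straight from the LLM.
    cli_main([
        "study", "tune",
        "--spec", "configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json",
        "--store-root", "/tmp/aituner-store",
        "--max-trials", "2",
        "--skip-baseline",
    ])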