From 6c04b9dbbc55ae497424b94cc9f1d2224874e736 Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Sat, 25 Apr 2026 17:14:05 +0800
Subject: [PATCH] Evaluate baseline before LLM tuning

---
 docs/harness-tuning-progress.md |  6 ++++
 src/aituner/cli.py              | 31 +++++++++++++++--
 tests/test_core_flow.py         | 62 +++++++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/docs/harness-tuning-progress.md b/docs/harness-tuning-progress.md
index d489de4..bc76205 100644
--- a/docs/harness-tuning-progress.md
+++ b/docs/harness-tuning-progress.md
@@ -76,6 +76,12 @@ Improve AITuner convergence for the `dash0` internal vLLM + Qwen3.5-27B 0-8k cha
 - Important implementation issue found: after an early-stopped probe, the worker returned while in-flight HTTP requests could continue occupying the engine, stalling/polluting the next binary-search probe.
 - Action: stopped the run and freed GPUs. Updating `worker._replay_requests` to drain in-flight requests after early stop before the next probe starts.
 
+### 2026-04-25 17:00-17:12 CST
+
+- r2 confirmed that draining avoids immediate cross-probe pollution, but the first LLM trial still started from a speculative TP=2 edit without a measured incumbent.
+- This is not aligned with the paper's agentic loop, which evaluates the initial configuration first and then searches from measured feedback.
+- Action: update `study tune` so LLM-driven studies automatically materialize a baseline empty-patch trial first, unless `--skip-baseline` is passed. This should reduce early bad proposals because the first LLM edit will see real baseline bottleneck diagnostics and an incumbent request_rate_per_gpu.
+
 Remaining next steps:
 
 1. Start a real harness-guided Qwen3.5-27B 0-8k chat tuning run from `configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json`.
diff --git a/src/aituner/cli.py b/src/aituner/cli.py
index e0fddaa..ccd08bb 100644
--- a/src/aituner/cli.py
+++ b/src/aituner/cli.py
@@ -113,7 +113,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
     study_root = store.init_study(spec_path=spec_path, study=study)
     capability_profile = load_capability_profile(study, study_spec_path=spec_path)
     proposal_files = [Path(item).resolve() for item in (args.proposal_file or [])]
-    max_trials = args.max_trials or (len(proposal_files) if proposal_files else 1)
+    max_trials = args.max_trials or (len(proposal_files) if proposal_files else 2)
     if max_trials <= 0:
         raise SpecError("max_trials must be positive")
     if proposal_files and max_trials > len(proposal_files):
@@ -134,7 +134,29 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
         prompt_name = f"prompt-{state.next_trial_index:04d}"
         store.write_prompt(study.study_id, prompt_name, prompt)
 
-        if proposal_files:
+        if (
+            not proposal_files
+            and not args.skip_baseline
+            and state.next_trial_index == 1
+            and not state.trials
+        ):
+            proposal_source = None
+            proposal_name = "baseline-0001"
+            proposal_text = json.dumps(
+                {
+                    "observation": "Evaluate the study's initial engine configuration before LLM-guided edits.",
+                    "diagnosis": "Baseline trial aligned with the AITuner evaluate-then-search loop.",
+                    "config_patch": {"env_patch": {}, "flag_patch": {}},
+                    "expected_effects": [
+                        "establish incumbent performance",
+                        "provide bottleneck evidence for harness-guided proposals",
+                    ],
+                    "why_not_previous_failures": "No config changes are applied.",
+                    "should_stop": False,
+                },
+                ensure_ascii=False,
+            )
+        elif proposal_files:
             proposal_source = proposal_files[idx]
             proposal_text = proposal_source.read_text(encoding="utf-8")
             proposal_name = proposal_source.stem
@@ -264,6 +286,11 @@ def build_parser() -> argparse.ArgumentParser:
     tune.add_argument("--store-root")
     tune.add_argument("--proposal-file", action="append")
    tune.add_argument("--max-trials", type=int)
+    tune.add_argument(
+        "--skip-baseline",
+        action="store_true",
+        help="Do not automatically evaluate the initial config before LLM proposals.",
+    )
     tune.set_defaults(func=cmd_study_tune)
 
     worker = subparsers.add_parser("worker")
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 0f1c591..351efd3 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1774,6 +1774,68 @@ class CoreFlowTests(unittest.TestCase):
         state = store.load_state("study-1")
         self.assertEqual(state.next_trial_index, 1)
 
+    def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            payload = json.loads(study_path.read_text(encoding="utf-8"))
+            payload["llm"]["endpoint"] = {
+                "provider": "custom",
+                "base_url": "http://llm.example/v1",
+                "wire_api": "chat.completions",
+                "model": "test-model",
+                "api_key_env": "OPENAI_API_KEY",
+            }
+            study_path.write_text(json.dumps(payload), encoding="utf-8")
+            store_root = tmp_path / "store"
+
+            def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
+                payload = json.loads(trial_spec_path.read_text(encoding="utf-8"))
+                trial_root = Path(payload["artifact_dir"])
+                result = {
+                    "study_id": payload["study_id"],
+                    "trial_id": payload["trial_id"],
+                    "status": "completed",
+                    "best_sampling_u": 0.25,
+                    "best_request_rate": 1.0,
+                    "best_pass_rate": 1.0,
+                    "best_request_count": 2,
+                    "probes": [],
+                }
+                (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8")
+                return result
+
+            llm_payload = json.dumps(
+                {
+                    "observation": "baseline done",
+                    "diagnosis": "try more batching",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
+                    "expected_effects": ["higher throughput"],
+                    "why_not_previous_failures": "",
+                    "should_stop": False,
+                }
+            )
+            with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial):
+                with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload):
+                    exit_code = cli_main(
+                        [
+                            "study",
+                            "tune",
+                            "--spec",
+                            str(study_path),
+                            "--store-root",
+                            str(store_root),
+                            "--max-trials",
+                            "2",
+                        ]
+                    )
+            self.assertEqual(exit_code, 0)
+            store = StudyStore(store_root)
+            state = store.load_state("study-1")
+            self.assertEqual(state.next_trial_index, 3)
+            self.assertEqual(state.trials[0].config_patch, {"env_patch": {}, "flag_patch": {}})
+            self.assertEqual(state.trials[1].config_patch["flag_patch"], {"max-num-seqs": 64})
+
     def test_load_compare_spec_requires_window_selection(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
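
Usage note (not part of the diff above): a minimal sketch of how the baseline-first flow added by this patch would be exercised. It assumes `cli_main` is the argv-style entry point that `tests/test_core_flow.py` calls, that it is importable as `aituner.cli.main` (a guess), and that the spec path and store root below are placeholders.

    # Sketch only: import path, spec path, and store root are assumptions;
    # the flags come from this patch.
    from aituner.cli import main as cli_main

    # Trial 0001 becomes an automatic empty-patch baseline; trial 0002 is the first
    # LLM-guided proposal, prompted with the measured baseline bottleneck data.
    cli_main([
        "study", "tune",
        "--spec", "configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json",
        "--store-root", "/tmp/aituner-store",
        "--max-trials", "2",
    ])

    # Opt out of the automatic baseline; the first trial then comes straight from the LLM.
    cli_main([
        "study", "tune",
        "--spec", "configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json",
        "--store-root", "/tmp/aituner-store",
        "--max-trials", "2",
        "--skip-baseline",
    ])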