From 7ad439730ec97d78be180370f28c47c334cd45b2 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Sat, 27 Jun 2026 12:21:51 +0800 Subject: [PATCH] Add llm-first tuning proposal policy --- src/aituner/cli.py | 12 +++++- tests/test_core_flow.py | 93 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/src/aituner/cli.py b/src/aituner/cli.py index 3878035..44c31f9 100644 --- a/src/aituner/cli.py +++ b/src/aituner/cli.py @@ -288,6 +288,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int: capability_profile = load_capability_profile(study, study_spec_path=spec_path) proposal_files = [Path(item).resolve() for item in (args.proposal_file or [])] max_trials = args.max_trials or (len(proposal_files) if proposal_files else 2) + proposal_policy = args.proposal_policy if max_trials <= 0: raise SpecError("max_trials must be positive") if proposal_files and max_trials > len(proposal_files): @@ -387,7 +388,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int: else: guided_proposal = ( build_harness_guided_proposal(harness_context) - if harness_context is not None + if harness_context is not None and proposal_policy == "harness-first" else None ) if guided_proposal is not None: @@ -782,6 +783,15 @@ def build_parser() -> argparse.ArgumentParser: tune.add_argument("--store-root") tune.add_argument("--proposal-file", action="append") tune.add_argument("--max-trials", type=int) + tune.add_argument( + "--proposal-policy", + choices=("harness-first", "llm-first"), + default="harness-first", + help=( + "Choose whether deterministic harness proposals are tried before the LLM " + "or whether the LLM proposes directly from the harness prompt/context." + ), + ) tune.add_argument( "--skip-baseline", action="store_true", diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py index 3b3d797..6200578 100644 --- a/tests/test_core_flow.py +++ b/tests/test_core_flow.py @@ -6323,6 +6323,99 @@ class CoreFlowTests(unittest.TestCase): ) self.assertTrue(state.tuning_stop_diagnosis) + def test_cli_tune_llm_first_skips_deterministic_harness_proposal(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets(tmp_path) + payload = json.loads(study_path.read_text(encoding="utf-8")) + payload["llm"]["endpoint"] = { + "provider": "custom", + "base_url": "http://llm.example/v1", + "wire_api": "chat.completions", + "model": "test-model", + "api_key_env": "OPENAI_API_KEY", + } + study_path.write_text(json.dumps(payload), encoding="utf-8") + study = load_study_spec(study_path) + store_root = tmp_path / "store" + store = StudyStore(store_root) + store.init_study(spec_path=study_path, study=study) + store.save_state( + StudyState( + study_id=study.study_id, + best_trial_id="trial-0001", + best_parallel_size=8, + best_sampling_u=0.25, + best_request_rate=1.0, + best_request_rate_per_gpu=0.125, + next_trial_index=2, + trials=[ + TrialSummary( + trial_id="trial-0001", + status="completed", + parallel_size=8, + best_request_rate=1.0, + best_request_rate_per_gpu=0.125, + config_patch={"env_patch": {}, "flag_patch": {}}, + ) + ], + ) + ) + + llm_payload = json.dumps( + { + "observation": "Use harness evidence but let the LLM choose.", + "diagnosis": "Try higher admission concurrency.", + "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}}, + "expected_effects": ["measure admission concurrency"], + "why_not_previous_failures": "does not repeat a prior full config", + "should_stop": False, + } + ) + + def fake_run_trial(trial_spec_path: Path) -> dict[str, object]: + payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) + trial_root = Path(payload["artifact_dir"]) + result = { + "study_id": payload["study_id"], + "trial_id": payload["trial_id"], + "status": "completed", + "best_sampling_u": 0.5, + "best_request_rate": 2.0, + "best_pass_rate": 1.0, + "best_request_count": 2, + "probes": [], + } + (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8") + return result + + with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload) as llm_mock: + with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial): + exit_code = cli_main( + [ + "study", + "tune", + "--spec", + str(study_path), + "--store-root", + str(store_root), + "--skip-baseline", + "--max-trials", + "2", + "--proposal-policy", + "llm-first", + ] + ) + + self.assertEqual(exit_code, 0) + llm_mock.assert_called_once() + proposal_root = store.study_root(study.study_id) / "proposals" + self.assertTrue((proposal_root / "proposal-0002.json").exists()) + self.assertFalse((proposal_root / "harness-proposal-0002.json").exists()) + self.assertTrue( + (store.study_root(study.study_id) / "harness" / "candidate-set-0002.json").exists() + ) + def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp)