Evaluate baseline before LLM tuning
@@ -76,6 +76,12 @@ Improve AITuner convergence for the `dash0` internal vLLM + Qwen3.5-27B 0-8k cha
 - Important implementation issue found: after an early-stopped probe, the worker returned while in-flight HTTP requests could continue occupying the engine, stalling/polluting the next binary-search probe.
 - Action: stopped the run and freed GPUs. Updating `worker._replay_requests` to drain in-flight requests after early stop before the next probe starts.
 
+### 2026-04-25 17:00-17:12 CST
+
+- r2 confirmed that draining avoids immediate cross-probe pollution, but the first LLM trial still started from a speculative TP=2 edit without a measured incumbent.
+- This is not aligned with the paper's agentic loop, which evaluates the initial configuration first and then searches from measured feedback.
+- Action: update `study tune` so LLM-driven studies automatically materialize a baseline empty-patch trial first, unless `--skip-baseline` is passed. This should reduce early bad proposals because the first LLM edit will see real baseline bottleneck diagnostics and an incumbent request_rate_per_gpu.
+
 Remaining next steps:
 
 1. Start a real harness-guided Qwen3.5-27B 0-8k chat tuning run from `configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json`.
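The `worker._replay_requests` drain fix referenced in the log entry above is not part of this diff. A minimal sketch of the intended behavior, assuming an asyncio-based replay loop; `replay_with_drain`, `send`, `early_stop`, and `DRAIN_TIMEOUT_S` are hypothetical names for illustration, not the actual worker internals:

```python
import asyncio
from typing import Awaitable, Callable, Iterable

DRAIN_TIMEOUT_S = 30.0  # hypothetical bound on waiting for stragglers


async def replay_with_drain(
    requests: Iterable[dict],
    send: Callable[[dict], Awaitable[None]],
    early_stop: asyncio.Event,
) -> None:
    """Replay requests; on early stop, drain in-flight sends before returning
    so they cannot occupy the engine during the next binary-search probe."""
    in_flight: set[asyncio.Task] = set()
    try:
        for request in requests:
            if early_stop.is_set():
                break  # early stop: issue no new requests
            task = asyncio.create_task(send(request))
            in_flight.add(task)
            task.add_done_callback(in_flight.discard)
    finally:
        if in_flight:
            # Drain: wait for outstanding requests, then cancel any stragglers.
            # Without this, returning here leaves in-flight HTTP requests
            # polluting the next probe's measurements.
            await asyncio.wait(in_flight, timeout=DRAIN_TIMEOUT_S)
            for task in in_flight:
                task.cancel()
```

Cancelling stragglers after a bounded wait keeps a single hung request from delaying the next probe indefinitely.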
@@ -113,7 +113,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
     study_root = store.init_study(spec_path=spec_path, study=study)
     capability_profile = load_capability_profile(study, study_spec_path=spec_path)
     proposal_files = [Path(item).resolve() for item in (args.proposal_file or [])]
-    max_trials = args.max_trials or (len(proposal_files) if proposal_files else 1)
+    max_trials = args.max_trials or (len(proposal_files) if proposal_files else 2)
     if max_trials <= 0:
         raise SpecError("max_trials must be positive")
     if proposal_files and max_trials > len(proposal_files):
@@ -134,7 +134,29 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
         prompt_name = f"prompt-{state.next_trial_index:04d}"
         store.write_prompt(study.study_id, prompt_name, prompt)
 
-        if proposal_files:
+        if (
+            not proposal_files
+            and not args.skip_baseline
+            and state.next_trial_index == 1
+            and not state.trials
+        ):
+            proposal_source = None
+            proposal_name = "baseline-0001"
+            proposal_text = json.dumps(
+                {
+                    "observation": "Evaluate the study's initial engine configuration before LLM-guided edits.",
+                    "diagnosis": "Baseline trial aligned with the AITuner evaluate-then-search loop.",
+                    "config_patch": {"env_patch": {}, "flag_patch": {}},
+                    "expected_effects": [
+                        "establish incumbent performance",
+                        "provide bottleneck evidence for harness-guided proposals",
+                    ],
+                    "why_not_previous_failures": "No config changes are applied.",
+                    "should_stop": False,
+                },
+                ensure_ascii=False,
+            )
+        elif proposal_files:
             proposal_source = proposal_files[idx]
             proposal_text = proposal_source.read_text(encoding="utf-8")
             proposal_name = proposal_source.stem
@@ -264,6 +286,11 @@ def build_parser() -> argparse.ArgumentParser:
     tune.add_argument("--store-root")
     tune.add_argument("--proposal-file", action="append")
     tune.add_argument("--max-trials", type=int)
+    tune.add_argument(
+        "--skip-baseline",
+        action="store_true",
+        help="Do not automatically evaluate the initial config before LLM proposals.",
+    )
     tune.set_defaults(func=cmd_study_tune)
 
     worker = subparsers.add_parser("worker")
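As a usage sketch (not part of the commit), the new flag composes with the `study tune` invocation that the test below exercises; the spec and store paths are illustrative, and the `cli_main` import path is assumed:

```python
from aituner.cli import main as cli_main  # import path assumed

# Default: trial 1 evaluates the unmodified baseline config, trial 2 is the first LLM edit.
cli_main(["study", "tune", "--spec", "study.json", "--store-root", "store"])

# Opt out: the first trial starts directly from an LLM proposal.
cli_main(["study", "tune", "--spec", "study.json", "--store-root", "store", "--skip-baseline"])
```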
@@ -1774,6 +1774,68 @@ class CoreFlowTests(unittest.TestCase):
         state = store.load_state("study-1")
         self.assertEqual(state.next_trial_index, 1)
 
+    def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            payload = json.loads(study_path.read_text(encoding="utf-8"))
+            payload["llm"]["endpoint"] = {
+                "provider": "custom",
+                "base_url": "http://llm.example/v1",
+                "wire_api": "chat.completions",
+                "model": "test-model",
+                "api_key_env": "OPENAI_API_KEY",
+            }
+            study_path.write_text(json.dumps(payload), encoding="utf-8")
+            store_root = tmp_path / "store"
+
+            def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
+                payload = json.loads(trial_spec_path.read_text(encoding="utf-8"))
+                trial_root = Path(payload["artifact_dir"])
+                result = {
+                    "study_id": payload["study_id"],
+                    "trial_id": payload["trial_id"],
+                    "status": "completed",
+                    "best_sampling_u": 0.25,
+                    "best_request_rate": 1.0,
+                    "best_pass_rate": 1.0,
+                    "best_request_count": 2,
+                    "probes": [],
+                }
+                (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8")
+                return result
+
+            llm_payload = json.dumps(
+                {
+                    "observation": "baseline done",
+                    "diagnosis": "try more batching",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
+                    "expected_effects": ["higher throughput"],
+                    "why_not_previous_failures": "",
+                    "should_stop": False,
+                }
+            )
+            with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial):
+                with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload):
+                    exit_code = cli_main(
+                        [
+                            "study",
+                            "tune",
+                            "--spec",
+                            str(study_path),
+                            "--store-root",
+                            str(store_root),
+                            "--max-trials",
+                            "2",
+                        ]
+                    )
+            self.assertEqual(exit_code, 0)
+            store = StudyStore(store_root)
+            state = store.load_state("study-1")
+            self.assertEqual(state.next_trial_index, 3)
+            self.assertEqual(state.trials[0].config_patch, {"env_patch": {}, "flag_patch": {}})
+            self.assertEqual(state.trials[1].config_patch["flag_patch"], {"max-num-seqs": 64})
+
     def test_load_compare_spec_requires_window_selection(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)