From e1125475aea4cadfb54ef2701e5726bb79c81e16 Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Tue, 12 May 2026 09:42:53 +0800
Subject: [PATCH] Minimize no-harness ablation prompt

---
 src/aituner/cli.py      |  12 +++-
 src/aituner/llm.py      | 140 ++++++++++++++++++++++++++++++----------
 tests/test_core_flow.py |  11 +++-
 3 files changed, 126 insertions(+), 37 deletions(-)

diff --git a/src/aituner/cli.py b/src/aituner/cli.py
index 0ce4d77..cc9fb7c 100644
--- a/src/aituner/cli.py
+++ b/src/aituner/cli.py
@@ -148,7 +148,11 @@ def cmd_study_llm_propose(args: argparse.Namespace) -> int:
         state=state,
         capability_profile=capability_profile,
     )
-    proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
+    proposal_text = call_llm_for_proposal(
+        policy=study.llm,
+        prompt=prompt,
+        use_harness=study.llm.use_harness,
+    )
     proposal = parse_proposal_text(proposal_text, study)
     name = args.proposal_name or f"proposal-{state.next_trial_index:04d}"
     path = store.write_proposal(study.study_id, name, proposal)
@@ -300,7 +304,11 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
             "No proposal files provided, study.llm.endpoint is not configured, "
             "and the harness stop guard did not fire."
         )
-    proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
+    proposal_text = call_llm_for_proposal(
+        policy=study.llm,
+        prompt=prompt,
+        use_harness=study.llm.use_harness,
+    )
     proposal_name = f"proposal-{state.next_trial_index:04d}"
     raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
     raw_proposal_path.write_text(proposal_text, encoding="utf-8")
diff --git a/src/aituner/llm.py b/src/aituner/llm.py
index 1dce926..855fdb5 100644
--- a/src/aituner/llm.py
+++ b/src/aituner/llm.py
@@ -212,16 +212,102 @@ def build_prompt(
     )
     launch_failures = _launch_failure_history(state)
     parallel_candidates = _enumerate_parallel_candidates(study)
-    sections = [
+    common_preamble = [
         "You are tuning an OpenAI-compatible serving engine.",
         "Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop.",
         "config_patch must contain env_patch and flag_patch.",
         "expected_effects must be a JSON array of short strings, not an object.",
-        "should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified.",
+        (
+            "should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified."
+            if study.llm.use_harness
+            else "should_stop must be a boolean. Use false unless no valid config can be proposed."
+        ),
         "Only use allowed tunable env keys and allowed tunable flag keys.",
         "Do not wrap the JSON in markdown fences or any extra text.",
         "Do not repeat a config that previously failed to launch unless the new patch explicitly removes the failing knob.",
         "Treat previous engine launch failures as hard negative evidence. If you touch TP/DP/EP, keep the proposal inside the topology constraints exactly.",
+    ]
+    if not study.llm.use_harness:
+        sections = [
+            *common_preamble,
+            "",
+            "Study context:",
+            json.dumps(
+                {
+                    "study_id": study.study_id,
+                    "objective": "maximize feasible request_rate_per_gpu at the SLO target",
+                    "current_best": {
+                        "trial_id": state.best_trial_id,
+                        "best_parallel_size": state.best_parallel_size,
+                        "best_sampling_u": state.best_sampling_u,
+                        "best_request_rate": state.best_request_rate,
+                        "best_request_rate_per_gpu": state.best_request_rate_per_gpu,
+                    },
+                    "hardware": {
+                        "gpu_count": study.hardware.gpu_count,
+                        "gpu_model": study.hardware.gpu_model,
+                    },
+                    "model": {
+                        "model_id": study.model.model_id,
+                        "served_model_name": study.model.served_model_name,
+                    },
+                    "trace": {
+                        "window_id": study.trace.window_id,
+                        "request_mode": study.trace.request_mode,
+                        "completion_tokens_override": study.trace.completion_tokens_override,
+                        "input_length_filter": (
+                            {
+                                "min_input_tokens": study.trace.input_length_filter.min_input_tokens,
+                                "max_input_tokens": study.trace.input_length_filter.max_input_tokens,
+                            }
+                            if study.trace.input_length_filter is not None
+                            else None
+                        ),
+                    },
+                    "engine": {
+                        "engine_name": study.engine.engine_name,
+                        "engine_version": study.engine.engine_version,
+                        "base_flags": study.engine.base_flags,
+                        "base_envs": study.engine.base_envs,
+                        "allowed_flag_keys": study.engine.tunable_flags,
+                        "allowed_env_keys": study.engine.tunable_envs,
+                        "topology_constraints": (
+                            study.engine.topology_constraints.__dict__
+                            if study.engine.topology_constraints is not None
+                            else None
+                        ),
+                    },
+                },
+                ensure_ascii=False,
+                indent=2,
+            ),
+            "",
+            "SLO:",
+            json.dumps(
+                {
+                    "target_pass_rate": study.slo.target_pass_rate,
+                    "ttft_rule": study.slo.ttft_rule,
+                    "tpot_rule": study.slo.tpot_rule,
+                    "objective_notes": objective_notes,
+                },
+                default=lambda value: value.__dict__,
+                ensure_ascii=False,
+                indent=2,
+            ),
+            "",
+            "Trial history:",
+            json.dumps(history, ensure_ascii=False, indent=2),
+            "",
+            "Known launch failures:",
+            json.dumps(launch_failures, ensure_ascii=False, indent=2),
+            "",
+            "Tested config signatures:",
+            json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
+        ]
+        return "\n".join(sections)
+
+    sections = [
+        *common_preamble,
         (
             "TP/DP/EP are part of the tunable space for this study. Prioritize exploring legal topology changes in parallel space before runtime-only knobs unless recent history already proves a topology variant is worse or fails to launch."
             if parallel_candidates
@@ -314,30 +400,20 @@ def build_prompt(
         "Tested config signatures:",
         json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
     ]
-    if study.llm.use_harness:
-        sections.extend(
-            [
-                "",
-                "Harnesses:",
-                render_harness_context(
-                    build_harness_context(
-                        study=study,
-                        window_summary=window_summary,
-                        state=state,
-                    )
-                ),
-                "",
-            ]
-        )
-    else:
-        sections.extend(
-            [
-                "",
-                "Harnesses:",
-                "Disabled by llm.use_harness=false for ablation.",
-                "",
-            ]
-        )
+    sections.extend(
+        [
+            "",
+            "Harnesses:",
+            render_harness_context(
+                build_harness_context(
+                    study=study,
+                    window_summary=window_summary,
+                    state=state,
+                )
+            ),
+            "",
+        ]
+    )
     sections.extend(
         [
             "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
@@ -348,11 +424,7 @@ def build_prompt(
                 else "The evaluator runs each proposal over the full configured search range so raw per-iteration performance is measured directly."
             ),
             "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
-            (
-                "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
-                if study.llm.use_harness
-                else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
-            ),
+            "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
         ]
     )
     return "\n".join(sections)
@@ -604,6 +676,7 @@ def call_llm_for_proposal(
     *,
     policy: LLMPolicySpec,
     prompt: str,
+    use_harness: bool = True,
 ) -> str:
     if policy.endpoint is None:
         raise RuntimeError("study.llm.endpoint is not configured")
@@ -611,6 +684,7 @@ def call_llm_for_proposal(
     max_attempts = 4
     for attempt in range(max_attempts):
         try:
+            system_prompt = policy.system_prompt if use_harness else ""
             if policy.endpoint.stream:
                 text = stream_text_completion(
                     base_url=policy.endpoint.base_url,
@@ -620,7 +694,7 @@ def call_llm_for_proposal(
                     model=policy.endpoint.model,
                     messages=[{"role": "user", "content": prompt}],
                     timeout_s=policy.endpoint.timeout_s,
-                    system_prompt=policy.system_prompt,
+                    system_prompt=system_prompt,
                     reasoning_effort=policy.endpoint.reasoning_effort,
                 )
             else:
@@ -632,7 +706,7 @@ def call_llm_for_proposal(
                     model=policy.endpoint.model,
                     messages=[{"role": "user", "content": prompt}],
                     timeout_s=policy.endpoint.timeout_s,
-                    system_prompt=policy.system_prompt,
+                    system_prompt=system_prompt,
                     reasoning_effort=policy.endpoint.reasoning_effort,
                 )
             text = _extract_response_text(response)
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 8c704d9..dbdfe54 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1002,9 +1002,16 @@ class CoreFlowTests(unittest.TestCase):
             capability_profile=None,
         )
         self.assertFalse(study.llm.use_harness)
-        self.assertIn("Disabled by llm.use_harness=false", prompt)
+        self.assertIn("Study context:", prompt)
+        self.assertIn("Trial history:", prompt)
+        self.assertIn("Known launch failures:", prompt)
         self.assertNotIn('"paper_alignment"', prompt)
-        self.assertIn("without harness hints", prompt)
+        self.assertNotIn("Harnesses:", prompt)
+        self.assertNotIn("Disabled by llm.use_harness=false", prompt)
+        self.assertNotIn("without harness hints", prompt)
+        self.assertNotIn("Window summary:", prompt)
+        self.assertNotIn("Parallel space candidates:", prompt)
+        self.assertNotIn("Prioritize exploring legal topology changes", prompt)
 
     def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
         with tempfile.TemporaryDirectory() as tmp: