Minimize no-harness ablation prompt

2026-05-12 09:42:53 +08:00
parent ae756600ce
commit e1125475ae
3 changed files with 126 additions and 37 deletions

View File

@@ -148,7 +148,11 @@ def cmd_study_llm_propose(args: argparse.Namespace) -> int:
state=state,
capability_profile=capability_profile,
)
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
proposal_text = call_llm_for_proposal(
policy=study.llm,
prompt=prompt,
use_harness=study.llm.use_harness,
)
proposal = parse_proposal_text(proposal_text, study)
name = args.proposal_name or f"proposal-{state.next_trial_index:04d}"
path = store.write_proposal(study.study_id, name, proposal)
@@ -300,7 +304,11 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
"No proposal files provided, study.llm.endpoint is not configured, "
"and the harness stop guard did not fire."
)
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
proposal_text = call_llm_for_proposal(
policy=study.llm,
prompt=prompt,
use_harness=study.llm.use_harness,
)
proposal_name = f"proposal-{state.next_trial_index:04d}"
raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
raw_proposal_path.write_text(proposal_text, encoding="utf-8")
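
For reference, and not part of the diff: a hedged sketch of the proposal_text these call sites hand to parse_proposal_text, following the JSON contract the prompt in this commit spells out (one bare JSON object with observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop). The key names come from that prompt; every env key, flag key, and value below is a hypothetical placeholder, not a real engine knob.

import json

example_proposal_text = json.dumps({
    "observation": "TPOT misses the SLO before TTFT does as request rate rises.",
    "diagnosis": "Decode throughput is the bottleneck; batching headroom looks unused.",
    "config_patch": {
        "env_patch": {"EXAMPLE_ENV_KEY": "1"},      # hypothetical allowed env key
        "flag_patch": {"--example-flag": "256"},    # hypothetical allowed flag key
    },
    "expected_effects": ["higher decode throughput", "slightly higher TTFT"],
    "why_not_previous_failures": "No previously failed launch config touched these knobs.",
    "should_stop": False,
})

Under the ablation branch the prompt only allows should_stop=true when no valid config can be proposed, so a routine proposal like this keeps it false.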

View File

@@ -212,16 +212,102 @@ def build_prompt(
)
launch_failures = _launch_failure_history(state)
parallel_candidates = _enumerate_parallel_candidates(study)
sections = [
common_preamble = [
"You are tuning an OpenAI-compatible serving engine.",
"Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop.",
"config_patch must contain env_patch and flag_patch.",
"expected_effects must be a JSON array of short strings, not an object.",
"should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified.",
(
"should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified."
if study.llm.use_harness
else "should_stop must be a boolean. Use false unless no valid config can be proposed."
),
"Only use allowed tunable env keys and allowed tunable flag keys.",
"Do not wrap the JSON in markdown fences or any extra text.",
"Do not repeat a config that previously failed to launch unless the new patch explicitly removes the failing knob.",
"Treat previous engine launch failures as hard negative evidence. If you touch TP/DP/EP, keep the proposal inside the topology constraints exactly.",
]
if not study.llm.use_harness:
sections = [
*common_preamble,
"",
"Study context:",
json.dumps(
{
"study_id": study.study_id,
"objective": "maximize feasible request_rate_per_gpu at the SLO target",
"current_best": {
"trial_id": state.best_trial_id,
"best_parallel_size": state.best_parallel_size,
"best_sampling_u": state.best_sampling_u,
"best_request_rate": state.best_request_rate,
"best_request_rate_per_gpu": state.best_request_rate_per_gpu,
},
"hardware": {
"gpu_count": study.hardware.gpu_count,
"gpu_model": study.hardware.gpu_model,
},
"model": {
"model_id": study.model.model_id,
"served_model_name": study.model.served_model_name,
},
"trace": {
"window_id": study.trace.window_id,
"request_mode": study.trace.request_mode,
"completion_tokens_override": study.trace.completion_tokens_override,
"input_length_filter": (
{
"min_input_tokens": study.trace.input_length_filter.min_input_tokens,
"max_input_tokens": study.trace.input_length_filter.max_input_tokens,
}
if study.trace.input_length_filter is not None
else None
),
},
"engine": {
"engine_name": study.engine.engine_name,
"engine_version": study.engine.engine_version,
"base_flags": study.engine.base_flags,
"base_envs": study.engine.base_envs,
"allowed_flag_keys": study.engine.tunable_flags,
"allowed_env_keys": study.engine.tunable_envs,
"topology_constraints": (
study.engine.topology_constraints.__dict__
if study.engine.topology_constraints is not None
else None
),
},
},
ensure_ascii=False,
indent=2,
),
"",
"SLO:",
json.dumps(
{
"target_pass_rate": study.slo.target_pass_rate,
"ttft_rule": study.slo.ttft_rule,
"tpot_rule": study.slo.tpot_rule,
"objective_notes": objective_notes,
},
default=lambda value: value.__dict__,
ensure_ascii=False,
indent=2,
),
"",
"Trial history:",
json.dumps(history, ensure_ascii=False, indent=2),
"",
"Known launch failures:",
json.dumps(launch_failures, ensure_ascii=False, indent=2),
"",
"Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
]
return "\n".join(sections)
sections = [
*common_preamble,
(
"TP/DP/EP are part of the tunable space for this study. Prioritize exploring legal topology changes in parallel space before runtime-only knobs unless recent history already proves a topology variant is worse or fails to launch."
if parallel_candidates
@@ -314,7 +400,6 @@ def build_prompt(
"Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
]
if study.llm.use_harness:
sections.extend(
[
"",
@@ -329,15 +414,6 @@ def build_prompt(
"",
]
)
else:
sections.extend(
[
"",
"Harnesses:",
"Disabled by llm.use_harness=false for ablation.",
"",
]
)
sections.extend(
[
"The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
@@ -348,11 +424,7 @@ def build_prompt(
else "The evaluator runs each proposal over the full configured search range so raw per-iteration performance is measured directly."
),
"Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
(
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
if study.llm.use_harness
else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
),
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
]
)
return "\n".join(sections)
@@ -604,6 +676,7 @@ def call_llm_for_proposal(
*,
policy: LLMPolicySpec,
prompt: str,
use_harness: bool = True,
) -> str:
if policy.endpoint is None:
raise RuntimeError("study.llm.endpoint is not configured")
@@ -611,6 +684,7 @@ def call_llm_for_proposal(
max_attempts = 4
for attempt in range(max_attempts):
try:
system_prompt = policy.system_prompt if use_harness else ""
if policy.endpoint.stream:
text = stream_text_completion(
base_url=policy.endpoint.base_url,
@@ -620,7 +694,7 @@ def call_llm_for_proposal(
model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt,
system_prompt=system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort,
)
else:
@@ -632,7 +706,7 @@ def call_llm_for_proposal(
model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt,
system_prompt=system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort,
)
text = _extract_response_text(response)
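
A minimal usage sketch, assuming the call_llm_for_proposal, study, and prompt names from the diff above rather than repository code: the new keyword defaults to True, so untouched call sites keep sending policy.system_prompt, while the ablation path passes use_harness=False and the request goes out with an empty system prompt on top of the minimized prompt from build_prompt.

text_with_harness = call_llm_for_proposal(policy=study.llm, prompt=prompt)  # default use_harness=True keeps policy.system_prompt
text_ablation = call_llm_for_proposal(
    policy=study.llm,
    prompt=prompt,
    use_harness=False,  # empty system prompt; build_prompt already emits the minimized no-harness prompt
)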

View File

@@ -1002,9 +1002,16 @@ class CoreFlowTests(unittest.TestCase):
capability_profile=None,
)
self.assertFalse(study.llm.use_harness)
self.assertIn("Disabled by llm.use_harness=false", prompt)
self.assertIn("Study context:", prompt)
self.assertIn("Trial history:", prompt)
self.assertIn("Known launch failures:", prompt)
self.assertNotIn('"paper_alignment"', prompt)
self.assertIn("without harness hints", prompt)
self.assertNotIn("Harnesses:", prompt)
self.assertNotIn("Disabled by llm.use_harness=false", prompt)
self.assertNotIn("without harness hints", prompt)
self.assertNotIn("Window summary:", prompt)
self.assertNotIn("Parallel space candidates:", prompt)
self.assertNotIn("Prioritize exploring legal topology changes", prompt)
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
with tempfile.TemporaryDirectory() as tmp: