From e1125475aea4cadfb54ef2701e5726bb79c81e16 Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Tue, 12 May 2026 09:42:53 +0800
Subject: [PATCH] Minimize no-harness ablation prompt

---
 src/aituner/cli.py      |  12 +++-
 src/aituner/llm.py      | 140 ++++++++++++++++++++++++++++++----------
 tests/test_core_flow.py |  11 +++-
 3 files changed, 126 insertions(+), 37 deletions(-)

diff --git a/src/aituner/cli.py b/src/aituner/cli.py
index 0ce4d77..cc9fb7c 100644
--- a/src/aituner/cli.py
+++ b/src/aituner/cli.py
@@ -148,7 +148,11 @@ def cmd_study_llm_propose(args: argparse.Namespace) -> int:
         state=state,
         capability_profile=capability_profile,
     )
-    proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
+    proposal_text = call_llm_for_proposal(
+        policy=study.llm,
+        prompt=prompt,
+        use_harness=study.llm.use_harness,
+    )
     proposal = parse_proposal_text(proposal_text, study)
     name = args.proposal_name or f"proposal-{state.next_trial_index:04d}"
     path = store.write_proposal(study.study_id, name, proposal)
@@ -300,7 +304,11 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
             "No proposal files provided, study.llm.endpoint is not configured, "
             "and the harness stop guard did not fire."
         )
-    proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
+    proposal_text = call_llm_for_proposal(
+        policy=study.llm,
+        prompt=prompt,
+        use_harness=study.llm.use_harness,
+    )
     proposal_name = f"proposal-{state.next_trial_index:04d}"
     raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
     raw_proposal_path.write_text(proposal_text, encoding="utf-8")
diff --git a/src/aituner/llm.py b/src/aituner/llm.py
index 1dce926..855fdb5 100644
--- a/src/aituner/llm.py
+++ b/src/aituner/llm.py
@@ -212,16 +212,102 @@ def build_prompt(
     )
     launch_failures = _launch_failure_history(state)
     parallel_candidates = _enumerate_parallel_candidates(study)
-    sections = [
+    common_preamble = [
         "You are tuning an OpenAI-compatible serving engine.",
         "Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop.",
         "config_patch must contain env_patch and flag_patch.",
         "expected_effects must be a JSON array of short strings, not an object.",
-        "should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified.",
+        (
+            "should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified."
+            if study.llm.use_harness
+            else "should_stop must be a boolean. Use false unless no valid config can be proposed."
+        ),
         "Only use allowed tunable env keys and allowed tunable flag keys.",
         "Do not wrap the JSON in markdown fences or any extra text.",
         "Do not repeat a config that previously failed to launch unless the new patch explicitly removes the failing knob.",
         "Treat previous engine launch failures as hard negative evidence. If you touch TP/DP/EP, keep the proposal inside the topology constraints exactly.",
+    ]
+    if not study.llm.use_harness:
+        sections = [
+            *common_preamble,
+            "",
+            "Study context:",
+            json.dumps(
+                {
+                    "study_id": study.study_id,
+                    "objective": "maximize feasible request_rate_per_gpu at the SLO target",
+                    "current_best": {
+                        "trial_id": state.best_trial_id,
+                        "best_parallel_size": state.best_parallel_size,
+                        "best_sampling_u": state.best_sampling_u,
+                        "best_request_rate": state.best_request_rate,
+                        "best_request_rate_per_gpu": state.best_request_rate_per_gpu,
+                    },
+                    "hardware": {
+                        "gpu_count": study.hardware.gpu_count,
+                        "gpu_model": study.hardware.gpu_model,
+                    },
+                    "model": {
+                        "model_id": study.model.model_id,
+                        "served_model_name": study.model.served_model_name,
+                    },
+                    "trace": {
+                        "window_id": study.trace.window_id,
+                        "request_mode": study.trace.request_mode,
+                        "completion_tokens_override": study.trace.completion_tokens_override,
+                        "input_length_filter": (
+                            {
+                                "min_input_tokens": study.trace.input_length_filter.min_input_tokens,
+                                "max_input_tokens": study.trace.input_length_filter.max_input_tokens,
+                            }
+                            if study.trace.input_length_filter is not None
+                            else None
+                        ),
+                    },
+                    "engine": {
+                        "engine_name": study.engine.engine_name,
+                        "engine_version": study.engine.engine_version,
+                        "base_flags": study.engine.base_flags,
+                        "base_envs": study.engine.base_envs,
+                        "allowed_flag_keys": study.engine.tunable_flags,
+                        "allowed_env_keys": study.engine.tunable_envs,
+                        "topology_constraints": (
+                            study.engine.topology_constraints.__dict__
+                            if study.engine.topology_constraints is not None
+                            else None
+                        ),
+                    },
+                },
+                ensure_ascii=False,
+                indent=2,
+            ),
+            "",
+            "SLO:",
+            json.dumps(
+                {
+                    "target_pass_rate": study.slo.target_pass_rate,
+                    "ttft_rule": study.slo.ttft_rule,
+                    "tpot_rule": study.slo.tpot_rule,
+                    "objective_notes": objective_notes,
+                },
+                default=lambda value: value.__dict__,
+                ensure_ascii=False,
+                indent=2,
+            ),
+            "",
+            "Trial history:",
+            json.dumps(history, ensure_ascii=False, indent=2),
+            "",
+            "Known launch failures:",
+            json.dumps(launch_failures, ensure_ascii=False, indent=2),
+            "",
+            "Tested config signatures:",
+            json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
+        ]
+        return "\n".join(sections)
+
+    sections = [
+        *common_preamble,
         (
             "TP/DP/EP are part of the tunable space for this study. Prioritize exploring legal topology changes in parallel space before runtime-only knobs unless recent history already proves a topology variant is worse or fails to launch."
             if parallel_candidates
@@ -314,30 +400,20 @@ def build_prompt(
         "Tested config signatures:",
         json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
     ]
-    if study.llm.use_harness:
-        sections.extend(
-            [
-                "",
-                "Harnesses:",
-                render_harness_context(
-                    build_harness_context(
-                        study=study,
-                        window_summary=window_summary,
-                        state=state,
-                    )
-                ),
-                "",
-            ]
-        )
-    else:
-        sections.extend(
-            [
-                "",
-                "Harnesses:",
-                "Disabled by llm.use_harness=false for ablation.",
-                "",
-            ]
-        )
+    sections.extend(
+        [
+            "",
+            "Harnesses:",
+            render_harness_context(
+                build_harness_context(
+                    study=study,
+                    window_summary=window_summary,
+                    state=state,
+                )
+            ),
+            "",
+        ]
+    )
     sections.extend(
         [
             "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
@@ -348,11 +424,7 @@ def build_prompt(
                 else "The evaluator runs each proposal over the full configured search range so raw per-iteration performance is measured directly."
             ),
             "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
-            (
-                "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
-                if study.llm.use_harness
-                else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
-            ),
+            "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
         ]
     )
     return "\n".join(sections)
@@ -604,6 +676,7 @@ def call_llm_for_proposal(
     *,
     policy: LLMPolicySpec,
     prompt: str,
+    use_harness: bool = True,
 ) -> str:
     if policy.endpoint is None:
         raise RuntimeError("study.llm.endpoint is not configured")
@@ -611,6 +684,7 @@ def call_llm_for_proposal(
     max_attempts = 4
     for attempt in range(max_attempts):
         try:
+            system_prompt = policy.system_prompt if use_harness else ""
             if policy.endpoint.stream:
                 text = stream_text_completion(
                     base_url=policy.endpoint.base_url,
@@ -620,7 +694,7 @@ def call_llm_for_proposal(
                     model=policy.endpoint.model,
                     messages=[{"role": "user", "content": prompt}],
                     timeout_s=policy.endpoint.timeout_s,
-                    system_prompt=policy.system_prompt,
+                    system_prompt=system_prompt,
                     reasoning_effort=policy.endpoint.reasoning_effort,
                 )
             else:
@@ -632,7 +706,7 @@ def call_llm_for_proposal(
                     model=policy.endpoint.model,
                     messages=[{"role": "user", "content": prompt}],
                     timeout_s=policy.endpoint.timeout_s,
-                    system_prompt=policy.system_prompt,
+                    system_prompt=system_prompt,
                     reasoning_effort=policy.endpoint.reasoning_effort,
                 )
             text = _extract_response_text(response)
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 8c704d9..dbdfe54 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1002,9 +1002,16 @@ class CoreFlowTests(unittest.TestCase):
             capability_profile=None,
         )
         self.assertFalse(study.llm.use_harness)
-        self.assertIn("Disabled by llm.use_harness=false", prompt)
+        self.assertIn("Study context:", prompt)
+        self.assertIn("Trial history:", prompt)
+        self.assertIn("Known launch failures:", prompt)
         self.assertNotIn('"paper_alignment"', prompt)
-        self.assertIn("without harness hints", prompt)
+        self.assertNotIn("Harnesses:", prompt)
+        self.assertNotIn("Disabled by llm.use_harness=false", prompt)
+        self.assertNotIn("without harness hints", prompt)
+        self.assertNotIn("Window summary:", prompt)
+        self.assertNotIn("Parallel space candidates:", prompt)
+        self.assertNotIn("Prioritize exploring legal topology changes", prompt)
 
     def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
         with tempfile.TemporaryDirectory() as tmp: