Minimize no-harness ablation prompt

2026-05-12 09:42:53 +08:00
parent ae756600ce
commit e1125475ae
3 changed files with 126 additions and 37 deletions

View File

@@ -148,7 +148,11 @@ def cmd_study_llm_propose(args: argparse.Namespace) -> int:
state=state,
capability_profile=capability_profile,
)
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
proposal_text = call_llm_for_proposal(
policy=study.llm,
prompt=prompt,
use_harness=study.llm.use_harness,
)
proposal = parse_proposal_text(proposal_text, study)
name = args.proposal_name or f"proposal-{state.next_trial_index:04d}"
path = store.write_proposal(study.study_id, name, proposal)
@@ -300,7 +304,11 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
"No proposal files provided, study.llm.endpoint is not configured, "
"and the harness stop guard did not fire."
)
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
proposal_text = call_llm_for_proposal(
policy=study.llm,
prompt=prompt,
use_harness=study.llm.use_harness,
)
proposal_name = f"proposal-{state.next_trial_index:04d}"
raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
raw_proposal_path.write_text(proposal_text, encoding="utf-8")
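
For reference, and not part of the diff: a hedged sketch of the proposal_text these call sites hand to parse_proposal_text, following the JSON contract the prompt in this commit spells out (one bare JSON object with observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop). The key names come from that prompt; every env key, flag key, and value below is a hypothetical placeholder, not a real engine knob.

import json

example_proposal_text = json.dumps({
    "observation": "TPOT misses the SLO before TTFT does as request rate rises.",
    "diagnosis": "Decode throughput is the bottleneck; batching headroom looks unused.",
    "config_patch": {
        "env_patch": {"EXAMPLE_ENV_KEY": "1"},      # hypothetical allowed env key
        "flag_patch": {"--example-flag": "256"},    # hypothetical allowed flag key
    },
    "expected_effects": ["higher decode throughput", "slightly higher TTFT"],
    "why_not_previous_failures": "No previously failed launch config touched these knobs.",
    "should_stop": False,
})

Under the ablation branch the prompt only allows should_stop=true when no valid config can be proposed, so a routine proposal like this keeps it false.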

View File

@@ -212,16 +212,102 @@ def build_prompt(
)
launch_failures = _launch_failure_history(state)
parallel_candidates = _enumerate_parallel_candidates(study)
sections = [
common_preamble = [
"You are tuning an OpenAI-compatible serving engine.",
"Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop.",
"config_patch must contain env_patch and flag_patch.",
"expected_effects must be a JSON array of short strings, not an object.",
"should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified.",
(
"should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified."
if study.llm.use_harness
else "should_stop must be a boolean. Use false unless no valid config can be proposed."
),
"Only use allowed tunable env keys and allowed tunable flag keys.",
"Do not wrap the JSON in markdown fences or any extra text.",
"Do not repeat a config that previously failed to launch unless the new patch explicitly removes the failing knob.",
"Treat previous engine launch failures as hard negative evidence. If you touch TP/DP/EP, keep the proposal inside the topology constraints exactly.",
]
if not study.llm.use_harness:
sections = [
*common_preamble,
"",
"Study context:",
json.dumps(
{
"study_id": study.study_id,
"objective": "maximize feasible request_rate_per_gpu at the SLO target",
"current_best": {
"trial_id": state.best_trial_id,
"best_parallel_size": state.best_parallel_size,
"best_sampling_u": state.best_sampling_u,
"best_request_rate": state.best_request_rate,
"best_request_rate_per_gpu": state.best_request_rate_per_gpu,
},
"hardware": {
"gpu_count": study.hardware.gpu_count,
"gpu_model": study.hardware.gpu_model,
},
"model": {
"model_id": study.model.model_id,
"served_model_name": study.model.served_model_name,
},
"trace": {
"window_id": study.trace.window_id,
"request_mode": study.trace.request_mode,
"completion_tokens_override": study.trace.completion_tokens_override,
"input_length_filter": (
{
"min_input_tokens": study.trace.input_length_filter.min_input_tokens,
"max_input_tokens": study.trace.input_length_filter.max_input_tokens,
}
if study.trace.input_length_filter is not None
else None
),
},
"engine": {
"engine_name": study.engine.engine_name,
"engine_version": study.engine.engine_version,
"base_flags": study.engine.base_flags,
"base_envs": study.engine.base_envs,
"allowed_flag_keys": study.engine.tunable_flags,
"allowed_env_keys": study.engine.tunable_envs,
"topology_constraints": (
study.engine.topology_constraints.__dict__
if study.engine.topology_constraints is not None
else None
),
},
},
ensure_ascii=False,
indent=2,
),
"",
"SLO:",
json.dumps(
{
"target_pass_rate": study.slo.target_pass_rate,
"ttft_rule": study.slo.ttft_rule,
"tpot_rule": study.slo.tpot_rule,
"objective_notes": objective_notes,
},
default=lambda value: value.__dict__,
ensure_ascii=False,
indent=2,
),
"",
"Trial history:",
json.dumps(history, ensure_ascii=False, indent=2),
"",
"Known launch failures:",
json.dumps(launch_failures, ensure_ascii=False, indent=2),
"",
"Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
]
return "\n".join(sections)
sections = [
*common_preamble,
(
"TP/DP/EP are part of the tunable space for this study. Prioritize exploring legal topology changes in parallel space before runtime-only knobs unless recent history already proves a topology variant is worse or fails to launch."
if parallel_candidates
@@ -314,7 +400,6 @@ def build_prompt(
"Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
]
if study.llm.use_harness:
sections.extend(
[
"",
@@ -329,15 +414,6 @@ def build_prompt(
"",
]
)
else:
sections.extend(
[
"",
"Harnesses:",
"Disabled by llm.use_harness=false for ablation.",
"",
]
)
sections.extend(
[
"The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
@@ -348,11 +424,7 @@ def build_prompt(
else "The evaluator runs each proposal over the full configured search range so raw per-iteration performance is measured directly."
),
"Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
(
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
if study.llm.use_harness
else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
),
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
]
)
return "\n".join(sections)
@@ -604,6 +676,7 @@ def call_llm_for_proposal(
*,
policy: LLMPolicySpec,
prompt: str,
use_harness: bool = True,
) -> str:
if policy.endpoint is None:
raise RuntimeError("study.llm.endpoint is not configured")
@@ -611,6 +684,7 @@ def call_llm_for_proposal(
max_attempts = 4
for attempt in range(max_attempts):
try:
system_prompt = policy.system_prompt if use_harness else ""
if policy.endpoint.stream:
text = stream_text_completion(
base_url=policy.endpoint.base_url,
@@ -620,7 +694,7 @@ def call_llm_for_proposal(
model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt,
system_prompt=system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort,
)
else:
@@ -632,7 +706,7 @@ def call_llm_for_proposal(
model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt,
system_prompt=system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort,
)
text = _extract_response_text(response)
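
A minimal usage sketch, assuming the call_llm_for_proposal, study, and prompt names from the diff above rather than repository code: the new keyword defaults to True, so untouched call sites keep sending policy.system_prompt, while the ablation path passes use_harness=False and the request goes out with an empty system prompt on top of the minimized prompt from build_prompt.

text_with_harness = call_llm_for_proposal(policy=study.llm, prompt=prompt)  # default use_harness=True keeps policy.system_prompt
text_ablation = call_llm_for_proposal(
    policy=study.llm,
    prompt=prompt,
    use_harness=False,  # empty system prompt; build_prompt already emits the minimized no-harness prompt
)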

View File

@@ -1002,9 +1002,16 @@ class CoreFlowTests(unittest.TestCase):
capability_profile=None,
)
self.assertFalse(study.llm.use_harness)
self.assertIn("Disabled by llm.use_harness=false", prompt)
self.assertIn("Study context:", prompt)
self.assertIn("Trial history:", prompt)
self.assertIn("Known launch failures:", prompt)
self.assertNotIn('"paper_alignment"', prompt)
self.assertIn("without harness hints", prompt)
self.assertNotIn("Harnesses:", prompt)
self.assertNotIn("Disabled by llm.use_harness=false", prompt)
self.assertNotIn("without harness hints", prompt)
self.assertNotIn("Window summary:", prompt)
self.assertNotIn("Parallel space candidates:", prompt)
self.assertNotIn("Prioritize exploring legal topology changes", prompt)
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
with tempfile.TemporaryDirectory() as tmp: