Minimize no-harness ablation prompt

This commit is contained in:
2026-05-12 09:42:53 +08:00
parent ae756600ce
commit e1125475ae
3 changed files with 126 additions and 37 deletions

View File

@@ -148,7 +148,11 @@ def cmd_study_llm_propose(args: argparse.Namespace) -> int:
state=state, state=state,
capability_profile=capability_profile, capability_profile=capability_profile,
) )
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt) proposal_text = call_llm_for_proposal(
policy=study.llm,
prompt=prompt,
use_harness=study.llm.use_harness,
)
proposal = parse_proposal_text(proposal_text, study) proposal = parse_proposal_text(proposal_text, study)
name = args.proposal_name or f"proposal-{state.next_trial_index:04d}" name = args.proposal_name or f"proposal-{state.next_trial_index:04d}"
path = store.write_proposal(study.study_id, name, proposal) path = store.write_proposal(study.study_id, name, proposal)
@@ -300,7 +304,11 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
"No proposal files provided, study.llm.endpoint is not configured, " "No proposal files provided, study.llm.endpoint is not configured, "
"and the harness stop guard did not fire." "and the harness stop guard did not fire."
) )
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt) proposal_text = call_llm_for_proposal(
policy=study.llm,
prompt=prompt,
use_harness=study.llm.use_harness,
)
proposal_name = f"proposal-{state.next_trial_index:04d}" proposal_name = f"proposal-{state.next_trial_index:04d}"
raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt" raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
raw_proposal_path.write_text(proposal_text, encoding="utf-8") raw_proposal_path.write_text(proposal_text, encoding="utf-8")

View File

@@ -212,16 +212,102 @@ def build_prompt(
) )
launch_failures = _launch_failure_history(state) launch_failures = _launch_failure_history(state)
parallel_candidates = _enumerate_parallel_candidates(study) parallel_candidates = _enumerate_parallel_candidates(study)
sections = [ common_preamble = [
"You are tuning an OpenAI-compatible serving engine.", "You are tuning an OpenAI-compatible serving engine.",
"Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop.", "Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop.",
"config_patch must contain env_patch and flag_patch.", "config_patch must contain env_patch and flag_patch.",
"expected_effects must be a JSON array of short strings, not an object.", "expected_effects must be a JSON array of short strings, not an object.",
"should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified.", (
"should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified."
if study.llm.use_harness
else "should_stop must be a boolean. Use false unless no valid config can be proposed."
),
"Only use allowed tunable env keys and allowed tunable flag keys.", "Only use allowed tunable env keys and allowed tunable flag keys.",
"Do not wrap the JSON in markdown fences or any extra text.", "Do not wrap the JSON in markdown fences or any extra text.",
"Do not repeat a config that previously failed to launch unless the new patch explicitly removes the failing knob.", "Do not repeat a config that previously failed to launch unless the new patch explicitly removes the failing knob.",
"Treat previous engine launch failures as hard negative evidence. If you touch TP/DP/EP, keep the proposal inside the topology constraints exactly.", "Treat previous engine launch failures as hard negative evidence. If you touch TP/DP/EP, keep the proposal inside the topology constraints exactly.",
]
if not study.llm.use_harness:
sections = [
*common_preamble,
"",
"Study context:",
json.dumps(
{
"study_id": study.study_id,
"objective": "maximize feasible request_rate_per_gpu at the SLO target",
"current_best": {
"trial_id": state.best_trial_id,
"best_parallel_size": state.best_parallel_size,
"best_sampling_u": state.best_sampling_u,
"best_request_rate": state.best_request_rate,
"best_request_rate_per_gpu": state.best_request_rate_per_gpu,
},
"hardware": {
"gpu_count": study.hardware.gpu_count,
"gpu_model": study.hardware.gpu_model,
},
"model": {
"model_id": study.model.model_id,
"served_model_name": study.model.served_model_name,
},
"trace": {
"window_id": study.trace.window_id,
"request_mode": study.trace.request_mode,
"completion_tokens_override": study.trace.completion_tokens_override,
"input_length_filter": (
{
"min_input_tokens": study.trace.input_length_filter.min_input_tokens,
"max_input_tokens": study.trace.input_length_filter.max_input_tokens,
}
if study.trace.input_length_filter is not None
else None
),
},
"engine": {
"engine_name": study.engine.engine_name,
"engine_version": study.engine.engine_version,
"base_flags": study.engine.base_flags,
"base_envs": study.engine.base_envs,
"allowed_flag_keys": study.engine.tunable_flags,
"allowed_env_keys": study.engine.tunable_envs,
"topology_constraints": (
study.engine.topology_constraints.__dict__
if study.engine.topology_constraints is not None
else None
),
},
},
ensure_ascii=False,
indent=2,
),
"",
"SLO:",
json.dumps(
{
"target_pass_rate": study.slo.target_pass_rate,
"ttft_rule": study.slo.ttft_rule,
"tpot_rule": study.slo.tpot_rule,
"objective_notes": objective_notes,
},
default=lambda value: value.__dict__,
ensure_ascii=False,
indent=2,
),
"",
"Trial history:",
json.dumps(history, ensure_ascii=False, indent=2),
"",
"Known launch failures:",
json.dumps(launch_failures, ensure_ascii=False, indent=2),
"",
"Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
]
return "\n".join(sections)
sections = [
*common_preamble,
( (
"TP/DP/EP are part of the tunable space for this study. Prioritize exploring legal topology changes in parallel space before runtime-only knobs unless recent history already proves a topology variant is worse or fails to launch." "TP/DP/EP are part of the tunable space for this study. Prioritize exploring legal topology changes in parallel space before runtime-only knobs unless recent history already proves a topology variant is worse or fails to launch."
if parallel_candidates if parallel_candidates
@@ -314,7 +400,6 @@ def build_prompt(
"Tested config signatures:", "Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2), json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
] ]
if study.llm.use_harness:
sections.extend( sections.extend(
[ [
"", "",
@@ -329,15 +414,6 @@ def build_prompt(
"", "",
] ]
) )
else:
sections.extend(
[
"",
"Harnesses:",
"Disabled by llm.use_harness=false for ablation.",
"",
]
)
sections.extend( sections.extend(
[ [
"The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.", "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
@@ -348,11 +424,7 @@ def build_prompt(
else "The evaluator runs each proposal over the full configured search range so raw per-iteration performance is measured directly." else "The evaluator runs each proposal over the full configured search range so raw per-iteration performance is measured directly."
), ),
"Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.", "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
( "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
if study.llm.use_harness
else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
),
] ]
) )
return "\n".join(sections) return "\n".join(sections)
@@ -604,6 +676,7 @@ def call_llm_for_proposal(
*, *,
policy: LLMPolicySpec, policy: LLMPolicySpec,
prompt: str, prompt: str,
use_harness: bool = True,
) -> str: ) -> str:
if policy.endpoint is None: if policy.endpoint is None:
raise RuntimeError("study.llm.endpoint is not configured") raise RuntimeError("study.llm.endpoint is not configured")
@@ -611,6 +684,7 @@ def call_llm_for_proposal(
max_attempts = 4 max_attempts = 4
for attempt in range(max_attempts): for attempt in range(max_attempts):
try: try:
system_prompt = policy.system_prompt if use_harness else ""
if policy.endpoint.stream: if policy.endpoint.stream:
text = stream_text_completion( text = stream_text_completion(
base_url=policy.endpoint.base_url, base_url=policy.endpoint.base_url,
@@ -620,7 +694,7 @@ def call_llm_for_proposal(
model=policy.endpoint.model, model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}], messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s, timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt, system_prompt=system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort, reasoning_effort=policy.endpoint.reasoning_effort,
) )
else: else:
@@ -632,7 +706,7 @@ def call_llm_for_proposal(
model=policy.endpoint.model, model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}], messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s, timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt, system_prompt=system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort, reasoning_effort=policy.endpoint.reasoning_effort,
) )
text = _extract_response_text(response) text = _extract_response_text(response)

View File

@@ -1002,9 +1002,16 @@ class CoreFlowTests(unittest.TestCase):
capability_profile=None, capability_profile=None,
) )
self.assertFalse(study.llm.use_harness) self.assertFalse(study.llm.use_harness)
self.assertIn("Disabled by llm.use_harness=false", prompt) self.assertIn("Study context:", prompt)
self.assertIn("Trial history:", prompt)
self.assertIn("Known launch failures:", prompt)
self.assertNotIn('"paper_alignment"', prompt) self.assertNotIn('"paper_alignment"', prompt)
self.assertIn("without harness hints", prompt) self.assertNotIn("Harnesses:", prompt)
self.assertNotIn("Disabled by llm.use_harness=false", prompt)
self.assertNotIn("without harness hints", prompt)
self.assertNotIn("Window summary:", prompt)
self.assertNotIn("Parallel space candidates:", prompt)
self.assertNotIn("Prioritize exploring legal topology changes", prompt)
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None: def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
with tempfile.TemporaryDirectory() as tmp: with tempfile.TemporaryDirectory() as tmp: