Minimize no-harness ablation prompt
@@ -148,7 +148,11 @@ def cmd_study_llm_propose(args: argparse.Namespace) -> int:
         state=state,
         capability_profile=capability_profile,
     )
-    proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
+    proposal_text = call_llm_for_proposal(
+        policy=study.llm,
+        prompt=prompt,
+        use_harness=study.llm.use_harness,
+    )
     proposal = parse_proposal_text(proposal_text, study)
     name = args.proposal_name or f"proposal-{state.next_trial_index:04d}"
     path = store.write_proposal(study.study_id, name, proposal)
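
A self-contained sketch of the new call shape, for readers skimming the diff. LLMPolicySpec below is a minimal stand-in that mirrors only the toggle, not the real spec, and the body is invented; only the keyword names come from this commit:

    from dataclasses import dataclass

    @dataclass
    class LLMPolicySpec:  # stand-in; the real spec carries endpoint, system_prompt, etc.
        use_harness: bool = True

    def call_llm_for_proposal(*, policy: LLMPolicySpec, prompt: str, use_harness: bool = True) -> str:
        # Defaulting use_harness=True keeps any older call sites working unchanged.
        mode = "harness" if use_harness else "ablation"
        return f"[{mode}] {prompt}"

    policy = LLMPolicySpec(use_harness=False)
    print(call_llm_for_proposal(policy=policy, prompt="tune", use_harness=policy.use_harness))
    # -> "[ablation] tune"
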
@@ -300,7 +304,11 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
             "No proposal files provided, study.llm.endpoint is not configured, "
             "and the harness stop guard did not fire."
         )
-    proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
+    proposal_text = call_llm_for_proposal(
+        policy=study.llm,
+        prompt=prompt,
+        use_harness=study.llm.use_harness,
+    )
     proposal_name = f"proposal-{state.next_trial_index:04d}"
     raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
     raw_proposal_path.write_text(proposal_text, encoding="utf-8")
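
Both call sites now forward the same toggle. A hedged sketch of a regression test for that threading; run_tune_step is a local stand-in for the cmd_study_tune call site, not repo code:

    import unittest
    from unittest import mock

    def run_tune_step(llm_policy, prompt, call):
        # Mirrors the call site above: the toggle travels with the policy object.
        return call(policy=llm_policy, prompt=prompt, use_harness=llm_policy.use_harness)

    class UseHarnessThreadingTest(unittest.TestCase):
        def test_ablation_toggle_is_forwarded(self) -> None:
            policy = mock.Mock(use_harness=False)
            call = mock.Mock(return_value="{}")
            run_tune_step(policy, "prompt text", call)
            call.assert_called_once_with(policy=policy, prompt="prompt text", use_harness=False)

    if __name__ == "__main__":
        unittest.main()
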
@@ -212,16 +212,102 @@ def build_prompt(
     )
     launch_failures = _launch_failure_history(state)
     parallel_candidates = _enumerate_parallel_candidates(study)
-    sections = [
+    common_preamble = [
         "You are tuning an OpenAI-compatible serving engine.",
         "Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop.",
         "config_patch must contain env_patch and flag_patch.",
         "expected_effects must be a JSON array of short strings, not an object.",
-        "should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified.",
+        (
+            "should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified."
+            if study.llm.use_harness
+            else "should_stop must be a boolean. Use false unless no valid config can be proposed."
+        ),
         "Only use allowed tunable env keys and allowed tunable flag keys.",
         "Do not wrap the JSON in markdown fences or any extra text.",
         "Do not repeat a config that previously failed to launch unless the new patch explicitly removes the failing knob.",
         "Treat previous engine launch failures as hard negative evidence. If you touch TP/DP/EP, keep the proposal inside the topology constraints exactly.",
+    ]
+    if not study.llm.use_harness:
+        sections = [
+            *common_preamble,
+            "",
+            "Study context:",
+            json.dumps(
+                {
+                    "study_id": study.study_id,
+                    "objective": "maximize feasible request_rate_per_gpu at the SLO target",
+                    "current_best": {
+                        "trial_id": state.best_trial_id,
+                        "best_parallel_size": state.best_parallel_size,
+                        "best_sampling_u": state.best_sampling_u,
+                        "best_request_rate": state.best_request_rate,
+                        "best_request_rate_per_gpu": state.best_request_rate_per_gpu,
+                    },
+                    "hardware": {
+                        "gpu_count": study.hardware.gpu_count,
+                        "gpu_model": study.hardware.gpu_model,
+                    },
+                    "model": {
+                        "model_id": study.model.model_id,
+                        "served_model_name": study.model.served_model_name,
+                    },
+                    "trace": {
+                        "window_id": study.trace.window_id,
+                        "request_mode": study.trace.request_mode,
+                        "completion_tokens_override": study.trace.completion_tokens_override,
+                        "input_length_filter": (
+                            {
+                                "min_input_tokens": study.trace.input_length_filter.min_input_tokens,
+                                "max_input_tokens": study.trace.input_length_filter.max_input_tokens,
+                            }
+                            if study.trace.input_length_filter is not None
+                            else None
+                        ),
+                    },
+                    "engine": {
+                        "engine_name": study.engine.engine_name,
+                        "engine_version": study.engine.engine_version,
+                        "base_flags": study.engine.base_flags,
+                        "base_envs": study.engine.base_envs,
+                        "allowed_flag_keys": study.engine.tunable_flags,
+                        "allowed_env_keys": study.engine.tunable_envs,
+                        "topology_constraints": (
+                            study.engine.topology_constraints.__dict__
+                            if study.engine.topology_constraints is not None
+                            else None
+                        ),
+                    },
+                },
+                ensure_ascii=False,
+                indent=2,
+            ),
+            "",
+            "SLO:",
+            json.dumps(
+                {
+                    "target_pass_rate": study.slo.target_pass_rate,
+                    "ttft_rule": study.slo.ttft_rule,
+                    "tpot_rule": study.slo.tpot_rule,
+                    "objective_notes": objective_notes,
+                },
+                default=lambda value: value.__dict__,
+                ensure_ascii=False,
+                indent=2,
+            ),
+            "",
+            "Trial history:",
+            json.dumps(history, ensure_ascii=False, indent=2),
+            "",
+            "Known launch failures:",
+            json.dumps(launch_failures, ensure_ascii=False, indent=2),
+            "",
+            "Tested config signatures:",
+            json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
+        ]
+        return "\n".join(sections)
+
+    sections = [
+        *common_preamble,
         (
             "TP/DP/EP are part of the tunable space for this study. Prioritize exploring legal topology changes in parallel space before runtime-only knobs unless recent history already proves a topology variant is worse or fails to launch."
             if parallel_candidates
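
The new build_prompt shape in miniature: one shared preamble, an early return that keeps the ablation prompt minimal, and the richer harness path after it. All section strings and data below are invented for illustration:

    import json

    def build_prompt_sketch(use_harness: bool) -> str:
        common_preamble = [
            "Return exactly one JSON object.",
            "config_patch must contain env_patch and flag_patch.",
        ]
        if not use_harness:
            # Ablation: raw study stack only, no harness sections.
            sections = [
                *common_preamble,
                "",
                "Study context:",
                json.dumps({"study_id": "demo"}, ensure_ascii=False, indent=2),
            ]
            return "\n".join(sections)
        # Harness path: everything past this point can assume use_harness is true.
        sections = [*common_preamble, "", "Window summary:", "..."]
        return "\n".join(sections)

    print(build_prompt_sketch(use_harness=False))
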
@@ -314,7 +400,6 @@ def build_prompt(
         "Tested config signatures:",
         json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
     ]
-    if study.llm.use_harness:
     sections.extend(
         [
             "",
@@ -329,15 +414,6 @@ def build_prompt(
             "",
         ]
     )
-    else:
-        sections.extend(
-            [
-                "",
-                "Harnesses:",
-                "Disabled by llm.use_harness=false for ablation.",
-                "",
-            ]
-        )
     sections.extend(
         [
             "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
@@ -348,11 +424,7 @@ def build_prompt(
                 else "The evaluator runs each proposal over the full configured search range so raw per-iteration performance is measured directly."
             ),
             "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
-            (
-                "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
-                if study.llm.use_harness
-                else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
-            ),
+            "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
         ]
     )
     return "\n".join(sections)
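
The removals above plus the conditional collapse work because the ablation path already returned earlier in build_prompt. A toy before/after of that guard-clause refactor, with made-up section strings:

    def before(use_harness):
        sections = []
        if use_harness:
            sections.extend(["Window summary:"])
        else:
            sections.extend(["Harnesses:", "Disabled by llm.use_harness=false for ablation."])
        return sections

    def after(use_harness):
        if not use_harness:
            return ["Study context:"]  # early return supplies the minimized prompt
        sections = []
        sections.extend(["Window summary:"])  # unconditional from here on
        return sections

    assert after(True) == before(True)  # harness path unchanged by the refactor
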
@@ -604,6 +676,7 @@ def call_llm_for_proposal(
     *,
     policy: LLMPolicySpec,
     prompt: str,
+    use_harness: bool = True,
 ) -> str:
     if policy.endpoint is None:
         raise RuntimeError("study.llm.endpoint is not configured")
@@ -611,6 +684,7 @@ def call_llm_for_proposal(
     max_attempts = 4
     for attempt in range(max_attempts):
         try:
+            system_prompt = policy.system_prompt if use_harness else ""
             if policy.endpoint.stream:
                 text = stream_text_completion(
                     base_url=policy.endpoint.base_url,
@@ -620,7 +694,7 @@ def call_llm_for_proposal(
                     model=policy.endpoint.model,
                     messages=[{"role": "user", "content": prompt}],
                     timeout_s=policy.endpoint.timeout_s,
-                    system_prompt=policy.system_prompt,
+                    system_prompt=system_prompt,
                     reasoning_effort=policy.endpoint.reasoning_effort,
                 )
             else:
@@ -632,7 +706,7 @@ def call_llm_for_proposal(
                     model=policy.endpoint.model,
                     messages=[{"role": "user", "content": prompt}],
                     timeout_s=policy.endpoint.timeout_s,
-                    system_prompt=policy.system_prompt,
+                    system_prompt=system_prompt,
                     reasoning_effort=policy.endpoint.reasoning_effort,
                 )
             text = _extract_response_text(response)
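
Putting the call_llm_for_proposal changes together: the system prompt is selected once per attempt and blanked for ablation runs. A self-contained sketch with a stub transport in place of the real stream/chat helpers:

    def _fake_endpoint(*, prompt, system_prompt):
        # Stand-in for the real stream/chat helpers; always succeeds here.
        return f"<{system_prompt or 'no system prompt'}> {prompt}"

    def call_with_retries(prompt, configured_system_prompt, use_harness):
        system_prompt = configured_system_prompt if use_harness else ""  # ablation drops it
        max_attempts = 4
        last_error = None
        for attempt in range(max_attempts):
            try:
                return _fake_endpoint(prompt=prompt, system_prompt=system_prompt)
            except RuntimeError as exc:  # stand-in for transport failures
                last_error = exc
        raise RuntimeError(f"all {max_attempts} attempts failed") from last_error

    print(call_with_retries("tune", "harness rules", use_harness=False))
    # -> "<no system prompt> tune"
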
@@ -1002,9 +1002,16 @@ class CoreFlowTests(unittest.TestCase):
             capability_profile=None,
         )
         self.assertFalse(study.llm.use_harness)
-        self.assertIn("Disabled by llm.use_harness=false", prompt)
+        self.assertIn("Study context:", prompt)
+        self.assertIn("Trial history:", prompt)
+        self.assertIn("Known launch failures:", prompt)
         self.assertNotIn('"paper_alignment"', prompt)
-        self.assertIn("without harness hints", prompt)
+        self.assertNotIn("Harnesses:", prompt)
+        self.assertNotIn("Disabled by llm.use_harness=false", prompt)
+        self.assertNotIn("without harness hints", prompt)
+        self.assertNotIn("Window summary:", prompt)
+        self.assertNotIn("Parallel space candidates:", prompt)
+        self.assertNotIn("Prioritize exploring legal topology changes", prompt)

     def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
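
To exercise the updated assertions locally, an invocation like the following should work; the module path is a placeholder, since the test file's name is not shown in this diff:

    python -m unittest -v path.to.core_flow_tests
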