Harness: gate gpu-mem-util/seqs-raise on 'no untested TP increase' (frontier-closed)
The first gpt-5.5 verification run exposed a bug in the prior gate: topology_settled = cur_tp > base_tp let gpu-memory-utilization fire on a TP2 incumbent (TP2 > baseline TP1) and preempt the still-open TP4 frontier -- the harness proposed TP2 + gpu-mem-util=0.92 at iter 2 instead of climbing to TP4. The candidate path runs before the topology-frontier check, so a runtime candidate with score >= 0.35 wins. Fix: gate runtime micro-tuning (gpu-mem-util, raising max-num-seqs) on the TP frontier being closed -- topology_settled is now true only when no untested _next_allowed_tp remains (this respects the GPU count, so TP4 is the real ceiling on 6 GPUs). New regression test: a TP2 incumbent with TP4 reachable must climb TP and must NOT propose gpu-mem-util. 116 tests pass. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1194,14 +1194,21 @@ def _runtime_candidate_actions(
|
||||
topology_patch = _preserve_topology_patch(study, anchor_flags)
|
||||
actions: list[dict[str, Any]] = []
|
||||
|
||||
base_tp = _parse_int_like(study.engine.base_flags.get("tensor-parallel-size"), default=1)
|
||||
base_dp = _parse_int_like(study.engine.base_flags.get("data-parallel-size"), default=1)
|
||||
cur_tp = _parse_int_like(anchor_flags.get("tensor-parallel-size"), default=base_tp)
|
||||
cur_dp = _parse_int_like(anchor_flags.get("data-parallel-size"), default=base_dp)
|
||||
cur_tp = _parse_int_like(anchor_flags.get("tensor-parallel-size"), default=1)
|
||||
cur_dp = _parse_int_like(anchor_flags.get("data-parallel-size"), default=1)
|
||||
# Topology-before-runtime: gpu-mem-util / raising max-num-seqs are micro-tuning that is
|
||||
# only justified once topology has moved off the baseline. At the baseline a latency
|
||||
# bottleneck must still be answered with a topology change, not a runtime tweak.
|
||||
topology_settled = cur_tp > base_tp or cur_dp > base_dp
|
||||
# only justified once no untested TP increase remains. At an intermediate TP (e.g. TP2
|
||||
# while TP4 is still reachable and untried) a latency bottleneck must still be answered
|
||||
# by climbing TP, not a runtime tweak -- otherwise runtime tuning preempts the frontier.
|
||||
_next_tp = _next_allowed_tp(study, current_tp=cur_tp, current_dp=cur_dp)
|
||||
tp_frontier_open = (
|
||||
_next_tp is not None
|
||||
and _config_signature(
|
||||
{"env_patch": {}, "flag_patch": {"tensor-parallel-size": _next_tp}}
|
||||
)
|
||||
not in tested_signatures
|
||||
)
|
||||
topology_settled = not tp_frontier_open
|
||||
|
||||
if "max-num-batched-tokens" in tunable:
|
||||
current_mbt = _parse_int_like(anchor_flags.get("max-num-batched-tokens"), default=0)
|
||||
|
||||
@@ -1425,6 +1425,104 @@ class CoreFlowTests(unittest.TestCase):
|
||||
# And the harness must NOT authorize a stop while that knob is untried.
|
||||
self.assertIsNone(build_harness_stop_proposal(context))
|
||||
|
||||
def test_harness_climbs_tp_before_gpu_mem_util_micro_tuning(self) -> None:
    """An untried TP increase takes priority over gpu-memory-utilization tuning.

    Scenario: the incumbent trial runs at tensor-parallel-size 2 while the
    study's topology constraints still allow 4. The guided proposal must move
    to TP 4 and must not touch gpu-memory-utilization.
    """
    # SLO thresholds and engine constraints handed to the study fixture.
    slo = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine = {
        "tunable_flags": ["tensor-parallel-size", "gpu-memory-utilization"],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }

    # Probe that met the SLO, and the next probe that failed with TPOT violations.
    passing_probe = {
        "threshold": 0.03,
        "feasible": True,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.97,
            "request_rate": 1.1,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    failing_probe = {
        "threshold": 0.05,
        "feasible": False,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.6,
            "request_rate": 1.6,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"tpot_ms>50.0": 90}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_sampling_u": 0.03,
        "best_request_rate": 1.1,
        "best_pass_rate": 0.97,
        "probes": [passing_probe, failing_probe],
    }

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study = load_study_spec(
            _write_study_assets(
                workdir,
                slo_overrides=slo,
                engine_overrides=engine,
            )
        )

        # Persist the incumbent's result file so the harness context can reference it.
        result_path = workdir / "trial-0002.json"
        result_path.write_text(json.dumps(trial_result), encoding="utf-8")

        baseline_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=0.6,
            best_request_rate_per_gpu=0.6,
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        incumbent_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            best_request_rate=1.1,
            best_request_rate_per_gpu=0.55,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "gpu-memory-utilization": 0.9,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=1.1,
            best_request_rate_per_gpu=0.55,
            trials=[baseline_trial, incumbent_trial],
        )

        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 1500},
            state=state,
        )
        proposal = build_harness_guided_proposal(context)

        self.assertIsNotNone(proposal)
        proposed_flags = proposal.config_patch.flag_patch
        # Expect the climb to TP 4, with no gpu-memory-utilization change proposed.
        self.assertEqual(proposed_flags.get("tensor-parallel-size"), 4)
        self.assertNotIn("gpu-memory-utilization", proposed_flags)
|
||||
|
||||
def test_harness_validates_unmeasured_tp_frontier_before_runtime_refinement(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
|
||||
Reference in New Issue
Block a user