Gate GMU climb on measured improvement
This commit is contained in:
@@ -104,9 +104,11 @@ harness 的 family、signature、scoring 和 validator 约束。
|
||||
- 当 scheduler dimension 还没有被 materialized config 覆盖时,加入
|
||||
`uncovered_scheduler_dimension_bonus`,让该 family 在 topology settled 后优先于
|
||||
`gpu-memory-utilization` 这类 resource micro-tuning。
|
||||
- 当该 family 已生成有效候选时,旧的 standalone `raise_mbt`、
|
||||
`enable_chunked_prefill`、`raise_mbt_and_max_num_seqs` 只作为 fallback,不作为同级
|
||||
prefill runtime 候选抢排序。
|
||||
- 当该 family 已生成有效候选时,旧的 standalone `raise_mbt`、
|
||||
`enable_chunked_prefill`、`raise_mbt_and_max_num_seqs` 只作为 fallback,不作为同级
|
||||
prefill runtime 候选抢排序。
|
||||
- `gpu-memory-utilization` 仍保留小步 hill-climb,但继续爬升必须由同拓扑
|
||||
request_rate_per_gpu 改善支撑;仅仅 launch 成功或打平 incumbent 不再算成功。
|
||||
|
||||
## 为什么不是 rule-based hack
|
||||
|
||||
@@ -185,6 +187,8 @@ harness 的 family、signature、scoring 和 validator 约束。
|
||||
`lower_admission_pressure_with_chunked_prefill`。
|
||||
- 抽出 `_higher_tp_frontier_patch`,让 runtime gate 与
|
||||
`_topology_frontier_status` 使用同一套 higher-TP signature。
|
||||
- GMU hill-climb 改为 measurement-gated:同拓扑 GMU trial 没有提升
|
||||
request_rate_per_gpu 时,阻断继续向更高 GMU 爬升,避免连续浪费 trials。
|
||||
|
||||
### 2026-06-29 远端 review feedback
|
||||
|
||||
@@ -290,5 +294,8 @@ trial-0003 已完成,best request_rate_per_gpu 约为 2.025,和 baseline 持
|
||||
falsification evidence:coverage priority 改变了探索顺序,具体 `chunked + MBT ~= p95`
|
||||
hypothesis 被验证后没有改进。系统随后进入 candidate-set-0004,开始测试
|
||||
`gpu-memory-utilization=0.9`。trial-0004 同样完成在约 2.025,没有超过 baseline;
|
||||
当前旧 run 已进入 trial-0005,继续测试 `gpu-memory-utilization=0.92`。后续需要观察
|
||||
GMU climb 是否会停下并转向 admission pressure、topology/DP 或其他 family。
|
||||
trial-0005 的 `gpu-memory-utilization=0.92` 仍然打平 baseline,旧 run 随后继续排
|
||||
`gpu-memory-utilization=0.94`。这暴露出旧实现的 GMU hill-climb 问题:它把 launch
|
||||
成功当成 climb 成功,而没有要求 request_rate_per_gpu 改善。最新本地实现已经修正为
|
||||
measurement-gated GMU climb;下一轮应使用新提交重新跑,验证 GMU tie 后是否转向
|
||||
admission pressure、topology/DP 或其他 family。
|
||||
|
||||
@@ -1590,6 +1590,7 @@ def _runtime_candidate_actions(
|
||||
study,
|
||||
anchor_flags,
|
||||
recent_diagnostics,
|
||||
anchor_rate_per_gpu=_profile_request_rate_per_gpu(anchor),
|
||||
)
|
||||
if target is not None:
|
||||
patch = {**runtime_base_patch, "gpu-memory-utilization": target}
|
||||
@@ -1629,6 +1630,8 @@ def _next_gpu_memory_utilization_target(
|
||||
study: StudySpec,
|
||||
anchor_flags: dict[str, Any],
|
||||
recent_diagnostics: list[dict[str, Any]],
|
||||
*,
|
||||
anchor_rate_per_gpu: float = 0.0,
|
||||
) -> float | None:
|
||||
current_gmu = _parse_float_like(
|
||||
anchor_flags.get("gpu-memory-utilization"), default=0.9
|
||||
@@ -1651,8 +1654,14 @@ def _next_gpu_memory_utilization_target(
|
||||
gmu = _parse_float_like(flag_patch.get("gpu-memory-utilization"), default=0.0)
|
||||
if gmu <= 0:
|
||||
continue
|
||||
if abs(gmu - current_gmu) <= EPSILON:
|
||||
continue
|
||||
if item.get("status") == "completed":
|
||||
successful_gmus.append(gmu)
|
||||
rate = _as_float(item.get("best_request_rate_per_gpu"))
|
||||
if anchor_rate_per_gpu > 0 and rate <= anchor_rate_per_gpu + EPSILON:
|
||||
failed_gmus.append(gmu)
|
||||
else:
|
||||
successful_gmus.append(gmu)
|
||||
elif item.get("status") == "failed":
|
||||
failed_gmus.append(gmu)
|
||||
climb_from = max(successful_gmus)
|
||||
@@ -1668,6 +1677,13 @@ def _next_gpu_memory_utilization_target(
|
||||
return target
|
||||
|
||||
|
||||
def _profile_request_rate_per_gpu(profile: dict[str, Any]) -> float:
    """Return the ``best_request_rate_per_gpu`` recorded on ``profile``.

    The value is looked up in the nested ``performance`` entry when that
    entry is a dict; otherwise it is looked up at the top level of
    ``profile``. Whatever is found (including a missing key, i.e. ``None``)
    is passed through ``_as_float`` for conversion.
    """
    nested = profile.get("performance")
    # Only a dict-valued ``performance`` entry is consulted; any other value
    # (absent, None, non-dict) falls back to the profile's own top-level key.
    source = nested if isinstance(nested, dict) else profile
    return _as_float(source.get("best_request_rate_per_gpu"))
|
||||
|
||||
|
||||
def _prefill_scheduler_candidate_actions(
|
||||
study: StudySpec,
|
||||
window_summary: dict[str, Any],
|
||||
|
||||
@@ -2594,10 +2594,9 @@ class CoreFlowTests(unittest.TestCase):
|
||||
)
|
||||
self.assertNotIn("tensor-parallel-size", proposal.config_patch.flag_patch)
|
||||
|
||||
def test_harness_continues_gpu_mem_util_after_tied_same_topology_probe(self) -> None:
|
||||
"""After adjacent topology validation, gpu-memory-utilization should hill-climb
|
||||
on the incumbent topology even if an earlier gmu step tied the incumbent and
|
||||
did not become state.best_trial_id."""
|
||||
def test_harness_stops_gpu_mem_util_climb_after_tied_same_topology_probe(self) -> None:
|
||||
"""A same-topology gpu-memory-utilization probe must improve per-GPU rate before
|
||||
the hill-climb continues; launch success alone is not evidence to keep climbing."""
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
study_path = _write_study_assets(
|
||||
@@ -2711,11 +2710,14 @@ class CoreFlowTests(unittest.TestCase):
|
||||
window_summary={"prompt_tokens_p95": 1500},
|
||||
state=state,
|
||||
)
|
||||
proposal = build_harness_guided_proposal(context)
|
||||
self.assertIsNotNone(proposal)
|
||||
self.assertEqual(
|
||||
proposal.config_patch.flag_patch,
|
||||
candidates = context["experiment_plan"]["candidate_actions"]
|
||||
self.assertNotIn(
|
||||
{"tensor-parallel-size": 2, "gpu-memory-utilization": 0.94},
|
||||
[
|
||||
item["config_patch"]["flag_patch"]
|
||||
for item in candidates
|
||||
if item["knob_family"] == "gpu-memory-utilization"
|
||||
],
|
||||
)
|
||||
|
||||
def test_harness_validates_unmeasured_tp_frontier_before_runtime_refinement(self) -> None:
|
||||
|
||||
Reference in New Issue
Block a user