Gate GMU climb on measured improvement

2026-06-29 02:00:41 +08:00
parent ee101a7c24
commit 6b25d56c1f
3 changed files with 39 additions and 14 deletions
--- a/docs/harness-ablation/prefill-scheduler-normalized-design-20260629.md
+++ b/docs/harness-ablation/prefill-scheduler-normalized-design-20260629.md
@@ -104,9 +104,11 @@ harness 的 family、signature、scoring 和 validator 约束。
  - 当 scheduler dimension 还没有被 materialized config 覆盖时，加入
    `uncovered_scheduler_dimension_bonus`，让该 family 在 topology settled 后优先于
    `gpu-memory-utilization` 这类 resource micro-tuning。
-  - 当该 family 已生成有效候选时，旧的 standalone `raise_mbt`、
-    `enable_chunked_prefill`、`raise_mbt_and_max_num_seqs` 只作为 fallback，不作为同级
-    prefill runtime 候选抢排序。
+- 当该 family 已生成有效候选时，旧的 standalone `raise_mbt`、
+  `enable_chunked_prefill`、`raise_mbt_and_max_num_seqs` 只作为 fallback，不作为同级
+  prefill runtime 候选抢排序。
+- `gpu-memory-utilization` 仍保留小步 hill-climb，但继续爬升必须由同拓扑 trial 的
+  request_rate_per_gpu 改善支撑；仅仅 launch 成功、打平或低于 incumbent 都不再算成功。

 ## 为什么不是 rule-based hack

@@ -185,6 +187,8 @@ harness 的 family、signature、scoring 和 validator 约束。
  `lower_admission_pressure_with_chunked_prefill`。
 - 抽出 `_higher_tp_frontier_patch`，让 runtime gate 与
  `_topology_frontier_status` 使用同一套 higher-TP signature。
+- GMU hill-climb 改为 measurement-gated：同拓扑 GMU trial 没有提升
+  request_rate_per_gpu 时，阻断继续向更高 GMU 爬升，避免连续浪费 trials；anchor 没有正的 request_rate_per_gpu 时该 gate 不生效。

 ### 2026-06-29 远端 review feedback

@@ -290,5 +294,8 @@ trial-0003 已完成，best request_rate_per_gpu 约为 2.025，和 baseline 持
 falsification evidence：coverage priority 改变了探索顺序，具体 `chunked + MBT ~= p95`
 hypothesis 被验证后没有改进。系统随后进入 candidate-set-0004，开始测试
 `gpu-memory-utilization=0.9`。trial-0004 同样完成在约 2.025，没有超过 baseline；
-当前旧 run 已进入 trial-0005，继续测试 `gpu-memory-utilization=0.92`。后续需要观察
-GMU climb 是否会停下并转向 admission pressure、topology/DP 或其他 family。
+trial-0005 的 `gpu-memory-utilization=0.92` 仍然打平 baseline，旧 run 随后继续排
+`gpu-memory-utilization=0.94`。这暴露出旧实现的 GMU hill-climb 问题：它把 launch
+成功当成 climb 成功，而没有要求 request_rate_per_gpu 改善。最新本地实现已经修正为
+measurement-gated GMU climb；下一轮应使用新提交重新跑，验证 GMU tie 后是否转向
+admission pressure、topology/DP 或其他 family。
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -1590,6 +1590,7 @@ def _runtime_candidate_actions(
            study,
            anchor_flags,
            recent_diagnostics,
+            anchor_rate_per_gpu=_profile_request_rate_per_gpu(anchor),
        )
        if target is not None:
            patch = {**runtime_base_patch, "gpu-memory-utilization": target}
@@ -1629,6 +1630,8 @@ def _next_gpu_memory_utilization_target(
    study: StudySpec,
    anchor_flags: dict[str, Any],
    recent_diagnostics: list[dict[str, Any]],
+    *,
+    anchor_rate_per_gpu: float = 0.0,
 ) -> float | None:
    current_gmu = _parse_float_like(
        anchor_flags.get("gpu-memory-utilization"), default=0.9
@@ -1651,8 +1654,14 @@ def _next_gpu_memory_utilization_target(
        gmu = _parse_float_like(flag_patch.get("gpu-memory-utilization"), default=0.0)
        if gmu <= 0:
            continue
+        if abs(gmu - current_gmu) <= EPSILON:
+            continue
        if item.get("status") == "completed":
-            successful_gmus.append(gmu)
+            rate = _as_float(item.get("best_request_rate_per_gpu"))
+            if anchor_rate_per_gpu > 0 and rate <= anchor_rate_per_gpu + EPSILON:
+                failed_gmus.append(gmu)
+            else:
+                successful_gmus.append(gmu)
        elif item.get("status") == "failed":
            failed_gmus.append(gmu)
    climb_from = max(successful_gmus)
@@ -1668,6 +1677,13 @@ def _next_gpu_memory_utilization_target(
    return target


+def _profile_request_rate_per_gpu(profile: dict[str, Any]) -> float:
+    performance = profile.get("performance")
+    if isinstance(performance, dict):
+        return _as_float(performance.get("best_request_rate_per_gpu"))
+    return _as_float(profile.get("best_request_rate_per_gpu"))
+
+
 def _prefill_scheduler_candidate_actions(
    study: StudySpec,
    window_summary: dict[str, Any],
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -2594,10 +2594,9 @@ class CoreFlowTests(unittest.TestCase):
            )
            self.assertNotIn("tensor-parallel-size", proposal.config_patch.flag_patch)

-    def test_harness_continues_gpu_mem_util_after_tied_same_topology_probe(self) -> None:
-        """After adjacent topology validation, gpu-memory-utilization should hill-climb
-        on the incumbent topology even if an earlier gmu step tied the incumbent and
-        did not become state.best_trial_id."""
+    def test_harness_stops_gpu_mem_util_climb_after_tied_same_topology_probe(self) -> None:
+        """A same-topology gpu-memory-utilization probe must improve per-GPU rate before
+        the hill-climb continues; launch success alone is not evidence to keep climbing."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            study_path = _write_study_assets(
@@ -2711,11 +2710,14 @@ class CoreFlowTests(unittest.TestCase):
                window_summary={"prompt_tokens_p95": 1500},
                state=state,
            )
-            proposal = build_harness_guided_proposal(context)
-            self.assertIsNotNone(proposal)
-            self.assertEqual(
-                proposal.config_patch.flag_patch,
+            candidates = context["experiment_plan"]["candidate_actions"]
+            self.assertNotIn(
                {"tensor-parallel-size": 2, "gpu-memory-utilization": 0.94},
+                [
+                    item["config_patch"]["flag_patch"]
+                    for item in candidates
+                    if item["knob_family"] == "gpu-memory-utilization"
+                ],
            )

    def test_harness_validates_unmeasured_tp_frontier_before_runtime_refinement(self) -> None: