From 5c2958e6c1577c42e6b4eaa9fc83463fa7839542 Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Wed, 13 May 2026 01:25:31 +0800
Subject: [PATCH] Constrain harness topology by visible GPUs

---
 ...-driven-harness-implementation-20260512.md | 26 +++++
 src/aituner/harness.py                        | 30 ++++--
 tests/test_core_flow.py                       | 94 +++++++++++++++++++
 3 files changed, 143 insertions(+), 7 deletions(-)

diff --git a/docs/harness-ablation/profile-driven-harness-implementation-20260512.md b/docs/harness-ablation/profile-driven-harness-implementation-20260512.md
index 661ab9b..3dbc416 100644
--- a/docs/harness-ablation/profile-driven-harness-implementation-20260512.md
+++ b/docs/harness-ablation/profile-driven-harness-implementation-20260512.md
@@ -104,3 +104,29 @@ Started on `dash0` (`11.73.2.172`) at commit `e3ed775`.
 - monitor: read-only subagent `Wegener`
 
 Acceptance for this run is based on end-to-end trial results, not unit tests. If the first four trials lag the min-prompt no-harness baseline (`0.0650`, `0.1992`, `0.2696`, then failed/NA), the run should be treated as a failed harness iteration and the harness should be optimized again.
+
+## V2 Result And Failure
+
+V2 was stopped early after four trials because it did not improve on the no-harness baseline and made a preventable launch-risk proposal.
+
+Raw `request_rate/GPU`:
+
+| Variant | iter1 | iter2 | iter3 | iter4 |
+| --- | ---: | ---: | ---: | ---: |
+| no-harness min-prompt | 0.0650 | 0.1992 | 0.2696 | 0.2696 |
+| harness v2 | 0.0650 | 0.1992 | 0.2696 | failed |
+
+Harness v2 did correctly diagnose the first bottleneck and proposed:
+
+- iter2: `tensor-parallel-size=2`, raw `0.1992 req/s/GPU`;
+- iter3: `tensor-parallel-size=4`, raw `0.2696 req/s/GPU`.
+
+However, iter4 proposed `tensor-parallel-size=8` and failed at engine launch. The study's `hardware.gpu_count` is 8, but the launch environment sets `CUDA_VISIBLE_DEVICES=0,1,2,4,5,6,7`, which exposes only 7 GPUs. Therefore TP=8 should never have been considered launch-safe.
+
+This is a general harness bug: topology planning must use the effective visible GPU count from the execution profile, not just the nominal hardware count.
+
+Fix (see the sketch below):
+
+- parse `engine.base_envs.CUDA_VISIBLE_DEVICES`;
+- compute the effective GPU count as `min(hardware.gpu_count, visible_device_count)`;
+- filter topology candidates and adjacent TP frontier candidates by the effective GPU count.
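As a sanity check on the fix described in the docs above, here is a minimal standalone sketch of the effective-GPU-count rule and the candidate filtering it implies. The flat `effective_gpu_count(base_envs, hardware_gpu_count)` signature is illustrative only; the actual patch adds `_effective_gpu_count(study)` to `src/aituner/harness.py` below.

```python
# Minimal sketch of the documented rule; the function name and flat signature
# are illustrative, not the patch's API (the patch adds _effective_gpu_count(study)).
def effective_gpu_count(base_envs: dict[str, str], hardware_gpu_count: int) -> int:
    visible = str(base_envs.get("CUDA_VISIBLE_DEVICES") or "").strip()
    if not visible:
        # Unset or empty: fall back to the nominal hardware count.
        return hardware_gpu_count
    devices = [item.strip() for item in visible.split(",") if item.strip()]
    if not devices:
        return hardware_gpu_count
    return min(hardware_gpu_count, len(devices))


# The V2 failure scenario: 8 nominal GPUs but only 7 visible devices,
# so TP=8 must be filtered out of the candidate list.
envs = {"CUDA_VISIBLE_DEVICES": "0,1,2,4,5,6,7"}
limit = effective_gpu_count(envs, 8)                        # -> 7
tp_candidates = [tp for tp in [1, 2, 4, 8] if tp <= limit]  # -> [1, 2, 4]
assert tp_candidates == [1, 2, 4]
```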
diff --git a/src/aituner/harness.py b/src/aituner/harness.py
index 3107c58..ffdf35b 100644
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -1210,17 +1210,18 @@ def _legal_topology_points(
 ) -> list[dict[str, Any]]:
     constraints = study.engine.topology_constraints
     tunable = set(study.engine.tunable_flags)
+    effective_gpu_count = _effective_gpu_count(study)
     if constraints is not None and constraints.allowed_tensor_parallel_sizes:
         tp_values = sorted(set(constraints.allowed_tensor_parallel_sizes))
     elif "tensor-parallel-size" in tunable:
-        tp_values = [value for value in [1, 2, 4, 8] if value <= study.hardware.gpu_count]
+        tp_values = [value for value in [1, 2, 4, 8] if value <= effective_gpu_count]
     else:
         tp_values = [current_tp]
 
     if constraints is not None and constraints.allowed_data_parallel_sizes:
         dp_values = sorted(set(constraints.allowed_data_parallel_sizes))
     elif "data-parallel-size" in tunable:
-        dp_values = [value for value in [1, 2, 4, 8] if value <= study.hardware.gpu_count]
+        dp_values = [value for value in [1, 2, 4, 8] if value <= effective_gpu_count]
     else:
         dp_values = [current_dp]
 
@@ -1243,14 +1244,16 @@ def _legal_topology_points(
                     continue
                 if (
                     constraints.require_tp_dp_product_equals_gpu_count
-                    and tp_dp_product != study.hardware.gpu_count
+                    and tp_dp_product != effective_gpu_count
                 ):
                     continue
-            elif tp_dp_product > study.hardware.gpu_count:
+            elif tp_dp_product > effective_gpu_count:
                 continue
             if constraints is not None and not constraints.allowed_tp_dp_products:
-                if tp_dp_product > study.hardware.gpu_count:
+                if tp_dp_product > effective_gpu_count:
                     continue
+            if tp_dp_product > effective_gpu_count:
+                continue
             for ep in ep_values:
                 enable_ep = current_enable_ep or ep > 1
                 if constraints is not None:
@@ -1330,6 +1333,18 @@ def _normalized_topology_flags(flags: dict[str, Any]) -> dict[str, Any]:
     }
 
 
+def _effective_gpu_count(study: StudySpec) -> int:
+    visible = str(study.engine.base_envs.get("CUDA_VISIBLE_DEVICES") or "").strip()
+    if not visible:
+        return study.hardware.gpu_count
+    if visible.lower() in {"none", "void", "-1"}:
+        return 0
+    devices = [item.strip() for item in visible.split(",") if item.strip()]
+    if not devices:
+        return study.hardware.gpu_count
+    return min(study.hardware.gpu_count, len(devices))
+
+
 def _score_topology_candidate(
     top_bottleneck: str,
     bottleneck_hypotheses: list[dict[str, Any]],
@@ -1621,6 +1636,7 @@ def _round_up_to_multiple(value: int, multiple: int) -> int:
 
 def _next_allowed_tp(study: StudySpec, *, current_tp: int, current_dp: int) -> int | None:
     constraints = study.engine.topology_constraints
+    effective_gpu_count = _effective_gpu_count(study)
     if constraints is not None and constraints.allowed_tensor_parallel_sizes:
         candidates = sorted({int(item) for item in constraints.allowed_tensor_parallel_sizes})
     else:
@@ -1629,7 +1645,7 @@ def _next_allowed_tp(study: StudySpec, *, current_tp: int, current_dp: int) -> i
         if candidate <= current_tp:
             continue
         tp_dp_product = candidate * current_dp
-        if tp_dp_product > study.hardware.gpu_count:
+        if tp_dp_product > effective_gpu_count:
             continue
         if constraints is not None:
             if (
@@ -1639,7 +1655,7 @@ def _next_allowed_tp(study: StudySpec, *, current_tp: int, current_dp: int) -> i
                 continue
             if (
                 constraints.require_tp_dp_product_equals_gpu_count
-                and tp_dp_product != study.hardware.gpu_count
+                and tp_dp_product != effective_gpu_count
             ):
                 continue
         return candidate
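One detail of `_effective_gpu_count` worth calling out: `CUDA_VISIBLE_DEVICES` accepts sentinel values conventionally used to hide every device, and the patch maps those to zero usable GPUs rather than falling back to the nominal count. A few illustrative expectations, using a hypothetical `parse_visible` stand-in that isolates just the parsing branch of the diff above:

```python
# Hypothetical stand-in isolating the CUDA_VISIBLE_DEVICES parsing from the
# patch's _effective_gpu_count; the behavior mirrors the diff above.
def parse_visible(value: str | None, nominal: int) -> int:
    visible = str(value or "").strip()
    if not visible:
        return nominal                          # unset/empty: nominal count applies
    if visible.lower() in {"none", "void", "-1"}:
        return 0                                # sentinels that hide every device
    devices = [item.strip() for item in visible.split(",") if item.strip()]
    return min(nominal, len(devices)) if devices else nominal


assert parse_visible(None, 8) == 8              # env var absent
assert parse_visible("0,1,2,4,5,6,7", 8) == 7   # the failing V2 environment
assert parse_visible("-1", 8) == 0              # nothing visible, nothing launch-safe
assert parse_visible(" , ", 8) == 8             # only separators: treated as unset
```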
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 4cedaf9..3520da1 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1175,6 +1175,100 @@ class CoreFlowTests(unittest.TestCase):
         self.assertIsNotNone(proposal)
         self.assertEqual(proposal.config_patch.flag_patch, {"tensor-parallel-size": 2})
 
+    def test_harness_excludes_topology_above_visible_gpu_count(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                engine_overrides={
+                    "base_envs": {"CUDA_VISIBLE_DEVICES": "0,1,2,4,5,6,7"},
+                    "tunable_flags": ["tensor-parallel-size"],
+                    "topology_constraints": {
+                        "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
+                        "allowed_tp_dp_products": [1, 2, 4, 8],
+                    },
+                },
+            )
+            result_path = tmp_path / "trial-0003.json"
+            result_path.write_text(
+                json.dumps(
+                    {
+                        "status": "completed",
+                        "best_request_rate": 1.078,
+                        "best_pass_rate": 0.958,
+                        "probes": [
+                            {
+                                "threshold": 0.039,
+                                "feasible": False,
+                                "payload": {
+                                    "request_count": 100,
+                                    "pass_rate": 0.8,
+                                    "request_rate": 1.10,
+                                    "early_stop_reason": "slo_pass_rate_unrecoverable",
+                                    "latency_summary": {
+                                        "failed_reason_counts": {"tpot_ms>25.0": 20}
+                                    },
+                                },
+                            }
+                        ],
+                    }
+                ),
+                encoding="utf-8",
+            )
+            study = load_study_spec(study_path)
+            context = build_harness_context(
+                study=study,
+                window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
+                state=StudyState(
+                    study_id=study.study_id,
+                    best_trial_id="trial-0003",
+                    best_request_rate=1.078,
+                    best_request_rate_per_gpu=0.2695,
+                    trials=[
+                        TrialSummary(
+                            trial_id="trial-0001",
+                            status="completed",
+                            best_request_rate=0.065,
+                            best_request_rate_per_gpu=0.065,
+                            config_patch={"env_patch": {}, "flag_patch": {}},
+                        ),
+                        TrialSummary(
+                            trial_id="trial-0002",
+                            status="completed",
+                            best_request_rate=0.398,
+                            best_request_rate_per_gpu=0.199,
+                            config_patch={
+                                "env_patch": {},
+                                "flag_patch": {"tensor-parallel-size": 2},
+                            },
+                        ),
+                        TrialSummary(
+                            trial_id="trial-0003",
+                            status="completed",
+                            best_request_rate=1.078,
+                            best_request_rate_per_gpu=0.2695,
+                            result_path=str(result_path),
+                            config_patch={
+                                "env_patch": {},
+                                "flag_patch": {"tensor-parallel-size": 4},
+                            },
+                        ),
+                    ],
+                ),
+            )
+            candidates = context["candidate_actions"]
+            self.assertFalse(
+                any(
+                    action["config_patch"]["flag_patch"].get("tensor-parallel-size") == 8
+                    for action in candidates
+                )
+            )
+            proposal = build_harness_guided_proposal(context)
+            self.assertTrue(
+                proposal is None
+                or proposal.config_patch.flag_patch.get("tensor-parallel-size") != 8
+            )
+
     def test_harness_stop_blocked_until_slo_driven_topology_frontier_is_measured(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)