diff --git a/src/aituner/harness.py b/src/aituner/harness.py index 2d7ac11..3c7a404 100644 --- a/src/aituner/harness.py +++ b/src/aituner/harness.py @@ -2104,12 +2104,6 @@ def _validation_exhausted_guard( if baseline_rate <= 0 or incumbent_rate <= 0: return default gain = incumbent_rate / baseline_rate - if gain < _STRONG_INCUMBENT_MIN_GAIN: - return { - **default, - "reason": "incumbent_gain_not_large_enough_for_validation_stop", - "incumbent_gain_vs_baseline": gain, - } best_index = next( ( @@ -2130,6 +2124,21 @@ def _validation_exhausted_guard( for item in recent_diagnostics[best_index + 1 :] if item.get("status") in {"completed", "failed"} ] + incumbent = next( + ( + item + for item in recent_diagnostics + if item.get("trial_id") == state.best_trial_id + ), + {}, + ) + gmu_ceiling_incumbent = _is_gpu_memory_utilization_ceiling_incumbent(incumbent) + if gain < _STRONG_INCUMBENT_MIN_GAIN and not gmu_ceiling_incumbent: + return { + **default, + "reason": "incumbent_gain_not_large_enough_for_validation_stop", + "incumbent_gain_vs_baseline": gain, + } if len(after_best) < _MIN_POST_INCUMBENT_VALIDATION_TRIALS: return { **default, @@ -2154,6 +2163,7 @@ def _validation_exhausted_guard( families: set[str] = set() for item in after_best: families.update(_validation_families(item)) + families.update(_validation_families(incumbent)) has_topology = "topology" in families has_runtime = bool(families & {"runtime", "max-num-seqs", "max-num-batched-tokens"}) enough_evidence = ( @@ -2202,6 +2212,17 @@ def _validation_families(item: dict[str, Any]) -> set[str]: return families +def _is_gpu_memory_utilization_ceiling_incumbent(item: dict[str, Any]) -> bool: + config_patch = item.get("config_patch") + if not isinstance(config_patch, dict): + return False + flag_patch = config_patch.get("flag_patch") + if not isinstance(flag_patch, dict): + return False + gmu = _parse_float_like(flag_patch.get("gpu-memory-utilization"), default=0.0) + return gmu >= _GMU_SAFE_CEILING - EPSILON + + 
def test_harness_stop_after_gmu_incumbent_and_non_improving_topology_validation(self) -> None:
    """Harness stops once topology probes fail to beat a GMU-tuned incumbent.

    The incumbent (``trial-0007``) carries ``gpu-memory-utilization`` 0.97 on
    top of ``tensor-parallel-size`` 2. The two later trials only vary the
    tensor/data parallel sizes and report lower per-GPU request rates. The
    expected outcome is a ``post_incumbent_validation_exhausted`` stop that
    also yields a stop proposal.
    """

    def completed(
        trial_id: str,
        rate: float,
        rate_per_gpu: float,
        flags: dict,
    ) -> TrialSummary:
        # Every trial in this scenario is "completed" with an empty env patch;
        # only the id, the two rates, and the flag patch vary.
        return TrialSummary(
            trial_id=trial_id,
            status="completed",
            best_request_rate=rate,
            best_request_rate_per_gpu=rate_per_gpu,
            config_patch={"env_patch": {}, "flag_patch": flags},
        )

    with tempfile.TemporaryDirectory() as workdir:
        overrides = {
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "gpu-memory-utilization",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [1, 2, 4, 8],
            },
        }
        spec_path = _write_study_assets(Path(workdir), engine_overrides=overrides)
        study = load_study_spec(spec_path)

        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0007",
            best_request_rate=6.8667,
            best_request_rate_per_gpu=3.4333,
            trials=[
                # Baseline with no flag changes.
                completed("trial-0001", 2.2, 2.2, {}),
                completed("trial-0002", 6.5167, 3.2583, {"tensor-parallel-size": 2}),
                completed("trial-0003", 8.3667, 2.0917, {"tensor-parallel-size": 4}),
                # Incumbent: same topology as trial-0002 plus the GMU bump.
                completed(
                    "trial-0007",
                    6.8667,
                    3.4333,
                    {
                        "tensor-parallel-size": 2,
                        "gpu-memory-utilization": 0.97,
                    },
                ),
                # Post-incumbent topology probes with lower per-GPU rates.
                completed(
                    "trial-0008",
                    4.1833,
                    1.0458,
                    {
                        "tensor-parallel-size": 4,
                        "data-parallel-size": 2,
                    },
                ),
                completed("trial-0009", 8.3667, 1.0458, {"tensor-parallel-size": 8}),
            ],
        )

        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 1500},
            state=state,
        )

        stop = context["harness_stop"]
        self.assertTrue(stop["should_stop"])
        self.assertEqual(stop["reason"], "post_incumbent_validation_exhausted")

        proposal = build_harness_stop_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertTrue(proposal.should_stop)