From 426151bc9f3f4fc5f248009c55baf4370ae2044d Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Sat, 20 Jun 2026 22:48:27 +0800
Subject: [PATCH] Harness stop uses full state baseline

---
Review notes (commentary placed after the "---" separator, outside the commit
message):

- Adds _state_completed_trials_with_rates(state), which selects the entries of
  state.trials whose status is "completed" and whose best_request_rate_per_gpu
  is an int or float.
- _validation_exhausted_guard now takes its baseline rate from the first such
  trial and falls back to the recent_diagnostics scan only when state has none.
- _strong_incumbent_guard now takes its baseline from state only; unlike
  _validation_exhausted_guard it keeps no recent_diagnostics fallback.
  NOTE(review): confirm that asymmetry is intended, and that
  recent_diagnostics is still used elsewhere in that function.
- NOTE(review): isinstance(x, (int, float)) also accepts bool values; confirm
  best_request_rate_per_gpu can never be a bool.
- Line breaks and indentation in this file were restored from a
  whitespace-collapsed copy. Leading whitespace of the Python lines is inferred
  from syntax and blank lines from the hunk line counts -- verify against the
  target tree before applying (or apply with whitespace tolerance).

 src/aituner/harness.py  |  44 +++++++++-------
 tests/test_core_flow.py | 113 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 139 insertions(+), 18 deletions(-)

diff --git a/src/aituner/harness.py b/src/aituner/harness.py
index d774f83..e6b365c 100644
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -607,6 +607,15 @@ def stateful_history_limit() -> int:
     return 8
 
 
+def _state_completed_trials_with_rates(state: StudyState) -> list[TrialSummary]:
+    return [
+        trial
+        for trial in state.trials
+        if trial.status == "completed"
+        and isinstance(trial.best_request_rate_per_gpu, (int, float))
+    ]
+
+
 def _load_result(trial: TrialSummary) -> dict[str, Any] | None:
     if not trial.result_path:
         return None
@@ -1960,15 +1969,19 @@ def _validation_exhausted_guard(
     }
     if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
         return default
-    completed = [
-        item
-        for item in recent_diagnostics
-        if item.get("status") == "completed"
-        and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
-    ]
-    if not completed:
-        return default
-    baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
+    state_completed = _state_completed_trials_with_rates(state)
+    if state_completed:
+        baseline_rate = float(state_completed[0].best_request_rate_per_gpu)
+    else:
+        completed = [
+            item
+            for item in recent_diagnostics
+            if item.get("status") == "completed"
+            and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
+        ]
+        if not completed:
+            return default
+        baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
     incumbent_rate = _as_float(state.best_request_rate_per_gpu)
     if baseline_rate <= 0 or incumbent_rate <= 0:
         return default
@@ -2084,16 +2097,11 @@ def _strong_incumbent_guard(
     }
     if state.best_trial_id is None or state.best_request_rate_per_gpu is None:
         return default
-    completed = [
-        item
-        for item in recent_diagnostics
-        if item.get("status") == "completed"
-        and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
-    ]
+    completed = _state_completed_trials_with_rates(state)
     if len(completed) < 2:
         return default
     baseline = completed[0]
-    baseline_rate = float(baseline["best_request_rate_per_gpu"])
+    baseline_rate = float(baseline.best_request_rate_per_gpu)
     incumbent_rate = float(state.best_request_rate_per_gpu)
     if baseline_rate <= 0:
         return default
@@ -2103,7 +2111,7 @@ def _strong_incumbent_guard(
         return {
             "guard_active": True,
             "reason": "incumbent_exceeds_baseline_by_1_8x_and_latest_trial_is_best_enter_validation_phase",
-            "baseline_trial_id": baseline.get("trial_id"),
+            "baseline_trial_id": baseline.trial_id,
             "baseline_request_rate_per_gpu": baseline_rate,
             "incumbent_gain_vs_baseline": gain,
             "recommended_next_action": (
@@ -2113,7 +2121,7 @@ def _strong_incumbent_guard(
         }
     return {
         **default,
-        "baseline_trial_id": baseline.get("trial_id"),
+        "baseline_trial_id": baseline.trial_id,
         "baseline_request_rate_per_gpu": baseline_rate,
         "incumbent_gain_vs_baseline": gain,
         "reason": "need_more_evidence_before_strong_incumbent_stop",
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 9b3395f..6c90bd8 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1068,6 +1068,119 @@ class CoreFlowTests(unittest.TestCase):
             self.assertTrue(context["harness_stop"]["should_stop"])
             self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
 
+    def test_harness_validation_uses_full_state_baseline_when_history_window_moves(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                engine_overrides={"tunable_flags": ["max-num-seqs"]},
+            )
+            study = load_study_spec(study_path)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0006",
+                best_parallel_size=8,
+                best_request_rate=2.4,
+                best_request_rate_per_gpu=0.3,
+                trials=[
+                    TrialSummary(
+                        trial_id="trial-0001",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=0.8,
+                        best_request_rate_per_gpu=0.1,
+                        config_patch={"env_patch": {}, "flag_patch": {}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0002",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=0.88,
+                        best_request_rate_per_gpu=0.11,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 16}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0003",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=0.96,
+                        best_request_rate_per_gpu=0.12,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 24}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0004",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=1.04,
+                        best_request_rate_per_gpu=0.13,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 32}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0005",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=2.24,
+                        best_request_rate_per_gpu=0.28,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 40}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0006",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=2.4,
+                        best_request_rate_per_gpu=0.3,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 48}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0007",
+                        status="completed",
+                        parallel_size=8,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 56}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0008",
+                        status="completed",
+                        parallel_size=8,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0009",
+                        status="completed",
+                        parallel_size=8,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 72}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0010",
+                        status="completed",
+                        parallel_size=8,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 80}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0011",
+                        status="failed",
+                        parallel_size=8,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 88}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0012",
+                        status="completed",
+                        parallel_size=8,
+                        config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 96}},
+                    ),
+                ],
+            )
+            context = build_harness_context(
+                study=study,
+                window_summary={"prompt_tokens_p95": 2048},
+                state=state,
+            )
+            self.assertTrue(context["harness_stop"]["should_stop"])
+            self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
+            self.assertGreater(
+                context["harness_stop"]["evidence"]["incumbent_gain_vs_baseline"],
+                2.9,
+            )
+
     def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)