diff --git a/src/aituner/harness.py b/src/aituner/harness.py
index 22c43a6..d6096b3 100644
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -41,7 +41,7 @@ def build_harness_context(
         "workload_lca_profile": _workload_lca_profile(window_summary),
         "recent_trial_diagnostics": recent_diagnostics,
         "convergence_guard": _convergence_guard(state, recent_diagnostics),
-        "harness_stop": _harness_stop_decision(state, recent_diagnostics),
+        "harness_stop": _harness_stop_decision(study, state, recent_diagnostics),
         "knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
         "proposal_rules": _proposal_rules(),
     }
@@ -471,9 +471,17 @@ def _convergence_guard(
 
 
 def _harness_stop_decision(
+    study: StudySpec,
     state: StudyState,
     recent_diagnostics: list[dict[str, Any]],
 ) -> dict[str, Any]:
+    high_saturation = _search_high_saturation_guard(study, state, recent_diagnostics)
+    if high_saturation["saturated"]:
+        return {
+            "should_stop": True,
+            "reason": high_saturation["reason"],
+            "evidence": high_saturation,
+        }
     guard = _convergence_guard(state, recent_diagnostics)
     if guard["deterministic_stop"]:
         return {
@@ -496,12 +504,122 @@ def _harness_stop_decision(
         "reason": "continue_harness_guided_search",
         "evidence": {
             "summary": "No deterministic harness stop condition is satisfied.",
+            "search_high_saturation": high_saturation,
             "convergence_guard": guard,
             "validation_exhausted": validation,
         },
     }
 
 
+def _search_high_saturation_guard(
+    study: StudySpec,
+    state: StudyState,
+    recent_diagnostics: list[dict[str, Any]],
+) -> dict[str, Any]:
+    """Report whether the incumbent trial has saturated ``study.search.high``.
+
+    The incumbent is the ``recent_diagnostics`` entry whose ``trial_id`` equals
+    ``state.best_trial_id``. It counts as saturated only when its
+    ``probe_summary["last_probe"]`` is feasible, has an empty
+    ``failed_reason_counts``, and its threshold lies within the binary-search
+    resolution (never finer than ``search.tolerance``) of ``search.high``.
+
+    Returns a dict holding ``saturated``, ``reason``, ``summary`` and the
+    supporting numbers; each non-saturated outcome has its own ``reason``.
+    """
+    default = {
+        "saturated": False,
+        "reason": "search_high_not_saturated",
+        "summary": "The incumbent has not saturated the configured search high.",
+        "incumbent_trial_id": state.best_trial_id,
+        "search_high": study.search.high,
+        "last_threshold": None,
+        "threshold_gap_to_high": None,
+    }
+    if not state.best_trial_id:
+        return default
+    incumbent = next(
+        (
+            item
+            for item in recent_diagnostics
+            if item.get("trial_id") == state.best_trial_id
+        ),
+        None,
+    )
+    if not incumbent:
+        return {
+            **default,
+            "reason": "incumbent_not_in_recent_harness_history",
+        }
+    probe_summary = incumbent.get("probe_summary")
+    if not isinstance(probe_summary, dict):
+        return {
+            **default,
+            "reason": "incumbent_probe_summary_missing",
+        }
+    last_probe = probe_summary.get("last_probe")
+    if not isinstance(last_probe, dict):
+        return {
+            **default,
+            "reason": "incumbent_last_probe_missing",
+        }
+    last_threshold = _as_float(last_probe.get("threshold"))
+    if last_threshold is None:
+        # NOTE(review): assumes _as_float yields None for a missing/unparseable value - confirm.
+        return {
+            **default,
+            "reason": "incumbent_last_threshold_missing",
+        }
+    threshold_gap = float(study.search.high) - last_threshold
+    # Width of the [low, high] range after max_probes halvings (at least one),
+    # floored at the configured tolerance.
+    binary_probe_resolution = max(
+        float(study.search.tolerance),
+        (float(study.search.high) - float(study.search.low)) / float(2 ** max(study.search.max_probes, 1)),
+    )
+    latency_summary = last_probe.get("latency_summary")
+    failed = latency_summary.get("failed_reason_counts") if isinstance(latency_summary, dict) else {}
+    if not isinstance(failed, dict):
+        failed = {}
+    if not last_probe.get("feasible"):
+        return {
+            **default,
+            "reason": "incumbent_last_probe_not_feasible",
+            "last_threshold": last_threshold,
+            "threshold_gap_to_high": threshold_gap,
+        }
+    # The 1e-12 slack is presumably there to absorb floating-point rounding in the gap.
+    if threshold_gap > binary_probe_resolution + 1e-12:
+        return {
+            **default,
+            "reason": "incumbent_not_close_to_search_high",
+            "last_threshold": last_threshold,
+            "threshold_gap_to_high": threshold_gap,
+            "binary_probe_resolution": binary_probe_resolution,
+        }
+    if failed:
+        return {
+            **default,
+            "reason": "incumbent_high_probe_has_slo_failures",
+            "last_threshold": last_threshold,
+            "threshold_gap_to_high": threshold_gap,
+            "failed_reason_counts": failed,
+        }
+    return {
+        "saturated": True,
+        "reason": "search_high_saturated_by_incumbent",
+        "summary": (
+            "The incumbent's highest measured probe is feasible, has no SLO failures, "
+            "and is within the configured binary-search resolution of search.high."
+        ),
+        "incumbent_trial_id": state.best_trial_id,
+        "search_high": study.search.high,
+        "last_threshold": last_threshold,
+        "threshold_gap_to_high": threshold_gap,
+        "binary_probe_resolution": binary_probe_resolution,
+    }
+
+
 def _validation_exhausted_guard(
     state: StudyState,
     recent_diagnostics: list[dict[str, Any]],
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 580978d..7bfa1ec 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -538,6 +538,64 @@ class CoreFlowTests(unittest.TestCase):
             self.assertFalse(context["harness_stop"]["should_stop"])
             self.assertIsNone(build_harness_stop_proposal(context))
 
+    def test_harness_stop_when_incumbent_saturates_search_high(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            result_path = tmp_path / "trial-0001.json"
+            result_path.write_text(
+                json.dumps(
+                    {
+                        "status": "completed",
+                        "best_sampling_u": 0.99609375,
+                        "best_request_rate": 9.0,
+                        "best_pass_rate": 1.0,
+                        "probes": [
+                            {
+                                "threshold": 0.99609375,
+                                "feasible": True,
+                                "payload": {
+                                    "request_count": 10,
+                                    "pass_rate": 1.0,
+                                    "request_rate": 9.0,
+                                    "early_stopped": False,
+                                    "early_stop_reason": "",
+                                    "latency_summary": {"failed_reason_counts": {}},
+                                },
+                            }
+                        ],
+                    }
+                ),
+                encoding="utf-8",
+            )
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0001",
+                best_request_rate=9.0,
+                best_request_rate_per_gpu=9.0,
+                trials=[
+                    TrialSummary(
+                        trial_id="trial-0001",
+                        status="completed",
+                        best_request_rate=9.0,
+                        best_request_rate_per_gpu=9.0,
+                        result_path=str(result_path),
+                        config_patch={"env_patch": {}, "flag_patch": {}},
+                    )
+                ],
+            )
+            context = build_harness_context(
+                study=study,
+                window_summary={"prompt_tokens_p95": 2048},
+                state=state,
+            )
+            self.assertTrue(context["harness_stop"]["should_stop"])
+            self.assertEqual(context["harness_stop"]["reason"], "search_high_saturated_by_incumbent")
+            proposal = build_harness_stop_proposal(context)
+            self.assertIsNotNone(proposal)
+            self.assertTrue(proposal.should_stop)
+
     def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)