Add Stop-B authority: deterministic validator overrides LLM stop

Phase 4 of the two-stop work. The harness already pre-empts the LLM with deterministic stops and guided probes, but an LLM-originated should_stop could still end the loop while the validator saw remaining opportunity. Add harness._stop_authority, exposed as context["stop_authority"], whose `authorized` mirrors the deterministic harness stop decision and whose `opportunity_remains` flags an open topology frontier or a high-value planned candidate. In study tune, an LLM-originated should_stop is now honored only when the validator authorizes it; an unauthorized stop is vetoed (within a bounded veto budget, after which the stop is honored) so the loop cannot converge prematurely on the agent's say-so. File- and harness-originated stops are unaffected, and the stop reason chain is recorded.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 14:45:14 +08:00
parent 51a9e4a007
commit a8f903498d
3 changed files with 150 additions and 3 deletions
--- a/src/aituner/cli.py
+++ b/src/aituner/cli.py
@@ -226,6 +226,8 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
    if proposal_files and max_trials > len(proposal_files):
        max_trials = len(proposal_files)
    executed: list[dict[str, object]] = []
+    stop_vetoes = 0
+    max_llm_stop_vetoes = 1
    for idx in range(max_trials):
        state = store.load_state(study.study_id)
        if state.tuning_stop_reason:
@@ -334,7 +336,34 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
        proposal = parse_proposal_text(proposal_text, study)
        store.write_proposal(study.study_id, proposal_name, proposal)
        if proposal.should_stop:
-            if proposal_name.startswith("harness-stop-"):
+            is_harness_stop = proposal_name.startswith("harness-stop-")
+            is_llm_stop = not is_harness_stop and proposal_source is None
+            stop_authority = (
+                harness_context.get("stop_authority")
+                if isinstance(harness_context, dict)
+                else None
+            )
+            authorized = stop_authority is None or bool(stop_authority.get("authorized"))
+            # Stop-B authority: the deterministic validator overrides an
+            # LLM-originated stop. Veto an unauthorized stop (bounded) so the
+            # loop does not converge prematurely on the agent's say-so alone.
+            if is_llm_stop and not authorized and stop_vetoes < max_llm_stop_vetoes:
+                stop_vetoes += 1
+                executed.append(
+                    {
+                        "trial_id": None,
+                        "proposal_name": proposal_name,
+                        "proposal_source": "llm",
+                        "stop_vetoed": True,
+                        "reason": "validator_did_not_authorize_stop",
+                        "validator_reason": (
+                            stop_authority.get("reason") if stop_authority else None
+                        ),
+                        "diagnosis": proposal.diagnosis,
+                    }
+                )
+                continue
+            if is_harness_stop:
                proposal_source_label = "harness"
            else:
                proposal_source_label = str(proposal_source) if proposal_source else "llm"
@@ -344,6 +373,11 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
                    "proposal_name": proposal_name,
                    "proposal_source": proposal_source_label,
                    "stopped": True,
+                    "stop_authorized_by": (
+                        "validator"
+                        if (is_harness_stop or authorized)
+                        else "llm_after_veto_budget"
+                    ),
                    "diagnosis": proposal.diagnosis,
                    "state_best_trial_id": state.best_trial_id,
                    "state_best_request_rate": state.best_request_rate,
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -48,6 +48,12 @@ def build_harness_context(
        trial_profiles,
        bottleneck_hypotheses,
    )
+    harness_stop = _harness_stop_decision(
+        study,
+        state,
+        recent_diagnostics,
+        experiment_plan=experiment_plan,
+    )
    return {
        "paper_alignment": {
            "goal": "Use workload-feature-to-knob harnesses to reduce wasted trials and avoid regressing after a good configuration is found.",
@@ -61,11 +67,13 @@ def build_harness_context(
        "candidate_actions": experiment_plan["candidate_actions"],
        "experiment_plan": experiment_plan,
        "convergence_guard": _convergence_guard(state, recent_diagnostics),
-        "harness_stop": _harness_stop_decision(
+        "harness_stop": harness_stop,
+        "stop_authority": _stop_authority(
            study,
            state,
            recent_diagnostics,
-            experiment_plan=experiment_plan,
+            experiment_plan,
+            harness_stop,
        ),
        "harness_proposal": _harness_proposal_decision(
            study,
@@ -808,6 +816,43 @@ def _harness_stop_decision(
    }


+def _stop_authority(
+    study: StudySpec,
+    state: StudyState,
+    recent_diagnostics: list[dict[str, Any]],
+    experiment_plan: dict[str, Any] | None,
+    harness_stop: dict[str, Any],
+) -> dict[str, Any]:
+    """Stop-B authority: the deterministic validator decides if stopping is justified.
+
+    ``authorized`` mirrors the deterministic harness stop decision. The LLM's
+    should_stop is only a corroborating signal: the tuning loop honors an
+    LLM-originated stop only when this validator authorizes it (or when the
+    harness is disabled). ``opportunity_remains`` flags that a concrete adjacent
+    probe (open topology frontier or a high-value planned candidate) still exists,
+    so an early stop would leave measured headroom on the table.
+    """
+    frontier = _topology_frontier_status(study, state, recent_diagnostics)
+    next_action = (
+        experiment_plan.get("next_action") if isinstance(experiment_plan, dict) else None
+    )
+    has_candidate = (
+        isinstance(next_action, dict) and _as_float(next_action.get("score")) >= 0.35
+    )
+    opportunity_remains = bool(frontier.get("frontier_open")) or has_candidate
+    authorized = bool(harness_stop.get("should_stop"))
+    return {
+        "authorized": authorized,
+        "reason": harness_stop.get("reason"),
+        "opportunity_remains": opportunity_remains,
+        "summary": (
+            "Deterministic validator authorizes stop; no adjacent bottleneck probe remains."
+            if authorized
+            else "Validator does not authorize stop; LLM should_stop is advisory only."
+        ),
+    }
+
+
 def _harness_proposal_decision(
    study: StudySpec,
    window_summary: dict[str, Any],
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import json
+import contextlib
 import io
 import math
 import os
@@ -418,6 +419,23 @@ class CoreFlowTests(unittest.TestCase):
        self.assertTrue(early)
        self.assertTrue(any(c["family_similarity"]["C"] < 0.9 for c in early))

+    def test_stop_authority_mirrors_validator_and_blocks_fresh_stop(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            study = load_study_spec(_write_study_assets(Path(tmp)))
+            state = StudyState(study_id=study.study_id, trials=[])
+            context = build_harness_context(
+                study=study,
+                window_summary={},
+                state=state,
+            )
+            authority = context["stop_authority"]
+            # The authority is the deterministic validator; with no completed
+            # trials it must not authorize a stop.
+            self.assertEqual(
+                authority["authorized"], context["harness_stop"]["should_stop"]
+            )
+            self.assertFalse(authority["authorized"])
+
    def test_adaptive_replay_set_truncates_only_when_enabled(self) -> None:
        from types import SimpleNamespace

@@ -3956,6 +3974,56 @@ class CoreFlowTests(unittest.TestCase):
            state = store.load_state("study-1")
            self.assertEqual(state.next_trial_index, 1)

+    def test_cli_tune_vetoes_unauthorized_llm_stop(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            spec = json.loads(study_path.read_text(encoding="utf-8"))
+            spec["llm"]["endpoint"] = {
+                "provider": "custom",
+                "base_url": "http://localhost:9/v1",
+                "model": "test-model",
+                "api_key_env": "AITUNER_TEST_KEY",
+            }
+            study_path.write_text(json.dumps(spec), encoding="utf-8")
+            store_root = tmp_path / "store"
+            stop_payload = json.dumps(
+                {
+                    "observation": "looks done",
+                    "diagnosis": "agent thinks it converged",
+                    "config_patch": {"env_patch": {}, "flag_patch": {}},
+                    "expected_effects": ["stop"],
+                    "why_not_previous_failures": "n/a",
+                    "should_stop": True,
+                }
+            )
+            buffer = io.StringIO()
+            with mock.patch("aituner.cli.run_trial") as run_trial_mock, mock.patch(
+                "aituner.cli.call_llm_for_proposal", return_value=stop_payload
+            ), contextlib.redirect_stdout(buffer):
+                exit_code = cli_main(
+                    [
+                        "study",
+                        "tune",
+                        "--spec",
+                        str(study_path),
+                        "--store-root",
+                        str(store_root),
+                        "--skip-baseline",
+                        "--max-trials",
+                        "2",
+                    ]
+                )
+            self.assertEqual(exit_code, 0)
+            run_trial_mock.assert_not_called()
+            executed = json.loads(buffer.getvalue())["executed_trials"]
+            # The first unauthorized LLM stop is vetoed; the second is honored
+            # only after the veto budget is spent.
+            self.assertTrue(any(item.get("stop_vetoed") for item in executed))
+            honored = [item for item in executed if item.get("stopped")]
+            self.assertTrue(honored)
+            self.assertEqual(honored[-1]["stop_authorized_by"], "llm_after_veto_budget")
+
    def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)