Add Stop-B authority: deterministic validator overrides LLM stop

Phase 4 of the two-stop work. The harness already pre-empts the LLM with
deterministic stops and guided probes, but an LLM-originated should_stop could
still end the loop while the validator saw remaining opportunity.

Add harness._stop_authority, exposed as context["stop_authority"], whose
`authorized` mirrors the deterministic harness stop decision and whose
`opportunity_remains` flags an open topology frontier or a high-value planned
candidate. In study tune, an LLM-originated should_stop is now honored only when
the validator authorizes it; an unauthorized stop is vetoed, up to a bounded
budget (currently one veto per run), so the loop cannot converge prematurely on
the agent's say-so alone. Once that budget is spent, a repeated LLM stop is
honored and recorded as "llm_after_veto_budget". File- and harness-originated
stops are unaffected, and the stop reason chain is recorded.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 14:45:14 +08:00
parent 51a9e4a007
commit a8f903498d
3 changed files with 150 additions and 3 deletions

View File

@@ -226,6 +226,8 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
if proposal_files and max_trials > len(proposal_files):
max_trials = len(proposal_files)
executed: list[dict[str, object]] = []
stop_vetoes = 0
max_llm_stop_vetoes = 1
for idx in range(max_trials):
state = store.load_state(study.study_id)
if state.tuning_stop_reason:
@@ -334,7 +336,34 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
proposal = parse_proposal_text(proposal_text, study)
store.write_proposal(study.study_id, proposal_name, proposal)
if proposal.should_stop:
if proposal_name.startswith("harness-stop-"):
is_harness_stop = proposal_name.startswith("harness-stop-")
is_llm_stop = not is_harness_stop and proposal_source is None
stop_authority = (
harness_context.get("stop_authority")
if isinstance(harness_context, dict)
else None
)
authorized = stop_authority is None or bool(stop_authority.get("authorized"))
# Stop-B authority: the deterministic validator overrides an
# LLM-originated stop. Veto an unauthorized stop (bounded) so the
# loop does not converge prematurely on the agent's say-so alone.
if is_llm_stop and not authorized and stop_vetoes < max_llm_stop_vetoes:
stop_vetoes += 1
executed.append(
{
"trial_id": None,
"proposal_name": proposal_name,
"proposal_source": "llm",
"stop_vetoed": True,
"reason": "validator_did_not_authorize_stop",
"validator_reason": (
stop_authority.get("reason") if stop_authority else None
),
"diagnosis": proposal.diagnosis,
}
)
continue
if is_harness_stop:
proposal_source_label = "harness"
else:
proposal_source_label = str(proposal_source) if proposal_source else "llm"
@@ -344,6 +373,11 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
"proposal_name": proposal_name,
"proposal_source": proposal_source_label,
"stopped": True,
"stop_authorized_by": (
"validator"
if (is_harness_stop or authorized)
else "llm_after_veto_budget"
),
"diagnosis": proposal.diagnosis,
"state_best_trial_id": state.best_trial_id,
"state_best_request_rate": state.best_request_rate,

View File

@@ -48,6 +48,12 @@ def build_harness_context(
trial_profiles,
bottleneck_hypotheses,
)
harness_stop = _harness_stop_decision(
study,
state,
recent_diagnostics,
experiment_plan=experiment_plan,
)
return {
"paper_alignment": {
"goal": "Use workload-feature-to-knob harnesses to reduce wasted trials and avoid regressing after a good configuration is found.",
@@ -61,11 +67,13 @@ def build_harness_context(
"candidate_actions": experiment_plan["candidate_actions"],
"experiment_plan": experiment_plan,
"convergence_guard": _convergence_guard(state, recent_diagnostics),
"harness_stop": _harness_stop_decision(
"harness_stop": harness_stop,
"stop_authority": _stop_authority(
study,
state,
recent_diagnostics,
experiment_plan=experiment_plan,
experiment_plan,
harness_stop,
),
"harness_proposal": _harness_proposal_decision(
study,
@@ -808,6 +816,43 @@ def _harness_stop_decision(
}
def _stop_authority(
    study: StudySpec,
    state: StudyState,
    recent_diagnostics: list[dict[str, Any]],
    experiment_plan: dict[str, Any] | None,
    harness_stop: dict[str, Any],
) -> dict[str, Any]:
    """Build the Stop-B authority record from the deterministic stop decision.

    The returned mapping has four keys:

    * ``authorized`` -- truthiness of ``harness_stop["should_stop"]``; the
      validator's verdict is copied, never recomputed here.
    * ``reason`` -- ``harness_stop["reason"]`` passed through unchanged.
    * ``opportunity_remains`` -- ``True`` when the topology frontier status
      reports ``frontier_open`` or when the plan's ``next_action`` is a dict
      whose ``score`` is at least 0.35.
    * ``summary`` -- a fixed human-readable sentence chosen by ``authorized``.

    ``opportunity_remains`` is informational only: it does not feed into
    ``authorized``.
    """
    frontier_status = _topology_frontier_status(study, state, recent_diagnostics)

    # A missing or malformed plan simply means there is no planned candidate.
    planned_action = None
    if isinstance(experiment_plan, dict):
        planned_action = experiment_plan.get("next_action")

    candidate_worth_probing = False
    if isinstance(planned_action, dict):
        candidate_worth_probing = _as_float(planned_action.get("score")) >= 0.35

    frontier_is_open = bool(frontier_status.get("frontier_open"))
    stop_allowed = bool(harness_stop.get("should_stop"))

    if stop_allowed:
        verdict = (
            "Deterministic validator authorizes stop; no adjacent bottleneck probe remains."
        )
    else:
        verdict = "Validator does not authorize stop; LLM should_stop is advisory only."

    return {
        "authorized": stop_allowed,
        "reason": harness_stop.get("reason"),
        "opportunity_remains": frontier_is_open or candidate_worth_probing,
        "summary": verdict,
    }
def _harness_proposal_decision(
study: StudySpec,
window_summary: dict[str, Any],

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import json
import contextlib
import io
import math
import os
@@ -418,6 +419,23 @@ class CoreFlowTests(unittest.TestCase):
self.assertTrue(early)
self.assertTrue(any(c["family_similarity"]["C"] < 0.9 for c in early))
def test_stop_authority_mirrors_validator_and_blocks_fresh_stop(self) -> None:
    """A study with no trials must yield an unauthorized stop authority.

    Builds the harness context for a freshly written study whose state has
    an empty trial list, then checks that ``stop_authority["authorized"]``
    equals ``harness_stop["should_stop"]`` and is falsy.
    """
    with tempfile.TemporaryDirectory() as workdir:
        spec_path = _write_study_assets(Path(workdir))
        study = load_study_spec(spec_path)
        fresh_state = StudyState(study_id=study.study_id, trials=[])
        context = build_harness_context(
            study=study,
            window_summary={},
            state=fresh_state,
        )
        verdict = context["stop_authority"]
        validator_decision = context["harness_stop"]
        # The authority is the deterministic validator, so the two must agree...
        self.assertEqual(verdict["authorized"], validator_decision["should_stop"])
        # ...and with no completed trials it must not authorize a stop.
        self.assertFalse(verdict["authorized"])
def test_adaptive_replay_set_truncates_only_when_enabled(self) -> None:
from types import SimpleNamespace
@@ -3956,6 +3974,56 @@ class CoreFlowTests(unittest.TestCase):
state = store.load_state("study-1")
self.assertEqual(state.next_trial_index, 1)
def test_cli_tune_vetoes_unauthorized_llm_stop(self) -> None:
    """``study tune`` vetoes the first unauthorized LLM stop, then honors it.

    ``call_llm_for_proposal`` is patched to always return a ``should_stop``
    proposal and ``run_trial`` is patched so no trial can execute. With
    ``--max-trials 2`` the command is expected to exit 0, run no trial,
    report at least one ``stop_vetoed`` entry, and end on a ``stopped``
    entry labelled ``llm_after_veto_budget``.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root)

        # Point the study at a custom LLM endpoint so the LLM path is taken.
        spec = json.loads(study_path.read_text(encoding="utf-8"))
        spec["llm"]["endpoint"] = {
            "provider": "custom",
            "base_url": "http://localhost:9/v1",
            "model": "test-model",
            "api_key_env": "AITUNER_TEST_KEY",
        }
        study_path.write_text(json.dumps(spec), encoding="utf-8")
        store_root = root / "store"

        llm_stop_proposal = {
            "observation": "looks done",
            "diagnosis": "agent thinks it converged",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": ["stop"],
            "why_not_previous_failures": "n/a",
            "should_stop": True,
        }
        stop_payload = json.dumps(llm_stop_proposal)

        argv = [
            "study",
            "tune",
            "--spec",
            str(study_path),
            "--store-root",
            str(store_root),
            "--skip-baseline",
            "--max-trials",
            "2",
        ]

        captured = io.StringIO()
        with mock.patch("aituner.cli.run_trial") as run_trial_mock, mock.patch(
            "aituner.cli.call_llm_for_proposal", return_value=stop_payload
        ), contextlib.redirect_stdout(captured):
            exit_code = cli_main(argv)

        self.assertEqual(exit_code, 0)
        run_trial_mock.assert_not_called()

        executed = json.loads(captured.getvalue())["executed_trials"]
        # The first unauthorized LLM stop is vetoed; the second is honored
        # only after the veto budget is spent.
        vetoed = [entry for entry in executed if entry.get("stop_vetoed")]
        self.assertTrue(vetoed)
        honored = [entry for entry in executed if entry.get("stopped")]
        self.assertTrue(honored)
        self.assertEqual(honored[-1]["stop_authorized_by"], "llm_after_veto_budget")
def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)