Harness stop uses full state baseline

This commit is contained in:
2026-06-20 22:48:27 +08:00
parent a9d237bbfd
commit 426151bc9f
2 changed files with 139 additions and 18 deletions

View File

@@ -607,6 +607,15 @@ def stateful_history_limit() -> int:
return 8 return 8
def _state_completed_trials_with_rates(state: StudyState) -> list[TrialSummary]:
    """Collect the completed trials of ``state`` that carry a numeric per-GPU rate.

    Walks ``state.trials`` in its existing order and keeps a trial only when
    its ``status`` equals ``"completed"`` and its
    ``best_request_rate_per_gpu`` is an ``int`` or ``float``.

    Returns:
        A new list with the matching trials, in the same relative order as
        they appear in ``state.trials``. Empty when nothing matches.
    """
    rated = []
    for candidate in state.trials:
        # Status is checked first so the rate is only inspected for
        # trials that actually finished.
        if candidate.status != "completed":
            continue
        if not isinstance(candidate.best_request_rate_per_gpu, (int, float)):
            continue
        rated.append(candidate)
    return rated
def _load_result(trial: TrialSummary) -> dict[str, Any] | None: def _load_result(trial: TrialSummary) -> dict[str, Any] | None:
if not trial.result_path: if not trial.result_path:
return None return None
@@ -1960,15 +1969,19 @@ def _validation_exhausted_guard(
} }
if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)): if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
return default return default
completed = [ state_completed = _state_completed_trials_with_rates(state)
item if state_completed:
for item in recent_diagnostics baseline_rate = float(state_completed[0].best_request_rate_per_gpu)
if item.get("status") == "completed" else:
and isinstance(item.get("best_request_rate_per_gpu"), (int, float)) completed = [
] item
if not completed: for item in recent_diagnostics
return default if item.get("status") == "completed"
baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu")) and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
]
if not completed:
return default
baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
incumbent_rate = _as_float(state.best_request_rate_per_gpu) incumbent_rate = _as_float(state.best_request_rate_per_gpu)
if baseline_rate <= 0 or incumbent_rate <= 0: if baseline_rate <= 0 or incumbent_rate <= 0:
return default return default
@@ -2084,16 +2097,11 @@ def _strong_incumbent_guard(
} }
if state.best_trial_id is None or state.best_request_rate_per_gpu is None: if state.best_trial_id is None or state.best_request_rate_per_gpu is None:
return default return default
completed = [ completed = _state_completed_trials_with_rates(state)
item
for item in recent_diagnostics
if item.get("status") == "completed"
and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
]
if len(completed) < 2: if len(completed) < 2:
return default return default
baseline = completed[0] baseline = completed[0]
baseline_rate = float(baseline["best_request_rate_per_gpu"]) baseline_rate = float(baseline.best_request_rate_per_gpu)
incumbent_rate = float(state.best_request_rate_per_gpu) incumbent_rate = float(state.best_request_rate_per_gpu)
if baseline_rate <= 0: if baseline_rate <= 0:
return default return default
@@ -2103,7 +2111,7 @@ def _strong_incumbent_guard(
return { return {
"guard_active": True, "guard_active": True,
"reason": "incumbent_exceeds_baseline_by_1_8x_and_latest_trial_is_best_enter_validation_phase", "reason": "incumbent_exceeds_baseline_by_1_8x_and_latest_trial_is_best_enter_validation_phase",
"baseline_trial_id": baseline.get("trial_id"), "baseline_trial_id": baseline.trial_id,
"baseline_request_rate_per_gpu": baseline_rate, "baseline_request_rate_per_gpu": baseline_rate,
"incumbent_gain_vs_baseline": gain, "incumbent_gain_vs_baseline": gain,
"recommended_next_action": ( "recommended_next_action": (
@@ -2113,7 +2121,7 @@ def _strong_incumbent_guard(
} }
return { return {
**default, **default,
"baseline_trial_id": baseline.get("trial_id"), "baseline_trial_id": baseline.trial_id,
"baseline_request_rate_per_gpu": baseline_rate, "baseline_request_rate_per_gpu": baseline_rate,
"incumbent_gain_vs_baseline": gain, "incumbent_gain_vs_baseline": gain,
"reason": "need_more_evidence_before_strong_incumbent_stop", "reason": "need_more_evidence_before_strong_incumbent_stop",

View File

@@ -1068,6 +1068,119 @@ class CoreFlowTests(unittest.TestCase):
self.assertTrue(context["harness_stop"]["should_stop"]) self.assertTrue(context["harness_stop"]["should_stop"])
self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted") self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
def test_harness_validation_uses_full_state_baseline_when_history_window_moves(self) -> None:
    """Check the harness stop decision for a study holding twelve trials.

    The state lists six completed trials with recorded rates (per-GPU rate
    0.1 for trial-0001 up to 0.3 for the incumbent trial-0006), followed by
    six trials without any recorded rate, one of which failed.

    The test expects the harness to stop with reason
    ``post_incumbent_validation_exhausted`` and to report an incumbent gain
    above 2.9, which matches 0.3 measured against trial-0001's 0.1.

    NOTE(review): presumably twelve trials are used so that trial-0001 no
    longer fits in the recent-history window; confirm against
    ``stateful_history_limit``.
    """

    def config(max_num_seqs=None):
        # Build fresh dicts on every call so no two trials share a patch.
        flag_patch = {} if max_num_seqs is None else {"max-num-seqs": max_num_seqs}
        return {"env_patch": {}, "flag_patch": flag_patch}

    # (trial id, total request rate, per-GPU request rate, max-num-seqs flag)
    measured = (
        ("trial-0001", 0.8, 0.1, None),
        ("trial-0002", 0.88, 0.11, 16),
        ("trial-0003", 0.96, 0.12, 24),
        ("trial-0004", 1.04, 0.13, 32),
        ("trial-0005", 2.24, 0.28, 40),
        ("trial-0006", 2.4, 0.3, 48),
    )
    # (trial id, status, max-num-seqs flag); these trials record no rates.
    unmeasured = (
        ("trial-0007", "completed", 56),
        ("trial-0008", "completed", 64),
        ("trial-0009", "completed", 72),
        ("trial-0010", "completed", 80),
        ("trial-0011", "failed", 88),
        ("trial-0012", "completed", 96),
    )

    with tempfile.TemporaryDirectory() as tmp:
        study_path = _write_study_assets(
            Path(tmp),
            engine_overrides={"tunable_flags": ["max-num-seqs"]},
        )
        study = load_study_spec(study_path)

        trials = [
            TrialSummary(
                trial_id=trial_id,
                status="completed",
                parallel_size=8,
                best_request_rate=rate,
                best_request_rate_per_gpu=rate_per_gpu,
                config_patch=config(max_num_seqs),
            )
            for trial_id, rate, rate_per_gpu, max_num_seqs in measured
        ]
        trials.extend(
            TrialSummary(
                trial_id=trial_id,
                status=status,
                parallel_size=8,
                config_patch=config(max_num_seqs),
            )
            for trial_id, status, max_num_seqs in unmeasured
        )

        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0006",
            best_parallel_size=8,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=trials,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )

        stop = context["harness_stop"]
        self.assertTrue(stop["should_stop"])
        self.assertEqual(stop["reason"], "post_incumbent_validation_exhausted")
        self.assertGreater(stop["evidence"]["incumbent_gain_vs_baseline"], 2.9)
def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None: def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
with tempfile.TemporaryDirectory() as tmp: with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp) tmp_path = Path(tmp)