Stop after strong incumbent harness gains
This commit is contained in:
@@ -135,6 +135,7 @@ def _knob_harnesses(
|
|||||||
"guards": [
|
"guards": [
|
||||||
"Keep MBT changes within a conservative trust region.",
|
"Keep MBT changes within a conservative trust region.",
|
||||||
"Do not raise MBT after OOM or launch failures involving memory-related knobs.",
|
"Do not raise MBT after OOM or launch failures involving memory-related knobs.",
|
||||||
|
"Do not raise MBT when the incumbent MBT already covers prompt p99 unless same-topology history proves prefill fragmentation is the bottleneck.",
|
||||||
],
|
],
|
||||||
"active_now": active_bottleneck == "ttft_prefill",
|
"active_now": active_bottleneck == "ttft_prefill",
|
||||||
}
|
}
|
||||||
@@ -308,6 +309,7 @@ def _convergence_guard(
|
|||||||
recent_diagnostics: list[dict[str, Any]],
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
infeasible_progress = _infeasible_progress_guard(recent_diagnostics)
|
infeasible_progress = _infeasible_progress_guard(recent_diagnostics)
|
||||||
|
strong_incumbent = _strong_incumbent_guard(state, recent_diagnostics)
|
||||||
completed = [
|
completed = [
|
||||||
item
|
item
|
||||||
for item in recent_diagnostics
|
for item in recent_diagnostics
|
||||||
@@ -329,12 +331,21 @@ def _convergence_guard(
|
|||||||
reason = "need_more_evidence_before_stop"
|
reason = "need_more_evidence_before_stop"
|
||||||
if not should_stop and infeasible_progress["plateau_detected"]:
|
if not should_stop and infeasible_progress["plateau_detected"]:
|
||||||
reason = str(infeasible_progress["reason"])
|
reason = str(infeasible_progress["reason"])
|
||||||
|
if (
|
||||||
|
not should_stop
|
||||||
|
and not infeasible_progress["plateau_detected"]
|
||||||
|
and strong_incumbent["guard_active"]
|
||||||
|
):
|
||||||
|
reason = str(strong_incumbent["reason"])
|
||||||
return {
|
return {
|
||||||
"should_stop_if_no_harness_can_justify_a_new_adjacent_probe": (
|
"should_stop_if_no_harness_can_justify_a_new_adjacent_probe": (
|
||||||
should_stop or bool(infeasible_progress["stop_if_next_probe_repeats_family"])
|
should_stop
|
||||||
|
or bool(infeasible_progress["stop_if_next_probe_repeats_family"])
|
||||||
|
or bool(strong_incumbent["guard_active"])
|
||||||
),
|
),
|
||||||
"reason": reason,
|
"reason": reason,
|
||||||
"infeasible_progress": infeasible_progress,
|
"infeasible_progress": infeasible_progress,
|
||||||
|
"strong_incumbent": strong_incumbent,
|
||||||
"incumbent": {
|
"incumbent": {
|
||||||
"trial_id": state.best_trial_id,
|
"trial_id": state.best_trial_id,
|
||||||
"parallel_size": state.best_parallel_size,
|
"parallel_size": state.best_parallel_size,
|
||||||
@@ -345,6 +356,51 @@ def _convergence_guard(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _strong_incumbent_guard(
    state: StudyState,
    recent_diagnostics: list[dict[str, Any]],
) -> dict[str, Any]:
    """Report whether the incumbent already beats the baseline by a large margin.

    The baseline is the first entry of ``recent_diagnostics`` whose status is
    ``"completed"`` and whose ``best_request_rate_per_gpu`` is a usable number.
    The guard becomes active only when the incumbent's per-GPU request rate is
    at least three times the baseline rate *and* the most recent diagnostic
    belongs to the incumbent trial.

    Args:
        state: Study state; only ``best_trial_id`` and
            ``best_request_rate_per_gpu`` are read.
        recent_diagnostics: Per-trial diagnostic dicts; the last element is
            treated as the latest trial.

    Returns:
        A dict with the keys ``guard_active``, ``reason``,
        ``baseline_trial_id``, ``baseline_request_rate_per_gpu`` and
        ``incumbent_gain_vs_baseline``. The last three are ``None`` whenever a
        gain cannot be computed (no incumbent, fewer than two usable completed
        trials, a non-positive baseline, or a non-finite incumbent rate).
    """
    # Function-scope import keeps this block self-contained.
    import math

    # Must stay in sync with the "3x" wording of the active reason string.
    min_gain_to_stop = 3.0

    def _usable_rate(value: Any) -> bool:
        # bool is a subclass of int, so True/False must be rejected explicitly;
        # NaN/inf would otherwise leak into the reported gain.
        return (
            isinstance(value, (int, float))
            and not isinstance(value, bool)
            and math.isfinite(value)
        )

    default: dict[str, Any] = {
        "guard_active": False,
        "reason": "no_strong_incumbent_yet",
        "baseline_trial_id": None,
        "baseline_request_rate_per_gpu": None,
        "incumbent_gain_vs_baseline": None,
    }
    if state.best_trial_id is None or state.best_request_rate_per_gpu is None:
        return default

    completed = [
        item
        for item in recent_diagnostics
        if item.get("status") == "completed"
        and _usable_rate(item.get("best_request_rate_per_gpu"))
    ]
    if len(completed) < 2:
        return default

    baseline = completed[0]
    baseline_rate = float(baseline["best_request_rate_per_gpu"])
    incumbent_rate = float(state.best_request_rate_per_gpu)
    if baseline_rate <= 0 or not math.isfinite(incumbent_rate):
        return default

    gain = incumbent_rate / baseline_rate
    evidence = {
        "baseline_trial_id": baseline.get("trial_id"),
        "baseline_request_rate_per_gpu": baseline_rate,
        "incumbent_gain_vs_baseline": gain,
    }

    # At least two completed entries exist, so the list cannot be empty here.
    latest = recent_diagnostics[-1]
    if state.best_trial_id == latest.get("trial_id") and gain >= min_gain_to_stop:
        return {
            "guard_active": True,
            "reason": "incumbent_exceeds_baseline_by_3x_and_latest_trial_is_best",
            **evidence,
        }
    return {
        **default,
        **evidence,
        "reason": "need_more_evidence_before_strong_incumbent_stop",
    }
|
||||||
|
|
||||||
|
|
||||||
def _infeasible_progress_guard(recent_diagnostics: list[dict[str, Any]]) -> dict[str, Any]:
|
def _infeasible_progress_guard(recent_diagnostics: list[dict[str, Any]]) -> dict[str, Any]:
|
||||||
points = [
|
points = [
|
||||||
point
|
point
|
||||||
@@ -474,6 +530,7 @@ def _proposal_rules() -> list[str]:
|
|||||||
"First decide the active bottleneck from recent_trial_diagnostics.",
|
"First decide the active bottleneck from recent_trial_diagnostics.",
|
||||||
"Pick at most one primary knob family from knob_harnesses unless the history proves a coupled change is needed.",
|
"Pick at most one primary knob family from knob_harnesses unless the history proves a coupled change is needed.",
|
||||||
"Use adjacent legal values around the incumbent; avoid broad exploratory jumps.",
|
"Use adjacent legal values around the incumbent; avoid broad exploratory jumps.",
|
||||||
|
"When strong_incumbent.guard_active is true, do not propose runtime-only tweaks unless the relevant harness guard is positively satisfied by same-topology evidence.",
|
||||||
"If infeasible_progress blocks the last primary knob family, do not continue that family; switch families with direct bottleneck evidence or set should_stop=true.",
|
"If infeasible_progress blocks the last primary knob family, do not continue that family; switch families with direct bottleneck evidence or set should_stop=true.",
|
||||||
"If a proposed config is likely to reduce request_rate_per_gpu under the active guard, set should_stop=true instead of exploring.",
|
"If a proposed config is likely to reduce request_rate_per_gpu under the active guard, set should_stop=true instead of exploring.",
|
||||||
"Never repeat an already tested config signature.",
|
"Never repeat an already tested config signature.",
|
||||||
|
|||||||
@@ -365,6 +365,58 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_harness_strong_incumbent_guard_after_large_gain(self) -> None:
    """The strong-incumbent guard arms when the best trial is 6x the first one.

    Two completed trials are recorded: per-GPU rate 0.035, then 0.21 for the
    trial marked as best. The harness context is expected to expose an active
    ``strong_incumbent`` guard and a truthy convergence stop flag.
    """
    first_trial = TrialSummary(
        trial_id="trial-0001",
        status="completed",
        parallel_size=1,
        best_request_rate=0.035,
        best_request_rate_per_gpu=0.035,
        config_patch={"env_patch": {}, "flag_patch": {}},
    )
    best_trial = TrialSummary(
        trial_id="trial-0002",
        status="completed",
        parallel_size=2,
        best_request_rate=0.42,
        best_request_rate_per_gpu=0.21,
        config_patch={
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 2,
                "data-parallel-size": 1,
            },
        },
    )
    prompt_window = {
        "prompt_tokens_p95": 7628,
        "prompt_tokens_p99": 8102,
        "prompt_tail_ratio_p95_p50": 3.83,
    }
    stop_key = "should_stop_if_no_harness_can_justify_a_new_adjacent_probe"

    with tempfile.TemporaryDirectory() as tmp:
        study = load_study_spec(_write_study_assets(Path(tmp)))
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate_per_gpu=0.21,
            trials=[first_trial, best_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary=prompt_window,
            state=state,
        )

        convergence = context["convergence_guard"]
        guard = convergence["strong_incumbent"]
        self.assertTrue(guard["guard_active"])
        self.assertGreaterEqual(guard["incumbent_gain_vs_baseline"], 3.0)
        self.assertTrue(convergence[stop_key])
|
||||||
|
|
||||||
def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
|
def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
tmp_path = Path(tmp)
|
tmp_path = Path(tmp)
|
||||||
|
|||||||
Reference in New Issue
Block a user