Use full state for frontier projection

This commit is contained in:
2026-06-29 16:22:09 +08:00
parent 8dd9ada194
commit 9ef9550214
2 changed files with 89 additions and 6 deletions

View File

@@ -29,6 +29,7 @@ _RUNTIME_KEYS = {
_STRONG_INCUMBENT_MIN_GAIN = 1.8
_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
# Default size of the most-recent-trials window used when building trial
# diagnostics; also the value returned by stateful_history_limit().
_STATEFUL_HISTORY_LIMIT = 8
# Decode-bound throughput is frequently KV-cache limited, so more gpu-memory-utilization
# yields more KV blocks and more concurrent decode. Hill-climb in small steps toward a
# safe ceiling and let measurement find the real peak: a too-high target regresses or
@@ -428,9 +429,14 @@ def _knob_harnesses(
return harnesses
def _recent_trial_diagnostics(state: StudyState) -> list[dict[str, Any]]:
def _recent_trial_diagnostics(
state: StudyState,
*,
limit: int | None = _STATEFUL_HISTORY_LIMIT,
) -> list[dict[str, Any]]:
diagnostics: list[dict[str, Any]] = []
for trial in state.trials[-stateful_history_limit() :]:
trials = state.trials if limit is None else state.trials[-limit:]
for trial in trials:
item: dict[str, Any] = {
"trial_id": trial.trial_id,
"status": trial.status,
@@ -629,7 +635,7 @@ def _rank_bottleneck_hypotheses(
def stateful_history_limit() -> int:
return 8
return _STATEFUL_HISTORY_LIMIT
def _state_completed_trials_with_rates(state: StudyState) -> list[TrialSummary]:
@@ -1121,6 +1127,7 @@ def _candidate_actions(
candidates.extend(
_frontier_delta_projection_actions(
study,
state,
trial_profiles,
top_bottleneck,
bottleneck_hypotheses,
@@ -1641,6 +1648,7 @@ def _runtime_candidate_actions(
def _frontier_delta_projection_actions(
study: StudySpec,
state: StudyState,
trial_profiles: list[dict[str, Any]],
top_bottleneck: str,
bottleneck_hypotheses: list[dict[str, Any]],
@@ -1649,10 +1657,11 @@ def _frontier_delta_projection_actions(
) -> list[dict[str, Any]]:
if not (set(study.engine.tunable_flags) & _RUNTIME_KEYS):
return []
anchors = _pareto_frontier_anchor_profiles(study, trial_profiles)
projection_profiles = _frontier_projection_profiles(study, state, trial_profiles)
anchors = _pareto_frontier_anchor_profiles(study, projection_profiles)
if len(anchors) < 2:
return []
deltas = _positive_runtime_delta_records(study, trial_profiles)
deltas = _positive_runtime_delta_records(study, projection_profiles)
if not deltas:
return []
@@ -1661,7 +1670,7 @@ def _frontier_delta_projection_actions(
incumbent_rate = max(
(
_profile_request_rate_per_gpu(profile)
for profile in trial_profiles
for profile in projection_profiles
if profile.get("status") == "completed"
),
default=0.0,
@@ -1813,6 +1822,19 @@ def _frontier_delta_projection_actions(
return actions[:8]
def _frontier_projection_profiles(
    study: StudySpec,
    state: StudyState,
    trial_profiles: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Pick the trial profiles that frontier projection should work from.

    Args:
        study: Study specification, forwarded to ``_trial_profiles`` when the
            profiles have to be rebuilt.
        state: Study state whose ``trials`` sequence is the full history.
        trial_profiles: Profiles already computed by the caller.

    Returns:
        ``trial_profiles`` itself (the same list object) when it has at least
        as many entries as ``state.trials``. Otherwise a new list built by
        ``_trial_profiles`` from the diagnostics of every trial in ``state``.
    """
    state_has_more_trials = len(trial_profiles) < len(state.trials)
    if not state_has_more_trials:
        return trial_profiles
    # limit=None asks for diagnostics over all trials instead of the default
    # recent-history window.
    # NOTE(review): presumably a shorter trial_profiles means the caller built
    # it from that window -- confirm against the call sites.
    full_diagnostics = _recent_trial_diagnostics(state, limit=None)
    return _trial_profiles(study, full_diagnostics)
def _pareto_frontier_anchor_profiles(
study: StudySpec,
trial_profiles: list[dict[str, Any]],

View File

@@ -2856,6 +2856,67 @@ class CoreFlowTests(unittest.TestCase):
},
},
),
TrialSummary(
trial_id="trial-0006",
status="completed",
parallel_size=4,
best_request_rate=8.0,
best_request_rate_per_gpu=2.0,
config_patch={
"env_patch": {},
"flag_patch": {
"tensor-parallel-size": 4,
"gpu-memory-utilization": 0.9,
"max-num-seqs": 16,
},
},
),
TrialSummary(
trial_id="trial-0007",
status="completed",
parallel_size=4,
best_request_rate=8.0,
best_request_rate_per_gpu=2.0,
config_patch={
"env_patch": {},
"flag_patch": {
"tensor-parallel-size": 4,
"gpu-memory-utilization": 0.92,
},
},
),
TrialSummary(
trial_id="trial-0008",
status="completed",
parallel_size=4,
best_request_rate=8.0,
best_request_rate_per_gpu=2.0,
config_patch={
"env_patch": {},
"flag_patch": {
"tensor-parallel-size": 4,
"gpu-memory-utilization": 0.9,
"max-num-batched-tokens": 16384,
"max-num-seqs": 16,
},
},
),
TrialSummary(
trial_id="trial-0009",
status="completed",
parallel_size=4,
best_request_rate=8.0,
best_request_rate_per_gpu=2.0,
config_patch={
"env_patch": {},
"flag_patch": {
"tensor-parallel-size": 4,
"gpu-memory-utilization": 0.9,
"enable-chunked-prefill": True,
"max-num-batched-tokens": 8192,
},
},
),
],
)
context = build_harness_context(