Use full state for frontier projection
This commit is contained in:
@@ -29,6 +29,7 @@ _RUNTIME_KEYS = {
|
||||
_STRONG_INCUMBENT_MIN_GAIN = 1.8
|
||||
_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
|
||||
_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
|
||||
_STATEFUL_HISTORY_LIMIT = 8
|
||||
# Decode-bound throughput is frequently KV-cache limited, so more gpu-memory-utilization
|
||||
# yields more KV blocks and more concurrent decode. Hill-climb in small steps toward a
|
||||
# safe ceiling and let measurement find the real peak: a too-high target regresses or
|
||||
@@ -428,9 +429,14 @@ def _knob_harnesses(
|
||||
return harnesses
|
||||
|
||||
|
||||
def _recent_trial_diagnostics(state: StudyState) -> list[dict[str, Any]]:
|
||||
def _recent_trial_diagnostics(
|
||||
state: StudyState,
|
||||
*,
|
||||
limit: int | None = _STATEFUL_HISTORY_LIMIT,
|
||||
) -> list[dict[str, Any]]:
|
||||
diagnostics: list[dict[str, Any]] = []
|
||||
for trial in state.trials[-stateful_history_limit() :]:
|
||||
trials = state.trials if limit is None else state.trials[-limit:]
|
||||
for trial in trials:
|
||||
item: dict[str, Any] = {
|
||||
"trial_id": trial.trial_id,
|
||||
"status": trial.status,
|
||||
@@ -629,7 +635,7 @@ def _rank_bottleneck_hypotheses(
|
||||
|
||||
|
||||
def stateful_history_limit() -> int:
    """Return the stateful history limit.

    The value comes from the module-level ``_STATEFUL_HISTORY_LIMIT``
    constant, the same constant ``_recent_trial_diagnostics`` uses as the
    default for its ``limit`` parameter, so the two cannot drift apart.
    """
    # A single constant-backed return: the previous hard-coded literal made
    # the constant-backed return that followed it unreachable.
    return _STATEFUL_HISTORY_LIMIT
|
||||
|
||||
|
||||
def _state_completed_trials_with_rates(state: StudyState) -> list[TrialSummary]:
|
||||
@@ -1121,6 +1127,7 @@ def _candidate_actions(
|
||||
candidates.extend(
|
||||
_frontier_delta_projection_actions(
|
||||
study,
|
||||
state,
|
||||
trial_profiles,
|
||||
top_bottleneck,
|
||||
bottleneck_hypotheses,
|
||||
@@ -1641,6 +1648,7 @@ def _runtime_candidate_actions(
|
||||
|
||||
def _frontier_delta_projection_actions(
|
||||
study: StudySpec,
|
||||
state: StudyState,
|
||||
trial_profiles: list[dict[str, Any]],
|
||||
top_bottleneck: str,
|
||||
bottleneck_hypotheses: list[dict[str, Any]],
|
||||
@@ -1649,10 +1657,11 @@ def _frontier_delta_projection_actions(
|
||||
) -> list[dict[str, Any]]:
|
||||
if not (set(study.engine.tunable_flags) & _RUNTIME_KEYS):
|
||||
return []
|
||||
anchors = _pareto_frontier_anchor_profiles(study, trial_profiles)
|
||||
projection_profiles = _frontier_projection_profiles(study, state, trial_profiles)
|
||||
anchors = _pareto_frontier_anchor_profiles(study, projection_profiles)
|
||||
if len(anchors) < 2:
|
||||
return []
|
||||
deltas = _positive_runtime_delta_records(study, trial_profiles)
|
||||
deltas = _positive_runtime_delta_records(study, projection_profiles)
|
||||
if not deltas:
|
||||
return []
|
||||
|
||||
@@ -1661,7 +1670,7 @@ def _frontier_delta_projection_actions(
|
||||
incumbent_rate = max(
|
||||
(
|
||||
_profile_request_rate_per_gpu(profile)
|
||||
for profile in trial_profiles
|
||||
for profile in projection_profiles
|
||||
if profile.get("status") == "completed"
|
||||
),
|
||||
default=0.0,
|
||||
@@ -1813,6 +1822,19 @@ def _frontier_delta_projection_actions(
|
||||
return actions[:8]
|
||||
|
||||
|
||||
def _frontier_projection_profiles(
    study: StudySpec,
    state: StudyState,
    trial_profiles: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Pick the profile list that frontier projection should work from.

    When ``state.trials`` holds more entries than ``trial_profiles``, the
    profiles are rebuilt with ``_trial_profiles`` from diagnostics gathered
    with ``limit=None`` (no history slicing). Otherwise the caller's
    ``trial_profiles`` list is returned unchanged (the same object).
    """
    state_has_more_trials = len(state.trials) > len(trial_profiles)
    if not state_has_more_trials:
        # The supplied profiles already cover at least as many entries as
        # the state has trials; reuse them as-is.
        return trial_profiles

    full_history_diagnostics = _recent_trial_diagnostics(state, limit=None)
    return _trial_profiles(study, full_history_diagnostics)
|
||||
|
||||
|
||||
def _pareto_frontier_anchor_profiles(
|
||||
study: StudySpec,
|
||||
trial_profiles: list[dict[str, Any]],
|
||||
|
||||
@@ -2856,6 +2856,67 @@ class CoreFlowTests(unittest.TestCase):
|
||||
},
|
||||
},
|
||||
),
|
||||
TrialSummary(
|
||||
trial_id="trial-0006",
|
||||
status="completed",
|
||||
parallel_size=4,
|
||||
best_request_rate=8.0,
|
||||
best_request_rate_per_gpu=2.0,
|
||||
config_patch={
|
||||
"env_patch": {},
|
||||
"flag_patch": {
|
||||
"tensor-parallel-size": 4,
|
||||
"gpu-memory-utilization": 0.9,
|
||||
"max-num-seqs": 16,
|
||||
},
|
||||
},
|
||||
),
|
||||
TrialSummary(
|
||||
trial_id="trial-0007",
|
||||
status="completed",
|
||||
parallel_size=4,
|
||||
best_request_rate=8.0,
|
||||
best_request_rate_per_gpu=2.0,
|
||||
config_patch={
|
||||
"env_patch": {},
|
||||
"flag_patch": {
|
||||
"tensor-parallel-size": 4,
|
||||
"gpu-memory-utilization": 0.92,
|
||||
},
|
||||
},
|
||||
),
|
||||
TrialSummary(
|
||||
trial_id="trial-0008",
|
||||
status="completed",
|
||||
parallel_size=4,
|
||||
best_request_rate=8.0,
|
||||
best_request_rate_per_gpu=2.0,
|
||||
config_patch={
|
||||
"env_patch": {},
|
||||
"flag_patch": {
|
||||
"tensor-parallel-size": 4,
|
||||
"gpu-memory-utilization": 0.9,
|
||||
"max-num-batched-tokens": 16384,
|
||||
"max-num-seqs": 16,
|
||||
},
|
||||
},
|
||||
),
|
||||
TrialSummary(
|
||||
trial_id="trial-0009",
|
||||
status="completed",
|
||||
parallel_size=4,
|
||||
best_request_rate=8.0,
|
||||
best_request_rate_per_gpu=2.0,
|
||||
config_patch={
|
||||
"env_patch": {},
|
||||
"flag_patch": {
|
||||
"tensor-parallel-size": 4,
|
||||
"gpu-memory-utilization": 0.9,
|
||||
"enable-chunked-prefill": True,
|
||||
"max-num-batched-tokens": 8192,
|
||||
},
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
context = build_harness_context(
|
||||
|
||||
Reference in New Issue
Block a user