Reset new topology groups to full binary search

This commit is contained in:
2026-04-11 00:36:45 +08:00
parent a4d54442db
commit 83325b2f76
3 changed files with 14 additions and 10 deletions

View File

@@ -301,8 +301,9 @@ def build_prompt(
"", "",
"The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.", "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
"The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.", "The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.",
"The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists. For a different tp_dp_product group, it uses a scaled lower floor instead of reusing the global incumbent directly.", "The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.",
"Do not assume a configuration with fewer GPUs must start from the global incumbent sampling_u.", "If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.",
"Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
] ]
return "\n".join(sections) return "\n".join(sections)

View File

@@ -253,12 +253,6 @@ def _derive_search_floor(*, study: StudySpec, state: StudyState, parallel_size:
group_incumbent.get("best_sampling_u"), (int, float) group_incumbent.get("best_sampling_u"), (int, float)
): ):
candidate = float(group_incumbent["best_sampling_u"]) candidate = float(group_incumbent["best_sampling_u"])
elif (
isinstance(state.best_sampling_u, (int, float))
and isinstance(state.best_parallel_size, int)
and state.best_parallel_size > 0
):
candidate = float(state.best_sampling_u) * float(parallel_size) / float(state.best_parallel_size)
else: else:
candidate = low candidate = low
return min(high, max(low, candidate)) return min(high, max(low, candidate))

View File

@@ -1061,6 +1061,15 @@ class CoreFlowTests(unittest.TestCase):
best_request_rate=3.0, best_request_rate=3.0,
best_request_rate_per_gpu=0.75, best_request_rate_per_gpu=0.75,
next_trial_index=2, next_trial_index=2,
best_by_parallel_size={
"4": {
"trial_id": "trial-0001",
"parallel_size": 4,
"best_sampling_u": 0.375,
"best_request_rate": 3.0,
"best_request_rate_per_gpu": 0.75,
}
},
trials=[], trials=[],
) )
proposal = Proposal.from_dict( proposal = Proposal.from_dict(
@@ -1112,7 +1121,7 @@ class CoreFlowTests(unittest.TestCase):
trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
self.assertEqual(trial.search.low, 0.125) self.assertEqual(trial.search.low, 0.125)
def test_materialize_trial_scales_search_floor_for_different_parallel_group(self) -> None: def test_materialize_trial_resets_search_floor_for_new_parallel_group(self) -> None:
with tempfile.TemporaryDirectory() as tmp: with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp) tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path) study_path = _write_study_assets(tmp_path)
@@ -1138,7 +1147,7 @@ class CoreFlowTests(unittest.TestCase):
} }
) )
trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
self.assertEqual(trial.search.low, 0.2) self.assertEqual(trial.search.low, study.search.low)
def test_ingest_trial_results_records_failure_reason(self) -> None: def test_ingest_trial_results_records_failure_reason(self) -> None:
with tempfile.TemporaryDirectory() as tmp: with tempfile.TemporaryDirectory() as tmp: