Fix topology-aware incumbents for qwen27b tuning

This commit is contained in:
2026-04-11 00:32:41 +08:00
parent 06d4c380b3
commit a4d54442db
5 changed files with 282 additions and 22 deletions

View File

@@ -1039,6 +1039,12 @@ class CoreFlowTests(unittest.TestCase):
self.assertEqual(next_state.best_trial_id, trial.trial_id)
self.assertEqual(next_state.best_sampling_u, 0.75)
self.assertEqual(next_state.best_request_rate, 12.5)
self.assertEqual(next_state.best_parallel_size, 4)
self.assertEqual(next_state.best_request_rate_per_gpu, 3.125)
self.assertEqual(
next_state.best_by_parallel_size["4"]["best_request_rate_per_gpu"],
3.125,
)
def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
@@ -1050,8 +1056,10 @@ class CoreFlowTests(unittest.TestCase):
state = StudyState(
study_id=study.study_id,
best_trial_id="trial-0001",
best_parallel_size=4,
best_sampling_u=0.375,
best_request_rate=3.0,
best_request_rate_per_gpu=0.75,
next_trial_index=2,
trials=[],
)
@@ -1067,6 +1075,71 @@ class CoreFlowTests(unittest.TestCase):
self.assertEqual(trial.search.low, 0.375)
self.assertEqual(trial.search.high, 1.0)
def test_materialize_trial_uses_same_parallel_group_incumbent(self) -> None:
    """Search floor comes from the incumbent recorded for the proposed parallel size.

    The study state carries an overall best at parallel size 4
    (``best_sampling_u=0.375``) and a separate entry under key ``"2"`` in
    ``best_by_parallel_size`` (``best_sampling_u=0.125``).  A proposal that
    sets ``tensor-parallel-size`` to 2 is expected to produce a trial whose
    ``search.low`` equals 0.125, the value stored for that group.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        study = load_study_spec(spec_file)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_file, study=study)

        # Incumbent recorded for the parallel-size-2 group only.
        tp2_incumbent = {
            "trial_id": "trial-0000",
            "parallel_size": 2,
            "best_sampling_u": 0.125,
            "best_request_rate": 0.8,
            "best_request_rate_per_gpu": 0.4,
        }
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=4,
            best_sampling_u=0.375,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.75,
            next_trial_index=2,
            best_by_parallel_size={"2": tp2_incumbent},
            trials=[],
        )

        proposal_payload = {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {
                "env_patch": {},
                "flag_patch": {"tensor-parallel-size": 2},
            },
            "expected_effects": ["raise rate"],
        }
        proposal = Proposal.from_dict(proposal_payload)

        trial, _unused_state = store.materialize_trial(
            study=study,
            state=state,
            proposal=proposal,
        )

        self.assertEqual(trial.search.low, 0.125)
def test_materialize_trial_scales_search_floor_for_different_parallel_group(self) -> None:
    """Search floor is rescaled when the proposal targets another parallel size.

    The only incumbent in the state is at parallel size 4 with
    ``best_sampling_u=0.4``; no ``best_by_parallel_size`` entry is supplied.
    A proposal with ``tensor-parallel-size`` 2 is expected to yield
    ``search.low == 0.2``.
    NOTE(review): 0.2 is half of 0.4, presumably scaled by the 2/4
    parallel-size ratio -- confirm against ``materialize_trial``.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        study = load_study_spec(spec_file)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_file, study=study)

        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=4,
            best_sampling_u=0.4,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.75,
            next_trial_index=2,
            trials=[],
        )

        proposal_payload = {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {
                "env_patch": {},
                "flag_patch": {"tensor-parallel-size": 2},
            },
            "expected_effects": ["raise rate"],
        }
        proposal = Proposal.from_dict(proposal_payload)

        trial, _unused_state = store.materialize_trial(
            study=study,
            state=state,
            proposal=proposal,
        )

        self.assertEqual(trial.search.low, 0.2)
def test_ingest_trial_results_records_failure_reason(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -1137,6 +1210,66 @@ class CoreFlowTests(unittest.TestCase):
next_state = store.ingest_trial_results(study.study_id)
self.assertEqual(next_state.trials[0].failure_stage, "engine_launch")
def test_ingest_trial_results_prefers_higher_request_rate_per_gpu(self) -> None:
    """Ingestion picks the trial with the higher per-GPU request rate as best.

    Two completed trials are written to disk: one with
    ``tensor-parallel-size`` 4 reporting a request rate of 4.0, and one with
    ``tensor-parallel-size`` 2 reporting 3.0.  After ingestion the test
    expects the size-2 trial to be the overall best (per-GPU rate 1.5 versus
    1.0), while both groups keep their own entry in ``best_by_parallel_size``.
    """

    def build_proposal(tensor_parallel_size):
        # Proposal whose only patch is the tensor-parallel-size flag.
        return Proposal.from_dict(
            {
                "observation": "Obs",
                "diagnosis": "Diag",
                "config_patch": {
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": tensor_parallel_size},
                },
                "expected_effects": ["raise rate"],
            }
        )

    def write_completed_result(study_id, trial, sampling_u, request_rate):
        # Persist a "completed" result file at the trial's result path.
        payload = {
            "study_id": study_id,
            "trial_id": trial.trial_id,
            "status": "completed",
            "best_sampling_u": sampling_u,
            "best_request_rate": request_rate,
            "best_pass_rate": 0.97,
        }
        Path(trial.result_path).write_text(json.dumps(payload), encoding="utf-8")

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        study = load_study_spec(spec_file)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_file, study=study)
        state = store.load_state(study.study_id)

        # First trial: parallel size 4, higher absolute request rate.
        wide_proposal = build_proposal(4)
        wide_trial, state = store.materialize_trial(
            study=study,
            state=state,
            proposal=wide_proposal,
        )
        write_completed_result(study.study_id, wide_trial, 0.5, 4.0)

        # Second trial: parallel size 2, lower absolute request rate.
        narrow_proposal = build_proposal(2)
        narrow_trial, _unused_state = store.materialize_trial(
            study=study,
            state=state,
            proposal=narrow_proposal,
        )
        write_completed_result(study.study_id, narrow_trial, 0.4, 3.0)

        ingested = store.ingest_trial_results(study.study_id)

        self.assertEqual(ingested.best_trial_id, narrow_trial.trial_id)
        self.assertEqual(ingested.best_parallel_size, 2)
        self.assertEqual(ingested.best_request_rate, 3.0)
        self.assertEqual(ingested.best_request_rate_per_gpu, 1.5)
        per_group = ingested.best_by_parallel_size
        self.assertEqual(per_group["4"]["best_request_rate_per_gpu"], 1.0)
        self.assertEqual(per_group["2"]["best_request_rate_per_gpu"], 1.5)
def test_validate_proposal_rejects_invalid_tp_dp_product(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)