Fix topology-aware incumbents for qwen27b tuning
This commit is contained in:
@@ -1039,6 +1039,12 @@ class CoreFlowTests(unittest.TestCase):
|
||||
self.assertEqual(next_state.best_trial_id, trial.trial_id)
|
||||
self.assertEqual(next_state.best_sampling_u, 0.75)
|
||||
self.assertEqual(next_state.best_request_rate, 12.5)
|
||||
self.assertEqual(next_state.best_parallel_size, 4)
|
||||
self.assertEqual(next_state.best_request_rate_per_gpu, 3.125)
|
||||
self.assertEqual(
|
||||
next_state.best_by_parallel_size["4"]["best_request_rate_per_gpu"],
|
||||
3.125,
|
||||
)
|
||||
|
||||
def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
@@ -1050,8 +1056,10 @@ class CoreFlowTests(unittest.TestCase):
|
||||
state = StudyState(
|
||||
study_id=study.study_id,
|
||||
best_trial_id="trial-0001",
|
||||
best_parallel_size=4,
|
||||
best_sampling_u=0.375,
|
||||
best_request_rate=3.0,
|
||||
best_request_rate_per_gpu=0.75,
|
||||
next_trial_index=2,
|
||||
trials=[],
|
||||
)
|
||||
@@ -1067,6 +1075,71 @@ class CoreFlowTests(unittest.TestCase):
|
||||
self.assertEqual(trial.search.low, 0.375)
|
||||
self.assertEqual(trial.search.high, 1.0)
|
||||
|
||||
def test_materialize_trial_uses_same_parallel_group_incumbent(self) -> None:
    """Search floor is taken from the incumbent stored for the proposed parallel size.

    The global incumbent in the state sits at parallel size 4 with
    ``best_sampling_u`` 0.375, while ``best_by_parallel_size`` carries a
    separate entry under key ``"2"`` with ``best_sampling_u`` 0.125. The
    proposal patches ``tensor-parallel-size`` to 2, and the materialized
    trial is expected to start its search at 0.125.
    """
    # Incumbent recorded for the parallel-size-2 group; its sampling_u is the
    # value the assertion below expects as the search floor.
    group_two_incumbent = {
        "trial_id": "trial-0000",
        "parallel_size": 2,
        "best_sampling_u": 0.125,
        "best_request_rate": 0.8,
        "best_request_rate_per_gpu": 0.4,
    }
    proposal_payload = {
        "observation": "Obs",
        "diagnosis": "Diag",
        "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
        "expected_effects": ["raise rate"],
    }

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        spec_path = _write_study_assets(workdir)
        study = load_study_spec(spec_path)
        store = StudyStore(workdir / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)

        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=4,
            best_sampling_u=0.375,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.75,
            next_trial_index=2,
            best_by_parallel_size={"2": group_two_incumbent},
            trials=[],
        )
        proposal = Proposal.from_dict(proposal_payload)

        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)

        # 0.125 is the "2" group's sampling_u, not the global incumbent's 0.375.
        self.assertEqual(trial.search.low, 0.125)
|
||||
|
||||
def test_materialize_trial_scales_search_floor_for_different_parallel_group(self) -> None:
    """Search floor is rescaled when the proposal targets another parallel size.

    The state only holds a global incumbent at parallel size 4 with
    ``best_sampling_u`` 0.4 and no ``best_by_parallel_size`` entries. The
    proposal patches ``tensor-parallel-size`` to 2, and the materialized
    trial is expected to start its search at 0.2.
    """
    proposal_payload = {
        "observation": "Obs",
        "diagnosis": "Diag",
        "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
        "expected_effects": ["raise rate"],
    }

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        spec_path = _write_study_assets(workdir)
        study = load_study_spec(spec_path)
        store = StudyStore(workdir / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)

        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=4,
            best_sampling_u=0.4,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.75,
            next_trial_index=2,
            trials=[],
        )
        proposal = Proposal.from_dict(proposal_payload)

        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)

        # NOTE(review): 0.2 matches 0.4 * (2 / 4), i.e. the incumbent floor
        # scaled by the parallel-size ratio; confirm against materialize_trial.
        self.assertEqual(trial.search.low, 0.2)
|
||||
|
||||
def test_ingest_trial_results_records_failure_reason(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
@@ -1137,6 +1210,66 @@ class CoreFlowTests(unittest.TestCase):
|
||||
next_state = store.ingest_trial_results(study.study_id)
|
||||
self.assertEqual(next_state.trials[0].failure_stage, "engine_launch")
|
||||
|
||||
def test_ingest_trial_results_prefers_higher_request_rate_per_gpu(self) -> None:
    """Ingestion picks the trial with the higher per-GPU request rate as best.

    Trial A is proposed with ``tensor-parallel-size`` 4 and reports a request
    rate of 4.0; trial B is proposed with ``tensor-parallel-size`` 2 and
    reports 3.0. After ingestion the test expects trial B to be the overall
    best (per-GPU rate 1.5) and both parallel-size groups to keep their own
    entry in ``best_by_parallel_size``.
    """

    def tp_proposal(tensor_parallel_size):
        # Build a proposal that only patches the tensor-parallel-size flag.
        return Proposal.from_dict(
            {
                "observation": "Obs",
                "diagnosis": "Diag",
                "config_patch": {
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": tensor_parallel_size},
                },
                "expected_effects": ["raise rate"],
            }
        )

    def write_completed_result(study_id, trial, sampling_u, request_rate):
        # Write a "completed" result file at the trial's result_path.
        payload = {
            "study_id": study_id,
            "trial_id": trial.trial_id,
            "status": "completed",
            "best_sampling_u": sampling_u,
            "best_request_rate": request_rate,
            "best_pass_rate": 0.97,
        }
        Path(trial.result_path).write_text(json.dumps(payload), encoding="utf-8")

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        spec_path = _write_study_assets(workdir)
        study = load_study_spec(spec_path)
        store = StudyStore(workdir / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        state = store.load_state(study.study_id)

        # Trial A: parallel size 4, higher absolute request rate.
        proposal_a = tp_proposal(4)
        trial_a, state = store.materialize_trial(study=study, state=state, proposal=proposal_a)
        write_completed_result(study.study_id, trial_a, 0.5, 4.0)

        # Trial B: parallel size 2, lower absolute request rate.
        proposal_b = tp_proposal(2)
        trial_b, _ = store.materialize_trial(study=study, state=state, proposal=proposal_b)
        write_completed_result(study.study_id, trial_b, 0.4, 3.0)

        next_state = store.ingest_trial_results(study.study_id)

        # NOTE(review): expected per-GPU values match request_rate / parallel
        # size (3.0 / 2 and 4.0 / 4); confirm against ingest_trial_results.
        self.assertEqual(next_state.best_trial_id, trial_b.trial_id)
        self.assertEqual(next_state.best_parallel_size, 2)
        self.assertEqual(next_state.best_request_rate, 3.0)
        self.assertEqual(next_state.best_request_rate_per_gpu, 1.5)

        per_group = next_state.best_by_parallel_size
        self.assertEqual(per_group["4"]["best_request_rate_per_gpu"], 1.0)
        self.assertEqual(per_group["2"]["best_request_rate_per_gpu"], 1.5)
|
||||
|
||||
def test_validate_proposal_rejects_invalid_tp_dp_product(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
|
||||
Reference in New Issue
Block a user