Fix topology-aware incumbents for qwen27b tuning

2026-04-11 00:32:41 +08:00
parent 06d4c380b3
commit a4d54442db
5 changed files with 282 additions and 22 deletions
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1039,6 +1039,12 @@ class CoreFlowTests(unittest.TestCase):
            self.assertEqual(next_state.best_trial_id, trial.trial_id)
            self.assertEqual(next_state.best_sampling_u, 0.75)
            self.assertEqual(next_state.best_request_rate, 12.5)
+            self.assertEqual(next_state.best_parallel_size, 4)
+            self.assertEqual(next_state.best_request_rate_per_gpu, 3.125)
+            self.assertEqual(
+                next_state.best_by_parallel_size["4"]["best_request_rate_per_gpu"],
+                3.125,
+            )

    def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
@@ -1050,8 +1056,10 @@ class CoreFlowTests(unittest.TestCase):
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
+                best_parallel_size=4,
                best_sampling_u=0.375,
                best_request_rate=3.0,
+                best_request_rate_per_gpu=0.75,
                next_trial_index=2,
                trials=[],
            )
@@ -1067,6 +1075,71 @@ class CoreFlowTests(unittest.TestCase):
            self.assertEqual(trial.search.low, 0.375)
            self.assertEqual(trial.search.high, 1.0)

+    def test_materialize_trial_uses_same_parallel_group_incumbent(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0001",
+                best_parallel_size=4,
+                best_sampling_u=0.375,
+                best_request_rate=3.0,
+                best_request_rate_per_gpu=0.75,
+                next_trial_index=2,
+                best_by_parallel_size={
+                    "2": {
+                        "trial_id": "trial-0000",
+                        "parallel_size": 2,
+                        "best_sampling_u": 0.125,
+                        "best_request_rate": 0.8,
+                        "best_request_rate_per_gpu": 0.4,
+                    }
+                },
+                trials=[],
+            )
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "Obs",
+                    "diagnosis": "Diag",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
+                    "expected_effects": ["raise rate"],
+                }
+            )
+            trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
+            self.assertEqual(trial.search.low, 0.125)
+
+    def test_materialize_trial_scales_search_floor_for_different_parallel_group(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0001",
+                best_parallel_size=4,
+                best_sampling_u=0.4,
+                best_request_rate=3.0,
+                best_request_rate_per_gpu=0.75,
+                next_trial_index=2,
+                trials=[],
+            )
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "Obs",
+                    "diagnosis": "Diag",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
+                    "expected_effects": ["raise rate"],
+                }
+            )
+            trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
+            self.assertEqual(trial.search.low, 0.2)
+
    def test_ingest_trial_results_records_failure_reason(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
@@ -1137,6 +1210,66 @@ class CoreFlowTests(unittest.TestCase):
            next_state = store.ingest_trial_results(study.study_id)
            self.assertEqual(next_state.trials[0].failure_stage, "engine_launch")

+    def test_ingest_trial_results_prefers_higher_request_rate_per_gpu(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = store.load_state(study.study_id)
+            proposal_a = Proposal.from_dict(
+                {
+                    "observation": "Obs",
+                    "diagnosis": "Diag",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
+                    "expected_effects": ["raise rate"],
+                }
+            )
+            trial_a, state = store.materialize_trial(study=study, state=state, proposal=proposal_a)
+            Path(trial_a.result_path).write_text(
+                json.dumps(
+                    {
+                        "study_id": study.study_id,
+                        "trial_id": trial_a.trial_id,
+                        "status": "completed",
+                        "best_sampling_u": 0.5,
+                        "best_request_rate": 4.0,
+                        "best_pass_rate": 0.97,
+                    }
+                ),
+                encoding="utf-8",
+            )
+            proposal_b = Proposal.from_dict(
+                {
+                    "observation": "Obs",
+                    "diagnosis": "Diag",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
+                    "expected_effects": ["raise rate"],
+                }
+            )
+            trial_b, _ = store.materialize_trial(study=study, state=state, proposal=proposal_b)
+            Path(trial_b.result_path).write_text(
+                json.dumps(
+                    {
+                        "study_id": study.study_id,
+                        "trial_id": trial_b.trial_id,
+                        "status": "completed",
+                        "best_sampling_u": 0.4,
+                        "best_request_rate": 3.0,
+                        "best_pass_rate": 0.97,
+                    }
+                ),
+                encoding="utf-8",
+            )
+            next_state = store.ingest_trial_results(study.study_id)
+            self.assertEqual(next_state.best_trial_id, trial_b.trial_id)
+            self.assertEqual(next_state.best_parallel_size, 2)
+            self.assertEqual(next_state.best_request_rate, 3.0)
+            self.assertEqual(next_state.best_request_rate_per_gpu, 1.5)
+            self.assertEqual(next_state.best_by_parallel_size["4"]["best_request_rate_per_gpu"], 1.0)
+            self.assertEqual(next_state.best_by_parallel_size["2"]["best_request_rate_per_gpu"], 1.5)
+
    def test_validate_proposal_rejects_invalid_tp_dp_product(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)