Add advisory harness attribution and descriptor planner MVP

2026-06-30 12:05:03 +08:00
parent 08429e5da8
commit adb5356c4b
11 changed files with 1066 additions and 9 deletions
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -7410,6 +7410,231 @@ class CoreFlowTests(unittest.TestCase):
                (store.study_root(study.study_id) / "harness" / "candidate-set-0002.json").exists()
            )

+    def test_cli_tune_records_advisory_llm_out_of_set_candidate_family_gap(self) -> None:  # policy is left unset in the spec; an LLM patch outside the candidate set is expected to run and leave attribution + gap records
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            payload = json.loads(study_path.read_text(encoding="utf-8"))
+            payload["llm"]["endpoint"] = {  # custom endpoint config; the actual LLM call is mocked further down
+                "provider": "custom",
+                "base_url": "http://llm.example/v1",
+                "wire_api": "chat.completions",
+                "model": "test-model",
+                "api_key_env": "OPENAI_API_KEY",
+            }
+            study_path.write_text(json.dumps(payload), encoding="utf-8")
+            study = load_study_spec(study_path)
+            store_root = tmp_path / "store"
+            store = StudyStore(store_root)
+            store.init_study(spec_path=study_path, study=study)
+            store.save_state(  # seed the store with one completed trial at max-num-seqs=8
+                StudyState(
+                    study_id=study.study_id,
+                    best_trial_id="trial-0001",
+                    best_parallel_size=1,
+                    best_sampling_u=0.25,
+                    best_request_rate=1.0,
+                    best_request_rate_per_gpu=1.0,
+                    next_trial_index=2,
+                    trials=[
+                        TrialSummary(
+                            trial_id="trial-0001",
+                            status="completed",
+                            parallel_size=1,
+                            best_request_rate=1.0,
+                            best_request_rate_per_gpu=1.0,
+                            config_patch={
+                                "env_patch": {},
+                                "flag_patch": {"max-num-seqs": 8},
+                            },
+                        )
+                    ],
+                )
+            )
+            harness_context = {  # canned harness plan with a single eligible candidate (max-num-seqs 8 -> 16)
+                "experiment_plan": {
+                    "planner_version": "test",
+                    "candidate_set": {
+                        "candidate_set_hash": "candidate-set-test",
+                        "eligible_candidates": [
+                            {
+                                "candidate_id": "cand-mns16",
+                                "action_id": "coordinate_step:max-num-seqs:8->16",
+                                "knob_family": "max-num-seqs",
+                                "score": 0.8,
+                                "effective_config_fingerprint": "not-the-llm-proposal",
+                                "config_patch": {
+                                    "env_patch": {},
+                                    "flag_patch": {"max-num-seqs": 16},
+                                },
+                            }
+                        ],
+                        "blocked_candidates": [],
+                    },
+                    "next_action": None,
+                }
+            }
+            llm_payload = json.dumps(  # canned LLM reply, returned verbatim by the mocked call_llm_for_proposal
+                {
+                    "observation": "Harness is in the right admission direction but too conservative.",
+                    "diagnosis": "Try a larger same-operator admission step.",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 24}},  # 24 differs from the only eligible candidate (16)
+                    "expected_effects": ["test whether admission capacity was underexplored"],
+                    "why_not_previous_failures": "new value and no launch failure evidence",
+                    "should_stop": False,
+                }
+            )
+
+            def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:  # stand-in for run_trial: reads the trial spec, writes result.json into its artifact_dir, returns the result
+                trial_payload = json.loads(trial_spec_path.read_text(encoding="utf-8"))
+                trial_root = Path(trial_payload["artifact_dir"])
+                result = {
+                    "study_id": trial_payload["study_id"],
+                    "trial_id": trial_payload["trial_id"],
+                    "status": "completed",
+                    "best_sampling_u": 0.5,
+                    "best_request_rate": 2.0,
+                    "best_pass_rate": 1.0,
+                    "best_request_count": 2,
+                    "probes": [],
+                }
+                (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8")
+                return result
+
+            buffer = io.StringIO()  # captures the CLI's stdout so the summary can be parsed below
+            with mock.patch("aituner.cli.build_harness_context", return_value=harness_context):  # build_harness_context is patched under both aituner.cli and aituner.llm
+                with mock.patch("aituner.llm.build_harness_context", return_value=harness_context):
+                    with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload):
+                        with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial):
+                            with contextlib.redirect_stdout(buffer):
+                                exit_code = cli_main(
+                                    [
+                                        "study",
+                                        "tune",
+                                        "--spec",
+                                        str(study_path),
+                                        "--store-root",
+                                        str(store_root),
+                                        "--skip-baseline",
+                                        "--max-trials",
+                                        "2",
+                                        "--proposal-policy",
+                                        "llm-first",
+                                    ]
+                                )
+
+            self.assertEqual(exit_code, 0)
+            summary = json.loads(buffer.getvalue())  # the test treats the whole captured stdout as one JSON document
+            executed = summary["executed_trials"]
+            self.assertEqual(executed[0]["proposal_origin"], "llm_out_of_set")
+            self.assertTrue(executed[0]["candidate_family_gap_path"])
+            attribution_path = (  # attribution record expected under the study root for proposal 0002
+                store.study_root(study.study_id)
+                / "proposal_attributions"
+                / "proposal-0002.json"
+            )
+            attribution = json.loads(attribution_path.read_text(encoding="utf-8"))
+            self.assertEqual(attribution["proposal_origin"], "llm_out_of_set")
+            self.assertEqual(attribution["harness_candidate_policy"], "advisory")  # spec never sets the policy, so "advisory" is expected without explicit configuration
+            gap_path = Path(executed[0]["candidate_family_gap_path"])  # gap record location is taken from the CLI summary
+            gap = json.loads(gap_path.read_text(encoding="utf-8"))
+            self.assertEqual(gap["gap_type"], "same_operator_new_step")
+            self.assertEqual(gap["review_status"], "pending")
+            self.assertEqual(gap["changed_knobs"], ["flag:max-num-seqs"])
+            self.assertEqual(gap["proposal_patch"]["flag_patch"]["max-num-seqs"], 24)
+            self.assertEqual(gap["nearest_harness_candidates"][0]["candidate_id"], "cand-mns16")
+
+    def test_cli_tune_strict_harness_policy_rejects_llm_out_of_set_proposal(self) -> None:  # with the policy set to "strict", an LLM patch outside the candidate set must fail the CLI (exit 2) before any trial runs
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            payload = json.loads(study_path.read_text(encoding="utf-8"))
+            payload["llm"]["harness_candidate_policy"] = "strict"  # the setting under test
+            payload["llm"]["endpoint"] = {  # custom endpoint config; the actual LLM call is mocked further down
+                "provider": "custom",
+                "base_url": "http://llm.example/v1",
+                "wire_api": "chat.completions",
+                "model": "test-model",
+                "api_key_env": "OPENAI_API_KEY",
+            }
+            study_path.write_text(json.dumps(payload), encoding="utf-8")
+            study = load_study_spec(study_path)
+            store_root = tmp_path / "store"
+            store = StudyStore(store_root)
+            store.init_study(spec_path=study_path, study=study)
+            store.save_state(  # seed the store with one completed trial at max-num-seqs=8
+                StudyState(
+                    study_id=study.study_id,
+                    best_trial_id="trial-0001",
+                    best_parallel_size=1,
+                    best_request_rate=1.0,
+                    best_request_rate_per_gpu=1.0,
+                    next_trial_index=2,
+                    trials=[
+                        TrialSummary(
+                            trial_id="trial-0001",
+                            status="completed",
+                            parallel_size=1,
+                            best_request_rate=1.0,
+                            best_request_rate_per_gpu=1.0,
+                            config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 8}},
+                        )
+                    ],
+                )
+            )
+            harness_context = {  # canned harness plan with a single eligible candidate (max-num-seqs = 16)
+                "experiment_plan": {
+                    "candidate_set": {
+                        "candidate_set_hash": "candidate-set-test",
+                        "eligible_candidates": [
+                            {
+                                "candidate_id": "cand-mns16",
+                                "effective_config_fingerprint": "not-the-llm-proposal",
+                                "config_patch": {
+                                    "env_patch": {},
+                                    "flag_patch": {"max-num-seqs": 16},
+                                },
+                            }
+                        ],
+                    }
+                }
+            }
+            llm_payload = json.dumps(  # canned LLM reply, returned verbatim by the mocked call_llm_for_proposal
+                {
+                    "observation": "Try an out-of-set candidate.",
+                    "diagnosis": "strict mode should reject this.",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 24}},  # 24 differs from the only eligible candidate (16)
+                    "expected_effects": ["should not run"],
+                    "why_not_previous_failures": "",
+                    "should_stop": False,
+                }
+            )
+            stderr = io.StringIO()  # captures the CLI's stderr so the rejection message can be checked
+            with mock.patch("aituner.cli.build_harness_context", return_value=harness_context):  # build_harness_context is patched under both aituner.cli and aituner.llm
+                with mock.patch("aituner.llm.build_harness_context", return_value=harness_context):
+                    with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload):
+                        with mock.patch("aituner.cli.run_trial") as run_trial_mock:  # plain mock: used only to prove no trial was launched
+                            with contextlib.redirect_stderr(stderr):
+                                exit_code = cli_main(
+                                    [
+                                        "study",
+                                        "tune",
+                                        "--spec",
+                                        str(study_path),
+                                        "--store-root",
+                                        str(store_root),
+                                        "--skip-baseline",
+                                        "--max-trials",
+                                        "2",
+                                        "--proposal-policy",
+                                        "llm-first",
+                                    ]
+                                )
+
+            self.assertEqual(exit_code, 2)
+            run_trial_mock.assert_not_called()
+            self.assertIn("llm.harness_candidate_policy=strict", stderr.getvalue())  # the error text must name the setting that caused the rejection
+
    def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
--- a/tests/test_mechanism_planner.py
+++ b/tests/test_mechanism_planner.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import unittest
+
+from aituner.engine_adapters.vllm import default_vllm_descriptors
+from aituner.knob_descriptor import KnobConstraints, KnobDescriptor
+from aituner.mechanism_planner import coordinate_line_search_candidates
+
+
+class MechanismPlannerTests(unittest.TestCase):  # exercises coordinate_line_search_candidates against knob descriptors
+    def test_coordinate_search_uses_mechanism_not_knob_name(self) -> None:  # two differently named knobs with the same evidence weights are expected to produce the same 8 -> 16 step
+        vllm_descriptor = default_vllm_descriptors(tunable_flags=("max-num-seqs",))[0]
+        sglang_descriptor = KnobDescriptor(  # hand-built descriptor for a knob with a different name (max-running-requests)
+            name="max-running-requests",
+            location="flag",
+            value_type="int",
+            mechanisms=("admission_capacity", "kv_memory_pressure"),
+            search_geometry="positive_capacity",
+            operators=("coordinate_line_search",),
+            constraints=KnobConstraints(min_value=1, integer=True, multiple_of=8),
+            directional_effects={
+                "increase": ("admission_capacity",),
+                "decrease": ("kv_memory_pressure",),
+            },
+        )
+
+        vllm_candidates = coordinate_line_search_candidates(
+            current_config={"max-num-seqs": 8},
+            descriptors=(vllm_descriptor,),
+            evidence_weights={"admission_capacity": 0.9},
+        )
+        sglang_candidates = coordinate_line_search_candidates(  # same starting value and evidence weights as the call above
+            current_config={"max-running-requests": 8},
+            descriptors=(sglang_descriptor,),
+            evidence_weights={"admission_capacity": 0.9},
+        )
+
+        self.assertEqual(vllm_candidates[0].patch, {"max-num-seqs": 16})
+        self.assertEqual(sglang_candidates[0].patch, {"max-running-requests": 16})
+        self.assertEqual(vllm_candidates[0].mechanism, "admission_capacity")
+        self.assertEqual(sglang_candidates[0].mechanism, "admission_capacity")
+
+    def test_positive_capacity_can_decrease_for_memory_pressure(self) -> None:  # kv_memory_pressure evidence is expected to make the first candidate a decrease (64 -> 32)
+        descriptor = default_vllm_descriptors(tunable_flags=("max-num-seqs",))[0]
+
+        candidates = coordinate_line_search_candidates(
+            current_config={"max-num-seqs": 64},
+            descriptors=(descriptor,),
+            evidence_weights={"kv_memory_pressure": 0.8},
+        )
+
+        self.assertEqual(candidates[0].direction, "decrease")
+        self.assertEqual(candidates[0].patch, {"max-num-seqs": 32})
+
+    def test_bounded_fraction_respects_constraints(self) -> None:  # starting at 0.98, the first candidate is expected to be exactly 1.0
+        descriptor = default_vllm_descriptors(tunable_flags=("gpu-memory-utilization",))[0]
+
+        candidates = coordinate_line_search_candidates(
+            current_config={"gpu-memory-utilization": 0.98},
+            descriptors=(descriptor,),
+            evidence_weights={"kv_memory_capacity": 0.8},
+        )
+
+        self.assertEqual(candidates[0].patch, {"gpu-memory-utilization": 1.0})  # NOTE(review): presumably 1.0 is the descriptor's upper bound - confirm in default_vllm_descriptors
+
+
+if __name__ == "__main__":  # lets this test module be run directly as a script
+    unittest.main()