Add multi-window baseline vs tuned compare flow

2026-04-11 13:51:54 +08:00
parent a0b2d7eab2
commit 5e54e9c8f5
5 changed files with 860 additions and 0 deletions
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -9,6 +9,7 @@ from pathlib import Path
 from unittest import mock

 from aituner.cli import main as cli_main
+from aituner.compare import load_compare_spec, run_compare
 from aituner.engine import build_launch_recipe
 from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy
 from aituner.job import append_job, build_trial_job
@@ -162,6 +163,36 @@ def _write_study_assets(
    return study_path


+def _write_compare_assets(
+    tmp_path: Path,
+    *,
+    study_path: Path,
+    window_ids: list[str] | None = None,
+    window_selector: dict[str, object] | None = None,
+    baseline: dict[str, object] | None = None,
+    tuned: dict[str, object] | None = None,
+) -> Path:
+    """Write ``compare.json`` under *tmp_path* and return its path.
+
+    ``baseline``/``tuned`` fall back to built-in patches only when they are
+    ``None``; the window keys are emitted only when supplied.
+    """
+    empty_patch: dict[str, object] = {"env_patch": {}, "flag_patch": {}}
+    tuned_patch = {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}}
+    payload: dict[str, object] = {
+        "compare_id": "compare-1",
+        "study_spec_path": str(study_path),
+        # ``is None`` (not ``or``) so an explicit empty dict reaches the file.
+        "baseline": {"config_patch": empty_patch} if baseline is None else baseline,
+        "tuned": {"config_patch": tuned_patch} if tuned is None else tuned,
+    }
+    optional = {"window_ids": window_ids, "window_selector": window_selector}
+    payload.update({k: v for k, v in optional.items() if v is not None})
+    compare_path = tmp_path / "compare.json"
+    compare_path.write_text(json.dumps(payload), encoding="utf-8")
+    return compare_path
+
+
 class CoreFlowTests(unittest.TestCase):
    def test_trace_and_prompt_flow(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
@@ -1597,6 +1628,243 @@ class CoreFlowTests(unittest.TestCase):
            self.assertEqual(state.best_request_rate, 2.0)
            self.assertEqual(state.next_trial_index, 3)

+    def test_load_compare_spec_requires_window_selection(self) -> None:
+        """A compare spec lacking both window keys is expected to fail on load.
+
+        The fixture is written without ``window_ids`` and without
+        ``window_selector``; both candidates carry empty patches.
+        """
+        no_op_patch = {"config_patch": {"env_patch": {}, "flag_patch": {}}}
+        with tempfile.TemporaryDirectory() as scratch:
+            root = Path(scratch)
+            # Omitting both window arguments leaves the keys out of the file.
+            spec_file = _write_compare_assets(
+                root,
+                study_path=_write_study_assets(root),
+                baseline=dict(no_op_patch),
+                tuned=dict(no_op_patch),
+            )
+            with self.assertRaisesRegex(SpecError, "window_ids or window_selector"):
+                load_compare_spec(spec_file)
+
+    def test_run_compare_outputs_summary_and_report(self) -> None:
+        """Compare two windows end to end and check the emitted artifacts.
+
+        ``run_trial`` is replaced by a stub that gives ``tuned`` the higher
+        rate in both windows, so two tuned wins are expected.
+        """
+        stub_rates = {
+            ("chat_w1", "baseline"): 1.0,
+            ("chat_w1", "tuned"): 3.0,
+            ("chat_w2", "baseline"): 3.0,
+            ("chat_w2", "tuned"): 7.0,
+        }
+
+        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
+            spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
+            # study_spec_path is read as a text file whose stripped content
+            # is the path of the JSON spec that carries trace.window_id.
+            pointer = Path(spec["study_spec_path"])
+            window_spec = Path(pointer.read_text(encoding="utf-8").strip())
+            study = json.loads(window_spec.read_text(encoding="utf-8"))
+            key = (study["trace"]["window_id"], spec["trial_id"])
+            rate = stub_rates[key]
+            outcome = {
+                "study_id": spec["study_id"],
+                "trial_id": key[1],
+                "status": "completed",
+                "best_sampling_u": 0.5,
+                "best_request_rate": rate,
+                "best_pass_rate": 1.0,
+                "best_request_count": 2,
+                "probes": [],
+            }
+            Path(spec["result_path"]).write_text(json.dumps(outcome), encoding="utf-8")
+            return outcome
+
+        first_window_meta = {
+            "date": "2026-03-11",
+            "slot_token": "1000",
+            "slot_label": "10:00-10:10",
+        }
+        second_window = {
+            "window_id": "chat_w2",
+            "trace_type": "chat",
+            "trace_file": "traces/chat_w2.jsonl",
+            "window_start": 0.0,
+            "window_end": 10.0,
+            "date": "2026-03-12",
+            "slot_token": "1000",
+            "slot_label": "10:00-10:10",
+        }
+        extra_request = {
+            "request_id": "r4",
+            "timestamp": 0.0,
+            "sampling_u": 0.2,
+            "messages": [{"role": "user", "content": "extra"}],
+            "input_length": 3000,
+            "output_length": 32,
+        }
+        with tempfile.TemporaryDirectory() as scratch:
+            root = Path(scratch)
+            study_path = _write_study_assets(root)
+            windows_dir = root / "trace_windows"
+            (windows_dir / "traces" / "chat_w2.jsonl").write_text(
+                json.dumps(extra_request) + "\n", encoding="utf-8"
+            )
+            catalog_file = windows_dir / "windows.json"
+            catalog = json.loads(catalog_file.read_text(encoding="utf-8"))
+            catalog["windows"].append(second_window)
+            catalog["windows"][0].update(first_window_meta)
+            catalog_file.write_text(json.dumps(catalog), encoding="utf-8")
+            compare_path = _write_compare_assets(
+                root, study_path=study_path, window_ids=["chat_w1", "chat_w2"]
+            )
+            out_dir = root / ".compare"
+            with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial):
+                summary = run_compare(compare_path, output_root=out_dir)
+            self.assertEqual(len(summary["windows"]), 2)
+            self.assertEqual(summary["aggregate"]["wins"]["tuned"], 2)
+            self.assertTrue((out_dir / "summary.json").exists())
+            self.assertTrue((out_dir / "report.md").exists())
+
+    def test_run_compare_resolves_trial_ref_candidate(self) -> None:
+        """A ``trial_ref`` baseline is expected to resolve to a prior trial spec.
+
+        The referenced ``trial_spec.json`` carries ``data-parallel-size: 2``;
+        the summary must report the ``trial_ref`` source kind and that flag.
+        """
+
+        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
+            spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
+            flag_patch = (spec["config_patch"] or {}).get("flag_patch") or {}
+            # The rate depends on the referenced flag so the candidates differ.
+            rate = 5.0 if flag_patch.get("data-parallel-size") == 2 else 2.0
+            outcome = {
+                "study_id": spec["study_id"],
+                "trial_id": spec["trial_id"],
+                "status": "completed",
+                "best_sampling_u": 0.5,
+                "best_request_rate": rate,
+                "best_pass_rate": 1.0,
+                "best_request_count": 2,
+                "probes": [],
+            }
+            Path(spec["result_path"]).write_text(json.dumps(outcome), encoding="utf-8")
+            return outcome
+
+        with tempfile.TemporaryDirectory() as scratch:
+            root = Path(scratch)
+            study_path = _write_study_assets(root)
+            prior_root = root / "prior-study"
+            trial_dir = prior_root / "trials" / "trial-0002"
+            trial_dir.mkdir(parents=True)
+            artifact_paths = {
+                "artifact_dir": trial_dir,
+                "probe_log_path": trial_dir / "probe_history.json",
+                "engine_log_path": trial_dir / "engine.log",
+                "result_path": trial_dir / "result.json",
+            }
+            prior_spec = {
+                "study_id": "prior-study",
+                "trial_id": "trial-0002",
+                "config_patch": {"env_patch": {}, "flag_patch": {"data-parallel-size": 2}},
+                "search": {
+                    "low": 0.0,
+                    "high": 1.0,
+                    "tolerance": 0.01,
+                    "max_probes": 8,
+                    "sample_seed": 20260325,
+                },
+                "study_spec_path": str(study_path),
+                **{name: str(path) for name, path in artifact_paths.items()},
+            }
+            (trial_dir / "trial_spec.json").write_text(json.dumps(prior_spec), encoding="utf-8")
+            trial_ref = {"study_root": str(prior_root), "trial_id": "trial-0002"}
+            compare_path = _write_compare_assets(
+                root,
+                study_path=study_path,
+                window_ids=["chat_w1"],
+                baseline={"trial_ref": trial_ref},
+            )
+            with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial):
+                summary = run_compare(compare_path, output_root=root / ".compare")
+            self.assertEqual(summary["baseline_source"]["kind"], "trial_ref")
+            baseline_flags = summary["windows"][0]["baseline"]["config_patch"]["flag_patch"]
+            self.assertEqual(baseline_flags["data-parallel-size"], 2)
+
+    def test_run_compare_window_selector_filters_windows(self) -> None:
+        """``window_selector`` is expected to keep only the matching window.
+
+        The fixture re-dates the first existing window to ``2026-03-11`` and adds
+        ``chat_w2`` and ``thinking_w3`` dated ``2026-03-12``; selecting chat
+        traces with that date prefix should leave ``chat_w2`` alone in the summary.
+        """
+
+        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
+            spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
+            # Every trial reports the same rate; only window selection is checked.
+            outcome = {
+                "study_id": spec["study_id"],
+                "trial_id": spec["trial_id"],
+                "status": "completed",
+                "best_sampling_u": 0.5,
+                "best_request_rate": 1.0,
+                "best_pass_rate": 1.0,
+                "best_request_count": 2,
+                "probes": [],
+            }
+            Path(spec["result_path"]).write_text(json.dumps(outcome), encoding="utf-8")
+            return outcome
+
+        def window_entry(window_id: str, trace_type: str, trace_file: str) -> dict[str, object]:
+            return {
+                "window_id": window_id,
+                "trace_type": trace_type,
+                "trace_file": trace_file,
+                "window_start": 0.0,
+                "window_end": 10.0,
+                "date": "2026-03-12",
+                "slot_token": "1000",
+            }
+
+        new_windows = [
+            window_entry("chat_w2", "chat", "traces/chat_w2.jsonl"),
+            window_entry("thinking_w3", "thinking", "traces/thinking_w3.jsonl"),
+        ]
+        with tempfile.TemporaryDirectory() as scratch:
+            root = Path(scratch)
+            study_path = _write_study_assets(root)
+            windows_dir = root / "trace_windows"
+            # One single-request trace file per added window.
+            for name in ("chat_w2.jsonl", "thinking_w3.jsonl"):
+                request = {
+                    "request_id": name,
+                    "timestamp": 0.0,
+                    "sampling_u": 0.2,
+                    "messages": [{"role": "user", "content": name}],
+                    "input_length": 3000,
+                    "output_length": 32,
+                }
+                trace_file = windows_dir / "traces" / name
+                trace_file.write_text(json.dumps(request) + "\n", encoding="utf-8")
+            catalog_file = windows_dir / "windows.json"
+            catalog = json.loads(catalog_file.read_text(encoding="utf-8"))
+            first_window = catalog["windows"][0]
+            first_window["date"] = "2026-03-11"
+            first_window["slot_token"] = "1000"
+            catalog["windows"].extend(new_windows)
+            catalog_file.write_text(json.dumps(catalog), encoding="utf-8")
+            compare_path = _write_compare_assets(
+                root,
+                study_path=study_path,
+                window_selector={"trace_type": "chat", "date_prefix": "2026-03-12"},
+            )
+            with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial):
+                summary = run_compare(compare_path, output_root=root / ".compare")
+            selected = [row["window_id"] for row in summary["windows"]]
+            self.assertEqual(selected, ["chat_w2"])
+
    def test_proposal_expected_effects_accepts_string(self) -> None:
        proposal = Proposal.from_dict(
            {