Harden trial measurement accounting

This commit is contained in:
2026-05-06 21:18:09 +08:00
parent 871c4cfc02
commit c1ff64381d
8 changed files with 366 additions and 16 deletions

View File

@@ -9,9 +9,9 @@ from pathlib import Path
from unittest import mock
from aituner.cli import main as cli_main
from aituner.compare import load_compare_spec, run_compare
from aituner.compare import _aggregate_summary, load_compare_spec, run_compare
from aituner.engine import build_launch_recipe
from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy
from aituner.http_client import StreamMetrics, _auth_headers, _openai_url, _should_bypass_proxy
from aituner.job import append_job, build_trial_job
from aituner.harness import (
build_harness_context,
@@ -34,9 +34,11 @@ from aituner.store import StudyStore
from aituner.trace import load_trace_requests, summarize_window
from aituner.worker import (
_latency_summary,
_run_one_request,
_replay_requests,
_terminate_process_tree,
_wait_for_server_or_exit,
run_trial,
)
from aituner.trace import TraceRequest
@@ -863,6 +865,24 @@ class CoreFlowTests(unittest.TestCase):
with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
load_study_spec(study_path)
def test_trace_rejects_non_positive_max_requests_per_probe(self) -> None:
    """A trace spec with max_requests_per_probe == 0 must fail to load."""
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(
            Path(tmp),
            trace_overrides={"max_requests_per_probe": 0},
        )
        # The loader is expected to surface the violated constraint by name.
        with self.assertRaisesRegex(SpecError, "max_requests_per_probe must be > 0"):
            load_study_spec(spec_path)
def test_trace_rejects_invalid_replay_time_scale(self) -> None:
    """A trace spec with a non-positive replay_time_scale must fail to load."""
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(
            Path(tmp),
            trace_overrides={"replay_time_scale": 0.0},
        )
        # The loader is expected to surface the violated constraint by name.
        with self.assertRaisesRegex(SpecError, "replay_time_scale must be > 0"):
            load_study_spec(spec_path)
def test_decode_only_mode_is_loaded_and_prompt_mentions_it(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -1456,6 +1476,34 @@ class CoreFlowTests(unittest.TestCase):
self.assertEqual(requests[2].body["min_tokens"], 1)
self.assertEqual(requests[2].body["max_tokens"], 1)
def test_run_one_request_fails_fixed_length_completion_mismatch(self) -> None:
    """A replayed request must fail when the stream returns fewer completion
    tokens than the trace's fixed-length hint demands."""
    trace_req = TraceRequest(
        row_id="r1",
        arrival_s=0.0,
        sampling_u=0.1,
        body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
        prompt_tokens_hint=8,
        completion_tokens_hint=2,
    )
    # Server produces only 1 token although the hint asked for exactly 2.
    short_stream = StreamMetrics(
        ttft_ms=10.0,
        tpot_ms=5.0,
        completion_tokens=1,
    )
    with mock.patch(
        "aituner.worker.stream_chat_completion",
        return_value=short_stream,
    ):
        outcome = _run_one_request(
            trace_req,
            base_url="http://127.0.0.1:8000",
            timeout_s=1.0,
        )
    self.assertFalse(outcome.success)
    self.assertEqual(outcome.error, "completion_tokens_mismatch expected=2 actual=1")
    self.assertEqual(outcome.completion_tokens, 1)
def test_build_prompt_mentions_completion_tokens_override(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
study_path = _write_study_assets(
@@ -1950,6 +1998,86 @@ class CoreFlowTests(unittest.TestCase):
3.125,
)
def test_run_trial_persists_probe_request_details(self) -> None:
    """run_trial should persist per-request probe outcomes to probe_details.jsonl."""
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        study_path = _write_study_assets(root)
        # Cap the search at one probe so exactly one detail row is written.
        payload = json.loads(study_path.read_text(encoding="utf-8"))
        payload["search"]["max_probes"] = 1
        study_path.write_text(json.dumps(payload), encoding="utf-8")
        study = load_study_spec(study_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=study_path, study=study)
        state = store.load_state(study.study_id)
        proposal = Proposal.from_dict(
            {
                "observation": "baseline",
                "diagnosis": "baseline",
                "config_patch": {"env_patch": {}, "flag_patch": {}},
                "expected_effects": ["measure"],
            }
        )
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)

        def stub_replay(requests, **kwargs):
            # Echo every request back as a successful outcome with fixed latencies.
            outcomes = [
                RequestOutcome(
                    request_id=req.row_id,
                    success=True,
                    ttft_ms=10.0,
                    tpot_ms=5.0,
                    prompt_tokens=req.prompt_tokens_hint,
                    completion_tokens=req.completion_tokens_hint,
                )
                for req in requests
            ]
            return outcomes, False, ""

        fake_process = mock.Mock()
        fake_process.poll.return_value = 0
        # Stub out the server lifecycle so only the accounting path runs.
        with mock.patch("aituner.worker.subprocess.Popen", return_value=fake_process), \
                mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None), \
                mock.patch("aituner.worker._terminate_process_tree", return_value=None), \
                mock.patch("aituner.worker._replay_requests", side_effect=stub_replay):
            result = run_trial(Path(trial.artifact_dir) / "trial_spec.json")
        self.assertEqual(result["status"], "completed")
        details_path = Path(trial.artifact_dir) / "probe_details.jsonl"
        self.assertTrue(details_path.exists())
        rows = [
            json.loads(line)
            for line in details_path.read_text(encoding="utf-8").splitlines()
        ]
        self.assertEqual(len(rows), 1)
        first_row = rows[0]
        self.assertEqual(first_row["threshold"], 0.5)
        self.assertEqual(first_row["outcomes"][0]["request_id"], "r1")
        self.assertEqual(first_row["outcomes"][0]["sampling_u"], 0.1)
def test_materialize_trial_does_not_mutate_input_state_trials(self) -> None:
    """materialize_trial must leave the caller's state untouched and instead
    return a new state carrying the appended trial."""
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        study_path = _write_study_assets(root)
        study = load_study_spec(study_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=study_path, study=study)
        state = store.load_state(study.study_id)
        proposal = Proposal.from_dict(
            {
                "observation": "baseline",
                "diagnosis": "baseline",
                "config_patch": {"env_patch": {}, "flag_patch": {}},
                "expected_effects": ["measure"],
            }
        )
        _, updated_state = store.materialize_trial(
            study=study, state=state, proposal=proposal
        )
        # Input state is unchanged; only the returned state gains the trial.
        self.assertEqual(state.trials, [])
        self.assertEqual(len(updated_state.trials), 1)
def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -2969,6 +3097,44 @@ class CoreFlowTests(unittest.TestCase):
self.assertTrue((tmp_path / ".compare" / "summary.json").exists())
self.assertTrue((tmp_path / ".compare" / "report.md").exists())
def test_compare_aggregate_counts_failed_and_no_feasible_windows(self) -> None:
    """_aggregate_summary tallies completed, failed, and no-feasible windows
    independently for the baseline and tuned sides."""
    # Window 1: baseline wins; tuned completed but found no feasible rate.
    first_window = {
        "baseline": {
            "status": "completed",
            "best_request_rate": 1.0,
            "best_request_rate_per_gpu": 1.0,
        },
        "tuned": {
            "status": "completed",
            "best_request_rate": None,
            "best_request_rate_per_gpu": None,
        },
        "delta": {"winner": "baseline"},
    }
    # Window 2: baseline failed outright; tuned wins.
    second_window = {
        "baseline": {
            "status": "failed",
            "best_request_rate": None,
            "best_request_rate_per_gpu": None,
        },
        "tuned": {
            "status": "completed",
            "best_request_rate": 2.0,
            "best_request_rate_per_gpu": 2.0,
        },
        "delta": {"winner": "tuned"},
    }
    summary = _aggregate_summary([first_window, second_window])
    self.assertEqual(summary["baseline_completed_window_count"], 1)
    self.assertEqual(summary["baseline_failed_window_count"], 1)
    self.assertEqual(summary["baseline_no_feasible_window_count"], 1)
    self.assertEqual(summary["tuned_completed_window_count"], 2)
    self.assertEqual(summary["tuned_failed_window_count"], 0)
    self.assertEqual(summary["tuned_no_feasible_window_count"], 1)
def test_run_compare_resolves_trial_ref_candidate(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)