from __future__ import annotations

import json
import hashlib
import contextlib
import io
import math
import os
import signal
import subprocess
import tempfile
import unittest
from pathlib import Path
from unittest import mock

from aituner.cli import main as cli_main
from aituner.compare import _aggregate_summary, load_compare_spec, run_compare
from aituner.config_signature import materialized_effective_config_signature
from aituner.engine import build_launch_recipe
from aituner.http_client import (
    HttpClientError,
    StreamMetrics,
    _auth_headers,
    _openai_url,
    _should_bypass_proxy,
    stream_chat_completion,
)
from aituner.job import append_job, build_trial_job
from aituner.harness import (
    _effective_config_signature,
    build_harness_context,
    build_harness_guided_proposal,
    build_harness_stop_proposal,
)
from aituner.lca import (
    build_study_workload_profile,
    build_workload_profile,
    find_convergence_prefix,
    profile_similarity,
    resolve_length_mode,
    similarity_report,
)
from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal
from aituner.search import ThresholdProbe, binary_search_max_feasible
from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
from aituner.spec import (
    AdaptiveStopSpec,
    ConfigPatch,
    LLMEndpointSpec,
    Proposal,
    SloSpec,
    SpecError,
    StudyState,
    TrialSummary,
    load_study_spec,
)
from aituner.store import StudyStore, resolve_auto_high_search
from aituner.trace import load_trace_requests, summarize_window
from aituner.worker import (
    _adaptive_replay_set,
    _probe_drain_deadline,
    _install_sigterm_as_keyboardinterrupt,
    _restore_sigterm,
    _should_extend_on_boundary,
    _best_feasible_probe_record,
    _latency_summary,
    _run_one_request,
    _replay_requests,
    _terminate_process_tree,
    _wait_for_server_or_exit,
    run_trial,
)
from aituner.trace import TraceRequest, WindowRecord


# Parent of the directory that contains this file (``parents[0]`` is the
# containing directory itself).  NOTE(review): presumably the repository
# root, as the name suggests — confirm against the project layout.
REPO_ROOT = Path(__file__).resolve().parents[1]


def _write_study_assets(
    tmp_path: Path,
    *,
    trace_overrides: dict[str, object] | None = None,
    slo_overrides: dict[str, object] | None = None,
    engine_overrides: dict[str, object] | None = None,
    search_overrides: dict[str, object] | None = None,
) -> Path:
    """Write a self-contained study fixture under ``tmp_path``.

    The following files are created (and overwritten when they already
    exist, so the helper may be called repeatedly on the same directory):

    * ``trace_windows/traces/chat_w1.jsonl`` -- three request rows,
    * ``trace_windows/windows.json`` -- a single window ``chat_w1``,
    * ``capability.json`` -- a small capability profile,
    * ``study.json`` -- the study spec that points at the files above.

    Args:
        tmp_path: Directory that receives the fixture files.
        trace_overrides: Keys shallow-merged into the ``trace`` section.
        slo_overrides: Keys shallow-merged into the ``slo`` section.
        engine_overrides: Keys shallow-merged into the ``engine`` section.
        search_overrides: Keys shallow-merged into the ``search`` section.

    Returns:
        Path of the written ``study.json``.
    """
    trace_dir = tmp_path / "trace_windows" / "traces"
    # exist_ok keeps the helper idempotent when a test builds several study
    # variants inside one temporary directory.
    trace_dir.mkdir(parents=True, exist_ok=True)
    trace_path = trace_dir / "chat_w1.jsonl"
    rows = [
        {
            "request_id": "r1",
            "timestamp": 0.0,
            "sampling_u": 0.10,
            "messages": [{"role": "user", "content": "hello"}],
            "input_length": 1000,
            "output_length": 16,
        },
        {
            "request_id": "r2",
            "timestamp": 1.0,
            "sampling_u": 0.50,
            "messages": [{"role": "user", "content": "world"}],
            "input_length": 5000,
            "output_length": 32,
        },
        {
            "request_id": "r3",
            "timestamp": 2.0,
            "sampling_u": 0.90,
            "messages": [{"role": "user", "content": "!"}],
            "input_length": 20000,
            "output_length": 64,
        },
    ]
    # One JSON document per line (JSONL), each line newline-terminated.
    with trace_path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row) + "\n" for row in rows)

    windows_path = tmp_path / "trace_windows" / "windows.json"
    windows_payload = {
        "u_field": "sampling_u",
        "windows": [
            {
                "window_id": "chat_w1",
                "trace_type": "chat",
                "trace_file": "traces/chat_w1.jsonl",
                "window_start": 0.0,
                "window_end": 10.0,
            }
        ],
    }
    windows_path.write_text(json.dumps(windows_payload), encoding="utf-8")

    capability_path = tmp_path / "capability.json"
    capability_path.write_text(
        json.dumps({"prefill_service_by_bucket": {"4k": {"tp4_ms": 320, "tp8_ms": 240}}}),
        encoding="utf-8",
    )

    trace_payload: dict[str, object] = {
        "windows_path": str(windows_path),
        "window_id": "chat_w1",
        "u_field": "sampling_u",
        "timestamp_field": "timestamp",
        "max_concurrency": 4,
    }
    if trace_overrides:
        trace_payload.update(trace_overrides)

    study_payload = {
        "study_id": "study-1",
        "hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
        "model": {
            "model_id": "qwen",
            "served_model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507",
        },
        "engine": {
            "engine_name": "vllm",
            "engine_version": "0.1",
            "exec_path": "/usr/local/bin/vllm",
            "cwd": str(tmp_path),
            "host": "127.0.0.1",
            "port": 8000,
            "healthcheck_path": "/v1/models",
            "ready_timeout_s": 30,
            "request_timeout_s": 30,
            "launch_args": ["serve", "/models/qwen"],
            "base_envs": {"BASE_ENV": "1"},
            "base_flags": {"host": "127.0.0.1", "port": 8000},
            "tunable_envs": ["VLLM_ATTENTION_BACKEND"],
            "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
            "python_executable": "python3",
        },
        "trace": trace_payload,
        "slo": {
            "target_pass_rate": 0.95,
            "ttft_rule": {
                "kind": "step_ms",
                "buckets": [
                    {"max_input_tokens": 4096, "threshold_ms": 2000},
                    {"max_input_tokens": 16384, "threshold_ms": 5000},
                    {"threshold_ms": 9000},
                ],
            },
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 120},
        },
        "search": {
            "low": 0.0,
            "high": 1.0,
            "tolerance": 0.01,
            "max_probes": 8,
            "sample_seed": 20260325,
        },
        "llm": {"system_prompt": "Tune it.", "max_history_trials": 8},
        "capability_profile_path": str(capability_path),
    }
    # Overrides replace whole top-level keys of a section (shallow merge);
    # nested dicts such as ``base_flags`` are swapped, not merged.
    section_overrides = (
        ("slo", slo_overrides),
        ("engine", engine_overrides),
        ("search", search_overrides),
    )
    for section, overrides in section_overrides:
        if overrides:
            study_payload[section].update(overrides)

    study_path = tmp_path / "study.json"
    study_path.write_text(json.dumps(study_payload), encoding="utf-8")
    return study_path


def _write_compare_assets(
    tmp_path: Path,
    *,
    study_path: Path,
    window_ids: list[str] | None = None,
    window_selector: dict[str, object] | None = None,
    baseline: dict[str, object] | None = None,
    tuned: dict[str, object] | None = None,
) -> Path:
    """Write ``compare.json`` under ``tmp_path`` and return its path.

    ``baseline`` and ``tuned`` fall back to built-in defaults when omitted
    or empty (any falsy value); ``window_ids`` and ``window_selector`` are
    written only when they are not ``None``.
    """
    default_baseline = {"config_patch": {"env_patch": {}, "flag_patch": {}}}
    default_tuned = {
        "config_patch": {
            "env_patch": {},
            "flag_patch": {"tensor-parallel-size": 2},
        }
    }
    spec: dict[str, object] = {
        "compare_id": "compare-1",
        "study_spec_path": str(study_path),
        "baseline": baseline if baseline else default_baseline,
        "tuned": tuned if tuned else default_tuned,
    }
    # Optional selectors are emitted in a fixed order and only when supplied.
    optional_fields = (
        ("window_ids", window_ids),
        ("window_selector", window_selector),
    )
    for key, value in optional_fields:
        if value is not None:
            spec[key] = value

    target = tmp_path / "compare.json"
    target.write_text(json.dumps(spec), encoding="utf-8")
    return target


class CoreFlowTests(unittest.TestCase):
    def test_trace_and_prompt_flow(self) -> None:
        # End-to-end smoke test: write the fixture study, initialise the
        # store, load and summarise the trace window, then render the prompt.
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)
            spec_path = _write_study_assets(root)
            study = load_study_spec(spec_path)
            store = StudyStore(root / ".aituner" / "studies")
            study_root = store.init_study(spec_path=spec_path, study=study)
            state = store.load_state(study.study_id)

            window, requests = load_trace_requests(study, study_spec_path=spec_path)
            summary = summarize_window(requests, window)
            # The fixture holds three rows inside a window spanning 0.0-10.0.
            self.assertEqual(summary["request_count"], 3)
            self.assertEqual(summary["request_rate"], 0.3)

            prompt = build_prompt(
                study=study,
                window_summary=summary,
                state=state,
                capability_profile={"queueing_knee_by_bucket": {"4k": 1000}},
            )
            expected_fragments = (
                "allowed_flag_keys",
                "study-1",
                '"current_best"',
                "queueing_knee_by_bucket",
                "Harnesses:",
                "workload_lca_profile",
                "knob_harnesses",
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, prompt)
            self.assertTrue(study_root.exists())

    def test_search_auto_high_schema_is_backward_compatible(self) -> None:
        # A spec written without an ``auto_high`` section must load with the
        # feature disabled and keep its configured ``high`` untouched.
        with tempfile.TemporaryDirectory() as scratch:
            spec_path = _write_study_assets(
                Path(scratch),
                search_overrides={"high": 0.4},
            )
            search = load_study_spec(spec_path).search
            self.assertFalse(search.auto_high.enabled)

            resolved, evidence = resolve_auto_high_search(
                search=search,
                sampling_us=[0.1, 0.9],
            )
            self.assertEqual(resolved.high, 0.4)
            self.assertEqual(evidence["reason"], "auto_high_disabled")

    def test_search_auto_high_caps_at_policy_and_trace(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            study_path = _write_study_assets(
                Path(tmp),
                search_overrides={
                    "high": 0.2,
                    "auto_high": {
                        "enabled": True,
                        "max_sampling_u": 0.8,
                        "require_human_confirmation_beyond_trace": True,
                    },
                },
            )
            study = load_study_spec(study_path)
            capped_by_policy, policy_evidence = resolve_auto_high_search(
                search=study.search,
                sampling_us=[0.1, 0.9],
            )
            self.assertEqual(capped_by_policy.high, 0.8)
            self.assertEqual(
                policy_evidence["reason"],
                "search_high_raised_to_trace_ceiling",
            )

            capped_by_trace, trace_evidence = resolve_auto_high_search(
                search=study.search,
                sampling_us=[0.1, 0.7],
            )
            self.assertEqual(capped_by_trace.high, 0.7)
            self.assertEqual(trace_evidence["effective_ceiling"], 0.7)

            low_above_ceiling = study.search.__class__.from_dict(
                {
                    "low": 0.9,
                    "high": 0.95,
                    "tolerance": study.search.tolerance,
                    "max_probes": study.search.max_probes,
                    "sample_seed": study.search.sample_seed,
                    "auto_high": {
                        "enabled": True,
                        "max_sampling_u": 0.8,
                        "require_human_confirmation_beyond_trace": True,
                    },
                }
            )
            unchanged, invalid_evidence = resolve_auto_high_search(
                search=low_above_ceiling,
                sampling_us=[0.1, 0.9],
            )
            self.assertEqual(unchanged.low, 0.9)
            self.assertEqual(unchanged.high, 0.95)
            self.assertEqual(
                invalid_evidence["reason"],
                "auto_high_ceiling_below_search_low",
            )

            high_search = study.search.__class__.from_dict(
                {
                    "low": 0.0,
                    "high": 0.95,
                    "tolerance": study.search.tolerance,
                    "max_probes": study.search.max_probes,
                    "sample_seed": study.search.sample_seed,
                    "auto_high": {
                        "enabled": True,
                        "max_sampling_u": 0.8,
                        "require_human_confirmation_beyond_trace": True,
                    },
                }
            )
            lowered, lowered_evidence = resolve_auto_high_search(
                search=high_search,
                sampling_us=[0.1, 0.9],
            )
            self.assertEqual(lowered.high, 0.8)
            self.assertEqual(
                lowered_evidence["reason"],
                "search_high_lowered_to_trace_ceiling",
            )

    def test_effective_config_signature_treats_noop_patch_as_baseline(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            study_path = _write_study_assets(
                Path(tmp),
                engine_overrides={
                    "base_flags": {
                        "host": "127.0.0.1",
                        "port": 8000,
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                        "gpu-memory-utilization": 0.5,
                        "max-num-seqs": 8,
                    },
                },
            )
            study = load_study_spec(study_path)

            baseline = _effective_config_signature(study, {"env_patch": {}, "flag_patch": {}})
            noop_tp = _effective_config_signature(
                study,
                {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 8}},
            )
            noop_tp_string = _effective_config_signature(
                study,
                {"env_patch": {}, "flag_patch": {"tensor-parallel-size": "8"}},
            )
            changed_tp = _effective_config_signature(
                study,
                {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
            )

            self.assertEqual(baseline, noop_tp)
            self.assertEqual(baseline, noop_tp_string)
            self.assertNotEqual(baseline, changed_tp)

    def test_materialized_signature_inherits_incumbent_topology(self) -> None:
        # A proposal that only sets max-num-seqs must yield the same
        # materialized signature as a proposal spelling out the incumbent
        # trial's full flag patch (here with string-typed values).
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "max-num-seqs": 64,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2, 4, 8],
                "allowed_tp_dp_products": [1, 2, 4, 8],
            },
        }
        with tempfile.TemporaryDirectory() as scratch:
            spec_path = _write_study_assets(
                Path(scratch),
                engine_overrides=engine_overrides,
            )
            study = load_study_spec(spec_path)

            # The incumbent (best) trial runs on TP=2 / DP=4.
            incumbent = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=8,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "data-parallel-size": 4,
                        "max-num-seqs": 160,
                    },
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                trials=[incumbent],
            )
            runtime_only = Proposal.from_dict(
                {
                    "observation": "Try the same runtime cap.",
                    "diagnosis": "This should materialize on incumbent topology.",
                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}},
                    "expected_effects": ["no-op after topology inheritance"],
                }
            )
            explicit = Proposal.from_dict(
                {
                    "observation": "Explicit duplicate.",
                    "diagnosis": "Same effective execution config.",
                    "config_patch": {
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": "2",
                            "data-parallel-size": "4",
                            "max-num-seqs": "160",
                        },
                    },
                    "expected_effects": ["same config"],
                }
            )

            runtime_signature, explicit_signature = (
                materialized_effective_config_signature(
                    study=study,
                    state=state,
                    proposal=candidate,
                )
                for candidate in (runtime_only, explicit)
            )
            self.assertEqual(runtime_signature, explicit_signature)

    def test_lca_workload_profile_uses_standard_10d_features(self) -> None:
        # Two 100-token prompts (10 and 20 completion tokens) whose hash_ids
        # share their first entry; block_size is 64 and the window is 4 s wide.
        window = WindowRecord(
            window_id="w1",
            trace_path=Path("trace.jsonl"),
            trace_type="chat",
            window_start=0.0,
            window_end=4.0,
            source_payload={"block_size": 64},
        )
        # (row_id, arrival_s, completion_tokens_hint, hash_ids)
        rows = (
            ("r1", 0.0, 10, [1, 2]),
            ("r2", 1.0, 20, [1, 3]),
        )
        requests = [
            TraceRequest(
                row_id=row_id,
                arrival_s=arrival_s,
                sampling_u=1.0,
                body={},
                prompt_tokens_hint=100,
                completion_tokens_hint=completion_tokens,
                metadata={"hash_ids": hash_ids},
            )
            for row_id, arrival_s, completion_tokens, hash_ids in rows
        ]

        profile = build_workload_profile(
            requests,
            window,
            gpu_count=2,
            length_mode="total",
        )

        # Shape of the feature vector.
        self.assertEqual(len(profile.feature_names), 10)
        self.assertEqual(len(profile.vector), 10)
        self.assertEqual(profile.feature_names[0], "L.log_mean_length")

        # Cache statistics: 64 hit tokens against 230 total / 200 input tokens.
        cache_stats = profile.stats["cache"]
        self.assertAlmostEqual(cache_stats["total_hit_length"], 64.0)
        self.assertAlmostEqual(cache_stats["hit_rate"], 64.0 / 230.0)
        self.assertAlmostEqual(cache_stats["input_hit_rate"], 64.0 / 200.0)
        self.assertAlmostEqual(profile.vector[3], math.log1p(32.0))
        self.assertAlmostEqual(profile.vector[5], 1.0)

        # Arrival statistics: 2 requests / 4 s / 2 GPUs = 0.25.
        arrival_stats = profile.stats["arrival"]
        self.assertAlmostEqual(arrival_stats["request_rate_per_gpu"], 0.25)
        self.assertAlmostEqual(arrival_stats["fano_1s"], 0.5)

        self.assertEqual(resolve_length_mode(request_mode="decode_only"), "output")

    def test_harness_context_uses_canonical_lca_vector(self) -> None:
        with tempfile.TemporaryDirectory() as scratch:
            spec_path = _write_study_assets(Path(scratch))
            study = load_study_spec(spec_path)
            window, requests = load_trace_requests(study, study_spec_path=spec_path)
            profile = build_study_workload_profile(study, requests, window)
            state = StudyState(study_id=study.study_id, trials=[])
            summary = summarize_window(requests, window)

            with_profile = build_harness_context(
                study=study,
                window_summary=summary,
                state=state,
                workload_profile=profile,
            )
            lca_block = with_profile["workload_lca_profile"]
            # With a profile supplied, the labeled L-C-A block carries the
            # profile's own 10-dim vector rather than an ad-hoc rendering.
            self.assertEqual(lca_block["vector"], profile.vector)
            self.assertEqual(len(lca_block["vector"]), 10)
            self.assertIn("RobustScaler", lca_block["metric"])

            # With no profile supplied, the block is the legacy rendering,
            # which has no "vector" entry.
            without_profile = build_harness_context(
                study=study,
                window_summary=summary,
                state=state,
            )
            legacy_block = without_profile["workload_lca_profile"]
            self.assertNotIn("vector", legacy_block)

    def _steady_requests(self, count: int, *, input_tokens: int = 100) -> list:
        # Build ``count`` identical requests arriving one second apart
        # (arrival_s = 0.0, 1.0, ...), each with ``input_tokens`` prompt
        # tokens, 16 completion tokens and no hash_ids.
        steady = []
        for index in range(count):
            request = TraceRequest(
                row_id=f"r{index}",
                arrival_s=float(index),
                sampling_u=1.0,
                body={},
                prompt_tokens_hint=input_tokens,
                completion_tokens_hint=16,
                metadata={"hash_ids": None},
            )
            steady.append(request)
        return steady

    def _conv_window(self) -> WindowRecord:
        # Window record shared by the convergence tests; start and end are
        # both 0.0 and the payload declares a block_size of 64.
        window = WindowRecord(
            window_id="conv",
            trace_path=Path("trace.jsonl"),
            trace_type="chat",
            window_start=0.0,
            window_end=0.0,
            source_payload={"block_size": 64},
        )
        return window

    def test_convergence_prefix_stops_early_on_stationary_trace(self) -> None:
        trace = self._steady_requests(60)
        window = self._conv_window()
        result = find_convergence_prefix(
            trace,
            window,
            gpu_count=1,
            length_mode="total",
            tau=0.9,
            tau_c=0.9,
            stable_checks=3,
            max_checks=20,
            min_fraction=0.1,
        )
        self.assertTrue(result.converged)
        # Sixty identical, evenly spaced requests: convergence is expected
        # strictly before the whole trace has been consumed.
        self.assertLess(result.stop_index, len(trace))
        self.assertLess(result.fraction, 1.0)
        self.assertTrue(result.checks)

    def test_convergence_prefix_waits_when_cache_warms_late(self) -> None:
        window = self._conv_window()

        def request(row_id: str, arrival_s: float, hash_ids: list) -> TraceRequest:
            return TraceRequest(
                row_id=row_id,
                arrival_s=arrival_s,
                sampling_u=1.0,
                body={},
                prompt_tokens_hint=640,
                completion_tokens_hint=16,
                metadata={"hash_ids": hash_ids},
            )

        # First half: every request has its own unique hash id, so nothing is
        # reused.  Second half: every request carries the same hash_ids
        # [1..5], so the C dimension only stabilizes once that reuse regime
        # has been exercised.
        cold_half = [
            request(f"cold{i}", float(i), [10_000 + i]) for i in range(30)
        ]
        warm_half = [
            request(f"warm{i}", float(30 + i), [1, 2, 3, 4, 5]) for i in range(30)
        ]
        requests = cold_half + warm_half

        point = find_convergence_prefix(
            requests,
            window,
            gpu_count=1,
            length_mode="total",
            tau=0.9,
            tau_c=0.95,
            stable_checks=2,
            max_checks=20,
            min_fraction=0.1,
        )
        # At least one check taken within the first 40% of the trace must
        # report a C-family similarity below 0.9.
        early_checks = [
            check for check in point.checks if check["fraction"] <= 0.4
        ]
        self.assertTrue(early_checks)
        low_c_seen = any(
            check["family_similarity"]["C"] < 0.9 for check in early_checks
        )
        self.assertTrue(low_c_seen)

    def test_stop_authority_mirrors_validator_and_blocks_fresh_stop(self) -> None:
        with tempfile.TemporaryDirectory() as scratch:
            spec_path = _write_study_assets(Path(scratch))
            study = load_study_spec(spec_path)
            fresh_state = StudyState(study_id=study.study_id, trials=[])
            context = build_harness_context(
                study=study,
                window_summary={},
                state=fresh_state,
            )
            stop_authority = context["stop_authority"]
            validator_verdict = context["harness_stop"]["should_stop"]
            # The authority must agree with the harness_stop verdict, and a
            # study with no trials at all must not be authorized to stop.
            self.assertEqual(stop_authority["authorized"], validator_verdict)
            self.assertFalse(stop_authority["authorized"])

    def test_adaptive_replay_set_truncates_only_when_enabled(self) -> None:
        from types import SimpleNamespace

        def study_stub(adaptive_stop: AdaptiveStopSpec) -> SimpleNamespace:
            # Minimal stand-in for a study: just the trace and hardware
            # attributes this test populates.
            return SimpleNamespace(
                trace=SimpleNamespace(
                    adaptive_stop=adaptive_stop,
                    request_mode="chat",
                ),
                hardware=SimpleNamespace(gpu_count=1),
            )

        requests = self._steady_requests(60)
        window = self._conv_window()

        # Enabled: a certificate is returned and the replay set has exactly
        # ``stop_index`` requests.
        enabled_study = study_stub(
            AdaptiveStopSpec(
                enabled=True,
                tau=0.9,
                tau_c=0.9,
                stable_checks=3,
                max_checks=20,
                min_fraction=0.1,
            )
        )
        replay, certificate = _adaptive_replay_set(
            requests, study=enabled_study, window=window
        )
        self.assertIsNotNone(certificate)
        self.assertTrue(certificate["enabled"])
        self.assertEqual(len(replay), certificate["stop_index"])
        self.assertLessEqual(len(replay), len(requests))

        # Disabled: no certificate and the request count is unchanged.
        disabled_study = study_stub(AdaptiveStopSpec(enabled=False))
        passthrough, no_cert = _adaptive_replay_set(
            requests, study=disabled_study, window=window
        )
        self.assertIsNone(no_cert)
        self.assertEqual(len(passthrough), len(requests))

    def test_boundary_guard_extends_only_near_the_slo_knee(self) -> None:
        converged = {"converged": True}

        def extends(
            pass_rate: float,
            *,
            certificate: object = converged,
            truncated: bool = True,
            boundary_delta: float = 0.02,
        ):
            # Target pass rate is fixed at 0.95 for every case below.
            return _should_extend_on_boundary(
                pass_rate=pass_rate,
                target_pass_rate=0.95,
                certificate=certificate,
                truncated=truncated,
                boundary_delta=boundary_delta,
            )

        # Truncated, converged, pass-rate within 0.02 of the target
        # -> re-measure on the full set.
        for near_knee in (0.961, 0.946):
            self.assertTrue(extends(near_knee))

        # Clearly feasible / clearly infeasible -> trust the truncated verdict.
        for far_from_knee in (0.99, 0.50):
            self.assertFalse(extends(far_from_knee))

        # Pass-rate exactly on target, but one precondition is missing each
        # time: not truncated, not converged, guard disabled (delta 0.0),
        # no certificate.
        blocked_cases = (
            {"truncated": False},
            {"certificate": {"converged": False}},
            {"boundary_delta": 0.0},
            {"certificate": None},
        )
        for overrides in blocked_cases:
            self.assertFalse(extends(0.95, **overrides))

    def test_probe_drain_deadline_tracks_admitted_set_and_caps_at_ceiling(self) -> None:
        slo = SloSpec.from_dict(
            {
                "target_pass_rate": 0.95,
                "ttft_rule": {"kind": "linear_ms", "intercept_ms": 4000, "per_token_ms": 0.125},
                "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
            }
        )

        def make_request(arrival_s: float, in_tok: int, out_tok: int) -> TraceRequest:
            return TraceRequest(
                row_id="r",
                arrival_s=arrival_s,
                sampling_u=0.1,
                body={},
                prompt_tokens_hint=in_tok,
                completion_tokens_hint=out_tok,
                metadata={},
            )

        # 100 requests arriving every 5 s (last arrival 495 s), each with
        # 8000 input / 2000 output tokens.
        admitted = [make_request(float(i * 5), 8000, 2000) for i in range(100)]

        # deadline = last_arrival + (ttft_ms + p99_out*tpot_ms)/1000 + margin
        #          = 495 + (5000 + 2000*50)/1000 + 30 = 495 + 105 + 30 = 630
        uncapped = _probe_drain_deadline(admitted, slo, ceiling=1000.0)
        self.assertAlmostEqual(uncapped, 630.0, places=3)

        # A ceiling below the computed deadline caps the result.
        capped = _probe_drain_deadline(admitted, slo, ceiling=400.0)
        self.assertEqual(capped, 400.0)

        # An empty request list falls back to the ceiling.
        empty = _probe_drain_deadline([], slo, ceiling=400.0)
        self.assertEqual(empty, 400.0)

    def test_linear_ms_ttft_rule_scales_with_input_length(self) -> None:
        slo = SloSpec.from_dict(
            {
                "target_pass_rate": 0.95,
                "ttft_rule": {"kind": "linear_ms", "intercept_ms": 4000, "per_token_ms": 0.125},
                "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
            }
        )

        def passes(prompt_tokens: int, ttft_ms: float):
            # TPOT is held at 10 ms (under the 50 ms rule), so only the TTFT
            # value varies between cases.
            outcome = RequestOutcome(
                request_id="r",
                success=True,
                ttft_ms=ttft_ms,
                tpot_ms=10.0,
                prompt_tokens=prompt_tokens,
                completion_tokens=8,
            )
            return evaluate_request(outcome, slo).passed

        # threshold = 4000 + 0.125*L_in : 8k->5000ms, 0->4000ms
        # (prompt_tokens, ttft_ms, expected verdict)
        cases = (
            (8000, 4900, True),
            (8000, 5100, False),
            (0, 3900, True),
            (0, 4100, False),
        )
        for prompt_tokens, ttft_ms, expected in cases:
            verdict = passes(prompt_tokens, ttft_ms)
            if expected:
                self.assertTrue(verdict)
            else:
                self.assertFalse(verdict)

    def test_streaming_socket_timeout_is_a_failed_request_not_a_crash(self) -> None:
        # A request that exceeds request_timeout_s raises TimeoutError mid-stream;
        # it must surface as HttpClientError (a failed request), never escape to
        # crash the trial.
        with mock.patch(
            "aituner.http_client._urlopen", side_effect=TimeoutError("timed out")
        ):
            with self.assertRaises(HttpClientError):
                stream_chat_completion(
                    base_url="http://127.0.0.1:1/v1",
                    body={"messages": [{"role": "user", "content": "hi"}], "stream": True},
                    timeout_s=0.5,
                )
            outcome = _run_one_request(
                TraceRequest(
                    row_id="r",
                    arrival_s=0.0,
                    sampling_u=1.0,
                    body={"messages": [{"role": "user", "content": "hi"}], "stream": True},
                    prompt_tokens_hint=10,
                    completion_tokens_hint=None,
                ),
                base_url="http://127.0.0.1:1/v1",
                timeout_s=0.5,
            )
        self.assertFalse(outcome.success)
        self.assertIn("timed out", outcome.error)

    def test_sigterm_is_converted_to_keyboardinterrupt(self) -> None:
        # So a killed `study tune` runs the engine-teardown finally instead of
        # orphaning the vLLM EngineCore workers on the GPUs.
        saved_handler = _install_sigterm_as_keyboardinterrupt()
        try:
            # Delivering SIGTERM to ourselves must surface as KeyboardInterrupt.
            with self.assertRaises(KeyboardInterrupt):
                signal.raise_signal(signal.SIGTERM)
        finally:
            # Always put the previous handler back so later tests are unaffected.
            _restore_sigterm(saved_handler)

    def test_lca_similarity_matrix_separates_different_profiles(self) -> None:
        # Template window: every profile reuses its fields but gets its own id.
        template = WindowRecord(
            window_id="base",
            trace_path=Path("trace.jsonl"),
            trace_type="chat",
            window_start=0.0,
            window_end=4.0,
            source_payload={"block_size": 64},
        )

        def make_profile(window_id: str, input_tokens: int, *, arrival_gap: float) -> object:
            # Two requests per profile: the second arrives `arrival_gap` later
            # and its hash_ids extend the first request's hash_ids by one entry.
            request_plan = (
                (1, 0.0, [window_id, 1]),
                (2, arrival_gap, [window_id, 1, 2]),
            )
            requests = [
                TraceRequest(
                    row_id=f"{window_id}-{ordinal}",
                    arrival_s=arrival,
                    sampling_u=1.0,
                    body={},
                    prompt_tokens_hint=input_tokens,
                    completion_tokens_hint=16,
                    metadata={"hash_ids": hash_ids},
                )
                for ordinal, arrival, hash_ids in request_plan
            ]
            profile_window = WindowRecord(
                window_id=window_id,
                trace_path=template.trace_path,
                trace_type=template.trace_type,
                window_start=template.window_start,
                window_end=template.window_end,
                source_payload=template.source_payload,
            )
            return build_workload_profile(
                requests,
                profile_window,
                gpu_count=1,
                length_mode="total",
            )

        twin_a = make_profile("same-a", 100, arrival_gap=1.0)
        twin_b = make_profile("same-b", 100, arrival_gap=1.0)
        outlier = make_profile("different", 10000, arrival_gap=0.1)

        report = similarity_report([twin_a, twin_b, outlier])

        # Identical shapes score 1.0; the outlier must score lower than the twin.
        self.assertAlmostEqual(profile_similarity(twin_a, twin_b), 1.0)
        first_row = report["matrix"][0]
        self.assertGreater(first_row[1], first_row[2])
        self.assertIn("L", report["pairs"][2]["family_similarity"])

    def test_cli_profile_window_outputs_lca_profile(self) -> None:
        # `profile window` should print a JSON document whose "profile" entry
        # carries the window id, a 10-element vector and the requested GPU count.
        with tempfile.TemporaryDirectory() as tmp:
            spec_path = _write_study_assets(Path(tmp))
            argv = [
                "profile",
                "window",
                "--spec",
                str(spec_path),
                "--gpu-count",
                "8",
            ]
            captured = io.StringIO()
            with mock.patch("sys.stdout", captured):
                exit_code = cli_main(argv)

            self.assertEqual(exit_code, 0)
            document = json.loads(captured.getvalue())
            self.assertEqual(document["profile"]["window_id"], "chat_w1")
            self.assertEqual(len(document["profile"]["vector"]), 10)
            self.assertEqual(document["profile"]["gpu_count"], 8)

    def test_cli_profile_window_does_not_resolve_llm_endpoint(self) -> None:
        # The spec's LLM endpoint is replaced with one that only names a
        # provider and model; `profile window` must still succeed.
        with tempfile.TemporaryDirectory() as tmp:
            spec_path = _write_study_assets(Path(tmp))

            spec = json.loads(spec_path.read_text(encoding="utf-8"))
            spec["llm"]["endpoint"] = {"provider": "codex", "model": "gpt-5.4"}
            spec_path.write_text(json.dumps(spec), encoding="utf-8")

            captured = io.StringIO()
            with mock.patch("sys.stdout", captured):
                exit_code = cli_main(["profile", "window", "--spec", str(spec_path)])

            self.assertEqual(exit_code, 0)
            printed = json.loads(captured.getvalue())
            self.assertEqual(printed["profile"]["window_id"], "chat_w1")

    def test_harness_uses_latency_failures_before_generic_unrecoverable(self) -> None:
        # The only probe was early-stopped with the generic
        # "slo_pass_rate_unrecoverable" reason, but its failure counts include
        # TTFT violations; the diagnosed bottleneck is expected to be
        # "ttft_prefill".
        latency_summary = {
            "failed_reason_counts": {
                "ttft_ms>5000.0": 70,
                "tpot_ms>50.0": 5,
                "probe_elapsed_s>240.0": 100,
            },
            "ttft_ms": {"p95": 6500.0, "p99": 7200.0},
        }
        infeasible_probe = {
            "threshold": 0.25,
            "feasible": False,
            "payload": {
                "request_count": 100,
                "pass_rate": 0.3,
                "request_rate": 1.0,
                "early_stopped": True,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": latency_summary,
            },
        }
        trial_result = {"status": "completed", "probes": [infeasible_probe]}

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(_write_study_assets(workdir))

            result_file = workdir / "trial-result.json"
            result_file.write_text(json.dumps(trial_result), encoding="utf-8")

            only_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                result_path=str(result_file),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            context = build_harness_context(
                study=study,
                window_summary={
                    "prompt_tokens_p95": 5000,
                    "prompt_tail_ratio_p95_p50": 3.0,
                },
                state=StudyState(study_id=study.study_id, trials=[only_trial]),
            )

            latest_diagnostic = context["recent_trial_diagnostics"][0]
            self.assertEqual(latest_diagnostic["active_bottleneck"], "ttft_prefill")

    def test_harness_blocks_repeating_infeasible_plateau_family(self) -> None:
        # Two all-infeasible trials that differ only in data-parallel-size
        # (4, then 8) report the same pass rate and nearly the same TTFT p95.
        # The guard should flag the plateau and block that flag family, but
        # must not yet issue a deterministic stop.
        topology_overrides = {
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_data_parallel_sizes": [1, 2, 4, 8],
                "allowed_expert_parallel_sizes": [1],
                "allowed_tp_dp_products": [1, 2, 4, 8],
            },
        }
        # (trial number, data-parallel-size, pass rate, TTFT p95 in ms)
        plateau_runs = (
            (3, 4, 0.345, 3818.4),
            (4, 8, 0.345, 3823.4),
        )

        def infeasible_result(pass_rate: float, ttft_p95: float) -> dict:
            # Result document for a trial in which no probe was feasible.
            return {
                "status": "completed",
                "best_request_rate": None,
                "all_infeasible_diagnostics": {
                    "threshold": 0.0078125,
                    "request_count": 148,
                    "request_rate": 0.22,
                    "pass_rate": pass_rate,
                    "early_stopped": True,
                    "early_stop_reason": "elapsed",
                    "latency_summary": {
                        "failed_reason_counts": {"ttft_ms>5000.0": 97},
                        "ttft_ms": {"p95": ttft_p95, "p99": 5800.0},
                    },
                },
            }

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(
                _write_study_assets(workdir, engine_overrides=topology_overrides)
            )

            history = []
            for number, dp_size, pass_rate, ttft_p95 in plateau_runs:
                trial_id = f"trial-{number:04d}"
                result_file = workdir / f"{trial_id}.json"
                result_file.write_text(
                    json.dumps(infeasible_result(pass_rate, ttft_p95)),
                    encoding="utf-8",
                )
                history.append(
                    TrialSummary(
                        trial_id=trial_id,
                        status="completed",
                        result_path=str(result_file),
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {
                                "tensor-parallel-size": 1,
                                "data-parallel-size": dp_size,
                                "expert-parallel-size": 1,
                            },
                        },
                    )
                )

            context = build_harness_context(
                study=study,
                window_summary={
                    "prompt_tokens_p95": 7628,
                    "prompt_tail_ratio_p95_p50": 3.83,
                },
                state=StudyState(study_id=study.study_id, trials=history),
            )

            convergence = context["convergence_guard"]
            progress = convergence["infeasible_progress"]
            self.assertTrue(progress["plateau_detected"])
            self.assertTrue(progress["stop_if_next_probe_repeats_family"])
            self.assertEqual(progress["blocked_primary_family"], "data-parallel-size")
            self.assertTrue(
                convergence["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]
            )
            # Advisory only: nothing forces a stop at this point.
            self.assertFalse(convergence["deterministic_stop"])
            self.assertFalse(context["harness_stop"]["should_stop"])
            self.assertIsNone(build_harness_stop_proposal(context))

    def test_harness_strong_incumbent_guard_after_large_gain(self) -> None:
        # trial-0002 reaches 0.21 req/s/GPU against a 0.035 baseline. The
        # strong-incumbent guard should activate and ask for validation probes
        # rather than recommend stopping.
        with tempfile.TemporaryDirectory() as tmp:
            study = load_study_spec(_write_study_assets(Path(tmp)))

            baseline = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=1,
                best_request_rate=0.035,
                best_request_rate_per_gpu=0.035,
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            incumbent = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=2,
                best_request_rate=0.42,
                best_request_rate_per_gpu=0.21,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "data-parallel-size": 1,
                    },
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate_per_gpu=0.21,
                trials=[baseline, incumbent],
            )

            context = build_harness_context(
                study=study,
                window_summary={
                    "prompt_tokens_p95": 7628,
                    "prompt_tokens_p99": 8102,
                    "prompt_tail_ratio_p95_p50": 3.83,
                },
                state=state,
            )

            convergence = context["convergence_guard"]
            incumbent_guard = convergence["strong_incumbent"]
            self.assertTrue(incumbent_guard["guard_active"])
            self.assertGreaterEqual(incumbent_guard["incumbent_gain_vs_baseline"], 3.0)
            self.assertFalse(
                convergence["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]
            )
            self.assertEqual(
                convergence["reason"],
                "strong_incumbent_requires_validation_probes",
            )
            self.assertIn("validate", incumbent_guard["recommended_next_action"])

    def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None:
        # After the incumbent (trial-0002), two more completed trials report no
        # feasible rate at all. The harness should then stop with
        # "post_incumbent_validation_exhausted" and emit a stop proposal.
        def completed(trial_id: str, flag_patch: dict, **rates: float) -> TrialSummary:
            # Completed 8-GPU trial; `rates` is omitted for unmeasured trials.
            return TrialSummary(
                trial_id=trial_id,
                status="completed",
                parallel_size=8,
                config_patch={"env_patch": {}, "flag_patch": flag_patch},
                **rates,
            )

        with tempfile.TemporaryDirectory() as tmp:
            study = load_study_spec(_write_study_assets(Path(tmp)))

            history = [
                completed(
                    "trial-0001",
                    {},
                    best_request_rate=0.8,
                    best_request_rate_per_gpu=0.1,
                ),
                completed(
                    "trial-0002",
                    {"tensor-parallel-size": 2, "data-parallel-size": 4},
                    best_request_rate=2.4,
                    best_request_rate_per_gpu=0.3,
                ),
                completed(
                    "trial-0003",
                    {"tensor-parallel-size": 1, "data-parallel-size": 8},
                ),
                completed("trial-0004", {"max-num-seqs": 160}),
            ]
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_sampling_u=0.02,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                trials=history,
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )

            stop_decision = context["harness_stop"]
            self.assertTrue(stop_decision["should_stop"])
            self.assertEqual(stop_decision["reason"], "post_incumbent_validation_exhausted")
            proposal = build_harness_stop_proposal(context)
            self.assertIsNotNone(proposal)
            self.assertTrue(proposal.should_stop)

    def test_harness_stop_after_non_improving_feasible_validation_is_exhausted(self) -> None:
        # The two trials after the incumbent (trial-0002) are feasible but
        # slower per GPU (0.25 and 0.2625 vs 0.3). The harness should still
        # stop with "post_incumbent_validation_exhausted".
        def completed(
            trial_id: str,
            total_rate: float,
            per_gpu_rate: float,
            flag_patch: dict,
        ) -> TrialSummary:
            # Completed 8-GPU trial with a measured feasible rate.
            return TrialSummary(
                trial_id=trial_id,
                status="completed",
                parallel_size=8,
                best_request_rate=total_rate,
                best_request_rate_per_gpu=per_gpu_rate,
                config_patch={"env_patch": {}, "flag_patch": flag_patch},
            )

        with tempfile.TemporaryDirectory() as tmp:
            study = load_study_spec(_write_study_assets(Path(tmp)))

            history = [
                completed("trial-0001", 0.8, 0.1, {}),
                completed(
                    "trial-0002",
                    2.4,
                    0.3,
                    {"tensor-parallel-size": 2, "data-parallel-size": 4},
                ),
                completed(
                    "trial-0003",
                    2.0,
                    0.25,
                    {"tensor-parallel-size": 1, "data-parallel-size": 8},
                ),
                completed("trial-0004", 2.1, 0.2625, {"max-num-seqs": 160}),
            ]
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_sampling_u=0.02,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                trials=history,
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )

            stop_decision = context["harness_stop"]
            self.assertTrue(stop_decision["should_stop"])
            self.assertEqual(stop_decision["reason"], "post_incumbent_validation_exhausted")

    def test_harness_stop_after_gmu_incumbent_and_non_improving_topology_validation(self) -> None:
        # The incumbent (trial-0007) adds gpu-memory-utilization on top of
        # TP=2. The two later topology trials are worse per GPU, so the
        # harness should stop with "post_incumbent_validation_exhausted".
        engine_overrides = {
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "gpu-memory-utilization",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [1, 2, 4, 8],
            },
        }
        # (trial id, total req/s, req/s per GPU, flag patch)
        trial_table = (
            ("trial-0001", 2.2, 2.2, {}),
            ("trial-0002", 6.5167, 3.2583, {"tensor-parallel-size": 2}),
            ("trial-0003", 8.3667, 2.0917, {"tensor-parallel-size": 4}),
            (
                "trial-0007",
                6.8667,
                3.4333,
                {"tensor-parallel-size": 2, "gpu-memory-utilization": 0.97},
            ),
            (
                "trial-0008",
                4.1833,
                1.0458,
                {"tensor-parallel-size": 4, "data-parallel-size": 2},
            ),
            ("trial-0009", 8.3667, 1.0458, {"tensor-parallel-size": 8}),
        )

        with tempfile.TemporaryDirectory() as tmp:
            study = load_study_spec(
                _write_study_assets(Path(tmp), engine_overrides=engine_overrides)
            )

            history = [
                TrialSummary(
                    trial_id=trial_id,
                    status="completed",
                    best_request_rate=total_rate,
                    best_request_rate_per_gpu=per_gpu_rate,
                    config_patch={"env_patch": {}, "flag_patch": flag_patch},
                )
                for trial_id, total_rate, per_gpu_rate, flag_patch in trial_table
            ]
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0007",
                best_request_rate=6.8667,
                best_request_rate_per_gpu=3.4333,
                trials=history,
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 1500},
                state=state,
            )

            stop_decision = context["harness_stop"]
            self.assertTrue(stop_decision["should_stop"])
            self.assertEqual(
                stop_decision["reason"],
                "post_incumbent_validation_exhausted",
            )
            proposal = build_harness_stop_proposal(context)
            self.assertIsNotNone(proposal)
            self.assertTrue(proposal.should_stop)

    def test_harness_validation_uses_full_state_baseline_when_history_window_moves(self) -> None:
        # Twelve trials: six measured ones climbing from 0.1 to 0.3 req/s/GPU,
        # then six with no feasible rate (one of them failed). The reported
        # gain must be measured against trial-0001, i.e. above 2.9x, even
        # though that trial is old. NOTE(review): presumably the harness only
        # inspects a recent slice of the history -- confirm against
        # build_harness_context.

        # (trial number, max-num-seqs or None for the baseline, req/s, req/s per GPU)
        measured = (
            (1, None, 0.8, 0.1),
            (2, 16, 0.88, 0.11),
            (3, 24, 0.96, 0.12),
            (4, 32, 1.04, 0.13),
            (5, 40, 2.24, 0.28),
            (6, 48, 2.4, 0.3),
        )
        # (trial number, max-num-seqs, status) -- no rate fields are supplied.
        unmeasured = (
            (7, 56, "completed"),
            (8, 64, "completed"),
            (9, 72, "completed"),
            (10, 80, "completed"),
            (11, 88, "failed"),
            (12, 96, "completed"),
        )

        def patch_for(max_num_seqs: int | None) -> dict:
            # The baseline trial carries an empty flag patch.
            flags = {} if max_num_seqs is None else {"max-num-seqs": max_num_seqs}
            return {"env_patch": {}, "flag_patch": flags}

        with tempfile.TemporaryDirectory() as tmp:
            study = load_study_spec(
                _write_study_assets(
                    Path(tmp),
                    engine_overrides={"tunable_flags": ["max-num-seqs"]},
                )
            )

            history = [
                TrialSummary(
                    trial_id=f"trial-{number:04d}",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=total_rate,
                    best_request_rate_per_gpu=per_gpu_rate,
                    config_patch=patch_for(max_num_seqs),
                )
                for number, max_num_seqs, total_rate, per_gpu_rate in measured
            ]
            history.extend(
                TrialSummary(
                    trial_id=f"trial-{number:04d}",
                    status=status,
                    parallel_size=8,
                    config_patch=patch_for(max_num_seqs),
                )
                for number, max_num_seqs, status in unmeasured
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0006",
                best_parallel_size=8,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                trials=history,
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )

            stop_decision = context["harness_stop"]
            self.assertTrue(stop_decision["should_stop"])
            self.assertEqual(stop_decision["reason"], "post_incumbent_validation_exhausted")
            self.assertGreater(
                stop_decision["evidence"]["incumbent_gain_vs_baseline"],
                2.9,
            )

    def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
        # History ends at the incumbent itself (0.3 vs a 0.1 baseline) with no
        # later trials, so the harness must neither stop nor propose a stop.
        with tempfile.TemporaryDirectory() as tmp:
            study = load_study_spec(_write_study_assets(Path(tmp)))

            baseline = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=0.8,
                best_request_rate_per_gpu=0.1,
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            incumbent = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=8,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "data-parallel-size": 4,
                    },
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                trials=[baseline, incumbent],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )

            self.assertFalse(context["harness_stop"]["should_stop"])
            self.assertIsNone(build_harness_stop_proposal(context))

    def test_harness_stop_when_incumbent_saturates_search_high(self) -> None:
        # The only trial is fully feasible (pass rate 1.0) at threshold
        # 0.99609375 -- presumably the top of the search range; confirm against
        # the study spec. Despite the test name, the assertions expect the
        # harness NOT to stop, and to report that parallel-size evidence is
        # still required.
        saturated_probe = {
            "threshold": 0.99609375,
            "feasible": True,
            "payload": {
                "request_count": 10,
                "pass_rate": 1.0,
                "request_rate": 9.0,
                "early_stopped": False,
                "early_stop_reason": "",
                "latency_summary": {"failed_reason_counts": {}},
            },
        }
        trial_result = {
            "status": "completed",
            "best_sampling_u": 0.99609375,
            "best_request_rate": 9.0,
            "best_pass_rate": 1.0,
            "probes": [saturated_probe],
        }

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(_write_study_assets(workdir))

            result_file = workdir / "trial-0001.json"
            result_file.write_text(json.dumps(trial_result), encoding="utf-8")

            only_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=9.0,
                best_request_rate_per_gpu=9.0,
                result_path=str(result_file),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_request_rate=9.0,
                best_request_rate_per_gpu=9.0,
                trials=[only_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )

            stop_decision = context["harness_stop"]
            self.assertFalse(stop_decision["should_stop"])
            self.assertEqual(
                stop_decision["reason"],
                "search_high_saturation_requires_parallel_size_evidence",
            )
            self.assertEqual(
                stop_decision["evidence"]["objective"],
                "request_rate_per_gpu",
            )
            self.assertIsNone(build_harness_stop_proposal(context))

    def test_harness_stop_allows_feasible_high_probe_with_some_failures(self) -> None:
        """Stop gate for a feasible top-threshold probe that still has TPOT failures.

        The only trial (``tensor-parallel-size`` 4) recorded one feasible probe at
        threshold 0.99609375 with pass rate 0.968 and 34 ``tpot_ms>50.0``
        failures. The harness is expected to report ``should_stop`` as false with
        the ``search_high_saturation_requires_parallel_size_evidence`` reason.
        """
        trial_id = "trial-0004"
        top_threshold = 0.99609375
        request_rate = 1.77
        pass_rate = 0.968
        rate_per_gpu = 0.4425
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(_write_study_assets(workdir))

            # Persist the trial result file that the harness reads back via result_path.
            probe_payload = {
                "request_count": 1063,
                "pass_rate": pass_rate,
                "request_rate": request_rate,
                "early_stopped": False,
                "early_stop_reason": "",
                "latency_summary": {
                    "failed_reason_counts": {
                        "tpot_ms>50.0": 34,
                    }
                },
            }
            trial_result = {
                "status": "completed",
                "best_sampling_u": top_threshold,
                "best_request_rate": request_rate,
                "best_pass_rate": pass_rate,
                "probes": [
                    {
                        "threshold": top_threshold,
                        "feasible": True,
                        "payload": probe_payload,
                    }
                ],
            }
            trial_file = workdir / f"{trial_id}.json"
            trial_file.write_text(json.dumps(trial_result), encoding="utf-8")

            incumbent = TrialSummary(
                trial_id=trial_id,
                status="completed",
                best_request_rate=request_rate,
                best_request_rate_per_gpu=rate_per_gpu,
                result_path=str(trial_file),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 4},
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id=trial_id,
                best_request_rate=request_rate,
                best_request_rate_per_gpu=rate_per_gpu,
                trials=[incumbent],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
            stop_info = context["harness_stop"]
            self.assertFalse(stop_info["should_stop"])
            self.assertEqual(
                stop_info["reason"],
                "search_high_saturation_requires_parallel_size_evidence",
            )

    def test_harness_stop_blocks_high_saturation_for_fixed_product_tp_dp_redistribution(
        self,
    ) -> None:
        """Stop gate when TP/DP are both tunable but their product is pinned to 8.

        The baseline runs ``tensor-parallel-size`` 8 / ``data-parallel-size`` 1 and
        its single probe is fully feasible at threshold 0.99609375. The harness is
        expected to keep ``should_stop`` false and report
        ``search_high_saturation_requires_parallel_size_evidence``.
        """
        trial_id = "trial-0001"
        top_threshold = 0.99609375
        request_rate = 8.0
        rate_per_gpu = 1.0
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 8,
                "data-parallel-size": 1,
            },
            "tunable_flags": ["tensor-parallel-size", "data-parallel-size"],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2, 4, 8],
                "allowed_tp_dp_products": [8],
                "require_tp_dp_product_equals_gpu_count": True,
            },
        }
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(
                _write_study_assets(workdir, engine_overrides=engine_overrides)
            )

            # Persist the trial result file that the harness reads back via result_path.
            probe = {
                "threshold": top_threshold,
                "feasible": True,
                "payload": {
                    "request_count": 10,
                    "pass_rate": 1.0,
                    "request_rate": request_rate,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
            trial_result = {
                "status": "completed",
                "best_sampling_u": top_threshold,
                "best_request_rate": request_rate,
                "best_pass_rate": 1.0,
                "probes": [probe],
            }
            trial_file = workdir / f"{trial_id}.json"
            trial_file.write_text(json.dumps(trial_result), encoding="utf-8")

            incumbent = TrialSummary(
                trial_id=trial_id,
                status="completed",
                best_request_rate=request_rate,
                best_request_rate_per_gpu=rate_per_gpu,
                result_path=str(trial_file),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                    },
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id=trial_id,
                best_request_rate=request_rate,
                best_request_rate_per_gpu=rate_per_gpu,
                trials=[incumbent],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
            stop_info = context["harness_stop"]
            self.assertFalse(stop_info["should_stop"])
            self.assertEqual(
                stop_info["reason"],
                "search_high_saturation_requires_parallel_size_evidence",
            )

    def test_harness_does_not_repropose_noop_topology_equivalent_to_baseline(
        self,
    ) -> None:
        """A patch that merely restates the baseline topology must be blocked.

        ``base_flags`` already set ``tensor-parallel-size`` 8 and trial-0001 ran
        with an empty flag patch, so ``{"tensor-parallel-size": 8}`` resolves to
        the same effective config. The test checks that the candidate set records
        a blocked entry whose fingerprint matches the baseline, that no candidate
        action carries that patch, and that the guided proposal (if any) is not
        that patch either.
        """
        noop_flag_patch = {"tensor-parallel-size": 8}
        blocked_reason = "blocked_noop_or_repeat_effective_full_config"
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 8,
                "data-parallel-size": 1,
                "gpu-memory-utilization": 0.5,
                "max-num-seqs": 8,
            },
            "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_tp_dp_products": [1, 2, 4, 8],
            },
        }
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(
                _write_study_assets(workdir, engine_overrides=engine_overrides)
            )

            # trial-0001: baseline topology, one fully feasible probe.
            baseline_result = {
                "status": "completed",
                "best_sampling_u": 0.935616858887,
                "best_request_rate": 8.0,
                "best_pass_rate": 1.0,
                "probes": [
                    {
                        "threshold": 0.935616858887,
                        "feasible": True,
                        "payload": {
                            "request_count": 480,
                            "pass_rate": 1.0,
                            "request_rate": 8.0,
                            "early_stopped": False,
                            "early_stop_reason": "",
                            "latency_summary": {"failed_reason_counts": {}},
                        },
                    }
                ],
            }
            baseline_file = workdir / "trial-0001.json"
            baseline_file.write_text(json.dumps(baseline_result), encoding="utf-8")

            # trial-0002: TP=4, one early-stopped infeasible probe and one feasible probe.
            infeasible_probe = {
                "threshold": 0.873242401628,
                "feasible": False,
                "payload": {
                    "request_count": 450,
                    "pass_rate": 0.7844,
                    "request_rate": 7.5,
                    "early_stopped": True,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": {
                        "failed_reason_counts": {
                            "ttft_ms>2000.0": 42,
                            "slo_pass_rate_unrecoverable": 49,
                        }
                    },
                },
            }
            feasible_probe = {
                "threshold": 0.810867944369,
                "feasible": True,
                "payload": {
                    "request_count": 417,
                    "pass_rate": 0.9784,
                    "request_rate": 6.95,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {
                        "failed_reason_counts": {"ttft_ms>2000.0": 9}
                    },
                },
            }
            tp4_result = {
                "status": "completed",
                "best_sampling_u": 0.810867944369,
                "best_request_rate": 6.95,
                "best_pass_rate": 0.9784,
                "probes": [infeasible_probe, feasible_probe],
            }
            tp4_file = workdir / "trial-0002.json"
            tp4_file.write_text(json.dumps(tp4_result), encoding="utf-8")

            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=8.0,
                best_request_rate_per_gpu=1.0,
                result_path=str(baseline_file),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            tp4_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=4,
                best_request_rate=6.95,
                best_request_rate_per_gpu=1.7375,
                result_path=str(tp4_file),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 4},
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=4,
                best_sampling_u=0.810867944369,
                best_request_rate=6.95,
                best_request_rate_per_gpu=1.7375,
                next_trial_index=3,
                trials=[baseline_trial, tp4_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )

            # The candidate set must be versioned, hashed, and report at least one no-op block.
            candidate_set = context["experiment_plan"]["candidate_set"]
            self.assertEqual(candidate_set["version"], "candidate-set-v1")
            self.assertIn("candidate_set_hash", candidate_set)
            noop_block_count = candidate_set["blocked_reason_summary"].get(blocked_reason, 0)
            self.assertGreaterEqual(noop_block_count, 1)

            # Fingerprint of the untouched baseline: SHA-256 of its effective config signature.
            baseline_signature = _effective_config_signature(
                study,
                {"env_patch": {}, "flag_patch": {}},
            )
            baseline_fingerprint = hashlib.sha256(
                baseline_signature.encode("utf-8")
            ).hexdigest()
            baseline_equivalents = []
            for blocked in candidate_set["blocked_candidates"]:
                if blocked.get("effective_config_fingerprint") == baseline_fingerprint:
                    baseline_equivalents.append(blocked)
            self.assertTrue(baseline_equivalents)
            first_equivalent = baseline_equivalents[0]
            self.assertEqual(first_equivalent["blocked_reason"], blocked_reason)
            self.assertIn("effective_config_fingerprint", first_equivalent)

            # No surviving action, and no guided proposal, may carry the no-op TP=8 patch.
            actions = context["experiment_plan"]["candidate_actions"]
            noop_actions = [
                action
                for action in actions
                if action.get("config_patch", {}).get("flag_patch") == noop_flag_patch
            ]
            self.assertFalse(noop_actions)
            proposal = build_harness_guided_proposal(context)
            if proposal is not None:
                self.assertTrue(proposal.config_patch.flag_patch != noop_flag_patch)

    def test_harness_guided_first_tp_probe_for_latency_bottleneck(self) -> None:
        """First guided proposal after a TPOT-failing baseline should set TP to 2.

        The baseline trial (empty flag patch) has one infeasible, early-stopped
        probe whose failures are all ``tpot_ms>50.0``. With TP and DP tunable, the
        guided proposal is expected to be exactly ``{"tensor-parallel-size": 2}``
        and must not ask to stop.
        """
        trial_id = "trial-0001"
        engine_overrides = {
            "tunable_flags": ["tensor-parallel-size", "data-parallel-size"],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(
                _write_study_assets(workdir, engine_overrides=engine_overrides)
            )

            # Persist the trial result file that the harness reads back via result_path.
            failing_probe = {
                "threshold": 0.5,
                "feasible": False,
                "payload": {
                    "request_count": 100,
                    "pass_rate": 0.6,
                    "request_rate": 4.0,
                    "early_stopped": True,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": {
                        "failed_reason_counts": {"tpot_ms>50.0": 40},
                    },
                },
            }
            trial_result = {
                "status": "completed",
                "best_sampling_u": 0.25,
                "best_request_rate": 2.0,
                "best_pass_rate": 1.0,
                "probes": [failing_probe],
            }
            trial_file = workdir / f"{trial_id}.json"
            trial_file.write_text(json.dumps(trial_result), encoding="utf-8")

            baseline_trial = TrialSummary(
                trial_id=trial_id,
                status="completed",
                best_request_rate=2.0,
                best_request_rate_per_gpu=2.0,
                result_path=str(trial_file),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id=trial_id,
                best_request_rate=2.0,
                best_request_rate_per_gpu=2.0,
                trials=[baseline_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            expected_flag_patch = {"tensor-parallel-size": 2}
            self.assertEqual(proposal.config_patch.flag_patch, expected_flag_patch)
            self.assertFalse(proposal.should_stop)

    def test_harness_guided_runtime_seed_preserves_tp_incumbent(self) -> None:
        """A runtime-knob proposal must carry the incumbent's TP setting along.

        The best trial ran ``tensor-parallel-size`` 2 with a fully feasible probe,
        and the window reports ``prompt_tokens_p99`` of 8100. The guided proposal
        is expected to keep TP=2 and add ``enable-chunked-prefill`` plus
        ``max-num-batched-tokens`` 16384.
        """
        engine_overrides = {
            "tunable_flags": [
                "tensor-parallel-size",
                "gpu-memory-utilization",
                "enable-chunked-prefill",
                "max-num-batched-tokens",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(
                _write_study_assets(workdir, engine_overrides=engine_overrides)
            )

            # Only the incumbent (trial-0002) has a result file on disk.
            incumbent_probe = {
                "threshold": 0.75,
                "feasible": True,
                "payload": {
                    "request_count": 100,
                    "pass_rate": 1.0,
                    "request_rate": 6.0,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
            incumbent_result = {
                "status": "completed",
                "best_sampling_u": 0.75,
                "best_request_rate": 6.0,
                "best_pass_rate": 1.0,
                "probes": [incumbent_probe],
            }
            incumbent_file = workdir / "trial-0002.json"
            incumbent_file.write_text(json.dumps(incumbent_result), encoding="utf-8")

            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=2.0,
                best_request_rate_per_gpu=2.0,
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            incumbent_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=6.0,
                best_request_rate_per_gpu=3.0,
                result_path=str(incumbent_file),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 2},
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate=6.0,
                best_request_rate_per_gpu=3.0,
                trials=[baseline_trial, incumbent_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p99": 8100},
                state=state,
            )
            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            expected_flag_patch = {
                "tensor-parallel-size": 2,
                "enable-chunked-prefill": True,
                "max-num-batched-tokens": 16384,
            }
            self.assertEqual(proposal.config_patch.flag_patch, expected_flag_patch)

    def test_harness_runtime_refinement_preserves_incumbent_runtime_knobs(self) -> None:
        """A runtime refinement must keep every knob the incumbent already set.

        The best trial ran TP=4 with ``gpu-memory-utilization`` 0.92 and
        ``max-num-seqs`` 48. The guided proposal is expected to retain all three
        and add ``enable-chunked-prefill`` plus ``max-num-batched-tokens`` 16384.
        """
        engine_overrides = {
            "tunable_flags": [
                "tensor-parallel-size",
                "gpu-memory-utilization",
                "max-num-seqs",
                "enable-chunked-prefill",
                "max-num-batched-tokens",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(
                _write_study_assets(workdir, engine_overrides=engine_overrides)
            )

            # Only the incumbent (trial-0002) has a result file on disk.
            incumbent_probe = {
                "threshold": 0.098,
                "feasible": True,
                "payload": {
                    "request_count": 100,
                    "pass_rate": 0.97,
                    "request_rate": 3.3,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
            incumbent_result = {
                "status": "completed",
                "best_sampling_u": 0.098,
                "best_request_rate": 3.3,
                "best_pass_rate": 0.97,
                "probes": [incumbent_probe],
            }
            incumbent_file = workdir / "trial-0002.json"
            incumbent_file.write_text(json.dumps(incumbent_result), encoding="utf-8")

            earlier_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=2.5,
                best_request_rate_per_gpu=0.625,
                config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
            )
            incumbent_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=3.3,
                best_request_rate_per_gpu=0.825,
                result_path=str(incumbent_file),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 4,
                        "gpu-memory-utilization": 0.92,
                        "max-num-seqs": 48,
                    },
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate=3.3,
                best_request_rate_per_gpu=0.825,
                trials=[earlier_trial, incumbent_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p99": 8100},
                state=state,
            )
            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            expected_flag_patch = {
                "tensor-parallel-size": 4,
                "gpu-memory-utilization": 0.92,
                "max-num-seqs": 48,
                "enable-chunked-prefill": True,
                "max-num-batched-tokens": 16384,
            }
            self.assertEqual(proposal.config_patch.flag_patch, expected_flag_patch)

    def test_harness_raises_gpu_mem_util_on_settled_decode_bound_incumbent(self) -> None:
        """Regression test for the coverage gap that let the naive baseline beat the harness.

        Once the TP incumbent is settled and its failures are decode-TPOT bound,
        the harness has to propose a gpu-memory-utilization increase (KV-cache
        headroom) and may not authorize a stop before that knob has been tried.
        """
        slo_overrides = {
            "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
        }
        engine_overrides = {
            "tunable_flags": [
                "tensor-parallel-size",
                "gpu-memory-utilization",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study = load_study_spec(
                _write_study_assets(
                    workdir,
                    slo_overrides=slo_overrides,
                    engine_overrides=engine_overrides,
                )
            )

            # Incumbent result: one feasible probe, then a TPOT-failing probe above it.
            feasible_probe = {
                "threshold": 0.074,
                "feasible": True,
                "payload": {
                    "request_count": 300,
                    "pass_rate": 0.97,
                    "request_rate": 2.6,
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
            tpot_bound_probe = {
                "threshold": 0.09,
                "feasible": False,
                "payload": {
                    "request_count": 300,
                    "pass_rate": 0.6,
                    "request_rate": 3.2,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": {
                        "failed_reason_counts": {"tpot_ms>50.0": 90}
                    },
                },
            }
            incumbent_result = {
                "status": "completed",
                "best_sampling_u": 0.074,
                "best_request_rate": 2.6,
                "best_pass_rate": 0.97,
                "probes": [feasible_probe, tpot_bound_probe],
            }
            incumbent_file = workdir / "trial-0002.json"
            incumbent_file.write_text(json.dumps(incumbent_result), encoding="utf-8")

            tp2_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=1.1,
                best_request_rate_per_gpu=0.275,
                config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
            )
            incumbent_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=2.6,
                best_request_rate_per_gpu=0.65,
                result_path=str(incumbent_file),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 4,
                        "gpu-memory-utilization": 0.9,
                    },
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate=2.6,
                best_request_rate_per_gpu=0.65,
                trials=[tp2_trial, incumbent_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 1500},
                state=state,
            )
            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            self.assertFalse(proposal.should_stop)
            # TP stays at 4 while gpu-memory-utilization moves up one step (0.9 -> 0.92).
            tp_value = proposal.config_patch.flag_patch.get("tensor-parallel-size")
            self.assertEqual(tp_value, 4)
            gpu_mem_util = proposal.config_patch.flag_patch.get("gpu-memory-utilization")
            self.assertEqual(gpu_mem_util, 0.92)
            # With that knob still untried, the stop path must yield no proposal.
            stop_proposal = build_harness_stop_proposal(context)
            self.assertIsNone(stop_proposal)

    def test_harness_climbs_tp_before_gpu_mem_util_micro_tuning(self) -> None:
        """gpu-memory-utilization must not preempt an untried TP increase: at a TP2 incumbent
        with TP4 still reachable, the harness must climb TP, not micro-tune runtime."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            study_path = _write_study_assets(
                tmp_path,
                slo_overrides={
                    "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
                    "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
                },
                engine_overrides={
                    "tunable_flags": ["tensor-parallel-size", "gpu-memory-utilization"],
                    "topology_constraints": {
                        "allowed_tensor_parallel_sizes": [1, 2, 4],
                        "allowed_data_parallel_sizes": [1],
                        "allowed_tp_dp_products": [1, 2, 4],
                    },
                },
            )
            study = load_study_spec(study_path)
            result_path = tmp_path / "trial-0002.json"
            result_path.write_text(
                json.dumps(
                    {
                        "status": "completed",
                        "best_sampling_u": 0.03,
                        "best_request_rate": 1.1,
                        "best_pass_rate": 0.97,
                        "probes": [
                            {
                                "threshold": 0.03,
                                "feasible": True,
                                "payload": {
                                    "request_count": 300,
                                    "pass_rate": 0.97,
                                    "request_rate": 1.1,
                                    "latency_summary": {"failed_reason_counts": {}},
                                },
                            },
                            {
                                "threshold": 0.05,
                                "feasible": False,
                                "payload": {
                                    "request_count": 300,
                                    "pass_rate": 0.6,
                                    "request_rate": 1.6,
                                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                                    "latency_summary": {
                                        "failed_reason_counts": {"tpot_ms>50.0": 90}
                                    },
                                },
                            },
                        ],
                    }
                ),
                encoding="utf-8",
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate=1.1,
                best_request_rate_per_gpu=0.55,
                trials=[
                    TrialSummary(
                        trial_id="trial-0001",
                        status="completed",
                        best_request_rate=0.6,
                        best_request_rate_per_gpu=0.6,
                        config_patch={"env_patch": {}, "flag_patch": {}},
                    ),
                    TrialSummary(
                        trial_id="trial-0002",
                        status="completed",
                        best_request_rate=1.1,
                        best_request_rate_per_gpu=0.55,
                        result_path=str(result_path),
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {
                                "tensor-parallel-size": 2,
                                "gpu-memory-utilization": 0.9,
                            },
                        },
                    ),
                ],
            )
            context = build_harness_context(
                study=study, window_summary={"prompt_tokens_p95": 1500}, state=state
            )
            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            # Must climb TP (to 4), and must NOT micro-tune gpu-memory-utilization yet.
            self.assertEqual(
                proposal.config_patch.flag_patch.get("tensor-parallel-size"), 4
            )
            self.assertNotIn("gpu-memory-utilization", proposal.config_patch.flag_patch)

    def test_harness_brackets_down_from_bad_high_tp_start_before_runtime_tuning(self) -> None:
        """When the baseline already sits at the largest allowed TP, the guided
        proposal must step to the neighbouring smaller TP and leave the runtime
        knobs (gpu-memory-utilization, max-num-seqs) untouched."""
        slo_overrides = {
            "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
        }
        # The baseline launches at TP=8, the top of the allowed TP ladder.
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 8,
                "data-parallel-size": 1,
                "gpu-memory-utilization": 0.5,
                "max-num-seqs": 8,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "gpu-memory-utilization",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [1, 2, 4, 8],
            },
        }
        # Recorded probes of the baseline trial: one feasible, one TTFT-bound failure.
        passing_probe = {
            "threshold": 0.05,
            "feasible": True,
            "payload": {
                "request_count": 300,
                "pass_rate": 0.96,
                "request_rate": 8.0,
                "latency_summary": {"failed_reason_counts": {}},
            },
        }
        failing_probe = {
            "threshold": 0.08,
            "feasible": False,
            "payload": {
                "request_count": 300,
                "pass_rate": 0.5,
                "request_rate": 10.0,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"ttft_ms>4000.0": 120}
                },
            },
        }
        baseline_result = {
            "status": "completed",
            "best_sampling_u": 0.05,
            "best_request_rate": 8.0,
            "best_pass_rate": 0.96,
            "probes": [passing_probe, failing_probe],
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            study = load_study_spec(
                _write_study_assets(
                    root,
                    slo_overrides=slo_overrides,
                    engine_overrides=engine_overrides,
                )
            )
            result_path = root / "trial-0001.json"
            result_path.write_text(json.dumps(baseline_result), encoding="utf-8")
            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=8.0,
                best_request_rate_per_gpu=1.0,
                result_path=str(result_path),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_request_rate=8.0,
                best_request_rate_per_gpu=1.0,
                trials=[baseline_trial],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 6500},
                state=state,
            )
            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            proposed_flags = proposal.config_patch.flag_patch
            # TP=4 is the topology directly below the TP=8 starting point.
            self.assertEqual(proposed_flags.get("tensor-parallel-size"), 4)
            for runtime_knob in ("gpu-memory-utilization", "max-num-seqs"):
                self.assertNotIn(runtime_knob, proposed_flags)

    def test_harness_jumps_low_gpu_mem_util_to_nominal_floor_after_topology_settles(self) -> None:
        """With a gpu-memory-utilization=0.5 baseline and the TP neighbour already
        measured, the guided proposal must set gpu-memory-utilization straight to
        0.9 and must not change tensor-parallel-size."""
        slo_overrides = {
            "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
        }
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 2,
                "data-parallel-size": 1,
                "gpu-memory-utilization": 0.5,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "gpu-memory-utilization",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        # Probe history of the baseline trial: feasible at 0.07, TPOT-bound at 0.1.
        passing_probe = {
            "threshold": 0.07,
            "feasible": True,
            "payload": {
                "request_count": 300,
                "pass_rate": 0.97,
                "request_rate": 2.4,
                "latency_summary": {"failed_reason_counts": {}},
            },
        }
        failing_probe = {
            "threshold": 0.1,
            "feasible": False,
            "payload": {
                "request_count": 300,
                "pass_rate": 0.55,
                "request_rate": 3.1,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"tpot_ms>50.0": 90}
                },
            },
        }
        baseline_result = {
            "status": "completed",
            "best_sampling_u": 0.07,
            "best_request_rate": 2.4,
            "best_pass_rate": 0.97,
            "probes": [passing_probe, failing_probe],
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            study = load_study_spec(
                _write_study_assets(
                    root,
                    slo_overrides=slo_overrides,
                    engine_overrides=engine_overrides,
                )
            )
            result_path = root / "trial-0001.json"
            result_path.write_text(json.dumps(baseline_result), encoding="utf-8")
            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=2.4,
                best_request_rate_per_gpu=1.2,
                result_path=str(result_path),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            # The TP=4 trial is recorded with a lower per-GPU rate than the baseline.
            tp4_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=2.2,
                best_request_rate_per_gpu=0.55,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 4},
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_request_rate=2.4,
                best_request_rate_per_gpu=1.2,
                trials=[baseline_trial, tp4_trial],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 1500},
                state=state,
            )
            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            proposed_flags = proposal.config_patch.flag_patch
            self.assertEqual(proposed_flags.get("gpu-memory-utilization"), 0.9)
            self.assertNotIn("tensor-parallel-size", proposed_flags)

    def test_harness_stops_gpu_mem_util_climb_after_tied_same_topology_probe(self) -> None:
        """Once a gpu-memory-utilization=0.92 trial merely ties the per-GPU rate of
        the same TP=2 topology, the plan must not offer the next step (0.94) among
        its gpu-memory-utilization candidates."""
        slo_overrides = {
            "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
        }
        engine_overrides = {
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "gpu-memory-utilization",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [1, 2, 4, 8],
            },
        }
        # Probe history stored for the TP=2 trial (trial-0002).
        passing_probe = {
            "threshold": 0.75,
            "feasible": True,
            "payload": {
                "request_count": 300,
                "pass_rate": 1.0,
                "request_rate": 6.5,
                "latency_summary": {"failed_reason_counts": {}},
            },
        }
        failing_probe = {
            "threshold": 0.765625,
            "feasible": False,
            "payload": {
                "request_count": 300,
                "pass_rate": 0.6,
                "request_rate": 6.7,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"ttft_ms>4000.0": 80}
                },
            },
        }
        tp2_result = {
            "status": "completed",
            "best_sampling_u": 0.75,
            "best_request_rate": 6.5,
            "best_pass_rate": 1.0,
            "probes": [passing_probe, failing_probe],
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            study = load_study_spec(
                _write_study_assets(
                    root,
                    slo_overrides=slo_overrides,
                    engine_overrides=engine_overrides,
                )
            )
            result_path = root / "trial-0002.json"
            result_path.write_text(json.dumps(tp2_result), encoding="utf-8")
            history = [
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    best_request_rate=2.2,
                    best_request_rate_per_gpu=2.2,
                    config_patch={"env_patch": {}, "flag_patch": {}},
                ),
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    best_request_rate=6.5,
                    best_request_rate_per_gpu=3.25,
                    result_path=str(result_path),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {"tensor-parallel-size": 2},
                    },
                ),
                TrialSummary(
                    trial_id="trial-0003",
                    status="completed",
                    best_request_rate=8.4,
                    best_request_rate_per_gpu=2.1,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {"tensor-parallel-size": 4},
                    },
                ),
                # Same topology as trial-0002 with a higher gpu-memory-utilization;
                # the recorded per-GPU rate is identical (3.25), i.e. a tie.
                TrialSummary(
                    trial_id="trial-0004",
                    status="completed",
                    best_request_rate=6.5,
                    best_request_rate_per_gpu=3.25,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 2,
                            "gpu-memory-utilization": 0.92,
                        },
                    },
                ),
            ]
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate=6.5,
                best_request_rate_per_gpu=3.25,
                trials=history,
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 1500},
                state=state,
            )
            # Collect the flag patches of every gpu-memory-utilization candidate.
            gmu_candidate_patches = []
            for action in context["experiment_plan"]["candidate_actions"]:
                if action["knob_family"] == "gpu-memory-utilization":
                    gmu_candidate_patches.append(action["config_patch"]["flag_patch"])
            next_climb_step = {"tensor-parallel-size": 2, "gpu-memory-utilization": 0.94}
            self.assertNotIn(next_climb_step, gmu_candidate_patches)

    def test_harness_projects_measured_runtime_delta_to_other_frontier_anchor(self) -> None:
        """After gpu-memory-utilization=0.9 raised the TP=4 per-GPU rate, the next
        planned action must carry that setting over to the TP=2 anchor
        ("frontier-delta-projection"); the resulting proposal must be an untested
        configuration and no stop proposal may be produced."""
        slo_overrides = {
            "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
        }
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 2,
                "data-parallel-size": 1,
                "gpu-memory-utilization": 0.5,
                "max-num-seqs": 8,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "gpu-memory-utilization",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [2, 4, 8],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [2, 4, 8],
            },
        }
        # Probe history stored for trial-0005 (TP=4, gpu-memory-utilization=0.9).
        passing_probe = {
            "threshold": 0.1,
            "feasible": True,
            "payload": {
                "request_count": 300,
                "pass_rate": 0.96,
                "request_rate": 8.0,
                "latency_summary": {"failed_reason_counts": {}},
            },
        }
        failing_probe = {
            "threshold": 0.12,
            "feasible": False,
            "payload": {
                "request_count": 300,
                "pass_rate": 0.6,
                "request_rate": 9.0,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"ttft_ms>4000.0": 100}
                },
            },
        }
        best_trial_result = {
            "status": "completed",
            "best_sampling_u": 0.1,
            "best_request_rate": 8.0,
            "best_pass_rate": 0.96,
            "probes": [passing_probe, failing_probe],
        }

        def plateau_trial(trial_id, runtime_flags):
            # TP=4 follow-up trials that all recorded the same 8.0 req/s
            # (2.0 per GPU); only the runtime flags differ between them.
            return TrialSummary(
                trial_id=trial_id,
                status="completed",
                parallel_size=4,
                best_request_rate=8.0,
                best_request_rate_per_gpu=2.0,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 4, **runtime_flags},
                },
            )

        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            study = load_study_spec(
                _write_study_assets(
                    root,
                    slo_overrides=slo_overrides,
                    engine_overrides=engine_overrides,
                )
            )
            latest_result_path = root / "trial-0005.json"
            latest_result_path.write_text(
                json.dumps(best_trial_result), encoding="utf-8"
            )
            history = [
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    parallel_size=2,
                    best_request_rate=2.9,
                    best_request_rate_per_gpu=1.45,
                    config_patch={"env_patch": {}, "flag_patch": {}},
                ),
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    parallel_size=4,
                    best_request_rate=6.95,
                    best_request_rate_per_gpu=1.7375,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {"tensor-parallel-size": 4},
                    },
                ),
                TrialSummary(
                    trial_id="trial-0003",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=8.0,
                    best_request_rate_per_gpu=1.0,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {"tensor-parallel-size": 8},
                    },
                ),
                TrialSummary(
                    trial_id="trial-0004",
                    status="completed",
                    parallel_size=4,
                    best_request_rate=6.95,
                    best_request_rate_per_gpu=1.7375,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 4,
                            "max-num-seqs": 16,
                        },
                    },
                ),
                # The only trial with a stored result file, and the study's best.
                TrialSummary(
                    trial_id="trial-0005",
                    status="completed",
                    parallel_size=4,
                    best_request_rate=8.0,
                    best_request_rate_per_gpu=2.0,
                    result_path=str(latest_result_path),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 4,
                            "gpu-memory-utilization": 0.9,
                        },
                    },
                ),
                plateau_trial(
                    "trial-0006",
                    {"gpu-memory-utilization": 0.9, "max-num-seqs": 16},
                ),
                plateau_trial("trial-0007", {"gpu-memory-utilization": 0.92}),
                plateau_trial(
                    "trial-0008",
                    {
                        "gpu-memory-utilization": 0.9,
                        "max-num-batched-tokens": 16384,
                        "max-num-seqs": 16,
                    },
                ),
                plateau_trial(
                    "trial-0009",
                    {
                        "gpu-memory-utilization": 0.9,
                        "enable-chunked-prefill": True,
                        "max-num-batched-tokens": 8192,
                    },
                ),
            ]
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0005",
                best_request_rate=8.0,
                best_request_rate_per_gpu=2.0,
                trials=history,
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 6500},
                state=state,
            )
            next_action = context["experiment_plan"]["next_action"]
            self.assertEqual(next_action["knob_family"], "frontier-delta-projection")
            expected_projection = {
                "tensor-parallel-size": 2,
                "data-parallel-size": 1,
                "gpu-memory-utilization": 0.9,
            }
            self.assertEqual(
                next_action["config_patch"]["flag_patch"], expected_projection
            )
            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            materialized_signature = materialized_effective_config_signature(
                study=study,
                state=state,
                proposal=proposal,
            )
            # The proposed configuration must differ from every trial already run.
            tested_signatures = set()
            for past_trial in state.trials:
                tested_signatures.add(
                    _effective_config_signature(study, past_trial.config_patch)
                )
            self.assertNotIn(materialized_signature, tested_signatures)
            self.assertIsNone(build_harness_stop_proposal(context))

    def test_harness_validates_unmeasured_tp_frontier_before_runtime_refinement(self) -> None:
        """With a baseline trial and a TP=2 trial recorded and TP=4 still untested,
        the guided proposal must be exactly a TP=4 patch, reported with the reason
        "topology_frontier_probe_for_slo_pressure"."""
        engine_overrides = {
            "tunable_flags": [
                "tensor-parallel-size",
                "max-num-batched-tokens",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        # Probe history stored for the TP=2 trial: feasible at 0.5, TPOT-bound at 0.75.
        passing_probe = {
            "threshold": 0.5,
            "feasible": True,
            "payload": {
                "request_count": 100,
                "pass_rate": 0.96,
                "request_rate": 2.0,
                "latency_summary": {"failed_reason_counts": {}},
            },
        }
        failing_probe = {
            "threshold": 0.75,
            "feasible": False,
            "payload": {
                "request_count": 100,
                "pass_rate": 0.6,
                "request_rate": 3.0,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"tpot_ms>25.0": 40}
                },
            },
        }
        tp2_result = {
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 0.96,
            "probes": [passing_probe, failing_probe],
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            study = load_study_spec(
                _write_study_assets(root, engine_overrides=engine_overrides)
            )
            result_path = root / "trial-0002.json"
            result_path.write_text(json.dumps(tp2_result), encoding="utf-8")
            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=0.5,
                best_request_rate_per_gpu=0.5,
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            tp2_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=2.0,
                best_request_rate_per_gpu=1.0,
                result_path=str(result_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 2},
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate=2.0,
                best_request_rate_per_gpu=1.0,
                trials=[baseline_trial, tp2_trial],
            )
            window_summary = {
                "prompt_tokens_p95": 7628,
                "prompt_tail_ratio_p95_p50": 3.8,
            }
            context = build_harness_context(
                study=study,
                window_summary=window_summary,
                state=state,
            )
            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            # Only the topology knob may appear in the patch.
            self.assertEqual(proposal.config_patch.flag_patch, {"tensor-parallel-size": 4})
            reported_reason = context["harness_proposal"]["reason"]
            self.assertEqual(reported_reason, "topology_frontier_probe_for_slo_pressure")

    def test_profile_driven_planner_scores_unmeasured_tp_frontier(self) -> None:
        # Two completed trials are recorded: one with an empty flag patch and one
        # with tensor-parallel-size=2 whose infeasible probe reports TTFT
        # failures. The experiment plan must select tensor-parallel-size=4 as a
        # topology action, and the harness must not request a stop.
        engine_overrides = {
            "tunable_flags": [
                "tensor-parallel-size",
                "max-num-batched-tokens",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        infeasible_probe = {
            "threshold": 0.75,
            "feasible": False,
            "payload": {
                "request_count": 100,
                "pass_rate": 0.6,
                "request_rate": 3.0,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"ttft_ms>4000.0": 35},
                },
            },
        }
        tp2_result = {
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 0.96,
            "probes": [infeasible_probe],
        }
        window_summary = {"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8}

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir, engine_overrides=engine_overrides)

            # Only the TP=2 trial has a result file on disk.
            tp2_result_path = workdir / "trial-0002.json"
            tp2_result_path.write_text(json.dumps(tp2_result), encoding="utf-8")

            study = load_study_spec(study_path)
            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=0.5,
                best_request_rate_per_gpu=0.5,
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            tp2_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=2.0,
                best_request_rate_per_gpu=1.0,
                result_path=str(tp2_result_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 2},
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate=2.0,
                best_request_rate_per_gpu=1.0,
                trials=[baseline_trial, tp2_trial],
            )
            context = build_harness_context(
                study=study,
                window_summary=window_summary,
                state=state,
            )

            plan = context["experiment_plan"]
            self.assertEqual(plan["planner_version"], "profile-driven-v1")
            self.assertEqual(plan["next_action"]["knob_family"], "topology")
            self.assertEqual(
                plan["next_action"]["config_patch"]["flag_patch"],
                {"tensor-parallel-size": 4},
            )
            top_hypothesis = context["bottleneck_hypotheses"][0]
            self.assertIn("ttft_prefill", top_hypothesis["name"])
            self.assertFalse(context["harness_stop"]["should_stop"])

    def test_profile_driven_topology_does_not_introduce_ep_for_ttft(self) -> None:
        # Expert-parallel flags are tunable here, but every recorded trial's
        # infeasible probe lists only a TTFT failure reason. None of the
        # candidate actions in the plan may patch either expert-parallel flag.
        engine_overrides = {
            "base_flags": {"host": "127.0.0.1", "port": 8000},
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
                "enable-expert-parallel",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_data_parallel_sizes": [1],
                "allowed_expert_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [1, 2, 4],
                "require_ep_size_leq_tp_dp_product": True,
                "require_ep_size_divides_tp_dp_product": True,
                "require_enable_expert_parallel_when_ep_gt_one": True,
            },
        }
        # All three trials share the same on-disk result payload.
        result_text = json.dumps(
            {
                "status": "completed",
                "best_sampling_u": 0.25,
                "best_request_rate": 2.0,
                "best_pass_rate": 1.0,
                "probes": [
                    {
                        "threshold": 0.5,
                        "feasible": False,
                        "payload": {
                            "request_count": 100,
                            "pass_rate": 0.6,
                            "request_rate": 4.0,
                            "early_stop_reason": "slo_pass_rate_unrecoverable",
                            "latency_summary": {
                                "failed_reason_counts": {"ttft_ms>2000": 40}
                            },
                        },
                    }
                ],
            }
        )
        # (trial id, best request rate, best request rate per GPU, flag patch)
        trial_rows = (
            ("trial-0001", 2.0, 2.0, {}),
            ("trial-0002", 4.0, 2.0, {"tensor-parallel-size": 2}),
            ("trial-0003", 4.0, 1.0, {"tensor-parallel-size": 4}),
        )
        forbidden_flags = ("enable-expert-parallel", "expert-parallel-size")

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir, engine_overrides=engine_overrides)

            result_paths: list[Path] = []
            for trial_id, _rate, _rate_per_gpu, _flags in trial_rows:
                trial_file = workdir / f"{trial_id}.json"
                trial_file.write_text(result_text, encoding="utf-8")
                result_paths.append(trial_file)

            study = load_study_spec(study_path)
            trials = [
                TrialSummary(
                    trial_id=trial_id,
                    status="completed",
                    best_request_rate=rate,
                    best_request_rate_per_gpu=rate_per_gpu,
                    result_path=str(trial_file),
                    config_patch={"env_patch": {}, "flag_patch": dict(flags)},
                )
                for (trial_id, rate, rate_per_gpu, flags), trial_file in zip(
                    trial_rows, result_paths
                )
            ]
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate=4.0,
                best_request_rate_per_gpu=2.0,
                trials=trials,
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 8192},
                state=state,
            )

            for candidate in context["experiment_plan"]["candidate_actions"]:
                proposed_flags = candidate["config_patch"]["flag_patch"]
                for flag in forbidden_flags:
                    self.assertNotIn(flag, proposed_flags)

    def test_profile_driven_planner_prefers_decode_concurrency_relief(self) -> None:
        # Decode-only trace with a TPOT-only SLO. The single recorded trial's
        # infeasible probe lists only a TPOT failure reason, and the base
        # max-num-seqs is 64. The plan must lower max-num-seqs to 32.
        trace_overrides = {"request_mode": "decode_only"}
        slo_overrides = {
            "ttft_rule": None,
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
        }
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 4,
                "max-num-seqs": 64,
            },
            "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        tpot_failing_probe = {
            "threshold": 0.5,
            "feasible": False,
            "payload": {
                "request_count": 100,
                "pass_rate": 0.5,
                "request_rate": 2.0,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"tpot_ms>20.0": 50},
                },
            },
        }
        trial_result = {
            "status": "completed",
            "best_sampling_u": 0.25,
            "best_request_rate": 1.0,
            "best_pass_rate": 0.97,
            "probes": [tpot_failing_probe],
        }

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(
                workdir,
                trace_overrides=trace_overrides,
                slo_overrides=slo_overrides,
                engine_overrides=engine_overrides,
            )
            result_file = workdir / "trial-0001.json"
            result_file.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(study_path)
            only_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=1.0,
                best_request_rate_per_gpu=0.25,
                result_path=str(result_file),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_request_rate=1.0,
                best_request_rate_per_gpu=0.25,
                trials=[only_trial],
            )
            context = build_harness_context(
                study=study,
                window_summary={},
                state=state,
            )

            plan = context["experiment_plan"]
            self.assertEqual(plan["next_action"]["knob_family"], "max-num-seqs")
            self.assertEqual(
                plan["next_action"]["config_patch"]["flag_patch"],
                {"max-num-seqs": 32},
            )

    def test_prefill_convergence_stop_waits_for_sequence_concurrency_probe(self) -> None:
        # Four trials are recorded: three at tensor-parallel-size=8 that all report
        # the same best request rate, plus one TP=4/DP=2 trial with no feasible
        # rate. The harness must not stop; instead the plan must propose raising
        # max-num-batched-tokens above the base 8192 while keeping TP=8.
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 4,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 8192,
                "max-num-seqs": 64,
                "enable-chunked-prefill": True,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [4, 8],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [4, 8],
            },
        }

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir, engine_overrides=engine_overrides)

            def write_result(name: str, best_rate: float | None, pass_rate: float) -> Path:
                # A missing best_rate marks the trial (and its single probe) as infeasible.
                found_rate = best_rate is not None
                if found_rate:
                    stop_reason = ""
                else:
                    stop_reason = "slo_pass_rate_unrecoverable"
                probe = {
                    "threshold": 0.09375,
                    "feasible": found_rate,
                    "payload": {
                        "request_rate": best_rate,
                        "pass_rate": pass_rate,
                        "early_stop_reason": stop_reason,
                        "latency_summary": {
                            "failed_reason_counts": {"ttft_ms>4000.0": 32}
                        },
                    },
                }
                document = {
                    "status": "completed",
                    "best_sampling_u": 0.091796875 if found_rate else None,
                    "best_request_rate": best_rate,
                    "best_pass_rate": pass_rate if found_rate else None,
                    "probes": [probe],
                }
                target = workdir / f"{name}.json"
                target.write_text(json.dumps(document), encoding="utf-8")
                return target

            study = load_study_spec(study_path)

            tp8_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=2.303,
                best_request_rate_per_gpu=0.288,
                best_pass_rate=0.952,
                result_path=str(write_result("trial-0001", 2.303, 0.952)),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                    },
                },
            )
            tp8_large_quantum_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=8,
                best_request_rate=2.303,
                best_request_rate_per_gpu=0.288,
                best_pass_rate=0.953,
                result_path=str(write_result("trial-0002", 2.303, 0.953)),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 8,
                        "max-num-batched-tokens": 32768,
                    },
                },
            )
            tp4_dp2_trial = TrialSummary(
                trial_id="trial-0003",
                status="completed",
                parallel_size=8,
                result_path=str(write_result("trial-0003", None, 0.0)),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 4,
                        "data-parallel-size": 2,
                    },
                },
            )
            tp8_mid_quantum_trial = TrialSummary(
                trial_id="trial-0004",
                status="completed",
                parallel_size=8,
                best_request_rate=2.303,
                best_request_rate_per_gpu=0.288,
                best_pass_rate=0.954,
                result_path=str(write_result("trial-0004", 2.303, 0.954)),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                        "max-num-batched-tokens": 12288,
                    },
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=8,
                best_sampling_u=0.091796875,
                best_request_rate=2.303,
                best_request_rate_per_gpu=0.288,
                trials=[
                    tp8_trial,
                    tp8_large_quantum_trial,
                    tp4_dp2_trial,
                    tp8_mid_quantum_trial,
                ],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000},
                state=state,
            )

            stop_decision = context["harness_stop"]
            self.assertFalse(stop_decision["should_stop"])
            self.assertEqual(
                stop_decision["reason"],
                "experiment_plan_has_high_value_candidate",
            )
            action = context["experiment_plan"]["next_action"]
            self.assertEqual(action["knob_family"], "prefill-scheduler-interaction")
            self.assertEqual(action["action_id"], "raise_prefill_quantum_with_chunked_prefill")
            proposed_flags = action["config_patch"]["flag_patch"]
            self.assertEqual(proposed_flags["tensor-parallel-size"], 8)
            self.assertGreater(proposed_flags["max-num-batched-tokens"], 8192)

    def test_prefill_scheduler_lowers_quantum_by_normalized_ratio(self) -> None:
        # Topology is pinned to TP=8/DP=1 and the base max-num-batched-tokens is
        # 32768 with a prompt p95 of 8192. The plan must lower the prefill quantum
        # and report a target quantum ratio below the current one.
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 8,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 32768,
                "max-num-seqs": 8,
                "enable-chunked-prefill": True,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [8],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [8],
            },
        }
        feasible_probe = {
            "threshold": 0.5,
            "feasible": True,
            "payload": {
                "request_rate": 2.0,
                "pass_rate": 0.95,
                "latency_summary": {
                    "failed_reason_counts": {"ttft_ms>4000.0": 24},
                },
            },
        }
        trial_result = {
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 0.95,
            "probes": [feasible_probe],
        }
        window_summary = {"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0}

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir, engine_overrides=engine_overrides)
            result_file = workdir / "trial-0001.json"
            result_file.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(study_path)
            only_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                result_path=str(result_file),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                trials=[only_trial],
            )
            context = build_harness_context(
                study=study,
                window_summary=window_summary,
                state=state,
            )

            action = context["experiment_plan"]["next_action"]
            proposed_flags = action["config_patch"]["flag_patch"]
            self.assertEqual(action["knob_family"], "prefill-scheduler-interaction")
            self.assertEqual(action["action_id"], "lower_prefill_quantum_with_chunked_prefill")
            self.assertLess(proposed_flags["max-num-batched-tokens"], 32768)
            score_factors = action["score_factors"]
            self.assertLess(
                score_factors["prefill_quantum_ratio_target"],
                score_factors["prefill_quantum_ratio_current"],
            )

    def test_prefill_scheduler_quantum_step_scales_with_prompt_length(self) -> None:
        # The same single-trial study is planned twice, once per prompt p95. The
        # max-num-batched-tokens proposed for the longer prompts must be larger
        # than the one proposed for the shorter prompts.
        # The result payload is identical for both runs, so serialize it once.
        result_text = json.dumps(
            {
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 2.0,
                "best_pass_rate": 0.95,
                "probes": [
                    {
                        "threshold": 0.5,
                        "feasible": True,
                        "payload": {
                            "request_rate": 2.0,
                            "pass_rate": 0.95,
                            "latency_summary": {
                                "failed_reason_counts": {"ttft_ms>4000.0": 24}
                            },
                        },
                    }
                ],
            }
        )

        def planned_quantum(prompt_p95: int) -> int:
            # Build a fresh study in its own temp dir and return the proposed quantum.
            with tempfile.TemporaryDirectory() as tmp:
                workdir = Path(tmp)
                # Overrides are rebuilt on every call so each study gets its own dicts.
                engine_overrides = {
                    "base_flags": {
                        "host": "127.0.0.1",
                        "port": 8000,
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                        "max-num-batched-tokens": 32768,
                        "max-num-seqs": 8,
                        "enable-chunked-prefill": True,
                    },
                    "tunable_flags": [
                        "tensor-parallel-size",
                        "data-parallel-size",
                        "max-num-batched-tokens",
                        "max-num-seqs",
                        "enable-chunked-prefill",
                    ],
                    "topology_constraints": {
                        "allowed_tensor_parallel_sizes": [8],
                        "allowed_data_parallel_sizes": [1],
                        "allowed_tp_dp_products": [8],
                    },
                }
                study_path = _write_study_assets(workdir, engine_overrides=engine_overrides)
                result_file = workdir / "trial-0001.json"
                result_file.write_text(result_text, encoding="utf-8")

                study = load_study_spec(study_path)
                only_trial = TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.0,
                    best_request_rate_per_gpu=0.25,
                    result_path=str(result_file),
                    config_patch={"env_patch": {}, "flag_patch": {}},
                )
                state = StudyState(
                    study_id=study.study_id,
                    best_trial_id="trial-0001",
                    best_parallel_size=8,
                    best_request_rate=2.0,
                    best_request_rate_per_gpu=0.25,
                    trials=[only_trial],
                )
                context = build_harness_context(
                    study=study,
                    window_summary={
                        "prompt_tokens_p95": prompt_p95,
                        "prompt_tail_ratio_p95_p50": 4.0,
                    },
                    state=state,
                )
                action = context["experiment_plan"]["next_action"]
                self.assertEqual(action["knob_family"], "prefill-scheduler-interaction")
                return action["config_patch"]["flag_patch"]["max-num-batched-tokens"]

        targets: list[int] = [planned_quantum(prompt_p95) for prompt_p95 in (8192, 16384)]
        short_prompt_target, long_prompt_target = targets
        self.assertGreater(long_prompt_target, short_prompt_target)

    def test_prefill_scheduler_coverage_precedes_gmu_microtune(self) -> None:
        # Base flags set gpu-memory-utilization but no max-num-batched-tokens or
        # enable-chunked-prefill, although both are tunable. With TP=2 and TP=4
        # already recorded, the plan must seed the chunked-prefill quantum via the
        # prefill-scheduler family, and no candidate may use a standalone
        # "enable-chunked-prefill" knob family.
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 2,
                "data-parallel-size": 1,
                "gpu-memory-utilization": 0.7,
                "max-num-seqs": 8,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "gpu-memory-utilization",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [2, 4],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [2, 4],
            },
        }

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(
                workdir,
                engine_overrides=engine_overrides,
                trace_overrides={"max_concurrency": 64},
            )

            def write_result(name: str, request_rate: float) -> Path:
                # Persist a completed result with one feasible probe at the given rate.
                probe = {
                    "threshold": 0.5,
                    "feasible": True,
                    "payload": {
                        "request_rate": request_rate,
                        "pass_rate": 0.95,
                        "latency_summary": {
                            "failed_reason_counts": {"ttft_ms>4000.0": 24}
                        },
                    },
                }
                document = {
                    "status": "completed",
                    "best_sampling_u": 0.5,
                    "best_request_rate": request_rate,
                    "best_pass_rate": 0.95,
                    "probes": [probe],
                }
                target = workdir / f"{name}.json"
                target.write_text(json.dumps(document), encoding="utf-8")
                return target

            study = load_study_spec(study_path)
            tp2_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=2,
                best_request_rate=4.05,
                best_request_rate_per_gpu=2.025,
                result_path=str(write_result("trial-0001", 4.05)),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            tp4_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=4,
                best_request_rate=8.0,
                best_request_rate_per_gpu=2.0,
                result_path=str(write_result("trial-0002", 8.0)),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 4},
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=2,
                best_request_rate=4.05,
                best_request_rate_per_gpu=2.025,
                trials=[tp2_trial, tp4_trial],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 7774, "prompt_tail_ratio_p95_p50": 3.0},
                state=state,
            )

            action = context["experiment_plan"]["next_action"]
            self.assertEqual(action["knob_family"], "prefill-scheduler-interaction")
            self.assertEqual(action["action_id"], "seed_chunked_prefill_quantum")
            coverage_bonus = action["score_factors"]["uncovered_scheduler_dimension_bonus"]
            self.assertGreater(coverage_bonus, 0.0)

            families = set()
            for candidate in context["experiment_plan"]["candidate_actions"]:
                families.add(candidate["knob_family"])
            self.assertNotIn("enable-chunked-prefill", families)

    def test_prefill_scheduler_admission_pressure_only_uses_normalized_seq_cap(self) -> None:
        """Check that admission pressure is raised through ``max-num-seqs`` only.

        The study fixes topology at TP=8 / DP=1, starts from ``max-num-seqs=8``
        with chunked prefill enabled, and caps trace concurrency at 64.  The one
        recorded trial carries an infeasible probe that stopped early with
        ``slo_pass_rate_unrecoverable``.  The planner's next action must be
        ``raise_admission_pressure_with_chunked_prefill``, patching
        ``max-num-seqs`` to 16 and leaving ``max-num-batched-tokens`` untouched.
        """
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)

            # Engine fixture: single allowed topology, low sequence cap.
            base_flags = {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 8,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 8192,
                "max-num-seqs": 8,
                "enable-chunked-prefill": True,
            }
            tunable_flags = [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ]
            topology_constraints = {
                "allowed_tensor_parallel_sizes": [8],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [8],
            }
            study_path = _write_study_assets(
                root,
                trace_overrides={"max_concurrency": 64},
                engine_overrides={
                    "base_flags": base_flags,
                    "tunable_flags": tunable_flags,
                    "topology_constraints": topology_constraints,
                },
            )

            # Persisted trial result: one infeasible probe, no per-reason failure counts.
            infeasible_probe = {
                "threshold": 0.5,
                "feasible": False,
                "payload": {
                    "request_rate": 2.0,
                    "pass_rate": 0.5,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
            trial_result = {
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 2.0,
                "best_pass_rate": 0.5,
                "probes": [infeasible_probe],
            }
            result_path = root / "trial-0001.json"
            result_path.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(study_path)
            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                result_path=str(result_path),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                trials=[baseline_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0},
                state=state,
            )

            next_action = context["experiment_plan"]["next_action"]
            patched_flags = next_action["config_patch"]["flag_patch"]
            self.assertEqual(next_action["knob_family"], "prefill-scheduler-interaction")
            self.assertEqual(
                next_action["action_id"],
                "raise_admission_pressure_with_chunked_prefill",
            )
            self.assertEqual(patched_flags["max-num-seqs"], 16)
            self.assertNotIn("max-num-batched-tokens", patched_flags)

            factors = next_action["score_factors"]
            self.assertEqual(factors["admission_pressure_direction"], "raise")
            # Raising pressure means the target ratio sits above the current one.
            self.assertGreater(
                factors["admission_pressure_ratio_target"],
                factors["admission_pressure_ratio_current"],
            )

    def test_prefill_scheduler_lowers_excess_admission_pressure(self) -> None:
        """Check that an oversized ``max-num-seqs`` is reduced by the planner.

        The study fixes topology at TP=8 / DP=1 and starts from
        ``max-num-seqs=128`` with chunked prefill enabled.  The recorded trial
        has a feasible probe whose failures are all ``ttft_ms>4000.0``.  The
        planner's next action must be
        ``lower_admission_pressure_with_chunked_prefill``, proposing a
        ``max-num-seqs`` below 128 without patching ``max-num-batched-tokens``.
        """
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)

            # Engine fixture: single allowed topology, very high sequence cap.
            base_flags = {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 8,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 8192,
                "max-num-seqs": 128,
                "enable-chunked-prefill": True,
            }
            tunable_flags = [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ]
            topology_constraints = {
                "allowed_tensor_parallel_sizes": [8],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [8],
            }
            study_path = _write_study_assets(
                root,
                trace_overrides={"max_concurrency": 64},
                engine_overrides={
                    "base_flags": base_flags,
                    "tunable_flags": tunable_flags,
                    "topology_constraints": topology_constraints,
                },
            )

            # Persisted trial result: feasible probe with TTFT-only failures.
            feasible_probe = {
                "threshold": 0.5,
                "feasible": True,
                "payload": {
                    "request_rate": 2.0,
                    "pass_rate": 0.95,
                    "latency_summary": {
                        "failed_reason_counts": {"ttft_ms>4000.0": 24}
                    },
                },
            }
            trial_result = {
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 2.0,
                "best_pass_rate": 0.95,
                "probes": [feasible_probe],
            }
            result_path = root / "trial-0001.json"
            result_path.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(study_path)
            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                result_path=str(result_path),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                trials=[baseline_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0},
                state=state,
            )

            next_action = context["experiment_plan"]["next_action"]
            patched_flags = next_action["config_patch"]["flag_patch"]
            self.assertEqual(next_action["knob_family"], "prefill-scheduler-interaction")
            self.assertEqual(
                next_action["action_id"],
                "lower_admission_pressure_with_chunked_prefill",
            )
            # The proposed cap must be strictly below the configured 128.
            self.assertGreater(128, patched_flags["max-num-seqs"])
            self.assertNotIn("max-num-batched-tokens", patched_flags)

            factors = next_action["score_factors"]
            self.assertEqual(factors["admission_pressure_direction"], "lower")
            # Lowering pressure means the current ratio sits above the target one.
            self.assertGreater(
                factors["admission_pressure_ratio_current"],
                factors["admission_pressure_ratio_target"],
            )

    def test_prefill_scheduler_negative_applicability_matrix(self) -> None:
        """Check three setups in which no prefill-scheduler action is offered.

        Each variant pairs trace overrides with a window summary:

        * ``request_mode`` set to ``decode_only`` with a long-tailed prompt window,
        * a long-tailed prompt window reporting a 0.75 repeated-token ratio under
          ``prefix_cache``,
        * a window with ``prompt_tokens_p95`` of 2048 and a tail ratio of 1.0.

        For every variant the candidate actions must not contain the
        ``prefill-scheduler-interaction`` knob family.
        """
        cases = [
            (
                {"request_mode": "decode_only"},
                {"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0},
            ),
            (
                {},
                {
                    "prompt_tokens_p95": 8192,
                    "prompt_tail_ratio_p95_p50": 4.0,
                    "prefix_cache": {"repeated_token_ratio_estimate": 0.75},
                },
            ),
            (
                {},
                {"prompt_tokens_p95": 2048, "prompt_tail_ratio_p95_p50": 1.0},
            ),
        ]

        # The recorded trial result is the same for every variant; serialising it
        # once is safe because the resulting string is immutable.
        serialized_result = json.dumps(
            {
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 2.0,
                "best_pass_rate": 0.95,
                "probes": [
                    {
                        "threshold": 0.5,
                        "feasible": True,
                        "payload": {
                            "request_rate": 2.0,
                            "pass_rate": 0.95,
                            "latency_summary": {
                                "failed_reason_counts": {"ttft_ms>4000.0": 24}
                            },
                        },
                    }
                ],
            }
        )

        for case_trace_overrides, case_window in cases:
            with self.subTest(trace_overrides=case_trace_overrides, window_summary=case_window):
                with tempfile.TemporaryDirectory() as scratch:
                    root = Path(scratch)

                    # Build the engine overrides afresh for each variant so no
                    # dict object is shared between calls.
                    engine_overrides = {
                        "base_flags": {
                            "host": "127.0.0.1",
                            "port": 8000,
                            "tensor-parallel-size": 8,
                            "data-parallel-size": 1,
                            "max-num-batched-tokens": 8192,
                            "max-num-seqs": 8,
                            "enable-chunked-prefill": True,
                        },
                        "tunable_flags": [
                            "tensor-parallel-size",
                            "data-parallel-size",
                            "max-num-batched-tokens",
                            "max-num-seqs",
                            "enable-chunked-prefill",
                        ],
                        "topology_constraints": {
                            "allowed_tensor_parallel_sizes": [8],
                            "allowed_data_parallel_sizes": [1],
                            "allowed_tp_dp_products": [8],
                        },
                    }
                    study_path = _write_study_assets(
                        root,
                        trace_overrides=case_trace_overrides,
                        engine_overrides=engine_overrides,
                    )

                    result_path = root / "trial-0001.json"
                    result_path.write_text(serialized_result, encoding="utf-8")

                    study = load_study_spec(study_path)
                    baseline_trial = TrialSummary(
                        trial_id="trial-0001",
                        status="completed",
                        parallel_size=8,
                        best_request_rate=2.0,
                        best_request_rate_per_gpu=0.25,
                        result_path=str(result_path),
                        config_patch={"env_patch": {}, "flag_patch": {}},
                    )
                    state = StudyState(
                        study_id=study.study_id,
                        best_trial_id="trial-0001",
                        best_parallel_size=8,
                        best_request_rate=2.0,
                        best_request_rate_per_gpu=0.25,
                        trials=[baseline_trial],
                    )

                    context = build_harness_context(
                        study=study,
                        window_summary=case_window,
                        state=state,
                    )

                    offered_families = set()
                    for candidate in context["experiment_plan"]["candidate_actions"]:
                        offered_families.add(candidate["knob_family"])
                    self.assertNotIn("prefill-scheduler-interaction", offered_families)

    def test_prefill_scheduler_does_not_preempt_open_topology_frontier(self) -> None:
        """Check that topology is proposed before any prefill-scheduler action.

        The study starts at TP=2 / DP=1 and allows TP in {2, 4}, DP in {1, 2}
        and TP*DP products in {4, 8}.  The only recorded trial patched
        ``data-parallel-size`` to 2 (parallel size 4).  The next action must
        belong to the ``topology`` family and patch TP=4 / DP=2, and the
        ``prefill-scheduler-interaction`` family must be absent from the
        candidate actions.
        """
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)

            base_flags = {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 2,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 8192,
                "max-num-seqs": 8,
                "enable-chunked-prefill": True,
            }
            tunable_flags = [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ]
            topology_constraints = {
                "allowed_tensor_parallel_sizes": [2, 4],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [4, 8],
            }
            study_path = _write_study_assets(
                root,
                engine_overrides={
                    "base_flags": base_flags,
                    "tunable_flags": tunable_flags,
                    "topology_constraints": topology_constraints,
                },
            )

            # Persisted trial result: feasible probe with TTFT-only failures.
            feasible_probe = {
                "threshold": 0.5,
                "feasible": True,
                "payload": {
                    "request_rate": 2.0,
                    "pass_rate": 0.95,
                    "latency_summary": {
                        "failed_reason_counts": {"ttft_ms>4000.0": 24}
                    },
                },
            }
            trial_result = {
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 2.0,
                "best_pass_rate": 0.95,
                "probes": [feasible_probe],
            }
            result_path = root / "trial-0001.json"
            result_path.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(study_path)
            measured_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=4,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.5,
                result_path=str(result_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"data-parallel-size": 2},
                },
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=4,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.5,
                trials=[measured_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0},
                state=state,
            )

            next_action = context["experiment_plan"]["next_action"]
            self.assertEqual(next_action["knob_family"], "topology")
            expected_patch = {"tensor-parallel-size": 4, "data-parallel-size": 2}
            self.assertEqual(next_action["config_patch"]["flag_patch"], expected_patch)

            offered_families = set()
            for candidate in context["experiment_plan"]["candidate_actions"]:
                offered_families.add(candidate["knob_family"])
            self.assertNotIn("prefill-scheduler-interaction", offered_families)

    def test_prefill_scheduler_not_active_for_short_prompt_workload(self) -> None:
        """Check that a short-prompt window yields no prefill-scheduler candidate.

        The study fixes topology at TP=8 / DP=1 with
        ``max-num-batched-tokens=32768`` and ``max-num-seqs=8``.  The window
        summary reports ``prompt_tokens_p95`` of 2048 and a p95/p50 tail ratio
        of 1.0.  The candidate actions must not include the
        ``prefill-scheduler-interaction`` knob family.
        """
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)

            base_flags = {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 8,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 32768,
                "max-num-seqs": 8,
                "enable-chunked-prefill": True,
            }
            tunable_flags = [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ]
            topology_constraints = {
                "allowed_tensor_parallel_sizes": [8],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [8],
            }
            study_path = _write_study_assets(
                root,
                engine_overrides={
                    "base_flags": base_flags,
                    "tunable_flags": tunable_flags,
                    "topology_constraints": topology_constraints,
                },
            )

            # Persisted trial result: feasible probe with TTFT-only failures.
            feasible_probe = {
                "threshold": 0.5,
                "feasible": True,
                "payload": {
                    "request_rate": 2.0,
                    "pass_rate": 0.95,
                    "latency_summary": {
                        "failed_reason_counts": {"ttft_ms>4000.0": 24}
                    },
                },
            }
            trial_result = {
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 2.0,
                "best_pass_rate": 0.95,
                "probes": [feasible_probe],
            }
            result_path = root / "trial-0001.json"
            result_path.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(study_path)
            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                result_path=str(result_path),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                trials=[baseline_trial],
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048, "prompt_tail_ratio_p95_p50": 1.0},
                state=state,
            )

            offered_families = set()
            for candidate in context["experiment_plan"]["candidate_actions"]:
                offered_families.add(candidate["knob_family"])
            self.assertNotIn("prefill-scheduler-interaction", offered_families)

    def test_prefill_sequence_probe_followed_by_joint_runtime_probe(self) -> None:
        """Check that the planner keeps going after three equal-rate trials.

        Three completed trials at parallel size 8 all report a request rate of
        2.303: a TP=8 / DP=1 topology trial, a ``max-num-seqs=96`` trial and a
        ``max-num-batched-tokens=12288`` trial.  With a window whose
        ``prompt_tokens_p95`` is 24000, the harness must not stop and must
        propose ``raise_prefill_quantum_with_chunked_prefill`` at TP=8 with a
        ``max-num-batched-tokens`` strictly between 8192 and 24000.
        """
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)

            base_flags = {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 4,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 8192,
                "max-num-seqs": 64,
                "enable-chunked-prefill": True,
            }
            tunable_flags = [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ]
            topology_constraints = {
                "allowed_tensor_parallel_sizes": [4, 8],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [4, 8],
            }
            study_path = _write_study_assets(
                root,
                engine_overrides={
                    "base_flags": base_flags,
                    "tunable_flags": tunable_flags,
                    "topology_constraints": topology_constraints,
                },
            )

            def persist_result(trial_id: str) -> str:
                # Every trial gets the same result body; only the file name differs.
                body = {
                    "status": "completed",
                    "best_sampling_u": 0.091796875,
                    "best_request_rate": 2.303,
                    "best_pass_rate": 0.951,
                    "probes": [
                        {
                            "threshold": 0.09375,
                            "feasible": True,
                            "payload": {
                                "request_rate": 2.303,
                                "pass_rate": 0.951,
                                "latency_summary": {
                                    "failed_reason_counts": {"ttft_ms>4000.0": 32}
                                },
                            },
                        }
                    ],
                }
                target = root / f"{trial_id}.json"
                target.write_text(json.dumps(body), encoding="utf-8")
                return str(target)

            def completed_trial(trial_id: str, pass_rate: float, flag_patch: dict) -> TrialSummary:
                # The trials differ only in id, pass rate and the flags they patched.
                return TrialSummary(
                    trial_id=trial_id,
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.303,
                    best_request_rate_per_gpu=0.288,
                    best_pass_rate=pass_rate,
                    result_path=persist_result(trial_id),
                    config_patch={"env_patch": {}, "flag_patch": flag_patch},
                )

            study = load_study_spec(study_path)
            history = [
                completed_trial(
                    "trial-0001",
                    0.952,
                    {
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                    },
                ),
                completed_trial(
                    "trial-0002",
                    0.950,
                    {
                        "tensor-parallel-size": 8,
                        "max-num-seqs": 96,
                    },
                ),
                completed_trial(
                    "trial-0003",
                    0.950,
                    {
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                        "max-num-batched-tokens": 12288,
                    },
                ),
            ]
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=8,
                best_sampling_u=0.091796875,
                best_request_rate=2.303,
                best_request_rate_per_gpu=0.288,
                trials=history,
            )

            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000},
                state=state,
            )

            stop_decision = context["harness_stop"]
            self.assertFalse(stop_decision["should_stop"])
            self.assertEqual(
                stop_decision["reason"],
                "experiment_plan_has_high_value_candidate",
            )

            next_action = context["experiment_plan"]["next_action"]
            patched_flags = next_action["config_patch"]["flag_patch"]
            self.assertEqual(next_action["knob_family"], "prefill-scheduler-interaction")
            self.assertEqual(
                next_action["action_id"],
                "raise_prefill_quantum_with_chunked_prefill",
            )
            self.assertEqual(patched_flags["tensor-parallel-size"], 8)
            # The proposed token budget must land strictly between 8192 and 24000.
            self.assertGreater(patched_flags["max-num-batched-tokens"], 8192)
            self.assertLess(patched_flags["max-num-batched-tokens"], 24000)

    def test_slo_unrecoverable_does_not_mask_latency_bottleneck(self) -> None:
        """Check that early-stop counts do not make queueing the top hypothesis.

        The infeasible probe lists 263 ``slo_pass_rate_unrecoverable`` failures
        next to 2 TTFT and 14 TPOT failures.  The first bottleneck hypothesis
        must not be ``admission_or_queueing``, and the guided proposal must
        patch only ``tensor-parallel-size`` to 2.
        """
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)

            slo_overrides = {
                "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
                "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 25},
            }
            engine_overrides = {
                "tunable_flags": [
                    "tensor-parallel-size",
                    "max-num-seqs",
                ],
                "topology_constraints": {
                    "allowed_tensor_parallel_sizes": [1, 2, 4],
                    "allowed_tp_dp_products": [1, 2, 4],
                },
            }
            study_path = _write_study_assets(
                root,
                slo_overrides=slo_overrides,
                engine_overrides=engine_overrides,
            )

            # Probe that stopped early: most failures carry the early-stop reason.
            unrecoverable_probe = {
                "threshold": 0.015625,
                "feasible": False,
                "payload": {
                    "request_count": 290,
                    "pass_rate": 0.041,
                    "request_rate": 0.483,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": {
                        "failed_reason_counts": {
                            "ttft_ms>4000.0": 2,
                            "tpot_ms>25.0": 14,
                            "slo_pass_rate_unrecoverable": 263,
                        }
                    },
                },
            }
            # Probe at a much lower threshold that passed without failures.
            clean_probe = {
                "threshold": 0.001953125,
                "feasible": True,
                "payload": {
                    "request_count": 39,
                    "pass_rate": 1.0,
                    "request_rate": 0.065,
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
            trial_result = {
                "status": "completed",
                "best_request_rate": 0.065,
                "best_request_rate_per_gpu": 0.065,
                "best_pass_rate": 1.0,
                "probes": [unrecoverable_probe, clean_probe],
            }
            result_path = root / "trial-0001.json"
            result_path.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(study_path)
            baseline_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=0.065,
                best_request_rate_per_gpu=0.065,
                best_pass_rate=1.0,
                result_path=str(result_path),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_request_rate=0.065,
                best_request_rate_per_gpu=0.065,
                trials=[baseline_trial],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
                state=state,
            )

            leading_hypothesis = context["bottleneck_hypotheses"][0]
            self.assertNotEqual(leading_hypothesis["name"], "admission_or_queueing")

            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            self.assertEqual(proposal.config_patch.flag_patch, {"tensor-parallel-size": 2})

    def test_harness_excludes_topology_above_visible_gpu_count(self) -> None:
        """Check that TP=8 is never suggested when seven devices are listed.

        ``CUDA_VISIBLE_DEVICES`` names seven device ids while the topology
        constraints allow tensor-parallel sizes up to 8.  After trials at the
        default topology, TP=2 and TP=4, no entry of
        ``context["candidate_actions"]`` may patch ``tensor-parallel-size`` to
        8, and the guided proposal (if any) must not do so either.
        """
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)

            engine_overrides = {
                "base_envs": {"CUDA_VISIBLE_DEVICES": "0,1,2,4,5,6,7"},
                "tunable_flags": ["tensor-parallel-size"],
                "topology_constraints": {
                    "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                    "allowed_tp_dp_products": [1, 2, 4, 8],
                },
            }
            study_path = _write_study_assets(root, engine_overrides=engine_overrides)

            # Only the best trial (trial-0003) has a persisted result file.
            tpot_limited_probe = {
                "threshold": 0.039,
                "feasible": False,
                "payload": {
                    "request_count": 100,
                    "pass_rate": 0.8,
                    "request_rate": 1.10,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": {
                        "failed_reason_counts": {"tpot_ms>25.0": 20}
                    },
                },
            }
            trial_result = {
                "status": "completed",
                "best_request_rate": 1.078,
                "best_pass_rate": 0.958,
                "probes": [tpot_limited_probe],
            }
            result_path = root / "trial-0003.json"
            result_path.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(study_path)
            history = [
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    best_request_rate=0.065,
                    best_request_rate_per_gpu=0.065,
                    config_patch={"env_patch": {}, "flag_patch": {}},
                ),
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    best_request_rate=0.398,
                    best_request_rate_per_gpu=0.199,
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {"tensor-parallel-size": 2},
                    },
                ),
                TrialSummary(
                    trial_id="trial-0003",
                    status="completed",
                    best_request_rate=1.078,
                    best_request_rate_per_gpu=0.2695,
                    result_path=str(result_path),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {"tensor-parallel-size": 4},
                    },
                ),
            ]
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0003",
                best_request_rate=1.078,
                best_request_rate_per_gpu=0.2695,
                trials=history,
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
                state=state,
            )

            candidates = context["candidate_actions"]
            offers_tp8 = any(
                candidate["config_patch"]["flag_patch"].get("tensor-parallel-size") == 8
                for candidate in candidates
            )
            self.assertFalse(offers_tp8)

            # A missing proposal is acceptable; a proposal for TP=8 is not.
            proposal = build_harness_guided_proposal(context)
            self.assertFalse(
                proposal is not None
                and proposal.config_patch.flag_patch.get("tensor-parallel-size") == 8
            )

    def test_harness_stop_blocked_until_slo_driven_topology_frontier_is_measured(self) -> None:
        # The TP=2 incumbent (trial-0002) carries an infeasible probe whose only
        # recorded failure reason is a tpot_ms violation, and the two later
        # max-num-seqs trials did not beat it. The harness must not stop and must
        # report that the topology frontier still requires a probe.
        tpot_limited_probe = {
            "threshold": 0.75,
            "feasible": False,
            "payload": {
                "request_count": 100,
                "pass_rate": 0.6,
                "request_rate": 3.0,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"tpot_ms>25.0": 40}
                },
            },
        }
        incumbent_result = {
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 0.96,
            "probes": [tpot_limited_probe],
        }
        topology = {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_tp_dp_products": [1, 2, 4],
        }
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            spec_file = _write_study_assets(
                root,
                engine_overrides={
                    "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
                    "topology_constraints": topology,
                },
            )
            study = load_study_spec(spec_file)

            # Persist the incumbent's result so the harness can read its probes.
            result_file = root / "trial-0002.json"
            result_file.write_text(json.dumps(incumbent_result), encoding="utf-8")

            baseline = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=0.5,
                best_request_rate_per_gpu=0.5,
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            incumbent = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=2.0,
                best_request_rate_per_gpu=1.0,
                result_path=str(result_file),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 2},
                },
            )
            seqs_8 = TrialSummary(
                trial_id="trial-0003",
                status="completed",
                best_request_rate=1.98,
                best_request_rate_per_gpu=0.99,
                config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 8}},
            )
            seqs_16 = TrialSummary(
                trial_id="trial-0004",
                status="completed",
                best_request_rate=1.98,
                best_request_rate_per_gpu=0.99,
                config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 16}},
            )
            history = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_request_rate=2.0,
                best_request_rate_per_gpu=1.0,
                trials=[baseline, incumbent, seqs_8, seqs_16],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
                state=history,
            )
            self.assertFalse(context["harness_stop"]["should_stop"])
            self.assertEqual(context["harness_stop"]["reason"], "topology_frontier_requires_probe")

    def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
        # With a 0..8192 input-length filter, two requests (hints 1000 and 5000)
        # are expected to survive, and the filter must be echoed in the prompt.
        length_filter = {
            "min_input_tokens": 0,
            "max_input_tokens": 8192,
        }
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(
                Path(workdir),
                trace_overrides={"input_length_filter": length_filter},
            )
            study = load_study_spec(spec_file)
            window, kept = load_trace_requests(study, study_spec_path=spec_file)
            summary = summarize_window(kept, window)

            self.assertEqual(len(kept), 2)
            hints = [request.prompt_tokens_hint for request in kept]
            self.assertEqual(hints, [1000, 5000])
            self.assertEqual(summary["request_count"], 2)
            self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
            for summary_key in ("prefix_cache", "arrival_burst_ratio_p95_to_mean"):
                self.assertIn(summary_key, summary)

            rendered = build_prompt(
                study=study,
                window_summary=summary,
                state=StudyState(study_id=study.study_id),
                capability_profile=None,
            )
            for fragment in ('"input_length_filter"', '"max_input_tokens": 8192'):
                self.assertIn(fragment, rendered)

    def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:
        # A filter whose minimum (8193) exceeds its maximum (8192) must be
        # rejected with a SpecError when the study spec is loaded.
        inverted_bounds = {
            "min_input_tokens": 8193,
            "max_input_tokens": 8192,
        }
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(
                Path(workdir),
                trace_overrides={"input_length_filter": inverted_bounds},
            )
            expect_spec_error = self.assertRaisesRegex(
                SpecError, "min_input_tokens must be <="
            )
            with expect_spec_error:
                load_study_spec(spec_file)

    def test_trace_rejects_non_positive_max_requests_per_probe(self) -> None:
        # max_requests_per_probe=0 must fail spec loading with a SpecError.
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(
                Path(workdir),
                trace_overrides={"max_requests_per_probe": 0},
            )
            self.assertRaisesRegex(
                SpecError,
                "max_requests_per_probe must be > 0",
                load_study_spec,
                spec_file,
            )

    def test_trace_rejects_invalid_replay_time_scale(self) -> None:
        # replay_time_scale=0.0 must fail spec loading with a SpecError.
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(
                Path(workdir),
                trace_overrides={"replay_time_scale": 0.0},
            )
            self.assertRaisesRegex(
                SpecError,
                "replay_time_scale must be > 0",
                load_study_spec,
                spec_file,
            )

    def test_decode_only_mode_is_loaded_and_prompt_mentions_it(self) -> None:
        # A decode_only study without a TTFT rule must load with
        # restart_engine_after_early_stop enabled (it is not set explicitly here),
        # and the rendered prompt must surface the mode and the missing TTFT SLO.
        tpot_only_slo = {
            "ttft_rule": None,
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
        }
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(
                Path(workdir),
                trace_overrides={"request_mode": "decode_only"},
                slo_overrides=tpot_only_slo,
            )
            study = load_study_spec(spec_file)
            self.assertEqual(study.trace.request_mode, "decode_only")
            self.assertTrue(study.trace.restart_engine_after_early_stop)

            window, requests = load_trace_requests(study, study_spec_path=spec_file)
            rendered = build_prompt(
                study=study,
                window_summary=summarize_window(requests, window),
                state=StudyState(study_id=study.study_id),
                capability_profile=None,
            )
            expected_fragments = (
                '"request_mode": "decode_only"',
                '"restart_engine_after_early_stop": true',
                "There is no TTFT SLO for this study.",
                "decode-only",
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, rendered)

    def test_decode_only_restart_after_early_stop_can_be_disabled(self) -> None:
        # An explicit restart_engine_after_early_stop=False must be honoured
        # for a decode_only study.
        trace_settings = {
            "request_mode": "decode_only",
            "restart_engine_after_early_stop": False,
        }
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(Path(workdir), trace_overrides=trace_settings)
            loaded = load_study_spec(spec_file)
            self.assertFalse(loaded.trace.restart_engine_after_early_stop)

    def test_chat_mode_does_not_restart_after_early_stop_by_default(self) -> None:
        # A study written with no overrides is expected to load in chat mode
        # with restart_engine_after_early_stop turned off.
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(Path(workdir))
            loaded = load_study_spec(spec_file)
            self.assertEqual(loaded.trace.request_mode, "chat")
            self.assertFalse(loaded.trace.restart_engine_after_early_stop)

    def test_decode_only_harness_defaults_to_decode_tpot(self) -> None:
        # With no trial history, a decode_only study must mark all four tunable
        # knob families as active and include the decode-specific proposal rules.
        knob_families = (
            "tensor-parallel-size",
            "data-parallel-size",
            "max-num-seqs",
            "max-num-batched-tokens",
        )
        topology = {
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_tp_dp_products": [8],
            "require_tp_dp_product_equals_gpu_count": True,
        }
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(
                Path(workdir),
                trace_overrides={"request_mode": "decode_only"},
                slo_overrides={
                    "ttft_rule": None,
                    "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
                },
                engine_overrides={
                    "tunable_flags": list(knob_families),
                    "topology_constraints": topology,
                },
            )
            study = load_study_spec(spec_file)
            window, requests = load_trace_requests(study, study_spec_path=spec_file)
            context = build_harness_context(
                study=study,
                window_summary=summarize_window(requests, window),
                state=StudyState(study_id=study.study_id),
            )

            # Collect the knob families the harness currently flags as active.
            active = set()
            for harness in context["knob_harnesses"]:
                if harness["active_now"]:
                    active.add(harness["knob_family"])
            for family in knob_families:
                self.assertIn(family, active)

            self.assertIn(
                "For decode_only studies, ignore TTFT",
                "\n".join(context["proposal_rules"]),
            )
            self.assertIn(
                "config_patch is applied to the study base config",
                "\n".join(context["proposal_rules"]),
            )

    def test_decode_topology_planner_prefers_dp_redistribution_and_preserves_ep(self) -> None:
        # Base config is TP=4 / DP=2 / EP=8 and the only trial has a TPOT-limited
        # infeasible probe. The planned action and the guided proposal must both
        # patch exactly TP=2 / DP=4, leaving expert-parallel-size out of the patch.
        expected_flags = {"tensor-parallel-size": 2, "data-parallel-size": 4}
        parallel_sizes = [1, 2, 4, 8]
        engine_settings = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "enable-expert-parallel": True,
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "expert-parallel-size": 8,
                "max-num-seqs": 192,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": list(parallel_sizes),
                "allowed_data_parallel_sizes": list(parallel_sizes),
                "allowed_expert_parallel_sizes": list(parallel_sizes),
                "require_tp_dp_product_equals_gpu_count": True,
                "require_ep_size_leq_tp_dp_product": True,
                "require_ep_size_divides_tp_dp_product": True,
                "require_enable_expert_parallel_when_ep_gt_one": True,
            },
        }
        trial_result = {
            "status": "completed",
            "best_request_rate": 0.47,
            "best_pass_rate": 0.98,
            "probes": [
                {
                    "threshold": 0.04,
                    "feasible": False,
                    "payload": {
                        "request_rate": 0.72,
                        "pass_rate": 0.3,
                        "early_stop_reason": "slo_pass_rate_unrecoverable",
                        "latency_summary": {
                            "failed_reason_counts": {"tpot_ms>40.0": 80}
                        },
                    },
                }
            ],
        }
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            spec_file = _write_study_assets(
                root,
                trace_overrides={"request_mode": "decode_only"},
                slo_overrides={
                    "ttft_rule": None,
                    "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 40},
                },
                engine_overrides=engine_settings,
            )
            result_file = root / "trial-0001-result.json"
            result_file.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(spec_file)
            only_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=0.47,
                best_request_rate_per_gpu=0.05875,
                best_pass_rate=0.98,
                result_path=str(result_file),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            history = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_request_rate=0.47,
                best_request_rate_per_gpu=0.05875,
                trials=[only_trial],
            )
            context = build_harness_context(
                study=study,
                window_summary={},
                state=history,
            )

            planned = context["experiment_plan"]["next_action"]
            self.assertEqual(planned["knob_family"], "topology")
            self.assertEqual(planned["config_patch"]["flag_patch"], expected_flags)

            proposal = build_harness_guided_proposal(context)
            self.assertIsNotNone(proposal)
            self.assertEqual(proposal.config_patch.flag_patch, expected_flags)

    def test_prompt_can_disable_harness_for_ablation(self) -> None:
        # After llm.use_harness is switched off in the spec file, the prompt must
        # keep the plain study/trial sections and contain none of the harness text.
        required_sections = (
            "Study context:",
            "Trial history:",
            "Known launch failures:",
        )
        forbidden_fragments = (
            '"paper_alignment"',
            "Harnesses:",
            "Disabled by llm.use_harness=false",
            "without harness hints",
            "Window summary:",
            "Parallel space candidates:",
            "Prioritize exploring legal topology changes",
        )
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(Path(workdir))

            # Rewrite the generated spec in place with the harness disabled.
            spec_doc = json.loads(spec_file.read_text(encoding="utf-8"))
            spec_doc["llm"]["use_harness"] = False
            spec_file.write_text(json.dumps(spec_doc), encoding="utf-8")

            study = load_study_spec(spec_file)
            window, requests = load_trace_requests(study, study_spec_path=spec_file)
            rendered = build_prompt(
                study=study,
                window_summary=summarize_window(requests, window),
                state=StudyState(study_id=study.study_id),
                capability_profile=None,
            )
            self.assertFalse(study.llm.use_harness)
            for section in required_sections:
                self.assertIn(section, rendered)
            for fragment in forbidden_fragments:
                self.assertNotIn(fragment, rendered)

    def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
        # The result file holds an infeasible probe failing on tpot_ms>20.0 followed
        # by a feasible probe that only records an elapsed-time stop. Diagnostics
        # must report decode_tpot, and the DP / max-num-seqs harnesses must be active.
        infeasible_probe = {
            "threshold": 0.1,
            "feasible": False,
            "payload": {
                "request_rate": 2.0,
                "pass_rate": 0.1,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"tpot_ms>20.0": 20}
                },
            },
        }
        feasible_probe = {
            "threshold": 0.01,
            "feasible": True,
            "payload": {
                "request_rate": 1.0,
                "pass_rate": 1.0,
                "early_stop_reason": "probe_elapsed_s>1200.0",
                "latency_summary": {
                    "failed_reason_counts": {"probe_elapsed_s>1200.0": 1}
                },
            },
        }
        trial_result = {
            "status": "completed",
            "best_request_rate": 1.0,
            "best_pass_rate": 1.0,
            "probes": [infeasible_probe, feasible_probe],
        }
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            spec_file = _write_study_assets(
                root,
                trace_overrides={"request_mode": "decode_only"},
                slo_overrides={
                    "ttft_rule": None,
                    "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
                },
                engine_overrides={
                    "tunable_flags": [
                        "tensor-parallel-size",
                        "data-parallel-size",
                        "max-num-seqs",
                    ]
                },
            )
            result_file = root / "trial-0001-result.json"
            result_file.write_text(json.dumps(trial_result), encoding="utf-8")

            study = load_study_spec(spec_file)
            prior_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                result_path=str(result_file),
            )
            context = build_harness_context(
                study=study,
                window_summary={},
                state=StudyState(study_id=study.study_id, trials=[prior_trial]),
            )

            first_diagnostic = context["recent_trial_diagnostics"][0]
            self.assertEqual(first_diagnostic["active_bottleneck"], "decode_tpot")

            # Collect the knob families the harness currently flags as active.
            active = set()
            for harness in context["knob_harnesses"]:
                if harness["active_now"]:
                    active.add(harness["knob_family"])
            self.assertIn("data-parallel-size", active)
            self.assertIn("max-num-seqs", active)

    def test_best_feasible_probe_record_keeps_partial_probe_evidence(self) -> None:
        # Of one infeasible and two feasible records, the expected winner is the
        # feasible record with threshold 0.017578125 and request rate 0.3833.
        infeasible = {
            "threshold": 0.03125,
            "request_rate": 0.72,
            "pass_rate": 0.3,
            "feasible": False,
        }
        lower_feasible = {
            "threshold": 0.015625,
            "request_rate": 0.3533,
            "pass_rate": 0.99,
            "feasible": True,
        }
        higher_feasible = {
            "threshold": 0.017578125,
            "request_rate": 0.3833,
            "pass_rate": 0.995,
            "feasible": True,
        }
        records = [infeasible, lower_feasible, higher_feasible]
        winner = _best_feasible_probe_record(records)
        self.assertIsNotNone(winner)
        self.assertEqual(winner["threshold"], 0.017578125)
        self.assertEqual(winner["request_rate"], 0.3833)

    def test_load_study_spec_rejects_mismatched_served_model_name(self) -> None:
        # The engine's served-model-name ("engine-name") disagrees with the model
        # section's served_model_name ("trace-name"), so loading must raise SpecError.
        engine_flags = {
            "host": "127.0.0.1",
            "port": 8000,
            "served-model-name": "engine-name",
        }
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(
                Path(workdir),
                engine_overrides={"base_flags": engine_flags},
            )

            # Rewrite the generated spec so the model section names a different model.
            spec_doc = json.loads(spec_file.read_text(encoding="utf-8"))
            spec_doc["model"]["served_model_name"] = "trace-name"
            spec_file.write_text(json.dumps(spec_doc), encoding="utf-8")

            expect_mismatch = self.assertRaisesRegex(SpecError, "must match engine.base_flags")
            with expect_mismatch:
                load_study_spec(spec_file)

    def test_bailian_endpoint_defaults(self) -> None:
        # Only provider and model are given; base_url and api_key_env are
        # expected to be filled in with the Bailian values asserted below.
        raw_endpoint = {"provider": "bailian", "model": "qwen-plus"}
        endpoint = LLMEndpointSpec.from_dict(raw_endpoint)
        self.assertEqual(endpoint.provider, "bailian")
        expected_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
        self.assertEqual(endpoint.base_url, expected_url)
        self.assertEqual(endpoint.api_key_env, "DASHSCOPE_API_KEY")

    def test_codex_endpoint_resolves_base_url_from_codex_config(self) -> None:
        # A fake $HOME/.codex/config.toml supplies the provider, base URL, wire API
        # and reasoning effort; the codex endpoint is expected to pick them up.
        config_lines = [
            'model_provider = "ipads"',
            'model_reasoning_effort = "high"',
            "",
            "[model_providers.ipads]",
            'base_url = "http://codex.example/v1"',
            'wire_api = "responses"',
        ]
        with tempfile.TemporaryDirectory() as workdir:
            fake_home = Path(workdir)
            codex_home = fake_home / ".codex"
            codex_home.mkdir(parents=True)
            config_file = codex_home / "config.toml"
            config_file.write_text('\n'.join(config_lines), encoding="utf-8")

            # Environment is cleared so that only HOME is visible during resolution.
            home_only_env = mock.patch.dict(os.environ, {"HOME": str(fake_home)}, clear=True)
            with home_only_env:
                endpoint = LLMEndpointSpec.from_dict({"provider": "codex", "model": "gpt-5.4"})

            self.assertEqual(endpoint.provider, "codex")
            self.assertEqual(endpoint.base_url, "http://codex.example/v1")
            self.assertEqual(endpoint.wire_api, "responses")
            self.assertFalse(endpoint.stream)
            self.assertEqual(endpoint.reasoning_effort, "high")
            self.assertEqual(endpoint.api_key_env, "OPENAI_API_KEY")

    def test_codex_stream_forces_chat_completions_wire_api(self) -> None:
        # The codex config asks for the "responses" wire API, but with stream=True
        # the endpoint is expected to end up on "chat.completions".
        config_lines = [
            'model_provider = "ipads"',
            "",
            "[model_providers.ipads]",
            'base_url = "http://codex.example/v1"',
            'wire_api = "responses"',
        ]
        streaming_request = {"provider": "codex", "model": "gpt-5.4", "stream": True}
        with tempfile.TemporaryDirectory() as workdir:
            fake_home = Path(workdir)
            codex_home = fake_home / ".codex"
            codex_home.mkdir(parents=True)
            config_file = codex_home / "config.toml"
            config_file.write_text('\n'.join(config_lines), encoding="utf-8")

            # Environment is cleared so that only HOME is visible during resolution.
            home_only_env = mock.patch.dict(os.environ, {"HOME": str(fake_home)}, clear=True)
            with home_only_env:
                endpoint = LLMEndpointSpec.from_dict(streaming_request)

            self.assertTrue(endpoint.stream)
            self.assertEqual(endpoint.wire_api, "chat.completions")

    def test_endpoint_stream_flag(self) -> None:
        # stream=True on a fully specified custom endpoint must be kept on the spec.
        raw_endpoint = {
            "provider": "custom",
            "base_url": "http://example/v1",
            "wire_api": "chat.completions",
            "stream": True,
            "model": "x",
            "api_key_env": "OPENAI_API_KEY",
        }
        parsed = LLMEndpointSpec.from_dict(raw_endpoint)
        self.assertTrue(parsed.stream)

    def test_extract_response_text_supports_responses_api_output(self) -> None:
        # A payload with an "output" list holding one message whose content is a
        # single output_text part must yield that part's text unchanged.
        text_part = {"type": "output_text", "text": '{"diagnosis":"ok"}'}
        message = {
            "type": "message",
            "content": [text_part],
        }
        extracted = _extract_response_text({"output": [message]})
        self.assertEqual(extracted, '{"diagnosis":"ok"}')

    def test_auth_headers_load_bailian_key_from_dotenv(self) -> None:
        # With an empty environment and the working directory patched to a folder
        # containing a .env file, the bearer token must come from that file.
        with tempfile.TemporaryDirectory() as workdir:
            fake_cwd = Path(workdir)
            dotenv_file = fake_cwd / ".env"
            dotenv_file.write_text('DASHSCOPE_API_KEY="dash-key"\n', encoding="utf-8")

            empty_env = mock.patch.dict(os.environ, {}, clear=True)
            cwd_patch = mock.patch("pathlib.Path.cwd", return_value=fake_cwd)
            with empty_env, cwd_patch:
                headers = _auth_headers("DASHSCOPE_API_KEY", "bailian")

            self.assertEqual(headers["Authorization"], "Bearer dash-key")

    def test_auth_headers_load_codex_auth_and_proxy(self) -> None:
        # A fake $HOME/.codex holds proxy settings (config.toml) and an API key
        # (auth.json). After _auth_headers runs, the proxy must be visible in
        # os.environ under both spellings and the key must be in the header.
        proxy_url = "http://proxy.example:3128"
        network_lines = [
            "[network]",
            'http_proxy = "http://proxy.example:3128"',
            'https_proxy = "http://proxy.example:3128"',
        ]
        with tempfile.TemporaryDirectory() as workdir:
            fake_home = Path(workdir)
            codex_home = fake_home / ".codex"
            codex_home.mkdir(parents=True)
            (codex_home / "config.toml").write_text(
                '\n'.join(network_lines),
                encoding="utf-8",
            )
            (codex_home / "auth.json").write_text(
                json.dumps({"OPENAI_API_KEY": "sk-codex-test"}),
                encoding="utf-8",
            )

            home_only_env = mock.patch.dict(os.environ, {"HOME": str(fake_home)}, clear=True)
            cwd_patch = mock.patch("pathlib.Path.cwd", return_value=fake_home)
            with home_only_env, cwd_patch:
                headers = _auth_headers("OPENAI_API_KEY", "codex")
                # Checked while the patches are still active, i.e. before the
                # environment patch is undone on exit.
                self.assertEqual(os.environ["http_proxy"], proxy_url)
                self.assertEqual(os.environ["HTTP_PROXY"], proxy_url)

            self.assertEqual(headers["Authorization"], "Bearer sk-codex-test")

    def test_prompt_includes_failed_trial_context(self) -> None:
        # A single failed trial in the history must show up in the prompt with its
        # status, failure reason and env patch, plus the launch-failures section.
        failed_trial = TrialSummary(
            trial_id="trial-0001",
            status="failed",
            diagnosis="flashinfer looked promising",
            config_patch={
                "env_patch": {"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
                "flag_patch": {"tensor-parallel-size": 4},
            },
            failure_reason="engine_process_exited_before_ready exit_code=1",
        )
        expected_fragments = (
            '"status": "failed"',
            '"failure_reason": "engine_process_exited_before_ready exit_code=1"',
            '"VLLM_ATTENTION_BACKEND": "FLASHINFER"',
            "Known launch failures:",
        )
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(Path(workdir))
            study = load_study_spec(spec_file)
            window, requests = load_trace_requests(study, study_spec_path=spec_file)
            rendered = build_prompt(
                study=study,
                window_summary=summarize_window(requests, window),
                state=StudyState(study_id=study.study_id, trials=[failed_trial]),
                capability_profile=None,
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, rendered)

    def test_prompt_includes_failure_stage_for_launch_failures(self) -> None:
        # A failed trial tagged with failure_stage="engine_launch" must surface
        # that stage and an "implicated_flag_keys" entry in the prompt.
        launch_failure = TrialSummary(
            trial_id="trial-0002",
            status="failed",
            diagnosis="bad topology",
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 3,
                    "data-parallel-size": 3,
                },
            },
            failure_stage="engine_launch",
            failure_reason="engine_process_exited_before_ready exit_code=1",
        )
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(Path(workdir))
            study = load_study_spec(spec_file)
            window, requests = load_trace_requests(study, study_spec_path=spec_file)
            rendered = build_prompt(
                study=study,
                window_summary=summarize_window(requests, window),
                state=StudyState(study_id=study.study_id, trials=[launch_failure]),
                capability_profile=None,
            )
            for fragment in ('"failure_stage": "engine_launch"', '"implicated_flag_keys"'):
                self.assertIn(fragment, rendered)

    def test_prompt_prioritizes_parallel_space_when_tp_dp_ep_are_tunable(self) -> None:
        # With TP, DP and EP all tunable under topology constraints, the prompt
        # must carry the parallel-space guidance and list candidates, including
        # one with tensor_parallel_size 2.
        engine_settings = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "enable-expert-parallel": True,
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "expert-parallel-size": 8,
            },
            "tunable_envs": [],
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "require_tp_dp_product_equals_gpu_count": True,
                "require_ep_size_leq_tp_dp_product": True,
                "require_ep_size_divides_tp_dp_product": True,
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2, 4, 8],
                "allowed_expert_parallel_sizes": [1, 2, 4, 8],
            },
        }
        expected_fragments = (
            "Prioritize exploring legal topology changes in parallel space",
            "Parallel space candidates:",
            '"tensor_parallel_size": 2',
        )
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(Path(workdir), engine_overrides=engine_settings)
            study = load_study_spec(spec_file)
            window, requests = load_trace_requests(study, study_spec_path=spec_file)
            rendered = build_prompt(
                study=study,
                window_summary=summarize_window(requests, window),
                state=StudyState(study_id=study.study_id),
                capability_profile=None,
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, rendered)

    def test_parse_proposal_text_repairs_truncated_json(self) -> None:
        # The payload below is missing its final closing brace; parsing is still
        # expected to recover the diagnosis and the flag patch.
        truncated_payload = """
                {
                  "observation": "obs",
                  "diagnosis": "diag",
                  "config_patch": {
                    "env_patch": {},
                    "flag_patch": {
                      "max-num-seqs": 24
                    }
                  },
                  "expected_effects": [
                    "faster batching"
                  ],
                  "why_not_previous_failures": "none"
                """
        with tempfile.TemporaryDirectory() as workdir:
            spec = load_study_spec(_write_study_assets(Path(workdir)))
            repaired = parse_proposal_text(truncated_payload, spec)
            self.assertEqual(repaired.diagnosis, "diag")
            patched_flags = repaired.config_patch.flag_patch
            self.assertEqual(patched_flags["max-num-seqs"], 24)

    def test_length_only_trace_rows_are_synthesized(self) -> None:
        # A trace row that carries only lengths (no messages) should still load as
        # one request: the prompt is synthesized up to the configured token cap
        # and min/max tokens are pinned to the row's output length.
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            windows_root = root / "trace_windows"
            traces_root = windows_root / "traces"
            traces_root.mkdir(parents=True)

            length_only_row = {
                "timestamp": 0.0,
                "sampling_u": 0.1,
                "input_length": 32,
                "output_length": 16,
            }
            (traces_root / "chat_len_only.jsonl").write_text(
                json.dumps(length_only_row) + "\n",
                encoding="utf-8",
            )

            windows_file = windows_root / "windows.json"
            windows_doc = {
                "windows": [
                    {
                        "window_id": "w1",
                        "trace_type": "chat",
                        "trace_file": "traces/chat_len_only.jsonl",
                        "window_start": 0.0,
                        "window_end": 10.0,
                    }
                ]
            }
            windows_file.write_text(json.dumps(windows_doc), encoding="utf-8")

            engine_section = {
                "engine_name": "vllm",
                "exec_path": "/usr/local/bin/vllm",
                "host": "127.0.0.1",
                "port": 8000,
                "ready_timeout_s": 10,
                "request_timeout_s": 10,
                "healthcheck_path": "/v1/models",
                "launch_args": [],
                "base_envs": {},
                "base_flags": {},
                "tunable_envs": [],
                "tunable_flags": [],
            }
            trace_section = {
                "windows_path": str(windows_file),
                "window_id": "w1",
                "max_concurrency": 1,
                "synthetic_prompt_cap_tokens": 8,
            }
            search_section = {
                "low": 0.0,
                "high": 1.0,
                "tolerance": 0.1,
                "max_probes": 2,
                "sample_seed": 1,
            }
            spec_doc = {
                "study_id": "study-len-only",
                "hardware": {"gpu_count": 1},
                "model": {
                    "model_id": "m1",
                    "served_model_name": "dummy-model",
                },
                "engine": engine_section,
                "trace": trace_section,
                "slo": {"target_pass_rate": 0.95},
                "search": search_section,
                "llm": {"system_prompt": "", "max_history_trials": 1},
            }
            spec_file = root / "study.json"
            spec_file.write_text(json.dumps(spec_doc), encoding="utf-8")

            spec = load_study_spec(spec_file)
            _, loaded = load_trace_requests(spec, study_spec_path=spec_file)
            self.assertEqual(len(loaded), 1)
            only_request = loaded[0]
            synthesized_text = only_request.body["messages"][0]["content"]
            self.assertEqual(synthesized_text.count("token"), 8)
            self.assertEqual(loaded[0].body["min_tokens"], 16)
            self.assertEqual(loaded[0].body["max_tokens"], 16)

    def test_slo_evaluation_step_and_fixed_rules(self) -> None:
        # Two successful requests evaluated against the default SLO: the first is
        # expected to pass and the second to fail, giving a 50% pass rate.
        passing_outcome = RequestOutcome(
            request_id="r1",
            success=True,
            ttft_ms=1000,
            tpot_ms=100,
            prompt_tokens=1000,
            completion_tokens=16,
        )
        failing_outcome = RequestOutcome(
            request_id="r2",
            success=True,
            ttft_ms=6000,
            tpot_ms=100,
            prompt_tokens=5000,
            completion_tokens=16,
        )
        with tempfile.TemporaryDirectory() as workdir:
            spec = load_study_spec(_write_study_assets(Path(workdir)))
            per_request, aggregate = summarize_evaluations(
                [passing_outcome, failing_outcome],
                spec.slo,
            )
            first_eval, second_eval = per_request[0], per_request[1]
            self.assertTrue(first_eval.passed)
            self.assertFalse(second_eval.passed)
            self.assertEqual(aggregate["slo_pass_rate"], 0.5)

    def test_trace_completion_tokens_override_forces_min_and_max_tokens(self) -> None:
        # completion_tokens_override=1 should rewrite every request's completion
        # hint and pin min_tokens/max_tokens in the request body.
        with tempfile.TemporaryDirectory() as workdir:
            spec_path = _write_study_assets(
                Path(workdir),
                trace_overrides={"completion_tokens_override": 1},
            )
            spec = load_study_spec(spec_path)
            _, loaded = load_trace_requests(spec, study_spec_path=spec_path)
            self.assertEqual(len(loaded), 3)
            for request in loaded[:3]:
                self.assertEqual(request.completion_tokens_hint, 1)
            # Only the first and last request bodies are inspected.
            for position in (0, 2):
                body = loaded[position].body
                self.assertEqual(body["min_tokens"], 1)
                self.assertEqual(body["max_tokens"], 1)

    def test_run_one_request_fails_fixed_length_completion_mismatch(self) -> None:
        # The request hints at 2 completion tokens but the stubbed stream reports
        # only 1, so the outcome is expected to be a failure with a mismatch error.
        short_stream = StreamMetrics(
            ttft_ms=10.0,
            tpot_ms=5.0,
            completion_tokens=1,
        )
        trace_request = TraceRequest(
            row_id="r1",
            arrival_s=0.0,
            sampling_u=0.1,
            body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
            prompt_tokens_hint=8,
            completion_tokens_hint=2,
        )

        stream_patch = mock.patch(
            "aituner.worker.stream_chat_completion",
            return_value=short_stream,
        )
        with stream_patch:
            result = _run_one_request(
                trace_request,
                base_url="http://127.0.0.1:8000",
                timeout_s=1.0,
            )

        self.assertFalse(result.success)
        self.assertEqual(result.error, "completion_tokens_mismatch expected=2 actual=1")
        self.assertEqual(result.completion_tokens, 1)

    def test_build_prompt_mentions_completion_tokens_override(self) -> None:
        # The prompt built for a study with completion_tokens_override=1 should
        # surface both the raw setting and its min_tokens=max_tokens consequence.
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            spec_path = _write_study_assets(
                root,
                trace_overrides={"completion_tokens_override": 1},
                slo_overrides={"tpot_rule": None},
            )
            spec = load_study_spec(spec_path)
            study_store = StudyStore(root / ".aituner")
            study_store.init_study(spec_path=spec_path, study=spec)
            current_state = study_store.load_state(spec.study_id)
            trace_window, loaded = load_trace_requests(spec, study_spec_path=spec_path)
            rendered = build_prompt(
                study=spec,
                window_summary=summarize_window(loaded, trace_window),
                state=current_state,
                capability_profile=None,
            )
            for fragment in (
                '"completion_tokens_override": 1',
                "min_tokens=max_tokens=1",
            ):
                self.assertIn(fragment, rendered)

    def test_slo_evaluation_supports_tpot_only_95_percent_target(self) -> None:
        # With the TTFT rule removed and a fixed 20 ms TPOT rule, only the request
        # with tpot_ms=10 should pass; a 50% pass rate is expected to be infeasible.
        slo_overrides = {
            "ttft_rule": None,
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
        }
        under_threshold = RequestOutcome(
            request_id="r1",
            success=True,
            ttft_ms=3000,
            tpot_ms=10,
            prompt_tokens=1000,
            completion_tokens=16,
        )
        over_threshold = RequestOutcome(
            request_id="r2",
            success=True,
            ttft_ms=9000,
            tpot_ms=21,
            prompt_tokens=5000,
            completion_tokens=16,
        )
        with tempfile.TemporaryDirectory() as workdir:
            spec_path = _write_study_assets(Path(workdir), slo_overrides=slo_overrides)
            spec = load_study_spec(spec_path)
            per_request, aggregate = summarize_evaluations(
                [under_threshold, over_threshold],
                spec.slo,
            )
            verdicts = [evaluation.passed for evaluation in per_request]
            self.assertEqual(verdicts, [True, False])
            self.assertEqual(aggregate["slo_pass_rate"], 0.5)
            self.assertFalse(aggregate["feasible"])

    def test_build_launch_recipe_serializes_list_flags_once(self) -> None:
        # A list-valued flag should appear exactly once in argv, immediately
        # followed by each element rendered as its own string argument.
        flag = "--cuda-graph-sizes"
        with tempfile.TemporaryDirectory() as workdir:
            spec = load_study_spec(_write_study_assets(Path(workdir)))
            patch = ConfigPatch(flag_patch={"cuda-graph-sizes": [1, 2, 4]})
            argv = build_launch_recipe(spec.engine, patch).argv
            self.assertIn(flag, argv)
            first_value = argv.index(flag) + 1
            self.assertEqual(argv[first_value : first_value + 3], ["1", "2", "4"])
            self.assertEqual(argv.count(flag), 1)

    def test_prepare_trace_windows_materializes_repo_local_assets(self) -> None:
        # Runs scripts/prepare_trace_windows.py end to end against synthetic legacy
        # sources and checks the emitted windows.json and trace file.
        #
        # The script is launched with the interpreter running this test
        # (sys.executable) rather than whatever "python3" resolves to on PATH, so
        # the subprocess uses the same Python/venv and also works where no
        # "python3" executable exists.
        import sys

        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            legacy_source = tmp_path / "legacy"
            thinking_source = tmp_path / "thinking"
            legacy_source.mkdir()
            thinking_source.mkdir()

            # Every expected source file exists but is empty; only the first
            # window's pair is given real content further down.
            for filename in [
                "qwen_chat_blksz_64_031109-031111",
                "qwen_chat_blksz_64_031121-031123",
                "qwen_chat_blksz_64_031209-031211",
                "qwen_chat_blksz_64_031221-031223",
                "qwen_chat_blksz_64_031309-031311",
                "qwen_chat_blksz_64_031321-031323",
                "qwen_chat_blksz_64_031409-031411",
                "qwen_chat_blksz_64_031421-031423",
                "qwen_chat_blksz_64_031509-031511",
                "qwen_chat_blksz_64_031521-031523",
                "qwen_chat_blksz_64_031609-031611",
                "qwen_chat_blksz_64_031621-031623",
                "qwen_chat_blksz_64_031709-031711",
                "qwen_chat_blksz_64_031721-031723",
            ]:
                for suffix in [".jsonl", "_prompt.jsonl"]:
                    path = legacy_source / f"{filename}{suffix}"
                    path.write_text("", encoding="utf-8")

            peak_trace = legacy_source / "qwen_chat_blksz_64_031109-031111.jsonl"
            peak_prompt = legacy_source / "qwen_chat_blksz_64_031109-031111_prompt.jsonl"
            # Two rows: the assertions below expect only the second (timestamp
            # 3605.0) to survive, rebased to timestamp 5.0.
            peak_trace.write_text(
                "\n".join(
                    [
                        json.dumps(
                            {
                                "chat_id": "c1",
                                "turn": 1,
                                "timestamp": 3599.0,
                                "input_length": 10,
                                "output_length": 3,
                            }
                        ),
                        json.dumps(
                            {
                                "chat_id": "c2",
                                "turn": 2,
                                "timestamp": 3605.0,
                                "input_length": 20,
                                "output_length": 7,
                            }
                        ),
                    ]
                )
                + "\n",
                encoding="utf-8",
            )
            peak_prompt.write_text(
                "\n".join(
                    [
                        json.dumps({"chat_id": "c1", "turn": 1, "prompt": "ignore me"}),
                        json.dumps({"chat_id": "c2", "turn": 2, "prompt": "real prompt"}),
                    ]
                )
                + "\n",
                encoding="utf-8",
            )

            output_root = tmp_path / "trace_windows"
            subprocess.run(
                [
                    sys.executable,
                    "scripts/prepare_trace_windows.py",
                    "--legacy-source",
                    str(legacy_source),
                    "--thinking-source",
                    str(thinking_source),
                    "--output-root",
                    str(output_root),
                    "--workloads",
                    "chat",
                    "--overwrite",
                ],
                check=True,
                cwd=str(REPO_ROOT),
            )

            windows_payload = json.loads((output_root / "windows.json").read_text(encoding="utf-8"))
            windows = {item["window_id"]: item for item in windows_payload["windows"]}
            self.assertIn("chat_w20260311_1000", windows)
            self.assertEqual(windows["chat_w20260311_1000"]["num_requests"], 1)

            trace_path = output_root / windows["chat_w20260311_1000"]["trace_file"]
            rows = [json.loads(line) for line in trace_path.read_text(encoding="utf-8").splitlines()]
            self.assertEqual(len(rows), 1)
            self.assertEqual(rows[0]["prompt"], "real prompt")
            self.assertEqual(rows[0]["timestamp"], 5.0)
            self.assertEqual(rows[0]["output_length"], 7)
            self.assertIsInstance(rows[0]["sampling_u"], float)

    def test_prepare_trace_windows_preserves_existing_files_on_failure(self) -> None:
        # When the preparation script exits non-zero, a pre-existing trace file
        # must be left untouched and no "*.tmp.*" leftovers may remain.
        #
        # The script is launched via sys.executable instead of a PATH lookup of
        # "python3": a missing "python3" would raise FileNotFoundError before the
        # returncode assertion is ever reached, and a different interpreter could
        # fail for reasons unrelated to the behaviour under test.
        import sys

        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            legacy_source = tmp_path / "legacy"
            thinking_source = tmp_path / "thinking"
            output_root = tmp_path / "trace_windows"
            traces_dir = output_root / "traces"
            legacy_source.mkdir()
            thinking_source.mkdir()
            traces_dir.mkdir(parents=True)

            # Only two source windows are provided here.
            # NOTE(review): presumably the absent remaining sources are what
            # makes the script fail - confirm against the script.
            for filename in [
                "qwen_chat_blksz_64_031109-031111",
                "qwen_chat_blksz_64_031121-031123",
            ]:
                for suffix in [".jsonl", "_prompt.jsonl"]:
                    path = legacy_source / f"{filename}{suffix}"
                    path.write_text(
                        json.dumps(
                            {
                                "chat_id": "c1",
                                "turn": 1,
                                "timestamp": 3605.0,
                                "input_length": 20,
                                "output_length": 7,
                                "prompt": "prompt",
                            }
                        )
                        + "\n",
                        encoding="utf-8",
                    )

            # Pre-existing output that a failed run must not overwrite.
            sentinel = traces_dir / "chat_w20260311_1000.jsonl"
            sentinel.write_text("sentinel\n", encoding="utf-8")

            proc = subprocess.run(
                [
                    sys.executable,
                    "scripts/prepare_trace_windows.py",
                    "--legacy-source",
                    str(legacy_source),
                    "--thinking-source",
                    str(thinking_source),
                    "--output-root",
                    str(output_root),
                    "--workloads",
                    "chat",
                    "--overwrite",
                ],
                cwd=str(REPO_ROOT),
                capture_output=True,
                text=True,
            )

            self.assertNotEqual(proc.returncode, 0)
            self.assertEqual(sentinel.read_text(encoding="utf-8"), "sentinel\n")
            self.assertEqual(sorted(path.name for path in traces_dir.glob("*.tmp.*")), [])

    def test_binary_search_max_feasible(self) -> None:
        # Thresholds up to 0.625 are feasible; the search should settle on a best
        # threshold inside [0.5, 0.625] and keep the matching payload.
        feasibility_limit = 0.625

        def probe(threshold):
            return ThresholdProbe(
                threshold=threshold,
                feasible=threshold <= feasibility_limit,
                payload={"threshold": threshold},
            )

        outcome = binary_search_max_feasible(
            low=0.0,
            high=1.0,
            tolerance=0.01,
            max_probes=8,
            evaluator=probe,
        )
        self.assertLessEqual(outcome.best_threshold, 0.625)
        self.assertGreaterEqual(outcome.best_threshold, 0.5)
        self.assertIsNotNone(outcome.best_feasible_payload)

    def test_binary_search_continues_below_tolerance_when_all_infeasible(self) -> None:
        # With every probe infeasible, the search is expected to keep halving the
        # threshold past the 0.1 tolerance until all 6 probes are spent.
        probed_thresholds = []

        def always_infeasible(threshold):
            probed_thresholds.append(threshold)
            return ThresholdProbe(
                threshold=threshold,
                feasible=False,
                payload={"threshold": threshold},
            )

        search_kwargs = {
            "low": 0.0,
            "high": 1.0,
            "tolerance": 0.1,
            "max_probes": 6,
            "evaluator": always_infeasible,
        }
        outcome = binary_search_max_feasible(**search_kwargs)

        self.assertIsNone(outcome.best_feasible_payload)
        self.assertEqual(len(outcome.probes), 6)
        expected_sequence = [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625]
        self.assertEqual(probed_thresholds, expected_sequence)

    def test_trace_max_requests_uses_window_wide_downsample(self) -> None:
        # Ten rows capped at max_requests_per_probe=4 should be thinned across the
        # whole window (r0, r2, r5, r7) rather than truncated to the first four.
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            windows_root = root / "trace_windows"
            traces_root = windows_root / "traces"
            traces_root.mkdir(parents=True)

            trace_lines = [
                json.dumps(
                    {
                        "request_id": f"r{row}",
                        "timestamp": float(row),
                        "sampling_u": row / 10.0,
                        "messages": [{"role": "user", "content": f"hello-{row}"}],
                        "input_length": 10 + row,
                        "output_length": 5,
                    }
                )
                + "\n"
                for row in range(10)
            ]
            (traces_root / "chat_many.jsonl").write_text(
                "".join(trace_lines),
                encoding="utf-8",
            )

            windows_file = windows_root / "windows.json"
            windows_doc = {
                "windows": [
                    {
                        "window_id": "w1",
                        "trace_type": "chat",
                        "trace_file": "traces/chat_many.jsonl",
                        "window_start": 0.0,
                        "window_end": 10.0,
                    }
                ]
            }
            windows_file.write_text(json.dumps(windows_doc), encoding="utf-8")

            engine_section = {
                "engine_name": "vllm",
                "exec_path": "/usr/local/bin/vllm",
                "host": "127.0.0.1",
                "port": 8000,
                "ready_timeout_s": 10,
                "request_timeout_s": 10,
                "healthcheck_path": "/v1/models",
                "launch_args": [],
                "base_envs": {},
                "base_flags": {},
                "tunable_envs": [],
                "tunable_flags": [],
            }
            trace_section = {
                "windows_path": str(windows_file),
                "window_id": "w1",
                "max_concurrency": 1,
                "max_requests_per_probe": 4,
            }
            search_section = {
                "low": 0.0,
                "high": 1.0,
                "tolerance": 0.1,
                "max_probes": 2,
                "sample_seed": 1,
            }
            spec_doc = {
                "study_id": "study-downsample",
                "hardware": {"gpu_count": 1},
                "model": {"model_id": "m1", "served_model_name": "dummy-model"},
                "engine": engine_section,
                "trace": trace_section,
                "slo": {"target_pass_rate": 0.95},
                "search": search_section,
                "llm": {"system_prompt": "", "max_history_trials": 1},
            }
            spec_file = root / "study.json"
            spec_file.write_text(json.dumps(spec_doc), encoding="utf-8")

            spec = load_study_spec(spec_file)
            _, loaded = load_trace_requests(spec, study_spec_path=spec_file)
            kept_ids = [request.row_id for request in loaded]
            self.assertEqual(kept_ids, ["r0", "r2", "r5", "r7"])

    def test_trace_replay_time_scale_scales_arrivals_and_window(self) -> None:
        # replay_time_scale=0.1 should shrink both the window end (100.0 -> 10.0)
        # and the request arrival time (10.0 -> 1.0).
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            windows_root = root / "trace_windows"
            traces_root = windows_root / "traces"
            traces_root.mkdir(parents=True)

            single_row = {
                "request_id": "r1",
                "timestamp": 10.0,
                "sampling_u": 0.25,
                "messages": [{"role": "user", "content": "hello"}],
                "input_length": 16,
                "output_length": 4,
            }
            (traces_root / "chat_scale.jsonl").write_text(
                json.dumps(single_row) + "\n",
                encoding="utf-8",
            )

            windows_file = windows_root / "windows.json"
            windows_doc = {
                "windows": [
                    {
                        "window_id": "w1",
                        "trace_type": "chat",
                        "trace_file": "traces/chat_scale.jsonl",
                        "window_start": 0.0,
                        "window_end": 100.0,
                    }
                ]
            }
            windows_file.write_text(json.dumps(windows_doc), encoding="utf-8")

            engine_section = {
                "engine_name": "vllm",
                "exec_path": "/usr/local/bin/vllm",
                "host": "127.0.0.1",
                "port": 8000,
                "ready_timeout_s": 10,
                "request_timeout_s": 10,
                "healthcheck_path": "/v1/models",
                "launch_args": [],
                "base_envs": {},
                "base_flags": {},
                "tunable_envs": [],
                "tunable_flags": [],
            }
            trace_section = {
                "windows_path": str(windows_file),
                "window_id": "w1",
                "max_concurrency": 1,
                "replay_time_scale": 0.1,
            }
            search_section = {
                "low": 0.0,
                "high": 1.0,
                "tolerance": 0.1,
                "max_probes": 2,
                "sample_seed": 1,
            }
            spec_doc = {
                "study_id": "study-scale",
                "hardware": {"gpu_count": 1},
                "model": {"model_id": "m1", "served_model_name": "dummy-model"},
                "engine": engine_section,
                "trace": trace_section,
                "slo": {"target_pass_rate": 0.95},
                "search": search_section,
                "llm": {"system_prompt": "", "max_history_trials": 1},
            }
            spec_file = root / "study.json"
            spec_file.write_text(json.dumps(spec_doc), encoding="utf-8")

            spec = load_study_spec(spec_file)
            scaled_window, loaded = load_trace_requests(spec, study_spec_path=spec_file)
            self.assertEqual(scaled_window.window_end, 10.0)
            first_request = loaded[0]
            self.assertEqual(first_request.arrival_s, 1.0)

    def test_proposal_validation_and_job_emission(self) -> None:
        # Parses a proposal, materializes it as a trial, appends the resulting job
        # to jobs.toml and checks the rendered job entry.
        proposal_doc = {
            "observation": "Current TTFT fails before TPOT.",
            "diagnosis": "Prefill pressure dominates.",
            "config_patch": {
                "env_patch": {"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
                "flag_patch": {"tensor-parallel-size": 4, "max-num-seqs": 64},
            },
            "expected_effects": ["lower TTFT", "raise feasible sampling_u"],
            "why_not_previous_failures": "Avoids changing unsupported envs.",
        }
        expected_fragments = (
            'name = "study-1-trial-0001"',
            'command = "python3 -m aituner.cli worker run-trial',
            'PYTHONPATH = "src"',
        )
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            spec_path = _write_study_assets(root)
            spec = load_study_spec(spec_path)
            study_store = StudyStore(root / ".aituner" / "studies")
            study_store.init_study(spec_path=spec_path, study=spec)
            current_state = study_store.load_state(spec.study_id)

            parsed = parse_proposal_text(json.dumps(proposal_doc), spec)
            materialized, _ = study_store.materialize_trial(
                study=spec,
                state=current_state,
                proposal=parsed,
            )

            trial_job = build_trial_job(study=spec, trial=materialized, repo_root=root)
            jobs_file = root / "jobs.toml"
            append_job(jobs_file, trial_job)
            jobs_text = jobs_file.read_text(encoding="utf-8")
            for fragment in expected_fragments:
                self.assertIn(fragment, jobs_text)

    def test_ingest_trial_results_updates_best(self) -> None:
        # After a completed result file is written for a TP=4 trial, ingestion
        # should record it as the best trial, including the per-GPU request rate
        # (12.5 / 4 = 3.125).
        proposal_doc = {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
            "expected_effects": ["raise rate"],
        }
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            spec_path = _write_study_assets(root)
            spec = load_study_spec(spec_path)
            study_store = StudyStore(root / ".aituner" / "studies")
            study_store.init_study(spec_path=spec_path, study=spec)
            current_state = study_store.load_state(spec.study_id)
            materialized, _ = study_store.materialize_trial(
                study=spec,
                state=current_state,
                proposal=Proposal.from_dict(proposal_doc),
            )

            result_doc = {
                "study_id": spec.study_id,
                "trial_id": materialized.trial_id,
                "status": "completed",
                "best_sampling_u": 0.75,
                "best_request_rate": 12.5,
                "best_pass_rate": 0.97,
            }
            result_file = Path(materialized.result_path)
            result_file.write_text(json.dumps(result_doc), encoding="utf-8")

            updated = study_store.ingest_trial_results(spec.study_id)
            self.assertEqual(updated.best_trial_id, materialized.trial_id)
            self.assertEqual(updated.best_sampling_u, 0.75)
            self.assertEqual(updated.best_request_rate, 12.5)
            self.assertEqual(updated.best_parallel_size, 4)
            self.assertEqual(updated.best_request_rate_per_gpu, 3.125)
            per_size_entry = updated.best_by_parallel_size["4"]
            self.assertEqual(per_size_entry["best_request_rate_per_gpu"], 3.125)

    def test_run_trial_persists_probe_request_details(self) -> None:
        # Runs a single-probe trial with the engine process, readiness wait,
        # teardown and replay all stubbed, then checks that probe_details.jsonl
        # holds one row with per-request outcomes.
        baseline_doc = {
            "observation": "baseline",
            "diagnosis": "baseline",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": ["measure"],
        }

        def fake_replay(requests, **kwargs):
            # Every replayed request succeeds with fixed latencies.
            outcomes = []
            for request in requests:
                outcomes.append(
                    RequestOutcome(
                        request_id=request.row_id,
                        success=True,
                        ttft_ms=10.0,
                        tpot_ms=5.0,
                        prompt_tokens=request.prompt_tokens_hint,
                        completion_tokens=request.completion_tokens_hint,
                    )
                )
            return (outcomes, False, "")

        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            spec_path = _write_study_assets(root)
            # Limit the search to one probe so exactly one details row is written.
            spec_doc = json.loads(spec_path.read_text(encoding="utf-8"))
            spec_doc["search"]["max_probes"] = 1
            spec_path.write_text(json.dumps(spec_doc), encoding="utf-8")
            spec = load_study_spec(spec_path)
            study_store = StudyStore(root / ".aituner" / "studies")
            study_store.init_study(spec_path=spec_path, study=spec)
            current_state = study_store.load_state(spec.study_id)
            materialized, _ = study_store.materialize_trial(
                study=spec,
                state=current_state,
                proposal=Proposal.from_dict(baseline_doc),
            )
            artifact_root = Path(materialized.artifact_dir)

            engine_process = mock.Mock()
            engine_process.poll.return_value = 0
            with contextlib.ExitStack() as patches:
                patches.enter_context(
                    mock.patch("aituner.worker.subprocess.Popen", return_value=engine_process)
                )
                patches.enter_context(
                    mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None)
                )
                patches.enter_context(
                    mock.patch("aituner.worker._terminate_process_tree", return_value=None)
                )
                patches.enter_context(
                    mock.patch("aituner.worker._replay_requests", side_effect=fake_replay)
                )
                trial_result = run_trial(artifact_root / "trial_spec.json")

            self.assertEqual(trial_result["status"], "completed")
            details_file = artifact_root / "probe_details.jsonl"
            self.assertTrue(details_file.exists())
            detail_rows = []
            for line in details_file.read_text(encoding="utf-8").splitlines():
                detail_rows.append(json.loads(line))
            self.assertEqual(len(detail_rows), 1)
            only_row = detail_rows[0]
            self.assertEqual(only_row["threshold"], 0.5)
            first_outcome = only_row["outcomes"][0]
            self.assertEqual(first_outcome["request_id"], "r1")
            self.assertEqual(only_row["outcomes"][0]["sampling_u"], 0.1)

    def test_run_trial_marks_full_trace_saturation_as_measurement_ceiling_insufficient(
        self,
    ) -> None:
        """run_trial reports ``measurement_ceiling_insufficient`` when every replay passes.

        The engine process, the server wait and the process teardown are mocked;
        the replay stub marks each request successful with a 10 ms TTFT and a
        5 ms TPOT.
        """
        baseline_payload = {
            "observation": "baseline",
            "diagnosis": "baseline",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": ["measure"],
        }

        def fake_replay(requests, **kwargs):
            # Stand-in for aituner.worker._replay_requests: one successful
            # outcome per request, followed by the two trailing tuple fields.
            outcomes = []
            for item in requests:
                outcomes.append(
                    RequestOutcome(
                        request_id=item.row_id,
                        success=True,
                        ttft_ms=10.0,
                        tpot_ms=5.0,
                        prompt_tokens=item.prompt_tokens_hint,
                        completion_tokens=item.completion_tokens_hint,
                    )
                )
            return outcomes, False, ""

        engine_proc = mock.Mock()
        engine_proc.poll.return_value = 0

        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            trial, _ = store.materialize_trial(
                study=study,
                state=store.load_state(study.study_id),
                proposal=Proposal.from_dict(baseline_payload),
            )

            # Patches are entered in the same order as the original nested
            # ``with`` statements and unwound in reverse when the stack exits.
            with contextlib.ExitStack() as patches:
                patches.enter_context(
                    mock.patch("aituner.worker.subprocess.Popen", return_value=engine_proc)
                )
                patches.enter_context(
                    mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None)
                )
                patches.enter_context(
                    mock.patch("aituner.worker._terminate_process_tree", return_value=None)
                )
                patches.enter_context(
                    mock.patch("aituner.worker._replay_requests", side_effect=fake_replay)
                )
                result = run_trial(Path(trial.artifact_dir) / "trial_spec.json")

            self.assertEqual(result["status"], "completed")
            self.assertEqual(result["best_request_count"], 3)
            measurement = result["measurement"]
            self.assertTrue(measurement["measurement_ceiling_insufficient"])
            self.assertEqual(measurement["reason"], "measurement_ceiling_insufficient")
            self.assertIn("auto_high_resolution", measurement)

    def test_run_trial_falls_back_below_inherited_search_floor(self) -> None:
        """run_trial probes below ``search.low`` when the primary range has no passing probe.

        The trial spec on disk is edited so its search starts at 0.5, and the
        replay stub only returns fast latencies for replays of at most one
        request. The result must come from the ``lower_range_fallback`` section.
        """
        incumbent = {
            "trial_id": "trial-0001",
            "parallel_size": 1,
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_request_rate_per_gpu": 2.0,
        }

        def fake_replay(requests, **kwargs):
            # Replays of at most one request get fast latencies; larger replays
            # get very slow ones.
            if len(requests) <= 1:
                ttft_ms, tpot_ms = 10.0, 5.0
            else:
                ttft_ms, tpot_ms = 10000.0, 1000.0
            outcomes = [
                RequestOutcome(
                    request_id=item.row_id,
                    success=True,
                    ttft_ms=ttft_ms,
                    tpot_ms=tpot_ms,
                    prompt_tokens=item.prompt_tokens_hint,
                    completion_tokens=item.completion_tokens_hint,
                )
                for item in requests
            ]
            return outcomes, False, ""

        engine_proc = mock.Mock()
        engine_proc.poll.return_value = 0
        patch_table = (
            ("aituner.worker.subprocess.Popen", {"return_value": engine_proc}),
            ("aituner.worker._wait_for_server_or_exit", {"return_value": None}),
            ("aituner.worker._terminate_process_tree", {"return_value": None}),
            ("aituner.worker._replay_requests", {"side_effect": fake_replay}),
        )

        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)

            # Rewrite the study spec with search.max_probes set to 2 before loading it.
            study_doc = json.loads(spec_file.read_text(encoding="utf-8"))
            study_doc["search"]["max_probes"] = 2
            spec_file.write_text(json.dumps(study_doc), encoding="utf-8")

            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            prior_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=1,
                best_sampling_u=0.5,
                best_request_rate=2.0,
                best_request_rate_per_gpu=2.0,
                next_trial_index=2,
                best_by_parallel_size={"1": incumbent},
                trials=[],
            )
            runtime_proposal = Proposal.from_dict(
                {
                    "observation": "runtime patch",
                    "diagnosis": "measure even if worse than incumbent",
                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 2}},
                    "expected_effects": ["measure"],
                }
            )
            trial, _ = store.materialize_trial(
                study=study, state=prior_state, proposal=runtime_proposal
            )
            self.assertEqual(trial.search.low, 0.0)

            # Raise the search floor directly in the materialized trial spec.
            trial_spec_path = Path(trial.artifact_dir) / "trial_spec.json"
            trial_doc = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            trial_doc["search"]["low"] = 0.5
            trial_spec_path.write_text(json.dumps(trial_doc), encoding="utf-8")

            with contextlib.ExitStack() as patches:
                for target, options in patch_table:
                    patches.enter_context(mock.patch(target, **options))
                result = run_trial(trial_spec_path)

            self.assertEqual(result["status"], "completed")
            self.assertEqual(result["best_source"], "lower_range_fallback")
            self.assertEqual(result["best_sampling_u"], 0.375)
            self.assertEqual(result["best_request_rate"], 0.1)

            primary = result["primary_search"]
            self.assertEqual(primary["low"], 0.5)
            self.assertIsNone(primary["best_request_rate"])

            fallback = result["lower_range_fallback"]
            self.assertEqual(fallback["low"], 0.0)
            self.assertEqual(fallback["high"], 0.5)
            self.assertEqual(fallback["best_request_rate"], 0.1)

            primary_thresholds = [probe["threshold"] for probe in primary["probes"]]
            self.assertEqual(primary_thresholds, [0.75, 0.625])
            fallback_thresholds = [probe["threshold"] for probe in fallback["probes"]]
            self.assertEqual(fallback_thresholds, [0.25, 0.375])

    def test_run_trial_skips_fallback_below_incumbent_floor(self) -> None:
        """run_trial does not probe below the floor when ``inherit_incumbent_floor`` is set.

        The study enables ``search.inherit_incumbent_floor`` and the replay stub
        always returns very slow latencies. The result must report the
        lower-range fallback as skipped, with no probes.
        """
        incumbent = {
            "trial_id": "trial-0001",
            "parallel_size": 1,
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_request_rate_per_gpu": 2.0,
        }

        def fake_replay(requests, **kwargs):
            # Every request "succeeds" but with a 10 s TTFT and a 1 s TPOT.
            slow_outcomes = []
            for item in requests:
                slow_outcomes.append(
                    RequestOutcome(
                        request_id=item.row_id,
                        success=True,
                        ttft_ms=10000.0,
                        tpot_ms=1000.0,
                        prompt_tokens=item.prompt_tokens_hint,
                        completion_tokens=item.completion_tokens_hint,
                    )
                )
            return slow_outcomes, False, ""

        engine_proc = mock.Mock()
        engine_proc.poll.return_value = 0

        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)

            # Rewrite the study spec: two probes and an inherited incumbent floor.
            study_doc = json.loads(spec_file.read_text(encoding="utf-8"))
            study_doc["search"]["max_probes"] = 2
            study_doc["search"]["inherit_incumbent_floor"] = True
            spec_file.write_text(json.dumps(study_doc), encoding="utf-8")

            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            prior_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=1,
                best_sampling_u=0.5,
                best_request_rate=2.0,
                best_request_rate_per_gpu=2.0,
                next_trial_index=2,
                best_by_parallel_size={"1": incumbent},
                trials=[],
            )
            runtime_proposal = Proposal.from_dict(
                {
                    "observation": "runtime patch",
                    "diagnosis": "primary range all infeasible",
                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 2}},
                    "expected_effects": ["measure"],
                }
            )
            trial, _ = store.materialize_trial(
                study=study, state=prior_state, proposal=runtime_proposal
            )
            self.assertEqual(trial.search.low, 0.5)
            self.assertTrue(trial.search.inherit_incumbent_floor)

            with mock.patch(
                "aituner.worker.subprocess.Popen", return_value=engine_proc
            ), mock.patch(
                "aituner.worker._wait_for_server_or_exit", return_value=None
            ), mock.patch(
                "aituner.worker._terminate_process_tree", return_value=None
            ), mock.patch(
                "aituner.worker._replay_requests", side_effect=fake_replay
            ):
                result = run_trial(Path(trial.artifact_dir) / "trial_spec.json")

            self.assertEqual(result["status"], "completed")
            self.assertIsNone(result["best_request_rate"])
            self.assertEqual(result["best_source"], "primary_search")

            primary = result["primary_search"]
            self.assertEqual(primary["low"], 0.5)
            self.assertIsNone(primary["best_request_rate"])
            primary_thresholds = [probe["threshold"] for probe in primary["probes"]]
            self.assertEqual(primary_thresholds, [0.75, 0.625])

            fallback = result["lower_range_fallback"]
            self.assertEqual(fallback["triggered"], False)
            self.assertEqual(fallback["skipped"], True)
            self.assertEqual(fallback["probes"], [])
            self.assertEqual(
                fallback["reason"],
                "primary_search_above_incumbent_floor_all_infeasible",
            )
            self.assertEqual(result["all_infeasible_diagnostics"]["threshold"], 0.625)

    def test_materialize_trial_does_not_mutate_input_state_trials(self) -> None:
        """The returned state holds one trial while the input state's trials stay empty."""
        baseline_payload = {
            "observation": "baseline",
            "diagnosis": "baseline",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": ["measure"],
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            input_state = store.load_state(study.study_id)

            _, returned_state = store.materialize_trial(
                study=study,
                state=input_state,
                proposal=Proposal.from_dict(baseline_payload),
            )

            # The state passed in must still be empty; only the returned one grows.
            self.assertEqual(input_state.trials, [])
            self.assertEqual(len(returned_state.trials), 1)

    def test_materialize_trial_uses_full_search_range_with_incumbent(self) -> None:
        """An incumbent at parallel size 4 does not narrow a new TP=4 trial's range.

        The state records a best ``sampling_u`` of 0.375; the materialized trial
        must still search from ``study.search.low`` up to 1.0.
        """
        incumbent = {
            "trial_id": "trial-0001",
            "parallel_size": 4,
            "best_sampling_u": 0.375,
            "best_request_rate": 3.0,
            "best_request_rate_per_gpu": 0.75,
        }
        tp4_payload = {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
            "expected_effects": ["raise rate"],
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            prior_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=4,
                best_sampling_u=0.375,
                best_request_rate=3.0,
                best_request_rate_per_gpu=0.75,
                next_trial_index=2,
                best_by_parallel_size={"4": incumbent},
                trials=[],
            )
            trial, _ = store.materialize_trial(
                study=study,
                state=prior_state,
                proposal=Proposal.from_dict(tp4_payload),
            )

            search = trial.search
            self.assertEqual(search.low, study.search.low)
            self.assertEqual(search.high, 1.0)

    def test_materialize_trial_uses_full_search_range_for_same_parallel_group(self) -> None:
        """A TP=2 trial starts at ``study.search.low`` despite a recorded size-2 best.

        The overall best in the state is at parallel size 4, while
        ``best_by_parallel_size`` only has an entry for size 2.
        """
        size2_best = {
            "trial_id": "trial-0000",
            "parallel_size": 2,
            "best_sampling_u": 0.125,
            "best_request_rate": 0.8,
            "best_request_rate_per_gpu": 0.4,
        }
        tp2_payload = {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
            "expected_effects": ["raise rate"],
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            prior_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=4,
                best_sampling_u=0.375,
                best_request_rate=3.0,
                best_request_rate_per_gpu=0.75,
                next_trial_index=2,
                best_by_parallel_size={"2": size2_best},
                trials=[],
            )
            trial, _ = store.materialize_trial(
                study=study,
                state=prior_state,
                proposal=Proposal.from_dict(tp2_payload),
            )

            self.assertEqual(trial.search.low, study.search.low)

    def test_materialize_trial_can_use_incumbent_floor_when_enabled(self) -> None:
        """With ``search.inherit_incumbent_floor`` set, the trial starts at the incumbent.

        The incumbent at parallel size 4 has ``best_sampling_u`` 0.375, which
        must become ``trial.search.low`` for a new TP=4 proposal.
        """
        incumbent = {
            "trial_id": "trial-0001",
            "parallel_size": 4,
            "best_sampling_u": 0.375,
            "best_request_rate": 3.0,
            "best_request_rate_per_gpu": 0.75,
        }
        tp4_payload = {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
            "expected_effects": ["raise rate"],
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)

            # Turn the incumbent floor on in the study spec before loading it.
            study_doc = json.loads(spec_file.read_text(encoding="utf-8"))
            study_doc["search"]["inherit_incumbent_floor"] = True
            spec_file.write_text(json.dumps(study_doc), encoding="utf-8")

            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            prior_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=4,
                best_sampling_u=0.375,
                best_request_rate=3.0,
                best_request_rate_per_gpu=0.75,
                next_trial_index=2,
                best_by_parallel_size={"4": incumbent},
                trials=[],
            )
            trial, _ = store.materialize_trial(
                study=study,
                state=prior_state,
                proposal=Proposal.from_dict(tp4_payload),
            )

            search = trial.search
            self.assertEqual(search.low, 0.375)
            self.assertTrue(search.inherit_incumbent_floor)

    def test_materialize_trial_resets_search_floor_for_new_parallel_group(self) -> None:
        """A TP=2 proposal starts at ``study.search.low`` when the best is at size 4.

        The state carries no ``best_by_parallel_size`` entries at all.
        """
        tp2_payload = {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
            "expected_effects": ["raise rate"],
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            prior_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=4,
                best_sampling_u=0.4,
                best_request_rate=3.0,
                best_request_rate_per_gpu=0.75,
                next_trial_index=2,
                trials=[],
            )
            trial, _ = store.materialize_trial(
                study=study,
                state=prior_state,
                proposal=Proposal.from_dict(tp2_payload),
            )

            self.assertEqual(trial.search.low, study.search.low)

    def test_materialize_trial_inherits_incumbent_topology_for_runtime_patch(self) -> None:
        """A runtime-only proposal picks up the best trial's TP/DP flags.

        The proposal only sets ``max-num-seqs``; the best trial in the state used
        TP=2, DP=4, EP=8. The materialized patch (and the copy recorded in the
        returned state) must contain the TP and DP values plus ``max-num-seqs``,
        and nothing else. ``expert-parallel-size`` is not expected.
        """
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "enable-expert-parallel": True,
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "expert-parallel-size": 8,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "require_tp_dp_product_equals_gpu_count": True,
                "require_ep_size_leq_tp_dp_product": True,
                "require_ep_size_divides_tp_dp_product": True,
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2, 4, 8],
                "allowed_expert_parallel_sizes": [1, 2, 4, 8],
            },
        }
        expected_flags = {
            "tensor-parallel-size": 2,
            "data-parallel-size": 4,
            "max-num-seqs": 160,
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root, engine_overrides=engine_overrides)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)

            incumbent_summary = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=8,
                best_sampling_u=0.125,
                best_request_rate=3.0,
                best_request_rate_per_gpu=0.375,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "data-parallel-size": 4,
                        "expert-parallel-size": 8,
                    },
                },
            )
            prior_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_sampling_u=0.125,
                best_request_rate=3.0,
                best_request_rate_per_gpu=0.375,
                next_trial_index=3,
                best_by_parallel_size={
                    "8": {
                        "trial_id": "trial-0002",
                        "parallel_size": 8,
                        "best_sampling_u": 0.125,
                        "best_request_rate": 3.0,
                        "best_request_rate_per_gpu": 0.375,
                    }
                },
                trials=[incumbent_summary],
            )
            runtime_proposal = Proposal.from_dict(
                {
                    "observation": "Validate runtime headroom around the incumbent.",
                    "diagnosis": "Try lower concurrency on the current best topology.",
                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}},
                    "expected_effects": ["validate incumbent runtime headroom"],
                }
            )

            trial, next_state = store.materialize_trial(
                study=study, state=prior_state, proposal=runtime_proposal
            )

            self.assertEqual(trial.config_patch.flag_patch, expected_flags)
            self.assertEqual(trial.search.low, study.search.low)
            recorded_flags = next_state.trials[-1].config_patch["flag_patch"]
            self.assertEqual(recorded_flags, expected_flags)

    def test_materialize_trial_keeps_explicit_topology_runtime_patch(self) -> None:
        """A proposal that names TP/DP explicitly keeps them over the best trial's values.

        The best trial in the state used TP=2, DP=4, while the proposal asks for
        TP=4, DP=2 plus ``max-num-seqs``; the materialized patch must equal the
        proposal's flags.
        """
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "enable-expert-parallel": True,
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "expert-parallel-size": 8,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "require_tp_dp_product_equals_gpu_count": True,
                "require_ep_size_leq_tp_dp_product": True,
                "require_ep_size_divides_tp_dp_product": True,
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2, 4, 8],
                "allowed_expert_parallel_sizes": [1, 2, 4, 8],
            },
        }
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root, engine_overrides=engine_overrides)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)

            earlier_trial = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "data-parallel-size": 4,
                    },
                },
            )
            prior_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                next_trial_index=3,
                trials=[earlier_trial],
            )
            explicit_proposal = Proposal.from_dict(
                {
                    "observation": "Validate base topology runtime.",
                    "diagnosis": "Explicitly keep base topology and adjust concurrency.",
                    "config_patch": {
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 4,
                            "data-parallel-size": 2,
                            "max-num-seqs": 160,
                        },
                    },
                    "expected_effects": ["test base topology runtime headroom"],
                }
            )

            trial, _ = store.materialize_trial(
                study=study, state=prior_state, proposal=explicit_proposal
            )

            expected_flags = {
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "max-num-seqs": 160,
            }
            self.assertEqual(trial.config_patch.flag_patch, expected_flags)

    def test_ingest_trial_results_records_failure_reason(self) -> None:
        """Ingesting a failed result copies ``failure_reason`` onto the trial summary.

        The result file has no ``failure_stage`` key, and the ingested summary
        must then carry an empty string for it.
        """
        failure_text = "engine_process_exited_before_ready exit_code=1"
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            initial_state = store.load_state(study.study_id)
            tp4_proposal = Proposal.from_dict(
                {
                    "observation": "Obs",
                    "diagnosis": "Diag",
                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
                    "expected_effects": ["raise rate"],
                }
            )
            trial, _ = store.materialize_trial(
                study=study, state=initial_state, proposal=tp4_proposal
            )

            # Write the trial's result file by hand with a failed status.
            failed_result = {
                "study_id": study.study_id,
                "trial_id": trial.trial_id,
                "status": "failed",
                "failure_reason": failure_text,
                "probes": [],
            }
            Path(trial.result_path).write_text(json.dumps(failed_result), encoding="utf-8")

            next_state = store.ingest_trial_results(study.study_id)

            self.assertEqual(next_state.trials[0].status, "failed")
            self.assertEqual(next_state.trials[0].failure_reason, failure_text)
            self.assertEqual(next_state.trials[0].failure_stage, "")

    def test_ingest_trial_results_records_failure_stage(self) -> None:
        """Ingesting a failed result copies ``failure_stage`` onto the trial summary."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            initial_state = store.load_state(study.study_id)
            tp4_proposal = Proposal.from_dict(
                {
                    "observation": "Obs",
                    "diagnosis": "Diag",
                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
                    "expected_effects": ["raise rate"],
                }
            )
            trial, _ = store.materialize_trial(
                study=study, state=initial_state, proposal=tp4_proposal
            )

            # Write the trial's result file by hand, this time with a failure stage.
            failed_result = {
                "study_id": study.study_id,
                "trial_id": trial.trial_id,
                "status": "failed",
                "failure_stage": "engine_launch",
                "failure_reason": "engine_process_exited_before_ready exit_code=1",
                "probes": [],
            }
            Path(trial.result_path).write_text(json.dumps(failed_result), encoding="utf-8")

            next_state = store.ingest_trial_results(study.study_id)

            self.assertEqual(next_state.trials[0].failure_stage, "engine_launch")

    def test_ingest_trial_results_prefers_higher_request_rate_per_gpu(self) -> None:
        """The best trial is picked by per-GPU request rate, not by raw request rate.

        Trial A (TP=4) reports a rate of 4.0 and trial B (TP=2) a rate of 3.0.
        After ingestion the expected per-GPU rates are 1.0 and 1.5, so B must be
        the study's best trial.
        """

        def tp_proposal(tp_size):
            # Proposal that only sets tensor-parallel-size.
            return Proposal.from_dict(
                {
                    "observation": "Obs",
                    "diagnosis": "Diag",
                    "config_patch": {
                        "env_patch": {},
                        "flag_patch": {"tensor-parallel-size": tp_size},
                    },
                    "expected_effects": ["raise rate"],
                }
            )

        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            spec_file = _write_study_assets(root)
            study = load_study_spec(spec_file)
            store = StudyStore(root / ".aituner" / "studies")
            store.init_study(spec_path=spec_file, study=study)
            state = store.load_state(study.study_id)

            def write_completed(trial, sampling_u, request_rate):
                # Hand-write a completed result file for the given trial.
                completed = {
                    "study_id": study.study_id,
                    "trial_id": trial.trial_id,
                    "status": "completed",
                    "best_sampling_u": sampling_u,
                    "best_request_rate": request_rate,
                    "best_pass_rate": 0.97,
                }
                Path(trial.result_path).write_text(json.dumps(completed), encoding="utf-8")

            proposal_a = tp_proposal(4)
            trial_a, state = store.materialize_trial(
                study=study, state=state, proposal=proposal_a
            )
            write_completed(trial_a, 0.5, 4.0)

            proposal_b = tp_proposal(2)
            trial_b, _ = store.materialize_trial(study=study, state=state, proposal=proposal_b)
            write_completed(trial_b, 0.4, 3.0)

            next_state = store.ingest_trial_results(study.study_id)

            self.assertEqual(next_state.best_trial_id, trial_b.trial_id)
            self.assertEqual(next_state.best_parallel_size, 2)
            self.assertEqual(next_state.best_request_rate, 3.0)
            self.assertEqual(next_state.best_request_rate_per_gpu, 1.5)
            per_size = next_state.best_by_parallel_size
            self.assertEqual(per_size["4"]["best_request_rate_per_gpu"], 1.0)
            self.assertEqual(per_size["2"]["best_request_rate_per_gpu"], 1.5)

    def test_validate_proposal_rejects_invalid_tp_dp_product(self) -> None:
        # A TP=2 x DP=2 patch (product 4) must be rejected with the
        # "must equal hardware.gpu_count" error while the
        # require_tp_dp_product_equals_gpu_count constraint is switched on.
        # NOTE(review): the GPU count comes from the _write_study_assets
        # defaults, which are not visible here -- presumably it is not 4.
        power_of_two_sizes = [1, 2, 4, 8]
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "enable-expert-parallel": True,
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "expert-parallel-size": 8,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
            ],
            "topology_constraints": {
                "require_tp_dp_product_equals_gpu_count": True,
                "require_ep_size_leq_tp_dp_product": True,
                "require_ep_size_divides_tp_dp_product": True,
                "allowed_tensor_parallel_sizes": list(power_of_two_sizes),
                "allowed_data_parallel_sizes": list(power_of_two_sizes),
                "allowed_expert_parallel_sizes": list(power_of_two_sizes),
            },
        }
        bad_topology_patch = {
            "tensor-parallel-size": 2,
            "data-parallel-size": 2,
            "expert-parallel-size": 4,
        }
        proposal_fields = {
            "observation": "Obs",
            "diagnosis": "Bad topology",
            "config_patch": {"env_patch": {}, "flag_patch": bad_topology_patch},
            "expected_effects": ["raise throughput"],
        }
        with tempfile.TemporaryDirectory() as scratch:
            study_path = _write_study_assets(
                Path(scratch),
                engine_overrides=engine_overrides,
            )
            study = load_study_spec(study_path)
            proposal = Proposal.from_dict(proposal_fields)
            # Callable form of assertRaisesRegex: runs validate_proposal and
            # checks both the exception type and the message pattern.
            self.assertRaisesRegex(
                SpecError,
                "must equal hardware.gpu_count",
                validate_proposal,
                proposal,
                study,
            )

    def test_validate_proposal_rejects_invalid_ep_divisibility(self) -> None:
        # Only expert-parallel-size is patched, to 3. With the base flags at
        # TP=4 and DP=2 (product 8), 3 neither divides the product nor appears
        # in the allowed expert-parallel sizes, so a SpecError mentioning
        # "expert-parallel-size=3" is expected.
        allowed_sizes = (1, 2, 4, 8)
        base_flags = {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": True,
            "tensor-parallel-size": 4,
            "data-parallel-size": 2,
            "expert-parallel-size": 8,
        }
        topology_constraints = {
            "require_tp_dp_product_equals_gpu_count": True,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tensor_parallel_sizes": list(allowed_sizes),
            "allowed_data_parallel_sizes": list(allowed_sizes),
            "allowed_expert_parallel_sizes": list(allowed_sizes),
        }
        with tempfile.TemporaryDirectory() as workdir:
            study_path = _write_study_assets(
                Path(workdir),
                engine_overrides={
                    "base_flags": base_flags,
                    "tunable_flags": [
                        "tensor-parallel-size",
                        "data-parallel-size",
                        "expert-parallel-size",
                    ],
                    "topology_constraints": topology_constraints,
                },
            )
            study = load_study_spec(study_path)
            config_patch = {
                "env_patch": {},
                "flag_patch": {"expert-parallel-size": 3},
            }
            proposal = Proposal.from_dict(
                {
                    "observation": "Obs",
                    "diagnosis": "Bad EP",
                    "config_patch": config_patch,
                    "expected_effects": ["raise throughput"],
                }
            )
            self.assertRaisesRegex(
                SpecError,
                "expert-parallel-size=3",
                validate_proposal,
                proposal,
                study,
            )

    def test_validate_proposal_accepts_valid_tp_dp_ep_combo(self) -> None:
        # TP=2 x DP=4 with EP=4 keeps the TP*DP product at 8 (same as the
        # base flags' 4 x 2) and EP divides it, so validate_proposal should
        # return a proposal that still carries the requested TP value.
        sizes = [1, 2, 4, 8]
        constraints = {
            "require_tp_dp_product_equals_gpu_count": True,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
        }
        for axis in ("tensor", "data", "expert"):
            # Same insertion order as spelling the three keys out by hand.
            constraints[f"allowed_{axis}_parallel_sizes"] = list(sizes)
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "enable-expert-parallel": True,
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "expert-parallel-size": 8,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
            ],
            "topology_constraints": constraints,
        }
        with tempfile.TemporaryDirectory() as sandbox:
            spec_file = _write_study_assets(Path(sandbox), engine_overrides=engine_overrides)
            study = load_study_spec(spec_file)
            requested_flags = {
                "tensor-parallel-size": 2,
                "data-parallel-size": 4,
                "expert-parallel-size": 4,
            }
            candidate = Proposal.from_dict(
                {
                    "observation": "Obs",
                    "diagnosis": "Valid topology",
                    "config_patch": {"env_patch": {}, "flag_patch": requested_flags},
                    "expected_effects": ["raise throughput"],
                }
            )
            accepted = validate_proposal(candidate, study)
            accepted_flags = accepted.config_patch.flag_patch
            self.assertEqual(accepted_flags["tensor-parallel-size"], 2)

    def test_validate_proposal_accepts_allowed_tp_dp_product_above_gpu_count(self) -> None:
        # With the equals-gpu-count requirement disabled and an explicit
        # allowed_tp_dp_products list, a TP=4 x DP=2 patch (product 8, which
        # is in the list) should validate and keep the requested DP value.
        parallel_sizes = [1, 2, 4, 8]
        base_flags = {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": False,
            "tensor-parallel-size": 4,
            "data-parallel-size": 1,
            "expert-parallel-size": 1,
        }
        tunable_flags = [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        ]
        topology_constraints = {
            "require_tp_dp_product_equals_gpu_count": False,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tp_dp_products": list(parallel_sizes),
            "allowed_tensor_parallel_sizes": list(parallel_sizes),
            "allowed_data_parallel_sizes": list(parallel_sizes),
            "allowed_expert_parallel_sizes": [1],
        }
        proposal_fields = {
            "observation": "Obs",
            "diagnosis": "Allow product 8",
            "config_patch": {
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 4,
                    "data-parallel-size": 2,
                    "expert-parallel-size": 1,
                },
            },
            "expected_effects": ["explore larger topology"],
        }
        with tempfile.TemporaryDirectory() as scratch:
            study_path = _write_study_assets(
                Path(scratch),
                engine_overrides={
                    "base_flags": base_flags,
                    "tunable_flags": tunable_flags,
                    "topology_constraints": topology_constraints,
                },
            )
            study = load_study_spec(study_path)
            outcome = validate_proposal(Proposal.from_dict(proposal_fields), study)
            self.assertEqual(outcome.config_patch.flag_patch["data-parallel-size"], 2)

    def test_validate_proposal_rejects_tp_dp_product_outside_allowed_set(self) -> None:
        # TP=3 and DP=2 are each individually allowed here, but their product
        # (6) is absent from allowed_tp_dp_products, so validation is expected
        # to raise a SpecError that quotes the allowed product list.
        per_axis_sizes = [1, 2, 3, 4, 8]
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "enable-expert-parallel": False,
                "tensor-parallel-size": 4,
                "data-parallel-size": 1,
                "expert-parallel-size": 1,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
            ],
            "topology_constraints": {
                "require_tp_dp_product_equals_gpu_count": False,
                "require_ep_size_leq_tp_dp_product": True,
                "require_ep_size_divides_tp_dp_product": True,
                "allowed_tp_dp_products": [1, 2, 4, 8],
                "allowed_tensor_parallel_sizes": list(per_axis_sizes),
                "allowed_data_parallel_sizes": list(per_axis_sizes),
                "allowed_expert_parallel_sizes": [1],
            },
        }
        off_list_flags = {
            "tensor-parallel-size": 3,
            "data-parallel-size": 2,
            "expert-parallel-size": 1,
        }
        with tempfile.TemporaryDirectory() as workdir:
            spec_file = _write_study_assets(Path(workdir), engine_overrides=engine_overrides)
            study = load_study_spec(spec_file)
            candidate = Proposal.from_dict(
                {
                    "observation": "Obs",
                    "diagnosis": "Invalid product",
                    "config_patch": {"env_patch": {}, "flag_patch": off_list_flags},
                    "expected_effects": ["explore invalid topology"],
                }
            )
            # The brackets are escaped so the regex matches the literal list.
            self.assertRaisesRegex(
                SpecError,
                "not in \\[1, 2, 4, 8\\]",
                validate_proposal,
                candidate,
                study,
            )

    def test_cli_tune_runs_multiple_manual_proposals(self) -> None:
        # Runs `study tune` with two --proposal-file arguments against a
        # stubbed run_trial, then checks that the persisted state names the
        # second trial as best and has advanced next_trial_index to 3.
        manual_proposals = (
            (
                "proposal-1.json",
                {
                    "observation": "trial one",
                    "diagnosis": "conservative",
                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
                    "expected_effects": ["stable"],
                    "why_not_previous_failures": "",
                },
            ),
            (
                "proposal-2.json",
                {
                    "observation": "trial two",
                    "diagnosis": "more batching",
                    "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
                    "expected_effects": ["higher throughput"],
                    "why_not_previous_failures": "",
                },
            ),
        )

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            # Stub runner: writes result.json into the trial's artifact dir;
            # a trial id ending in "0001" reports the lower rate.
            trial_spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            trial_id = str(trial_spec["trial_id"])
            artifact_dir = Path(trial_spec["artifact_dir"])
            best_rate, best_u = (1.0, 0.5) if trial_id.endswith("0001") else (2.0, 0.75)
            outcome = {
                "study_id": trial_spec["study_id"],
                "trial_id": trial_id,
                "status": "completed",
                "best_sampling_u": best_u,
                "best_request_rate": best_rate,
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            (artifact_dir / "result.json").write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)
            study_path = _write_study_assets(root)
            store_root = root / "store"
            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
            ]
            for file_name, body in manual_proposals:
                proposal_file = root / file_name
                proposal_file.write_text(json.dumps(body), encoding="utf-8")
                argv += ["--proposal-file", str(proposal_file)]

            with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial):
                exit_code = cli_main(argv)

            self.assertEqual(exit_code, 0)
            final_state = StudyStore(store_root).load_state("study-1")
            self.assertEqual(final_state.best_trial_id, "trial-0002")
            self.assertEqual(final_state.best_sampling_u, 0.75)
            self.assertEqual(final_state.best_request_rate, 2.0)
            self.assertEqual(final_state.next_trial_index, 3)

    def test_cli_tune_honors_should_stop_proposal(self) -> None:
        # A manual proposal file with should_stop=True must end tuning
        # successfully without launching a trial: run_trial is never called
        # and next_trial_index stays at 1.
        stop_proposal = {
            "observation": "incumbent converged",
            "diagnosis": "no adjacent harness probe is justified",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": ["stop without spending another GPU trial"],
            "why_not_previous_failures": "not applicable",
            "should_stop": True,
        }
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)
            study_path = _write_study_assets(root)
            stop_file = root / "stop.json"
            stop_file.write_text(json.dumps(stop_proposal), encoding="utf-8")
            store_root = root / "store"
            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
                "--proposal-file",
                str(stop_file),
            ]
            with mock.patch("aituner.cli.run_trial") as run_trial_mock:
                exit_code = cli_main(argv)

            self.assertEqual(exit_code, 0)
            run_trial_mock.assert_not_called()
            persisted = StudyStore(store_root).load_state("study-1")
            self.assertEqual(persisted.next_trial_index, 1)

    def test_cli_tune_vetoes_unauthorized_llm_stop(self) -> None:
        # The mocked LLM always answers with should_stop=True. The CLI's JSON
        # report on stdout must show at least one vetoed stop, and the last
        # honored stop must be attributed to "llm_after_veto_budget".
        llm_endpoint = {
            "provider": "custom",
            "base_url": "http://localhost:9/v1",
            "model": "test-model",
            "api_key_env": "AITUNER_TEST_KEY",
        }
        stop_payload = json.dumps(
            {
                "observation": "looks done",
                "diagnosis": "agent thinks it converged",
                "config_patch": {"env_patch": {}, "flag_patch": {}},
                "expected_effects": ["stop"],
                "why_not_previous_failures": "n/a",
                "should_stop": True,
            }
        )
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)
            study_path = _write_study_assets(root)
            # Re-write the study spec with an LLM endpoint configured.
            spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
            spec_doc["llm"]["endpoint"] = llm_endpoint
            study_path.write_text(json.dumps(spec_doc), encoding="utf-8")
            store_root = root / "store"
            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
                "--skip-baseline",
                "--max-trials",
                "2",
            ]
            captured_stdout = io.StringIO()
            with mock.patch("aituner.cli.run_trial") as run_trial_mock:
                with mock.patch(
                    "aituner.cli.call_llm_for_proposal", return_value=stop_payload
                ):
                    with contextlib.redirect_stdout(captured_stdout):
                        exit_code = cli_main(argv)

            self.assertEqual(exit_code, 0)
            run_trial_mock.assert_not_called()
            report = json.loads(captured_stdout.getvalue())
            executed = report["executed_trials"]
            # The first unauthorized LLM stop is vetoed; the second is honored
            # only after the veto budget is spent.
            self.assertTrue(any(entry.get("stop_vetoed") for entry in executed))
            honored = [entry for entry in executed if entry.get("stopped")]
            self.assertTrue(honored)
            self.assertEqual(honored[-1]["stop_authorized_by"], "llm_after_veto_budget")

    def test_cli_tune_rejects_repeated_materialized_llm_config(self) -> None:
        # Seeds a study whose only trial (trial-0002) used TP=2, DP=4 and
        # max-num-seqs=160, then has the mocked LLM propose max-num-seqs=160
        # alone. The CLI must exit with code 2, run no trial, and report on
        # stderr that the proposal repeats trial-0002's effective config.
        # NOTE(review): presumably the patch inherits the incumbent topology
        # when materialized, which is what makes it a duplicate -- confirm.
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "max-num-seqs": 64,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2, 4, 8],
                "allowed_tp_dp_products": [1, 2, 4, 8],
            },
        }
        llm_endpoint = {
            "provider": "custom",
            "base_url": "http://localhost:9/v1",
            "model": "test-model",
            "api_key_env": "AITUNER_TEST_KEY",
        }
        repeated_runtime_patch = json.dumps(
            {
                "observation": "Try the same runtime setting.",
                "diagnosis": "This is duplicate after topology inheritance.",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}},
                "expected_effects": ["should be vetoed"],
                "why_not_previous_failures": "",
                "should_stop": False,
            }
        )
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)
            study_path = _write_study_assets(root, engine_overrides=engine_overrides)
            # Disable the harness and configure an LLM endpoint in the spec.
            spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
            spec_doc["llm"]["use_harness"] = False
            spec_doc["llm"]["endpoint"] = llm_endpoint
            study_path.write_text(json.dumps(spec_doc), encoding="utf-8")
            study = load_study_spec(study_path)
            store_root = root / "store"
            store = StudyStore(store_root)
            store.init_study(spec_path=study_path, study=study)

            incumbent = TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=8,
                best_sampling_u=0.125,
                best_request_rate=3.0,
                best_request_rate_per_gpu=0.375,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "data-parallel-size": 4,
                        "max-num-seqs": 160,
                    },
                },
            )
            seeded_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_sampling_u=0.125,
                best_request_rate=3.0,
                best_request_rate_per_gpu=0.375,
                next_trial_index=3,
                trials=[incumbent],
            )
            store.save_state(seeded_state)

            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
                "--skip-baseline",
                "--max-trials",
                "3",
            ]
            captured_stderr = io.StringIO()
            with mock.patch("aituner.cli.run_trial") as run_trial_mock:
                with mock.patch(
                    "aituner.cli.call_llm_for_proposal",
                    return_value=repeated_runtime_patch,
                ):
                    with contextlib.redirect_stderr(captured_stderr):
                        exit_code = cli_main(argv)

            self.assertEqual(exit_code, 2)
            run_trial_mock.assert_not_called()
            self.assertIn(
                "repeats an already tested effective full config",
                captured_stderr.getvalue(),
            )
            self.assertIn("trial-0002", captured_stderr.getvalue())

    def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
        # Seeds four completed trials (next_trial_index=5) and runs
        # `study tune --max-trials 5`. Expected outcome: neither the LLM nor
        # run_trial is called, a harness-stop-0005 proposal and a
        # candidate-set-0005 snapshot are written, and the stored state
        # records a validator-authorized "harness_stop".
        empty_patch = {"env_patch": {}, "flag_patch": {}}
        tp2_dp4_patch = {
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 2,
                "data-parallel-size": 4,
            },
        }
        tp1_dp8_patch = {
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 1,
                "data-parallel-size": 8,
            },
        }
        max_num_seqs_patch = {
            "env_patch": {},
            "flag_patch": {"max-num-seqs": 160},
        }
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)
            study_path = _write_study_assets(root)
            study = load_study_spec(study_path)
            store_root = root / "store"
            store = StudyStore(store_root)
            store.init_study(spec_path=study_path, study=study)

            history = [
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=0.8,
                    best_request_rate_per_gpu=0.1,
                    config_patch=empty_patch,
                ),
                TrialSummary(
                    trial_id="trial-0002",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.4,
                    best_request_rate_per_gpu=0.3,
                    config_patch=tp2_dp4_patch,
                ),
                TrialSummary(
                    trial_id="trial-0003",
                    status="completed",
                    parallel_size=8,
                    config_patch=tp1_dp8_patch,
                ),
                TrialSummary(
                    trial_id="trial-0004",
                    status="completed",
                    parallel_size=8,
                    config_patch=max_num_seqs_patch,
                ),
            ]
            store.save_state(
                StudyState(
                    study_id=study.study_id,
                    best_trial_id="trial-0002",
                    best_parallel_size=8,
                    best_sampling_u=0.02,
                    best_request_rate=2.4,
                    best_request_rate_per_gpu=0.3,
                    next_trial_index=5,
                    trials=history,
                )
            )

            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
                "--max-trials",
                "5",
            ]
            with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock, mock.patch(
                "aituner.cli.run_trial"
            ) as run_trial_mock:
                exit_code = cli_main(argv)

            self.assertEqual(exit_code, 0)
            llm_mock.assert_not_called()
            run_trial_mock.assert_not_called()

            # The harness must have persisted its own stop proposal.
            stop_proposal_file = store.study_root(study.study_id).joinpath(
                "proposals", "harness-stop-0005.json"
            )
            self.assertTrue(stop_proposal_file.exists())
            stop_proposal = json.loads(stop_proposal_file.read_text(encoding="utf-8"))
            self.assertTrue(stop_proposal["should_stop"])

            # ...along with a snapshot of the candidate set it decided on.
            snapshot_file = store.study_root(study.study_id).joinpath(
                "harness", "candidate-set-0005.json"
            )
            self.assertTrue(snapshot_file.exists())
            snapshot = json.loads(snapshot_file.read_text(encoding="utf-8"))
            self.assertEqual(snapshot["schema_version"], 1)
            self.assertEqual(snapshot["iteration"], 5)
            for top_level_key in ("candidate_set_hash", "candidate_set"):
                self.assertIn(top_level_key, snapshot)
            for decision_key in ("harness_stop", "stop_authority"):
                self.assertIn(decision_key, snapshot["decisions"])

            final_state = store.load_state(study.study_id)
            self.assertEqual(final_state.tuning_stop_reason, "harness_stop")
            stop_details = final_state.tuning_stop_details
            self.assertEqual(stop_details["proposal_name"], "harness-stop-0005")
            self.assertEqual(
                final_state.tuning_stop_details["proposal_source"], "harness"
            )
            self.assertEqual(
                final_state.tuning_stop_details["stop_authorized_by"], "validator"
            )
            self.assertTrue(final_state.tuning_stop_diagnosis)

    def test_cli_tune_llm_first_skips_deterministic_harness_proposal(self) -> None:
        # With `--proposal-policy llm-first`, the mocked LLM is expected to be
        # consulted exactly once and its proposal stored as proposal-0002;
        # no harness-proposal-0002 file may appear, while the harness
        # candidate-set snapshot for iteration 2 is still written.
        llm_endpoint = {
            "provider": "custom",
            "base_url": "http://llm.example/v1",
            "wire_api": "chat.completions",
            "model": "test-model",
            "api_key_env": "OPENAI_API_KEY",
        }
        llm_payload = json.dumps(
            {
                "observation": "Use harness evidence but let the LLM choose.",
                "diagnosis": "Try higher admission concurrency.",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
                "expected_effects": ["measure admission concurrency"],
                "why_not_previous_failures": "does not repeat a prior full config",
                "should_stop": False,
            }
        )

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            # Stub runner: always reports a completed trial and writes
            # result.json into the trial's artifact directory.
            trial_spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            artifact_dir = Path(trial_spec["artifact_dir"])
            outcome = {
                "study_id": trial_spec["study_id"],
                "trial_id": trial_spec["trial_id"],
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 2.0,
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            (artifact_dir / "result.json").write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)
            study_path = _write_study_assets(root)
            spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
            spec_doc["llm"]["endpoint"] = llm_endpoint
            study_path.write_text(json.dumps(spec_doc), encoding="utf-8")
            study = load_study_spec(study_path)
            store_root = root / "store"
            store = StudyStore(store_root)
            store.init_study(spec_path=study_path, study=study)

            baseline = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=1.0,
                best_request_rate_per_gpu=0.125,
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            store.save_state(
                StudyState(
                    study_id=study.study_id,
                    best_trial_id="trial-0001",
                    best_parallel_size=8,
                    best_sampling_u=0.25,
                    best_request_rate=1.0,
                    best_request_rate_per_gpu=0.125,
                    next_trial_index=2,
                    trials=[baseline],
                )
            )

            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
                "--skip-baseline",
                "--max-trials",
                "2",
                "--proposal-policy",
                "llm-first",
            ]
            with mock.patch(
                "aituner.cli.call_llm_for_proposal", return_value=llm_payload
            ) as llm_mock, mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial):
                exit_code = cli_main(argv)

            self.assertEqual(exit_code, 0)
            llm_mock.assert_called_once()
            proposal_root = store.study_root(study.study_id) / "proposals"
            llm_proposal_file = proposal_root / "proposal-0002.json"
            harness_proposal_file = proposal_root / "harness-proposal-0002.json"
            self.assertTrue(llm_proposal_file.exists())
            self.assertFalse(harness_proposal_file.exists())
            snapshot_file = store.study_root(study.study_id).joinpath(
                "harness", "candidate-set-0002.json"
            )
            self.assertTrue(snapshot_file.exists())

    def test_cli_tune_records_advisory_llm_out_of_set_candidate_family_gap(self) -> None:
        """Run an LLM proposal that is not in the harness candidate set and inspect the gap record.

        The study spec leaves ``llm.harness_candidate_policy`` unset and the
        test expects the attribution file to report ``advisory``.  The mocked
        harness offers ``max-num-seqs=16`` while the mocked LLM asks for 24;
        the CLI summary, the attribution file and the candidate-family-gap
        file are then checked.
        """
        # Single eligible harness candidate; its patch differs from the LLM's.
        harness_candidate = {
            "candidate_id": "cand-mns16",
            "action_id": "coordinate_step:max-num-seqs:8->16",
            "knob_family": "max-num-seqs",
            "score": 0.8,
            "effective_config_fingerprint": "not-the-llm-proposal",
            "config_patch": {
                "env_patch": {},
                "flag_patch": {"max-num-seqs": 16},
            },
        }
        harness_context = {
            "experiment_plan": {
                "planner_version": "test",
                "candidate_set": {
                    "candidate_set_hash": "candidate-set-test",
                    "eligible_candidates": [harness_candidate],
                    "blocked_candidates": [],
                },
                "next_action": None,
            }
        }
        llm_payload = json.dumps(
            {
                "observation": "Harness is in the right admission direction but too conservative.",
                "diagnosis": "Try a larger same-operator admission step.",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 24}},
                "expected_effects": ["test whether admission capacity was underexplored"],
                "why_not_previous_failures": "new value and no launch failure evidence",
                "should_stop": False,
            }
        )

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Stand-in for ``run_trial``: write a completed result.json and return it."""
            spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            artifact_dir = Path(spec["artifact_dir"])
            outcome = {
                "study_id": spec["study_id"],
                "trial_id": spec["trial_id"],
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 2.0,
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            (artifact_dir / "result.json").write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir)

            # Point the study at a custom LLM endpoint.
            spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
            spec_doc["llm"]["endpoint"] = {
                "provider": "custom",
                "base_url": "http://llm.example/v1",
                "wire_api": "chat.completions",
                "model": "test-model",
                "api_key_env": "OPENAI_API_KEY",
            }
            study_path.write_text(json.dumps(spec_doc), encoding="utf-8")

            study = load_study_spec(study_path)
            store_root = workdir / "store"
            store = StudyStore(store_root)
            store.init_study(spec_path=study_path, study=study)

            # Seed the store with one completed trial so the next index is 2.
            prior_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=1,
                best_request_rate=1.0,
                best_request_rate_per_gpu=1.0,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"max-num-seqs": 8},
                },
            )
            seeded_state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=1,
                best_sampling_u=0.25,
                best_request_rate=1.0,
                best_request_rate_per_gpu=1.0,
                next_trial_index=2,
                trials=[prior_trial],
            )
            store.save_state(seeded_state)

            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
                "--skip-baseline",
                "--max-trials",
                "2",
                "--proposal-policy",
                "llm-first",
            ]
            captured = io.StringIO()
            with contextlib.ExitStack() as stack:
                # Entered in the same order as the original nested ``with`` chain.
                stack.enter_context(
                    mock.patch("aituner.cli.build_harness_context", return_value=harness_context)
                )
                stack.enter_context(
                    mock.patch("aituner.llm.build_harness_context", return_value=harness_context)
                )
                stack.enter_context(
                    mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload)
                )
                stack.enter_context(mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial))
                stack.enter_context(contextlib.redirect_stdout(captured))
                exit_code = cli_main(argv)

            self.assertEqual(exit_code, 0)

            # The CLI summary is parsed from what was printed to stdout.
            cli_summary = json.loads(captured.getvalue())
            executed = cli_summary["executed_trials"]
            first_trial = executed[0]
            self.assertEqual(first_trial["proposal_origin"], "llm_out_of_set")
            self.assertTrue(first_trial["candidate_family_gap_path"])

            attribution_file = (
                store.study_root(study.study_id) / "proposal_attributions" / "proposal-0002.json"
            )
            attribution = json.loads(attribution_file.read_text(encoding="utf-8"))
            self.assertEqual(attribution["proposal_origin"], "llm_out_of_set")
            self.assertEqual(attribution["harness_candidate_policy"], "advisory")

            gap_file = Path(first_trial["candidate_family_gap_path"])
            gap = json.loads(gap_file.read_text(encoding="utf-8"))
            self.assertEqual(gap["gap_type"], "same_operator_new_step")
            self.assertEqual(gap["review_status"], "pending")
            self.assertEqual(gap["changed_knobs"], ["flag:max-num-seqs"])
            self.assertEqual(gap["proposal_patch"]["flag_patch"]["max-num-seqs"], 24)
            self.assertEqual(gap["nearest_harness_candidates"][0]["candidate_id"], "cand-mns16")

    def test_cli_tune_strict_harness_policy_rejects_llm_out_of_set_proposal(self) -> None:
        """With ``llm.harness_candidate_policy`` set to ``strict`` an out-of-set LLM patch is refused.

        The test expects exit code 2, no call to ``run_trial`` and a stderr
        message mentioning ``llm.harness_candidate_policy=strict``.
        """
        # The only eligible harness candidate uses max-num-seqs=16 ...
        harness_context = {
            "experiment_plan": {
                "candidate_set": {
                    "candidate_set_hash": "candidate-set-test",
                    "eligible_candidates": [
                        {
                            "candidate_id": "cand-mns16",
                            "effective_config_fingerprint": "not-the-llm-proposal",
                            "config_patch": {
                                "env_patch": {},
                                "flag_patch": {"max-num-seqs": 16},
                            },
                        }
                    ],
                }
            }
        }
        # ... while the mocked LLM proposes max-num-seqs=24.
        llm_payload = json.dumps(
            {
                "observation": "Try an out-of-set candidate.",
                "diagnosis": "strict mode should reject this.",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 24}},
                "expected_effects": ["should not run"],
                "why_not_previous_failures": "",
                "should_stop": False,
            }
        )

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir)

            spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
            llm_section = spec_doc["llm"]
            llm_section["harness_candidate_policy"] = "strict"
            llm_section["endpoint"] = {
                "provider": "custom",
                "base_url": "http://llm.example/v1",
                "wire_api": "chat.completions",
                "model": "test-model",
                "api_key_env": "OPENAI_API_KEY",
            }
            study_path.write_text(json.dumps(spec_doc), encoding="utf-8")

            study = load_study_spec(study_path)
            store_root = workdir / "store"
            store = StudyStore(store_root)
            store.init_study(spec_path=study_path, study=study)

            # One completed trial is already recorded; the next index is 2.
            prior_trial = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=1,
                best_request_rate=1.0,
                best_request_rate_per_gpu=1.0,
                config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 8}},
            )
            store.save_state(
                StudyState(
                    study_id=study.study_id,
                    best_trial_id="trial-0001",
                    best_parallel_size=1,
                    best_request_rate=1.0,
                    best_request_rate_per_gpu=1.0,
                    next_trial_index=2,
                    trials=[prior_trial],
                )
            )

            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
                "--skip-baseline",
                "--max-trials",
                "2",
                "--proposal-policy",
                "llm-first",
            ]
            stderr = io.StringIO()
            with contextlib.ExitStack() as stack:
                # Entered in the same order as the original nested ``with`` chain.
                stack.enter_context(
                    mock.patch("aituner.cli.build_harness_context", return_value=harness_context)
                )
                stack.enter_context(
                    mock.patch("aituner.llm.build_harness_context", return_value=harness_context)
                )
                stack.enter_context(
                    mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload)
                )
                run_trial_mock = stack.enter_context(mock.patch("aituner.cli.run_trial"))
                stack.enter_context(contextlib.redirect_stderr(stderr))
                exit_code = cli_main(argv)

            self.assertEqual(exit_code, 2)
            run_trial_mock.assert_not_called()
            self.assertIn("llm.harness_candidate_policy=strict", stderr.getvalue())

    def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
        """Tune without ``--skip-baseline`` and check the recorded patches of both trials.

        The first stored trial is expected to carry an empty config patch and
        the second the ``max-num-seqs=64`` patch returned by the mocked LLM.
        """
        llm_payload = json.dumps(
            {
                "observation": "baseline done",
                "diagnosis": "try more batching",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
                "expected_effects": ["higher throughput"],
                "why_not_previous_failures": "",
                "should_stop": False,
            }
        )

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Stand-in for ``run_trial``: write a completed result.json and return it."""
            trial_spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            artifact_dir = Path(trial_spec["artifact_dir"])
            outcome = {
                "study_id": trial_spec["study_id"],
                "trial_id": trial_spec["trial_id"],
                "status": "completed",
                "best_sampling_u": 0.25,
                "best_request_rate": 1.0,
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            (artifact_dir / "result.json").write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir)

            spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
            spec_doc["llm"]["endpoint"] = {
                "provider": "custom",
                "base_url": "http://llm.example/v1",
                "wire_api": "chat.completions",
                "model": "test-model",
                "api_key_env": "OPENAI_API_KEY",
            }
            study_path.write_text(json.dumps(spec_doc), encoding="utf-8")
            store_root = workdir / "store"

            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
                "--max-trials",
                "2",
            ]
            run_patch = mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial)
            llm_patch = mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload)
            with run_patch, llm_patch:
                exit_code = cli_main(argv)
            self.assertEqual(exit_code, 0)

            state = StudyStore(store_root).load_state("study-1")
            self.assertEqual(state.next_trial_index, 3)
            baseline_trial, llm_trial = state.trials[0], state.trials[1]
            self.assertEqual(baseline_trial.config_patch, {"env_patch": {}, "flag_patch": {}})
            self.assertEqual(llm_trial.config_patch["flag_patch"], {"max-num-seqs": 64})

    def test_cli_tune_stops_when_baseline_is_all_infeasible(self) -> None:
        """A baseline with no feasible probe ends tuning and records a stop diagnosis.

        The first CLI run is expected to execute only the baseline, skip the
        LLM, and persist ``baseline_all_infeasible`` together with latency
        details.  A second run against the same store is expected to call
        neither ``run_trial`` nor the LLM.
        """

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Stand-in for ``run_trial`` whose probes are all infeasible."""
            trial_spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            artifact_dir = Path(trial_spec["artifact_dir"])
            infeasible_probes = [
                {
                    "threshold": 0.5,
                    "feasible": False,
                    "payload": {"pass_rate": 0.0, "request_rate": 2.0},
                },
                {
                    "threshold": 0.25,
                    "feasible": False,
                    "payload": {"pass_rate": 0.5, "request_rate": 1.0},
                },
            ]
            latency_summary = {
                "ttft_ms": {
                    "count": 2,
                    "mean": 1200.0,
                    "p50": 1100.0,
                    "p95": 1900.0,
                    "p99": 1980.0,
                },
                "tpot_ms": {
                    "count": 2,
                    "mean": 35.0,
                    "p50": 32.0,
                    "p95": 48.0,
                    "p99": 49.0,
                },
            }
            outcome = {
                "study_id": trial_spec["study_id"],
                "trial_id": trial_spec["trial_id"],
                "status": "completed",
                "best_sampling_u": None,
                "best_request_rate": None,
                "best_pass_rate": None,
                "best_request_count": None,
                "probes": infeasible_probes,
                "all_infeasible_diagnostics": {
                    "threshold": 0.25,
                    "request_rate": 1.0,
                    "pass_rate": 0.5,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": latency_summary,
                },
            }
            (artifact_dir / "result.json").write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir)

            spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
            spec_doc["llm"]["endpoint"] = {
                "provider": "custom",
                "base_url": "http://llm.example/v1",
                "wire_api": "chat.completions",
                "model": "test-model",
                "api_key_env": "OPENAI_API_KEY",
            }
            study_path.write_text(json.dumps(spec_doc), encoding="utf-8")
            store_root = workdir / "store"

            def run_tune() -> int:
                """Invoke ``study tune`` with a freshly built argument list."""
                return cli_main(
                    [
                        "study",
                        "tune",
                        "--spec",
                        str(study_path),
                        "--store-root",
                        str(store_root),
                        "--max-trials",
                        "3",
                    ]
                )

            # First run: the baseline executes and tuning is expected to stop.
            first_run_patch = mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial)
            first_llm_patch = mock.patch("aituner.cli.call_llm_for_proposal")
            with first_run_patch, first_llm_patch as llm_mock:
                exit_code = run_tune()

            self.assertEqual(exit_code, 0)
            llm_mock.assert_not_called()

            state = StudyStore(store_root).load_state("study-1")
            self.assertEqual(state.next_trial_index, 2)
            self.assertEqual(len(state.trials), 1)
            self.assertEqual(state.tuning_stop_reason, "baseline_all_infeasible")
            self.assertIn("lowest_sampled_request_rate=1", state.tuning_stop_diagnosis)
            self.assertIn("lowest_probe_ttft_ms", state.tuning_stop_diagnosis)
            lowest_latency = state.tuning_stop_details["lowest_probe_latency_ms"]
            self.assertEqual(lowest_latency["ttft"]["p95"], 1900.0)
            self.assertEqual(lowest_latency["tpot"]["p99"], 49.0)

            # Second run on the same store: nothing further should execute.
            second_run_patch = mock.patch("aituner.cli.run_trial")
            second_llm_patch = mock.patch("aituner.cli.call_llm_for_proposal")
            with second_run_patch as run_trial_mock, second_llm_patch as llm_mock:
                exit_code = run_tune()

            self.assertEqual(exit_code, 0)
            run_trial_mock.assert_not_called()
            llm_mock.assert_not_called()

    def test_cli_tune_max_trials_is_total_budget_on_resume(self) -> None:
        """Resuming a store that already holds two trials with ``--max-trials 2`` runs nothing.

        The test expects exit code 0, no LLM call, no trial run, and an
        unchanged ``next_trial_index`` of 3.
        """
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir)

            spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
            spec_doc["llm"]["endpoint"] = {
                "provider": "custom",
                "base_url": "http://llm.example/v1",
                "wire_api": "chat.completions",
                "model": "test-model",
                "api_key_env": "OPENAI_API_KEY",
            }
            study_path.write_text(json.dumps(spec_doc), encoding="utf-8")

            store_root = workdir / "store"
            study = load_study_spec(study_path)
            store = StudyStore(store_root)
            store.init_study(spec_path=study_path, study=study)

            # Two completed trials are already on record.
            finished_trials = [
                TrialSummary(trial_id="trial-0001", status="completed"),
                TrialSummary(trial_id="trial-0002", status="completed"),
            ]
            store.save_state(
                StudyState(
                    study_id=study.study_id,
                    next_trial_index=3,
                    trials=finished_trials,
                )
            )

            argv = [
                "study",
                "tune",
                "--spec",
                str(study_path),
                "--store-root",
                str(store_root),
                "--max-trials",
                "2",
            ]
            llm_patch = mock.patch("aituner.cli.call_llm_for_proposal")
            run_patch = mock.patch("aituner.cli.run_trial")
            with llm_patch as llm_mock, run_patch as run_trial_mock:
                exit_code = cli_main(argv)

            self.assertEqual(exit_code, 0)
            llm_mock.assert_not_called()
            run_trial_mock.assert_not_called()
            resumed_state = store.load_state(study.study_id)
            self.assertEqual(resumed_state.next_trial_index, 3)

    def test_load_compare_spec_requires_window_selection(self) -> None:
        """A compare spec with neither window ids nor a selector must raise ``SpecError``."""
        empty_patch_candidate = {"config_patch": {"env_patch": {}, "flag_patch": {}}}
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir)

            # Spec deliberately omits both ``window_ids`` and ``window_selector``.
            compare_doc = {
                "compare_id": "compare-1",
                "study_spec_path": str(study_path),
                "baseline": empty_patch_candidate,
                "tuned": empty_patch_candidate,
            }
            compare_path = workdir / "compare.json"
            compare_path.write_text(json.dumps(compare_doc), encoding="utf-8")

            with self.assertRaisesRegex(SpecError, "window_ids or window_selector"):
                load_compare_spec(compare_path)

    def test_run_compare_outputs_summary_and_report(self) -> None:
        """Compare two windows and check the aggregate plus the written summary/report files.

        ``run_trial`` is replaced by a fake whose best rate depends on the
        window and the candidate; the tuned candidate is higher in both windows.
        """
        # Best request rate keyed by (window id, trial id); only read by the fake.
        best_rates = {
            ("chat_w1", "baseline"): 1.0,
            ("chat_w1", "tuned"): 3.0,
            ("chat_w2", "baseline"): 3.0,
            ("chat_w2", "tuned"): 7.0,
        }

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Stand-in for ``run_trial`` that looks up the rate for this window and candidate."""
            trial_spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            # The file named by ``study_spec_path`` holds the path of the real study spec.
            pointer_file = Path(trial_spec["study_spec_path"])
            real_spec_file = Path(pointer_file.read_text(encoding="utf-8").strip())
            study_doc = json.loads(real_spec_file.read_text(encoding="utf-8"))
            window_id = study_doc["trace"]["window_id"]
            trial_id = trial_spec["trial_id"]
            outcome = {
                "study_id": trial_spec["study_id"],
                "trial_id": trial_id,
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": best_rates[(window_id, trial_id)],
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            Path(trial_spec["result_path"]).write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir)
            windows_root = workdir / "trace_windows"

            # Add a one-request trace file for the second window.
            extra_request = {
                "request_id": "r4",
                "timestamp": 0.0,
                "sampling_u": 0.2,
                "messages": [{"role": "user", "content": "extra"}],
                "input_length": 3000,
                "output_length": 32,
            }
            (windows_root / "traces" / "chat_w2.jsonl").write_text(
                json.dumps(extra_request) + "\n",
                encoding="utf-8",
            )

            # Register the new window, then stamp date/slot fields on the first entry.
            windows_path = windows_root / "windows.json"
            windows_doc = json.loads(windows_path.read_text(encoding="utf-8"))
            windows_doc["windows"].append(
                {
                    "window_id": "chat_w2",
                    "trace_type": "chat",
                    "trace_file": "traces/chat_w2.jsonl",
                    "window_start": 0.0,
                    "window_end": 10.0,
                    "date": "2026-03-12",
                    "slot_token": "1000",
                    "slot_label": "10:00-10:10",
                }
            )
            first_window = windows_doc["windows"][0]
            first_window["date"] = "2026-03-11"
            first_window["slot_token"] = "1000"
            first_window["slot_label"] = "10:00-10:10"
            windows_path.write_text(json.dumps(windows_doc), encoding="utf-8")

            compare_path = _write_compare_assets(
                workdir,
                study_path=study_path,
                window_ids=["chat_w1", "chat_w2"],
            )
            output_root = workdir / ".compare"

            with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial):
                summary = run_compare(compare_path, output_root=workdir / ".compare")

            self.assertEqual(len(summary["windows"]), 2)
            self.assertEqual(summary["aggregate"]["wins"]["tuned"], 2)
            self.assertTrue((output_root / "summary.json").exists())
            self.assertTrue((output_root / "report.md").exists())

    def test_compare_aggregate_counts_failed_and_no_feasible_windows(self) -> None:
        """Check the per-candidate completed / failed / no-feasible counters of ``_aggregate_summary``.

        Window one: baseline completed with a rate, tuned completed without one.
        Window two: baseline failed, tuned completed with a rate.
        """
        window_one = {
            "baseline": {
                "status": "completed",
                "best_request_rate": 1.0,
                "best_request_rate_per_gpu": 1.0,
            },
            "tuned": {
                "status": "completed",
                "best_request_rate": None,
                "best_request_rate_per_gpu": None,
            },
            "delta": {"winner": "baseline"},
        }
        window_two = {
            "baseline": {
                "status": "failed",
                "best_request_rate": None,
                "best_request_rate_per_gpu": None,
            },
            "tuned": {
                "status": "completed",
                "best_request_rate": 2.0,
                "best_request_rate_per_gpu": 2.0,
            },
            "delta": {"winner": "tuned"},
        }

        summary = _aggregate_summary([window_one, window_two])

        # Checked in the same order as listed; the first mismatch fails the test.
        expected_counts = (
            ("baseline_completed_window_count", 1),
            ("baseline_failed_window_count", 1),
            ("baseline_no_feasible_window_count", 1),
            ("tuned_completed_window_count", 2),
            ("tuned_failed_window_count", 0),
            ("tuned_no_feasible_window_count", 1),
        )
        for counter_name, expected in expected_counts:
            self.assertEqual(summary[counter_name], expected)

    def test_run_compare_resolves_trial_ref_candidate(self) -> None:
        """A baseline given as ``trial_ref`` should pick up the referenced trial's config patch.

        A ``trial_spec.json`` with ``data-parallel-size=2`` is written under a
        prior study root; the compare summary is expected to report the
        baseline source kind ``trial_ref`` and that flag value.
        """

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Stand-in for ``run_trial``; the rate depends on ``data-parallel-size``."""
            trial_spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            flag_patch = (trial_spec["config_patch"] or {}).get("flag_patch") or {}
            if flag_patch.get("data-parallel-size") == 2:
                best_rate = 5.0
            else:
                best_rate = 2.0
            outcome = {
                "study_id": trial_spec["study_id"],
                "trial_id": trial_spec["trial_id"],
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": best_rate,
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            Path(trial_spec["result_path"]).write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir)

            # Lay out a prior study containing the trial to be referenced.
            prior_root = workdir / "prior-study"
            trial_dir = prior_root / "trials" / "trial-0002"
            trial_dir.mkdir(parents=True)
            referenced_spec = {
                "study_id": "prior-study",
                "trial_id": "trial-0002",
                "config_patch": {
                    "env_patch": {},
                    "flag_patch": {"data-parallel-size": 2},
                },
                "search": {
                    "low": 0.0,
                    "high": 1.0,
                    "tolerance": 0.01,
                    "max_probes": 8,
                    "sample_seed": 20260325,
                },
                "study_spec_path": str(study_path),
                "artifact_dir": str(trial_dir),
                "probe_log_path": str(trial_dir / "probe_history.json"),
                "engine_log_path": str(trial_dir / "engine.log"),
                "result_path": str(trial_dir / "result.json"),
            }
            (trial_dir / "trial_spec.json").write_text(
                json.dumps(referenced_spec), encoding="utf-8"
            )

            baseline_ref = {
                "trial_ref": {
                    "study_root": str(prior_root),
                    "trial_id": "trial-0002",
                }
            }
            compare_path = _write_compare_assets(
                workdir,
                study_path=study_path,
                window_ids=["chat_w1"],
                baseline=baseline_ref,
            )

            with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial):
                summary = run_compare(compare_path, output_root=workdir / ".compare")

            self.assertEqual(summary["baseline_source"]["kind"], "trial_ref")
            baseline_row = summary["windows"][0]["baseline"]
            self.assertEqual(
                baseline_row["config_patch"]["flag_patch"]["data-parallel-size"],
                2,
            )

    def test_run_compare_window_selector_filters_windows(self) -> None:
        """A ``window_selector`` on trace type and date prefix should keep only ``chat_w2``.

        Three windows are registered: the original one re-dated to 2026-03-11,
        a chat window and a thinking window both dated 2026-03-12.
        """

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Stand-in for ``run_trial``: write a fixed completed result and return it."""
            trial_spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            outcome = {
                "study_id": trial_spec["study_id"],
                "trial_id": trial_spec["trial_id"],
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 1.0,
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            Path(trial_spec["result_path"]).write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            study_path = _write_study_assets(workdir)
            windows_root = workdir / "trace_windows"
            trace_dir = windows_root / "traces"

            # One single-request trace file per additional window.
            for trace_name in ("chat_w2.jsonl", "thinking_w3.jsonl"):
                request_line = json.dumps(
                    {
                        "request_id": trace_name,
                        "timestamp": 0.0,
                        "sampling_u": 0.2,
                        "messages": [{"role": "user", "content": trace_name}],
                        "input_length": 3000,
                        "output_length": 32,
                    }
                )
                (trace_dir / trace_name).write_text(request_line + "\n", encoding="utf-8")

            windows_path = windows_root / "windows.json"
            windows_doc = json.loads(windows_path.read_text(encoding="utf-8"))
            window_rows = windows_doc["windows"]
            window_rows[0]["date"] = "2026-03-11"
            window_rows[0]["slot_token"] = "1000"
            window_rows.extend(
                [
                    {
                        "window_id": "chat_w2",
                        "trace_type": "chat",
                        "trace_file": "traces/chat_w2.jsonl",
                        "window_start": 0.0,
                        "window_end": 10.0,
                        "date": "2026-03-12",
                        "slot_token": "1000",
                    },
                    {
                        "window_id": "thinking_w3",
                        "trace_type": "thinking",
                        "trace_file": "traces/thinking_w3.jsonl",
                        "window_start": 0.0,
                        "window_end": 10.0,
                        "date": "2026-03-12",
                        "slot_token": "1000",
                    },
                ]
            )
            windows_path.write_text(json.dumps(windows_doc), encoding="utf-8")

            compare_path = _write_compare_assets(
                workdir,
                study_path=study_path,
                window_selector={"trace_type": "chat", "date_prefix": "2026-03-12"},
            )

            with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial):
                summary = run_compare(compare_path, output_root=workdir / ".compare")

            selected_ids = [row["window_id"] for row in summary["windows"]]
            self.assertEqual(selected_ids, ["chat_w2"])

    def test_proposal_expected_effects_accepts_string(self) -> None:
        """A plain-string ``expected_effects`` is expected to load as a one-item list."""
        payload = {
            "observation": "obs",
            "diagnosis": "diag",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": "higher throughput",
        }
        parsed = Proposal.from_dict(payload)
        self.assertEqual(parsed.expected_effects, ["higher throughput"])

    def test_proposal_expected_effects_accepts_object(self) -> None:
        """A mapping for ``expected_effects`` is expected to load as ``"key: value"`` strings."""
        effects_by_metric = {
            "throughput": "higher",
            "ttft": "lower",
        }
        payload = {
            "observation": "obs",
            "diagnosis": "diag",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": effects_by_metric,
        }
        parsed = Proposal.from_dict(payload)
        wanted = ["throughput: higher", "ttft: lower"]
        self.assertEqual(parsed.expected_effects, wanted)

    def test_proposal_observation_accepts_object(self) -> None:
        """A mapping ``observation`` is accepted and its JSON-style text shows up in the field."""
        structured_observation = {
            "incumbent_trial": "trial-0002",
            "boundary_signal": "tpot cliff",
        }
        payload = {
            "observation": structured_observation,
            "diagnosis": "validate incumbent",
            "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}},
            "expected_effects": ["more TPOT headroom"],
        }
        parsed = Proposal.from_dict(payload)
        # The observation must contain the key/value pair in JSON notation.
        self.assertIn('"incumbent_trial": "trial-0002"', parsed.observation)
        self.assertEqual(parsed.diagnosis, "validate incumbent")

    def test_proposal_accepts_should_stop(self) -> None:
        """``should_stop: True`` in the payload is carried onto the parsed proposal."""
        payload = {
            "observation": "obs",
            "diagnosis": "converged",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": ["avoid wasting another GPU trial"],
            "should_stop": True,
        }
        parsed = Proposal.from_dict(payload)
        self.assertTrue(parsed.should_stop)

    def test_parse_proposal_text_accepts_wrapped_json(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            study_path = _write_study_assets(tmp_path)
            study = load_study_spec(study_path)
            proposal = parse_proposal_text(
                """Here is the proposal:
```json
{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
```""",
                study,
            )
            self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32)

    def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
        """Replay stops early once a failure puts the target pass rate out of reach.

        Three requests are replayed with ``target_pass_rate=0.95``; the first
        one fails. The test expects an early stop with reason
        ``slo_pass_rate_unrecoverable`` and all three requests reported.
        """
        trace = []
        for index in range(3):
            trace.append(
                TraceRequest(
                    row_id=f"r{index}",
                    arrival_s=0.0,
                    sampling_u=0.1 * index,
                    body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
                    prompt_tokens_hint=8,
                    completion_tokens_hint=4,
                )
            )

        # Only one outcome is scripted; a second real request would fail on pop().
        scripted_outcomes = [
            RequestOutcome(
                request_id="r0",
                success=False,
                ttft_ms=None,
                tpot_ms=None,
                prompt_tokens=8,
                completion_tokens=4,
                error="request_failed",
            )
        ]

        def fake_run_one_request(*args, **kwargs):
            return scripted_outcomes.pop(0)

        def fake_evaluate(outcome: RequestOutcome):
            # Minimal stand-in exposing only the ``passed`` attribute.
            verdict_type = type("Eval", (), {"passed": outcome.success})
            return verdict_type()

        replay_kwargs = {
            "base_url": "http://127.0.0.1:8000",
            "timeout_s": 1.0,
            "max_concurrency": 1,
            "target_pass_rate": 0.95,
            "max_lag_s": None,
            "max_elapsed_s": None,
            "evaluate_outcome": fake_evaluate,
        }
        request_patch = mock.patch(
            "aituner.worker._run_one_request",
            side_effect=fake_run_one_request,
        )
        with request_patch:
            replayed, early_stopped, reason = _replay_requests(trace, **replay_kwargs)

        self.assertTrue(early_stopped)
        self.assertEqual(reason, "slo_pass_rate_unrecoverable")
        self.assertEqual(len(replayed), 3)
        self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")

    def test_replay_requests_does_not_wait_for_inflight_after_early_stop(self) -> None:
        """With ``drain_inflight_on_early_stop=False`` a pending future is never awaited.

        Two requests go through a fake executor: ``r0`` completes with a
        failure and ``r1`` stays pending. The replay is expected to stop early
        and tag ``r1`` with the stop reason instead of calling ``result()`` on
        its future.
        """
        requests = [
            TraceRequest(
                row_id=row_id,
                arrival_s=0.0,
                sampling_u=sampling_u,
                body={"model": "m", "messages": [{"role": "user", "content": content}]},
                prompt_tokens_hint=8,
                completion_tokens_hint=4,
            )
            for row_id, sampling_u, content in (("r0", 0.1, "x"), ("r1", 0.2, "y"))
        ]

        class FakeFuture:
            def __init__(self, outcome=None, *, should_fail_if_waited=False):
                self._payload = outcome
                self._forbid_wait = should_fail_if_waited

            def result(self, timeout=None):
                if not self._forbid_wait:
                    return self._payload
                raise AssertionError("in-flight future should not be awaited after early stop")

            def cancel(self):
                return True

        failed_outcome = RequestOutcome(
            request_id="r0",
            success=False,
            ttft_ms=None,
            tpot_ms=None,
            prompt_tokens=8,
            completion_tokens=4,
            error="request_failed",
        )
        done_future = FakeFuture(failed_outcome)
        # Calling result() on this future fails the test.
        inflight_future = FakeFuture(should_fail_if_waited=True)

        submitted = []

        class FakeExecutor:
            def __init__(self, max_workers):
                self.max_workers = max_workers

            def submit(self, fn, request, **kwargs):
                submitted.append(request.row_id)
                return done_future if request.row_id == "r0" else inflight_future

            def shutdown(self, wait=False, cancel_futures=True):
                return None

        def fake_wait(futures, timeout=None, return_when=None):
            # Both requests must already be submitted when wait() is reached.
            self.assertEqual(len(futures), 2)
            finished, pending = {done_future}, {inflight_future}
            return finished, pending

        def fake_evaluate(outcome: RequestOutcome):
            verdict_type = type("Eval", (), {"passed": outcome.success})
            return verdict_type()

        executor_patch = mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor)
        wait_patch = mock.patch("aituner.worker.wait", side_effect=fake_wait)
        with executor_patch, wait_patch:
            replayed, early_stopped, reason = _replay_requests(
                requests,
                base_url="http://127.0.0.1:8000",
                timeout_s=30.0,
                max_concurrency=2,
                target_pass_rate=0.95,
                max_lag_s=None,
                max_elapsed_s=None,
                evaluate_outcome=fake_evaluate,
                drain_inflight_on_early_stop=False,
            )

        self.assertEqual(submitted, ["r0", "r1"])
        self.assertTrue(early_stopped)
        self.assertEqual(reason, "slo_pass_rate_unrecoverable")
        self.assertEqual(len(replayed), 2)
        self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")

    def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:
        """Replay gives up on a never-finishing request once ``max_elapsed_s`` is exceeded.

        A single request is submitted to a fake executor whose future never
        completes. With a scripted clock and ``max_elapsed_s=1.0`` the test
        expects an early stop with reason ``probe_elapsed_s>1.0``, the request
        tagged with that reason, and the first ``wait`` timeout to be at most
        0.5 seconds.
        """
        requests = [
            TraceRequest(
                row_id="r0",
                arrival_s=0.0,
                sampling_u=0.1,
                body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
                prompt_tokens_hint=8,
                completion_tokens_hint=4,
            )
        ]

        class FakeFuture:
            # Any attempt to collect the result fails the test.
            def result(self, timeout=None):
                raise AssertionError("future should not be awaited after elapsed early stop")

            def cancel(self):
                return True

        # Row ids in the order they were handed to the fake executor.
        submitted = []

        class FakeExecutor:
            def __init__(self, max_workers):
                self.max_workers = max_workers

            def submit(self, fn, request, **kwargs):
                submitted.append(request.row_id)
                return FakeFuture()

            def shutdown(self, wait=False, cancel_futures=True):
                return None

        # Timeouts passed to the patched wait(), in call order.
        wait_timeouts: list[float] = []

        def fake_wait(futures, timeout=None, return_when=None):
            wait_timeouts.append(timeout)
            # Report nothing done: every future stays pending.
            return set(), set(futures)

        def fake_evaluate(outcome: RequestOutcome):
            return type("Eval", (), {"passed": outcome.success})()

        # Scripted clock readings; the last one (1.2) is past max_elapsed_s=1.0.
        # NOTE(review): which call consumes which value depends on
        # _replay_requests internals that are not visible here — confirm
        # before changing this sequence.
        monotonic_values = iter([0.0, 0.0, 0.4, 1.2])

        with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor):
            with mock.patch("aituner.worker.wait", side_effect=fake_wait):
                with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)):
                    replayed, early_stopped, reason = _replay_requests(
                        requests,
                        base_url="http://127.0.0.1:8000",
                        timeout_s=30.0,
                        max_concurrency=1,
                        target_pass_rate=0.95,
                        max_lag_s=None,
                        max_elapsed_s=1.0,
                        evaluate_outcome=fake_evaluate,
                        drain_inflight_on_early_stop=False,
                    )

        self.assertEqual(submitted, ["r0"])
        self.assertTrue(early_stopped)
        self.assertEqual(reason, "probe_elapsed_s>1.0")
        self.assertEqual(len(replayed), 1)
        self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")
        # wait() must have been called at least once, with a bounded timeout.
        self.assertTrue(wait_timeouts)
        self.assertLessEqual(wait_timeouts[0], 0.5)

    def test_latency_summary_reports_quantiles_and_slo(self) -> None:
        """``_latency_summary`` reports counts, TTFT/TPOT statistics and the SLO target.

        Two successful outcomes (TTFT 100/200 ms, TPOT 10/20 ms) are evaluated
        against the study SLO and summarized; the test pins the mean, p50 and
        p99 values as well as the request mode and target pass rate.
        """
        outcomes = [
            RequestOutcome(
                request_id="r1",
                success=True,
                ttft_ms=100.0,
                tpot_ms=10.0,
                prompt_tokens=100,
                completion_tokens=10,
            ),
            RequestOutcome(
                request_id="r2",
                success=True,
                ttft_ms=200.0,
                tpot_ms=20.0,
                prompt_tokens=5000,
                completion_tokens=10,
            ),
        ]
        # A self-cleaning directory avoids the leftover folder that a bare
        # tempfile.mkdtemp() would leave behind after every test run.
        with tempfile.TemporaryDirectory() as tmp:
            study = load_study_spec(_write_study_assets(Path(tmp)))
            evaluations = [evaluate_request(item, study.slo) for item in outcomes]
            summary = _latency_summary(outcomes=outcomes, evaluations=evaluations, study=study)
        self.assertEqual(summary["observed_request_count"], 2)
        self.assertEqual(summary["request_mode"], "chat")
        self.assertEqual(summary["ttft_ms"]["mean"], 150.0)
        self.assertEqual(summary["ttft_ms"]["p50"], 100.0)
        self.assertEqual(summary["ttft_ms"]["p99"], 200.0)
        self.assertEqual(summary["tpot_ms"]["mean"], 15.0)
        self.assertEqual(summary["slo"]["target_pass_rate"], 0.95)

    def test_wait_for_server_or_exit_fails_fast_when_process_exits(self) -> None:
        """A process whose ``poll()`` already returns an exit code raises ``RuntimeError``."""
        exited_engine = mock.Mock()
        exited_engine.poll.return_value = 17
        expected_message = "engine_process_exited_before_ready exit_code=17"
        readiness_kwargs = {
            "base_url": "http://127.0.0.1:8000",
            "healthcheck_path": "/v1/models",
            "ready_timeout_s": 10.0,
        }
        with self.assertRaisesRegex(RuntimeError, expected_message):
            _wait_for_server_or_exit(exited_engine, **readiness_kwargs)

    def test_terminate_process_tree_kills_process_group(self) -> None:
        """The first ``killpg`` call targets the process group with ``SIGTERM``.

        ``getpgid`` is patched to return 1234 and ``killpg`` to succeed once
        and raise ``ProcessLookupError`` on a second call.
        """
        process = mock.Mock()
        process.pid = 1234
        process.poll.return_value = None
        process.wait.return_value = 0
        with mock.patch("aituner.worker.os.getpgid", return_value=1234):
            with mock.patch(
                "aituner.worker.os.killpg",
                side_effect=[None, ProcessLookupError],
            ) as mock_killpg:
                _terminate_process_tree(process, timeout_s=1.0)
        first_call_args = mock_killpg.call_args_list[0].args
        self.assertEqual(first_call_args[0], 1234)
        # Use the named signal rather than the bare number 15, matching the
        # marker-process test that also asserts on signal.SIGTERM.
        self.assertEqual(first_call_args[1], signal.SIGTERM)

    def test_terminate_process_tree_kills_group_when_parent_already_exited(self) -> None:
        """``killpg`` is still sent to the pid when the parent exited and ``getpgid`` fails.

        The parent's ``wait()`` must not be called in that situation.
        """
        exited_parent = mock.Mock()
        exited_parent.pid = 1234
        exited_parent.poll.return_value = 0
        getpgid_patch = mock.patch(
            "aituner.worker.os.getpgid",
            side_effect=ProcessLookupError,
        )
        killpg_patch = mock.patch(
            "aituner.worker.os.killpg",
            side_effect=[None, ProcessLookupError],
        )
        with getpgid_patch, killpg_patch as mock_killpg:
            _terminate_process_tree(exited_parent, timeout_s=1.0)
        first_call = mock_killpg.call_args_list[0]
        self.assertEqual(first_call.args[0], 1234)
        exited_parent.wait.assert_not_called()

    def test_terminate_process_tree_signals_marker_processes_when_group_missing(self) -> None:
        """With no process group to signal, PIDs found via ``marker_env`` get ``SIGTERM``.

        ``getpgid`` and ``killpg`` both raise ``ProcessLookupError``; the
        marker scan returns ``[2222]`` first and an empty list afterwards.
        """
        exited_parent = mock.Mock()
        exited_parent.pid = 1234
        exited_parent.poll.return_value = 0
        marker_env = {"AITUNER_TRIAL_ID": "trial-0001"}

        getpgid_patch = mock.patch("aituner.worker.os.getpgid", side_effect=ProcessLookupError)
        killpg_patch = mock.patch("aituner.worker.os.killpg", side_effect=ProcessLookupError)
        scan_patch = mock.patch(
            "aituner.worker._pids_matching_env",
            side_effect=[[2222], []],
        )
        signal_patch = mock.patch("aituner.worker._signal_pids")

        with getpgid_patch, killpg_patch, scan_patch as mock_pids, signal_patch as mock_signal:
            _terminate_process_tree(
                exited_parent,
                timeout_s=1.0,
                marker_env=marker_env,
            )

        first_scan = mock_pids.call_args_list[0]
        first_signal = mock_signal.call_args_list[0]
        self.assertEqual(first_scan.args[0], marker_env)
        self.assertEqual(first_signal.args, ([2222], signal.SIGTERM))

    def test_openai_url_avoids_double_v1(self) -> None:
        """A base URL with or without a trailing ``/v1`` yields a single ``/v1`` segment."""
        endpoint = "/v1/chat/completions"
        expected_url = "http://example.com/v1/chat/completions"
        for base_url in ("http://example.com", "http://example.com/v1"):
            self.assertEqual(_openai_url(base_url, endpoint), expected_url)

    def test_stream_chat_completion_handles_missing_usage_and_chunks(self) -> None:
        """A stream with empty ``choices`` and no usage yields ``None`` metrics.

        The fake response emits one chunk with an empty ``choices`` list and
        then the ``[DONE]`` sentinel.
        """
        stream_lines = [b"data: {\"choices\": []}\n", b"data: [DONE]\n"]

        class FakeResponse:
            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc, traceback):
                return False

            def __iter__(self):
                return iter(stream_lines)

        request_body = {"model": "m", "messages": [{"role": "user", "content": "x"}]}
        urlopen_patch = mock.patch("aituner.http_client._urlopen", return_value=FakeResponse())
        with urlopen_patch:
            metrics = stream_chat_completion(
                base_url="http://127.0.0.1:8000",
                body=request_body,
                timeout_s=1.0,
            )

        self.assertIsNone(metrics.ttft_ms)
        self.assertIsNone(metrics.tpot_ms)
        self.assertIsNone(metrics.completion_tokens)
        self.assertEqual(metrics.completion_tokens_source, "none")

    def test_loopback_urls_bypass_proxy(self) -> None:
        """``127.0.0.1`` and ``localhost`` URLs bypass the proxy; a public host does not."""
        loopback_urls = (
            "http://127.0.0.1:8000/v1/models",
            "http://localhost:8000/health",
        )
        for url in loopback_urls:
            self.assertTrue(_should_bypass_proxy(url))
        self.assertFalse(_should_bypass_proxy("http://example.com/v1/models"))


# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()