from __future__ import annotations import json import hashlib import contextlib import io import math import os import signal import subprocess import tempfile import unittest from pathlib import Path from unittest import mock from aituner.cli import main as cli_main from aituner.compare import _aggregate_summary, load_compare_spec, run_compare from aituner.config_signature import materialized_effective_config_signature from aituner.engine import build_launch_recipe from aituner.http_client import ( HttpClientError, StreamMetrics, _auth_headers, _openai_url, _should_bypass_proxy, stream_chat_completion, ) from aituner.job import append_job, build_trial_job from aituner.harness import ( _effective_config_signature, build_harness_context, build_harness_guided_proposal, build_harness_stop_proposal, ) from aituner.lca import ( build_study_workload_profile, build_workload_profile, find_convergence_prefix, profile_similarity, resolve_length_mode, similarity_report, ) from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal from aituner.search import ThresholdProbe, binary_search_max_feasible from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations from aituner.spec import ( AdaptiveStopSpec, ConfigPatch, LLMEndpointSpec, Proposal, SloSpec, SpecError, StudyState, TrialSummary, load_study_spec, ) from aituner.store import StudyStore, resolve_auto_high_search from aituner.trace import load_trace_requests, summarize_window from aituner.worker import ( _adaptive_replay_set, _probe_drain_deadline, _install_sigterm_as_keyboardinterrupt, _restore_sigterm, _should_extend_on_boundary, _best_feasible_probe_record, _latency_summary, _run_one_request, _replay_requests, _terminate_process_tree, _wait_for_server_or_exit, run_trial, ) from aituner.trace import TraceRequest, WindowRecord REPO_ROOT = Path(__file__).resolve().parents[1] def _write_study_assets( tmp_path: Path, *, trace_overrides: dict[str, object] | None = None, slo_overrides: dict[str, object] | None = None, engine_overrides: dict[str, object] | None = None, search_overrides: dict[str, object] | None = None, ) -> Path: trace_dir = tmp_path / "trace_windows" / "traces" trace_dir.mkdir(parents=True) trace_path = trace_dir / "chat_w1.jsonl" rows = [ { "request_id": "r1", "timestamp": 0.0, "sampling_u": 0.10, "messages": [{"role": "user", "content": "hello"}], "input_length": 1000, "output_length": 16 }, { "request_id": "r2", "timestamp": 1.0, "sampling_u": 0.50, "messages": [{"role": "user", "content": "world"}], "input_length": 5000, "output_length": 32 }, { "request_id": "r3", "timestamp": 2.0, "sampling_u": 0.90, "messages": [{"role": "user", "content": "!"}], "input_length": 20000, "output_length": 64 } ] with trace_path.open("w", encoding="utf-8") as handle: for row in rows: handle.write(json.dumps(row) + "\n") windows_path = tmp_path / "trace_windows" / "windows.json" windows_payload = { "u_field": "sampling_u", "windows": [ { "window_id": "chat_w1", "trace_type": "chat", "trace_file": "traces/chat_w1.jsonl", "window_start": 0.0, "window_end": 10.0 } ] } windows_path.write_text(json.dumps(windows_payload), encoding="utf-8") capability_path = tmp_path / "capability.json" capability_path.write_text( json.dumps({"prefill_service_by_bucket": {"4k": {"tp4_ms": 320, "tp8_ms": 240}}}), encoding="utf-8", ) study_path = tmp_path / "study.json" trace_payload: dict[str, object] = { "windows_path": str(windows_path), "window_id": "chat_w1", "u_field": "sampling_u", "timestamp_field": "timestamp", "max_concurrency": 4, } if trace_overrides: trace_payload.update(trace_overrides) study_payload = { "study_id": "study-1", "hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]}, "model": { "model_id": "qwen", "served_model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507" }, "engine": { "engine_name": "vllm", "engine_version": "0.1", "exec_path": "/usr/local/bin/vllm", "cwd": str(tmp_path), "host": "127.0.0.1", "port": 8000, "healthcheck_path": "/v1/models", "ready_timeout_s": 30, "request_timeout_s": 30, "launch_args": ["serve", "/models/qwen"], "base_envs": {"BASE_ENV": "1"}, "base_flags": {"host": "127.0.0.1", "port": 8000}, "tunable_envs": ["VLLM_ATTENTION_BACKEND"], "tunable_flags": ["tensor-parallel-size", "max-num-seqs"], "python_executable": "python3" }, "trace": trace_payload, "slo": { "target_pass_rate": 0.95, "ttft_rule": { "kind": "step_ms", "buckets": [ {"max_input_tokens": 4096, "threshold_ms": 2000}, {"max_input_tokens": 16384, "threshold_ms": 5000}, {"threshold_ms": 9000} ] }, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 120} }, "search": { "low": 0.0, "high": 1.0, "tolerance": 0.01, "max_probes": 8, "sample_seed": 20260325 }, "llm": {"system_prompt": "Tune it.", "max_history_trials": 8}, "capability_profile_path": str(capability_path) } if slo_overrides: study_payload["slo"].update(slo_overrides) if engine_overrides: study_payload["engine"].update(engine_overrides) if search_overrides: study_payload["search"].update(search_overrides) study_path.write_text(json.dumps(study_payload), encoding="utf-8") return study_path def _write_compare_assets( tmp_path: Path, *, study_path: Path, window_ids: list[str] | None = None, window_selector: dict[str, object] | None = None, baseline: dict[str, object] | None = None, tuned: dict[str, object] | None = None, ) -> Path: compare_path = tmp_path / "compare.json" payload: dict[str, object] = { "compare_id": "compare-1", "study_spec_path": str(study_path), "baseline": baseline or {"config_patch": {"env_patch": {}, "flag_patch": {}}}, "tuned": tuned or { "config_patch": { "env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}, } }, } if window_ids is not None: payload["window_ids"] = window_ids if window_selector is not None: payload["window_selector"] = window_selector compare_path.write_text(json.dumps(payload), encoding="utf-8") return compare_path class CoreFlowTests(unittest.TestCase): def test_trace_and_prompt_flow(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") study_root = store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) window, requests = load_trace_requests(study, study_spec_path=study_path) summary = summarize_window(requests, window) self.assertEqual(summary["request_count"], 3) self.assertEqual(summary["request_rate"], 0.3) prompt = build_prompt( study=study, window_summary=summary, state=state, capability_profile={"queueing_knee_by_bucket": {"4k": 1000}}, ) self.assertIn("allowed_flag_keys", prompt) self.assertIn("study-1", prompt) self.assertIn('"current_best"', prompt) self.assertIn("queueing_knee_by_bucket", prompt) self.assertIn("Harnesses:", prompt) self.assertIn("workload_lca_profile", prompt) self.assertIn("knob_harnesses", prompt) self.assertTrue(study_root.exists()) def test_search_auto_high_schema_is_backward_compatible(self) -> None: with tempfile.TemporaryDirectory() as tmp: study_path = _write_study_assets( Path(tmp), search_overrides={"high": 0.4}, ) study = load_study_spec(study_path) self.assertFalse(study.search.auto_high.enabled) updated, evidence = resolve_auto_high_search( search=study.search, sampling_us=[0.1, 0.9], ) self.assertEqual(updated.high, 0.4) self.assertEqual(evidence["reason"], "auto_high_disabled") def test_search_auto_high_caps_at_policy_and_trace(self) -> None: with tempfile.TemporaryDirectory() as tmp: study_path = _write_study_assets( Path(tmp), search_overrides={ "high": 0.2, "auto_high": { "enabled": True, "max_sampling_u": 0.8, "require_human_confirmation_beyond_trace": True, }, }, ) study = load_study_spec(study_path) capped_by_policy, policy_evidence = resolve_auto_high_search( search=study.search, sampling_us=[0.1, 0.9], ) self.assertEqual(capped_by_policy.high, 0.8) self.assertEqual( policy_evidence["reason"], "search_high_raised_to_trace_ceiling", ) capped_by_trace, trace_evidence = resolve_auto_high_search( search=study.search, sampling_us=[0.1, 0.7], ) self.assertEqual(capped_by_trace.high, 0.7) self.assertEqual(trace_evidence["effective_ceiling"], 0.7) low_above_ceiling = study.search.__class__.from_dict( { "low": 0.9, "high": 0.95, "tolerance": study.search.tolerance, "max_probes": study.search.max_probes, "sample_seed": study.search.sample_seed, "auto_high": { "enabled": True, "max_sampling_u": 0.8, "require_human_confirmation_beyond_trace": True, }, } ) unchanged, invalid_evidence = resolve_auto_high_search( search=low_above_ceiling, sampling_us=[0.1, 0.9], ) self.assertEqual(unchanged.low, 0.9) self.assertEqual(unchanged.high, 0.95) self.assertEqual( invalid_evidence["reason"], "auto_high_ceiling_below_search_low", ) high_search = study.search.__class__.from_dict( { "low": 0.0, "high": 0.95, "tolerance": study.search.tolerance, "max_probes": study.search.max_probes, "sample_seed": study.search.sample_seed, "auto_high": { "enabled": True, "max_sampling_u": 0.8, "require_human_confirmation_beyond_trace": True, }, } ) lowered, lowered_evidence = resolve_auto_high_search( search=high_search, sampling_us=[0.1, 0.9], ) self.assertEqual(lowered.high, 0.8) self.assertEqual( lowered_evidence["reason"], "search_high_lowered_to_trace_ceiling", ) def test_effective_config_signature_treats_noop_patch_as_baseline(self) -> None: with tempfile.TemporaryDirectory() as tmp: study_path = _write_study_assets( Path(tmp), engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, "gpu-memory-utilization": 0.5, "max-num-seqs": 8, }, }, ) study = load_study_spec(study_path) baseline = _effective_config_signature(study, {"env_patch": {}, "flag_patch": {}}) noop_tp = _effective_config_signature( study, {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 8}}, ) noop_tp_string = _effective_config_signature( study, {"env_patch": {}, "flag_patch": {"tensor-parallel-size": "8"}}, ) changed_tp = _effective_config_signature( study, {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}}, ) self.assertEqual(baseline, noop_tp) self.assertEqual(baseline, noop_tp_string) self.assertNotEqual(baseline, changed_tp) def test_materialized_signature_inherits_incumbent_topology(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 4, "data-parallel-size": 2, "max-num-seqs": 64, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-seqs", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_tp_dp_products": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_parallel_size=8, trials=[ TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 4, "max-num-seqs": 160, }, }, ) ], ) runtime_only = Proposal.from_dict( { "observation": "Try the same runtime cap.", "diagnosis": "This should materialize on incumbent topology.", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}}, "expected_effects": ["no-op after topology inheritance"], } ) explicit = Proposal.from_dict( { "observation": "Explicit duplicate.", "diagnosis": "Same effective execution config.", "config_patch": { "env_patch": {}, "flag_patch": { "tensor-parallel-size": "2", "data-parallel-size": "4", "max-num-seqs": "160", }, }, "expected_effects": ["same config"], } ) self.assertEqual( materialized_effective_config_signature( study=study, state=state, proposal=runtime_only, ), materialized_effective_config_signature( study=study, state=state, proposal=explicit, ), ) def test_lca_workload_profile_uses_standard_10d_features(self) -> None: window = WindowRecord( window_id="w1", trace_path=Path("trace.jsonl"), trace_type="chat", window_start=0.0, window_end=4.0, source_payload={"block_size": 64}, ) requests = [ TraceRequest( row_id="r1", arrival_s=0.0, sampling_u=1.0, body={}, prompt_tokens_hint=100, completion_tokens_hint=10, metadata={"hash_ids": [1, 2]}, ), TraceRequest( row_id="r2", arrival_s=1.0, sampling_u=1.0, body={}, prompt_tokens_hint=100, completion_tokens_hint=20, metadata={"hash_ids": [1, 3]}, ), ] profile = build_workload_profile( requests, window, gpu_count=2, length_mode="total", ) self.assertEqual(len(profile.feature_names), 10) self.assertEqual(len(profile.vector), 10) self.assertEqual(profile.feature_names[0], "L.log_mean_length") self.assertAlmostEqual(profile.stats["cache"]["total_hit_length"], 64.0) self.assertAlmostEqual(profile.stats["cache"]["hit_rate"], 64.0 / 230.0) self.assertAlmostEqual(profile.stats["cache"]["input_hit_rate"], 64.0 / 200.0) self.assertAlmostEqual(profile.vector[3], math.log1p(32.0)) self.assertAlmostEqual(profile.vector[5], 1.0) self.assertAlmostEqual(profile.stats["arrival"]["request_rate_per_gpu"], 0.25) self.assertAlmostEqual(profile.stats["arrival"]["fano_1s"], 0.5) self.assertEqual(resolve_length_mode(request_mode="decode_only"), "output") def test_harness_context_uses_canonical_lca_vector(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) window, requests = load_trace_requests(study, study_spec_path=study_path) profile = build_study_workload_profile(study, requests, window) state = StudyState(study_id=study.study_id, trials=[]) summary = summarize_window(requests, window) context = build_harness_context( study=study, window_summary=summary, state=state, workload_profile=profile, ) block = context["workload_lca_profile"] # The labeled L-C-A block is the canonical 10-dim metric, not ad-hoc. self.assertEqual(block["vector"], profile.vector) self.assertEqual(len(block["vector"]), 10) self.assertIn("RobustScaler", block["metric"]) # Without a profile it falls back to the legacy ad-hoc rendering. legacy = build_harness_context( study=study, window_summary=summary, state=state, )["workload_lca_profile"] self.assertNotIn("vector", legacy) def _steady_requests(self, count: int, *, input_tokens: int = 100) -> list: return [ TraceRequest( row_id=f"r{i}", arrival_s=float(i), sampling_u=1.0, body={}, prompt_tokens_hint=input_tokens, completion_tokens_hint=16, metadata={"hash_ids": None}, ) for i in range(count) ] def _conv_window(self) -> WindowRecord: return WindowRecord( window_id="conv", trace_path=Path("trace.jsonl"), trace_type="chat", window_start=0.0, window_end=0.0, source_payload={"block_size": 64}, ) def test_convergence_prefix_stops_early_on_stationary_trace(self) -> None: requests = self._steady_requests(60) point = find_convergence_prefix( requests, self._conv_window(), gpu_count=1, length_mode="total", tau=0.9, tau_c=0.9, stable_checks=3, max_checks=20, min_fraction=0.1, ) self.assertTrue(point.converged) # A stationary workload should be trustworthy well before the full window. self.assertLess(point.stop_index, len(requests)) self.assertLess(point.fraction, 1.0) self.assertTrue(point.checks) def test_convergence_prefix_waits_when_cache_warms_late(self) -> None: window = self._conv_window() # First half: no prefix reuse. Second half: every request reuses block 1, # so the C dimension only stabilizes once the reuse regime is exercised. requests = [] for i in range(30): requests.append( TraceRequest( row_id=f"cold{i}", arrival_s=float(i), sampling_u=1.0, body={}, prompt_tokens_hint=640, completion_tokens_hint=16, metadata={"hash_ids": [10_000 + i]}, ) ) for i in range(30): requests.append( TraceRequest( row_id=f"warm{i}", arrival_s=float(30 + i), sampling_u=1.0, body={}, prompt_tokens_hint=640, completion_tokens_hint=16, metadata={"hash_ids": [1, 2, 3, 4, 5]}, ) ) point = find_convergence_prefix( requests, window, gpu_count=1, length_mode="total", tau=0.9, tau_c=0.95, stable_checks=2, max_checks=20, min_fraction=0.1, ) # The C family similarity must be low while only the cold half is seen. early = [c for c in point.checks if c["fraction"] <= 0.4] self.assertTrue(early) self.assertTrue(any(c["family_similarity"]["C"] < 0.9 for c in early)) def test_stop_authority_mirrors_validator_and_blocks_fresh_stop(self) -> None: with tempfile.TemporaryDirectory() as tmp: study = load_study_spec(_write_study_assets(Path(tmp))) state = StudyState(study_id=study.study_id, trials=[]) context = build_harness_context( study=study, window_summary={}, state=state, ) authority = context["stop_authority"] # The authority is the deterministic validator; with no completed # trials it must not authorize a stop. self.assertEqual( authority["authorized"], context["harness_stop"]["should_stop"] ) self.assertFalse(authority["authorized"]) def test_adaptive_replay_set_truncates_only_when_enabled(self) -> None: from types import SimpleNamespace requests = self._steady_requests(60) window = self._conv_window() enabled_study = SimpleNamespace( trace=SimpleNamespace( adaptive_stop=AdaptiveStopSpec( enabled=True, tau=0.9, tau_c=0.9, stable_checks=3, max_checks=20, min_fraction=0.1, ), request_mode="chat", ), hardware=SimpleNamespace(gpu_count=1), ) replay, certificate = _adaptive_replay_set( requests, study=enabled_study, window=window ) self.assertIsNotNone(certificate) self.assertTrue(certificate["enabled"]) self.assertEqual(len(replay), certificate["stop_index"]) self.assertLessEqual(len(replay), len(requests)) disabled_study = SimpleNamespace( trace=SimpleNamespace( adaptive_stop=AdaptiveStopSpec(enabled=False), request_mode="chat", ), hardware=SimpleNamespace(gpu_count=1), ) passthrough, no_cert = _adaptive_replay_set( requests, study=disabled_study, window=window ) self.assertIsNone(no_cert) self.assertEqual(len(passthrough), len(requests)) def test_boundary_guard_extends_only_near_the_slo_knee(self) -> None: converged = {"converged": True} # Truncated, converged, pass-rate on the knee -> re-measure full. self.assertTrue( _should_extend_on_boundary( pass_rate=0.961, target_pass_rate=0.95, certificate=converged, truncated=True, boundary_delta=0.02, ) ) self.assertTrue( _should_extend_on_boundary( pass_rate=0.946, target_pass_rate=0.95, certificate=converged, truncated=True, boundary_delta=0.02, ) ) # Clearly feasible / clearly infeasible -> trust the truncated verdict. self.assertFalse( _should_extend_on_boundary( pass_rate=0.99, target_pass_rate=0.95, certificate=converged, truncated=True, boundary_delta=0.02, ) ) self.assertFalse( _should_extend_on_boundary( pass_rate=0.50, target_pass_rate=0.95, certificate=converged, truncated=True, boundary_delta=0.02, ) ) # Not truncated, not converged, guard disabled, or no certificate -> no extend. self.assertFalse( _should_extend_on_boundary( pass_rate=0.95, target_pass_rate=0.95, certificate=converged, truncated=False, boundary_delta=0.02, ) ) self.assertFalse( _should_extend_on_boundary( pass_rate=0.95, target_pass_rate=0.95, certificate={"converged": False}, truncated=True, boundary_delta=0.02, ) ) self.assertFalse( _should_extend_on_boundary( pass_rate=0.95, target_pass_rate=0.95, certificate=converged, truncated=True, boundary_delta=0.0, ) ) self.assertFalse( _should_extend_on_boundary( pass_rate=0.95, target_pass_rate=0.95, certificate=None, truncated=True, boundary_delta=0.02, ) ) def test_probe_drain_deadline_tracks_admitted_set_and_caps_at_ceiling(self) -> None: slo = SloSpec.from_dict( { "target_pass_rate": 0.95, "ttft_rule": {"kind": "linear_ms", "intercept_ms": 4000, "per_token_ms": 0.125}, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, } ) def req(arrival_s: float, in_tok: int, out_tok: int) -> TraceRequest: return TraceRequest( row_id="r", arrival_s=arrival_s, sampling_u=0.1, body={}, prompt_tokens_hint=in_tok, completion_tokens_hint=out_tok, metadata={}, ) # 100 requests, last arrival 500s, p99 in=8000 / out=2000. reqs = [req(float(i * 5), 8000, 2000) for i in range(100)] # deadline = last_arrival + (ttft_ms + p99_out*tpot_ms)/1000 + margin # = 495 + (5000 + 2000*50)/1000 + 30 = 495 + 105 + 30 = 630 self.assertAlmostEqual( _probe_drain_deadline(reqs, slo, ceiling=1000.0), 630.0, places=3 ) # Ceiling caps a deadline that would otherwise exceed it. self.assertEqual(_probe_drain_deadline(reqs, slo, ceiling=400.0), 400.0) # No requests or no TPOT rule -> fall back to the ceiling. self.assertEqual(_probe_drain_deadline([], slo, ceiling=400.0), 400.0) def test_linear_ms_ttft_rule_scales_with_input_length(self) -> None: slo = SloSpec.from_dict( { "target_pass_rate": 0.95, "ttft_rule": {"kind": "linear_ms", "intercept_ms": 4000, "per_token_ms": 0.125}, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, } ) def ev(prompt_tokens: int, ttft_ms: float): return evaluate_request( RequestOutcome( request_id="r", success=True, ttft_ms=ttft_ms, tpot_ms=10.0, prompt_tokens=prompt_tokens, completion_tokens=8, ), slo, ) # threshold = 4000 + 0.125*L_in : 8k->5000ms, 0->4000ms self.assertTrue(ev(8000, 4900).passed) self.assertFalse(ev(8000, 5100).passed) self.assertTrue(ev(0, 3900).passed) self.assertFalse(ev(0, 4100).passed) def test_streaming_socket_timeout_is_a_failed_request_not_a_crash(self) -> None: # A request that exceeds request_timeout_s raises TimeoutError mid-stream; # it must surface as HttpClientError (a failed request), never escape to # crash the trial. with mock.patch( "aituner.http_client._urlopen", side_effect=TimeoutError("timed out") ): with self.assertRaises(HttpClientError): stream_chat_completion( base_url="http://127.0.0.1:1/v1", body={"messages": [{"role": "user", "content": "hi"}], "stream": True}, timeout_s=0.5, ) outcome = _run_one_request( TraceRequest( row_id="r", arrival_s=0.0, sampling_u=1.0, body={"messages": [{"role": "user", "content": "hi"}], "stream": True}, prompt_tokens_hint=10, completion_tokens_hint=None, ), base_url="http://127.0.0.1:1/v1", timeout_s=0.5, ) self.assertFalse(outcome.success) self.assertIn("timed out", outcome.error) def test_sigterm_is_converted_to_keyboardinterrupt(self) -> None: # So a killed `study tune` runs the engine-teardown finally instead of # orphaning the vLLM EngineCore workers on the GPUs. import signal as _signal previous = _install_sigterm_as_keyboardinterrupt() try: with self.assertRaises(KeyboardInterrupt): _signal.raise_signal(_signal.SIGTERM) finally: _restore_sigterm(previous) def test_lca_similarity_matrix_separates_different_profiles(self) -> None: window = WindowRecord( window_id="base", trace_path=Path("trace.jsonl"), trace_type="chat", window_start=0.0, window_end=4.0, source_payload={"block_size": 64}, ) def make_profile(window_id: str, input_tokens: int, *, arrival_gap: float) -> object: reqs = [ TraceRequest( row_id=f"{window_id}-1", arrival_s=0.0, sampling_u=1.0, body={}, prompt_tokens_hint=input_tokens, completion_tokens_hint=16, metadata={"hash_ids": [window_id, 1]}, ), TraceRequest( row_id=f"{window_id}-2", arrival_s=arrival_gap, sampling_u=1.0, body={}, prompt_tokens_hint=input_tokens, completion_tokens_hint=16, metadata={"hash_ids": [window_id, 1, 2]}, ), ] return build_workload_profile( reqs, WindowRecord( window_id=window_id, trace_path=window.trace_path, trace_type=window.trace_type, window_start=window.window_start, window_end=window.window_end, source_payload=window.source_payload, ), gpu_count=1, length_mode="total", ) p1 = make_profile("same-a", 100, arrival_gap=1.0) p2 = make_profile("same-b", 100, arrival_gap=1.0) p3 = make_profile("different", 10000, arrival_gap=0.1) report = similarity_report([p1, p2, p3]) self.assertAlmostEqual(profile_similarity(p1, p2), 1.0) self.assertGreater(report["matrix"][0][1], report["matrix"][0][2]) self.assertIn("L", report["pairs"][2]["family_similarity"]) def test_cli_profile_window_outputs_lca_profile(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) stdout = io.StringIO() with mock.patch("sys.stdout", stdout): rc = cli_main( [ "profile", "window", "--spec", str(study_path), "--gpu-count", "8", ] ) self.assertEqual(rc, 0) payload = json.loads(stdout.getvalue()) self.assertEqual(payload["profile"]["window_id"], "chat_w1") self.assertEqual(len(payload["profile"]["vector"]), 10) self.assertEqual(payload["profile"]["gpu_count"], 8) def test_cli_profile_window_does_not_resolve_llm_endpoint(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["llm"]["endpoint"] = { "provider": "codex", "model": "gpt-5.4", } study_path.write_text(json.dumps(payload), encoding="utf-8") stdout = io.StringIO() with mock.patch("sys.stdout", stdout): rc = cli_main(["profile", "window", "--spec", str(study_path)]) self.assertEqual(rc, 0) self.assertEqual(json.loads(stdout.getvalue())["profile"]["window_id"], "chat_w1") def test_harness_uses_latency_failures_before_generic_unrecoverable(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) result_path = tmp_path / "trial-result.json" result_path.write_text( json.dumps( { "status": "completed", "probes": [ { "threshold": 0.25, "feasible": False, "payload": { "request_count": 100, "pass_rate": 0.3, "request_rate": 1.0, "early_stopped": True, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": { "ttft_ms>5000.0": 70, "tpot_ms>50.0": 5, "probe_elapsed_s>240.0": 100, }, "ttft_ms": {"p95": 6500.0, "p99": 7200.0}, }, }, } ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, trials=[ TrialSummary( trial_id="trial-0001", status="completed", result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) context = build_harness_context( study=study, window_summary={ "prompt_tokens_p95": 5000, "prompt_tail_ratio_p95_p50": 3.0, }, state=state, ) self.assertEqual( context["recent_trial_diagnostics"][0]["active_bottleneck"], "ttft_prefill", ) def test_harness_blocks_repeating_infeasible_plateau_family(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_expert_parallel_sizes": [1], "allowed_tp_dp_products": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) trial_summaries = [] for index, (dp, pass_rate, p95) in enumerate( [(4, 0.345, 3818.4), (8, 0.345, 3823.4)], start=3 ): result_path = tmp_path / f"trial-{index:04d}.json" result_path.write_text( json.dumps( { "status": "completed", "best_request_rate": None, "all_infeasible_diagnostics": { "threshold": 0.0078125, "request_count": 148, "request_rate": 0.22, "pass_rate": pass_rate, "early_stopped": True, "early_stop_reason": "elapsed", "latency_summary": { "failed_reason_counts": {"ttft_ms>5000.0": 97}, "ttft_ms": {"p95": p95, "p99": 5800.0}, }, }, } ), encoding="utf-8", ) trial_summaries.append( TrialSummary( trial_id=f"trial-{index:04d}", status="completed", result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 1, "data-parallel-size": dp, "expert-parallel-size": 1, }, }, ) ) context = build_harness_context( study=study, window_summary={ "prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.83, }, state=StudyState(study_id=study.study_id, trials=trial_summaries), ) guard = context["convergence_guard"]["infeasible_progress"] self.assertTrue(guard["plateau_detected"]) self.assertTrue(guard["stop_if_next_probe_repeats_family"]) self.assertEqual(guard["blocked_primary_family"], "data-parallel-size") self.assertTrue( context["convergence_guard"][ "should_stop_if_no_harness_can_justify_a_new_adjacent_probe" ] ) self.assertFalse(context["convergence_guard"]["deterministic_stop"]) self.assertFalse(context["harness_stop"]["should_stop"]) self.assertIsNone(build_harness_stop_proposal(context)) def test_harness_strong_incumbent_guard_after_large_gain(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate_per_gpu=0.21, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=1, best_request_rate=0.035, best_request_rate_per_gpu=0.035, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=2, best_request_rate=0.42, best_request_rate_per_gpu=0.21, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 1, }, }, ), ], ) context = build_harness_context( study=study, window_summary={ "prompt_tokens_p95": 7628, "prompt_tokens_p99": 8102, "prompt_tail_ratio_p95_p50": 3.83, }, state=state, ) guard = context["convergence_guard"]["strong_incumbent"] self.assertTrue(guard["guard_active"]) self.assertGreaterEqual(guard["incumbent_gain_vs_baseline"], 3.0) self.assertFalse( context["convergence_guard"][ "should_stop_if_no_harness_can_justify_a_new_adjacent_probe" ] ) self.assertEqual( context["convergence_guard"]["reason"], "strong_incumbent_requires_validation_probes", ) self.assertIn("validate", guard["recommended_next_action"]) def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_parallel_size=8, best_sampling_u=0.02, best_request_rate=2.4, best_request_rate_per_gpu=0.3, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=0.8, best_request_rate_per_gpu=0.1, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, best_request_rate=2.4, best_request_rate_per_gpu=0.3, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 4, }, }, ), TrialSummary( trial_id="trial-0003", status="completed", parallel_size=8, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 1, "data-parallel-size": 8, }, }, ), TrialSummary( trial_id="trial-0004", status="completed", parallel_size=8, config_patch={ "env_patch": {}, "flag_patch": {"max-num-seqs": 160}, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048}, state=state, ) self.assertTrue(context["harness_stop"]["should_stop"]) self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted") proposal = build_harness_stop_proposal(context) self.assertIsNotNone(proposal) self.assertTrue(proposal.should_stop) def test_harness_stop_after_non_improving_feasible_validation_is_exhausted(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_parallel_size=8, best_sampling_u=0.02, best_request_rate=2.4, best_request_rate_per_gpu=0.3, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=0.8, best_request_rate_per_gpu=0.1, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, best_request_rate=2.4, best_request_rate_per_gpu=0.3, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 4, }, }, ), TrialSummary( trial_id="trial-0003", status="completed", parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 1, "data-parallel-size": 8, }, }, ), TrialSummary( trial_id="trial-0004", status="completed", parallel_size=8, best_request_rate=2.1, best_request_rate_per_gpu=0.2625, config_patch={ "env_patch": {}, "flag_patch": {"max-num-seqs": 160}, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048}, state=state, ) self.assertTrue(context["harness_stop"]["should_stop"]) self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted") def test_harness_stop_after_gmu_incumbent_and_non_improving_topology_validation(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "gpu-memory-utilization", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2], "allowed_tp_dp_products": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0007", best_request_rate=6.8667, best_request_rate_per_gpu=3.4333, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=2.2, best_request_rate_per_gpu=2.2, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=6.5167, best_request_rate_per_gpu=3.2583, config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}, }, ), TrialSummary( trial_id="trial-0003", status="completed", best_request_rate=8.3667, best_request_rate_per_gpu=2.0917, config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}, }, ), TrialSummary( trial_id="trial-0007", status="completed", best_request_rate=6.8667, best_request_rate_per_gpu=3.4333, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "gpu-memory-utilization": 0.97, }, }, ), TrialSummary( trial_id="trial-0008", status="completed", best_request_rate=4.1833, best_request_rate_per_gpu=1.0458, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "data-parallel-size": 2, }, }, ), TrialSummary( trial_id="trial-0009", status="completed", best_request_rate=8.3667, best_request_rate_per_gpu=1.0458, config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 8}, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 1500}, state=state, ) self.assertTrue(context["harness_stop"]["should_stop"]) self.assertEqual( context["harness_stop"]["reason"], "post_incumbent_validation_exhausted", ) proposal = build_harness_stop_proposal(context) self.assertIsNotNone(proposal) self.assertTrue(proposal.should_stop) def test_harness_validation_uses_full_state_baseline_when_history_window_moves(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={"tunable_flags": ["max-num-seqs"]}, ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0006", best_parallel_size=8, best_request_rate=2.4, best_request_rate_per_gpu=0.3, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=0.8, best_request_rate_per_gpu=0.1, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, best_request_rate=0.88, best_request_rate_per_gpu=0.11, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 16}}, ), TrialSummary( trial_id="trial-0003", status="completed", parallel_size=8, best_request_rate=0.96, best_request_rate_per_gpu=0.12, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 24}}, ), TrialSummary( trial_id="trial-0004", status="completed", parallel_size=8, best_request_rate=1.04, best_request_rate_per_gpu=0.13, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 32}}, ), TrialSummary( trial_id="trial-0005", status="completed", parallel_size=8, best_request_rate=2.24, best_request_rate_per_gpu=0.28, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 40}}, ), TrialSummary( trial_id="trial-0006", status="completed", parallel_size=8, best_request_rate=2.4, best_request_rate_per_gpu=0.3, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 48}}, ), TrialSummary( trial_id="trial-0007", status="completed", parallel_size=8, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 56}}, ), TrialSummary( trial_id="trial-0008", status="completed", parallel_size=8, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 64}}, ), TrialSummary( trial_id="trial-0009", status="completed", parallel_size=8, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 72}}, ), TrialSummary( trial_id="trial-0010", status="completed", parallel_size=8, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 80}}, ), TrialSummary( trial_id="trial-0011", status="failed", parallel_size=8, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 88}}, ), TrialSummary( trial_id="trial-0012", status="completed", parallel_size=8, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 96}}, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048}, state=state, ) self.assertTrue(context["harness_stop"]["should_stop"]) self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted") self.assertGreater( context["harness_stop"]["evidence"]["incumbent_gain_vs_baseline"], 2.9, ) def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_parallel_size=8, best_request_rate=2.4, best_request_rate_per_gpu=0.3, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=0.8, best_request_rate_per_gpu=0.1, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, best_request_rate=2.4, best_request_rate_per_gpu=0.3, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 4, }, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048}, state=state, ) self.assertFalse(context["harness_stop"]["should_stop"]) self.assertIsNone(build_harness_stop_proposal(context)) def test_harness_stop_when_incumbent_saturates_search_high(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.99609375, "best_request_rate": 9.0, "best_pass_rate": 1.0, "probes": [ { "threshold": 0.99609375, "feasible": True, "payload": { "request_count": 10, "pass_rate": 1.0, "request_rate": 9.0, "early_stopped": False, "early_stop_reason": "", "latency_summary": {"failed_reason_counts": {}}, }, } ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_request_rate=9.0, best_request_rate_per_gpu=9.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=9.0, best_request_rate_per_gpu=9.0, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048}, state=state, ) self.assertFalse(context["harness_stop"]["should_stop"]) self.assertEqual( context["harness_stop"]["reason"], "search_high_saturation_requires_parallel_size_evidence", ) self.assertEqual( context["harness_stop"]["evidence"]["objective"], "request_rate_per_gpu", ) proposal = build_harness_stop_proposal(context) self.assertIsNone(proposal) def test_harness_stop_allows_feasible_high_probe_with_some_failures(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) result_path = tmp_path / "trial-0004.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.99609375, "best_request_rate": 1.77, "best_pass_rate": 0.968, "probes": [ { "threshold": 0.99609375, "feasible": True, "payload": { "request_count": 1063, "pass_rate": 0.968, "request_rate": 1.77, "early_stopped": False, "early_stop_reason": "", "latency_summary": { "failed_reason_counts": { "tpot_ms>50.0": 34, } }, }, } ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0004", best_request_rate=1.77, best_request_rate_per_gpu=0.4425, trials=[ TrialSummary( trial_id="trial-0004", status="completed", best_request_rate=1.77, best_request_rate_per_gpu=0.4425, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}, }, ) ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048}, state=state, ) self.assertFalse(context["harness_stop"]["should_stop"]) self.assertEqual( context["harness_stop"]["reason"], "search_high_saturation_requires_parallel_size_evidence", ) def test_harness_stop_blocks_high_saturation_for_fixed_product_tp_dp_redistribution( self, ) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, }, "tunable_flags": ["tensor-parallel-size", "data-parallel-size"], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_tp_dp_products": [8], "require_tp_dp_product_equals_gpu_count": True, }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.99609375, "best_request_rate": 8.0, "best_pass_rate": 1.0, "probes": [ { "threshold": 0.99609375, "feasible": True, "payload": { "request_count": 10, "pass_rate": 1.0, "request_rate": 8.0, "early_stopped": False, "early_stop_reason": "", "latency_summary": {"failed_reason_counts": {}}, }, } ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_request_rate=8.0, best_request_rate_per_gpu=1.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=8.0, best_request_rate_per_gpu=1.0, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 8, "data-parallel-size": 1, }, }, ) ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048}, state=state, ) self.assertFalse(context["harness_stop"]["should_stop"]) self.assertEqual( context["harness_stop"]["reason"], "search_high_saturation_requires_parallel_size_evidence", ) def test_harness_does_not_repropose_noop_topology_equivalent_to_baseline( self, ) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, "gpu-memory-utilization": 0.5, "max-num-seqs": 8, }, "tunable_flags": ["tensor-parallel-size", "max-num-seqs"], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_tp_dp_products": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) trial1_result = tmp_path / "trial-0001.json" trial1_result.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.935616858887, "best_request_rate": 8.0, "best_pass_rate": 1.0, "probes": [ { "threshold": 0.935616858887, "feasible": True, "payload": { "request_count": 480, "pass_rate": 1.0, "request_rate": 8.0, "early_stopped": False, "early_stop_reason": "", "latency_summary": {"failed_reason_counts": {}}, }, } ], } ), encoding="utf-8", ) trial2_result = tmp_path / "trial-0002.json" trial2_result.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.810867944369, "best_request_rate": 6.95, "best_pass_rate": 0.9784, "probes": [ { "threshold": 0.873242401628, "feasible": False, "payload": { "request_count": 450, "pass_rate": 0.7844, "request_rate": 7.5, "early_stopped": True, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": { "ttft_ms>2000.0": 42, "slo_pass_rate_unrecoverable": 49, } }, }, }, { "threshold": 0.810867944369, "feasible": True, "payload": { "request_count": 417, "pass_rate": 0.9784, "request_rate": 6.95, "early_stopped": False, "early_stop_reason": "", "latency_summary": { "failed_reason_counts": {"ttft_ms>2000.0": 9} }, }, }, ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_parallel_size=4, best_sampling_u=0.810867944369, best_request_rate=6.95, best_request_rate_per_gpu=1.7375, next_trial_index=3, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=8.0, best_request_rate_per_gpu=1.0, result_path=str(trial1_result), config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=4, best_request_rate=6.95, best_request_rate_per_gpu=1.7375, result_path=str(trial2_result), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048}, state=state, ) candidate_set = context["experiment_plan"]["candidate_set"] self.assertEqual(candidate_set["version"], "candidate-set-v1") self.assertIn("candidate_set_hash", candidate_set) self.assertGreaterEqual( candidate_set["blocked_reason_summary"].get( "blocked_noop_or_repeat_effective_full_config", 0, ), 1, ) baseline_fingerprint = hashlib.sha256( _effective_config_signature( study, {"env_patch": {}, "flag_patch": {}}, ).encode("utf-8") ).hexdigest() blocked_baseline_equivalent = [ item for item in candidate_set["blocked_candidates"] if item.get("effective_config_fingerprint") == baseline_fingerprint ] self.assertTrue(blocked_baseline_equivalent) self.assertEqual( blocked_baseline_equivalent[0]["blocked_reason"], "blocked_noop_or_repeat_effective_full_config", ) self.assertIn("effective_config_fingerprint", blocked_baseline_equivalent[0]) actions = context["experiment_plan"]["candidate_actions"] self.assertFalse( any( action.get("config_patch", {}).get("flag_patch") == {"tensor-parallel-size": 8} for action in actions ) ) proposal = build_harness_guided_proposal(context) self.assertTrue( proposal is None or proposal.config_patch.flag_patch != {"tensor-parallel-size": 8} ) def test_harness_guided_first_tp_probe_for_latency_bottleneck(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "tunable_flags": ["tensor-parallel-size", "data-parallel-size"], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_data_parallel_sizes": [1, 2], "allowed_tp_dp_products": [1, 2, 4], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.25, "best_request_rate": 2.0, "best_pass_rate": 1.0, "probes": [ { "threshold": 0.5, "feasible": False, "payload": { "request_count": 100, "pass_rate": 0.6, "request_rate": 4.0, "early_stopped": True, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>50.0": 40}, }, }, } ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_request_rate=2.0, best_request_rate_per_gpu=2.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=2.0, best_request_rate_per_gpu=2.0, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048}, state=state, ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) self.assertEqual(proposal.config_patch.flag_patch, {"tensor-parallel-size": 2}) self.assertFalse(proposal.should_stop) def test_harness_guided_runtime_seed_preserves_tp_incumbent(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "gpu-memory-utilization", "enable-chunked-prefill", "max-num-batched-tokens", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_tp_dp_products": [1, 2, 4], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0002.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.75, "best_request_rate": 6.0, "best_pass_rate": 1.0, "probes": [ { "threshold": 0.75, "feasible": True, "payload": { "request_count": 100, "pass_rate": 1.0, "request_rate": 6.0, "early_stopped": False, "early_stop_reason": "", "latency_summary": {"failed_reason_counts": {}}, }, } ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate=6.0, best_request_rate_per_gpu=3.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=2.0, best_request_rate_per_gpu=2.0, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=6.0, best_request_rate_per_gpu=3.0, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p99": 8100}, state=state, ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) self.assertEqual( proposal.config_patch.flag_patch, { "tensor-parallel-size": 2, "enable-chunked-prefill": True, "max-num-batched-tokens": 16384, }, ) def test_harness_runtime_refinement_preserves_incumbent_runtime_knobs(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "gpu-memory-utilization", "max-num-seqs", "enable-chunked-prefill", "max-num-batched-tokens", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_tp_dp_products": [1, 2, 4], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0002.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.098, "best_request_rate": 3.3, "best_pass_rate": 0.97, "probes": [ { "threshold": 0.098, "feasible": True, "payload": { "request_count": 100, "pass_rate": 0.97, "request_rate": 3.3, "early_stopped": False, "early_stop_reason": "", "latency_summary": {"failed_reason_counts": {}}, }, } ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate=3.3, best_request_rate_per_gpu=0.825, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=2.5, best_request_rate_per_gpu=0.625, config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=3.3, best_request_rate_per_gpu=0.825, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "gpu-memory-utilization": 0.92, "max-num-seqs": 48, }, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p99": 8100}, state=state, ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) self.assertEqual( proposal.config_patch.flag_patch, { "tensor-parallel-size": 4, "gpu-memory-utilization": 0.92, "max-num-seqs": 48, "enable-chunked-prefill": True, "max-num-batched-tokens": 16384, }, ) def test_harness_raises_gpu_mem_util_on_settled_decode_bound_incumbent(self) -> None: """Regression for the coverage gap that let the naive baseline beat the harness: a settled TP incumbent that is decode_tpot-bound must get a gpu-memory-utilization raise (KV-cache headroom) before the harness is allowed to stop.""" with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, slo_overrides={ "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000}, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, }, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "gpu-memory-utilization", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [1, 2, 4], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0002.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.074, "best_request_rate": 2.6, "best_pass_rate": 0.97, "probes": [ { "threshold": 0.074, "feasible": True, "payload": { "request_count": 300, "pass_rate": 0.97, "request_rate": 2.6, "latency_summary": {"failed_reason_counts": {}}, }, }, { "threshold": 0.09, "feasible": False, "payload": { "request_count": 300, "pass_rate": 0.6, "request_rate": 3.2, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>50.0": 90} }, }, }, ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate=2.6, best_request_rate_per_gpu=0.65, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=1.1, best_request_rate_per_gpu=0.275, config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=2.6, best_request_rate_per_gpu=0.65, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "gpu-memory-utilization": 0.9, }, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 1500}, state=state ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) self.assertFalse(proposal.should_stop) # TP4 preserved; gpu-memory-utilization hill-climbed one step (0.9 -> 0.92). self.assertEqual( proposal.config_patch.flag_patch.get("tensor-parallel-size"), 4 ) self.assertEqual( proposal.config_patch.flag_patch.get("gpu-memory-utilization"), 0.92 ) # And the harness must NOT authorize a stop while that knob is untried. self.assertIsNone(build_harness_stop_proposal(context)) def test_harness_climbs_tp_before_gpu_mem_util_micro_tuning(self) -> None: """gpu-memory-utilization must not preempt an untried TP increase: at a TP2 incumbent with TP4 still reachable, the harness must climb TP, not micro-tune runtime.""" with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, slo_overrides={ "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000}, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, }, engine_overrides={ "tunable_flags": ["tensor-parallel-size", "gpu-memory-utilization"], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [1, 2, 4], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0002.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.03, "best_request_rate": 1.1, "best_pass_rate": 0.97, "probes": [ { "threshold": 0.03, "feasible": True, "payload": { "request_count": 300, "pass_rate": 0.97, "request_rate": 1.1, "latency_summary": {"failed_reason_counts": {}}, }, }, { "threshold": 0.05, "feasible": False, "payload": { "request_count": 300, "pass_rate": 0.6, "request_rate": 1.6, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>50.0": 90} }, }, }, ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate=1.1, best_request_rate_per_gpu=0.55, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=0.6, best_request_rate_per_gpu=0.6, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=1.1, best_request_rate_per_gpu=0.55, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "gpu-memory-utilization": 0.9, }, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 1500}, state=state ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) # Must climb TP (to 4), and must NOT micro-tune gpu-memory-utilization yet. self.assertEqual( proposal.config_patch.flag_patch.get("tensor-parallel-size"), 4 ) self.assertNotIn("gpu-memory-utilization", proposal.config_patch.flag_patch) def test_harness_brackets_down_from_bad_high_tp_start_before_runtime_tuning(self) -> None: """A no-LLM run that starts at the max TP should validate the adjacent lower topology before spending trials on runtime micro-tuning.""" with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, slo_overrides={ "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000}, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, }, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, "gpu-memory-utilization": 0.5, "max-num-seqs": 8, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "gpu-memory-utilization", "max-num-seqs", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.05, "best_request_rate": 8.0, "best_pass_rate": 0.96, "probes": [ { "threshold": 0.05, "feasible": True, "payload": { "request_count": 300, "pass_rate": 0.96, "request_rate": 8.0, "latency_summary": {"failed_reason_counts": {}}, }, }, { "threshold": 0.08, "feasible": False, "payload": { "request_count": 300, "pass_rate": 0.5, "request_rate": 10.0, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 120} }, }, }, ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_request_rate=8.0, best_request_rate_per_gpu=1.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=8.0, best_request_rate_per_gpu=1.0, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 6500}, state=state, ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) self.assertEqual( proposal.config_patch.flag_patch.get("tensor-parallel-size"), 4 ) self.assertNotIn("gpu-memory-utilization", proposal.config_patch.flag_patch) self.assertNotIn("max-num-seqs", proposal.config_patch.flag_patch) def test_harness_jumps_low_gpu_mem_util_to_nominal_floor_after_topology_settles(self) -> None: """A pathological gmu=0.5 start should jump to the normal operating floor after topology is bracketed instead of wasting many 0.02 hill-climb trials.""" with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, slo_overrides={ "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000}, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, }, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 2, "data-parallel-size": 1, "gpu-memory-utilization": 0.5, }, "tunable_flags": [ "tensor-parallel-size", "gpu-memory-utilization", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [1, 2, 4], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.07, "best_request_rate": 2.4, "best_pass_rate": 0.97, "probes": [ { "threshold": 0.07, "feasible": True, "payload": { "request_count": 300, "pass_rate": 0.97, "request_rate": 2.4, "latency_summary": {"failed_reason_counts": {}}, }, }, { "threshold": 0.1, "feasible": False, "payload": { "request_count": 300, "pass_rate": 0.55, "request_rate": 3.1, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>50.0": 90} }, }, }, ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_request_rate=2.4, best_request_rate_per_gpu=1.2, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=2.4, best_request_rate_per_gpu=1.2, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=2.2, best_request_rate_per_gpu=0.55, config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 1500}, state=state, ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) self.assertEqual( proposal.config_patch.flag_patch.get("gpu-memory-utilization"), 0.9 ) self.assertNotIn("tensor-parallel-size", proposal.config_patch.flag_patch) def test_harness_stops_gpu_mem_util_climb_after_tied_same_topology_probe(self) -> None: """A same-topology gpu-memory-utilization probe must improve per-GPU rate before the hill-climb continues; launch success alone is not evidence to keep climbing.""" with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, slo_overrides={ "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000}, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, }, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "gpu-memory-utilization", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2], "allowed_tp_dp_products": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0002.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.75, "best_request_rate": 6.5, "best_pass_rate": 1.0, "probes": [ { "threshold": 0.75, "feasible": True, "payload": { "request_count": 300, "pass_rate": 1.0, "request_rate": 6.5, "latency_summary": {"failed_reason_counts": {}}, }, }, { "threshold": 0.765625, "feasible": False, "payload": { "request_count": 300, "pass_rate": 0.6, "request_rate": 6.7, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 80} }, }, }, ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate=6.5, best_request_rate_per_gpu=3.25, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=2.2, best_request_rate_per_gpu=2.2, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=6.5, best_request_rate_per_gpu=3.25, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}, }, ), TrialSummary( trial_id="trial-0003", status="completed", best_request_rate=8.4, best_request_rate_per_gpu=2.1, config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}, }, ), TrialSummary( trial_id="trial-0004", status="completed", best_request_rate=6.5, best_request_rate_per_gpu=3.25, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "gpu-memory-utilization": 0.92, }, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 1500}, state=state, ) candidates = context["experiment_plan"]["candidate_actions"] self.assertNotIn( {"tensor-parallel-size": 2, "gpu-memory-utilization": 0.94}, [ item["config_patch"]["flag_patch"] for item in candidates if item["knob_family"] == "gpu-memory-utilization" ], ) def test_harness_projects_measured_runtime_delta_to_other_frontier_anchor(self) -> None: """A runtime improvement found on one topology must be tested on other Pareto anchors before the harness can keep micro-tuning the source topology.""" with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, slo_overrides={ "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000}, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50}, }, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 2, "data-parallel-size": 1, "gpu-memory-utilization": 0.5, "max-num-seqs": 8, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "gpu-memory-utilization", "max-num-seqs", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [2, 4, 8], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [2, 4, 8], }, }, ) study = load_study_spec(study_path) latest_result_path = tmp_path / "trial-0005.json" latest_result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.1, "best_request_rate": 8.0, "best_pass_rate": 0.96, "probes": [ { "threshold": 0.1, "feasible": True, "payload": { "request_count": 300, "pass_rate": 0.96, "request_rate": 8.0, "latency_summary": {"failed_reason_counts": {}}, }, }, { "threshold": 0.12, "feasible": False, "payload": { "request_count": 300, "pass_rate": 0.6, "request_rate": 9.0, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 100} }, }, }, ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0005", best_request_rate=8.0, best_request_rate_per_gpu=2.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=2, best_request_rate=2.9, best_request_rate_per_gpu=1.45, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=4, best_request_rate=6.95, best_request_rate_per_gpu=1.7375, config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}, }, ), TrialSummary( trial_id="trial-0003", status="completed", parallel_size=8, best_request_rate=8.0, best_request_rate_per_gpu=1.0, config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 8}, }, ), TrialSummary( trial_id="trial-0004", status="completed", parallel_size=4, best_request_rate=6.95, best_request_rate_per_gpu=1.7375, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "max-num-seqs": 16, }, }, ), TrialSummary( trial_id="trial-0005", status="completed", parallel_size=4, best_request_rate=8.0, best_request_rate_per_gpu=2.0, result_path=str(latest_result_path), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "gpu-memory-utilization": 0.9, }, }, ), TrialSummary( trial_id="trial-0006", status="completed", parallel_size=4, best_request_rate=8.0, best_request_rate_per_gpu=2.0, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "gpu-memory-utilization": 0.9, "max-num-seqs": 16, }, }, ), TrialSummary( trial_id="trial-0007", status="completed", parallel_size=4, best_request_rate=8.0, best_request_rate_per_gpu=2.0, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "gpu-memory-utilization": 0.92, }, }, ), TrialSummary( trial_id="trial-0008", status="completed", parallel_size=4, best_request_rate=8.0, best_request_rate_per_gpu=2.0, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "gpu-memory-utilization": 0.9, "max-num-batched-tokens": 16384, "max-num-seqs": 16, }, }, ), TrialSummary( trial_id="trial-0009", status="completed", parallel_size=4, best_request_rate=8.0, best_request_rate_per_gpu=2.0, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "gpu-memory-utilization": 0.9, "enable-chunked-prefill": True, "max-num-batched-tokens": 8192, }, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 6500}, state=state, ) next_action = context["experiment_plan"]["next_action"] self.assertEqual(next_action["knob_family"], "frontier-delta-projection") self.assertEqual( next_action["config_patch"]["flag_patch"], { "tensor-parallel-size": 2, "data-parallel-size": 1, "gpu-memory-utilization": 0.9, }, ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) materialized_signature = materialized_effective_config_signature( study=study, state=state, proposal=proposal, ) tested_signatures = { _effective_config_signature(study, trial.config_patch) for trial in state.trials } self.assertNotIn(materialized_signature, tested_signatures) self.assertIsNone(build_harness_stop_proposal(context)) def test_harness_validates_unmeasured_tp_frontier_before_runtime_refinement(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "max-num-batched-tokens", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_tp_dp_products": [1, 2, 4], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0002.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.96, "probes": [ { "threshold": 0.5, "feasible": True, "payload": { "request_count": 100, "pass_rate": 0.96, "request_rate": 2.0, "latency_summary": {"failed_reason_counts": {}}, }, }, { "threshold": 0.75, "feasible": False, "payload": { "request_count": 100, "pass_rate": 0.6, "request_rate": 3.0, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>25.0": 40} }, }, }, ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate=2.0, best_request_rate_per_gpu=1.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=0.5, best_request_rate_per_gpu=0.5, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=2.0, best_request_rate_per_gpu=1.0, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8}, state=state, ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) self.assertEqual(proposal.config_patch.flag_patch, {"tensor-parallel-size": 4}) self.assertEqual( context["harness_proposal"]["reason"], "topology_frontier_probe_for_slo_pressure", ) def test_profile_driven_planner_scores_unmeasured_tp_frontier(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "max-num-batched-tokens", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_tp_dp_products": [1, 2, 4], }, }, ) result_path = tmp_path / "trial-0002.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.96, "probes": [ { "threshold": 0.75, "feasible": False, "payload": { "request_count": 100, "pass_rate": 0.6, "request_rate": 3.0, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 35} }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8}, state=StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate=2.0, best_request_rate_per_gpu=1.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=0.5, best_request_rate_per_gpu=0.5, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=2.0, best_request_rate_per_gpu=1.0, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}, }, ), ], ), ) plan = context["experiment_plan"] self.assertEqual(plan["planner_version"], "profile-driven-v1") self.assertEqual(plan["next_action"]["knob_family"], "topology") self.assertEqual( plan["next_action"]["config_patch"]["flag_patch"], {"tensor-parallel-size": 4}, ) self.assertIn("ttft_prefill", context["bottleneck_hypotheses"][0]["name"]) self.assertFalse(context["harness_stop"]["should_stop"]) def test_profile_driven_topology_does_not_introduce_ep_for_ttft(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": {"host": "127.0.0.1", "port": 8000}, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", "enable-expert-parallel", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_data_parallel_sizes": [1], "allowed_expert_parallel_sizes": [1, 2], "allowed_tp_dp_products": [1, 2, 4], "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "require_enable_expert_parallel_when_ep_gt_one": True, }, }, ) result_paths: list[Path] = [] for idx in range(1, 4): result_path = tmp_path / f"trial-000{idx}.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.25, "best_request_rate": 2.0, "best_pass_rate": 1.0, "probes": [ { "threshold": 0.5, "feasible": False, "payload": { "request_count": 100, "pass_rate": 0.6, "request_rate": 4.0, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"ttft_ms>2000": 40} }, }, } ], } ), encoding="utf-8", ) result_paths.append(result_path) study = load_study_spec(study_path) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 8192}, state=StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate=4.0, best_request_rate_per_gpu=2.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=2.0, best_request_rate_per_gpu=2.0, result_path=str(result_paths[0]), config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=4.0, best_request_rate_per_gpu=2.0, result_path=str(result_paths[1]), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}, }, ), TrialSummary( trial_id="trial-0003", status="completed", best_request_rate=4.0, best_request_rate_per_gpu=1.0, result_path=str(result_paths[2]), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}, }, ), ], ), ) candidate_actions = context["experiment_plan"]["candidate_actions"] for action in candidate_actions: patch = action["config_patch"]["flag_patch"] self.assertNotIn("enable-expert-parallel", patch) self.assertNotIn("expert-parallel-size", patch) def test_profile_driven_planner_prefers_decode_concurrency_relief(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={"request_mode": "decode_only"}, slo_overrides={ "ttft_rule": None, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20}, }, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 4, "max-num-seqs": 64, }, "tunable_flags": ["tensor-parallel-size", "max-num-seqs"], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_tp_dp_products": [1, 2, 4], }, }, ) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.25, "best_request_rate": 1.0, "best_pass_rate": 0.97, "probes": [ { "threshold": 0.5, "feasible": False, "payload": { "request_count": 100, "pass_rate": 0.5, "request_rate": 2.0, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>20.0": 50} }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) context = build_harness_context( study=study, window_summary={}, state=StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_request_rate=1.0, best_request_rate_per_gpu=0.25, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=1.0, best_request_rate_per_gpu=0.25, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ), ) plan = context["experiment_plan"] self.assertEqual(plan["next_action"]["knob_family"], "max-num-seqs") self.assertEqual( plan["next_action"]["config_patch"]["flag_patch"], {"max-num-seqs": 32}, ) def test_prefill_convergence_stop_waits_for_sequence_concurrency_probe(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 4, "data-parallel-size": 1, "max-num-batched-tokens": 8192, "max-num-seqs": 64, "enable-chunked-prefill": True, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [4, 8], "allowed_data_parallel_sizes": [1, 2], "allowed_tp_dp_products": [4, 8], }, }, ) def write_result(name: str, best_rate: float | None, pass_rate: float) -> Path: path = tmp_path / f"{name}.json" payload = { "status": "completed", "best_sampling_u": 0.091796875 if best_rate is not None else None, "best_request_rate": best_rate, "best_pass_rate": pass_rate if best_rate is not None else None, "probes": [ { "threshold": 0.09375, "feasible": best_rate is not None, "payload": { "request_rate": best_rate, "pass_rate": pass_rate, "early_stop_reason": ( "" if best_rate is not None else "slo_pass_rate_unrecoverable" ), "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 32} }, }, } ], } path.write_text(json.dumps(payload), encoding="utf-8") return path study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=8, best_sampling_u=0.091796875, best_request_rate=2.303, best_request_rate_per_gpu=0.288, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=2.303, best_request_rate_per_gpu=0.288, best_pass_rate=0.952, result_path=str(write_result("trial-0001", 2.303, 0.952)), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 8, "data-parallel-size": 1, }, }, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, best_request_rate=2.303, best_request_rate_per_gpu=0.288, best_pass_rate=0.953, result_path=str(write_result("trial-0002", 2.303, 0.953)), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 8, "max-num-batched-tokens": 32768, }, }, ), TrialSummary( trial_id="trial-0003", status="completed", parallel_size=8, result_path=str(write_result("trial-0003", None, 0.0)), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "data-parallel-size": 2, }, }, ), TrialSummary( trial_id="trial-0004", status="completed", parallel_size=8, best_request_rate=2.303, best_request_rate_per_gpu=0.288, best_pass_rate=0.954, result_path=str(write_result("trial-0004", 2.303, 0.954)), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 8, "data-parallel-size": 1, "max-num-batched-tokens": 12288, }, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000}, state=state, ) self.assertFalse(context["harness_stop"]["should_stop"]) self.assertEqual( context["harness_stop"]["reason"], "experiment_plan_has_high_value_candidate", ) action = context["experiment_plan"]["next_action"] self.assertEqual(action["knob_family"], "prefill-scheduler-interaction") self.assertEqual(action["action_id"], "raise_prefill_quantum_with_chunked_prefill") flag_patch = action["config_patch"]["flag_patch"] self.assertEqual(flag_patch["tensor-parallel-size"], 8) self.assertGreater(flag_patch["max-num-batched-tokens"], 8192) def test_prefill_scheduler_lowers_quantum_by_normalized_ratio(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, "max-num-batched-tokens": 32768, "max-num-seqs": 8, "enable-chunked-prefill": True, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [8], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [8], }, }, ) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.95, "probes": [ { "threshold": 0.5, "feasible": True, "payload": { "request_rate": 2.0, "pass_rate": 0.95, "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 24} }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0}, state=state, ) action = context["experiment_plan"]["next_action"] flag_patch = action["config_patch"]["flag_patch"] self.assertEqual(action["knob_family"], "prefill-scheduler-interaction") self.assertEqual(action["action_id"], "lower_prefill_quantum_with_chunked_prefill") self.assertLess(flag_patch["max-num-batched-tokens"], 32768) factors = action["score_factors"] self.assertLess( factors["prefill_quantum_ratio_target"], factors["prefill_quantum_ratio_current"], ) def test_prefill_scheduler_quantum_step_scales_with_prompt_length(self) -> None: targets: list[int] = [] for prompt_p95 in (8192, 16384): with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, "max-num-batched-tokens": 32768, "max-num-seqs": 8, "enable-chunked-prefill": True, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [8], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [8], }, }, ) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.95, "probes": [ { "threshold": 0.5, "feasible": True, "payload": { "request_rate": 2.0, "pass_rate": 0.95, "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 24} }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) context = build_harness_context( study=study, window_summary={ "prompt_tokens_p95": prompt_p95, "prompt_tail_ratio_p95_p50": 4.0, }, state=state, ) action = context["experiment_plan"]["next_action"] self.assertEqual(action["knob_family"], "prefill-scheduler-interaction") targets.append(action["config_patch"]["flag_patch"]["max-num-batched-tokens"]) self.assertGreater(targets[1], targets[0]) def test_prefill_scheduler_coverage_precedes_gmu_microtune(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 2, "data-parallel-size": 1, "gpu-memory-utilization": 0.7, "max-num-seqs": 8, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "gpu-memory-utilization", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [2, 4], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [2, 4], }, }, trace_overrides={"max_concurrency": 64}, ) def write_result(name: str, request_rate: float) -> Path: path = tmp_path / f"{name}.json" path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": request_rate, "best_pass_rate": 0.95, "probes": [ { "threshold": 0.5, "feasible": True, "payload": { "request_rate": request_rate, "pass_rate": 0.95, "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 24} }, }, } ], } ), encoding="utf-8", ) return path study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=2, best_request_rate=4.05, best_request_rate_per_gpu=2.025, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=2, best_request_rate=4.05, best_request_rate_per_gpu=2.025, result_path=str(write_result("trial-0001", 4.05)), config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=4, best_request_rate=8.0, best_request_rate_per_gpu=2.0, result_path=str(write_result("trial-0002", 8.0)), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 7774, "prompt_tail_ratio_p95_p50": 3.0}, state=state, ) action = context["experiment_plan"]["next_action"] self.assertEqual(action["knob_family"], "prefill-scheduler-interaction") self.assertEqual(action["action_id"], "seed_chunked_prefill_quantum") self.assertGreater( action["score_factors"]["uncovered_scheduler_dimension_bonus"], 0.0, ) families = { item["knob_family"] for item in context["experiment_plan"]["candidate_actions"] } self.assertNotIn("enable-chunked-prefill", families) def test_prefill_scheduler_admission_pressure_only_uses_normalized_seq_cap(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={"max_concurrency": 64}, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, "max-num-batched-tokens": 8192, "max-num-seqs": 8, "enable-chunked-prefill": True, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [8], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [8], }, }, ) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.5, "probes": [ { "threshold": 0.5, "feasible": False, "payload": { "request_rate": 2.0, "pass_rate": 0.5, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": {"failed_reason_counts": {}}, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0}, state=state, ) action = context["experiment_plan"]["next_action"] flag_patch = action["config_patch"]["flag_patch"] self.assertEqual(action["knob_family"], "prefill-scheduler-interaction") self.assertEqual(action["action_id"], "raise_admission_pressure_with_chunked_prefill") self.assertEqual(flag_patch["max-num-seqs"], 16) self.assertNotIn("max-num-batched-tokens", flag_patch) self.assertEqual(action["score_factors"]["admission_pressure_direction"], "raise") self.assertLess( action["score_factors"]["admission_pressure_ratio_current"], action["score_factors"]["admission_pressure_ratio_target"], ) def test_prefill_scheduler_lowers_excess_admission_pressure(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={"max_concurrency": 64}, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, "max-num-batched-tokens": 8192, "max-num-seqs": 128, "enable-chunked-prefill": True, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [8], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [8], }, }, ) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.95, "probes": [ { "threshold": 0.5, "feasible": True, "payload": { "request_rate": 2.0, "pass_rate": 0.95, "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 24} }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0}, state=state, ) action = context["experiment_plan"]["next_action"] flag_patch = action["config_patch"]["flag_patch"] self.assertEqual(action["knob_family"], "prefill-scheduler-interaction") self.assertEqual(action["action_id"], "lower_admission_pressure_with_chunked_prefill") self.assertLess(flag_patch["max-num-seqs"], 128) self.assertNotIn("max-num-batched-tokens", flag_patch) self.assertEqual(action["score_factors"]["admission_pressure_direction"], "lower") self.assertLess( action["score_factors"]["admission_pressure_ratio_target"], action["score_factors"]["admission_pressure_ratio_current"], ) def test_prefill_scheduler_negative_applicability_matrix(self) -> None: variants = [ ( {"request_mode": "decode_only"}, {"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0}, ), ( {}, { "prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0, "prefix_cache": {"repeated_token_ratio_estimate": 0.75}, }, ), ( {}, {"prompt_tokens_p95": 2048, "prompt_tail_ratio_p95_p50": 1.0}, ), ] for trace_overrides, window_summary in variants: with self.subTest(trace_overrides=trace_overrides, window_summary=window_summary): with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides=trace_overrides, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, "max-num-batched-tokens": 8192, "max-num-seqs": 8, "enable-chunked-prefill": True, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [8], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [8], }, }, ) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.95, "probes": [ { "threshold": 0.5, "feasible": True, "payload": { "request_rate": 2.0, "pass_rate": 0.95, "latency_summary": { "failed_reason_counts": { "ttft_ms>4000.0": 24 } }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) context = build_harness_context( study=study, window_summary=window_summary, state=state, ) families = { item["knob_family"] for item in context["experiment_plan"]["candidate_actions"] } self.assertNotIn("prefill-scheduler-interaction", families) def test_prefill_scheduler_does_not_preempt_open_topology_frontier(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 2, "data-parallel-size": 1, "max-num-batched-tokens": 8192, "max-num-seqs": 8, "enable-chunked-prefill": True, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [2, 4], "allowed_data_parallel_sizes": [1, 2], "allowed_tp_dp_products": [4, 8], }, }, ) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.95, "probes": [ { "threshold": 0.5, "feasible": True, "payload": { "request_rate": 2.0, "pass_rate": 0.95, "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 24} }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=4, best_request_rate=2.0, best_request_rate_per_gpu=0.5, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=4, best_request_rate=2.0, best_request_rate_per_gpu=0.5, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": {"data-parallel-size": 2}, }, ) ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0}, state=state, ) action = context["experiment_plan"]["next_action"] self.assertEqual(action["knob_family"], "topology") self.assertEqual( action["config_patch"]["flag_patch"], {"tensor-parallel-size": 4, "data-parallel-size": 2}, ) families = { item["knob_family"] for item in context["experiment_plan"]["candidate_actions"] } self.assertNotIn("prefill-scheduler-interaction", families) def test_prefill_scheduler_not_active_for_short_prompt_workload(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 8, "data-parallel-size": 1, "max-num-batched-tokens": 32768, "max-num-seqs": 8, "enable-chunked-prefill": True, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [8], "allowed_data_parallel_sizes": [1], "allowed_tp_dp_products": [8], }, }, ) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.95, "probes": [ { "threshold": 0.5, "feasible": True, "payload": { "request_rate": 2.0, "pass_rate": 0.95, "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 24} }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=2.0, best_request_rate_per_gpu=0.25, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 2048, "prompt_tail_ratio_p95_p50": 1.0}, state=state, ) families = { item["knob_family"] for item in context["experiment_plan"]["candidate_actions"] } self.assertNotIn("prefill-scheduler-interaction", families) def test_prefill_sequence_probe_followed_by_joint_runtime_probe(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 4, "data-parallel-size": 1, "max-num-batched-tokens": 8192, "max-num-seqs": 64, "enable-chunked-prefill": True, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-batched-tokens", "max-num-seqs", "enable-chunked-prefill", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [4, 8], "allowed_data_parallel_sizes": [1, 2], "allowed_tp_dp_products": [4, 8], }, }, ) def write_result(name: str) -> Path: path = tmp_path / f"{name}.json" payload = { "status": "completed", "best_sampling_u": 0.091796875, "best_request_rate": 2.303, "best_pass_rate": 0.951, "probes": [ { "threshold": 0.09375, "feasible": True, "payload": { "request_rate": 2.303, "pass_rate": 0.951, "latency_summary": { "failed_reason_counts": {"ttft_ms>4000.0": 32} }, }, } ], } path.write_text(json.dumps(payload), encoding="utf-8") return path study = load_study_spec(study_path) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=8, best_sampling_u=0.091796875, best_request_rate=2.303, best_request_rate_per_gpu=0.288, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=2.303, best_request_rate_per_gpu=0.288, best_pass_rate=0.952, result_path=str(write_result("trial-0001")), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 8, "data-parallel-size": 1, }, }, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, best_request_rate=2.303, best_request_rate_per_gpu=0.288, best_pass_rate=0.950, result_path=str(write_result("trial-0002")), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 8, "max-num-seqs": 96, }, }, ), TrialSummary( trial_id="trial-0003", status="completed", parallel_size=8, best_request_rate=2.303, best_request_rate_per_gpu=0.288, best_pass_rate=0.950, result_path=str(write_result("trial-0003")), config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 8, "data-parallel-size": 1, "max-num-batched-tokens": 12288, }, }, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000}, state=state, ) self.assertFalse(context["harness_stop"]["should_stop"]) self.assertEqual( context["harness_stop"]["reason"], "experiment_plan_has_high_value_candidate", ) action = context["experiment_plan"]["next_action"] flag_patch = action["config_patch"]["flag_patch"] self.assertEqual(action["knob_family"], "prefill-scheduler-interaction") self.assertEqual(action["action_id"], "raise_prefill_quantum_with_chunked_prefill") self.assertEqual(flag_patch["tensor-parallel-size"], 8) self.assertGreater(flag_patch["max-num-batched-tokens"], 8192) self.assertLess(flag_patch["max-num-batched-tokens"], 24000) def test_slo_unrecoverable_does_not_mask_latency_bottleneck(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, slo_overrides={ "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000}, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 25}, }, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "max-num-seqs", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_tp_dp_products": [1, 2, 4], }, }, ) result_path = tmp_path / "trial-0001.json" result_path.write_text( json.dumps( { "status": "completed", "best_request_rate": 0.065, "best_request_rate_per_gpu": 0.065, "best_pass_rate": 1.0, "probes": [ { "threshold": 0.015625, "feasible": False, "payload": { "request_count": 290, "pass_rate": 0.041, "request_rate": 0.483, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": { "ttft_ms>4000.0": 2, "tpot_ms>25.0": 14, "slo_pass_rate_unrecoverable": 263, } }, }, }, { "threshold": 0.001953125, "feasible": True, "payload": { "request_count": 39, "pass_rate": 1.0, "request_rate": 0.065, "latency_summary": {"failed_reason_counts": {}}, }, }, ], } ), encoding="utf-8", ) study = load_study_spec(study_path) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8}, state=StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_request_rate=0.065, best_request_rate_per_gpu=0.065, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=0.065, best_request_rate_per_gpu=0.065, best_pass_rate=1.0, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ), ) self.assertNotEqual( context["bottleneck_hypotheses"][0]["name"], "admission_or_queueing", ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) self.assertEqual(proposal.config_patch.flag_patch, {"tensor-parallel-size": 2}) def test_harness_excludes_topology_above_visible_gpu_count(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_envs": {"CUDA_VISIBLE_DEVICES": "0,1,2,4,5,6,7"}, "tunable_flags": ["tensor-parallel-size"], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_tp_dp_products": [1, 2, 4, 8], }, }, ) result_path = tmp_path / "trial-0003.json" result_path.write_text( json.dumps( { "status": "completed", "best_request_rate": 1.078, "best_pass_rate": 0.958, "probes": [ { "threshold": 0.039, "feasible": False, "payload": { "request_count": 100, "pass_rate": 0.8, "request_rate": 1.10, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>25.0": 20} }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8}, state=StudyState( study_id=study.study_id, best_trial_id="trial-0003", best_request_rate=1.078, best_request_rate_per_gpu=0.2695, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=0.065, best_request_rate_per_gpu=0.065, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=0.398, best_request_rate_per_gpu=0.199, config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}, }, ), TrialSummary( trial_id="trial-0003", status="completed", best_request_rate=1.078, best_request_rate_per_gpu=0.2695, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}, }, ), ], ), ) candidates = context["candidate_actions"] self.assertFalse( any( action["config_patch"]["flag_patch"].get("tensor-parallel-size") == 8 for action in candidates ) ) proposal = build_harness_guided_proposal(context) self.assertTrue( proposal is None or proposal.config_patch.flag_patch.get("tensor-parallel-size") != 8 ) def test_harness_stop_blocked_until_slo_driven_topology_frontier_is_measured(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "tunable_flags": ["tensor-parallel-size", "max-num-seqs"], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4], "allowed_tp_dp_products": [1, 2, 4], }, }, ) study = load_study_spec(study_path) result_path = tmp_path / "trial-0002.json" result_path.write_text( json.dumps( { "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 0.96, "probes": [ { "threshold": 0.75, "feasible": False, "payload": { "request_count": 100, "pass_rate": 0.6, "request_rate": 3.0, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>25.0": 40} }, }, } ], } ), encoding="utf-8", ) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_request_rate=2.0, best_request_rate_per_gpu=1.0, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=0.5, best_request_rate_per_gpu=0.5, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", best_request_rate=2.0, best_request_rate_per_gpu=1.0, result_path=str(result_path), config_patch={ "env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}, }, ), TrialSummary( trial_id="trial-0003", status="completed", best_request_rate=1.98, best_request_rate_per_gpu=0.99, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 8}}, ), TrialSummary( trial_id="trial-0004", status="completed", best_request_rate=1.98, best_request_rate_per_gpu=0.99, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 16}}, ), ], ) context = build_harness_context( study=study, window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8}, state=state, ) self.assertFalse(context["harness_stop"]["should_stop"]) self.assertEqual(context["harness_stop"]["reason"], "topology_frontier_requires_probe") def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={ "input_length_filter": { "min_input_tokens": 0, "max_input_tokens": 8192, } }, ) study = load_study_spec(study_path) window, requests = load_trace_requests(study, study_spec_path=study_path) summary = summarize_window(requests, window) self.assertEqual(len(requests), 2) self.assertEqual([item.prompt_tokens_hint for item in requests], [1000, 5000]) self.assertEqual(summary["request_count"], 2) self.assertEqual(summary["prompt_tokens_p95"], 5000.0) self.assertIn("prefix_cache", summary) self.assertIn("arrival_burst_ratio_p95_to_mean", summary) prompt = build_prompt( study=study, window_summary=summary, state=StudyState(study_id=study.study_id), capability_profile=None, ) self.assertIn('"input_length_filter"', prompt) self.assertIn('"max_input_tokens": 8192', prompt) def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={ "input_length_filter": { "min_input_tokens": 8193, "max_input_tokens": 8192, } }, ) with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="): load_study_spec(study_path) def test_trace_rejects_non_positive_max_requests_per_probe(self) -> None: with tempfile.TemporaryDirectory() as tmp: study_path = _write_study_assets( Path(tmp), trace_overrides={"max_requests_per_probe": 0}, ) with self.assertRaisesRegex(SpecError, "max_requests_per_probe must be > 0"): load_study_spec(study_path) def test_trace_rejects_invalid_replay_time_scale(self) -> None: with tempfile.TemporaryDirectory() as tmp: study_path = _write_study_assets( Path(tmp), trace_overrides={"replay_time_scale": 0.0}, ) with self.assertRaisesRegex(SpecError, "replay_time_scale must be > 0"): load_study_spec(study_path) def test_decode_only_mode_is_loaded_and_prompt_mentions_it(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={"request_mode": "decode_only"}, slo_overrides={ "ttft_rule": None, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20}, }, ) study = load_study_spec(study_path) self.assertEqual(study.trace.request_mode, "decode_only") self.assertTrue(study.trace.restart_engine_after_early_stop) window, requests = load_trace_requests(study, study_spec_path=study_path) prompt = build_prompt( study=study, window_summary=summarize_window(requests, window), state=StudyState(study_id=study.study_id), capability_profile=None, ) self.assertIn('"request_mode": "decode_only"', prompt) self.assertIn('"restart_engine_after_early_stop": true', prompt) self.assertIn("There is no TTFT SLO for this study.", prompt) self.assertIn("decode-only", prompt) def test_decode_only_restart_after_early_stop_can_be_disabled(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={ "request_mode": "decode_only", "restart_engine_after_early_stop": False, }, ) study = load_study_spec(study_path) self.assertFalse(study.trace.restart_engine_after_early_stop) def test_chat_mode_does_not_restart_after_early_stop_by_default(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) self.assertEqual(study.trace.request_mode, "chat") self.assertFalse(study.trace.restart_engine_after_early_stop) def test_decode_only_harness_defaults_to_decode_tpot(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={"request_mode": "decode_only"}, slo_overrides={ "ttft_rule": None, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20}, }, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-seqs", "max-num-batched-tokens", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_tp_dp_products": [8], "require_tp_dp_product_equals_gpu_count": True, }, }, ) study = load_study_spec(study_path) window, requests = load_trace_requests(study, study_spec_path=study_path) context = build_harness_context( study=study, window_summary=summarize_window(requests, window), state=StudyState(study_id=study.study_id), ) active = { harness["knob_family"] for harness in context["knob_harnesses"] if harness["active_now"] } self.assertIn("tensor-parallel-size", active) self.assertIn("data-parallel-size", active) self.assertIn("max-num-seqs", active) self.assertIn("max-num-batched-tokens", active) self.assertIn( "For decode_only studies, ignore TTFT", "\n".join(context["proposal_rules"]), ) self.assertIn( "config_patch is applied to the study base config", "\n".join(context["proposal_rules"]), ) def test_decode_topology_planner_prefers_dp_redistribution_and_preserves_ep(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={"request_mode": "decode_only"}, slo_overrides={ "ttft_rule": None, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 40}, }, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "enable-expert-parallel": True, "tensor-parallel-size": 4, "data-parallel-size": 2, "expert-parallel-size": 8, "max-num-seqs": 192, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", "max-num-seqs", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_expert_parallel_sizes": [1, 2, 4, 8], "require_tp_dp_product_equals_gpu_count": True, "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "require_enable_expert_parallel_when_ep_gt_one": True, }, }, ) result_path = tmp_path / "trial-0001-result.json" result_path.write_text( json.dumps( { "status": "completed", "best_request_rate": 0.47, "best_pass_rate": 0.98, "probes": [ { "threshold": 0.04, "feasible": False, "payload": { "request_rate": 0.72, "pass_rate": 0.3, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>40.0": 80} }, }, } ], } ), encoding="utf-8", ) study = load_study_spec(study_path) context = build_harness_context( study=study, window_summary={}, state=StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_request_rate=0.47, best_request_rate_per_gpu=0.05875, trials=[ TrialSummary( trial_id="trial-0001", status="completed", best_request_rate=0.47, best_request_rate_per_gpu=0.05875, best_pass_rate=0.98, result_path=str(result_path), config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ), ) action = context["experiment_plan"]["next_action"] self.assertEqual(action["knob_family"], "topology") self.assertEqual( action["config_patch"]["flag_patch"], {"tensor-parallel-size": 2, "data-parallel-size": 4}, ) proposal = build_harness_guided_proposal(context) self.assertIsNotNone(proposal) self.assertEqual( proposal.config_patch.flag_patch, {"tensor-parallel-size": 2, "data-parallel-size": 4}, ) def test_prompt_can_disable_harness_for_ablation(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["llm"]["use_harness"] = False study_path.write_text(json.dumps(payload), encoding="utf-8") study = load_study_spec(study_path) window, requests = load_trace_requests(study, study_spec_path=study_path) prompt = build_prompt( study=study, window_summary=summarize_window(requests, window), state=StudyState(study_id=study.study_id), capability_profile=None, ) self.assertFalse(study.llm.use_harness) self.assertIn("Study context:", prompt) self.assertIn("Trial history:", prompt) self.assertIn("Known launch failures:", prompt) self.assertNotIn('"paper_alignment"', prompt) self.assertNotIn("Harnesses:", prompt) self.assertNotIn("Disabled by llm.use_harness=false", prompt) self.assertNotIn("without harness hints", prompt) self.assertNotIn("Window summary:", prompt) self.assertNotIn("Parallel space candidates:", prompt) self.assertNotIn("Prioritize exploring legal topology changes", prompt) def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, trace_overrides={"request_mode": "decode_only"}, slo_overrides={ "ttft_rule": None, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20}, }, engine_overrides={ "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-seqs", ] }, ) result_path = tmp_path / "trial-0001-result.json" result_path.write_text( json.dumps( { "status": "completed", "best_request_rate": 1.0, "best_pass_rate": 1.0, "probes": [ { "threshold": 0.1, "feasible": False, "payload": { "request_rate": 2.0, "pass_rate": 0.1, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "failed_reason_counts": {"tpot_ms>20.0": 20} }, }, }, { "threshold": 0.01, "feasible": True, "payload": { "request_rate": 1.0, "pass_rate": 1.0, "early_stop_reason": "probe_elapsed_s>1200.0", "latency_summary": { "failed_reason_counts": {"probe_elapsed_s>1200.0": 1} }, }, }, ], } ), encoding="utf-8", ) study = load_study_spec(study_path) context = build_harness_context( study=study, window_summary={}, state=StudyState( study_id=study.study_id, trials=[ TrialSummary( trial_id="trial-0001", status="completed", result_path=str(result_path), ) ], ), ) diagnostics = context["recent_trial_diagnostics"] self.assertEqual(diagnostics[0]["active_bottleneck"], "decode_tpot") active = { harness["knob_family"] for harness in context["knob_harnesses"] if harness["active_now"] } self.assertIn("data-parallel-size", active) self.assertIn("max-num-seqs", active) def test_best_feasible_probe_record_keeps_partial_probe_evidence(self) -> None: best = _best_feasible_probe_record( [ { "threshold": 0.03125, "request_rate": 0.72, "pass_rate": 0.3, "feasible": False, }, { "threshold": 0.015625, "request_rate": 0.3533, "pass_rate": 0.99, "feasible": True, }, { "threshold": 0.017578125, "request_rate": 0.3833, "pass_rate": 0.995, "feasible": True, }, ] ) self.assertIsNotNone(best) self.assertEqual(best["threshold"], 0.017578125) self.assertEqual(best["request_rate"], 0.3833) def test_load_study_spec_rejects_mismatched_served_model_name(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "served-model-name": "engine-name", } }, ) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["model"]["served_model_name"] = "trace-name" study_path.write_text(json.dumps(payload), encoding="utf-8") with self.assertRaisesRegex(SpecError, "must match engine.base_flags"): load_study_spec(study_path) def test_bailian_endpoint_defaults(self) -> None: endpoint = LLMEndpointSpec.from_dict({"provider": "bailian", "model": "qwen-plus"}) self.assertEqual(endpoint.provider, "bailian") self.assertEqual( endpoint.base_url, "https://dashscope.aliyuncs.com/compatible-mode/v1" ) self.assertEqual(endpoint.api_key_env, "DASHSCOPE_API_KEY") def test_codex_endpoint_resolves_base_url_from_codex_config(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) codex_dir = tmp_path / ".codex" codex_dir.mkdir(parents=True) (codex_dir / "config.toml").write_text( '\n'.join( [ 'model_provider = "ipads"', 'model_reasoning_effort = "high"', "", "[model_providers.ipads]", 'base_url = "http://codex.example/v1"', 'wire_api = "responses"', ] ), encoding="utf-8", ) with mock.patch.dict(os.environ, {"HOME": str(tmp_path)}, clear=True): endpoint = LLMEndpointSpec.from_dict({"provider": "codex", "model": "gpt-5.4"}) self.assertEqual(endpoint.provider, "codex") self.assertEqual(endpoint.base_url, "http://codex.example/v1") self.assertEqual(endpoint.wire_api, "responses") self.assertFalse(endpoint.stream) self.assertEqual(endpoint.reasoning_effort, "high") self.assertEqual(endpoint.api_key_env, "OPENAI_API_KEY") def test_codex_stream_forces_chat_completions_wire_api(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) codex_dir = tmp_path / ".codex" codex_dir.mkdir(parents=True) (codex_dir / "config.toml").write_text( '\n'.join( [ 'model_provider = "ipads"', "", "[model_providers.ipads]", 'base_url = "http://codex.example/v1"', 'wire_api = "responses"', ] ), encoding="utf-8", ) with mock.patch.dict(os.environ, {"HOME": str(tmp_path)}, clear=True): endpoint = LLMEndpointSpec.from_dict( {"provider": "codex", "model": "gpt-5.4", "stream": True} ) self.assertTrue(endpoint.stream) self.assertEqual(endpoint.wire_api, "chat.completions") def test_endpoint_stream_flag(self) -> None: endpoint = LLMEndpointSpec.from_dict( { "provider": "custom", "base_url": "http://example/v1", "wire_api": "chat.completions", "stream": True, "model": "x", "api_key_env": "OPENAI_API_KEY", } ) self.assertTrue(endpoint.stream) def test_extract_response_text_supports_responses_api_output(self) -> None: text = _extract_response_text( { "output": [ { "type": "message", "content": [ {"type": "output_text", "text": '{"diagnosis":"ok"}'} ], } ] } ) self.assertEqual(text, '{"diagnosis":"ok"}') def test_auth_headers_load_bailian_key_from_dotenv(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) (tmp_path / ".env").write_text('DASHSCOPE_API_KEY="dash-key"\n', encoding="utf-8") with mock.patch.dict(os.environ, {}, clear=True): with mock.patch("pathlib.Path.cwd", return_value=tmp_path): headers = _auth_headers("DASHSCOPE_API_KEY", "bailian") self.assertEqual(headers["Authorization"], "Bearer dash-key") def test_auth_headers_load_codex_auth_and_proxy(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) codex_dir = tmp_path / ".codex" codex_dir.mkdir(parents=True) (codex_dir / "config.toml").write_text( '\n'.join( [ "[network]", 'http_proxy = "http://proxy.example:3128"', 'https_proxy = "http://proxy.example:3128"', ] ), encoding="utf-8", ) (codex_dir / "auth.json").write_text( json.dumps({"OPENAI_API_KEY": "sk-codex-test"}), encoding="utf-8", ) with mock.patch.dict(os.environ, {"HOME": str(tmp_path)}, clear=True): with mock.patch("pathlib.Path.cwd", return_value=tmp_path): headers = _auth_headers("OPENAI_API_KEY", "codex") self.assertEqual(os.environ["http_proxy"], "http://proxy.example:3128") self.assertEqual(os.environ["HTTP_PROXY"], "http://proxy.example:3128") self.assertEqual(headers["Authorization"], "Bearer sk-codex-test") def test_prompt_includes_failed_trial_context(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) window, requests = load_trace_requests(study, study_spec_path=study_path) prompt = build_prompt( study=study, window_summary=summarize_window(requests, window), state=StudyState( study_id=study.study_id, trials=[ TrialSummary( trial_id="trial-0001", status="failed", diagnosis="flashinfer looked promising", config_patch={ "env_patch": {"VLLM_ATTENTION_BACKEND": "FLASHINFER"}, "flag_patch": {"tensor-parallel-size": 4}, }, failure_reason="engine_process_exited_before_ready exit_code=1", ) ], ), capability_profile=None, ) self.assertIn('"status": "failed"', prompt) self.assertIn('"failure_reason": "engine_process_exited_before_ready exit_code=1"', prompt) self.assertIn('"VLLM_ATTENTION_BACKEND": "FLASHINFER"', prompt) self.assertIn("Known launch failures:", prompt) def test_prompt_includes_failure_stage_for_launch_failures(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) window, requests = load_trace_requests(study, study_spec_path=study_path) prompt = build_prompt( study=study, window_summary=summarize_window(requests, window), state=StudyState( study_id=study.study_id, trials=[ TrialSummary( trial_id="trial-0002", status="failed", diagnosis="bad topology", config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 3, "data-parallel-size": 3, }, }, failure_stage="engine_launch", failure_reason="engine_process_exited_before_ready exit_code=1", ) ], ), capability_profile=None, ) self.assertIn('"failure_stage": "engine_launch"', prompt) self.assertIn('"implicated_flag_keys"', prompt) def test_prompt_prioritizes_parallel_space_when_tp_dp_ep_are_tunable(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "enable-expert-parallel": True, "tensor-parallel-size": 4, "data-parallel-size": 2, "expert-parallel-size": 8, }, "tunable_envs": [], "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", "max-num-seqs", ], "topology_constraints": { "require_tp_dp_product_equals_gpu_count": True, "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_expert_parallel_sizes": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) window, requests = load_trace_requests(study, study_spec_path=study_path) prompt = build_prompt( study=study, window_summary=summarize_window(requests, window), state=StudyState(study_id=study.study_id), capability_profile=None, ) self.assertIn("Prioritize exploring legal topology changes in parallel space", prompt) self.assertIn("Parallel space candidates:", prompt) self.assertIn('"tensor_parallel_size": 2', prompt) def test_parse_proposal_text_repairs_truncated_json(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study = load_study_spec(_write_study_assets(tmp_path)) proposal = parse_proposal_text( """ { "observation": "obs", "diagnosis": "diag", "config_patch": { "env_patch": {}, "flag_patch": { "max-num-seqs": 24 } }, "expected_effects": [ "faster batching" ], "why_not_previous_failures": "none" """, study, ) self.assertEqual(proposal.diagnosis, "diag") self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 24) def test_length_only_trace_rows_are_synthesized(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) trace_dir = tmp_path / "trace_windows" / "traces" trace_dir.mkdir(parents=True) trace_path = trace_dir / "chat_len_only.jsonl" with trace_path.open("w", encoding="utf-8") as handle: handle.write( json.dumps( { "timestamp": 0.0, "sampling_u": 0.1, "input_length": 32, "output_length": 16 } ) + "\n" ) windows_path = tmp_path / "trace_windows" / "windows.json" windows_path.write_text( json.dumps( { "windows": [ { "window_id": "w1", "trace_type": "chat", "trace_file": "traces/chat_len_only.jsonl", "window_start": 0.0, "window_end": 10.0 } ] } ), encoding="utf-8", ) study_path = tmp_path / "study.json" study_path.write_text( json.dumps( { "study_id": "study-len-only", "hardware": {"gpu_count": 1}, "model": { "model_id": "m1", "served_model_name": "dummy-model" }, "engine": { "engine_name": "vllm", "exec_path": "/usr/local/bin/vllm", "host": "127.0.0.1", "port": 8000, "ready_timeout_s": 10, "request_timeout_s": 10, "healthcheck_path": "/v1/models", "launch_args": [], "base_envs": {}, "base_flags": {}, "tunable_envs": [], "tunable_flags": [] }, "trace": { "windows_path": str(windows_path), "window_id": "w1", "max_concurrency": 1, "synthetic_prompt_cap_tokens": 8 }, "slo": {"target_pass_rate": 0.95}, "search": {"low": 0.0, "high": 1.0, "tolerance": 0.1, "max_probes": 2, "sample_seed": 1}, "llm": {"system_prompt": "", "max_history_trials": 1} } ), encoding="utf-8", ) study = load_study_spec(study_path) _, requests = load_trace_requests(study, study_spec_path=study_path) self.assertEqual(len(requests), 1) message = requests[0].body["messages"][0]["content"] self.assertEqual(message.count("token"), 8) self.assertEqual(requests[0].body["min_tokens"], 16) self.assertEqual(requests[0].body["max_tokens"], 16) def test_slo_evaluation_step_and_fixed_rules(self) -> None: with tempfile.TemporaryDirectory() as tmp: study = load_study_spec(_write_study_assets(Path(tmp))) outcomes = [ RequestOutcome( request_id="r1", success=True, ttft_ms=1000, tpot_ms=100, prompt_tokens=1000, completion_tokens=16, ), RequestOutcome( request_id="r2", success=True, ttft_ms=6000, tpot_ms=100, prompt_tokens=5000, completion_tokens=16, ), ] evaluations, summary = summarize_evaluations(outcomes, study.slo) self.assertTrue(evaluations[0].passed) self.assertFalse(evaluations[1].passed) self.assertEqual(summary["slo_pass_rate"], 0.5) def test_trace_completion_tokens_override_forces_min_and_max_tokens(self) -> None: with tempfile.TemporaryDirectory() as tmp: study_path = _write_study_assets( Path(tmp), trace_overrides={"completion_tokens_override": 1}, ) study = load_study_spec(study_path) _, requests = load_trace_requests(study, study_spec_path=study_path) self.assertEqual(len(requests), 3) self.assertEqual(requests[0].completion_tokens_hint, 1) self.assertEqual(requests[1].completion_tokens_hint, 1) self.assertEqual(requests[2].completion_tokens_hint, 1) self.assertEqual(requests[0].body["min_tokens"], 1) self.assertEqual(requests[0].body["max_tokens"], 1) self.assertEqual(requests[2].body["min_tokens"], 1) self.assertEqual(requests[2].body["max_tokens"], 1) def test_run_one_request_fails_fixed_length_completion_mismatch(self) -> None: request = TraceRequest( row_id="r1", arrival_s=0.0, sampling_u=0.1, body={"model": "m", "messages": [{"role": "user", "content": "x"}]}, prompt_tokens_hint=8, completion_tokens_hint=2, ) with mock.patch( "aituner.worker.stream_chat_completion", return_value=StreamMetrics( ttft_ms=10.0, tpot_ms=5.0, completion_tokens=1, ), ): outcome = _run_one_request( request, base_url="http://127.0.0.1:8000", timeout_s=1.0, ) self.assertFalse(outcome.success) self.assertEqual(outcome.error, "completion_tokens_mismatch expected=2 actual=1") self.assertEqual(outcome.completion_tokens, 1) def test_build_prompt_mentions_completion_tokens_override(self) -> None: with tempfile.TemporaryDirectory() as tmp: study_path = _write_study_assets( Path(tmp), trace_overrides={"completion_tokens_override": 1}, slo_overrides={"tpot_rule": None}, ) study = load_study_spec(study_path) store = StudyStore(Path(tmp) / ".aituner") store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) window, requests = load_trace_requests(study, study_spec_path=study_path) prompt = build_prompt( study=study, window_summary=summarize_window(requests, window), state=state, capability_profile=None, ) self.assertIn('"completion_tokens_override": 1', prompt) self.assertIn("min_tokens=max_tokens=1", prompt) def test_slo_evaluation_supports_tpot_only_95_percent_target(self) -> None: with tempfile.TemporaryDirectory() as tmp: study = load_study_spec( _write_study_assets( Path(tmp), slo_overrides={ "ttft_rule": None, "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20}, }, ) ) outcomes = [ RequestOutcome( request_id="r1", success=True, ttft_ms=3000, tpot_ms=10, prompt_tokens=1000, completion_tokens=16, ), RequestOutcome( request_id="r2", success=True, ttft_ms=9000, tpot_ms=21, prompt_tokens=5000, completion_tokens=16, ), ] evaluations, summary = summarize_evaluations(outcomes, study.slo) self.assertEqual([item.passed for item in evaluations], [True, False]) self.assertEqual(summary["slo_pass_rate"], 0.5) self.assertFalse(summary["feasible"]) def test_build_launch_recipe_serializes_list_flags_once(self) -> None: with tempfile.TemporaryDirectory() as tmp: study = load_study_spec(_write_study_assets(Path(tmp))) recipe = build_launch_recipe( study.engine, ConfigPatch( flag_patch={ "cuda-graph-sizes": [1, 2, 4], } ), ) self.assertIn("--cuda-graph-sizes", recipe.argv) flag_index = recipe.argv.index("--cuda-graph-sizes") self.assertEqual(recipe.argv[flag_index + 1 : flag_index + 4], ["1", "2", "4"]) self.assertEqual(recipe.argv.count("--cuda-graph-sizes"), 1) def test_prepare_trace_windows_materializes_repo_local_assets(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) legacy_source = tmp_path / "legacy" thinking_source = tmp_path / "thinking" legacy_source.mkdir() thinking_source.mkdir() for filename in [ "qwen_chat_blksz_64_031109-031111", "qwen_chat_blksz_64_031121-031123", "qwen_chat_blksz_64_031209-031211", "qwen_chat_blksz_64_031221-031223", "qwen_chat_blksz_64_031309-031311", "qwen_chat_blksz_64_031321-031323", "qwen_chat_blksz_64_031409-031411", "qwen_chat_blksz_64_031421-031423", "qwen_chat_blksz_64_031509-031511", "qwen_chat_blksz_64_031521-031523", "qwen_chat_blksz_64_031609-031611", "qwen_chat_blksz_64_031621-031623", "qwen_chat_blksz_64_031709-031711", "qwen_chat_blksz_64_031721-031723", ]: for suffix in [".jsonl", "_prompt.jsonl"]: path = legacy_source / f"{filename}{suffix}" path.write_text("", encoding="utf-8") peak_trace = legacy_source / "qwen_chat_blksz_64_031109-031111.jsonl" peak_prompt = legacy_source / "qwen_chat_blksz_64_031109-031111_prompt.jsonl" peak_trace.write_text( "\n".join( [ json.dumps( { "chat_id": "c1", "turn": 1, "timestamp": 3599.0, "input_length": 10, "output_length": 3, } ), json.dumps( { "chat_id": "c2", "turn": 2, "timestamp": 3605.0, "input_length": 20, "output_length": 7, } ), ] ) + "\n", encoding="utf-8", ) peak_prompt.write_text( "\n".join( [ json.dumps({"chat_id": "c1", "turn": 1, "prompt": "ignore me"}), json.dumps({"chat_id": "c2", "turn": 2, "prompt": "real prompt"}), ] ) + "\n", encoding="utf-8", ) output_root = tmp_path / "trace_windows" subprocess.run( [ "python3", "scripts/prepare_trace_windows.py", "--legacy-source", str(legacy_source), "--thinking-source", str(thinking_source), "--output-root", str(output_root), "--workloads", "chat", "--overwrite", ], check=True, cwd=str(REPO_ROOT), ) windows_payload = json.loads((output_root / "windows.json").read_text(encoding="utf-8")) windows = {item["window_id"]: item for item in windows_payload["windows"]} self.assertIn("chat_w20260311_1000", windows) self.assertEqual(windows["chat_w20260311_1000"]["num_requests"], 1) trace_path = output_root / windows["chat_w20260311_1000"]["trace_file"] rows = [json.loads(line) for line in trace_path.read_text(encoding="utf-8").splitlines()] self.assertEqual(len(rows), 1) self.assertEqual(rows[0]["prompt"], "real prompt") self.assertEqual(rows[0]["timestamp"], 5.0) self.assertEqual(rows[0]["output_length"], 7) self.assertIsInstance(rows[0]["sampling_u"], float) def test_prepare_trace_windows_preserves_existing_files_on_failure(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) legacy_source = tmp_path / "legacy" thinking_source = tmp_path / "thinking" output_root = tmp_path / "trace_windows" traces_dir = output_root / "traces" legacy_source.mkdir() thinking_source.mkdir() traces_dir.mkdir(parents=True) for filename in [ "qwen_chat_blksz_64_031109-031111", "qwen_chat_blksz_64_031121-031123", ]: for suffix in [".jsonl", "_prompt.jsonl"]: path = legacy_source / f"{filename}{suffix}" path.write_text( json.dumps( { "chat_id": "c1", "turn": 1, "timestamp": 3605.0, "input_length": 20, "output_length": 7, "prompt": "prompt", } ) + "\n", encoding="utf-8", ) sentinel = traces_dir / "chat_w20260311_1000.jsonl" sentinel.write_text("sentinel\n", encoding="utf-8") proc = subprocess.run( [ "python3", "scripts/prepare_trace_windows.py", "--legacy-source", str(legacy_source), "--thinking-source", str(thinking_source), "--output-root", str(output_root), "--workloads", "chat", "--overwrite", ], cwd=str(REPO_ROOT), capture_output=True, text=True, ) self.assertNotEqual(proc.returncode, 0) self.assertEqual(sentinel.read_text(encoding="utf-8"), "sentinel\n") self.assertEqual(sorted(path.name for path in traces_dir.glob("*.tmp.*")), []) def test_binary_search_max_feasible(self) -> None: result = binary_search_max_feasible( low=0.0, high=1.0, tolerance=0.01, max_probes=8, evaluator=lambda threshold: ThresholdProbe( threshold=threshold, feasible=threshold <= 0.625, payload={"threshold": threshold}, ), ) self.assertLessEqual(result.best_threshold, 0.625) self.assertGreaterEqual(result.best_threshold, 0.5) self.assertIsNotNone(result.best_feasible_payload) def test_binary_search_continues_below_tolerance_when_all_infeasible(self) -> None: seen = [] def evaluator(threshold): seen.append(threshold) return ThresholdProbe( threshold=threshold, feasible=False, payload={"threshold": threshold}, ) result = binary_search_max_feasible( low=0.0, high=1.0, tolerance=0.1, max_probes=6, evaluator=evaluator, ) self.assertIsNone(result.best_feasible_payload) self.assertEqual(len(result.probes), 6) self.assertEqual( seen, [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625], ) def test_trace_max_requests_uses_window_wide_downsample(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) trace_dir = tmp_path / "trace_windows" / "traces" trace_dir.mkdir(parents=True) trace_path = trace_dir / "chat_many.jsonl" with trace_path.open("w", encoding="utf-8") as handle: for idx in range(10): handle.write( json.dumps( { "request_id": f"r{idx}", "timestamp": float(idx), "sampling_u": idx / 10.0, "messages": [{"role": "user", "content": f"hello-{idx}"}], "input_length": 10 + idx, "output_length": 5, } ) + "\n" ) windows_path = tmp_path / "trace_windows" / "windows.json" windows_path.write_text( json.dumps( { "windows": [ { "window_id": "w1", "trace_type": "chat", "trace_file": "traces/chat_many.jsonl", "window_start": 0.0, "window_end": 10.0, } ] } ), encoding="utf-8", ) study_path = tmp_path / "study.json" study_path.write_text( json.dumps( { "study_id": "study-downsample", "hardware": {"gpu_count": 1}, "model": {"model_id": "m1", "served_model_name": "dummy-model"}, "engine": { "engine_name": "vllm", "exec_path": "/usr/local/bin/vllm", "host": "127.0.0.1", "port": 8000, "ready_timeout_s": 10, "request_timeout_s": 10, "healthcheck_path": "/v1/models", "launch_args": [], "base_envs": {}, "base_flags": {}, "tunable_envs": [], "tunable_flags": [], }, "trace": { "windows_path": str(windows_path), "window_id": "w1", "max_concurrency": 1, "max_requests_per_probe": 4, }, "slo": {"target_pass_rate": 0.95}, "search": {"low": 0.0, "high": 1.0, "tolerance": 0.1, "max_probes": 2, "sample_seed": 1}, "llm": {"system_prompt": "", "max_history_trials": 1}, } ), encoding="utf-8", ) study = load_study_spec(study_path) _, requests = load_trace_requests(study, study_spec_path=study_path) self.assertEqual([item.row_id for item in requests], ["r0", "r2", "r5", "r7"]) def test_trace_replay_time_scale_scales_arrivals_and_window(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) trace_dir = tmp_path / "trace_windows" / "traces" trace_dir.mkdir(parents=True) trace_path = trace_dir / "chat_scale.jsonl" trace_path.write_text( json.dumps( { "request_id": "r1", "timestamp": 10.0, "sampling_u": 0.25, "messages": [{"role": "user", "content": "hello"}], "input_length": 16, "output_length": 4, } ) + "\n", encoding="utf-8", ) windows_path = tmp_path / "trace_windows" / "windows.json" windows_path.write_text( json.dumps( { "windows": [ { "window_id": "w1", "trace_type": "chat", "trace_file": "traces/chat_scale.jsonl", "window_start": 0.0, "window_end": 100.0, } ] } ), encoding="utf-8", ) study_path = tmp_path / "study.json" study_path.write_text( json.dumps( { "study_id": "study-scale", "hardware": {"gpu_count": 1}, "model": {"model_id": "m1", "served_model_name": "dummy-model"}, "engine": { "engine_name": "vllm", "exec_path": "/usr/local/bin/vllm", "host": "127.0.0.1", "port": 8000, "ready_timeout_s": 10, "request_timeout_s": 10, "healthcheck_path": "/v1/models", "launch_args": [], "base_envs": {}, "base_flags": {}, "tunable_envs": [], "tunable_flags": [], }, "trace": { "windows_path": str(windows_path), "window_id": "w1", "max_concurrency": 1, "replay_time_scale": 0.1, }, "slo": {"target_pass_rate": 0.95}, "search": {"low": 0.0, "high": 1.0, "tolerance": 0.1, "max_probes": 2, "sample_seed": 1}, "llm": {"system_prompt": "", "max_history_trials": 1}, } ), encoding="utf-8", ) study = load_study_spec(study_path) window, requests = load_trace_requests(study, study_spec_path=study_path) self.assertEqual(window.window_end, 10.0) self.assertEqual(requests[0].arrival_s, 1.0) def test_proposal_validation_and_job_emission(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) proposal_text = json.dumps( { "observation": "Current TTFT fails before TPOT.", "diagnosis": "Prefill pressure dominates.", "config_patch": { "env_patch": {"VLLM_ATTENTION_BACKEND": "FLASHINFER"}, "flag_patch": {"tensor-parallel-size": 4, "max-num-seqs": 64} }, "expected_effects": ["lower TTFT", "raise feasible sampling_u"], "why_not_previous_failures": "Avoids changing unsupported envs." } ) proposal = parse_proposal_text(proposal_text, study) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) job = build_trial_job(study=study, trial=trial, repo_root=tmp_path) jobs_path = tmp_path / "jobs.toml" append_job(jobs_path, job) rendered = jobs_path.read_text(encoding="utf-8") self.assertIn('name = "study-1-trial-0001"', rendered) self.assertIn('command = "python3 -m aituner.cli worker run-trial', rendered) self.assertIn('PYTHONPATH = "src"', rendered) def test_ingest_trial_results_updates_best(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Diag", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}}, "expected_effects": ["raise rate"] } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) Path(trial.result_path).write_text( json.dumps( { "study_id": study.study_id, "trial_id": trial.trial_id, "status": "completed", "best_sampling_u": 0.75, "best_request_rate": 12.5, "best_pass_rate": 0.97 } ), encoding="utf-8", ) next_state = store.ingest_trial_results(study.study_id) self.assertEqual(next_state.best_trial_id, trial.trial_id) self.assertEqual(next_state.best_sampling_u, 0.75) self.assertEqual(next_state.best_request_rate, 12.5) self.assertEqual(next_state.best_parallel_size, 4) self.assertEqual(next_state.best_request_rate_per_gpu, 3.125) self.assertEqual( next_state.best_by_parallel_size["4"]["best_request_rate_per_gpu"], 3.125, ) def test_run_trial_persists_probe_request_details(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["search"]["max_probes"] = 1 study_path.write_text(json.dumps(payload), encoding="utf-8") study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) proposal = Proposal.from_dict( { "observation": "baseline", "diagnosis": "baseline", "config_patch": {"env_patch": {}, "flag_patch": {}}, "expected_effects": ["measure"], } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) def fake_replay(requests, **kwargs): return ( [ RequestOutcome( request_id=request.row_id, success=True, ttft_ms=10.0, tpot_ms=5.0, prompt_tokens=request.prompt_tokens_hint, completion_tokens=request.completion_tokens_hint, ) for request in requests ], False, "", ) process = mock.Mock() process.poll.return_value = 0 with mock.patch("aituner.worker.subprocess.Popen", return_value=process): with mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None): with mock.patch("aituner.worker._terminate_process_tree", return_value=None): with mock.patch("aituner.worker._replay_requests", side_effect=fake_replay): result = run_trial( Path(trial.artifact_dir) / "trial_spec.json" ) self.assertEqual(result["status"], "completed") details_path = Path(trial.artifact_dir) / "probe_details.jsonl" self.assertTrue(details_path.exists()) rows = [ json.loads(line) for line in details_path.read_text(encoding="utf-8").splitlines() ] self.assertEqual(len(rows), 1) self.assertEqual(rows[0]["threshold"], 0.5) self.assertEqual(rows[0]["outcomes"][0]["request_id"], "r1") self.assertEqual(rows[0]["outcomes"][0]["sampling_u"], 0.1) def test_run_trial_marks_full_trace_saturation_as_measurement_ceiling_insufficient( self, ) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) proposal = Proposal.from_dict( { "observation": "baseline", "diagnosis": "baseline", "config_patch": {"env_patch": {}, "flag_patch": {}}, "expected_effects": ["measure"], } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) def fake_replay(requests, **kwargs): return ( [ RequestOutcome( request_id=request.row_id, success=True, ttft_ms=10.0, tpot_ms=5.0, prompt_tokens=request.prompt_tokens_hint, completion_tokens=request.completion_tokens_hint, ) for request in requests ], False, "", ) process = mock.Mock() process.poll.return_value = 0 with mock.patch("aituner.worker.subprocess.Popen", return_value=process): with mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None): with mock.patch("aituner.worker._terminate_process_tree", return_value=None): with mock.patch( "aituner.worker._replay_requests", side_effect=fake_replay, ): result = run_trial(Path(trial.artifact_dir) / "trial_spec.json") self.assertEqual(result["status"], "completed") self.assertEqual(result["best_request_count"], 3) self.assertTrue(result["measurement"]["measurement_ceiling_insufficient"]) self.assertEqual(result["measurement"]["reason"], "measurement_ceiling_insufficient") self.assertIn("auto_high_resolution", result["measurement"]) def test_run_trial_falls_back_below_inherited_search_floor(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["search"]["max_probes"] = 2 study_path.write_text(json.dumps(payload), encoding="utf-8") study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=1, best_sampling_u=0.5, best_request_rate=2.0, best_request_rate_per_gpu=2.0, next_trial_index=2, best_by_parallel_size={ "1": { "trial_id": "trial-0001", "parallel_size": 1, "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_request_rate_per_gpu": 2.0, } }, trials=[], ) proposal = Proposal.from_dict( { "observation": "runtime patch", "diagnosis": "measure even if worse than incumbent", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 2}}, "expected_effects": ["measure"], } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) self.assertEqual(trial.search.low, 0.0) trial_spec_path = Path(trial.artifact_dir) / "trial_spec.json" trial_spec_payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) trial_spec_payload["search"]["low"] = 0.5 trial_spec_path.write_text(json.dumps(trial_spec_payload), encoding="utf-8") def fake_replay(requests, **kwargs): passing = len(requests) <= 1 return ( [ RequestOutcome( request_id=request.row_id, success=True, ttft_ms=10.0 if passing else 10000.0, tpot_ms=5.0 if passing else 1000.0, prompt_tokens=request.prompt_tokens_hint, completion_tokens=request.completion_tokens_hint, ) for request in requests ], False, "", ) process = mock.Mock() process.poll.return_value = 0 with mock.patch("aituner.worker.subprocess.Popen", return_value=process): with mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None): with mock.patch("aituner.worker._terminate_process_tree", return_value=None): with mock.patch("aituner.worker._replay_requests", side_effect=fake_replay): result = run_trial(trial_spec_path) self.assertEqual(result["status"], "completed") self.assertEqual(result["best_source"], "lower_range_fallback") self.assertEqual(result["best_sampling_u"], 0.375) self.assertEqual(result["best_request_rate"], 0.1) self.assertEqual(result["primary_search"]["low"], 0.5) self.assertIsNone(result["primary_search"]["best_request_rate"]) self.assertEqual(result["lower_range_fallback"]["low"], 0.0) self.assertEqual(result["lower_range_fallback"]["high"], 0.5) self.assertEqual(result["lower_range_fallback"]["best_request_rate"], 0.1) self.assertEqual( [probe["threshold"] for probe in result["primary_search"]["probes"]], [0.75, 0.625], ) self.assertEqual( [probe["threshold"] for probe in result["lower_range_fallback"]["probes"]], [0.25, 0.375], ) def test_run_trial_skips_fallback_below_incumbent_floor(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["search"]["max_probes"] = 2 payload["search"]["inherit_incumbent_floor"] = True study_path.write_text(json.dumps(payload), encoding="utf-8") study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=1, best_sampling_u=0.5, best_request_rate=2.0, best_request_rate_per_gpu=2.0, next_trial_index=2, best_by_parallel_size={ "1": { "trial_id": "trial-0001", "parallel_size": 1, "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_request_rate_per_gpu": 2.0, } }, trials=[], ) proposal = Proposal.from_dict( { "observation": "runtime patch", "diagnosis": "primary range all infeasible", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 2}}, "expected_effects": ["measure"], } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) self.assertEqual(trial.search.low, 0.5) self.assertTrue(trial.search.inherit_incumbent_floor) def fake_replay(requests, **kwargs): return ( [ RequestOutcome( request_id=request.row_id, success=True, ttft_ms=10000.0, tpot_ms=1000.0, prompt_tokens=request.prompt_tokens_hint, completion_tokens=request.completion_tokens_hint, ) for request in requests ], False, "", ) process = mock.Mock() process.poll.return_value = 0 with mock.patch("aituner.worker.subprocess.Popen", return_value=process): with mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None): with mock.patch("aituner.worker._terminate_process_tree", return_value=None): with mock.patch("aituner.worker._replay_requests", side_effect=fake_replay): result = run_trial(Path(trial.artifact_dir) / "trial_spec.json") self.assertEqual(result["status"], "completed") self.assertIsNone(result["best_request_rate"]) self.assertEqual(result["best_source"], "primary_search") self.assertEqual(result["primary_search"]["low"], 0.5) self.assertIsNone(result["primary_search"]["best_request_rate"]) self.assertEqual( [probe["threshold"] for probe in result["primary_search"]["probes"]], [0.75, 0.625], ) self.assertEqual(result["lower_range_fallback"]["triggered"], False) self.assertEqual(result["lower_range_fallback"]["skipped"], True) self.assertEqual(result["lower_range_fallback"]["probes"], []) self.assertEqual( result["lower_range_fallback"]["reason"], "primary_search_above_incumbent_floor_all_infeasible", ) self.assertEqual( result["all_infeasible_diagnostics"]["threshold"], 0.625, ) def test_materialize_trial_does_not_mutate_input_state_trials(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) proposal = Proposal.from_dict( { "observation": "baseline", "diagnosis": "baseline", "config_patch": {"env_patch": {}, "flag_patch": {}}, "expected_effects": ["measure"], } ) _, next_state = store.materialize_trial(study=study, state=state, proposal=proposal) self.assertEqual(state.trials, []) self.assertEqual(len(next_state.trials), 1) def test_materialize_trial_uses_full_search_range_with_incumbent(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=4, best_sampling_u=0.375, best_request_rate=3.0, best_request_rate_per_gpu=0.75, next_trial_index=2, best_by_parallel_size={ "4": { "trial_id": "trial-0001", "parallel_size": 4, "best_sampling_u": 0.375, "best_request_rate": 3.0, "best_request_rate_per_gpu": 0.75, } }, trials=[], ) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Diag", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}}, "expected_effects": ["raise rate"], } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) self.assertEqual(trial.search.low, study.search.low) self.assertEqual(trial.search.high, 1.0) def test_materialize_trial_uses_full_search_range_for_same_parallel_group(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=4, best_sampling_u=0.375, best_request_rate=3.0, best_request_rate_per_gpu=0.75, next_trial_index=2, best_by_parallel_size={ "2": { "trial_id": "trial-0000", "parallel_size": 2, "best_sampling_u": 0.125, "best_request_rate": 0.8, "best_request_rate_per_gpu": 0.4, } }, trials=[], ) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Diag", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}}, "expected_effects": ["raise rate"], } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) self.assertEqual(trial.search.low, study.search.low) def test_materialize_trial_can_use_incumbent_floor_when_enabled(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["search"]["inherit_incumbent_floor"] = True study_path.write_text(json.dumps(payload), encoding="utf-8") study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=4, best_sampling_u=0.375, best_request_rate=3.0, best_request_rate_per_gpu=0.75, next_trial_index=2, best_by_parallel_size={ "4": { "trial_id": "trial-0001", "parallel_size": 4, "best_sampling_u": 0.375, "best_request_rate": 3.0, "best_request_rate_per_gpu": 0.75, } }, trials=[], ) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Diag", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}}, "expected_effects": ["raise rate"], } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) self.assertEqual(trial.search.low, 0.375) self.assertTrue(trial.search.inherit_incumbent_floor) def test_materialize_trial_resets_search_floor_for_new_parallel_group(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=4, best_sampling_u=0.4, best_request_rate=3.0, best_request_rate_per_gpu=0.75, next_trial_index=2, trials=[], ) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Diag", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}}, "expected_effects": ["raise rate"], } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) self.assertEqual(trial.search.low, study.search.low) def test_materialize_trial_inherits_incumbent_topology_for_runtime_patch(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "enable-expert-parallel": True, "tensor-parallel-size": 4, "data-parallel-size": 2, "expert-parallel-size": 8, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", "max-num-seqs", ], "topology_constraints": { "require_tp_dp_product_equals_gpu_count": True, "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_expert_parallel_sizes": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_parallel_size=8, best_sampling_u=0.125, best_request_rate=3.0, best_request_rate_per_gpu=0.375, next_trial_index=3, best_by_parallel_size={ "8": { "trial_id": "trial-0002", "parallel_size": 8, "best_sampling_u": 0.125, "best_request_rate": 3.0, "best_request_rate_per_gpu": 0.375, } }, trials=[ TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, best_sampling_u=0.125, best_request_rate=3.0, best_request_rate_per_gpu=0.375, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 4, "expert-parallel-size": 8, }, }, ) ], ) proposal = Proposal.from_dict( { "observation": "Validate runtime headroom around the incumbent.", "diagnosis": "Try lower concurrency on the current best topology.", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}}, "expected_effects": ["validate incumbent runtime headroom"], } ) trial, next_state = store.materialize_trial(study=study, state=state, proposal=proposal) self.assertEqual( trial.config_patch.flag_patch, { "tensor-parallel-size": 2, "data-parallel-size": 4, "max-num-seqs": 160, }, ) self.assertEqual(trial.search.low, study.search.low) self.assertEqual( next_state.trials[-1].config_patch["flag_patch"], { "tensor-parallel-size": 2, "data-parallel-size": 4, "max-num-seqs": 160, }, ) def test_materialize_trial_keeps_explicit_topology_runtime_patch(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "enable-expert-parallel": True, "tensor-parallel-size": 4, "data-parallel-size": 2, "expert-parallel-size": 8, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", "max-num-seqs", ], "topology_constraints": { "require_tp_dp_product_equals_gpu_count": True, "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_expert_parallel_sizes": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = StudyState( study_id=study.study_id, best_trial_id="trial-0002", next_trial_index=3, trials=[ TrialSummary( trial_id="trial-0002", status="completed", config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 4, }, }, ) ], ) proposal = Proposal.from_dict( { "observation": "Validate base topology runtime.", "diagnosis": "Explicitly keep base topology and adjust concurrency.", "config_patch": { "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "data-parallel-size": 2, "max-num-seqs": 160, }, }, "expected_effects": ["test base topology runtime headroom"], } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) self.assertEqual( trial.config_patch.flag_patch, { "tensor-parallel-size": 4, "data-parallel-size": 2, "max-num-seqs": 160, }, ) def test_ingest_trial_results_records_failure_reason(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Diag", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}}, "expected_effects": ["raise rate"] } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) Path(trial.result_path).write_text( json.dumps( { "study_id": study.study_id, "trial_id": trial.trial_id, "status": "failed", "failure_reason": "engine_process_exited_before_ready exit_code=1", "probes": [] } ), encoding="utf-8", ) next_state = store.ingest_trial_results(study.study_id) self.assertEqual(next_state.trials[0].status, "failed") self.assertEqual( next_state.trials[0].failure_reason, "engine_process_exited_before_ready exit_code=1", ) self.assertEqual(next_state.trials[0].failure_stage, "") def test_ingest_trial_results_records_failure_stage(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Diag", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}}, "expected_effects": ["raise rate"] } ) trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) Path(trial.result_path).write_text( json.dumps( { "study_id": study.study_id, "trial_id": trial.trial_id, "status": "failed", "failure_stage": "engine_launch", "failure_reason": "engine_process_exited_before_ready exit_code=1", "probes": [] } ), encoding="utf-8", ) next_state = store.ingest_trial_results(study.study_id) self.assertEqual(next_state.trials[0].failure_stage, "engine_launch") def test_ingest_trial_results_prefers_higher_request_rate_per_gpu(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store = StudyStore(tmp_path / ".aituner" / "studies") store.init_study(spec_path=study_path, study=study) state = store.load_state(study.study_id) proposal_a = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Diag", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}}, "expected_effects": ["raise rate"], } ) trial_a, state = store.materialize_trial(study=study, state=state, proposal=proposal_a) Path(trial_a.result_path).write_text( json.dumps( { "study_id": study.study_id, "trial_id": trial_a.trial_id, "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 4.0, "best_pass_rate": 0.97, } ), encoding="utf-8", ) proposal_b = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Diag", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}}, "expected_effects": ["raise rate"], } ) trial_b, _ = store.materialize_trial(study=study, state=state, proposal=proposal_b) Path(trial_b.result_path).write_text( json.dumps( { "study_id": study.study_id, "trial_id": trial_b.trial_id, "status": "completed", "best_sampling_u": 0.4, "best_request_rate": 3.0, "best_pass_rate": 0.97, } ), encoding="utf-8", ) next_state = store.ingest_trial_results(study.study_id) self.assertEqual(next_state.best_trial_id, trial_b.trial_id) self.assertEqual(next_state.best_parallel_size, 2) self.assertEqual(next_state.best_request_rate, 3.0) self.assertEqual(next_state.best_request_rate_per_gpu, 1.5) self.assertEqual(next_state.best_by_parallel_size["4"]["best_request_rate_per_gpu"], 1.0) self.assertEqual(next_state.best_by_parallel_size["2"]["best_request_rate_per_gpu"], 1.5) def test_validate_proposal_rejects_invalid_tp_dp_product(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "enable-expert-parallel": True, "tensor-parallel-size": 4, "data-parallel-size": 2, "expert-parallel-size": 8, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", ], "topology_constraints": { "require_tp_dp_product_equals_gpu_count": True, "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_expert_parallel_sizes": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Bad topology", "config_patch": { "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 2, "expert-parallel-size": 4, }, }, "expected_effects": ["raise throughput"], } ) with self.assertRaisesRegex(SpecError, "must equal hardware.gpu_count"): validate_proposal(proposal, study) def test_validate_proposal_rejects_invalid_ep_divisibility(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "enable-expert-parallel": True, "tensor-parallel-size": 4, "data-parallel-size": 2, "expert-parallel-size": 8, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", ], "topology_constraints": { "require_tp_dp_product_equals_gpu_count": True, "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_expert_parallel_sizes": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Bad EP", "config_patch": { "env_patch": {}, "flag_patch": { "expert-parallel-size": 3, }, }, "expected_effects": ["raise throughput"], } ) with self.assertRaisesRegex(SpecError, "expert-parallel-size=3"): validate_proposal(proposal, study) def test_validate_proposal_accepts_valid_tp_dp_ep_combo(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "enable-expert-parallel": True, "tensor-parallel-size": 4, "data-parallel-size": 2, "expert-parallel-size": 8, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", ], "topology_constraints": { "require_tp_dp_product_equals_gpu_count": True, "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_expert_parallel_sizes": [1, 2, 4, 8], }, }, ) study = load_study_spec(study_path) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Valid topology", "config_patch": { "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 4, "expert-parallel-size": 4, }, }, "expected_effects": ["raise throughput"], } ) validated = validate_proposal(proposal, study) self.assertEqual(validated.config_patch.flag_patch["tensor-parallel-size"], 2) def test_validate_proposal_accepts_allowed_tp_dp_product_above_gpu_count(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "enable-expert-parallel": False, "tensor-parallel-size": 4, "data-parallel-size": 1, "expert-parallel-size": 1, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", ], "topology_constraints": { "require_tp_dp_product_equals_gpu_count": False, "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "allowed_tp_dp_products": [1, 2, 4, 8], "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_expert_parallel_sizes": [1], }, }, ) study = load_study_spec(study_path) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Allow product 8", "config_patch": { "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "data-parallel-size": 2, "expert-parallel-size": 1, }, }, "expected_effects": ["explore larger topology"], } ) validated = validate_proposal(proposal, study) self.assertEqual(validated.config_patch.flag_patch["data-parallel-size"], 2) def test_validate_proposal_rejects_tp_dp_product_outside_allowed_set(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "enable-expert-parallel": False, "tensor-parallel-size": 4, "data-parallel-size": 1, "expert-parallel-size": 1, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "expert-parallel-size", ], "topology_constraints": { "require_tp_dp_product_equals_gpu_count": False, "require_ep_size_leq_tp_dp_product": True, "require_ep_size_divides_tp_dp_product": True, "allowed_tp_dp_products": [1, 2, 4, 8], "allowed_tensor_parallel_sizes": [1, 2, 3, 4, 8], "allowed_data_parallel_sizes": [1, 2, 3, 4, 8], "allowed_expert_parallel_sizes": [1], }, }, ) study = load_study_spec(study_path) proposal = Proposal.from_dict( { "observation": "Obs", "diagnosis": "Invalid product", "config_patch": { "env_patch": {}, "flag_patch": { "tensor-parallel-size": 3, "data-parallel-size": 2, "expert-parallel-size": 1, }, }, "expected_effects": ["explore invalid topology"], } ) with self.assertRaisesRegex(SpecError, "not in \\[1, 2, 4, 8\\]"): validate_proposal(proposal, study) def test_cli_tune_runs_multiple_manual_proposals(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) proposal1 = tmp_path / "proposal-1.json" proposal2 = tmp_path / "proposal-2.json" proposal1.write_text( json.dumps( { "observation": "trial one", "diagnosis": "conservative", "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}}, "expected_effects": ["stable"], "why_not_previous_failures": "", } ), encoding="utf-8", ) proposal2.write_text( json.dumps( { "observation": "trial two", "diagnosis": "more batching", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}}, "expected_effects": ["higher throughput"], "why_not_previous_failures": "", } ), encoding="utf-8", ) store_root = tmp_path / "store" def fake_run_trial(trial_spec_path: Path) -> dict[str, object]: payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) trial_id = str(payload["trial_id"]) trial_root = Path(payload["artifact_dir"]) if trial_id.endswith("0001"): best_rate = 1.0 best_u = 0.5 else: best_rate = 2.0 best_u = 0.75 result = { "study_id": payload["study_id"], "trial_id": trial_id, "status": "completed", "best_sampling_u": best_u, "best_request_rate": best_rate, "best_pass_rate": 1.0, "best_request_count": 2, "probes": [], } (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8") return result with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial): exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--proposal-file", str(proposal1), "--proposal-file", str(proposal2), ] ) self.assertEqual(exit_code, 0) store = StudyStore(store_root) state = store.load_state("study-1") self.assertEqual(state.best_trial_id, "trial-0002") self.assertEqual(state.best_sampling_u, 0.75) self.assertEqual(state.best_request_rate, 2.0) self.assertEqual(state.next_trial_index, 3) def test_cli_tune_honors_should_stop_proposal(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) proposal_path = tmp_path / "stop.json" proposal_path.write_text( json.dumps( { "observation": "incumbent converged", "diagnosis": "no adjacent harness probe is justified", "config_patch": {"env_patch": {}, "flag_patch": {}}, "expected_effects": ["stop without spending another GPU trial"], "why_not_previous_failures": "not applicable", "should_stop": True, } ), encoding="utf-8", ) store_root = tmp_path / "store" with mock.patch("aituner.cli.run_trial") as run_trial_mock: exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--proposal-file", str(proposal_path), ] ) self.assertEqual(exit_code, 0) run_trial_mock.assert_not_called() store = StudyStore(store_root) state = store.load_state("study-1") self.assertEqual(state.next_trial_index, 1) def test_cli_tune_vetoes_unauthorized_llm_stop(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) spec = json.loads(study_path.read_text(encoding="utf-8")) spec["llm"]["endpoint"] = { "provider": "custom", "base_url": "http://localhost:9/v1", "model": "test-model", "api_key_env": "AITUNER_TEST_KEY", } study_path.write_text(json.dumps(spec), encoding="utf-8") store_root = tmp_path / "store" stop_payload = json.dumps( { "observation": "looks done", "diagnosis": "agent thinks it converged", "config_patch": {"env_patch": {}, "flag_patch": {}}, "expected_effects": ["stop"], "why_not_previous_failures": "n/a", "should_stop": True, } ) buffer = io.StringIO() with mock.patch("aituner.cli.run_trial") as run_trial_mock, mock.patch( "aituner.cli.call_llm_for_proposal", return_value=stop_payload ), contextlib.redirect_stdout(buffer): exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--skip-baseline", "--max-trials", "2", ] ) self.assertEqual(exit_code, 0) run_trial_mock.assert_not_called() executed = json.loads(buffer.getvalue())["executed_trials"] # The first unauthorized LLM stop is vetoed; the second is honored # only after the veto budget is spent. self.assertTrue(any(item.get("stop_vetoed") for item in executed)) honored = [item for item in executed if item.get("stopped")] self.assertTrue(honored) self.assertEqual(honored[-1]["stop_authorized_by"], "llm_after_veto_budget") def test_cli_tune_rejects_repeated_materialized_llm_config(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets( tmp_path, engine_overrides={ "base_flags": { "host": "127.0.0.1", "port": 8000, "tensor-parallel-size": 4, "data-parallel-size": 2, "max-num-seqs": 64, }, "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "max-num-seqs", ], "topology_constraints": { "allowed_tensor_parallel_sizes": [1, 2, 4, 8], "allowed_data_parallel_sizes": [1, 2, 4, 8], "allowed_tp_dp_products": [1, 2, 4, 8], }, }, ) spec = json.loads(study_path.read_text(encoding="utf-8")) spec["llm"]["use_harness"] = False spec["llm"]["endpoint"] = { "provider": "custom", "base_url": "http://localhost:9/v1", "model": "test-model", "api_key_env": "AITUNER_TEST_KEY", } study_path.write_text(json.dumps(spec), encoding="utf-8") study = load_study_spec(study_path) store_root = tmp_path / "store" store = StudyStore(store_root) store.init_study(spec_path=study_path, study=study) store.save_state( StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_parallel_size=8, best_sampling_u=0.125, best_request_rate=3.0, best_request_rate_per_gpu=0.375, next_trial_index=3, trials=[ TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, best_sampling_u=0.125, best_request_rate=3.0, best_request_rate_per_gpu=0.375, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 4, "max-num-seqs": 160, }, }, ) ], ) ) repeated_runtime_patch = json.dumps( { "observation": "Try the same runtime setting.", "diagnosis": "This is duplicate after topology inheritance.", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}}, "expected_effects": ["should be vetoed"], "why_not_previous_failures": "", "should_stop": False, } ) stderr = io.StringIO() with mock.patch("aituner.cli.run_trial") as run_trial_mock, mock.patch( "aituner.cli.call_llm_for_proposal", return_value=repeated_runtime_patch ), contextlib.redirect_stderr(stderr): exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--skip-baseline", "--max-trials", "3", ] ) self.assertEqual(exit_code, 2) run_trial_mock.assert_not_called() self.assertIn("repeats an already tested effective full config", stderr.getvalue()) self.assertIn("trial-0002", stderr.getvalue()) def test_cli_tune_uses_harness_stop_before_llm(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) store_root = tmp_path / "store" store = StudyStore(store_root) store.init_study(spec_path=study_path, study=study) store.save_state( StudyState( study_id=study.study_id, best_trial_id="trial-0002", best_parallel_size=8, best_sampling_u=0.02, best_request_rate=2.4, best_request_rate_per_gpu=0.3, next_trial_index=5, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=0.8, best_request_rate_per_gpu=0.1, config_patch={"env_patch": {}, "flag_patch": {}}, ), TrialSummary( trial_id="trial-0002", status="completed", parallel_size=8, best_request_rate=2.4, best_request_rate_per_gpu=0.3, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 2, "data-parallel-size": 4, }, }, ), TrialSummary( trial_id="trial-0003", status="completed", parallel_size=8, config_patch={ "env_patch": {}, "flag_patch": { "tensor-parallel-size": 1, "data-parallel-size": 8, }, }, ), TrialSummary( trial_id="trial-0004", status="completed", parallel_size=8, config_patch={ "env_patch": {}, "flag_patch": {"max-num-seqs": 160}, }, ), ], ) ) with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock: with mock.patch("aituner.cli.run_trial") as run_trial_mock: exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--max-trials", "5", ] ) self.assertEqual(exit_code, 0) llm_mock.assert_not_called() run_trial_mock.assert_not_called() proposal_path = ( store.study_root(study.study_id) / "proposals" / "harness-stop-0005.json" ) self.assertTrue(proposal_path.exists()) proposal = json.loads(proposal_path.read_text(encoding="utf-8")) self.assertTrue(proposal["should_stop"]) snapshot_path = ( store.study_root(study.study_id) / "harness" / "candidate-set-0005.json" ) self.assertTrue(snapshot_path.exists()) snapshot = json.loads(snapshot_path.read_text(encoding="utf-8")) self.assertEqual(snapshot["schema_version"], 1) self.assertEqual(snapshot["iteration"], 5) self.assertIn("candidate_set_hash", snapshot) self.assertIn("candidate_set", snapshot) self.assertIn("harness_stop", snapshot["decisions"]) self.assertIn("stop_authority", snapshot["decisions"]) state = store.load_state(study.study_id) self.assertEqual(state.tuning_stop_reason, "harness_stop") self.assertEqual( state.tuning_stop_details["proposal_name"], "harness-stop-0005", ) self.assertEqual(state.tuning_stop_details["proposal_source"], "harness") self.assertEqual( state.tuning_stop_details["stop_authorized_by"], "validator", ) self.assertTrue(state.tuning_stop_diagnosis) def test_cli_tune_llm_first_skips_deterministic_harness_proposal(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["llm"]["endpoint"] = { "provider": "custom", "base_url": "http://llm.example/v1", "wire_api": "chat.completions", "model": "test-model", "api_key_env": "OPENAI_API_KEY", } study_path.write_text(json.dumps(payload), encoding="utf-8") study = load_study_spec(study_path) store_root = tmp_path / "store" store = StudyStore(store_root) store.init_study(spec_path=study_path, study=study) store.save_state( StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=8, best_sampling_u=0.25, best_request_rate=1.0, best_request_rate_per_gpu=0.125, next_trial_index=2, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=8, best_request_rate=1.0, best_request_rate_per_gpu=0.125, config_patch={"env_patch": {}, "flag_patch": {}}, ) ], ) ) llm_payload = json.dumps( { "observation": "Use harness evidence but let the LLM choose.", "diagnosis": "Try higher admission concurrency.", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}}, "expected_effects": ["measure admission concurrency"], "why_not_previous_failures": "does not repeat a prior full config", "should_stop": False, } ) def fake_run_trial(trial_spec_path: Path) -> dict[str, object]: payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) trial_root = Path(payload["artifact_dir"]) result = { "study_id": payload["study_id"], "trial_id": payload["trial_id"], "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 1.0, "best_request_count": 2, "probes": [], } (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8") return result with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload) as llm_mock: with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial): exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--skip-baseline", "--max-trials", "2", "--proposal-policy", "llm-first", ] ) self.assertEqual(exit_code, 0) llm_mock.assert_called_once() proposal_root = store.study_root(study.study_id) / "proposals" self.assertTrue((proposal_root / "proposal-0002.json").exists()) self.assertFalse((proposal_root / "harness-proposal-0002.json").exists()) self.assertTrue( (store.study_root(study.study_id) / "harness" / "candidate-set-0002.json").exists() ) def test_cli_tune_records_advisory_llm_out_of_set_candidate_family_gap(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["llm"]["endpoint"] = { "provider": "custom", "base_url": "http://llm.example/v1", "wire_api": "chat.completions", "model": "test-model", "api_key_env": "OPENAI_API_KEY", } study_path.write_text(json.dumps(payload), encoding="utf-8") study = load_study_spec(study_path) store_root = tmp_path / "store" store = StudyStore(store_root) store.init_study(spec_path=study_path, study=study) store.save_state( StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=1, best_sampling_u=0.25, best_request_rate=1.0, best_request_rate_per_gpu=1.0, next_trial_index=2, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=1, best_request_rate=1.0, best_request_rate_per_gpu=1.0, config_patch={ "env_patch": {}, "flag_patch": {"max-num-seqs": 8}, }, ) ], ) ) harness_context = { "experiment_plan": { "planner_version": "test", "candidate_set": { "candidate_set_hash": "candidate-set-test", "eligible_candidates": [ { "candidate_id": "cand-mns16", "action_id": "coordinate_step:max-num-seqs:8->16", "knob_family": "max-num-seqs", "score": 0.8, "effective_config_fingerprint": "not-the-llm-proposal", "config_patch": { "env_patch": {}, "flag_patch": {"max-num-seqs": 16}, }, } ], "blocked_candidates": [], }, "next_action": None, } } llm_payload = json.dumps( { "observation": "Harness is in the right admission direction but too conservative.", "diagnosis": "Try a larger same-operator admission step.", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 24}}, "expected_effects": ["test whether admission capacity was underexplored"], "why_not_previous_failures": "new value and no launch failure evidence", "should_stop": False, } ) def fake_run_trial(trial_spec_path: Path) -> dict[str, object]: trial_payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) trial_root = Path(trial_payload["artifact_dir"]) result = { "study_id": trial_payload["study_id"], "trial_id": trial_payload["trial_id"], "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 2.0, "best_pass_rate": 1.0, "best_request_count": 2, "probes": [], } (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8") return result buffer = io.StringIO() with mock.patch("aituner.cli.build_harness_context", return_value=harness_context): with mock.patch("aituner.llm.build_harness_context", return_value=harness_context): with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload): with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial): with contextlib.redirect_stdout(buffer): exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--skip-baseline", "--max-trials", "2", "--proposal-policy", "llm-first", ] ) self.assertEqual(exit_code, 0) summary = json.loads(buffer.getvalue()) executed = summary["executed_trials"] self.assertEqual(executed[0]["proposal_origin"], "llm_out_of_set") self.assertTrue(executed[0]["candidate_family_gap_path"]) attribution_path = ( store.study_root(study.study_id) / "proposal_attributions" / "proposal-0002.json" ) attribution = json.loads(attribution_path.read_text(encoding="utf-8")) self.assertEqual(attribution["proposal_origin"], "llm_out_of_set") self.assertEqual(attribution["harness_candidate_policy"], "advisory") gap_path = Path(executed[0]["candidate_family_gap_path"]) gap = json.loads(gap_path.read_text(encoding="utf-8")) self.assertEqual(gap["gap_type"], "same_operator_new_step") self.assertEqual(gap["review_status"], "pending") self.assertEqual(gap["changed_knobs"], ["flag:max-num-seqs"]) self.assertEqual(gap["proposal_patch"]["flag_patch"]["max-num-seqs"], 24) self.assertEqual(gap["nearest_harness_candidates"][0]["candidate_id"], "cand-mns16") def test_cli_tune_strict_harness_policy_rejects_llm_out_of_set_proposal(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["llm"]["harness_candidate_policy"] = "strict" payload["llm"]["endpoint"] = { "provider": "custom", "base_url": "http://llm.example/v1", "wire_api": "chat.completions", "model": "test-model", "api_key_env": "OPENAI_API_KEY", } study_path.write_text(json.dumps(payload), encoding="utf-8") study = load_study_spec(study_path) store_root = tmp_path / "store" store = StudyStore(store_root) store.init_study(spec_path=study_path, study=study) store.save_state( StudyState( study_id=study.study_id, best_trial_id="trial-0001", best_parallel_size=1, best_request_rate=1.0, best_request_rate_per_gpu=1.0, next_trial_index=2, trials=[ TrialSummary( trial_id="trial-0001", status="completed", parallel_size=1, best_request_rate=1.0, best_request_rate_per_gpu=1.0, config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 8}}, ) ], ) ) harness_context = { "experiment_plan": { "candidate_set": { "candidate_set_hash": "candidate-set-test", "eligible_candidates": [ { "candidate_id": "cand-mns16", "effective_config_fingerprint": "not-the-llm-proposal", "config_patch": { "env_patch": {}, "flag_patch": {"max-num-seqs": 16}, }, } ], } } } llm_payload = json.dumps( { "observation": "Try an out-of-set candidate.", "diagnosis": "strict mode should reject this.", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 24}}, "expected_effects": ["should not run"], "why_not_previous_failures": "", "should_stop": False, } ) stderr = io.StringIO() with mock.patch("aituner.cli.build_harness_context", return_value=harness_context): with mock.patch("aituner.llm.build_harness_context", return_value=harness_context): with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload): with mock.patch("aituner.cli.run_trial") as run_trial_mock: with contextlib.redirect_stderr(stderr): exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--skip-baseline", "--max-trials", "2", "--proposal-policy", "llm-first", ] ) self.assertEqual(exit_code, 2) run_trial_mock.assert_not_called() self.assertIn("llm.harness_candidate_policy=strict", stderr.getvalue()) def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["llm"]["endpoint"] = { "provider": "custom", "base_url": "http://llm.example/v1", "wire_api": "chat.completions", "model": "test-model", "api_key_env": "OPENAI_API_KEY", } study_path.write_text(json.dumps(payload), encoding="utf-8") store_root = tmp_path / "store" def fake_run_trial(trial_spec_path: Path) -> dict[str, object]: payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) trial_root = Path(payload["artifact_dir"]) result = { "study_id": payload["study_id"], "trial_id": payload["trial_id"], "status": "completed", "best_sampling_u": 0.25, "best_request_rate": 1.0, "best_pass_rate": 1.0, "best_request_count": 2, "probes": [], } (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8") return result llm_payload = json.dumps( { "observation": "baseline done", "diagnosis": "try more batching", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}}, "expected_effects": ["higher throughput"], "why_not_previous_failures": "", "should_stop": False, } ) with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial): with mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload): exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--max-trials", "2", ] ) self.assertEqual(exit_code, 0) store = StudyStore(store_root) state = store.load_state("study-1") self.assertEqual(state.next_trial_index, 3) self.assertEqual(state.trials[0].config_patch, {"env_patch": {}, "flag_patch": {}}) self.assertEqual(state.trials[1].config_patch["flag_patch"], {"max-num-seqs": 64}) def test_cli_tune_stops_when_baseline_is_all_infeasible(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["llm"]["endpoint"] = { "provider": "custom", "base_url": "http://llm.example/v1", "wire_api": "chat.completions", "model": "test-model", "api_key_env": "OPENAI_API_KEY", } study_path.write_text(json.dumps(payload), encoding="utf-8") store_root = tmp_path / "store" def fake_run_trial(trial_spec_path: Path) -> dict[str, object]: payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) trial_root = Path(payload["artifact_dir"]) result = { "study_id": payload["study_id"], "trial_id": payload["trial_id"], "status": "completed", "best_sampling_u": None, "best_request_rate": None, "best_pass_rate": None, "best_request_count": None, "probes": [ { "threshold": 0.5, "feasible": False, "payload": {"pass_rate": 0.0, "request_rate": 2.0}, }, { "threshold": 0.25, "feasible": False, "payload": {"pass_rate": 0.5, "request_rate": 1.0}, }, ], "all_infeasible_diagnostics": { "threshold": 0.25, "request_rate": 1.0, "pass_rate": 0.5, "early_stop_reason": "slo_pass_rate_unrecoverable", "latency_summary": { "ttft_ms": { "count": 2, "mean": 1200.0, "p50": 1100.0, "p95": 1900.0, "p99": 1980.0, }, "tpot_ms": { "count": 2, "mean": 35.0, "p50": 32.0, "p95": 48.0, "p99": 49.0, }, }, }, } (trial_root / "result.json").write_text(json.dumps(result), encoding="utf-8") return result with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial): with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock: exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--max-trials", "3", ] ) self.assertEqual(exit_code, 0) llm_mock.assert_not_called() store = StudyStore(store_root) state = store.load_state("study-1") self.assertEqual(state.next_trial_index, 2) self.assertEqual(len(state.trials), 1) self.assertEqual(state.tuning_stop_reason, "baseline_all_infeasible") self.assertIn("lowest_sampled_request_rate=1", state.tuning_stop_diagnosis) self.assertIn("lowest_probe_ttft_ms", state.tuning_stop_diagnosis) self.assertEqual( state.tuning_stop_details["lowest_probe_latency_ms"]["ttft"]["p95"], 1900.0, ) self.assertEqual( state.tuning_stop_details["lowest_probe_latency_ms"]["tpot"]["p99"], 49.0, ) with mock.patch("aituner.cli.run_trial") as run_trial_mock: with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock: exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--max-trials", "3", ] ) self.assertEqual(exit_code, 0) run_trial_mock.assert_not_called() llm_mock.assert_not_called() def test_cli_tune_max_trials_is_total_budget_on_resume(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) payload = json.loads(study_path.read_text(encoding="utf-8")) payload["llm"]["endpoint"] = { "provider": "custom", "base_url": "http://llm.example/v1", "wire_api": "chat.completions", "model": "test-model", "api_key_env": "OPENAI_API_KEY", } study_path.write_text(json.dumps(payload), encoding="utf-8") store_root = tmp_path / "store" study = load_study_spec(study_path) store = StudyStore(store_root) store.init_study(spec_path=study_path, study=study) state = StudyState( study_id=study.study_id, next_trial_index=3, trials=[ TrialSummary(trial_id="trial-0001", status="completed"), TrialSummary(trial_id="trial-0002", status="completed"), ], ) store.save_state(state) with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock: with mock.patch("aituner.cli.run_trial") as run_trial_mock: exit_code = cli_main( [ "study", "tune", "--spec", str(study_path), "--store-root", str(store_root), "--max-trials", "2", ] ) self.assertEqual(exit_code, 0) llm_mock.assert_not_called() run_trial_mock.assert_not_called() self.assertEqual(store.load_state(study.study_id).next_trial_index, 3) def test_load_compare_spec_requires_window_selection(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) compare_path = tmp_path / "compare.json" compare_path.write_text( json.dumps( { "compare_id": "compare-1", "study_spec_path": str(study_path), "baseline": {"config_patch": {"env_patch": {}, "flag_patch": {}}}, "tuned": {"config_patch": {"env_patch": {}, "flag_patch": {}}}, } ), encoding="utf-8", ) with self.assertRaisesRegex(SpecError, "window_ids or window_selector"): load_compare_spec(compare_path) def test_run_compare_outputs_summary_and_report(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) trace_dir = tmp_path / "trace_windows" / "traces" trace_path = trace_dir / "chat_w2.jsonl" trace_path.write_text( json.dumps( { "request_id": "r4", "timestamp": 0.0, "sampling_u": 0.2, "messages": [{"role": "user", "content": "extra"}], "input_length": 3000, "output_length": 32, } ) + "\n", encoding="utf-8", ) windows_path = tmp_path / "trace_windows" / "windows.json" windows_payload = json.loads(windows_path.read_text(encoding="utf-8")) windows_payload["windows"].append( { "window_id": "chat_w2", "trace_type": "chat", "trace_file": "traces/chat_w2.jsonl", "window_start": 0.0, "window_end": 10.0, "date": "2026-03-12", "slot_token": "1000", "slot_label": "10:00-10:10", } ) windows_payload["windows"][0]["date"] = "2026-03-11" windows_payload["windows"][0]["slot_token"] = "1000" windows_payload["windows"][0]["slot_label"] = "10:00-10:10" windows_path.write_text(json.dumps(windows_payload), encoding="utf-8") compare_path = _write_compare_assets( tmp_path, study_path=study_path, window_ids=["chat_w1", "chat_w2"], ) def fake_run_trial(trial_spec_path: Path) -> dict[str, object]: trial_payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) source_path = Path(trial_payload["study_spec_path"]) actual_spec_path = Path(source_path.read_text(encoding="utf-8").strip()) study_payload = json.loads(actual_spec_path.read_text(encoding="utf-8")) window_id = study_payload["trace"]["window_id"] trial_id = trial_payload["trial_id"] rate_map = { ("chat_w1", "baseline"): 1.0, ("chat_w1", "tuned"): 3.0, ("chat_w2", "baseline"): 3.0, ("chat_w2", "tuned"): 7.0, } best_rate = rate_map[(window_id, trial_id)] result = { "study_id": trial_payload["study_id"], "trial_id": trial_id, "status": "completed", "best_sampling_u": 0.5, "best_request_rate": best_rate, "best_pass_rate": 1.0, "best_request_count": 2, "probes": [], } Path(trial_payload["result_path"]).write_text( json.dumps(result), encoding="utf-8", ) return result with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial): summary = run_compare(compare_path, output_root=tmp_path / ".compare") self.assertEqual(len(summary["windows"]), 2) self.assertEqual(summary["aggregate"]["wins"]["tuned"], 2) self.assertTrue((tmp_path / ".compare" / "summary.json").exists()) self.assertTrue((tmp_path / ".compare" / "report.md").exists()) def test_compare_aggregate_counts_failed_and_no_feasible_windows(self) -> None: summary = _aggregate_summary( [ { "baseline": { "status": "completed", "best_request_rate": 1.0, "best_request_rate_per_gpu": 1.0, }, "tuned": { "status": "completed", "best_request_rate": None, "best_request_rate_per_gpu": None, }, "delta": {"winner": "baseline"}, }, { "baseline": { "status": "failed", "best_request_rate": None, "best_request_rate_per_gpu": None, }, "tuned": { "status": "completed", "best_request_rate": 2.0, "best_request_rate_per_gpu": 2.0, }, "delta": {"winner": "tuned"}, }, ] ) self.assertEqual(summary["baseline_completed_window_count"], 1) self.assertEqual(summary["baseline_failed_window_count"], 1) self.assertEqual(summary["baseline_no_feasible_window_count"], 1) self.assertEqual(summary["tuned_completed_window_count"], 2) self.assertEqual(summary["tuned_failed_window_count"], 0) self.assertEqual(summary["tuned_no_feasible_window_count"], 1) def test_run_compare_resolves_trial_ref_candidate(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) prior_root = tmp_path / "prior-study" trial_dir = prior_root / "trials" / "trial-0002" trial_dir.mkdir(parents=True) trial_spec = { "study_id": "prior-study", "trial_id": "trial-0002", "config_patch": { "env_patch": {}, "flag_patch": {"data-parallel-size": 2}, }, "search": { "low": 0.0, "high": 1.0, "tolerance": 0.01, "max_probes": 8, "sample_seed": 20260325, }, "study_spec_path": str(study_path), "artifact_dir": str(trial_dir), "probe_log_path": str(trial_dir / "probe_history.json"), "engine_log_path": str(trial_dir / "engine.log"), "result_path": str(trial_dir / "result.json"), } (trial_dir / "trial_spec.json").write_text(json.dumps(trial_spec), encoding="utf-8") compare_path = _write_compare_assets( tmp_path, study_path=study_path, window_ids=["chat_w1"], baseline={ "trial_ref": { "study_root": str(prior_root), "trial_id": "trial-0002", } }, ) def fake_run_trial(trial_spec_path: Path) -> dict[str, object]: trial_payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) flags = (trial_payload["config_patch"] or {}).get("flag_patch") or {} best_rate = 5.0 if flags.get("data-parallel-size") == 2 else 2.0 result = { "study_id": trial_payload["study_id"], "trial_id": trial_payload["trial_id"], "status": "completed", "best_sampling_u": 0.5, "best_request_rate": best_rate, "best_pass_rate": 1.0, "best_request_count": 2, "probes": [], } Path(trial_payload["result_path"]).write_text(json.dumps(result), encoding="utf-8") return result with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial): summary = run_compare(compare_path, output_root=tmp_path / ".compare") self.assertEqual(summary["baseline_source"]["kind"], "trial_ref") self.assertEqual( summary["windows"][0]["baseline"]["config_patch"]["flag_patch"]["data-parallel-size"], 2, ) def test_run_compare_window_selector_filters_windows(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) trace_dir = tmp_path / "trace_windows" / "traces" for name in ("chat_w2.jsonl", "thinking_w3.jsonl"): (trace_dir / name).write_text( json.dumps( { "request_id": name, "timestamp": 0.0, "sampling_u": 0.2, "messages": [{"role": "user", "content": name}], "input_length": 3000, "output_length": 32, } ) + "\n", encoding="utf-8", ) windows_path = tmp_path / "trace_windows" / "windows.json" windows_payload = json.loads(windows_path.read_text(encoding="utf-8")) windows_payload["windows"][0]["date"] = "2026-03-11" windows_payload["windows"][0]["slot_token"] = "1000" windows_payload["windows"].append( { "window_id": "chat_w2", "trace_type": "chat", "trace_file": "traces/chat_w2.jsonl", "window_start": 0.0, "window_end": 10.0, "date": "2026-03-12", "slot_token": "1000", } ) windows_payload["windows"].append( { "window_id": "thinking_w3", "trace_type": "thinking", "trace_file": "traces/thinking_w3.jsonl", "window_start": 0.0, "window_end": 10.0, "date": "2026-03-12", "slot_token": "1000", } ) windows_path.write_text(json.dumps(windows_payload), encoding="utf-8") compare_path = _write_compare_assets( tmp_path, study_path=study_path, window_selector={"trace_type": "chat", "date_prefix": "2026-03-12"}, ) def fake_run_trial(trial_spec_path: Path) -> dict[str, object]: trial_payload = json.loads(trial_spec_path.read_text(encoding="utf-8")) result = { "study_id": trial_payload["study_id"], "trial_id": trial_payload["trial_id"], "status": "completed", "best_sampling_u": 0.5, "best_request_rate": 1.0, "best_pass_rate": 1.0, "best_request_count": 2, "probes": [], } Path(trial_payload["result_path"]).write_text(json.dumps(result), encoding="utf-8") return result with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial): summary = run_compare(compare_path, output_root=tmp_path / ".compare") self.assertEqual([row["window_id"] for row in summary["windows"]], ["chat_w2"]) def test_proposal_expected_effects_accepts_string(self) -> None: proposal = Proposal.from_dict( { "observation": "obs", "diagnosis": "diag", "config_patch": {"env_patch": {}, "flag_patch": {}}, "expected_effects": "higher throughput", } ) self.assertEqual(proposal.expected_effects, ["higher throughput"]) def test_proposal_expected_effects_accepts_object(self) -> None: proposal = Proposal.from_dict( { "observation": "obs", "diagnosis": "diag", "config_patch": {"env_patch": {}, "flag_patch": {}}, "expected_effects": { "throughput": "higher", "ttft": "lower", }, } ) self.assertEqual( proposal.expected_effects, ["throughput: higher", "ttft: lower"], ) def test_proposal_observation_accepts_object(self) -> None: proposal = Proposal.from_dict( { "observation": { "incumbent_trial": "trial-0002", "boundary_signal": "tpot cliff", }, "diagnosis": "validate incumbent", "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}}, "expected_effects": ["more TPOT headroom"], } ) self.assertIn('"incumbent_trial": "trial-0002"', proposal.observation) self.assertEqual(proposal.diagnosis, "validate incumbent") def test_proposal_accepts_should_stop(self) -> None: proposal = Proposal.from_dict( { "observation": "obs", "diagnosis": "converged", "config_patch": {"env_patch": {}, "flag_patch": {}}, "expected_effects": ["avoid wasting another GPU trial"], "should_stop": True, } ) self.assertTrue(proposal.should_stop) def test_parse_proposal_text_accepts_wrapped_json(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) study_path = _write_study_assets(tmp_path) study = load_study_spec(study_path) proposal = parse_proposal_text( """Here is the proposal: ```json {"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"} ```""", study, ) self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32) def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None: requests = [ TraceRequest( row_id=f"r{i}", arrival_s=0.0, sampling_u=0.1 * i, body={"model": "m", "messages": [{"role": "user", "content": "x"}]}, prompt_tokens_hint=8, completion_tokens_hint=4, ) for i in range(3) ] outcomes = [ RequestOutcome( request_id="r0", success=False, ttft_ms=None, tpot_ms=None, prompt_tokens=8, completion_tokens=4, error="request_failed", ) ] def fake_run_one_request(*args, **kwargs): return outcomes.pop(0) def fake_evaluate(outcome: RequestOutcome): return type("Eval", (), {"passed": outcome.success})() with mock.patch("aituner.worker._run_one_request", side_effect=fake_run_one_request): replayed, early_stopped, reason = _replay_requests( requests, base_url="http://127.0.0.1:8000", timeout_s=1.0, max_concurrency=1, target_pass_rate=0.95, max_lag_s=None, max_elapsed_s=None, evaluate_outcome=fake_evaluate, ) self.assertTrue(early_stopped) self.assertEqual(reason, "slo_pass_rate_unrecoverable") self.assertEqual(len(replayed), 3) self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable") def test_replay_requests_does_not_wait_for_inflight_after_early_stop(self) -> None: requests = [ TraceRequest( row_id="r0", arrival_s=0.0, sampling_u=0.1, body={"model": "m", "messages": [{"role": "user", "content": "x"}]}, prompt_tokens_hint=8, completion_tokens_hint=4, ), TraceRequest( row_id="r1", arrival_s=0.0, sampling_u=0.2, body={"model": "m", "messages": [{"role": "user", "content": "y"}]}, prompt_tokens_hint=8, completion_tokens_hint=4, ), ] class FakeFuture: def __init__(self, outcome=None, *, should_fail_if_waited=False): self._outcome = outcome self._should_fail_if_waited = should_fail_if_waited def result(self, timeout=None): if self._should_fail_if_waited: raise AssertionError("in-flight future should not be awaited after early stop") return self._outcome def cancel(self): return True done_future = FakeFuture( RequestOutcome( request_id="r0", success=False, ttft_ms=None, tpot_ms=None, prompt_tokens=8, completion_tokens=4, error="request_failed", ) ) inflight_future = FakeFuture(should_fail_if_waited=True) submitted = [] class FakeExecutor: def __init__(self, max_workers): self.max_workers = max_workers def submit(self, fn, request, **kwargs): submitted.append(request.row_id) if request.row_id == "r0": return done_future return inflight_future def shutdown(self, wait=False, cancel_futures=True): return None def fake_wait(futures, timeout=None, return_when=None): self.assertEqual(len(futures), 2) return {done_future}, {inflight_future} def fake_evaluate(outcome: RequestOutcome): return type("Eval", (), {"passed": outcome.success})() with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor): with mock.patch("aituner.worker.wait", side_effect=fake_wait): replayed, early_stopped, reason = _replay_requests( requests, base_url="http://127.0.0.1:8000", timeout_s=30.0, max_concurrency=2, target_pass_rate=0.95, max_lag_s=None, max_elapsed_s=None, evaluate_outcome=fake_evaluate, drain_inflight_on_early_stop=False, ) self.assertEqual(submitted, ["r0", "r1"]) self.assertTrue(early_stopped) self.assertEqual(reason, "slo_pass_rate_unrecoverable") self.assertEqual(len(replayed), 2) self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable") def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None: requests = [ TraceRequest( row_id="r0", arrival_s=0.0, sampling_u=0.1, body={"model": "m", "messages": [{"role": "user", "content": "x"}]}, prompt_tokens_hint=8, completion_tokens_hint=4, ) ] class FakeFuture: def result(self, timeout=None): raise AssertionError("future should not be awaited after elapsed early stop") def cancel(self): return True submitted = [] class FakeExecutor: def __init__(self, max_workers): self.max_workers = max_workers def submit(self, fn, request, **kwargs): submitted.append(request.row_id) return FakeFuture() def shutdown(self, wait=False, cancel_futures=True): return None wait_timeouts: list[float] = [] def fake_wait(futures, timeout=None, return_when=None): wait_timeouts.append(timeout) return set(), set(futures) def fake_evaluate(outcome: RequestOutcome): return type("Eval", (), {"passed": outcome.success})() monotonic_values = iter([0.0, 0.0, 0.4, 1.2]) with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor): with mock.patch("aituner.worker.wait", side_effect=fake_wait): with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)): replayed, early_stopped, reason = _replay_requests( requests, base_url="http://127.0.0.1:8000", timeout_s=30.0, max_concurrency=1, target_pass_rate=0.95, max_lag_s=None, max_elapsed_s=1.0, evaluate_outcome=fake_evaluate, drain_inflight_on_early_stop=False, ) self.assertEqual(submitted, ["r0"]) self.assertTrue(early_stopped) self.assertEqual(reason, "probe_elapsed_s>1.0") self.assertEqual(len(replayed), 1) self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0") self.assertTrue(wait_timeouts) self.assertLessEqual(wait_timeouts[0], 0.5) def test_latency_summary_reports_quantiles_and_slo(self) -> None: study = load_study_spec(_write_study_assets(Path(tempfile.mkdtemp()))) outcomes = [ RequestOutcome( request_id="r1", success=True, ttft_ms=100.0, tpot_ms=10.0, prompt_tokens=100, completion_tokens=10, ), RequestOutcome( request_id="r2", success=True, ttft_ms=200.0, tpot_ms=20.0, prompt_tokens=5000, completion_tokens=10, ), ] evaluations = [evaluate_request(item, study.slo) for item in outcomes] summary = _latency_summary(outcomes=outcomes, evaluations=evaluations, study=study) self.assertEqual(summary["observed_request_count"], 2) self.assertEqual(summary["request_mode"], "chat") self.assertEqual(summary["ttft_ms"]["mean"], 150.0) self.assertEqual(summary["ttft_ms"]["p50"], 100.0) self.assertEqual(summary["ttft_ms"]["p99"], 200.0) self.assertEqual(summary["tpot_ms"]["mean"], 15.0) self.assertEqual(summary["slo"]["target_pass_rate"], 0.95) def test_wait_for_server_or_exit_fails_fast_when_process_exits(self) -> None: process = mock.Mock() process.poll.return_value = 17 with self.assertRaisesRegex(RuntimeError, "engine_process_exited_before_ready exit_code=17"): _wait_for_server_or_exit( process, base_url="http://127.0.0.1:8000", healthcheck_path="/v1/models", ready_timeout_s=10.0, ) def test_terminate_process_tree_kills_process_group(self) -> None: process = mock.Mock() process.pid = 1234 process.poll.return_value = None process.wait.return_value = 0 with mock.patch("aituner.worker.os.getpgid", return_value=1234): with mock.patch( "aituner.worker.os.killpg", side_effect=[None, ProcessLookupError], ) as mock_killpg: _terminate_process_tree(process, timeout_s=1.0) self.assertEqual(mock_killpg.call_args_list[0].args[0], 1234) self.assertEqual(mock_killpg.call_args_list[0].args[1], 15) def test_terminate_process_tree_kills_group_when_parent_already_exited(self) -> None: process = mock.Mock() process.pid = 1234 process.poll.return_value = 0 with mock.patch("aituner.worker.os.getpgid", side_effect=ProcessLookupError): with mock.patch( "aituner.worker.os.killpg", side_effect=[None, ProcessLookupError], ) as mock_killpg: _terminate_process_tree(process, timeout_s=1.0) self.assertEqual(mock_killpg.call_args_list[0].args[0], 1234) process.wait.assert_not_called() def test_terminate_process_tree_signals_marker_processes_when_group_missing(self) -> None: process = mock.Mock() process.pid = 1234 process.poll.return_value = 0 marker_env = {"AITUNER_TRIAL_ID": "trial-0001"} with mock.patch("aituner.worker.os.getpgid", side_effect=ProcessLookupError): with mock.patch("aituner.worker.os.killpg", side_effect=ProcessLookupError): with mock.patch( "aituner.worker._pids_matching_env", side_effect=[[2222], []], ) as mock_pids: with mock.patch("aituner.worker._signal_pids") as mock_signal: _terminate_process_tree( process, timeout_s=1.0, marker_env=marker_env, ) self.assertEqual(mock_pids.call_args_list[0].args[0], marker_env) self.assertEqual(mock_signal.call_args_list[0].args, ([2222], signal.SIGTERM)) def test_openai_url_avoids_double_v1(self) -> None: self.assertEqual( _openai_url("http://example.com", "/v1/chat/completions"), "http://example.com/v1/chat/completions", ) self.assertEqual( _openai_url("http://example.com/v1", "/v1/chat/completions"), "http://example.com/v1/chat/completions", ) def test_stream_chat_completion_handles_missing_usage_and_chunks(self) -> None: class FakeResponse: def __enter__(self): return self def __exit__(self, exc_type, exc, traceback): return False def __iter__(self): return iter([b"data: {\"choices\": []}\n", b"data: [DONE]\n"]) with mock.patch("aituner.http_client._urlopen", return_value=FakeResponse()): metrics = stream_chat_completion( base_url="http://127.0.0.1:8000", body={"model": "m", "messages": [{"role": "user", "content": "x"}]}, timeout_s=1.0, ) self.assertIsNone(metrics.ttft_ms) self.assertIsNone(metrics.tpot_ms) self.assertIsNone(metrics.completion_tokens) self.assertEqual(metrics.completion_tokens_source, "none") def test_loopback_urls_bypass_proxy(self) -> None: self.assertTrue(_should_bypass_proxy("http://127.0.0.1:8000/v1/models")) self.assertTrue(_should_bypass_proxy("http://localhost:8000/health")) self.assertFalse(_should_bypass_proxy("http://example.com/v1/models")) if __name__ == "__main__": unittest.main()