# Source file: aituner/tests/test_core_flow.py
# (8650 lines, 365 KiB, Python)

from __future__ import annotations
import json
import hashlib
import contextlib
import io
import math
import os
import signal
import subprocess
import tempfile
import unittest
from pathlib import Path
from unittest import mock
from aituner.cli import main as cli_main
from aituner.compare import _aggregate_summary, load_compare_spec, run_compare
from aituner.config_signature import materialized_effective_config_signature
from aituner.engine import build_launch_recipe
from aituner.http_client import (
HttpClientError,
StreamMetrics,
_auth_headers,
_openai_url,
_should_bypass_proxy,
stream_chat_completion,
)
from aituner.job import append_job, build_trial_job
from aituner.harness import (
_effective_config_signature,
build_harness_context,
build_harness_guided_proposal,
build_harness_stop_proposal,
)
from aituner.lca import (
build_study_workload_profile,
build_workload_profile,
find_convergence_prefix,
profile_similarity,
resolve_length_mode,
similarity_report,
)
from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal
from aituner.search import ThresholdProbe, binary_search_max_feasible
from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
from aituner.spec import (
AdaptiveStopSpec,
ConfigPatch,
LLMEndpointSpec,
Proposal,
SloSpec,
SpecError,
StudyState,
TrialSummary,
load_study_spec,
)
from aituner.store import StudyStore, resolve_auto_high_search
from aituner.trace import load_trace_requests, summarize_window
from aituner.worker import (
_adaptive_replay_set,
_probe_drain_deadline,
_install_sigterm_as_keyboardinterrupt,
_restore_sigterm,
_should_extend_on_boundary,
_best_feasible_probe_record,
_latency_summary,
_run_one_request,
_replay_requests,
_terminate_process_tree,
_wait_for_server_or_exit,
run_trial,
)
from aituner.trace import TraceRequest, WindowRecord
# Directory one level above the folder that contains this test module
# (index 1 of the resolved file path's ``parents`` sequence).
REPO_ROOT = Path(__file__).resolve().parents[1]
def _write_study_assets(
    tmp_path: Path,
    *,
    trace_overrides: dict[str, object] | None = None,
    slo_overrides: dict[str, object] | None = None,
    engine_overrides: dict[str, object] | None = None,
    search_overrides: dict[str, object] | None = None,
) -> Path:
    """Write a small, self-consistent study fixture below ``tmp_path``.

    The fixture consists of a three-request chat trace
    (``trace_windows/traces/chat_w1.jsonl``), the window index
    (``trace_windows/windows.json``), a capability profile
    (``capability.json``) and the study spec itself (``study.json``).

    The helper is idempotent: calling it again on the same ``tmp_path``
    (for example with different overrides) rewrites the files in place
    instead of failing because the trace directory already exists.

    Args:
        tmp_path: Directory that receives all fixture files.
        trace_overrides: Keys merged (shallow ``dict.update``) into the
            ``trace`` section of the study spec.
        slo_overrides: Keys merged (shallow) into the ``slo`` section.
        engine_overrides: Keys merged (shallow) into the ``engine`` section.
        search_overrides: Keys merged (shallow) into the ``search`` section.

    Returns:
        Path of the written ``study.json``.
    """
    trace_dir = tmp_path / "trace_windows" / "traces"
    # exist_ok keeps repeated calls on one temp directory from raising
    # FileExistsError.
    trace_dir.mkdir(parents=True, exist_ok=True)
    trace_path = trace_dir / "chat_w1.jsonl"
    rows = [
        {
            "request_id": "r1",
            "timestamp": 0.0,
            "sampling_u": 0.10,
            "messages": [{"role": "user", "content": "hello"}],
            "input_length": 1000,
            "output_length": 16,
        },
        {
            "request_id": "r2",
            "timestamp": 1.0,
            "sampling_u": 0.50,
            "messages": [{"role": "user", "content": "world"}],
            "input_length": 5000,
            "output_length": 32,
        },
        {
            "request_id": "r3",
            "timestamp": 2.0,
            "sampling_u": 0.90,
            "messages": [{"role": "user", "content": "!"}],
            "input_length": 20000,
            "output_length": 64,
        },
    ]
    with trace_path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row) + "\n" for row in rows)

    windows_path = tmp_path / "trace_windows" / "windows.json"
    windows_payload = {
        "u_field": "sampling_u",
        "windows": [
            {
                "window_id": "chat_w1",
                "trace_type": "chat",
                "trace_file": "traces/chat_w1.jsonl",
                "window_start": 0.0,
                "window_end": 10.0,
            }
        ],
    }
    windows_path.write_text(json.dumps(windows_payload), encoding="utf-8")

    capability_path = tmp_path / "capability.json"
    capability_path.write_text(
        json.dumps({"prefill_service_by_bucket": {"4k": {"tp4_ms": 320, "tp8_ms": 240}}}),
        encoding="utf-8",
    )

    study_path = tmp_path / "study.json"
    trace_payload: dict[str, object] = {
        "windows_path": str(windows_path),
        "window_id": "chat_w1",
        "u_field": "sampling_u",
        "timestamp_field": "timestamp",
        "max_concurrency": 4,
    }
    if trace_overrides:
        trace_payload.update(trace_overrides)

    study_payload = {
        "study_id": "study-1",
        "hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
        "model": {
            "model_id": "qwen",
            "served_model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507",
        },
        "engine": {
            "engine_name": "vllm",
            "engine_version": "0.1",
            "exec_path": "/usr/local/bin/vllm",
            "cwd": str(tmp_path),
            "host": "127.0.0.1",
            "port": 8000,
            "healthcheck_path": "/v1/models",
            "ready_timeout_s": 30,
            "request_timeout_s": 30,
            "launch_args": ["serve", "/models/qwen"],
            "base_envs": {"BASE_ENV": "1"},
            "base_flags": {"host": "127.0.0.1", "port": 8000},
            "tunable_envs": ["VLLM_ATTENTION_BACKEND"],
            "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
            "python_executable": "python3",
        },
        "trace": trace_payload,
        "slo": {
            "target_pass_rate": 0.95,
            "ttft_rule": {
                "kind": "step_ms",
                "buckets": [
                    {"max_input_tokens": 4096, "threshold_ms": 2000},
                    {"max_input_tokens": 16384, "threshold_ms": 5000},
                    {"threshold_ms": 9000},
                ],
            },
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 120},
        },
        "search": {
            "low": 0.0,
            "high": 1.0,
            "tolerance": 0.01,
            "max_probes": 8,
            "sample_seed": 20260325,
        },
        "llm": {"system_prompt": "Tune it.", "max_history_trials": 8},
        "capability_profile_path": str(capability_path),
    }
    # Shallow-merge the per-section overrides; a nested dict in an override
    # replaces the corresponding default wholesale.
    section_overrides = (
        ("slo", slo_overrides),
        ("engine", engine_overrides),
        ("search", search_overrides),
    )
    for section, overrides in section_overrides:
        if overrides:
            study_payload[section].update(overrides)

    study_path.write_text(json.dumps(study_payload), encoding="utf-8")
    return study_path
def _write_compare_assets(
    tmp_path: Path,
    *,
    study_path: Path,
    window_ids: list[str] | None = None,
    window_selector: dict[str, object] | None = None,
    baseline: dict[str, object] | None = None,
    tuned: dict[str, object] | None = None,
) -> Path:
    """Write ``compare.json`` under ``tmp_path`` and return its path.

    ``baseline`` and ``tuned`` fall back to built-in defaults when they are
    omitted or falsy. ``window_ids`` and ``window_selector`` are only written
    to the spec when they are not ``None``.
    """
    fallback_baseline = {"config_patch": {"env_patch": {}, "flag_patch": {}}}
    fallback_tuned = {
        "config_patch": {
            "env_patch": {},
            "flag_patch": {"tensor-parallel-size": 2},
        }
    }
    spec: dict[str, object] = {
        "compare_id": "compare-1",
        "study_spec_path": str(study_path),
        "baseline": baseline or fallback_baseline,
        "tuned": tuned or fallback_tuned,
    }
    # Optional keys are appended in a fixed order and only when provided.
    optional_fields = (
        ("window_ids", window_ids),
        ("window_selector", window_selector),
    )
    for key, value in optional_fields:
        if value is not None:
            spec[key] = value

    target = tmp_path / "compare.json"
    target.write_text(json.dumps(spec), encoding="utf-8")
    return target
class CoreFlowTests(unittest.TestCase):
def test_trace_and_prompt_flow(self) -> None:
    # Smoke test: load the fixture study, initialise a store, summarise the
    # trace window and check that the built prompt mentions the key sections.
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        spec_path = _write_study_assets(root)
        study = load_study_spec(spec_path)

        store = StudyStore(root / ".aituner" / "studies")
        study_root = store.init_study(spec_path=spec_path, study=study)
        state = store.load_state(study.study_id)

        window, requests = load_trace_requests(study, study_spec_path=spec_path)
        summary = summarize_window(requests, window)
        self.assertEqual(summary["request_count"], 3)
        self.assertEqual(summary["request_rate"], 0.3)

        prompt = build_prompt(
            study=study,
            window_summary=summary,
            state=state,
            capability_profile={"queueing_knee_by_bucket": {"4k": 1000}},
        )
        expected_fragments = (
            "allowed_flag_keys",
            "study-1",
            '"current_best"',
            "queueing_knee_by_bucket",
            "Harnesses:",
            "workload_lca_profile",
            "knob_harnesses",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, prompt)
        self.assertTrue(study_root.exists())
def test_search_auto_high_schema_is_backward_compatible(self) -> None:
    # A spec without an "auto_high" section loads with auto-high disabled,
    # and resolving it leaves the configured high bound untouched.
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(Path(tmp), search_overrides={"high": 0.4})
        study = load_study_spec(spec_path)
        self.assertFalse(study.search.auto_high.enabled)

        resolved = resolve_auto_high_search(
            search=study.search,
            sampling_us=[0.1, 0.9],
        )
        resolved_search, evidence = resolved
        self.assertEqual(resolved_search.high, 0.4)
        self.assertEqual(evidence["reason"], "auto_high_disabled")
def test_search_auto_high_caps_at_policy_and_trace(self) -> None:
    # Exercises the four auto-high outcomes: raised to the policy cap,
    # raised to the trace maximum, rejected because the ceiling is below
    # `low`, and lowered to the ceiling.
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(
            Path(tmp),
            search_overrides={
                "high": 0.2,
                "auto_high": {
                    "enabled": True,
                    "max_sampling_u": 0.8,
                    "require_human_confirmation_beyond_trace": True,
                },
            },
        )
        study = load_study_spec(spec_path)
        base_search = study.search

        def search_with_bounds(low: float, high: float):
            # Same tolerance/probe budget/seed as the loaded spec, with a
            # fresh auto_high policy dict for every variant.
            return base_search.__class__.from_dict(
                {
                    "low": low,
                    "high": high,
                    "tolerance": base_search.tolerance,
                    "max_probes": base_search.max_probes,
                    "sample_seed": base_search.sample_seed,
                    "auto_high": {
                        "enabled": True,
                        "max_sampling_u": 0.8,
                        "require_human_confirmation_beyond_trace": True,
                    },
                }
            )

        # Trace reaches 0.9, policy allows 0.8 -> policy wins.
        capped_by_policy, policy_evidence = resolve_auto_high_search(
            search=base_search,
            sampling_us=[0.1, 0.9],
        )
        self.assertEqual(capped_by_policy.high, 0.8)
        self.assertEqual(
            policy_evidence["reason"],
            "search_high_raised_to_trace_ceiling",
        )

        # Trace only reaches 0.7 -> trace wins.
        capped_by_trace, trace_evidence = resolve_auto_high_search(
            search=base_search,
            sampling_us=[0.1, 0.7],
        )
        self.assertEqual(capped_by_trace.high, 0.7)
        self.assertEqual(trace_evidence["effective_ceiling"], 0.7)

        # `low` already above the ceiling -> search is returned unchanged.
        low_above_ceiling = search_with_bounds(0.9, 0.95)
        unchanged, invalid_evidence = resolve_auto_high_search(
            search=low_above_ceiling,
            sampling_us=[0.1, 0.9],
        )
        self.assertEqual(unchanged.low, 0.9)
        self.assertEqual(unchanged.high, 0.95)
        self.assertEqual(
            invalid_evidence["reason"],
            "auto_high_ceiling_below_search_low",
        )

        # `high` above the ceiling -> lowered to it.
        high_search = search_with_bounds(0.0, 0.95)
        lowered, lowered_evidence = resolve_auto_high_search(
            search=high_search,
            sampling_us=[0.1, 0.9],
        )
        self.assertEqual(lowered.high, 0.8)
        self.assertEqual(
            lowered_evidence["reason"],
            "search_high_lowered_to_trace_ceiling",
        )
def test_effective_config_signature_treats_noop_patch_as_baseline(self) -> None:
    # Re-stating a base flag value (as int or as string) must hash to the
    # baseline signature; an actually different value must not.
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(
            Path(tmp),
            engine_overrides={
                "base_flags": {
                    "host": "127.0.0.1",
                    "port": 8000,
                    "tensor-parallel-size": 8,
                    "data-parallel-size": 1,
                    "gpu-memory-utilization": 0.5,
                    "max-num-seqs": 8,
                },
            },
        )
        study = load_study_spec(spec_path)

        def signature(flag_patch: dict[str, object]):
            return _effective_config_signature(
                study,
                {"env_patch": {}, "flag_patch": flag_patch},
            )

        baseline = signature({})
        noop_tp = signature({"tensor-parallel-size": 8})
        noop_tp_string = signature({"tensor-parallel-size": "8"})
        changed_tp = signature({"tensor-parallel-size": 4})

        self.assertEqual(baseline, noop_tp)
        self.assertEqual(baseline, noop_tp_string)
        self.assertNotEqual(baseline, changed_tp)
def test_materialized_signature_inherits_incumbent_topology(self) -> None:
    # A runtime-only patch and an explicit patch spelling out the incumbent
    # trial's topology (as strings) must yield the same signature.
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        spec_path = _write_study_assets(
            root,
            engine_overrides={
                "base_flags": {
                    "host": "127.0.0.1",
                    "port": 8000,
                    "tensor-parallel-size": 4,
                    "data-parallel-size": 2,
                    "max-num-seqs": 64,
                },
                "tunable_flags": [
                    "tensor-parallel-size",
                    "data-parallel-size",
                    "max-num-seqs",
                ],
                "topology_constraints": {
                    "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                    "allowed_data_parallel_sizes": [1, 2, 4, 8],
                    "allowed_tp_dp_products": [1, 2, 4, 8],
                },
            },
        )
        study = load_study_spec(spec_path)

        incumbent = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=8,
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "data-parallel-size": 4,
                    "max-num-seqs": 160,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            trials=[incumbent],
        )

        runtime_only = Proposal.from_dict(
            {
                "observation": "Try the same runtime cap.",
                "diagnosis": "This should materialize on incumbent topology.",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}},
                "expected_effects": ["no-op after topology inheritance"],
            }
        )
        explicit = Proposal.from_dict(
            {
                "observation": "Explicit duplicate.",
                "diagnosis": "Same effective execution config.",
                "config_patch": {
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": "2",
                        "data-parallel-size": "4",
                        "max-num-seqs": "160",
                    },
                },
                "expected_effects": ["same config"],
            }
        )

        runtime_only_signature = materialized_effective_config_signature(
            study=study,
            state=state,
            proposal=runtime_only,
        )
        explicit_signature = materialized_effective_config_signature(
            study=study,
            state=state,
            proposal=explicit,
        )
        self.assertEqual(runtime_only_signature, explicit_signature)
def test_lca_workload_profile_uses_standard_10d_features(self) -> None:
    # Two 100-token prompts sharing hash block 1 (block size 64) over a 4 s
    # window on 2 GPUs; checks vector length plus cache and arrival stats.
    window = WindowRecord(
        window_id="w1",
        trace_path=Path("trace.jsonl"),
        trace_type="chat",
        window_start=0.0,
        window_end=4.0,
        source_payload={"block_size": 64},
    )
    first = TraceRequest(
        row_id="r1",
        arrival_s=0.0,
        sampling_u=1.0,
        body={},
        prompt_tokens_hint=100,
        completion_tokens_hint=10,
        metadata={"hash_ids": [1, 2]},
    )
    second = TraceRequest(
        row_id="r2",
        arrival_s=1.0,
        sampling_u=1.0,
        body={},
        prompt_tokens_hint=100,
        completion_tokens_hint=20,
        metadata={"hash_ids": [1, 3]},
    )
    profile = build_workload_profile(
        [first, second],
        window,
        gpu_count=2,
        length_mode="total",
    )

    self.assertEqual(len(profile.feature_names), 10)
    self.assertEqual(len(profile.vector), 10)
    self.assertEqual(profile.feature_names[0], "L.log_mean_length")

    # 64 reused tokens out of 230 total (200 prompt + 30 completion).
    self.assertAlmostEqual(profile.stats["cache"]["total_hit_length"], 64.0)
    self.assertAlmostEqual(profile.stats["cache"]["hit_rate"], 64.0 / 230.0)
    self.assertAlmostEqual(profile.stats["cache"]["input_hit_rate"], 64.0 / 200.0)
    self.assertAlmostEqual(profile.vector[3], math.log1p(32.0))
    self.assertAlmostEqual(profile.vector[5], 1.0)

    self.assertAlmostEqual(profile.stats["arrival"]["request_rate_per_gpu"], 0.25)
    self.assertAlmostEqual(profile.stats["arrival"]["fano_1s"], 0.5)
    self.assertEqual(resolve_length_mode(request_mode="decode_only"), "output")
def test_harness_context_uses_canonical_lca_vector(self) -> None:
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(Path(tmp))
        study = load_study_spec(spec_path)
        window, requests = load_trace_requests(study, study_spec_path=spec_path)
        profile = build_study_workload_profile(study, requests, window)
        empty_state = StudyState(study_id=study.study_id, trials=[])
        summary = summarize_window(requests, window)

        with_profile = build_harness_context(
            study=study,
            window_summary=summary,
            state=empty_state,
            workload_profile=profile,
        )
        block = with_profile["workload_lca_profile"]
        # With a profile supplied, the block carries the canonical 10-dim
        # vector rather than an ad-hoc rendering.
        self.assertEqual(block["vector"], profile.vector)
        self.assertEqual(len(block["vector"]), 10)
        self.assertIn("RobustScaler", block["metric"])

        # Omitting the profile yields the legacy block, which has no vector.
        without_profile = build_harness_context(
            study=study,
            window_summary=summary,
            state=empty_state,
        )
        legacy = without_profile["workload_lca_profile"]
        self.assertNotIn("vector", legacy)
def _steady_requests(self, count: int, *, input_tokens: int = 100) -> list:
    """Return ``count`` identical requests arriving one second apart.

    Request ``i`` has row id ``r{i}`` and arrival time ``float(i)``; every
    request has ``input_tokens`` prompt tokens, 16 completion tokens and no
    hash ids.
    """
    steady = []
    for index in range(count):
        steady.append(
            TraceRequest(
                row_id=f"r{index}",
                arrival_s=float(index),
                sampling_u=1.0,
                body={},
                prompt_tokens_hint=input_tokens,
                completion_tokens_hint=16,
                metadata={"hash_ids": None},
            )
        )
    return steady
def _conv_window(self) -> WindowRecord:
    """Return the ``conv`` chat window (start == end == 0.0, block size 64)."""
    window = WindowRecord(
        window_id="conv",
        trace_path=Path("trace.jsonl"),
        trace_type="chat",
        window_start=0.0,
        window_end=0.0,
        source_payload={"block_size": 64},
    )
    return window
def test_convergence_prefix_stops_early_on_stationary_trace(self) -> None:
    steady = self._steady_requests(60)
    window = self._conv_window()
    point = find_convergence_prefix(
        steady,
        window,
        gpu_count=1,
        length_mode="total",
        tau=0.9,
        tau_c=0.9,
        stable_checks=3,
        max_checks=20,
        min_fraction=0.1,
    )
    self.assertTrue(point.converged)
    # On a stationary trace the prefix must end strictly before the full
    # request list.
    total = len(steady)
    self.assertLess(point.stop_index, total)
    self.assertLess(point.fraction, 1.0)
    self.assertTrue(point.checks)
def test_convergence_prefix_waits_when_cache_warms_late(self) -> None:
    window = self._conv_window()
    # Cold half: each request carries a unique hash id, so nothing is reused.
    cold_half = [
        TraceRequest(
            row_id=f"cold{i}",
            arrival_s=float(i),
            sampling_u=1.0,
            body={},
            prompt_tokens_hint=640,
            completion_tokens_hint=16,
            metadata={"hash_ids": [10_000 + i]},
        )
        for i in range(30)
    ]
    # Warm half: every request carries the same five hash ids, so the C
    # dimension only settles once this regime has been seen.
    warm_half = [
        TraceRequest(
            row_id=f"warm{i}",
            arrival_s=float(30 + i),
            sampling_u=1.0,
            body={},
            prompt_tokens_hint=640,
            completion_tokens_hint=16,
            metadata={"hash_ids": [1, 2, 3, 4, 5]},
        )
        for i in range(30)
    ]
    requests = cold_half + warm_half

    point = find_convergence_prefix(
        requests,
        window,
        gpu_count=1,
        length_mode="total",
        tau=0.9,
        tau_c=0.95,
        stable_checks=2,
        max_checks=20,
        min_fraction=0.1,
    )
    # At least one check taken within the first 40% must report a C-family
    # similarity below 0.9.
    early_checks = [check for check in point.checks if check["fraction"] <= 0.4]
    self.assertTrue(early_checks)
    self.assertTrue(
        any(check["family_similarity"]["C"] < 0.9 for check in early_checks)
    )
def test_stop_authority_mirrors_validator_and_blocks_fresh_stop(self) -> None:
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(Path(tmp))
        study = load_study_spec(spec_path)
        fresh_state = StudyState(study_id=study.study_id, trials=[])
        context = build_harness_context(
            study=study,
            window_summary={},
            state=fresh_state,
        )
        authority = context["stop_authority"]
        validator_verdict = context["harness_stop"]["should_stop"]
        # The authority must agree with the harness stop verdict, and a
        # study with no trials must not be authorised to stop.
        self.assertEqual(authority["authorized"], validator_verdict)
        self.assertFalse(authority["authorized"])
def test_adaptive_replay_set_truncates_only_when_enabled(self) -> None:
    from types import SimpleNamespace

    def study_stub(adaptive_stop: AdaptiveStopSpec) -> SimpleNamespace:
        # Minimal stand-in exposing only the attributes set below.
        return SimpleNamespace(
            trace=SimpleNamespace(
                adaptive_stop=adaptive_stop,
                request_mode="chat",
            ),
            hardware=SimpleNamespace(gpu_count=1),
        )

    requests = self._steady_requests(60)
    window = self._conv_window()

    # Enabled: a certificate is returned and the replay set has exactly
    # `stop_index` requests.
    enabled_study = study_stub(
        AdaptiveStopSpec(
            enabled=True,
            tau=0.9,
            tau_c=0.9,
            stable_checks=3,
            max_checks=20,
            min_fraction=0.1,
        )
    )
    replay, certificate = _adaptive_replay_set(
        requests, study=enabled_study, window=window
    )
    self.assertIsNotNone(certificate)
    self.assertTrue(certificate["enabled"])
    self.assertEqual(len(replay), certificate["stop_index"])
    self.assertLessEqual(len(replay), len(requests))

    # Disabled: no certificate and the full request list is kept.
    disabled_study = study_stub(AdaptiveStopSpec(enabled=False))
    passthrough, no_cert = _adaptive_replay_set(
        requests, study=disabled_study, window=window
    )
    self.assertIsNone(no_cert)
    self.assertEqual(len(passthrough), len(requests))
def test_boundary_guard_extends_only_near_the_slo_knee(self) -> None:
    converged = {"converged": True}

    def extends(
        pass_rate: float,
        *,
        certificate: object = converged,
        truncated: bool = True,
        boundary_delta: float = 0.02,
    ) -> object:
        # Target pass rate is fixed at 0.95; the defaults describe a
        # truncated, converged probe with a 0.02 guard band.
        return _should_extend_on_boundary(
            pass_rate=pass_rate,
            target_pass_rate=0.95,
            certificate=certificate,
            truncated=truncated,
            boundary_delta=boundary_delta,
        )

    # Within the guard band on either side of the target -> extend.
    self.assertTrue(extends(0.961))
    self.assertTrue(extends(0.946))

    # Far above or far below the target -> keep the truncated verdict.
    self.assertFalse(extends(0.99))
    self.assertFalse(extends(0.50))

    # Exactly on target, but one precondition is missing each time:
    # not truncated, not converged, zero-width band, no certificate.
    self.assertFalse(extends(0.95, truncated=False))
    self.assertFalse(extends(0.95, certificate={"converged": False}))
    self.assertFalse(extends(0.95, boundary_delta=0.0))
    self.assertFalse(extends(0.95, certificate=None))
def test_probe_drain_deadline_tracks_admitted_set_and_caps_at_ceiling(self) -> None:
    slo = SloSpec.from_dict(
        {
            "target_pass_rate": 0.95,
            "ttft_rule": {"kind": "linear_ms", "intercept_ms": 4000, "per_token_ms": 0.125},
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
        }
    )

    def make_request(arrival_s: float, in_tok: int, out_tok: int) -> TraceRequest:
        return TraceRequest(
            row_id="r",
            arrival_s=arrival_s,
            sampling_u=0.1,
            body={},
            prompt_tokens_hint=in_tok,
            completion_tokens_hint=out_tok,
            metadata={},
        )

    # 100 requests spaced 5 s apart (last arrival at 495 s), each with
    # 8000 input / 2000 output tokens.
    admitted = [make_request(float(i * 5), 8000, 2000) for i in range(100)]

    # Expected deadline:
    #   last_arrival + (ttft_ms + out_tokens * tpot_ms) / 1000 + margin
    #   = 495 + (5000 + 2000 * 50) / 1000 + 30 = 630
    uncapped = _probe_drain_deadline(admitted, slo, ceiling=1000.0)
    self.assertAlmostEqual(uncapped, 630.0, places=3)

    # A ceiling below the computed deadline wins.
    self.assertEqual(_probe_drain_deadline(admitted, slo, ceiling=400.0), 400.0)
    # An empty request list falls back to the ceiling.
    self.assertEqual(_probe_drain_deadline([], slo, ceiling=400.0), 400.0)
def test_linear_ms_ttft_rule_scales_with_input_length(self) -> None:
    slo = SloSpec.from_dict(
        {
            "target_pass_rate": 0.95,
            "ttft_rule": {"kind": "linear_ms", "intercept_ms": 4000, "per_token_ms": 0.125},
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
        }
    )

    def evaluate(prompt_tokens: int, ttft_ms: float):
        outcome = RequestOutcome(
            request_id="r",
            success=True,
            ttft_ms=ttft_ms,
            tpot_ms=10.0,
            prompt_tokens=prompt_tokens,
            completion_tokens=8,
        )
        return evaluate_request(outcome, slo)

    # Threshold is 4000 + 0.125 * prompt_tokens:
    # 8000 tokens -> 5000 ms, 0 tokens -> 4000 ms.
    cases = (
        (8000, 4900, True),
        (8000, 5100, False),
        (0, 3900, True),
        (0, 4100, False),
    )
    for prompt_tokens, ttft_ms, should_pass in cases:
        verdict = evaluate(prompt_tokens, ttft_ms).passed
        if should_pass:
            self.assertTrue(verdict)
        else:
            self.assertFalse(verdict)
def test_streaming_socket_timeout_is_a_failed_request_not_a_crash(self) -> None:
    # With the transport patched to raise TimeoutError, the HTTP client must
    # raise HttpClientError and the worker must report a failed outcome
    # instead of letting the exception escape.
    base_url = "http://127.0.0.1:1/v1"
    timed_out = mock.patch(
        "aituner.http_client._urlopen", side_effect=TimeoutError("timed out")
    )
    with timed_out:
        with self.assertRaises(HttpClientError):
            stream_chat_completion(
                base_url=base_url,
                body={"messages": [{"role": "user", "content": "hi"}], "stream": True},
                timeout_s=0.5,
            )

        request = TraceRequest(
            row_id="r",
            arrival_s=0.0,
            sampling_u=1.0,
            body={"messages": [{"role": "user", "content": "hi"}], "stream": True},
            prompt_tokens_hint=10,
            completion_tokens_hint=None,
        )
        outcome = _run_one_request(
            request,
            base_url=base_url,
            timeout_s=0.5,
        )
        self.assertFalse(outcome.success)
        self.assertIn("timed out", outcome.error)
def test_sigterm_is_converted_to_keyboardinterrupt(self) -> None:
    # After installing the handler, a SIGTERM delivered to this process must
    # surface as KeyboardInterrupt, so a killed `study tune` reaches its
    # engine-teardown `finally` rather than leaving workers behind.
    saved_handler = _install_sigterm_as_keyboardinterrupt()
    try:
        with self.assertRaises(KeyboardInterrupt):
            signal.raise_signal(signal.SIGTERM)
    finally:
        # Always put the previous handler back, even if the assertion fails.
        _restore_sigterm(saved_handler)
def test_lca_similarity_matrix_separates_different_profiles(self) -> None:
    template = WindowRecord(
        window_id="base",
        trace_path=Path("trace.jsonl"),
        trace_type="chat",
        window_start=0.0,
        window_end=4.0,
        source_payload={"block_size": 64},
    )

    def make_profile(window_id: str, input_tokens: int, *, arrival_gap: float) -> object:
        # Two requests per window; the second arrives `arrival_gap` later
        # and extends the first request's hash ids by one entry.
        layout = (
            ("1", 0.0, [window_id, 1]),
            ("2", arrival_gap, [window_id, 1, 2]),
        )
        reqs = [
            TraceRequest(
                row_id=f"{window_id}-{suffix}",
                arrival_s=arrival,
                sampling_u=1.0,
                body={},
                prompt_tokens_hint=input_tokens,
                completion_tokens_hint=16,
                metadata={"hash_ids": hash_ids},
            )
            for suffix, arrival, hash_ids in layout
        ]
        own_window = WindowRecord(
            window_id=window_id,
            trace_path=template.trace_path,
            trace_type=template.trace_type,
            window_start=template.window_start,
            window_end=template.window_end,
            source_payload=template.source_payload,
        )
        return build_workload_profile(
            reqs,
            own_window,
            gpu_count=1,
            length_mode="total",
        )

    twin_a = make_profile("same-a", 100, arrival_gap=1.0)
    twin_b = make_profile("same-b", 100, arrival_gap=1.0)
    outlier = make_profile("different", 10000, arrival_gap=0.1)
    report = similarity_report([twin_a, twin_b, outlier])

    self.assertAlmostEqual(profile_similarity(twin_a, twin_b), 1.0)
    matrix = report["matrix"]
    self.assertGreater(matrix[0][1], matrix[0][2])
    self.assertIn("L", report["pairs"][2]["family_similarity"])
def test_cli_profile_window_outputs_lca_profile(self) -> None:
    # `profile window` must exit 0 and print a JSON document whose profile
    # carries the window id, a 10-element vector and the requested GPU count.
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(Path(tmp))
        argv = [
            "profile",
            "window",
            "--spec",
            str(spec_path),
            "--gpu-count",
            "8",
        ]
        captured = io.StringIO()
        with mock.patch("sys.stdout", captured):
            rc = cli_main(argv)
        self.assertEqual(rc, 0)

        payload = json.loads(captured.getvalue())
        self.assertEqual(payload["profile"]["window_id"], "chat_w1")
        self.assertEqual(len(payload["profile"]["vector"]), 10)
        self.assertEqual(payload["profile"]["gpu_count"], 8)
def test_cli_profile_window_does_not_resolve_llm_endpoint(self) -> None:
    # Adding an LLM endpoint section to the spec must not stop
    # `profile window` from succeeding.
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(Path(tmp))
        spec = json.loads(spec_path.read_text(encoding="utf-8"))
        spec["llm"]["endpoint"] = {
            "provider": "codex",
            "model": "gpt-5.4",
        }
        spec_path.write_text(json.dumps(spec), encoding="utf-8")

        captured = io.StringIO()
        with mock.patch("sys.stdout", captured):
            rc = cli_main(["profile", "window", "--spec", str(spec_path)])
        self.assertEqual(rc, 0)
        printed = json.loads(captured.getvalue())
        self.assertEqual(printed["profile"]["window_id"], "chat_w1")
def test_harness_uses_latency_failures_before_generic_unrecoverable(self) -> None:
    # The probe stopped early as "unrecoverable", but its failure counts
    # include TTFT breaches; the diagnostic must name the TTFT/prefill
    # bottleneck.
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        spec_path = _write_study_assets(root)
        study = load_study_spec(spec_path)

        latency_summary = {
            "failed_reason_counts": {
                "ttft_ms>5000.0": 70,
                "tpot_ms>50.0": 5,
                "probe_elapsed_s>240.0": 100,
            },
            "ttft_ms": {"p95": 6500.0, "p99": 7200.0},
        }
        probe = {
            "threshold": 0.25,
            "feasible": False,
            "payload": {
                "request_count": 100,
                "pass_rate": 0.3,
                "request_rate": 1.0,
                "early_stopped": True,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": latency_summary,
            },
        }
        result_path = root / "trial-result.json"
        result_path.write_text(
            json.dumps({"status": "completed", "probes": [probe]}),
            encoding="utf-8",
        )

        trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            result_path=str(result_path),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(study_id=study.study_id, trials=[trial])
        context = build_harness_context(
            study=study,
            window_summary={
                "prompt_tokens_p95": 5000,
                "prompt_tail_ratio_p95_p50": 3.0,
            },
            state=state,
        )
        latest = context["recent_trial_diagnostics"][0]
        self.assertEqual(latest["active_bottleneck"], "ttft_prefill")
def test_harness_blocks_repeating_infeasible_plateau_family(self) -> None:
    # Two consecutive infeasible trials that only change data-parallel-size
    # and land on the same pass rate should flag a plateau for that family
    # without triggering a deterministic stop.
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        spec_path = _write_study_assets(
            root,
            engine_overrides={
                "tunable_flags": [
                    "tensor-parallel-size",
                    "data-parallel-size",
                    "expert-parallel-size",
                ],
                "topology_constraints": {
                    "allowed_tensor_parallel_sizes": [1, 2, 4],
                    "allowed_data_parallel_sizes": [1, 2, 4, 8],
                    "allowed_expert_parallel_sizes": [1],
                    "allowed_tp_dp_products": [1, 2, 4, 8],
                },
            },
        )
        study = load_study_spec(spec_path)

        # (data-parallel-size, pass_rate, ttft p95); trials are numbered
        # from 3.
        plateau_runs = [(4, 0.345, 3818.4), (8, 0.345, 3823.4)]
        trials = []
        for number, (dp, pass_rate, p95) in enumerate(plateau_runs, start=3):
            trial_id = f"trial-{number:04d}"
            diagnostics = {
                "threshold": 0.0078125,
                "request_count": 148,
                "request_rate": 0.22,
                "pass_rate": pass_rate,
                "early_stopped": True,
                "early_stop_reason": "elapsed",
                "latency_summary": {
                    "failed_reason_counts": {"ttft_ms>5000.0": 97},
                    "ttft_ms": {"p95": p95, "p99": 5800.0},
                },
            }
            result_path = root / f"{trial_id}.json"
            result_path.write_text(
                json.dumps(
                    {
                        "status": "completed",
                        "best_request_rate": None,
                        "all_infeasible_diagnostics": diagnostics,
                    }
                ),
                encoding="utf-8",
            )
            trials.append(
                TrialSummary(
                    trial_id=trial_id,
                    status="completed",
                    result_path=str(result_path),
                    config_patch={
                        "env_patch": {},
                        "flag_patch": {
                            "tensor-parallel-size": 1,
                            "data-parallel-size": dp,
                            "expert-parallel-size": 1,
                        },
                    },
                )
            )

        context = build_harness_context(
            study=study,
            window_summary={
                "prompt_tokens_p95": 7628,
                "prompt_tail_ratio_p95_p50": 3.83,
            },
            state=StudyState(study_id=study.study_id, trials=trials),
        )
        convergence = context["convergence_guard"]
        guard = convergence["infeasible_progress"]
        self.assertTrue(guard["plateau_detected"])
        self.assertTrue(guard["stop_if_next_probe_repeats_family"])
        self.assertEqual(guard["blocked_primary_family"], "data-parallel-size")
        self.assertTrue(
            convergence["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]
        )
        self.assertFalse(convergence["deterministic_stop"])
        self.assertFalse(context["harness_stop"]["should_stop"])
        self.assertIsNone(build_harness_stop_proposal(context))
def test_harness_strong_incumbent_guard_after_large_gain(self) -> None:
    # The incumbent's per-GPU rate (0.21) is six times the baseline's
    # (0.035); the guard should activate and ask for validation probes
    # rather than a stop.
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(Path(tmp))
        study = load_study_spec(spec_path)

        baseline_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=1,
            best_request_rate=0.035,
            best_request_rate_per_gpu=0.035,
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        incumbent_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=2,
            best_request_rate=0.42,
            best_request_rate_per_gpu=0.21,
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "data-parallel-size": 1,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate_per_gpu=0.21,
            trials=[baseline_trial, incumbent_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={
                "prompt_tokens_p95": 7628,
                "prompt_tokens_p99": 8102,
                "prompt_tail_ratio_p95_p50": 3.83,
            },
            state=state,
        )

        convergence = context["convergence_guard"]
        guard = convergence["strong_incumbent"]
        self.assertTrue(guard["guard_active"])
        self.assertGreaterEqual(guard["incumbent_gain_vs_baseline"], 3.0)
        self.assertFalse(
            convergence["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]
        )
        self.assertEqual(
            convergence["reason"],
            "strong_incumbent_requires_validation_probes",
        )
        self.assertIn("validate", guard["recommended_next_action"])
def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None:
    # Baseline, a 3x-better incumbent, then two follow-up trials without any
    # feasible rate: the harness should stop and produce a stop proposal.
    with tempfile.TemporaryDirectory() as tmp:
        spec_path = _write_study_assets(Path(tmp))
        study = load_study_spec(spec_path)

        baseline_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=0.8,
            best_request_rate_per_gpu=0.1,
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        incumbent_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=8,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "data-parallel-size": 4,
                },
            },
        )
        topology_probe = TrialSummary(
            trial_id="trial-0003",
            status="completed",
            parallel_size=8,
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 1,
                    "data-parallel-size": 8,
                },
            },
        )
        runtime_probe = TrialSummary(
            trial_id="trial-0004",
            status="completed",
            parallel_size=8,
            config_patch={
                "env_patch": {},
                "flag_patch": {"max-num-seqs": 160},
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_sampling_u=0.02,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=[baseline_trial, incumbent_trial, topology_probe, runtime_probe],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )

        stop = context["harness_stop"]
        self.assertTrue(stop["should_stop"])
        self.assertEqual(stop["reason"], "post_incumbent_validation_exhausted")
        proposal = build_harness_stop_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertTrue(proposal.should_stop)
def test_harness_stop_after_non_improving_feasible_validation_is_exhausted(self) -> None:
    """Expect a harness stop when measured validation probes fail to beat the incumbent.

    trial-0003 (2.0 req/s) and trial-0004 (2.1 req/s) both completed with a
    measured rate below the trial-0002 incumbent (2.4 req/s); the harness
    must report ``post_incumbent_validation_exhausted``.
    """
    # (trial_id, best_request_rate, best_request_rate_per_gpu, flag_patch)
    history = [
        ("trial-0001", 0.8, 0.1, {}),
        (
            "trial-0002",
            2.4,
            0.3,
            {"tensor-parallel-size": 2, "data-parallel-size": 4},
        ),
        (
            "trial-0003",
            2.0,
            0.25,
            {"tensor-parallel-size": 1, "data-parallel-size": 8},
        ),
        ("trial-0004", 2.1, 0.2625, {"max-num-seqs": 160}),
    ]
    with tempfile.TemporaryDirectory() as workdir:
        study = load_study_spec(_write_study_assets(Path(workdir)))
        trials = [
            TrialSummary(
                trial_id=trial_id,
                status="completed",
                parallel_size=8,
                best_request_rate=rate,
                best_request_rate_per_gpu=rate_per_gpu,
                config_patch={"env_patch": {}, "flag_patch": flag_patch},
            )
            for trial_id, rate, rate_per_gpu, flag_patch in history
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_sampling_u=0.02,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=trials,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        stop_decision = context["harness_stop"]
        self.assertTrue(stop_decision["should_stop"])
        self.assertEqual(
            stop_decision["reason"], "post_incumbent_validation_exhausted"
        )
def test_harness_stop_after_gmu_incumbent_and_non_improving_topology_validation(self) -> None:
    """Expect a stop when a TP2 + gpu-memory-utilization incumbent survives topology probes.

    The incumbent (trial-0007) has the highest per-GPU rate in the history;
    the later topology probes (trial-0008, trial-0009) record lower per-GPU
    rates. The harness must report ``post_incumbent_validation_exhausted``
    and produce a stop proposal.
    """
    # (trial_id, best_request_rate, best_request_rate_per_gpu, flag_patch)
    history = [
        ("trial-0001", 2.2, 2.2, {}),
        ("trial-0002", 6.5167, 3.2583, {"tensor-parallel-size": 2}),
        ("trial-0003", 8.3667, 2.0917, {"tensor-parallel-size": 4}),
        (
            "trial-0007",
            6.8667,
            3.4333,
            {"tensor-parallel-size": 2, "gpu-memory-utilization": 0.97},
        ),
        (
            "trial-0008",
            4.1833,
            1.0458,
            {"tensor-parallel-size": 4, "data-parallel-size": 2},
        ),
        ("trial-0009", 8.3667, 1.0458, {"tensor-parallel-size": 8}),
    ]
    engine_overrides = {
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "gpu-memory-utilization",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2],
            "allowed_tp_dp_products": [1, 2, 4, 8],
        },
    }
    with tempfile.TemporaryDirectory() as workdir:
        study_path = _write_study_assets(
            Path(workdir),
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(study_path)
        trials = [
            TrialSummary(
                trial_id=trial_id,
                status="completed",
                best_request_rate=rate,
                best_request_rate_per_gpu=rate_per_gpu,
                config_patch={"env_patch": {}, "flag_patch": flag_patch},
            )
            for trial_id, rate, rate_per_gpu, flag_patch in history
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0007",
            best_request_rate=6.8667,
            best_request_rate_per_gpu=3.4333,
            trials=trials,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 1500},
            state=state,
        )
        stop_decision = context["harness_stop"]
        self.assertTrue(stop_decision["should_stop"])
        self.assertEqual(
            stop_decision["reason"],
            "post_incumbent_validation_exhausted",
        )
        proposal = build_harness_stop_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertTrue(proposal.should_stop)
def test_harness_validation_uses_full_state_baseline_when_history_window_moves(self) -> None:
    """Expect the reported incumbent gain to stay near 3x across a twelve-trial history.

    The incumbent (trial-0006, 2.4 req/s) is three times the first trial
    (0.8 req/s). Six later trials carry no measured rate. The stop evidence
    must still report ``incumbent_gain_vs_baseline`` above 2.9 -- presumably
    because the gain is computed against trial-0001 rather than a more
    recent trial; confirm against the harness implementation.
    """
    # (trial_id, max-num-seqs, best_request_rate, best_request_rate_per_gpu)
    measured = [
        ("trial-0002", 16, 0.88, 0.11),
        ("trial-0003", 24, 0.96, 0.12),
        ("trial-0004", 32, 1.04, 0.13),
        ("trial-0005", 40, 2.24, 0.28),
        ("trial-0006", 48, 2.4, 0.3),
    ]
    # (trial_id, status, max-num-seqs) -- no request-rate fields recorded.
    unmeasured = [
        ("trial-0007", "completed", 56),
        ("trial-0008", "completed", 64),
        ("trial-0009", "completed", 72),
        ("trial-0010", "completed", 80),
        ("trial-0011", "failed", 88),
        ("trial-0012", "completed", 96),
    ]
    with tempfile.TemporaryDirectory() as workdir:
        study_path = _write_study_assets(
            Path(workdir),
            engine_overrides={"tunable_flags": ["max-num-seqs"]},
        )
        study = load_study_spec(study_path)
        trials = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=0.8,
                best_request_rate_per_gpu=0.1,
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
        ]
        trials.extend(
            TrialSummary(
                trial_id=trial_id,
                status="completed",
                parallel_size=8,
                best_request_rate=rate,
                best_request_rate_per_gpu=rate_per_gpu,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"max-num-seqs": max_num_seqs},
                },
            )
            for trial_id, max_num_seqs, rate, rate_per_gpu in measured
        )
        trials.extend(
            TrialSummary(
                trial_id=trial_id,
                status=status,
                parallel_size=8,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"max-num-seqs": max_num_seqs},
                },
            )
            for trial_id, status, max_num_seqs in unmeasured
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0006",
            best_parallel_size=8,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=trials,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        stop_decision = context["harness_stop"]
        self.assertTrue(stop_decision["should_stop"])
        self.assertEqual(
            stop_decision["reason"], "post_incumbent_validation_exhausted"
        )
        self.assertGreater(
            stop_decision["evidence"]["incumbent_gain_vs_baseline"],
            2.9,
        )
def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
    """Expect no stop when a strong incumbent has not been followed by any probe.

    The history holds only the baseline and the 3x incumbent, so
    ``harness_stop.should_stop`` must be false and no stop proposal built.
    """
    with tempfile.TemporaryDirectory() as workdir:
        study = load_study_spec(_write_study_assets(Path(workdir)))
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=0.8,
            best_request_rate_per_gpu=0.1,
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        incumbent = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=8,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "data-parallel-size": 4,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_request_rate=2.4,
            best_request_rate_per_gpu=0.3,
            trials=[baseline, incumbent],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        self.assertFalse(context["harness_stop"]["should_stop"])
        stop_proposal = build_harness_stop_proposal(context)
        self.assertIsNone(stop_proposal)
def test_harness_stop_when_incumbent_saturates_search_high(self) -> None:
    """Check the stop decision when the only trial is feasible at the top threshold.

    Note: despite the test name, the asserted outcome is that the harness
    does NOT stop. It must report
    ``search_high_saturation_requires_parallel_size_evidence`` with the
    ``request_rate_per_gpu`` objective, and no stop proposal is built.
    """
    saturated_result = {
        "status": "completed",
        "best_sampling_u": 0.99609375,
        "best_request_rate": 9.0,
        "best_pass_rate": 1.0,
        "probes": [
            {
                "threshold": 0.99609375,
                "feasible": True,
                "payload": {
                    "request_count": 10,
                    "pass_rate": 1.0,
                    "request_rate": 9.0,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
        ],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study = load_study_spec(_write_study_assets(root))
        result_path = root / "trial-0001.json"
        result_path.write_text(json.dumps(saturated_result), encoding="utf-8")
        only_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=9.0,
            best_request_rate_per_gpu=9.0,
            result_path=str(result_path),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_request_rate=9.0,
            best_request_rate_per_gpu=9.0,
            trials=[only_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        stop_decision = context["harness_stop"]
        self.assertFalse(stop_decision["should_stop"])
        self.assertEqual(
            stop_decision["reason"],
            "search_high_saturation_requires_parallel_size_evidence",
        )
        self.assertEqual(
            stop_decision["evidence"]["objective"],
            "request_rate_per_gpu",
        )
        self.assertIsNone(build_harness_stop_proposal(context))
def test_harness_stop_allows_feasible_high_probe_with_some_failures(self) -> None:
    """Check that a feasible top-threshold probe with a few SLO failures still counts.

    The top probe is feasible at pass rate 0.968 with 34 TPOT failures.
    Note: the asserted outcome is that the harness does NOT stop and
    reports ``search_high_saturation_requires_parallel_size_evidence``.
    """
    high_probe_result = {
        "status": "completed",
        "best_sampling_u": 0.99609375,
        "best_request_rate": 1.77,
        "best_pass_rate": 0.968,
        "probes": [
            {
                "threshold": 0.99609375,
                "feasible": True,
                "payload": {
                    "request_count": 1063,
                    "pass_rate": 0.968,
                    "request_rate": 1.77,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {
                        "failed_reason_counts": {
                            "tpot_ms>50.0": 34,
                        }
                    },
                },
            }
        ],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study = load_study_spec(_write_study_assets(root))
        result_path = root / "trial-0004.json"
        result_path.write_text(json.dumps(high_probe_result), encoding="utf-8")
        only_trial = TrialSummary(
            trial_id="trial-0004",
            status="completed",
            best_request_rate=1.77,
            best_request_rate_per_gpu=0.4425,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {"tensor-parallel-size": 4},
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0004",
            best_request_rate=1.77,
            best_request_rate_per_gpu=0.4425,
            trials=[only_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        stop_decision = context["harness_stop"]
        self.assertFalse(stop_decision["should_stop"])
        self.assertEqual(
            stop_decision["reason"],
            "search_high_saturation_requires_parallel_size_evidence",
        )
def test_harness_stop_blocks_high_saturation_for_fixed_product_tp_dp_redistribution(
    self,
) -> None:
    """Expect no stop on top-threshold saturation when TP x DP is pinned to 8.

    The study only allows TP/DP splits whose product is 8. With a single
    saturated TP8 x DP1 trial, the harness must not stop and must report
    ``search_high_saturation_requires_parallel_size_evidence``.
    """
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 8,
            "data-parallel-size": 1,
        },
        "tunable_flags": ["tensor-parallel-size", "data-parallel-size"],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_tp_dp_products": [8],
            "require_tp_dp_product_equals_gpu_count": True,
        },
    }
    saturated_result = {
        "status": "completed",
        "best_sampling_u": 0.99609375,
        "best_request_rate": 8.0,
        "best_pass_rate": 1.0,
        "probes": [
            {
                "threshold": 0.99609375,
                "feasible": True,
                "payload": {
                    "request_count": 10,
                    "pass_rate": 1.0,
                    "request_rate": 8.0,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
        ],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root, engine_overrides=engine_overrides)
        study = load_study_spec(study_path)
        result_path = root / "trial-0001.json"
        result_path.write_text(json.dumps(saturated_result), encoding="utf-8")
        only_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=8.0,
            best_request_rate_per_gpu=1.0,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 8,
                    "data-parallel-size": 1,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_request_rate=8.0,
            best_request_rate_per_gpu=1.0,
            trials=[only_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        stop_decision = context["harness_stop"]
        self.assertFalse(stop_decision["should_stop"])
        self.assertEqual(
            stop_decision["reason"],
            "search_high_saturation_requires_parallel_size_evidence",
        )
def test_harness_does_not_repropose_noop_topology_equivalent_to_baseline(
    self,
) -> None:
    """Expect candidates that reproduce the baseline effective config to be blocked.

    The base flags already set TP8, so a ``tensor-parallel-size: 8`` patch is
    a no-op relative to the empty-patch baseline trial. The candidate set
    must list at least one blocked candidate whose effective-config
    fingerprint equals the baseline's, and neither the candidate actions
    nor the guided proposal may carry that patch.
    """
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 8,
            "data-parallel-size": 1,
            "gpu-memory-utilization": 0.5,
            "max-num-seqs": 8,
        },
        "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_tp_dp_products": [1, 2, 4, 8],
        },
    }
    baseline_result = {
        "status": "completed",
        "best_sampling_u": 0.935616858887,
        "best_request_rate": 8.0,
        "best_pass_rate": 1.0,
        "probes": [
            {
                "threshold": 0.935616858887,
                "feasible": True,
                "payload": {
                    "request_count": 480,
                    "pass_rate": 1.0,
                    "request_rate": 8.0,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
        ],
    }
    incumbent_result = {
        "status": "completed",
        "best_sampling_u": 0.810867944369,
        "best_request_rate": 6.95,
        "best_pass_rate": 0.9784,
        "probes": [
            {
                "threshold": 0.873242401628,
                "feasible": False,
                "payload": {
                    "request_count": 450,
                    "pass_rate": 0.7844,
                    "request_rate": 7.5,
                    "early_stopped": True,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": {
                        "failed_reason_counts": {
                            "ttft_ms>2000.0": 42,
                            "slo_pass_rate_unrecoverable": 49,
                        }
                    },
                },
            },
            {
                "threshold": 0.810867944369,
                "feasible": True,
                "payload": {
                    "request_count": 417,
                    "pass_rate": 0.9784,
                    "request_rate": 6.95,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {
                        "failed_reason_counts": {"ttft_ms>2000.0": 9}
                    },
                },
            },
        ],
    }
    blocked_reason = "blocked_noop_or_repeat_effective_full_config"
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root, engine_overrides=engine_overrides)
        study = load_study_spec(study_path)
        trial1_result = root / "trial-0001.json"
        trial1_result.write_text(json.dumps(baseline_result), encoding="utf-8")
        trial2_result = root / "trial-0002.json"
        trial2_result.write_text(json.dumps(incumbent_result), encoding="utf-8")
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=8.0,
            best_request_rate_per_gpu=1.0,
            result_path=str(trial1_result),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        incumbent = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=4,
            best_request_rate=6.95,
            best_request_rate_per_gpu=1.7375,
            result_path=str(trial2_result),
            config_patch={
                "env_patch": {},
                "flag_patch": {"tensor-parallel-size": 4},
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=4,
            best_sampling_u=0.810867944369,
            best_request_rate=6.95,
            best_request_rate_per_gpu=1.7375,
            next_trial_index=3,
            trials=[baseline, incumbent],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        candidate_set = context["experiment_plan"]["candidate_set"]
        self.assertEqual(candidate_set["version"], "candidate-set-v1")
        self.assertIn("candidate_set_hash", candidate_set)
        blocked_count = candidate_set["blocked_reason_summary"].get(
            blocked_reason,
            0,
        )
        self.assertGreaterEqual(blocked_count, 1)

        # Fingerprint of the unpatched (baseline) effective configuration.
        baseline_signature = _effective_config_signature(
            study,
            {"env_patch": {}, "flag_patch": {}},
        )
        baseline_fingerprint = hashlib.sha256(
            baseline_signature.encode("utf-8")
        ).hexdigest()
        blocked_baseline_equivalent = []
        for item in candidate_set["blocked_candidates"]:
            if item.get("effective_config_fingerprint") == baseline_fingerprint:
                blocked_baseline_equivalent.append(item)
        self.assertTrue(blocked_baseline_equivalent)
        first_blocked = blocked_baseline_equivalent[0]
        self.assertEqual(first_blocked["blocked_reason"], blocked_reason)
        self.assertIn("effective_config_fingerprint", first_blocked)

        actions = context["experiment_plan"]["candidate_actions"]
        self.assertFalse(
            any(
                action.get("config_patch", {}).get("flag_patch")
                == {"tensor-parallel-size": 8}
                for action in actions
            )
        )
        proposal = build_harness_guided_proposal(context)
        self.assertTrue(
            proposal is None
            or proposal.config_patch.flag_patch != {"tensor-parallel-size": 8}
        )
def test_harness_guided_first_tp_probe_for_latency_bottleneck(self) -> None:
    """Expect the first guided proposal to be TP2 when the baseline fails on TPOT.

    The lone baseline trial has an infeasible probe whose failures are all
    ``tpot_ms>50.0``. The guided proposal must be exactly
    ``{"tensor-parallel-size": 2}`` and must not request a stop.
    """
    engine_overrides = {
        "tunable_flags": ["tensor-parallel-size", "data-parallel-size"],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_data_parallel_sizes": [1, 2],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    baseline_result = {
        "status": "completed",
        "best_sampling_u": 0.25,
        "best_request_rate": 2.0,
        "best_pass_rate": 1.0,
        "probes": [
            {
                "threshold": 0.5,
                "feasible": False,
                "payload": {
                    "request_count": 100,
                    "pass_rate": 0.6,
                    "request_rate": 4.0,
                    "early_stopped": True,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": {
                        "failed_reason_counts": {"tpot_ms>50.0": 40},
                    },
                },
            }
        ],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root, engine_overrides=engine_overrides)
        study = load_study_spec(study_path)
        result_path = root / "trial-0001.json"
        result_path.write_text(json.dumps(baseline_result), encoding="utf-8")
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=2.0,
            best_request_rate_per_gpu=2.0,
            result_path=str(result_path),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_request_rate=2.0,
            best_request_rate_per_gpu=2.0,
            trials=[baseline],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048},
            state=state,
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        expected_flags = {"tensor-parallel-size": 2}
        self.assertEqual(proposal.config_patch.flag_patch, expected_flags)
        self.assertFalse(proposal.should_stop)
def test_harness_guided_runtime_seed_preserves_tp_incumbent(self) -> None:
    """Expect the first runtime proposal to keep the TP2 incumbent's topology.

    With a TP2 incumbent and a long-prompt window (p99 of 8100 tokens), the
    guided proposal must carry ``tensor-parallel-size: 2`` together with
    chunked prefill enabled and ``max-num-batched-tokens`` of 16384.
    """
    engine_overrides = {
        "tunable_flags": [
            "tensor-parallel-size",
            "gpu-memory-utilization",
            "enable-chunked-prefill",
            "max-num-batched-tokens",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    incumbent_result = {
        "status": "completed",
        "best_sampling_u": 0.75,
        "best_request_rate": 6.0,
        "best_pass_rate": 1.0,
        "probes": [
            {
                "threshold": 0.75,
                "feasible": True,
                "payload": {
                    "request_count": 100,
                    "pass_rate": 1.0,
                    "request_rate": 6.0,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
        ],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root, engine_overrides=engine_overrides)
        study = load_study_spec(study_path)
        result_path = root / "trial-0002.json"
        result_path.write_text(json.dumps(incumbent_result), encoding="utf-8")
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=2.0,
            best_request_rate_per_gpu=2.0,
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        incumbent = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            best_request_rate=6.0,
            best_request_rate_per_gpu=3.0,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {"tensor-parallel-size": 2},
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=6.0,
            best_request_rate_per_gpu=3.0,
            trials=[baseline, incumbent],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p99": 8100},
            state=state,
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        expected_flags = {
            "tensor-parallel-size": 2,
            "enable-chunked-prefill": True,
            "max-num-batched-tokens": 16384,
        }
        self.assertEqual(proposal.config_patch.flag_patch, expected_flags)
def test_harness_runtime_refinement_preserves_incumbent_runtime_knobs(self) -> None:
    """Expect a runtime refinement to keep every knob the incumbent already set.

    The incumbent patch sets TP4, ``gpu-memory-utilization`` 0.92 and
    ``max-num-seqs`` 48. The guided proposal must retain all three and add
    chunked prefill with ``max-num-batched-tokens`` of 16384.
    """
    engine_overrides = {
        "tunable_flags": [
            "tensor-parallel-size",
            "gpu-memory-utilization",
            "max-num-seqs",
            "enable-chunked-prefill",
            "max-num-batched-tokens",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    incumbent_result = {
        "status": "completed",
        "best_sampling_u": 0.098,
        "best_request_rate": 3.3,
        "best_pass_rate": 0.97,
        "probes": [
            {
                "threshold": 0.098,
                "feasible": True,
                "payload": {
                    "request_count": 100,
                    "pass_rate": 0.97,
                    "request_rate": 3.3,
                    "early_stopped": False,
                    "early_stop_reason": "",
                    "latency_summary": {"failed_reason_counts": {}},
                },
            }
        ],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root, engine_overrides=engine_overrides)
        study = load_study_spec(study_path)
        result_path = root / "trial-0002.json"
        result_path.write_text(json.dumps(incumbent_result), encoding="utf-8")
        first_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=2.5,
            best_request_rate_per_gpu=0.625,
            config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
        )
        incumbent = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            best_request_rate=3.3,
            best_request_rate_per_gpu=0.825,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 4,
                    "gpu-memory-utilization": 0.92,
                    "max-num-seqs": 48,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=3.3,
            best_request_rate_per_gpu=0.825,
            trials=[first_trial, incumbent],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p99": 8100},
            state=state,
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        expected_flags = {
            "tensor-parallel-size": 4,
            "gpu-memory-utilization": 0.92,
            "max-num-seqs": 48,
            "enable-chunked-prefill": True,
            "max-num-batched-tokens": 16384,
        }
        self.assertEqual(proposal.config_patch.flag_patch, expected_flags)
def test_harness_raises_gpu_mem_util_on_settled_decode_bound_incumbent(self) -> None:
    """Regression: a settled, TPOT-bound TP incumbent must get a gpu-memory-utilization raise.

    Covers the gap that let the naive baseline beat the harness: before the
    harness may stop, it has to try more KV-cache headroom on an incumbent
    whose infeasible probe fails on decode TPOT.
    """
    slo_overrides = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine_overrides = {
        "tunable_flags": [
            "tensor-parallel-size",
            "gpu-memory-utilization",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    feasible_probe = {
        "threshold": 0.074,
        "feasible": True,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.97,
            "request_rate": 2.6,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    tpot_bound_probe = {
        "threshold": 0.09,
        "feasible": False,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.6,
            "request_rate": 3.2,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {
                "failed_reason_counts": {"tpot_ms>50.0": 90}
            },
        },
    }
    incumbent_result = {
        "status": "completed",
        "best_sampling_u": 0.074,
        "best_request_rate": 2.6,
        "best_pass_rate": 0.97,
        "probes": [feasible_probe, tpot_bound_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(
            root,
            slo_overrides=slo_overrides,
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(study_path)
        result_path = root / "trial-0002.json"
        result_path.write_text(json.dumps(incumbent_result), encoding="utf-8")
        earlier_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=1.1,
            best_request_rate_per_gpu=0.275,
            config_patch={"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
        )
        incumbent = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            best_request_rate=2.6,
            best_request_rate_per_gpu=0.65,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 4,
                    "gpu-memory-utilization": 0.9,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=2.6,
            best_request_rate_per_gpu=0.65,
            trials=[earlier_trial, incumbent],
        )
        context = build_harness_context(
            study=study, window_summary={"prompt_tokens_p95": 1500}, state=state
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertFalse(proposal.should_stop)
        proposed_flags = proposal.config_patch.flag_patch
        # TP4 is kept; gpu-memory-utilization moves up one step (0.9 -> 0.92).
        self.assertEqual(proposed_flags.get("tensor-parallel-size"), 4)
        self.assertEqual(proposed_flags.get("gpu-memory-utilization"), 0.92)
        # No stop may be authorized while that knob is still untried.
        self.assertIsNone(build_harness_stop_proposal(context))
def test_harness_climbs_tp_before_gpu_mem_util_micro_tuning(self) -> None:
    """An untried TP increase takes priority over gpu-memory-utilization tuning.

    At a TP2 incumbent with TP4 still allowed, the guided proposal must move
    to TP4 and must not touch ``gpu-memory-utilization``.
    """
    slo_overrides = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine_overrides = {
        "tunable_flags": ["tensor-parallel-size", "gpu-memory-utilization"],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    feasible_probe = {
        "threshold": 0.03,
        "feasible": True,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.97,
            "request_rate": 1.1,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    tpot_bound_probe = {
        "threshold": 0.05,
        "feasible": False,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.6,
            "request_rate": 1.6,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {
                "failed_reason_counts": {"tpot_ms>50.0": 90}
            },
        },
    }
    incumbent_result = {
        "status": "completed",
        "best_sampling_u": 0.03,
        "best_request_rate": 1.1,
        "best_pass_rate": 0.97,
        "probes": [feasible_probe, tpot_bound_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(
            root,
            slo_overrides=slo_overrides,
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(study_path)
        result_path = root / "trial-0002.json"
        result_path.write_text(json.dumps(incumbent_result), encoding="utf-8")
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=0.6,
            best_request_rate_per_gpu=0.6,
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        incumbent = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            best_request_rate=1.1,
            best_request_rate_per_gpu=0.55,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "gpu-memory-utilization": 0.9,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=1.1,
            best_request_rate_per_gpu=0.55,
            trials=[baseline, incumbent],
        )
        context = build_harness_context(
            study=study, window_summary={"prompt_tokens_p95": 1500}, state=state
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        proposed_flags = proposal.config_patch.flag_patch
        # The proposal has to climb to TP4 and leave gpu-memory-utilization alone.
        self.assertEqual(proposed_flags.get("tensor-parallel-size"), 4)
        self.assertNotIn("gpu-memory-utilization", proposed_flags)
def test_harness_brackets_down_from_bad_high_tp_start_before_runtime_tuning(self) -> None:
    """A run starting at the maximum TP probes the next lower TP before runtime knobs.

    Starting from TP8 in a no-LLM run, the guided proposal must validate the
    adjacent lower topology (TP4) and must not spend the trial on
    ``gpu-memory-utilization`` or ``max-num-seqs``.
    """
    slo_overrides = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 8,
            "data-parallel-size": 1,
            "gpu-memory-utilization": 0.5,
            "max-num-seqs": 8,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "gpu-memory-utilization",
            "max-num-seqs",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [1, 2, 4, 8],
        },
    }
    feasible_probe = {
        "threshold": 0.05,
        "feasible": True,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.96,
            "request_rate": 8.0,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    ttft_bound_probe = {
        "threshold": 0.08,
        "feasible": False,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.5,
            "request_rate": 10.0,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {
                "failed_reason_counts": {"ttft_ms>4000.0": 120}
            },
        },
    }
    baseline_result = {
        "status": "completed",
        "best_sampling_u": 0.05,
        "best_request_rate": 8.0,
        "best_pass_rate": 0.96,
        "probes": [feasible_probe, ttft_bound_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(
            root,
            slo_overrides=slo_overrides,
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(study_path)
        result_path = root / "trial-0001.json"
        result_path.write_text(json.dumps(baseline_result), encoding="utf-8")
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=8.0,
            best_request_rate_per_gpu=1.0,
            result_path=str(result_path),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_request_rate=8.0,
            best_request_rate_per_gpu=1.0,
            trials=[baseline],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 6500},
            state=state,
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        proposed_flags = proposal.config_patch.flag_patch
        self.assertEqual(proposed_flags.get("tensor-parallel-size"), 4)
        self.assertNotIn("gpu-memory-utilization", proposed_flags)
        self.assertNotIn("max-num-seqs", proposed_flags)
def test_harness_jumps_low_gpu_mem_util_to_nominal_floor_after_topology_settles(self) -> None:
    """A pathological gmu=0.5 start jumps straight to the normal operating floor.

    Once the topology is bracketed (TP4 was tried and did not beat TP2 per
    GPU), the guided proposal must set ``gpu-memory-utilization`` to 0.9 in
    one move instead of spending many 0.02 hill-climb trials, and must leave
    ``tensor-parallel-size`` out of the patch.
    """
    slo_overrides = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 2,
            "data-parallel-size": 1,
            "gpu-memory-utilization": 0.5,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "gpu-memory-utilization",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    feasible_probe = {
        "threshold": 0.07,
        "feasible": True,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.97,
            "request_rate": 2.4,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    tpot_bound_probe = {
        "threshold": 0.1,
        "feasible": False,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.55,
            "request_rate": 3.1,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {
                "failed_reason_counts": {"tpot_ms>50.0": 90}
            },
        },
    }
    baseline_result = {
        "status": "completed",
        "best_sampling_u": 0.07,
        "best_request_rate": 2.4,
        "best_pass_rate": 0.97,
        "probes": [feasible_probe, tpot_bound_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(
            root,
            slo_overrides=slo_overrides,
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(study_path)
        result_path = root / "trial-0001.json"
        result_path.write_text(json.dumps(baseline_result), encoding="utf-8")
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=2.4,
            best_request_rate_per_gpu=1.2,
            result_path=str(result_path),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        tp4_probe = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            best_request_rate=2.2,
            best_request_rate_per_gpu=0.55,
            config_patch={
                "env_patch": {},
                "flag_patch": {"tensor-parallel-size": 4},
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_request_rate=2.4,
            best_request_rate_per_gpu=1.2,
            trials=[baseline, tp4_probe],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 1500},
            state=state,
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        proposed_flags = proposal.config_patch.flag_patch
        self.assertEqual(proposed_flags.get("gpu-memory-utilization"), 0.9)
        self.assertNotIn("tensor-parallel-size", proposed_flags)
def test_descriptor_candidates_expose_bad_runtime_recovery_without_preempting_topology(
    self,
) -> None:
    """Descriptor recovery candidates are listed while topology stays the next action.

    One completed trial (base flags, ``parallel_size=2``) whose infeasible probe
    failed on TTFT is handed to the harness. The plan's next action must be in
    the ``topology`` knob family, and the ``descriptor:*`` candidates must
    contain a patch with ``max-num-seqs == 24`` and a patch with
    ``gpu-memory-utilization == 0.9``.
    """
    slo_overrides = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 2,
            "data-parallel-size": 1,
            "gpu-memory-utilization": 0.5,
            "max-num-seqs": 8,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "gpu-memory-utilization",
            "max-num-seqs",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [2, 4, 8],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [2, 4, 8],
        },
    }
    passing_probe = {
        "threshold": 0.05,
        "feasible": True,
        "payload": {
            "request_rate": 3.4667,
            "pass_rate": 0.9663,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    failing_probe = {
        "threshold": 0.08,
        "feasible": False,
        "payload": {
            "request_rate": 4.0,
            "pass_rate": 0.5,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"ttft_ms>4000.0": 120}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_sampling_u": 0.05,
        "best_request_rate": 3.4667,
        "best_pass_rate": 0.9663,
        "probes": [passing_probe, failing_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study = load_study_spec(
            _write_study_assets(
                root,
                slo_overrides=slo_overrides,
                engine_overrides=engine_overrides,
            )
        )
        recorded = root / "trial-0001.json"
        recorded.write_text(json.dumps(trial_result), encoding="utf-8")
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=2,
            best_request_rate=3.4667,
            best_request_rate_per_gpu=1.73335,
            result_path=str(recorded),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_request_rate=3.4667,
            best_request_rate_per_gpu=1.73335,
            trials=[baseline],
        )
        window = {
            "prompt_tokens_p95": 6500,
            "prompt_tail_ratio_p95_p50": 3.0,
        }
        context = build_harness_context(study=study, window_summary=window, state=state)
        chosen = context["experiment_plan"]["next_action"]
        self.assertEqual(chosen["knob_family"], "topology")
        # Collect the flag patches of every descriptor-family candidate.
        descriptor_patches = []
        for candidate in context["experiment_plan"]["candidate_actions"]:
            if str(candidate["knob_family"]).startswith("descriptor:"):
                descriptor_patches.append(candidate["config_patch"]["flag_patch"])
        raises_seq_cap = any(
            flags.get("max-num-seqs") == 24 for flags in descriptor_patches
        )
        self.assertTrue(raises_seq_cap)
        raises_gpu_memory = any(
            flags.get("gpu-memory-utilization") == 0.9 for flags in descriptor_patches
        )
        self.assertTrue(raises_gpu_memory)
def test_harness_stops_gpu_mem_util_climb_after_tied_same_topology_probe(self) -> None:
    """A tied gpu-memory-utilization probe must not trigger a further climb step.

    trial-0004 repeats the TP=2 topology of trial-0002 with
    ``gpu-memory-utilization`` 0.92 and records the same rates (6.5 total,
    3.25 per GPU). The test asserts that no ``gpu-memory-utilization`` candidate
    proposes the patch ``{"tensor-parallel-size": 2, "gpu-memory-utilization": 0.94}``.
    """
    slo_overrides = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine_overrides = {
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "gpu-memory-utilization",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2],
            "allowed_tp_dp_products": [1, 2, 4, 8],
        },
    }
    passing_probe = {
        "threshold": 0.75,
        "feasible": True,
        "payload": {
            "request_count": 300,
            "pass_rate": 1.0,
            "request_rate": 6.5,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    failing_probe = {
        "threshold": 0.765625,
        "feasible": False,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.6,
            "request_rate": 6.7,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"ttft_ms>4000.0": 80}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_sampling_u": 0.75,
        "best_request_rate": 6.5,
        "best_pass_rate": 1.0,
        "probes": [passing_probe, failing_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study = load_study_spec(
            _write_study_assets(
                root,
                slo_overrides=slo_overrides,
                engine_overrides=engine_overrides,
            )
        )
        recorded = root / "trial-0002.json"
        recorded.write_text(json.dumps(trial_result), encoding="utf-8")
        history = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=2.2,
                best_request_rate_per_gpu=2.2,
                config_patch={"env_patch": {}, "flag_patch": {}},
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=6.5,
                best_request_rate_per_gpu=3.25,
                result_path=str(recorded),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 2},
                },
            ),
            TrialSummary(
                trial_id="trial-0003",
                status="completed",
                best_request_rate=8.4,
                best_request_rate_per_gpu=2.1,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 4},
                },
            ),
            TrialSummary(
                trial_id="trial-0004",
                status="completed",
                best_request_rate=6.5,
                best_request_rate_per_gpu=3.25,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "gpu-memory-utilization": 0.92,
                    },
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=6.5,
            best_request_rate_per_gpu=3.25,
            trials=history,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 1500},
            state=state,
        )
        # Gather only the patches proposed by the gpu-memory-utilization family.
        gmu_patches = []
        for candidate in context["experiment_plan"]["candidate_actions"]:
            if candidate["knob_family"] == "gpu-memory-utilization":
                gmu_patches.append(candidate["config_patch"]["flag_patch"])
        next_climb_step = {"tensor-parallel-size": 2, "gpu-memory-utilization": 0.94}
        self.assertNotIn(next_climb_step, gmu_patches)
def test_harness_projects_measured_runtime_delta_to_other_frontier_anchor(self) -> None:
    """A runtime gain measured on TP=4 is projected onto the TP=2 anchor next.

    The history holds nine completed trials; trial-0005 (TP=4 with
    ``gpu-memory-utilization`` 0.9) is the recorded best. The test asserts the
    next action is a ``frontier-delta-projection`` carrying TP=2, DP=1 and
    ``gpu-memory-utilization`` 0.9, that the guided proposal materializes to a
    config signature not among the tested trials, and that no stop proposal
    is produced.
    """

    def completed(trial_id, size, rate, per_gpu, flags, **extra):
        # Build a completed TrialSummary; only explicitly given extras are passed on.
        return TrialSummary(
            trial_id=trial_id,
            status="completed",
            parallel_size=size,
            best_request_rate=rate,
            best_request_rate_per_gpu=per_gpu,
            config_patch={"env_patch": {}, "flag_patch": flags},
            **extra,
        )

    slo_overrides = {
        "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
    }
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 2,
            "data-parallel-size": 1,
            "gpu-memory-utilization": 0.5,
            "max-num-seqs": 8,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "gpu-memory-utilization",
            "max-num-seqs",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [2, 4, 8],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [2, 4, 8],
        },
    }
    passing_probe = {
        "threshold": 0.1,
        "feasible": True,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.96,
            "request_rate": 8.0,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    failing_probe = {
        "threshold": 0.12,
        "feasible": False,
        "payload": {
            "request_count": 300,
            "pass_rate": 0.6,
            "request_rate": 9.0,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"ttft_ms>4000.0": 100}},
        },
    }
    latest_result = {
        "status": "completed",
        "best_sampling_u": 0.1,
        "best_request_rate": 8.0,
        "best_pass_rate": 0.96,
        "probes": [passing_probe, failing_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study = load_study_spec(
            _write_study_assets(
                root,
                slo_overrides=slo_overrides,
                engine_overrides=engine_overrides,
            )
        )
        latest_result_path = root / "trial-0005.json"
        latest_result_path.write_text(json.dumps(latest_result), encoding="utf-8")
        history = [
            completed("trial-0001", 2, 2.9, 1.45, {}),
            completed("trial-0002", 4, 6.95, 1.7375, {"tensor-parallel-size": 4}),
            completed("trial-0003", 8, 8.0, 1.0, {"tensor-parallel-size": 8}),
            completed(
                "trial-0004",
                4,
                6.95,
                1.7375,
                {"tensor-parallel-size": 4, "max-num-seqs": 16},
            ),
            completed(
                "trial-0005",
                4,
                8.0,
                2.0,
                {"tensor-parallel-size": 4, "gpu-memory-utilization": 0.9},
                result_path=str(latest_result_path),
            ),
            completed(
                "trial-0006",
                4,
                8.0,
                2.0,
                {
                    "tensor-parallel-size": 4,
                    "gpu-memory-utilization": 0.9,
                    "max-num-seqs": 16,
                },
            ),
            completed(
                "trial-0007",
                4,
                8.0,
                2.0,
                {"tensor-parallel-size": 4, "gpu-memory-utilization": 0.92},
            ),
            completed(
                "trial-0008",
                4,
                8.0,
                2.0,
                {
                    "tensor-parallel-size": 4,
                    "gpu-memory-utilization": 0.9,
                    "max-num-batched-tokens": 16384,
                    "max-num-seqs": 16,
                },
            ),
            completed(
                "trial-0009",
                4,
                8.0,
                2.0,
                {
                    "tensor-parallel-size": 4,
                    "gpu-memory-utilization": 0.9,
                    "enable-chunked-prefill": True,
                    "max-num-batched-tokens": 8192,
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0005",
            best_request_rate=8.0,
            best_request_rate_per_gpu=2.0,
            trials=history,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 6500},
            state=state,
        )
        chosen = context["experiment_plan"]["next_action"]
        self.assertEqual(chosen["knob_family"], "frontier-delta-projection")
        expected_patch = {
            "tensor-parallel-size": 2,
            "data-parallel-size": 1,
            "gpu-memory-utilization": 0.9,
        }
        self.assertEqual(chosen["config_patch"]["flag_patch"], expected_patch)
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        materialized_signature = materialized_effective_config_signature(
            study=study,
            state=state,
            proposal=proposal,
        )
        # The proposal must describe a configuration that no recorded trial has run.
        tested_signatures = set()
        for past in state.trials:
            tested_signatures.add(_effective_config_signature(study, past.config_patch))
        self.assertNotIn(materialized_signature, tested_signatures)
        self.assertIsNone(build_harness_stop_proposal(context))
def test_harness_validates_unmeasured_tp_frontier_before_runtime_refinement(self) -> None:
    """The guided proposal probes the untested TP=4 topology first.

    With the base config and TP=2 measured, the proposal's flag patch must be
    exactly ``{"tensor-parallel-size": 4}`` and the context's harness proposal
    reason must be ``topology_frontier_probe_for_slo_pressure``.
    """
    engine_overrides = {
        "tunable_flags": [
            "tensor-parallel-size",
            "max-num-batched-tokens",
            "enable-chunked-prefill",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    passing_probe = {
        "threshold": 0.5,
        "feasible": True,
        "payload": {
            "request_count": 100,
            "pass_rate": 0.96,
            "request_rate": 2.0,
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    failing_probe = {
        "threshold": 0.75,
        "feasible": False,
        "payload": {
            "request_count": 100,
            "pass_rate": 0.6,
            "request_rate": 3.0,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"tpot_ms>25.0": 40}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_sampling_u": 0.5,
        "best_request_rate": 2.0,
        "best_pass_rate": 0.96,
        "probes": [passing_probe, failing_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study = load_study_spec(
            _write_study_assets(root, engine_overrides=engine_overrides)
        )
        recorded = root / "trial-0002.json"
        recorded.write_text(json.dumps(trial_result), encoding="utf-8")
        history = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=0.5,
                best_request_rate_per_gpu=0.5,
                config_patch={"env_patch": {}, "flag_patch": {}},
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=2.0,
                best_request_rate_per_gpu=1.0,
                result_path=str(recorded),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 2},
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=2.0,
            best_request_rate_per_gpu=1.0,
            trials=history,
        )
        window = {"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8}
        context = build_harness_context(study=study, window_summary=window, state=state)
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertEqual(proposal.config_patch.flag_patch, {"tensor-parallel-size": 4})
        reason = context["harness_proposal"]["reason"]
        self.assertEqual(reason, "topology_frontier_probe_for_slo_pressure")
def test_profile_driven_planner_scores_unmeasured_tp_frontier(self) -> None:
    """The profile-driven plan selects the unmeasured TP=4 topology.

    Asserts the planner version string, that the next action is a ``topology``
    action patching ``tensor-parallel-size`` to 4, that the first bottleneck
    hypothesis name contains ``ttft_prefill``, and that the harness does not
    ask to stop.
    """
    engine_overrides = {
        "tunable_flags": [
            "tensor-parallel-size",
            "max-num-batched-tokens",
            "enable-chunked-prefill",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    failing_probe = {
        "threshold": 0.75,
        "feasible": False,
        "payload": {
            "request_count": 100,
            "pass_rate": 0.6,
            "request_rate": 3.0,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"ttft_ms>4000.0": 35}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_sampling_u": 0.5,
        "best_request_rate": 2.0,
        "best_pass_rate": 0.96,
        "probes": [failing_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root, engine_overrides=engine_overrides)
        recorded = root / "trial-0002.json"
        recorded.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        history = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=0.5,
                best_request_rate_per_gpu=0.5,
                config_patch={"env_patch": {}, "flag_patch": {}},
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=2.0,
                best_request_rate_per_gpu=1.0,
                result_path=str(recorded),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 2},
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=2.0,
            best_request_rate_per_gpu=1.0,
            trials=history,
        )
        window = {"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8}
        context = build_harness_context(study=study, window_summary=window, state=state)
        plan = context["experiment_plan"]
        self.assertEqual(plan["planner_version"], "profile-driven-v1")
        chosen = plan["next_action"]
        self.assertEqual(chosen["knob_family"], "topology")
        self.assertEqual(
            chosen["config_patch"]["flag_patch"],
            {"tensor-parallel-size": 4},
        )
        top_hypothesis = context["bottleneck_hypotheses"][0]
        self.assertIn("ttft_prefill", top_hypothesis["name"])
        self.assertFalse(context["harness_stop"]["should_stop"])
def test_profile_driven_topology_does_not_introduce_ep_for_ttft(self) -> None:
    """No candidate action touches the expert-parallel flags.

    Three completed trials (base, TP=2, TP=4) share the same TTFT-failing probe
    result. Every candidate action's flag patch must omit both
    ``enable-expert-parallel`` and ``expert-parallel-size``.
    """
    engine_overrides = {
        "base_flags": {"host": "127.0.0.1", "port": 8000},
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
            "enable-expert-parallel",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_data_parallel_sizes": [1],
            "allowed_expert_parallel_sizes": [1, 2],
            "allowed_tp_dp_products": [1, 2, 4],
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "require_enable_expert_parallel_when_ep_gt_one": True,
        },
    }
    failing_probe = {
        "threshold": 0.5,
        "feasible": False,
        "payload": {
            "request_count": 100,
            "pass_rate": 0.6,
            "request_rate": 4.0,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"ttft_ms>2000": 40}},
        },
    }
    shared_result = {
        "status": "completed",
        "best_sampling_u": 0.25,
        "best_request_rate": 2.0,
        "best_pass_rate": 1.0,
        "probes": [failing_probe],
    }
    # (trial id, total rate, per-GPU rate, flag patch) for each recorded trial.
    trial_rows = [
        ("trial-0001", 2.0, 2.0, {}),
        ("trial-0002", 4.0, 2.0, {"tensor-parallel-size": 2}),
        ("trial-0003", 4.0, 1.0, {"tensor-parallel-size": 4}),
    ]
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root, engine_overrides=engine_overrides)
        # Every trial file carries the same serialized payload.
        serialized = json.dumps(shared_result)
        result_paths: list[Path] = []
        for idx in (1, 2, 3):
            target = root / f"trial-000{idx}.json"
            target.write_text(serialized, encoding="utf-8")
            result_paths.append(target)
        study = load_study_spec(study_path)
        history = [
            TrialSummary(
                trial_id=trial_id,
                status="completed",
                best_request_rate=rate,
                best_request_rate_per_gpu=per_gpu,
                result_path=str(recorded),
                config_patch={"env_patch": {}, "flag_patch": flags},
            )
            for (trial_id, rate, per_gpu, flags), recorded in zip(trial_rows, result_paths)
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=4.0,
            best_request_rate_per_gpu=2.0,
            trials=history,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 8192},
            state=state,
        )
        for candidate in context["experiment_plan"]["candidate_actions"]:
            proposed_flags = candidate["config_patch"]["flag_patch"]
            self.assertNotIn("enable-expert-parallel", proposed_flags)
            self.assertNotIn("expert-parallel-size", proposed_flags)
def test_profile_driven_planner_prefers_decode_concurrency_relief(self) -> None:
    """A TPOT-failing decode-only study gets a lower ``max-num-seqs`` next.

    The study has no TTFT rule, a 20 ms TPOT rule and a base ``max-num-seqs`` of
    64. The next action must be in the ``max-num-seqs`` knob family with the
    flag patch ``{"max-num-seqs": 32}``.
    """
    trace_overrides = {"request_mode": "decode_only"}
    slo_overrides = {
        "ttft_rule": None,
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
    }
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 4,
            "max-num-seqs": 64,
        },
        "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4],
            "allowed_tp_dp_products": [1, 2, 4],
        },
    }
    failing_probe = {
        "threshold": 0.5,
        "feasible": False,
        "payload": {
            "request_count": 100,
            "pass_rate": 0.5,
            "request_rate": 2.0,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {"tpot_ms>20.0": 50}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_sampling_u": 0.25,
        "best_request_rate": 1.0,
        "best_pass_rate": 0.97,
        "probes": [failing_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(
            root,
            trace_overrides=trace_overrides,
            slo_overrides=slo_overrides,
            engine_overrides=engine_overrides,
        )
        recorded = root / "trial-0001.json"
        recorded.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            best_request_rate=1.0,
            best_request_rate_per_gpu=0.25,
            result_path=str(recorded),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_request_rate=1.0,
            best_request_rate_per_gpu=0.25,
            trials=[baseline],
        )
        context = build_harness_context(study=study, window_summary={}, state=state)
        chosen = context["experiment_plan"]["next_action"]
        self.assertEqual(chosen["knob_family"], "max-num-seqs")
        self.assertEqual(chosen["config_patch"]["flag_patch"], {"max-num-seqs": 32})
def test_prefill_convergence_stop_waits_for_sequence_concurrency_probe(self) -> None:
    """The harness keeps going while a prefill-scheduler candidate remains.

    Four completed trials are recorded, three with the same 2.303 rate and one
    with no feasible rate. The harness must not stop, must report
    ``experiment_plan_has_high_value_candidate`` as the reason, and must pick the
    ``raise_prefill_quantum_with_chunked_prefill`` action at TP=8 with
    ``max-num-batched-tokens`` above 8192.
    """
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 4,
            "data-parallel-size": 1,
            "max-num-batched-tokens": 8192,
            "max-num-seqs": 64,
            "enable-chunked-prefill": True,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "max-num-batched-tokens",
            "max-num-seqs",
            "enable-chunked-prefill",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [4, 8],
            "allowed_data_parallel_sizes": [1, 2],
            "allowed_tp_dp_products": [4, 8],
        },
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root, engine_overrides=engine_overrides)

        def write_result(name: str, best_rate: float | None, pass_rate: float) -> Path:
            # A missing best rate marks the single probe as infeasible.
            feasible = best_rate is not None
            probe = {
                "threshold": 0.09375,
                "feasible": feasible,
                "payload": {
                    "request_rate": best_rate,
                    "pass_rate": pass_rate,
                    "early_stop_reason": "" if feasible else "slo_pass_rate_unrecoverable",
                    "latency_summary": {"failed_reason_counts": {"ttft_ms>4000.0": 32}},
                },
            }
            document = {
                "status": "completed",
                "best_sampling_u": 0.091796875 if feasible else None,
                "best_request_rate": best_rate,
                "best_pass_rate": pass_rate if feasible else None,
                "probes": [probe],
            }
            destination = root / f"{name}.json"
            destination.write_text(json.dumps(document), encoding="utf-8")
            return destination

        study = load_study_spec(study_path)
        first_path = write_result("trial-0001", 2.303, 0.952)
        second_path = write_result("trial-0002", 2.303, 0.953)
        third_path = write_result("trial-0003", None, 0.0)
        fourth_path = write_result("trial-0004", 2.303, 0.954)
        history = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=2.303,
                best_request_rate_per_gpu=0.288,
                best_pass_rate=0.952,
                result_path=str(first_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                    },
                },
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=8,
                best_request_rate=2.303,
                best_request_rate_per_gpu=0.288,
                best_pass_rate=0.953,
                result_path=str(second_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 8,
                        "max-num-batched-tokens": 32768,
                    },
                },
            ),
            TrialSummary(
                trial_id="trial-0003",
                status="completed",
                parallel_size=8,
                result_path=str(third_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 4,
                        "data-parallel-size": 2,
                    },
                },
            ),
            TrialSummary(
                trial_id="trial-0004",
                status="completed",
                parallel_size=8,
                best_request_rate=2.303,
                best_request_rate_per_gpu=0.288,
                best_pass_rate=0.954,
                result_path=str(fourth_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                        "max-num-batched-tokens": 12288,
                    },
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_sampling_u=0.091796875,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            trials=history,
        )
        window = {"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000}
        context = build_harness_context(study=study, window_summary=window, state=state)
        self.assertFalse(context["harness_stop"]["should_stop"])
        self.assertEqual(
            context["harness_stop"]["reason"],
            "experiment_plan_has_high_value_candidate",
        )
        chosen = context["experiment_plan"]["next_action"]
        self.assertEqual(chosen["knob_family"], "prefill-scheduler-interaction")
        self.assertEqual(chosen["action_id"], "raise_prefill_quantum_with_chunked_prefill")
        proposed_flags = chosen["config_patch"]["flag_patch"]
        self.assertEqual(proposed_flags["tensor-parallel-size"], 8)
        self.assertGreater(proposed_flags["max-num-batched-tokens"], 8192)
def test_prefill_scheduler_lowers_quantum_by_normalized_ratio(self) -> None:
    """A 32768-token prefill quantum is lowered for an 8192-token p95 prompt.

    The next action must be ``lower_prefill_quantum_with_chunked_prefill``,
    propose ``max-num-batched-tokens`` below 32768, and report a target quantum
    ratio smaller than the current one in its score factors.
    """
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 8,
            "data-parallel-size": 1,
            "max-num-batched-tokens": 32768,
            "max-num-seqs": 8,
            "enable-chunked-prefill": True,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "max-num-batched-tokens",
            "max-num-seqs",
            "enable-chunked-prefill",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [8],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [8],
        },
    }
    passing_probe = {
        "threshold": 0.5,
        "feasible": True,
        "payload": {
            "request_rate": 2.0,
            "pass_rate": 0.95,
            "latency_summary": {"failed_reason_counts": {"ttft_ms>4000.0": 24}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_sampling_u": 0.5,
        "best_request_rate": 2.0,
        "best_pass_rate": 0.95,
        "probes": [passing_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root, engine_overrides=engine_overrides)
        recorded = root / "trial-0001.json"
        recorded.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.25,
            result_path=str(recorded),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.25,
            trials=[baseline],
        )
        window = {"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0}
        context = build_harness_context(study=study, window_summary=window, state=state)
        chosen = context["experiment_plan"]["next_action"]
        proposed_flags = chosen["config_patch"]["flag_patch"]
        self.assertEqual(chosen["knob_family"], "prefill-scheduler-interaction")
        self.assertEqual(chosen["action_id"], "lower_prefill_quantum_with_chunked_prefill")
        self.assertLess(proposed_flags["max-num-batched-tokens"], 32768)
        score = chosen["score_factors"]
        self.assertLess(
            score["prefill_quantum_ratio_target"],
            score["prefill_quantum_ratio_current"],
        )
def test_prefill_scheduler_quantum_step_scales_with_prompt_length(self) -> None:
    """A longer p95 prompt yields a larger proposed prefill quantum.

    The same study is planned twice, with ``prompt_tokens_p95`` of 8192 and of
    16384. The ``max-num-batched-tokens`` proposed for the second run must be
    greater than the one proposed for the first.
    """
    proposed_quanta: list[int] = []
    for prompt_p95 in (8192, 16384):
        # Fresh override and payload dicts are built for every run.
        engine_overrides = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 8,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 32768,
                "max-num-seqs": 8,
                "enable-chunked-prefill": True,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [8],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [8],
            },
        }
        passing_probe = {
            "threshold": 0.5,
            "feasible": True,
            "payload": {
                "request_rate": 2.0,
                "pass_rate": 0.95,
                "latency_summary": {"failed_reason_counts": {"ttft_ms>4000.0": 24}},
            },
        }
        trial_result = {
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 0.95,
            "probes": [passing_probe],
        }
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            study_path = _write_study_assets(root, engine_overrides=engine_overrides)
            recorded = root / "trial-0001.json"
            recorded.write_text(json.dumps(trial_result), encoding="utf-8")
            study = load_study_spec(study_path)
            baseline = TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                result_path=str(recorded),
                config_patch={"env_patch": {}, "flag_patch": {}},
            )
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=8,
                best_request_rate=2.0,
                best_request_rate_per_gpu=0.25,
                trials=[baseline],
            )
            window = {
                "prompt_tokens_p95": prompt_p95,
                "prompt_tail_ratio_p95_p50": 4.0,
            }
            context = build_harness_context(
                study=study,
                window_summary=window,
                state=state,
            )
            chosen = context["experiment_plan"]["next_action"]
            self.assertEqual(chosen["knob_family"], "prefill-scheduler-interaction")
            quantum = chosen["config_patch"]["flag_patch"]["max-num-batched-tokens"]
            proposed_quanta.append(quantum)
    self.assertGreater(proposed_quanta[1], proposed_quanta[0])
def test_prefill_scheduler_coverage_precedes_gmu_microtune(self) -> None:
    """Seeding the chunked-prefill quantum is chosen as the next action.

    With the base config and TP=4 measured, the next action must be
    ``seed_chunked_prefill_quantum`` with a positive
    ``uncovered_scheduler_dimension_bonus``, and no candidate may use
    ``enable-chunked-prefill`` as its knob family.
    """
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 2,
            "data-parallel-size": 1,
            "gpu-memory-utilization": 0.7,
            "max-num-seqs": 8,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "gpu-memory-utilization",
            "max-num-batched-tokens",
            "max-num-seqs",
            "enable-chunked-prefill",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [2, 4],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [2, 4],
        },
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(
            root,
            engine_overrides=engine_overrides,
            trace_overrides={"max_concurrency": 64},
        )

        def write_result(name: str, request_rate: float) -> Path:
            # Each result holds one feasible probe at the given request rate.
            probe = {
                "threshold": 0.5,
                "feasible": True,
                "payload": {
                    "request_rate": request_rate,
                    "pass_rate": 0.95,
                    "latency_summary": {"failed_reason_counts": {"ttft_ms>4000.0": 24}},
                },
            }
            document = {
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": request_rate,
                "best_pass_rate": 0.95,
                "probes": [probe],
            }
            destination = root / f"{name}.json"
            destination.write_text(json.dumps(document), encoding="utf-8")
            return destination

        study = load_study_spec(study_path)
        base_path = write_result("trial-0001", 4.05)
        wide_path = write_result("trial-0002", 8.0)
        history = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=2,
                best_request_rate=4.05,
                best_request_rate_per_gpu=2.025,
                result_path=str(base_path),
                config_patch={"env_patch": {}, "flag_patch": {}},
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=4,
                best_request_rate=8.0,
                best_request_rate_per_gpu=2.0,
                result_path=str(wide_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 4},
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=2,
            best_request_rate=4.05,
            best_request_rate_per_gpu=2.025,
            trials=history,
        )
        window = {"prompt_tokens_p95": 7774, "prompt_tail_ratio_p95_p50": 3.0}
        context = build_harness_context(study=study, window_summary=window, state=state)
        chosen = context["experiment_plan"]["next_action"]
        self.assertEqual(chosen["knob_family"], "prefill-scheduler-interaction")
        self.assertEqual(chosen["action_id"], "seed_chunked_prefill_quantum")
        coverage_bonus = chosen["score_factors"]["uncovered_scheduler_dimension_bonus"]
        self.assertGreater(coverage_bonus, 0.0)
        families = set()
        for candidate in context["experiment_plan"]["candidate_actions"]:
            families.add(candidate["knob_family"])
        self.assertNotIn("enable-chunked-prefill", families)
def test_prefill_scheduler_admission_pressure_only_uses_normalized_seq_cap(self) -> None:
    """Raising admission pressure changes ``max-num-seqs`` only.

    The base config has ``max-num-seqs`` 8 and trace ``max_concurrency`` 64. The
    next action must be ``raise_admission_pressure_with_chunked_prefill``, set
    ``max-num-seqs`` to 16 without touching ``max-num-batched-tokens``, and report
    a ``raise`` direction with the current ratio below the target ratio.
    """
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 8,
            "data-parallel-size": 1,
            "max-num-batched-tokens": 8192,
            "max-num-seqs": 8,
            "enable-chunked-prefill": True,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "max-num-batched-tokens",
            "max-num-seqs",
            "enable-chunked-prefill",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [8],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [8],
        },
    }
    failing_probe = {
        "threshold": 0.5,
        "feasible": False,
        "payload": {
            "request_rate": 2.0,
            "pass_rate": 0.5,
            "early_stop_reason": "slo_pass_rate_unrecoverable",
            "latency_summary": {"failed_reason_counts": {}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_sampling_u": 0.5,
        "best_request_rate": 2.0,
        "best_pass_rate": 0.5,
        "probes": [failing_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(
            root,
            trace_overrides={"max_concurrency": 64},
            engine_overrides=engine_overrides,
        )
        recorded = root / "trial-0001.json"
        recorded.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.25,
            result_path=str(recorded),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.25,
            trials=[baseline],
        )
        window = {"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0}
        context = build_harness_context(study=study, window_summary=window, state=state)
        chosen = context["experiment_plan"]["next_action"]
        proposed_flags = chosen["config_patch"]["flag_patch"]
        self.assertEqual(chosen["knob_family"], "prefill-scheduler-interaction")
        self.assertEqual(chosen["action_id"], "raise_admission_pressure_with_chunked_prefill")
        self.assertEqual(proposed_flags["max-num-seqs"], 16)
        self.assertNotIn("max-num-batched-tokens", proposed_flags)
        score = chosen["score_factors"]
        self.assertEqual(score["admission_pressure_direction"], "raise")
        self.assertLess(
            score["admission_pressure_ratio_current"],
            score["admission_pressure_ratio_target"],
        )
def test_prefill_scheduler_lowers_excess_admission_pressure(self) -> None:
    """An oversized ``max-num-seqs`` is lowered without touching the quantum.

    The base config has ``max-num-seqs`` 128 against trace ``max_concurrency`` 64.
    The next action must be ``lower_admission_pressure_with_chunked_prefill``,
    propose ``max-num-seqs`` below 128 with no ``max-num-batched-tokens`` entry,
    and report a ``lower`` direction with the target ratio below the current one.
    """
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 8,
            "data-parallel-size": 1,
            "max-num-batched-tokens": 8192,
            "max-num-seqs": 128,
            "enable-chunked-prefill": True,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "max-num-batched-tokens",
            "max-num-seqs",
            "enable-chunked-prefill",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [8],
            "allowed_data_parallel_sizes": [1],
            "allowed_tp_dp_products": [8],
        },
    }
    passing_probe = {
        "threshold": 0.5,
        "feasible": True,
        "payload": {
            "request_rate": 2.0,
            "pass_rate": 0.95,
            "latency_summary": {"failed_reason_counts": {"ttft_ms>4000.0": 24}},
        },
    }
    trial_result = {
        "status": "completed",
        "best_sampling_u": 0.5,
        "best_request_rate": 2.0,
        "best_pass_rate": 0.95,
        "probes": [passing_probe],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(
            root,
            trace_overrides={"max_concurrency": 64},
            engine_overrides=engine_overrides,
        )
        recorded = root / "trial-0001.json"
        recorded.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        baseline = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.25,
            result_path=str(recorded),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.25,
            trials=[baseline],
        )
        window = {"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0}
        context = build_harness_context(study=study, window_summary=window, state=state)
        chosen = context["experiment_plan"]["next_action"]
        proposed_flags = chosen["config_patch"]["flag_patch"]
        self.assertEqual(chosen["knob_family"], "prefill-scheduler-interaction")
        self.assertEqual(chosen["action_id"], "lower_admission_pressure_with_chunked_prefill")
        self.assertLess(proposed_flags["max-num-seqs"], 128)
        self.assertNotIn("max-num-batched-tokens", proposed_flags)
        score = chosen["score_factors"]
        self.assertEqual(score["admission_pressure_direction"], "lower")
        self.assertLess(
            score["admission_pressure_ratio_target"],
            score["admission_pressure_ratio_current"],
        )
def test_prefill_scheduler_negative_applicability_matrix(self) -> None:
    """Assert the prefill-scheduler family is absent from the candidates.

    Three (trace_overrides, window_summary) pairs are exercised: a
    decode_only trace, a window summary carrying a prefix-cache estimate,
    and a short-prompt window summary. Each pair runs as its own subtest
    against a freshly written study and a single completed trial.
    """
    matrix = [
        (
            {"request_mode": "decode_only"},
            {"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0},
        ),
        (
            {},
            {
                "prompt_tokens_p95": 8192,
                "prompt_tail_ratio_p95_p50": 4.0,
                "prefix_cache": {"repeated_token_ratio_estimate": 0.75},
            },
        ),
        (
            {},
            {"prompt_tokens_p95": 2048, "prompt_tail_ratio_p95_p50": 1.0},
        ),
    ]
    for trace_overrides, window_summary in matrix:
        with self.subTest(trace_overrides=trace_overrides, window_summary=window_summary):
            with tempfile.TemporaryDirectory() as tmp:
                workdir = Path(tmp)
                # Rebuilt per subtest so every case receives fresh dict objects.
                engine_cfg = {
                    "base_flags": {
                        "host": "127.0.0.1",
                        "port": 8000,
                        "tensor-parallel-size": 8,
                        "data-parallel-size": 1,
                        "max-num-batched-tokens": 8192,
                        "max-num-seqs": 8,
                        "enable-chunked-prefill": True,
                    },
                    "tunable_flags": [
                        "tensor-parallel-size",
                        "data-parallel-size",
                        "max-num-batched-tokens",
                        "max-num-seqs",
                        "enable-chunked-prefill",
                    ],
                    "topology_constraints": {
                        "allowed_tensor_parallel_sizes": [8],
                        "allowed_data_parallel_sizes": [1],
                        "allowed_tp_dp_products": [8],
                    },
                }
                study_path = _write_study_assets(
                    workdir,
                    trace_overrides=trace_overrides,
                    engine_overrides=engine_cfg,
                )
                trial_result = {
                    "status": "completed",
                    "best_sampling_u": 0.5,
                    "best_request_rate": 2.0,
                    "best_pass_rate": 0.95,
                    "probes": [
                        {
                            "threshold": 0.5,
                            "feasible": True,
                            "payload": {
                                "request_rate": 2.0,
                                "pass_rate": 0.95,
                                "latency_summary": {
                                    "failed_reason_counts": {"ttft_ms>4000.0": 24}
                                },
                            },
                        }
                    ],
                }
                result_path = workdir / "trial-0001.json"
                result_path.write_text(json.dumps(trial_result), encoding="utf-8")
                study = load_study_spec(study_path)
                baseline_trial = TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    parallel_size=8,
                    best_request_rate=2.0,
                    best_request_rate_per_gpu=0.25,
                    result_path=str(result_path),
                    config_patch={"env_patch": {}, "flag_patch": {}},
                )
                state = StudyState(
                    study_id=study.study_id,
                    best_trial_id="trial-0001",
                    best_parallel_size=8,
                    best_request_rate=2.0,
                    best_request_rate_per_gpu=0.25,
                    trials=[baseline_trial],
                )
                context = build_harness_context(
                    study=study,
                    window_summary=window_summary,
                    state=state,
                )
                offered_families = set()
                for candidate in context["experiment_plan"]["candidate_actions"]:
                    offered_families.add(candidate["knob_family"])
                self.assertNotIn("prefill-scheduler-interaction", offered_families)
def test_prefill_scheduler_does_not_preempt_open_topology_frontier(self) -> None:
    """Expect a topology action, and no prefill-scheduler candidate.

    The study allows TP in {2, 4} and DP in {1, 2}; the only recorded trial
    patched ``data-parallel-size`` to 2. The next action must be the
    TP=4/DP=2 topology patch.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        engine_cfg = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 2,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 8192,
                "max-num-seqs": 8,
                "enable-chunked-prefill": True,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [2, 4],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [4, 8],
            },
        }
        study_path = _write_study_assets(workdir, engine_overrides=engine_cfg)
        trial_result = {
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 0.95,
            "probes": [
                {
                    "threshold": 0.5,
                    "feasible": True,
                    "payload": {
                        "request_rate": 2.0,
                        "pass_rate": 0.95,
                        "latency_summary": {
                            "failed_reason_counts": {"ttft_ms>4000.0": 24}
                        },
                    },
                }
            ],
        }
        result_path = workdir / "trial-0001.json"
        result_path.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        measured_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=4,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.5,
            result_path=str(result_path),
            config_patch={
                "env_patch": {},
                "flag_patch": {"data-parallel-size": 2},
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=4,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.5,
            trials=[measured_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 8192, "prompt_tail_ratio_p95_p50": 4.0},
            state=state,
        )
        plan = context["experiment_plan"]
        action = plan["next_action"]
        self.assertEqual(action["knob_family"], "topology")
        self.assertEqual(
            action["config_patch"]["flag_patch"],
            {"tensor-parallel-size": 4, "data-parallel-size": 2},
        )
        offered_families = set()
        for candidate in plan["candidate_actions"]:
            offered_families.add(candidate["knob_family"])
        self.assertNotIn("prefill-scheduler-interaction", offered_families)
def test_prefill_scheduler_not_active_for_short_prompt_workload(self) -> None:
    """Assert no prefill-scheduler candidate for a short-prompt window summary.

    The window summary reports ``prompt_tokens_p95`` of 2048 and a tail
    ratio of 1.0, with topology pinned to TP=8/DP=1.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        engine_cfg = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 8,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 32768,
                "max-num-seqs": 8,
                "enable-chunked-prefill": True,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [8],
                "allowed_data_parallel_sizes": [1],
                "allowed_tp_dp_products": [8],
            },
        }
        study_path = _write_study_assets(workdir, engine_overrides=engine_cfg)
        trial_result = {
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 0.95,
            "probes": [
                {
                    "threshold": 0.5,
                    "feasible": True,
                    "payload": {
                        "request_rate": 2.0,
                        "pass_rate": 0.95,
                        "latency_summary": {
                            "failed_reason_counts": {"ttft_ms>4000.0": 24}
                        },
                    },
                }
            ],
        }
        result_path = workdir / "trial-0001.json"
        result_path.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        baseline_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.25,
            result_path=str(result_path),
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_request_rate=2.0,
            best_request_rate_per_gpu=0.25,
            trials=[baseline_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 2048, "prompt_tail_ratio_p95_p50": 1.0},
            state=state,
        )
        offered_families = set()
        for candidate in context["experiment_plan"]["candidate_actions"]:
            offered_families.add(candidate["knob_family"])
        self.assertNotIn("prefill-scheduler-interaction", offered_families)
def test_prefill_sequence_probe_followed_by_joint_runtime_probe(self) -> None:
    """Expect a prefill-quantum raise after three equal-rate TP=8 trials.

    The three recorded trials share the same best request rate; the second
    patched ``max-num-seqs`` and the third ``max-num-batched-tokens``. The
    harness must not stop, and the next action must raise
    ``max-num-batched-tokens`` strictly between 8192 and 24000 at TP=8.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        engine_cfg = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "tensor-parallel-size": 4,
                "data-parallel-size": 1,
                "max-num-batched-tokens": 8192,
                "max-num-seqs": 64,
                "enable-chunked-prefill": True,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-batched-tokens",
                "max-num-seqs",
                "enable-chunked-prefill",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [4, 8],
                "allowed_data_parallel_sizes": [1, 2],
                "allowed_tp_dp_products": [4, 8],
            },
        }
        study_path = _write_study_assets(workdir, engine_overrides=engine_cfg)

        def write_result(name: str) -> Path:
            """Write the shared trial result JSON as ``<name>.json`` and return its path."""
            probe = {
                "threshold": 0.09375,
                "feasible": True,
                "payload": {
                    "request_rate": 2.303,
                    "pass_rate": 0.951,
                    "latency_summary": {
                        "failed_reason_counts": {"ttft_ms>4000.0": 32}
                    },
                },
            }
            document = {
                "status": "completed",
                "best_sampling_u": 0.091796875,
                "best_request_rate": 2.303,
                "best_pass_rate": 0.951,
                "probes": [probe],
            }
            target = workdir / f"{name}.json"
            target.write_text(json.dumps(document), encoding="utf-8")
            return target

        study = load_study_spec(study_path)
        # Trials are constructed in order so the result files are written 0001 -> 0003.
        topology_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            best_pass_rate=0.952,
            result_path=str(write_result("trial-0001")),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 8,
                    "data-parallel-size": 1,
                },
            },
        )
        seqs_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=8,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            best_pass_rate=0.950,
            result_path=str(write_result("trial-0002")),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 8,
                    "max-num-seqs": 96,
                },
            },
        )
        batched_tokens_trial = TrialSummary(
            trial_id="trial-0003",
            status="completed",
            parallel_size=8,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            best_pass_rate=0.950,
            result_path=str(write_result("trial-0003")),
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 8,
                    "data-parallel-size": 1,
                    "max-num-batched-tokens": 12288,
                },
            },
        )
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=8,
            best_sampling_u=0.091796875,
            best_request_rate=2.303,
            best_request_rate_per_gpu=0.288,
            trials=[topology_trial, seqs_trial, batched_tokens_trial],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 24000, "prompt_tokens_p99": 32000},
            state=state,
        )
        stop_decision = context["harness_stop"]
        self.assertFalse(stop_decision["should_stop"])
        self.assertEqual(
            stop_decision["reason"],
            "experiment_plan_has_high_value_candidate",
        )
        action = context["experiment_plan"]["next_action"]
        flag_patch = action["config_patch"]["flag_patch"]
        self.assertEqual(action["knob_family"], "prefill-scheduler-interaction")
        self.assertEqual(action["action_id"], "raise_prefill_quantum_with_chunked_prefill")
        self.assertEqual(flag_patch["tensor-parallel-size"], 8)
        self.assertGreater(flag_patch["max-num-batched-tokens"], 8192)
        self.assertLess(flag_patch["max-num-batched-tokens"], 24000)
def test_slo_unrecoverable_does_not_mask_latency_bottleneck(self) -> None:
    """Assert the top hypothesis is not ``admission_or_queueing``.

    The infeasible probe is dominated by ``slo_pass_rate_unrecoverable``
    counts alongside a few TTFT/TPOT failures. The guided proposal must be
    a ``tensor-parallel-size`` change to 2.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        slo_cfg = {
            "ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 25},
        }
        engine_cfg = {
            "tunable_flags": [
                "tensor-parallel-size",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        study_path = _write_study_assets(
            workdir,
            slo_overrides=slo_cfg,
            engine_overrides=engine_cfg,
        )
        overloaded_probe = {
            "threshold": 0.015625,
            "feasible": False,
            "payload": {
                "request_count": 290,
                "pass_rate": 0.041,
                "request_rate": 0.483,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {
                        "ttft_ms>4000.0": 2,
                        "tpot_ms>25.0": 14,
                        "slo_pass_rate_unrecoverable": 263,
                    }
                },
            },
        }
        passing_probe = {
            "threshold": 0.001953125,
            "feasible": True,
            "payload": {
                "request_count": 39,
                "pass_rate": 1.0,
                "request_rate": 0.065,
                "latency_summary": {"failed_reason_counts": {}},
            },
        }
        trial_result = {
            "status": "completed",
            "best_request_rate": 0.065,
            "best_request_rate_per_gpu": 0.065,
            "best_pass_rate": 1.0,
            "probes": [overloaded_probe, passing_probe],
        }
        result_path = workdir / "trial-0001.json"
        result_path.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_request_rate=0.065,
            best_request_rate_per_gpu=0.065,
            trials=[
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    best_request_rate=0.065,
                    best_request_rate_per_gpu=0.065,
                    best_pass_rate=1.0,
                    result_path=str(result_path),
                    config_patch={"env_patch": {}, "flag_patch": {}},
                )
            ],
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
            state=state,
        )
        top_hypothesis = context["bottleneck_hypotheses"][0]
        self.assertNotEqual(top_hypothesis["name"], "admission_or_queueing")
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertEqual(proposal.config_patch.flag_patch, {"tensor-parallel-size": 2})
def test_harness_excludes_topology_above_visible_gpu_count(self) -> None:
    """Assert no candidate or proposal sets ``tensor-parallel-size`` to 8.

    ``CUDA_VISIBLE_DEVICES`` lists seven devices while TP=8 is among the
    allowed sizes; trials at the base config, TP=2 and TP=4 are recorded.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        engine_cfg = {
            "base_envs": {"CUDA_VISIBLE_DEVICES": "0,1,2,4,5,6,7"},
            "tunable_flags": ["tensor-parallel-size"],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_tp_dp_products": [1, 2, 4, 8],
            },
        }
        study_path = _write_study_assets(workdir, engine_overrides=engine_cfg)
        trial_result = {
            "status": "completed",
            "best_request_rate": 1.078,
            "best_pass_rate": 0.958,
            "probes": [
                {
                    "threshold": 0.039,
                    "feasible": False,
                    "payload": {
                        "request_count": 100,
                        "pass_rate": 0.8,
                        "request_rate": 1.10,
                        "early_stop_reason": "slo_pass_rate_unrecoverable",
                        "latency_summary": {
                            "failed_reason_counts": {"tpot_ms>25.0": 20}
                        },
                    },
                }
            ],
        }
        result_path = workdir / "trial-0003.json"
        result_path.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        history = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=0.065,
                best_request_rate_per_gpu=0.065,
                config_patch={"env_patch": {}, "flag_patch": {}},
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=0.398,
                best_request_rate_per_gpu=0.199,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 2},
                },
            ),
            TrialSummary(
                trial_id="trial-0003",
                status="completed",
                best_request_rate=1.078,
                best_request_rate_per_gpu=0.2695,
                result_path=str(result_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 4},
                },
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0003",
            best_request_rate=1.078,
            best_request_rate_per_gpu=0.2695,
            trials=history,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
            state=state,
        )
        candidates = context["candidate_actions"]
        proposes_tp8 = any(
            candidate["config_patch"]["flag_patch"].get("tensor-parallel-size") == 8
            for candidate in candidates
        )
        self.assertFalse(proposes_tp8)
        proposal = build_harness_guided_proposal(context)
        self.assertTrue(
            proposal is None
            or proposal.config_patch.flag_patch.get("tensor-parallel-size") != 8
        )
def test_harness_stop_blocked_until_slo_driven_topology_frontier_is_measured(self) -> None:
    """Assert the harness does not stop and reports a pending topology probe.

    Four trials are recorded (base, TP=2, and two ``max-num-seqs`` patches)
    while TP=4 is allowed; the expected stop reason is
    ``topology_frontier_requires_probe``.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        engine_cfg = {
            "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4],
                "allowed_tp_dp_products": [1, 2, 4],
            },
        }
        study_path = _write_study_assets(workdir, engine_overrides=engine_cfg)
        study = load_study_spec(study_path)
        trial_result = {
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 0.96,
            "probes": [
                {
                    "threshold": 0.75,
                    "feasible": False,
                    "payload": {
                        "request_count": 100,
                        "pass_rate": 0.6,
                        "request_rate": 3.0,
                        "early_stop_reason": "slo_pass_rate_unrecoverable",
                        "latency_summary": {
                            "failed_reason_counts": {"tpot_ms>25.0": 40}
                        },
                    },
                }
            ],
        }
        result_path = workdir / "trial-0002.json"
        result_path.write_text(json.dumps(trial_result), encoding="utf-8")
        history = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                best_request_rate=0.5,
                best_request_rate_per_gpu=0.5,
                config_patch={"env_patch": {}, "flag_patch": {}},
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                best_request_rate=2.0,
                best_request_rate_per_gpu=1.0,
                result_path=str(result_path),
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": 2},
                },
            ),
            TrialSummary(
                trial_id="trial-0003",
                status="completed",
                best_request_rate=1.98,
                best_request_rate_per_gpu=0.99,
                config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 8}},
            ),
            TrialSummary(
                trial_id="trial-0004",
                status="completed",
                best_request_rate=1.98,
                best_request_rate_per_gpu=0.99,
                config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 16}},
            ),
        ]
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_request_rate=2.0,
            best_request_rate_per_gpu=1.0,
            trials=history,
        )
        context = build_harness_context(
            study=study,
            window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
            state=state,
        )
        stop_decision = context["harness_stop"]
        self.assertFalse(stop_decision["should_stop"])
        self.assertEqual(stop_decision["reason"], "topology_frontier_requires_probe")
def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
    """Load a trace with a 0..8192 input-length filter and check what survives.

    Two requests (prompt hints 1000 and 5000) must remain, the window
    summary must reflect them, and the built prompt must mention the filter.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        length_filter = {
            "min_input_tokens": 0,
            "max_input_tokens": 8192,
        }
        study_path = _write_study_assets(
            workdir,
            trace_overrides={"input_length_filter": length_filter},
        )
        study = load_study_spec(study_path)
        window, requests = load_trace_requests(study, study_spec_path=study_path)
        summary = summarize_window(requests, window)
        self.assertEqual(len(requests), 2)
        prompt_hints = [request.prompt_tokens_hint for request in requests]
        self.assertEqual(prompt_hints, [1000, 5000])
        self.assertEqual(summary["request_count"], 2)
        self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
        for required_key in ("prefix_cache", "arrival_burst_ratio_p95_to_mean"):
            self.assertIn(required_key, summary)
        prompt = build_prompt(
            study=study,
            window_summary=summary,
            state=StudyState(study_id=study.study_id),
            capability_profile=None,
        )
        for fragment in ('"input_length_filter"', '"max_input_tokens": 8192'):
            self.assertIn(fragment, prompt)
def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:
    """Expect ``SpecError`` when ``min_input_tokens`` exceeds ``max_input_tokens``."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        inverted_filter = {
            "min_input_tokens": 8193,
            "max_input_tokens": 8192,
        }
        study_path = _write_study_assets(
            workdir,
            trace_overrides={"input_length_filter": inverted_filter},
        )
        with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
            load_study_spec(study_path)
def test_trace_rejects_non_positive_max_requests_per_probe(self) -> None:
    """Expect ``SpecError`` when ``max_requests_per_probe`` is set to 0."""
    with tempfile.TemporaryDirectory() as tmp:
        bad_trace = {"max_requests_per_probe": 0}
        study_path = _write_study_assets(Path(tmp), trace_overrides=bad_trace)
        with self.assertRaisesRegex(SpecError, "max_requests_per_probe must be > 0"):
            load_study_spec(study_path)
def test_trace_rejects_invalid_replay_time_scale(self) -> None:
    """Expect ``SpecError`` when ``replay_time_scale`` is set to 0.0."""
    with tempfile.TemporaryDirectory() as tmp:
        bad_trace = {"replay_time_scale": 0.0}
        study_path = _write_study_assets(Path(tmp), trace_overrides=bad_trace)
        with self.assertRaisesRegex(SpecError, "replay_time_scale must be > 0"):
            load_study_spec(study_path)
def test_decode_only_mode_is_loaded_and_prompt_mentions_it(self) -> None:
    """Load a decode_only study without a TTFT rule and inspect the prompt.

    The loaded spec must report ``decode_only`` with engine restart after
    early stop enabled, and the prompt must carry the matching fragments.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        slo_cfg = {
            "ttft_rule": None,
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
        }
        study_path = _write_study_assets(
            workdir,
            trace_overrides={"request_mode": "decode_only"},
            slo_overrides=slo_cfg,
        )
        study = load_study_spec(study_path)
        self.assertEqual(study.trace.request_mode, "decode_only")
        self.assertTrue(study.trace.restart_engine_after_early_stop)
        window, requests = load_trace_requests(study, study_spec_path=study_path)
        window_summary = summarize_window(requests, window)
        prompt = build_prompt(
            study=study,
            window_summary=window_summary,
            state=StudyState(study_id=study.study_id),
            capability_profile=None,
        )
        expected_fragments = (
            '"request_mode": "decode_only"',
            '"restart_engine_after_early_stop": true',
            "There is no TTFT SLO for this study.",
            "decode-only",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, prompt)
def test_decode_only_restart_after_early_stop_can_be_disabled(self) -> None:
    """Assert an explicit ``restart_engine_after_early_stop=False`` is honoured for decode_only."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        trace_cfg = {
            "request_mode": "decode_only",
            "restart_engine_after_early_stop": False,
        }
        study_path = _write_study_assets(workdir, trace_overrides=trace_cfg)
        study = load_study_spec(study_path)
        self.assertFalse(study.trace.restart_engine_after_early_stop)
def test_chat_mode_does_not_restart_after_early_stop_by_default(self) -> None:
    """Assert the default study loads as ``chat`` with restart-after-early-stop off."""
    with tempfile.TemporaryDirectory() as tmp:
        study = load_study_spec(_write_study_assets(Path(tmp)))
        trace_spec = study.trace
        self.assertEqual(trace_spec.request_mode, "chat")
        self.assertFalse(trace_spec.restart_engine_after_early_stop)
def test_decode_only_harness_defaults_to_decode_tpot(self) -> None:
    """Check active knob harnesses and proposal rules for a fresh decode_only study.

    With no trial history, the TP, DP, ``max-num-seqs`` and
    ``max-num-batched-tokens`` harnesses must be active, and the proposal
    rules must contain the decode_only and base-config sentences.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        slo_cfg = {
            "ttft_rule": None,
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
        }
        engine_cfg = {
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-seqs",
                "max-num-batched-tokens",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2, 4, 8],
                "allowed_tp_dp_products": [8],
                "require_tp_dp_product_equals_gpu_count": True,
            },
        }
        study_path = _write_study_assets(
            workdir,
            trace_overrides={"request_mode": "decode_only"},
            slo_overrides=slo_cfg,
            engine_overrides=engine_cfg,
        )
        study = load_study_spec(study_path)
        window, requests = load_trace_requests(study, study_spec_path=study_path)
        context = build_harness_context(
            study=study,
            window_summary=summarize_window(requests, window),
            state=StudyState(study_id=study.study_id),
        )
        active = set()
        for harness in context["knob_harnesses"]:
            if harness["active_now"]:
                active.add(harness["knob_family"])
        expected_active = (
            "tensor-parallel-size",
            "data-parallel-size",
            "max-num-seqs",
            "max-num-batched-tokens",
        )
        for knob in expected_active:
            self.assertIn(knob, active)
        rules_text = "\n".join(context["proposal_rules"])
        self.assertIn("For decode_only studies, ignore TTFT", rules_text)
        self.assertIn("config_patch is applied to the study base config", rules_text)
def test_decode_topology_planner_prefers_dp_redistribution_and_preserves_ep(self) -> None:
    """Expect a TP=2/DP=4 topology action that leaves expert parallelism unpatched.

    The base config is TP=4/DP=2/EP=8 in decode_only mode and the recorded
    trial has an infeasible probe failing on TPOT. Both the planned next
    action and the guided proposal must carry exactly the TP/DP patch.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        slo_cfg = {
            "ttft_rule": None,
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 40},
        }
        engine_cfg = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "enable-expert-parallel": True,
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "expert-parallel-size": 8,
                "max-num-seqs": 192,
            },
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "expert-parallel-size",
                "max-num-seqs",
            ],
            "topology_constraints": {
                "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
                "allowed_data_parallel_sizes": [1, 2, 4, 8],
                "allowed_expert_parallel_sizes": [1, 2, 4, 8],
                "require_tp_dp_product_equals_gpu_count": True,
                "require_ep_size_leq_tp_dp_product": True,
                "require_ep_size_divides_tp_dp_product": True,
                "require_enable_expert_parallel_when_ep_gt_one": True,
            },
        }
        study_path = _write_study_assets(
            workdir,
            trace_overrides={"request_mode": "decode_only"},
            slo_overrides=slo_cfg,
            engine_overrides=engine_cfg,
        )
        trial_result = {
            "status": "completed",
            "best_request_rate": 0.47,
            "best_pass_rate": 0.98,
            "probes": [
                {
                    "threshold": 0.04,
                    "feasible": False,
                    "payload": {
                        "request_rate": 0.72,
                        "pass_rate": 0.3,
                        "early_stop_reason": "slo_pass_rate_unrecoverable",
                        "latency_summary": {
                            "failed_reason_counts": {"tpot_ms>40.0": 80}
                        },
                    },
                }
            ],
        }
        result_path = workdir / "trial-0001-result.json"
        result_path.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_request_rate=0.47,
            best_request_rate_per_gpu=0.05875,
            trials=[
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    best_request_rate=0.47,
                    best_request_rate_per_gpu=0.05875,
                    best_pass_rate=0.98,
                    result_path=str(result_path),
                    config_patch={"env_patch": {}, "flag_patch": {}},
                )
            ],
        )
        context = build_harness_context(
            study=study,
            window_summary={},
            state=state,
        )
        action = context["experiment_plan"]["next_action"]
        self.assertEqual(action["knob_family"], "topology")
        self.assertEqual(
            action["config_patch"]["flag_patch"],
            {"tensor-parallel-size": 2, "data-parallel-size": 4},
        )
        proposal = build_harness_guided_proposal(context)
        self.assertIsNotNone(proposal)
        self.assertEqual(
            proposal.config_patch.flag_patch,
            {"tensor-parallel-size": 2, "data-parallel-size": 4},
        )
def test_prompt_can_disable_harness_for_ablation(self) -> None:
    """Set ``llm.use_harness`` to false in the study file and inspect the prompt.

    The prompt must keep the study context, trial history and known launch
    failures sections while omitting every harness-related fragment.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study_path = _write_study_assets(workdir)
        # Rewrite the generated study file with the harness switched off.
        spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
        spec_doc["llm"]["use_harness"] = False
        study_path.write_text(json.dumps(spec_doc), encoding="utf-8")
        study = load_study_spec(study_path)
        window, requests = load_trace_requests(study, study_spec_path=study_path)
        prompt = build_prompt(
            study=study,
            window_summary=summarize_window(requests, window),
            state=StudyState(study_id=study.study_id),
            capability_profile=None,
        )
        self.assertFalse(study.llm.use_harness)
        present = (
            "Study context:",
            "Trial history:",
            "Known launch failures:",
        )
        absent = (
            '"paper_alignment"',
            "Harnesses:",
            "Disabled by llm.use_harness=false",
            "without harness hints",
            "Window summary:",
            "Parallel space candidates:",
            "Prioritize exploring legal topology changes",
        )
        for fragment in present:
            self.assertIn(fragment, prompt)
        for fragment in absent:
            self.assertNotIn(fragment, prompt)
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
    """Expect ``decode_tpot`` as the active bottleneck of the recorded trial.

    The trial result holds an infeasible probe failing on TPOT followed by a
    feasible probe that stopped on elapsed time. The DP and ``max-num-seqs``
    harnesses must be active.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        slo_cfg = {
            "ttft_rule": None,
            "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
        }
        engine_cfg = {
            "tunable_flags": [
                "tensor-parallel-size",
                "data-parallel-size",
                "max-num-seqs",
            ]
        }
        study_path = _write_study_assets(
            workdir,
            trace_overrides={"request_mode": "decode_only"},
            slo_overrides=slo_cfg,
            engine_overrides=engine_cfg,
        )
        infeasible_probe = {
            "threshold": 0.1,
            "feasible": False,
            "payload": {
                "request_rate": 2.0,
                "pass_rate": 0.1,
                "early_stop_reason": "slo_pass_rate_unrecoverable",
                "latency_summary": {
                    "failed_reason_counts": {"tpot_ms>20.0": 20}
                },
            },
        }
        timed_out_probe = {
            "threshold": 0.01,
            "feasible": True,
            "payload": {
                "request_rate": 1.0,
                "pass_rate": 1.0,
                "early_stop_reason": "probe_elapsed_s>1200.0",
                "latency_summary": {
                    "failed_reason_counts": {"probe_elapsed_s>1200.0": 1}
                },
            },
        }
        trial_result = {
            "status": "completed",
            "best_request_rate": 1.0,
            "best_pass_rate": 1.0,
            "probes": [infeasible_probe, timed_out_probe],
        }
        result_path = workdir / "trial-0001-result.json"
        result_path.write_text(json.dumps(trial_result), encoding="utf-8")
        study = load_study_spec(study_path)
        state = StudyState(
            study_id=study.study_id,
            trials=[
                TrialSummary(
                    trial_id="trial-0001",
                    status="completed",
                    result_path=str(result_path),
                )
            ],
        )
        context = build_harness_context(
            study=study,
            window_summary={},
            state=state,
        )
        diagnostics = context["recent_trial_diagnostics"]
        self.assertEqual(diagnostics[0]["active_bottleneck"], "decode_tpot")
        active = set()
        for harness in context["knob_harnesses"]:
            if harness["active_now"]:
                active.add(harness["knob_family"])
        self.assertIn("data-parallel-size", active)
        self.assertIn("max-num-seqs", active)
def test_best_feasible_probe_record_keeps_partial_probe_evidence(self) -> None:
    """Expect the feasible record with threshold 0.017578125 to be selected.

    The input holds one infeasible record and two feasible ones.
    """
    infeasible = {
        "threshold": 0.03125,
        "request_rate": 0.72,
        "pass_rate": 0.3,
        "feasible": False,
    }
    lower_feasible = {
        "threshold": 0.015625,
        "request_rate": 0.3533,
        "pass_rate": 0.99,
        "feasible": True,
    }
    higher_feasible = {
        "threshold": 0.017578125,
        "request_rate": 0.3833,
        "pass_rate": 0.995,
        "feasible": True,
    }
    best = _best_feasible_probe_record([infeasible, lower_feasible, higher_feasible])
    self.assertIsNotNone(best)
    self.assertEqual(best["threshold"], 0.017578125)
    self.assertEqual(best["request_rate"], 0.3833)
def test_load_study_spec_rejects_mismatched_served_model_name(self) -> None:
    """Expect ``SpecError`` when the model and engine served-model names differ."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        engine_cfg = {
            "base_flags": {
                "host": "127.0.0.1",
                "port": 8000,
                "served-model-name": "engine-name",
            }
        }
        study_path = _write_study_assets(workdir, engine_overrides=engine_cfg)
        # Make model.served_model_name disagree with the engine flag above.
        spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
        spec_doc["model"]["served_model_name"] = "trace-name"
        study_path.write_text(json.dumps(spec_doc), encoding="utf-8")
        with self.assertRaisesRegex(SpecError, "must match engine.base_flags"):
            load_study_spec(study_path)
def test_bailian_endpoint_defaults(self) -> None:
    """Check the base URL and API-key env var filled in for the bailian provider."""
    endpoint_doc = {"provider": "bailian", "model": "qwen-plus"}
    endpoint = LLMEndpointSpec.from_dict(endpoint_doc)
    self.assertEqual(endpoint.provider, "bailian")
    expected_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    self.assertEqual(endpoint.base_url, expected_url)
    self.assertEqual(endpoint.api_key_env, "DASHSCOPE_API_KEY")
def test_codex_endpoint_resolves_base_url_from_codex_config(self) -> None:
    """Build a codex endpoint with HOME pointing at a temp ``.codex/config.toml``.

    The endpoint must pick up the base URL, wire API and reasoning effort
    written to that file.
    """
    config_text = (
        'model_provider = "ipads"\n'
        'model_reasoning_effort = "high"\n'
        "\n"
        "[model_providers.ipads]\n"
        'base_url = "http://codex.example/v1"\n'
        'wire_api = "responses"'
    )
    with tempfile.TemporaryDirectory() as tmp:
        home = Path(tmp)
        codex_dir = home / ".codex"
        codex_dir.mkdir(parents=True)
        (codex_dir / "config.toml").write_text(config_text, encoding="utf-8")
        # clear=True leaves HOME as the only environment variable for the call.
        with mock.patch.dict(os.environ, {"HOME": str(home)}, clear=True):
            endpoint = LLMEndpointSpec.from_dict({"provider": "codex", "model": "gpt-5.4"})
            self.assertEqual(endpoint.provider, "codex")
            self.assertEqual(endpoint.base_url, "http://codex.example/v1")
            self.assertEqual(endpoint.wire_api, "responses")
            self.assertFalse(endpoint.stream)
            self.assertEqual(endpoint.reasoning_effort, "high")
            self.assertEqual(endpoint.api_key_env, "OPENAI_API_KEY")
def test_codex_stream_forces_chat_completions_wire_api(self) -> None:
    """Expect ``chat.completions`` when ``stream`` is requested for codex.

    The temp ``.codex/config.toml`` declares ``wire_api = "responses"``.
    """
    config_text = (
        'model_provider = "ipads"\n'
        "\n"
        "[model_providers.ipads]\n"
        'base_url = "http://codex.example/v1"\n'
        'wire_api = "responses"'
    )
    with tempfile.TemporaryDirectory() as tmp:
        home = Path(tmp)
        codex_dir = home / ".codex"
        codex_dir.mkdir(parents=True)
        (codex_dir / "config.toml").write_text(config_text, encoding="utf-8")
        endpoint_doc = {"provider": "codex", "model": "gpt-5.4", "stream": True}
        with mock.patch.dict(os.environ, {"HOME": str(home)}, clear=True):
            endpoint = LLMEndpointSpec.from_dict(endpoint_doc)
            self.assertTrue(endpoint.stream)
            self.assertEqual(endpoint.wire_api, "chat.completions")
def test_endpoint_stream_flag(self) -> None:
    """Assert ``stream: True`` on a custom endpoint dict is reflected on the spec."""
    endpoint_doc = {
        "provider": "custom",
        "base_url": "http://example/v1",
        "wire_api": "chat.completions",
        "stream": True,
        "model": "x",
        "api_key_env": "OPENAI_API_KEY",
    }
    endpoint = LLMEndpointSpec.from_dict(endpoint_doc)
    self.assertTrue(endpoint.stream)
def test_extract_response_text_supports_responses_api_output(self) -> None:
    """Extract the ``output_text`` part from an ``output``/``message`` payload."""
    text_part = {"type": "output_text", "text": '{"diagnosis":"ok"}'}
    message = {"type": "message", "content": [text_part]}
    extracted = _extract_response_text({"output": [message]})
    self.assertEqual(extracted, '{"diagnosis":"ok"}')
def test_auth_headers_load_bailian_key_from_dotenv(self) -> None:
    """Expect the bearer token to come from a ``.env`` file in the patched cwd.

    The environment is emptied for the call, so the key is not available
    from ``os.environ``.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        dotenv_path = workdir / ".env"
        dotenv_path.write_text('DASHSCOPE_API_KEY="dash-key"\n', encoding="utf-8")
        with mock.patch.dict(os.environ, {}, clear=True):
            with mock.patch("pathlib.Path.cwd", return_value=workdir):
                headers = _auth_headers("DASHSCOPE_API_KEY", "bailian")
        self.assertEqual(headers["Authorization"], "Bearer dash-key")
def test_auth_headers_load_codex_auth_and_proxy(self) -> None:
    """Check codex auth headers and the proxy variables set in ``os.environ``.

    HOME points at a temp directory holding ``.codex/config.toml`` with a
    ``[network]`` proxy section and ``.codex/auth.json`` with the API key.
    """
    proxy_config = (
        "[network]\n"
        'http_proxy = "http://proxy.example:3128"\n'
        'https_proxy = "http://proxy.example:3128"'
    )
    with tempfile.TemporaryDirectory() as tmp:
        home = Path(tmp)
        codex_dir = home / ".codex"
        codex_dir.mkdir(parents=True)
        (codex_dir / "config.toml").write_text(proxy_config, encoding="utf-8")
        (codex_dir / "auth.json").write_text(
            json.dumps({"OPENAI_API_KEY": "sk-codex-test"}),
            encoding="utf-8",
        )
        with mock.patch.dict(os.environ, {"HOME": str(home)}, clear=True):
            with mock.patch("pathlib.Path.cwd", return_value=home):
                headers = _auth_headers("OPENAI_API_KEY", "codex")
            # patch.dict restores os.environ on exit, so inspect it while still patched.
            for proxy_var in ("http_proxy", "HTTP_PROXY"):
                self.assertEqual(os.environ[proxy_var], "http://proxy.example:3128")
            self.assertEqual(headers["Authorization"], "Bearer sk-codex-test")
def test_prompt_includes_failed_trial_context(self) -> None:
    """Assert a failed trial's status, failure reason and env patch reach the prompt."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study_path = _write_study_assets(workdir)
        study = load_study_spec(study_path)
        window, requests = load_trace_requests(study, study_spec_path=study_path)
        failed_trial = TrialSummary(
            trial_id="trial-0001",
            status="failed",
            diagnosis="flashinfer looked promising",
            config_patch={
                "env_patch": {"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
                "flag_patch": {"tensor-parallel-size": 4},
            },
            failure_reason="engine_process_exited_before_ready exit_code=1",
        )
        prompt = build_prompt(
            study=study,
            window_summary=summarize_window(requests, window),
            state=StudyState(study_id=study.study_id, trials=[failed_trial]),
            capability_profile=None,
        )
        expected_fragments = (
            '"status": "failed"',
            '"failure_reason": "engine_process_exited_before_ready exit_code=1"',
            '"VLLM_ATTENTION_BACKEND": "FLASHINFER"',
            "Known launch failures:",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, prompt)
def test_prompt_includes_failure_stage_for_launch_failures(self) -> None:
    """Assert the prompt carries the failure stage and an implicated-flag-keys entry."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        study_path = _write_study_assets(workdir)
        study = load_study_spec(study_path)
        window, requests = load_trace_requests(study, study_spec_path=study_path)
        launch_failure = TrialSummary(
            trial_id="trial-0002",
            status="failed",
            diagnosis="bad topology",
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 3,
                    "data-parallel-size": 3,
                },
            },
            failure_stage="engine_launch",
            failure_reason="engine_process_exited_before_ready exit_code=1",
        )
        prompt = build_prompt(
            study=study,
            window_summary=summarize_window(requests, window),
            state=StudyState(study_id=study.study_id, trials=[launch_failure]),
            capability_profile=None,
        )
        for fragment in ('"failure_stage": "engine_launch"', '"implicated_flag_keys"'):
            self.assertIn(fragment, prompt)
def test_prompt_prioritizes_parallel_space_when_tp_dp_ep_are_tunable(self) -> None:
    # When TP/DP/EP sizes are tunable flags and topology constraints are
    # configured, the prompt should steer towards parallel-space exploration
    # and list concrete candidates (a tensor_parallel_size of 2 among them).
    parallel_sizes = [1, 2, 4, 8]
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": True,
            "tensor-parallel-size": 4,
            "data-parallel-size": 2,
            "expert-parallel-size": 8,
        },
        "tunable_envs": [],
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
            "max-num-seqs",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": True,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            # Fresh list per key so the three entries never alias each other.
            "allowed_tensor_parallel_sizes": list(parallel_sizes),
            "allowed_data_parallel_sizes": list(parallel_sizes),
            "allowed_expert_parallel_sizes": list(parallel_sizes),
        },
    }
    with tempfile.TemporaryDirectory() as workdir:
        spec_path = _write_study_assets(
            Path(workdir),
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(spec_path)
        window, requests = load_trace_requests(study, study_spec_path=spec_path)
        summary = summarize_window(requests, window)
        prompt = build_prompt(
            study=study,
            window_summary=summary,
            state=StudyState(study_id=study.study_id),
            capability_profile=None,
        )
        for fragment in (
            "Prioritize exploring legal topology changes in parallel space",
            "Parallel space candidates:",
            '"tensor_parallel_size": 2',
        ):
            self.assertIn(fragment, prompt)
def test_parse_proposal_text_repairs_truncated_json(self) -> None:
    # The reply below never closes its top-level JSON object; the parser is
    # expected to still return a proposal carrying the diagnosis and flag
    # patch that were present before the cut-off.
    truncated_reply = """
{
"observation": "obs",
"diagnosis": "diag",
"config_patch": {
"env_patch": {},
"flag_patch": {
"max-num-seqs": 24
}
},
"expected_effects": [
"faster batching"
],
"why_not_previous_failures": "none"
"""
    with tempfile.TemporaryDirectory() as workdir:
        spec_path = _write_study_assets(Path(workdir))
        study = load_study_spec(spec_path)
        proposal = parse_proposal_text(truncated_reply, study)
        self.assertEqual(proposal.diagnosis, "diag")
        flag_patch = proposal.config_patch.flag_patch
        self.assertEqual(flag_patch["max-num-seqs"], 24)
def test_length_only_trace_rows_are_synthesized(self) -> None:
    # A trace row that carries only lengths (no prompt or messages) must still
    # produce one request: the message content contains "token" exactly
    # 8 times (equal to synthetic_prompt_cap_tokens, not input_length), and
    # min_tokens/max_tokens are both pinned to the row's output_length.
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        windows_root = root / "trace_windows"
        trace_dir = windows_root / "traces"
        trace_dir.mkdir(parents=True)

        length_only_row = {
            "timestamp": 0.0,
            "sampling_u": 0.1,
            "input_length": 32,
            "output_length": 16,
        }
        (trace_dir / "chat_len_only.jsonl").write_text(
            json.dumps(length_only_row) + "\n",
            encoding="utf-8",
        )

        windows_path = windows_root / "windows.json"
        windows_payload = {
            "windows": [
                {
                    "window_id": "w1",
                    "trace_type": "chat",
                    "trace_file": "traces/chat_len_only.jsonl",
                    "window_start": 0.0,
                    "window_end": 10.0,
                }
            ]
        }
        windows_path.write_text(json.dumps(windows_payload), encoding="utf-8")

        study_payload = {
            "study_id": "study-len-only",
            "hardware": {"gpu_count": 1},
            "model": {
                "model_id": "m1",
                "served_model_name": "dummy-model",
            },
            "engine": {
                "engine_name": "vllm",
                "exec_path": "/usr/local/bin/vllm",
                "host": "127.0.0.1",
                "port": 8000,
                "ready_timeout_s": 10,
                "request_timeout_s": 10,
                "healthcheck_path": "/v1/models",
                "launch_args": [],
                "base_envs": {},
                "base_flags": {},
                "tunable_envs": [],
                "tunable_flags": [],
            },
            "trace": {
                "windows_path": str(windows_path),
                "window_id": "w1",
                "max_concurrency": 1,
                "synthetic_prompt_cap_tokens": 8,
            },
            "slo": {"target_pass_rate": 0.95},
            "search": {
                "low": 0.0,
                "high": 1.0,
                "tolerance": 0.1,
                "max_probes": 2,
                "sample_seed": 1,
            },
            "llm": {"system_prompt": "", "max_history_trials": 1},
        }
        study_path = root / "study.json"
        study_path.write_text(json.dumps(study_payload), encoding="utf-8")

        study = load_study_spec(study_path)
        _, requests = load_trace_requests(study, study_spec_path=study_path)

        self.assertEqual(len(requests), 1)
        body = requests[0].body
        message = body["messages"][0]["content"]
        self.assertEqual(message.count("token"), 8)
        self.assertEqual(requests[0].body["min_tokens"], 16)
        self.assertEqual(requests[0].body["max_tokens"], 16)
def test_slo_evaluation_step_and_fixed_rules(self) -> None:
    # Two successful requests are scored against the default study SLO:
    # r1 is expected to pass and r2 (higher TTFT, longer prompt) to fail,
    # which yields a pass rate of exactly one half.
    samples = (
        # (request_id, ttft_ms, prompt_tokens)
        ("r1", 1000, 1000),
        ("r2", 6000, 5000),
    )
    with tempfile.TemporaryDirectory() as workdir:
        study = load_study_spec(_write_study_assets(Path(workdir)))
        outcomes = [
            RequestOutcome(
                request_id=request_id,
                success=True,
                ttft_ms=ttft_ms,
                tpot_ms=100,
                prompt_tokens=prompt_tokens,
                completion_tokens=16,
            )
            for request_id, ttft_ms, prompt_tokens in samples
        ]
        evaluations, summary = summarize_evaluations(outcomes, study.slo)
        first, second = evaluations[0], evaluations[1]
        self.assertTrue(first.passed)
        self.assertFalse(second.passed)
        self.assertEqual(summary["slo_pass_rate"], 0.5)
def test_trace_completion_tokens_override_forces_min_and_max_tokens(self) -> None:
    # With completion_tokens_override=1 every loaded request should carry a
    # completion hint of 1; the first and last request bodies are checked
    # for min_tokens == max_tokens == 1.
    with tempfile.TemporaryDirectory() as workdir:
        spec_path = _write_study_assets(
            Path(workdir),
            trace_overrides={"completion_tokens_override": 1},
        )
        study = load_study_spec(spec_path)
        _, requests = load_trace_requests(study, study_spec_path=spec_path)

        self.assertEqual(len(requests), 3)
        for index in (0, 1, 2):
            self.assertEqual(requests[index].completion_tokens_hint, 1)
        for index in (0, 2):
            body = requests[index].body
            self.assertEqual(body["min_tokens"], 1)
            self.assertEqual(body["max_tokens"], 1)
def test_run_one_request_fails_fixed_length_completion_mismatch(self) -> None:
    # The request hints at 2 completion tokens but the stubbed stream reports
    # only 1; the outcome must be marked unsuccessful with a mismatch error
    # while still recording the actual token count.
    request = TraceRequest(
        row_id="r1",
        arrival_s=0.0,
        sampling_u=0.1,
        body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
        prompt_tokens_hint=8,
        completion_tokens_hint=2,
    )
    short_stream = StreamMetrics(
        ttft_ms=10.0,
        tpot_ms=5.0,
        completion_tokens=1,
    )
    stream_patch = mock.patch(
        "aituner.worker.stream_chat_completion",
        return_value=short_stream,
    )
    with stream_patch:
        outcome = _run_one_request(
            request,
            base_url="http://127.0.0.1:8000",
            timeout_s=1.0,
        )
    self.assertFalse(outcome.success)
    self.assertEqual(outcome.error, "completion_tokens_mismatch expected=2 actual=1")
    self.assertEqual(outcome.completion_tokens, 1)
def test_build_prompt_mentions_completion_tokens_override(self) -> None:
    # A study configured with completion_tokens_override=1 (and no TPOT rule)
    # should mention both the raw setting and the resulting
    # "min_tokens=max_tokens=1" behaviour in its prompt.
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(
            root,
            trace_overrides={"completion_tokens_override": 1},
            slo_overrides={"tpot_rule": None},
        )
        study = load_study_spec(spec_path)

        store = StudyStore(root / ".aituner")
        store.init_study(spec_path=spec_path, study=study)
        state = store.load_state(study.study_id)

        window, requests = load_trace_requests(study, study_spec_path=spec_path)
        summary = summarize_window(requests, window)
        prompt = build_prompt(
            study=study,
            window_summary=summary,
            state=state,
            capability_profile=None,
        )
        self.assertIn('"completion_tokens_override": 1', prompt)
        self.assertIn("min_tokens=max_tokens=1", prompt)
def test_slo_evaluation_supports_tpot_only_95_percent_target(self) -> None:
    # With the TTFT rule removed and a fixed 20 ms TPOT threshold, only TPOT
    # should decide pass/fail: r1 (10 ms) passes, r2 (21 ms) fails, and the
    # resulting 0.5 pass rate is reported as not feasible.
    slo_overrides = {
        "ttft_rule": None,
        "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
    }
    samples = (
        # (request_id, ttft_ms, tpot_ms, prompt_tokens)
        ("r1", 3000, 10, 1000),
        ("r2", 9000, 21, 5000),
    )
    with tempfile.TemporaryDirectory() as workdir:
        spec_path = _write_study_assets(Path(workdir), slo_overrides=slo_overrides)
        study = load_study_spec(spec_path)
        outcomes = [
            RequestOutcome(
                request_id=request_id,
                success=True,
                ttft_ms=ttft_ms,
                tpot_ms=tpot_ms,
                prompt_tokens=prompt_tokens,
                completion_tokens=16,
            )
            for request_id, ttft_ms, tpot_ms, prompt_tokens in samples
        ]
        evaluations, summary = summarize_evaluations(outcomes, study.slo)
        verdicts = [item.passed for item in evaluations]
        self.assertEqual(verdicts, [True, False])
        self.assertEqual(summary["slo_pass_rate"], 0.5)
        self.assertFalse(summary["feasible"])
def test_build_launch_recipe_serializes_list_flags_once(self) -> None:
    # A list-valued flag must appear once in argv, immediately followed by
    # each list element rendered as its own string argument.
    flag = "--cuda-graph-sizes"
    with tempfile.TemporaryDirectory() as workdir:
        study = load_study_spec(_write_study_assets(Path(workdir)))
        patch = ConfigPatch(
            flag_patch={
                "cuda-graph-sizes": [1, 2, 4],
            }
        )
        recipe = build_launch_recipe(study.engine, patch)
        argv = recipe.argv
        self.assertIn(flag, argv)
        position = argv.index(flag)
        values = argv[position + 1 : position + 4]
        self.assertEqual(values, ["1", "2", "4"])
        self.assertEqual(argv.count(flag), 1)
def test_prepare_trace_windows_materializes_repo_local_assets(self) -> None:
    # Runs scripts/prepare_trace_windows.py against a synthetic legacy source
    # and checks the materialized window "chat_w20260311_1000": it must hold
    # exactly one request whose prompt comes from the matching *_prompt.jsonl
    # row, with timestamp 5.0, output_length 7 and a float sampling_u.
    #
    # The script is launched with sys.executable rather than a bare "python3"
    # so it runs under the same interpreter (and virtualenv) as the test
    # suite and does not depend on a "python3" binary being on PATH.
    import sys

    legacy_stems = [
        "qwen_chat_blksz_64_031109-031111",
        "qwen_chat_blksz_64_031121-031123",
        "qwen_chat_blksz_64_031209-031211",
        "qwen_chat_blksz_64_031221-031223",
        "qwen_chat_blksz_64_031309-031311",
        "qwen_chat_blksz_64_031321-031323",
        "qwen_chat_blksz_64_031409-031411",
        "qwen_chat_blksz_64_031421-031423",
        "qwen_chat_blksz_64_031509-031511",
        "qwen_chat_blksz_64_031521-031523",
        "qwen_chat_blksz_64_031609-031611",
        "qwen_chat_blksz_64_031621-031623",
        "qwen_chat_blksz_64_031709-031711",
        "qwen_chat_blksz_64_031721-031723",
    ]
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        legacy_source = tmp_path / "legacy"
        thinking_source = tmp_path / "thinking"
        legacy_source.mkdir()
        thinking_source.mkdir()

        # Every expected legacy file exists but is empty by default ...
        for filename in legacy_stems:
            for suffix in [".jsonl", "_prompt.jsonl"]:
                path = legacy_source / f"{filename}{suffix}"
                path.write_text("", encoding="utf-8")

        # ... except the first pair, which gets two trace rows and two
        # prompt rows. Only the second row (c2) is expected to survive.
        peak_trace = legacy_source / "qwen_chat_blksz_64_031109-031111.jsonl"
        peak_prompt = legacy_source / "qwen_chat_blksz_64_031109-031111_prompt.jsonl"
        peak_trace.write_text(
            "\n".join(
                [
                    json.dumps(
                        {
                            "chat_id": "c1",
                            "turn": 1,
                            "timestamp": 3599.0,
                            "input_length": 10,
                            "output_length": 3,
                        }
                    ),
                    json.dumps(
                        {
                            "chat_id": "c2",
                            "turn": 2,
                            "timestamp": 3605.0,
                            "input_length": 20,
                            "output_length": 7,
                        }
                    ),
                ]
            )
            + "\n",
            encoding="utf-8",
        )
        peak_prompt.write_text(
            "\n".join(
                [
                    json.dumps({"chat_id": "c1", "turn": 1, "prompt": "ignore me"}),
                    json.dumps({"chat_id": "c2", "turn": 2, "prompt": "real prompt"}),
                ]
            )
            + "\n",
            encoding="utf-8",
        )

        output_root = tmp_path / "trace_windows"
        subprocess.run(
            [
                sys.executable,
                "scripts/prepare_trace_windows.py",
                "--legacy-source",
                str(legacy_source),
                "--thinking-source",
                str(thinking_source),
                "--output-root",
                str(output_root),
                "--workloads",
                "chat",
                "--overwrite",
            ],
            check=True,
            cwd=str(REPO_ROOT),
        )

        windows_payload = json.loads((output_root / "windows.json").read_text(encoding="utf-8"))
        windows = {item["window_id"]: item for item in windows_payload["windows"]}
        self.assertIn("chat_w20260311_1000", windows)
        self.assertEqual(windows["chat_w20260311_1000"]["num_requests"], 1)

        trace_path = output_root / windows["chat_w20260311_1000"]["trace_file"]
        rows = [json.loads(line) for line in trace_path.read_text(encoding="utf-8").splitlines()]
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0]["prompt"], "real prompt")
        self.assertEqual(rows[0]["timestamp"], 5.0)
        self.assertEqual(rows[0]["output_length"], 7)
        self.assertIsInstance(rows[0]["sampling_u"], float)
def test_prepare_trace_windows_preserves_existing_files_on_failure(self) -> None:
    # Runs scripts/prepare_trace_windows.py with only two legacy trace stems
    # present and expects a non-zero exit. A pre-existing output file must be
    # left untouched and no "*.tmp.*" leftovers may remain in the traces dir.
    #
    # The script is launched with sys.executable rather than a bare "python3":
    # a missing "python3" on PATH would raise FileNotFoundError here instead
    # of producing the non-zero return code this test is actually about.
    import sys

    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        legacy_source = tmp_path / "legacy"
        thinking_source = tmp_path / "thinking"
        output_root = tmp_path / "trace_windows"
        traces_dir = output_root / "traces"
        legacy_source.mkdir()
        thinking_source.mkdir()
        traces_dir.mkdir(parents=True)

        # The same single row is written to both the trace and prompt file
        # of each stem.
        row_text = (
            json.dumps(
                {
                    "chat_id": "c1",
                    "turn": 1,
                    "timestamp": 3605.0,
                    "input_length": 20,
                    "output_length": 7,
                    "prompt": "prompt",
                }
            )
            + "\n"
        )
        for filename in [
            "qwen_chat_blksz_64_031109-031111",
            "qwen_chat_blksz_64_031121-031123",
        ]:
            for suffix in [".jsonl", "_prompt.jsonl"]:
                path = legacy_source / f"{filename}{suffix}"
                path.write_text(row_text, encoding="utf-8")

        # Existing output that a failed run must not clobber.
        sentinel = traces_dir / "chat_w20260311_1000.jsonl"
        sentinel.write_text("sentinel\n", encoding="utf-8")

        proc = subprocess.run(
            [
                sys.executable,
                "scripts/prepare_trace_windows.py",
                "--legacy-source",
                str(legacy_source),
                "--thinking-source",
                str(thinking_source),
                "--output-root",
                str(output_root),
                "--workloads",
                "chat",
                "--overwrite",
            ],
            cwd=str(REPO_ROOT),
            capture_output=True,
            text=True,
        )
        self.assertNotEqual(proc.returncode, 0)
        self.assertEqual(sentinel.read_text(encoding="utf-8"), "sentinel\n")
        self.assertEqual(sorted(path.name for path in traces_dir.glob("*.tmp.*")), [])
def test_binary_search_max_feasible(self) -> None:
    # With thresholds feasible up to and including 0.625, the search must
    # report a best threshold inside [0.5, 0.625] and keep a feasible payload.
    def probe(threshold):
        return ThresholdProbe(
            threshold=threshold,
            feasible=threshold <= 0.625,
            payload={"threshold": threshold},
        )

    result = binary_search_max_feasible(
        low=0.0,
        high=1.0,
        tolerance=0.01,
        max_probes=8,
        evaluator=probe,
    )
    best = result.best_threshold
    self.assertLessEqual(best, 0.625)
    self.assertGreaterEqual(best, 0.5)
    self.assertIsNotNone(result.best_feasible_payload)
def test_binary_search_continues_below_tolerance_when_all_infeasible(self) -> None:
    # When every probe is infeasible the search should spend all 6 probes,
    # halving the threshold each time even once the step is smaller than the
    # 0.1 tolerance, and finish without a feasible payload.
    probed = []

    def always_infeasible(threshold):
        probed.append(threshold)
        return ThresholdProbe(
            threshold=threshold,
            feasible=False,
            payload={"threshold": threshold},
        )

    result = binary_search_max_feasible(
        low=0.0,
        high=1.0,
        tolerance=0.1,
        max_probes=6,
        evaluator=always_infeasible,
    )
    self.assertIsNone(result.best_feasible_payload)
    self.assertEqual(len(result.probes), 6)
    expected_thresholds = [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625]
    self.assertEqual(probed, expected_thresholds)
def test_trace_max_requests_uses_window_wide_downsample(self) -> None:
    # Ten requests r0..r9 are written, one per second; with
    # max_requests_per_probe=4 the loader is expected to keep r0, r2, r5 and
    # r7, i.e. picks spread over the whole window rather than the first four.
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        windows_root = root / "trace_windows"
        trace_dir = windows_root / "traces"
        trace_dir.mkdir(parents=True)

        trace_lines = [
            json.dumps(
                {
                    "request_id": f"r{idx}",
                    "timestamp": float(idx),
                    "sampling_u": idx / 10.0,
                    "messages": [{"role": "user", "content": f"hello-{idx}"}],
                    "input_length": 10 + idx,
                    "output_length": 5,
                }
            )
            + "\n"
            for idx in range(10)
        ]
        with (trace_dir / "chat_many.jsonl").open("w", encoding="utf-8") as handle:
            for line in trace_lines:
                handle.write(line)

        windows_path = windows_root / "windows.json"
        windows_payload = {
            "windows": [
                {
                    "window_id": "w1",
                    "trace_type": "chat",
                    "trace_file": "traces/chat_many.jsonl",
                    "window_start": 0.0,
                    "window_end": 10.0,
                }
            ]
        }
        windows_path.write_text(json.dumps(windows_payload), encoding="utf-8")

        study_payload = {
            "study_id": "study-downsample",
            "hardware": {"gpu_count": 1},
            "model": {"model_id": "m1", "served_model_name": "dummy-model"},
            "engine": {
                "engine_name": "vllm",
                "exec_path": "/usr/local/bin/vllm",
                "host": "127.0.0.1",
                "port": 8000,
                "ready_timeout_s": 10,
                "request_timeout_s": 10,
                "healthcheck_path": "/v1/models",
                "launch_args": [],
                "base_envs": {},
                "base_flags": {},
                "tunable_envs": [],
                "tunable_flags": [],
            },
            "trace": {
                "windows_path": str(windows_path),
                "window_id": "w1",
                "max_concurrency": 1,
                "max_requests_per_probe": 4,
            },
            "slo": {"target_pass_rate": 0.95},
            "search": {
                "low": 0.0,
                "high": 1.0,
                "tolerance": 0.1,
                "max_probes": 2,
                "sample_seed": 1,
            },
            "llm": {"system_prompt": "", "max_history_trials": 1},
        }
        study_path = root / "study.json"
        study_path.write_text(json.dumps(study_payload), encoding="utf-8")

        study = load_study_spec(study_path)
        _, requests = load_trace_requests(study, study_spec_path=study_path)
        kept_ids = [item.row_id for item in requests]
        self.assertEqual(kept_ids, ["r0", "r2", "r5", "r7"])
def test_trace_replay_time_scale_scales_arrivals_and_window(self) -> None:
    # With replay_time_scale=0.1 both the window end (100.0 -> 10.0) and the
    # single request's arrival time (10.0 -> 1.0) must be scaled.
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        windows_root = root / "trace_windows"
        trace_dir = windows_root / "traces"
        trace_dir.mkdir(parents=True)

        trace_row = {
            "request_id": "r1",
            "timestamp": 10.0,
            "sampling_u": 0.25,
            "messages": [{"role": "user", "content": "hello"}],
            "input_length": 16,
            "output_length": 4,
        }
        trace_path = trace_dir / "chat_scale.jsonl"
        trace_path.write_text(json.dumps(trace_row) + "\n", encoding="utf-8")

        windows_path = windows_root / "windows.json"
        windows_payload = {
            "windows": [
                {
                    "window_id": "w1",
                    "trace_type": "chat",
                    "trace_file": "traces/chat_scale.jsonl",
                    "window_start": 0.0,
                    "window_end": 100.0,
                }
            ]
        }
        windows_path.write_text(json.dumps(windows_payload), encoding="utf-8")

        study_payload = {
            "study_id": "study-scale",
            "hardware": {"gpu_count": 1},
            "model": {"model_id": "m1", "served_model_name": "dummy-model"},
            "engine": {
                "engine_name": "vllm",
                "exec_path": "/usr/local/bin/vllm",
                "host": "127.0.0.1",
                "port": 8000,
                "ready_timeout_s": 10,
                "request_timeout_s": 10,
                "healthcheck_path": "/v1/models",
                "launch_args": [],
                "base_envs": {},
                "base_flags": {},
                "tunable_envs": [],
                "tunable_flags": [],
            },
            "trace": {
                "windows_path": str(windows_path),
                "window_id": "w1",
                "max_concurrency": 1,
                "replay_time_scale": 0.1,
            },
            "slo": {"target_pass_rate": 0.95},
            "search": {
                "low": 0.0,
                "high": 1.0,
                "tolerance": 0.1,
                "max_probes": 2,
                "sample_seed": 1,
            },
            "llm": {"system_prompt": "", "max_history_trials": 1},
        }
        study_path = root / "study.json"
        study_path.write_text(json.dumps(study_payload), encoding="utf-8")

        study = load_study_spec(study_path)
        window, requests = load_trace_requests(study, study_spec_path=study_path)
        self.assertEqual(window.window_end, 10.0)
        self.assertEqual(requests[0].arrival_s, 1.0)
def test_proposal_validation_and_job_emission(self) -> None:
    # End-to-end path from an LLM-style JSON proposal to a rendered job
    # entry: parse the proposal, materialize a trial, build its job and
    # append it to jobs.toml, then check name, command prefix and PYTHONPATH.
    proposal_payload = {
        "observation": "Current TTFT fails before TPOT.",
        "diagnosis": "Prefill pressure dominates.",
        "config_patch": {
            "env_patch": {"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
            "flag_patch": {"tensor-parallel-size": 4, "max-num-seqs": 64},
        },
        "expected_effects": ["lower TTFT", "raise feasible sampling_u"],
        "why_not_previous_failures": "Avoids changing unsupported envs.",
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        state = store.load_state(study.study_id)

        proposal = parse_proposal_text(json.dumps(proposal_payload), study)
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
        job = build_trial_job(study=study, trial=trial, repo_root=root)

        jobs_path = root / "jobs.toml"
        append_job(jobs_path, job)
        rendered = jobs_path.read_text(encoding="utf-8")

        for fragment in (
            'name = "study-1-trial-0001"',
            'command = "python3 -m aituner.cli worker run-trial',
            'PYTHONPATH = "src"',
        ):
            self.assertIn(fragment, rendered)
def test_ingest_trial_results_updates_best(self) -> None:
    # After a completed result file is written for a TP=4 trial, ingesting
    # results should promote it to the study's best trial and derive the
    # per-GPU rate (12.5 / 4 = 3.125), both globally and for parallel size "4".
    proposal = Proposal.from_dict(
        {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
            "expected_effects": ["raise rate"],
        }
    )
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        state = store.load_state(study.study_id)
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)

        result_payload = {
            "study_id": study.study_id,
            "trial_id": trial.trial_id,
            "status": "completed",
            "best_sampling_u": 0.75,
            "best_request_rate": 12.5,
            "best_pass_rate": 0.97,
        }
        Path(trial.result_path).write_text(json.dumps(result_payload), encoding="utf-8")

        next_state = store.ingest_trial_results(study.study_id)
        self.assertEqual(next_state.best_trial_id, trial.trial_id)
        self.assertEqual(next_state.best_sampling_u, 0.75)
        self.assertEqual(next_state.best_request_rate, 12.5)
        self.assertEqual(next_state.best_parallel_size, 4)
        self.assertEqual(next_state.best_request_rate_per_gpu, 3.125)
        group_best = next_state.best_by_parallel_size["4"]
        self.assertEqual(group_best["best_request_rate_per_gpu"], 3.125)
def test_run_trial_persists_probe_request_details(self) -> None:
    # With the engine process and request replay stubbed out and the search
    # limited to a single probe, run_trial must write one row to
    # probe_details.jsonl containing the probe threshold and per-request
    # outcome details.
    def fake_replay(requests, **kwargs):
        outcomes = [
            RequestOutcome(
                request_id=request.row_id,
                success=True,
                ttft_ms=10.0,
                tpot_ms=5.0,
                prompt_tokens=request.prompt_tokens_hint,
                completion_tokens=request.completion_tokens_hint,
            )
            for request in requests
        ]
        return (outcomes, False, "")

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        # Rewrite the generated spec so only one probe is executed.
        spec_payload = json.loads(spec_path.read_text(encoding="utf-8"))
        spec_payload["search"]["max_probes"] = 1
        spec_path.write_text(json.dumps(spec_payload), encoding="utf-8")

        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        state = store.load_state(study.study_id)
        proposal = Proposal.from_dict(
            {
                "observation": "baseline",
                "diagnosis": "baseline",
                "config_patch": {"env_patch": {}, "flag_patch": {}},
                "expected_effects": ["measure"],
            }
        )
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
        artifact_dir = Path(trial.artifact_dir)

        process = mock.Mock()
        process.poll.return_value = 0
        patchers = (
            mock.patch("aituner.worker.subprocess.Popen", return_value=process),
            mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None),
            mock.patch("aituner.worker._terminate_process_tree", return_value=None),
            mock.patch("aituner.worker._replay_requests", side_effect=fake_replay),
        )
        with contextlib.ExitStack() as stack:
            for patcher in patchers:
                stack.enter_context(patcher)
            result = run_trial(artifact_dir / "trial_spec.json")

        self.assertEqual(result["status"], "completed")
        details_path = artifact_dir / "probe_details.jsonl"
        self.assertTrue(details_path.exists())
        detail_text = details_path.read_text(encoding="utf-8")
        rows = [json.loads(line) for line in detail_text.splitlines()]
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0]["threshold"], 0.5)
        first_outcome = rows[0]["outcomes"][0]
        self.assertEqual(first_outcome["request_id"], "r1")
        self.assertEqual(first_outcome["sampling_u"], 0.1)
def test_run_trial_marks_full_trace_saturation_as_measurement_ceiling_insufficient(
    self,
) -> None:
    # Every replayed request passes, so the best probe covers all 3 requests;
    # the result must then flag the measurement ceiling as insufficient and
    # include an "auto_high_resolution" entry.
    def fake_replay(requests, **kwargs):
        outcomes = [
            RequestOutcome(
                request_id=request.row_id,
                success=True,
                ttft_ms=10.0,
                tpot_ms=5.0,
                prompt_tokens=request.prompt_tokens_hint,
                completion_tokens=request.completion_tokens_hint,
            )
            for request in requests
        ]
        return (outcomes, False, "")

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        state = store.load_state(study.study_id)
        proposal = Proposal.from_dict(
            {
                "observation": "baseline",
                "diagnosis": "baseline",
                "config_patch": {"env_patch": {}, "flag_patch": {}},
                "expected_effects": ["measure"],
            }
        )
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)

        process = mock.Mock()
        process.poll.return_value = 0
        patchers = (
            mock.patch("aituner.worker.subprocess.Popen", return_value=process),
            mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None),
            mock.patch("aituner.worker._terminate_process_tree", return_value=None),
            mock.patch("aituner.worker._replay_requests", side_effect=fake_replay),
        )
        with contextlib.ExitStack() as stack:
            for patcher in patchers:
                stack.enter_context(patcher)
            result = run_trial(Path(trial.artifact_dir) / "trial_spec.json")

        self.assertEqual(result["status"], "completed")
        self.assertEqual(result["best_request_count"], 3)
        measurement = result["measurement"]
        self.assertTrue(measurement["measurement_ceiling_insufficient"])
        self.assertEqual(measurement["reason"], "measurement_ceiling_insufficient")
        self.assertIn("auto_high_resolution", result["measurement"])
def test_run_trial_falls_back_below_inherited_search_floor(self) -> None:
    # The trial spec on disk is edited to start searching at 0.5 even though
    # the study does not inherit the incumbent floor. The stubbed replay only
    # passes for probes of at most one request, so the primary range
    # [0.5, 1.0] finds nothing and the worker is expected to retry in
    # [0.0, 0.5] and report that result as "lower_range_fallback".
    def fake_replay(requests, **kwargs):
        passing = len(requests) <= 1
        ttft_ms = 10.0 if passing else 10000.0
        tpot_ms = 5.0 if passing else 1000.0
        outcomes = [
            RequestOutcome(
                request_id=request.row_id,
                success=True,
                ttft_ms=ttft_ms,
                tpot_ms=tpot_ms,
                prompt_tokens=request.prompt_tokens_hint,
                completion_tokens=request.completion_tokens_hint,
            )
            for request in requests
        ]
        return (outcomes, False, "")

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        spec_payload = json.loads(spec_path.read_text(encoding="utf-8"))
        spec_payload["search"]["max_probes"] = 2
        spec_path.write_text(json.dumps(spec_payload), encoding="utf-8")

        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        incumbent = {
            "trial_id": "trial-0001",
            "parallel_size": 1,
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_request_rate_per_gpu": 2.0,
        }
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=1,
            best_sampling_u=0.5,
            best_request_rate=2.0,
            best_request_rate_per_gpu=2.0,
            next_trial_index=2,
            best_by_parallel_size={"1": incumbent},
            trials=[],
        )
        proposal = Proposal.from_dict(
            {
                "observation": "runtime patch",
                "diagnosis": "measure even if worse than incumbent",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 2}},
                "expected_effects": ["measure"],
            }
        )
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
        self.assertEqual(trial.search.low, 0.0)

        # Force the search floor to 0.5 directly in the materialized spec.
        trial_spec_path = Path(trial.artifact_dir) / "trial_spec.json"
        trial_spec_payload = json.loads(trial_spec_path.read_text(encoding="utf-8"))
        trial_spec_payload["search"]["low"] = 0.5
        trial_spec_path.write_text(json.dumps(trial_spec_payload), encoding="utf-8")

        process = mock.Mock()
        process.poll.return_value = 0
        patchers = (
            mock.patch("aituner.worker.subprocess.Popen", return_value=process),
            mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None),
            mock.patch("aituner.worker._terminate_process_tree", return_value=None),
            mock.patch("aituner.worker._replay_requests", side_effect=fake_replay),
        )
        with contextlib.ExitStack() as stack:
            for patcher in patchers:
                stack.enter_context(patcher)
            result = run_trial(trial_spec_path)

        self.assertEqual(result["status"], "completed")
        self.assertEqual(result["best_source"], "lower_range_fallback")
        self.assertEqual(result["best_sampling_u"], 0.375)
        self.assertEqual(result["best_request_rate"], 0.1)

        primary = result["primary_search"]
        self.assertEqual(primary["low"], 0.5)
        self.assertIsNone(primary["best_request_rate"])

        fallback = result["lower_range_fallback"]
        self.assertEqual(fallback["low"], 0.0)
        self.assertEqual(fallback["high"], 0.5)
        self.assertEqual(fallback["best_request_rate"], 0.1)

        primary_thresholds = [probe["threshold"] for probe in result["primary_search"]["probes"]]
        self.assertEqual(primary_thresholds, [0.75, 0.625])
        fallback_thresholds = [
            probe["threshold"] for probe in result["lower_range_fallback"]["probes"]
        ]
        self.assertEqual(fallback_thresholds, [0.25, 0.375])
def test_run_trial_skips_fallback_below_incumbent_floor(self) -> None:
    # With inherit_incumbent_floor enabled the trial starts at the
    # incumbent's sampling_u (0.5). Every replayed request is far too slow,
    # so the primary search finds nothing; the lower-range fallback must be
    # reported as skipped (not triggered, no probes) with the documented
    # reason, and diagnostics should point at the last probed threshold.
    def fake_replay(requests, **kwargs):
        outcomes = [
            RequestOutcome(
                request_id=request.row_id,
                success=True,
                ttft_ms=10000.0,
                tpot_ms=1000.0,
                prompt_tokens=request.prompt_tokens_hint,
                completion_tokens=request.completion_tokens_hint,
            )
            for request in requests
        ]
        return (outcomes, False, "")

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        spec_payload = json.loads(spec_path.read_text(encoding="utf-8"))
        spec_payload["search"]["max_probes"] = 2
        spec_payload["search"]["inherit_incumbent_floor"] = True
        spec_path.write_text(json.dumps(spec_payload), encoding="utf-8")

        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        incumbent = {
            "trial_id": "trial-0001",
            "parallel_size": 1,
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_request_rate_per_gpu": 2.0,
        }
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=1,
            best_sampling_u=0.5,
            best_request_rate=2.0,
            best_request_rate_per_gpu=2.0,
            next_trial_index=2,
            best_by_parallel_size={"1": incumbent},
            trials=[],
        )
        proposal = Proposal.from_dict(
            {
                "observation": "runtime patch",
                "diagnosis": "primary range all infeasible",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 2}},
                "expected_effects": ["measure"],
            }
        )
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
        self.assertEqual(trial.search.low, 0.5)
        self.assertTrue(trial.search.inherit_incumbent_floor)

        process = mock.Mock()
        process.poll.return_value = 0
        patchers = (
            mock.patch("aituner.worker.subprocess.Popen", return_value=process),
            mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None),
            mock.patch("aituner.worker._terminate_process_tree", return_value=None),
            mock.patch("aituner.worker._replay_requests", side_effect=fake_replay),
        )
        with contextlib.ExitStack() as stack:
            for patcher in patchers:
                stack.enter_context(patcher)
            result = run_trial(Path(trial.artifact_dir) / "trial_spec.json")

        self.assertEqual(result["status"], "completed")
        self.assertIsNone(result["best_request_rate"])
        self.assertEqual(result["best_source"], "primary_search")

        primary = result["primary_search"]
        self.assertEqual(primary["low"], 0.5)
        self.assertIsNone(primary["best_request_rate"])
        primary_thresholds = [probe["threshold"] for probe in result["primary_search"]["probes"]]
        self.assertEqual(primary_thresholds, [0.75, 0.625])

        fallback = result["lower_range_fallback"]
        self.assertEqual(fallback["triggered"], False)
        self.assertEqual(fallback["skipped"], True)
        self.assertEqual(fallback["probes"], [])
        self.assertEqual(
            fallback["reason"],
            "primary_search_above_incumbent_floor_all_infeasible",
        )
        self.assertEqual(result["all_infeasible_diagnostics"]["threshold"], 0.625)
def test_materialize_trial_does_not_mutate_input_state_trials(self) -> None:
    # materialize_trial must return a new state holding the new trial while
    # leaving the trials list of the state passed in untouched.
    baseline = Proposal.from_dict(
        {
            "observation": "baseline",
            "diagnosis": "baseline",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": ["measure"],
        }
    )
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        state = store.load_state(study.study_id)

        _, next_state = store.materialize_trial(study=study, state=state, proposal=baseline)
        self.assertEqual(state.trials, [])
        self.assertEqual(len(next_state.trials), 1)
def test_materialize_trial_uses_full_search_range_with_incumbent(self) -> None:
    # By default (no incumbent-floor inheritance configured in the spec) a
    # trial in the incumbent's own parallel group still searches from the
    # study's configured low bound up to 1.0.
    incumbent = {
        "trial_id": "trial-0001",
        "parallel_size": 4,
        "best_sampling_u": 0.375,
        "best_request_rate": 3.0,
        "best_request_rate_per_gpu": 0.75,
    }
    proposal = Proposal.from_dict(
        {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
            "expected_effects": ["raise rate"],
        }
    )
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=4,
            best_sampling_u=0.375,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.75,
            next_trial_index=2,
            best_by_parallel_size={"4": incumbent},
            trials=[],
        )
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
        search = trial.search
        self.assertEqual(search.low, study.search.low)
        self.assertEqual(search.high, 1.0)
def test_materialize_trial_uses_full_search_range_for_same_parallel_group(self) -> None:
    # The state records a previous best for parallel size 2; a new TP=2
    # proposal must nevertheless start from the study's configured low bound.
    group_best = {
        "trial_id": "trial-0000",
        "parallel_size": 2,
        "best_sampling_u": 0.125,
        "best_request_rate": 0.8,
        "best_request_rate_per_gpu": 0.4,
    }
    proposal = Proposal.from_dict(
        {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
            "expected_effects": ["raise rate"],
        }
    )
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=4,
            best_sampling_u=0.375,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.75,
            next_trial_index=2,
            best_by_parallel_size={"2": group_best},
            trials=[],
        )
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
        self.assertEqual(trial.search.low, study.search.low)
def test_materialize_trial_can_use_incumbent_floor_when_enabled(self) -> None:
    # When the spec sets search.inherit_incumbent_floor, a trial in the
    # incumbent's parallel group should start its search at the incumbent's
    # best sampling_u (0.375) and carry the flag through to its own search.
    incumbent = {
        "trial_id": "trial-0001",
        "parallel_size": 4,
        "best_sampling_u": 0.375,
        "best_request_rate": 3.0,
        "best_request_rate_per_gpu": 0.75,
    }
    proposal = Proposal.from_dict(
        {
            "observation": "Obs",
            "diagnosis": "Diag",
            "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
            "expected_effects": ["raise rate"],
        }
    )
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_path = _write_study_assets(root)
        # Opt in to floor inheritance by editing the generated spec in place.
        spec_payload = json.loads(spec_path.read_text(encoding="utf-8"))
        spec_payload["search"]["inherit_incumbent_floor"] = True
        spec_path.write_text(json.dumps(spec_payload), encoding="utf-8")

        study = load_study_spec(spec_path)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_path, study=study)
        state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=4,
            best_sampling_u=0.375,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.75,
            next_trial_index=2,
            best_by_parallel_size={"4": incumbent},
            trials=[],
        )
        trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
        search = trial.search
        self.assertEqual(search.low, 0.375)
        self.assertTrue(search.inherit_incumbent_floor)
def test_materialize_trial_resets_search_floor_for_new_parallel_group(self) -> None:
    # The state's incumbent has best_parallel_size=4 while the proposal sets
    # tensor-parallel-size to 2; the materialized trial must use the study's
    # configured search floor.
    proposal_payload = {
        "observation": "Obs",
        "diagnosis": "Diag",
        "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 2}},
        "expected_effects": ["raise rate"],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        study = load_study_spec(spec_file)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_file, study=study)
        incumbent_state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0001",
            best_parallel_size=4,
            best_sampling_u=0.4,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.75,
            next_trial_index=2,
            trials=[],
        )
        trial, _next_state = store.materialize_trial(
            study=study,
            state=incumbent_state,
            proposal=Proposal.from_dict(proposal_payload),
        )
        self.assertEqual(trial.search.low, study.search.low)
def test_materialize_trial_inherits_incumbent_topology_for_runtime_patch(self) -> None:
    # A proposal that only touches max-num-seqs is materialized against a state
    # whose best trial used TP=2 / DP=4; both the trial and the recorded summary
    # must carry those two topology flags alongside the runtime flag.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": True,
            "tensor-parallel-size": 4,
            "data-parallel-size": 2,
            "expert-parallel-size": 8,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
            "max-num-seqs",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": True,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_expert_parallel_sizes": [1, 2, 4, 8],
        },
    }
    runtime_only_proposal = {
        "observation": "Validate runtime headroom around the incumbent.",
        "diagnosis": "Try lower concurrency on the current best topology.",
        "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}},
        "expected_effects": ["validate incumbent runtime headroom"],
    }
    expected_flags = {
        "tensor-parallel-size": 2,
        "data-parallel-size": 4,
        "max-num-seqs": 160,
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root, engine_overrides=engine_overrides)
        study = load_study_spec(spec_file)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_file, study=study)

        incumbent_summary = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=8,
            best_sampling_u=0.125,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.375,
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "data-parallel-size": 4,
                    "expert-parallel-size": 8,
                },
            },
        )
        incumbent_state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_sampling_u=0.125,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.375,
            next_trial_index=3,
            best_by_parallel_size={
                "8": {
                    "trial_id": "trial-0002",
                    "parallel_size": 8,
                    "best_sampling_u": 0.125,
                    "best_request_rate": 3.0,
                    "best_request_rate_per_gpu": 0.375,
                }
            },
            trials=[incumbent_summary],
        )
        trial, next_state = store.materialize_trial(
            study=study,
            state=incumbent_state,
            proposal=Proposal.from_dict(runtime_only_proposal),
        )
        self.assertEqual(trial.config_patch.flag_patch, expected_flags)
        self.assertEqual(trial.search.low, study.search.low)
        latest_summary = next_state.trials[-1]
        self.assertEqual(latest_summary.config_patch["flag_patch"], expected_flags)
def test_materialize_trial_keeps_explicit_topology_runtime_patch(self) -> None:
    # The proposal names TP=4 / DP=2 explicitly even though the state's best
    # trial used TP=2 / DP=4; the materialized flag patch must equal the
    # proposal's flags unchanged.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": True,
            "tensor-parallel-size": 4,
            "data-parallel-size": 2,
            "expert-parallel-size": 8,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
            "max-num-seqs",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": True,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_expert_parallel_sizes": [1, 2, 4, 8],
        },
    }
    explicit_proposal = {
        "observation": "Validate base topology runtime.",
        "diagnosis": "Explicitly keep base topology and adjust concurrency.",
        "config_patch": {
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "max-num-seqs": 160,
            },
        },
        "expected_effects": ["test base topology runtime headroom"],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root, engine_overrides=engine_overrides)
        study = load_study_spec(spec_file)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_file, study=study)

        prior_best = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "data-parallel-size": 4,
                },
            },
        )
        prior_state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            next_trial_index=3,
            trials=[prior_best],
        )
        trial, _next_state = store.materialize_trial(
            study=study,
            state=prior_state,
            proposal=Proposal.from_dict(explicit_proposal),
        )
        # Compared against a fresh literal, not the proposal's own dict object.
        self.assertEqual(
            trial.config_patch.flag_patch,
            {
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "max-num-seqs": 160,
            },
        )
def test_ingest_trial_results_records_failure_reason(self) -> None:
    # A failed result file without a failure_stage key is ingested; the summary
    # must keep the status and failure_reason and report failure_stage as "".
    proposal_payload = {
        "observation": "Obs",
        "diagnosis": "Diag",
        "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
        "expected_effects": ["raise rate"],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        study = load_study_spec(spec_file)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_file, study=study)
        initial_state = store.load_state(study.study_id)
        trial, _next_state = store.materialize_trial(
            study=study,
            state=initial_state,
            proposal=Proposal.from_dict(proposal_payload),
        )

        failed_result = {
            "study_id": study.study_id,
            "trial_id": trial.trial_id,
            "status": "failed",
            "failure_reason": "engine_process_exited_before_ready exit_code=1",
            "probes": [],
        }
        Path(trial.result_path).write_text(json.dumps(failed_result), encoding="utf-8")

        ingested = store.ingest_trial_results(study.study_id)
        first_summary = ingested.trials[0]
        self.assertEqual(first_summary.status, "failed")
        self.assertEqual(
            first_summary.failure_reason,
            "engine_process_exited_before_ready exit_code=1",
        )
        self.assertEqual(first_summary.failure_stage, "")
def test_ingest_trial_results_records_failure_stage(self) -> None:
    # A failed result file that includes failure_stage is ingested; the stage
    # must be copied onto the trial summary.
    proposal_payload = {
        "observation": "Obs",
        "diagnosis": "Diag",
        "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
        "expected_effects": ["raise rate"],
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        study = load_study_spec(spec_file)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_file, study=study)
        initial_state = store.load_state(study.study_id)
        trial, _next_state = store.materialize_trial(
            study=study,
            state=initial_state,
            proposal=Proposal.from_dict(proposal_payload),
        )

        failed_result = {
            "study_id": study.study_id,
            "trial_id": trial.trial_id,
            "status": "failed",
            "failure_stage": "engine_launch",
            "failure_reason": "engine_process_exited_before_ready exit_code=1",
            "probes": [],
        }
        Path(trial.result_path).write_text(json.dumps(failed_result), encoding="utf-8")

        ingested = store.ingest_trial_results(study.study_id)
        self.assertEqual(ingested.trials[0].failure_stage, "engine_launch")
def test_ingest_trial_results_prefers_higher_request_rate_per_gpu(self) -> None:
    # Two completed trials are ingested: TP=4 at 4.0 req/s and TP=2 at 3.0 req/s.
    # The assertions expect the TP=2 trial to be selected as best (1.5 per GPU
    # versus 1.0) and both groups to be recorded in best_by_parallel_size.
    def tp_proposal(tp_size: int) -> Proposal:
        # Build the minimal proposal that only sets tensor-parallel-size.
        return Proposal.from_dict(
            {
                "observation": "Obs",
                "diagnosis": "Diag",
                "config_patch": {
                    "env_patch": {},
                    "flag_patch": {"tensor-parallel-size": tp_size},
                },
                "expected_effects": ["raise rate"],
            }
        )

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        study = load_study_spec(spec_file)
        store = StudyStore(root / ".aituner" / "studies")
        store.init_study(spec_path=spec_file, study=study)
        state = store.load_state(study.study_id)

        def write_completed(trial, sampling_u: float, request_rate: float) -> None:
            # Persist a completed result for ``trial`` at its result_path.
            completed = {
                "study_id": study.study_id,
                "trial_id": trial.trial_id,
                "status": "completed",
                "best_sampling_u": sampling_u,
                "best_request_rate": request_rate,
                "best_pass_rate": 0.97,
            }
            Path(trial.result_path).write_text(json.dumps(completed), encoding="utf-8")

        # First trial: TP=4; the returned state feeds the second materialization.
        trial_a, state = store.materialize_trial(
            study=study, state=state, proposal=tp_proposal(4)
        )
        write_completed(trial_a, 0.5, 4.0)

        # Second trial: TP=2, lower absolute rate.
        trial_b, _final_state = store.materialize_trial(
            study=study, state=state, proposal=tp_proposal(2)
        )
        write_completed(trial_b, 0.4, 3.0)

        ingested = store.ingest_trial_results(study.study_id)
        self.assertEqual(ingested.best_trial_id, trial_b.trial_id)
        self.assertEqual(ingested.best_parallel_size, 2)
        self.assertEqual(ingested.best_request_rate, 3.0)
        self.assertEqual(ingested.best_request_rate_per_gpu, 1.5)
        per_group = ingested.best_by_parallel_size
        self.assertEqual(per_group["4"]["best_request_rate_per_gpu"], 1.0)
        self.assertEqual(per_group["2"]["best_request_rate_per_gpu"], 1.5)
def test_validate_proposal_rejects_invalid_tp_dp_product(self) -> None:
    # With require_tp_dp_product_equals_gpu_count enabled, a TP=2 / DP=2 proposal
    # must be rejected with a SpecError mentioning hardware.gpu_count.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": True,
            "tensor-parallel-size": 4,
            "data-parallel-size": 2,
            "expert-parallel-size": 8,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": True,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_expert_parallel_sizes": [1, 2, 4, 8],
        },
    }
    bad_topology = {
        "observation": "Obs",
        "diagnosis": "Bad topology",
        "config_patch": {
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 2,
                "data-parallel-size": 2,
                "expert-parallel-size": 4,
            },
        },
        "expected_effects": ["raise throughput"],
    }
    with tempfile.TemporaryDirectory() as workdir:
        spec_file = _write_study_assets(Path(workdir), engine_overrides=engine_overrides)
        study = load_study_spec(spec_file)
        candidate = Proposal.from_dict(bad_topology)
        with self.assertRaisesRegex(SpecError, "must equal hardware.gpu_count"):
            validate_proposal(candidate, study)
def test_validate_proposal_rejects_invalid_ep_divisibility(self) -> None:
    # A proposal that only sets expert-parallel-size=3 must be rejected with a
    # SpecError whose message names that value.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": True,
            "tensor-parallel-size": 4,
            "data-parallel-size": 2,
            "expert-parallel-size": 8,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": True,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_expert_parallel_sizes": [1, 2, 4, 8],
        },
    }
    bad_ep = {
        "observation": "Obs",
        "diagnosis": "Bad EP",
        "config_patch": {
            "env_patch": {},
            "flag_patch": {
                "expert-parallel-size": 3,
            },
        },
        "expected_effects": ["raise throughput"],
    }
    with tempfile.TemporaryDirectory() as workdir:
        spec_file = _write_study_assets(Path(workdir), engine_overrides=engine_overrides)
        study = load_study_spec(spec_file)
        candidate = Proposal.from_dict(bad_ep)
        with self.assertRaisesRegex(SpecError, "expert-parallel-size=3"):
            validate_proposal(candidate, study)
def test_validate_proposal_accepts_valid_tp_dp_ep_combo(self) -> None:
    # A TP=2 / DP=4 / EP=4 proposal must pass validation, and the returned
    # proposal must still carry tensor-parallel-size=2.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": True,
            "tensor-parallel-size": 4,
            "data-parallel-size": 2,
            "expert-parallel-size": 8,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": True,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_expert_parallel_sizes": [1, 2, 4, 8],
        },
    }
    valid_topology = {
        "observation": "Obs",
        "diagnosis": "Valid topology",
        "config_patch": {
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 2,
                "data-parallel-size": 4,
                "expert-parallel-size": 4,
            },
        },
        "expected_effects": ["raise throughput"],
    }
    with tempfile.TemporaryDirectory() as workdir:
        spec_file = _write_study_assets(Path(workdir), engine_overrides=engine_overrides)
        study = load_study_spec(spec_file)
        accepted = validate_proposal(Proposal.from_dict(valid_topology), study)
        self.assertEqual(accepted.config_patch.flag_patch["tensor-parallel-size"], 2)
def test_validate_proposal_accepts_allowed_tp_dp_product_above_gpu_count(self) -> None:
    # With the gpu_count equality requirement disabled and allowed_tp_dp_products
    # listing 8, a TP=4 / DP=2 proposal must validate and keep data-parallel-size=2.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": False,
            "tensor-parallel-size": 4,
            "data-parallel-size": 1,
            "expert-parallel-size": 1,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": False,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tp_dp_products": [1, 2, 4, 8],
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_expert_parallel_sizes": [1],
        },
    }
    product_eight = {
        "observation": "Obs",
        "diagnosis": "Allow product 8",
        "config_patch": {
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "expert-parallel-size": 1,
            },
        },
        "expected_effects": ["explore larger topology"],
    }
    with tempfile.TemporaryDirectory() as workdir:
        spec_file = _write_study_assets(Path(workdir), engine_overrides=engine_overrides)
        study = load_study_spec(spec_file)
        accepted = validate_proposal(Proposal.from_dict(product_eight), study)
        self.assertEqual(accepted.config_patch.flag_patch["data-parallel-size"], 2)
def test_validate_proposal_rejects_tp_dp_product_outside_allowed_set(self) -> None:
    # TP=3 and DP=2 are individually allowed, but their product (6) is absent
    # from allowed_tp_dp_products, so validation must raise a SpecError that
    # quotes the allowed list.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": False,
            "tensor-parallel-size": 4,
            "data-parallel-size": 1,
            "expert-parallel-size": 1,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": False,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tp_dp_products": [1, 2, 4, 8],
            "allowed_tensor_parallel_sizes": [1, 2, 3, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 3, 4, 8],
            "allowed_expert_parallel_sizes": [1],
        },
    }
    product_six = {
        "observation": "Obs",
        "diagnosis": "Invalid product",
        "config_patch": {
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 3,
                "data-parallel-size": 2,
                "expert-parallel-size": 1,
            },
        },
        "expected_effects": ["explore invalid topology"],
    }
    with tempfile.TemporaryDirectory() as workdir:
        spec_file = _write_study_assets(Path(workdir), engine_overrides=engine_overrides)
        study = load_study_spec(spec_file)
        candidate = Proposal.from_dict(product_six)
        with self.assertRaisesRegex(SpecError, "not in \\[1, 2, 4, 8\\]"):
            validate_proposal(candidate, study)
def test_cli_tune_runs_multiple_manual_proposals(self) -> None:
    # Two --proposal-file arguments are passed to `study tune` with run_trial
    # replaced by a fake that writes result.json; the stored state must end up
    # pointing at the second (higher-rate) trial.
    manual_proposals = [
        (
            "proposal-1.json",
            {
                "observation": "trial one",
                "diagnosis": "conservative",
                "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
                "expected_effects": ["stable"],
                "why_not_previous_failures": "",
            },
        ),
        (
            "proposal-2.json",
            {
                "observation": "trial two",
                "diagnosis": "more batching",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
                "expected_effects": ["higher throughput"],
                "why_not_previous_failures": "",
            },
        ),
    ]

    def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
        # Stand-in for the worker: trial ids ending in "0001" report the lower
        # rate, every other id the higher one.
        spec = json.loads(trial_spec_path.read_text(encoding="utf-8"))
        trial_id = str(spec["trial_id"])
        artifact_dir = Path(spec["artifact_dir"])
        is_first = trial_id.endswith("0001")
        best_rate = 1.0 if is_first else 2.0
        best_u = 0.5 if is_first else 0.75
        outcome = {
            "study_id": spec["study_id"],
            "trial_id": trial_id,
            "status": "completed",
            "best_sampling_u": best_u,
            "best_request_rate": best_rate,
            "best_pass_rate": 1.0,
            "best_request_count": 2,
            "probes": [],
        }
        (artifact_dir / "result.json").write_text(json.dumps(outcome), encoding="utf-8")
        return outcome

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)

        proposal_files = []
        for file_name, body in manual_proposals:
            target = root / file_name
            target.write_text(json.dumps(body), encoding="utf-8")
            proposal_files.append(target)
        store_root = root / "store"

        cli_args = [
            "study",
            "tune",
            "--spec",
            str(spec_file),
            "--store-root",
            str(store_root),
            "--proposal-file",
            str(proposal_files[0]),
            "--proposal-file",
            str(proposal_files[1]),
        ]
        with mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial):
            exit_code = cli_main(cli_args)

        self.assertEqual(exit_code, 0)
        final_state = StudyStore(store_root).load_state("study-1")
        self.assertEqual(final_state.best_trial_id, "trial-0002")
        self.assertEqual(final_state.best_sampling_u, 0.75)
        self.assertEqual(final_state.best_request_rate, 2.0)
        self.assertEqual(final_state.next_trial_index, 3)
def test_cli_tune_honors_should_stop_proposal(self) -> None:
    # A manual proposal with should_stop=True must make `study tune` exit 0
    # without calling run_trial and leave next_trial_index at 1.
    stop_proposal = {
        "observation": "incumbent converged",
        "diagnosis": "no adjacent harness probe is justified",
        "config_patch": {"env_patch": {}, "flag_patch": {}},
        "expected_effects": ["stop without spending another GPU trial"],
        "why_not_previous_failures": "not applicable",
        "should_stop": True,
    }
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        stop_file = root / "stop.json"
        stop_file.write_text(json.dumps(stop_proposal), encoding="utf-8")
        store_root = root / "store"

        cli_args = [
            "study",
            "tune",
            "--spec",
            str(spec_file),
            "--store-root",
            str(store_root),
            "--proposal-file",
            str(stop_file),
        ]
        with mock.patch("aituner.cli.run_trial") as run_trial_mock:
            exit_code = cli_main(cli_args)

        self.assertEqual(exit_code, 0)
        run_trial_mock.assert_not_called()
        final_state = StudyStore(store_root).load_state("study-1")
        self.assertEqual(final_state.next_trial_index, 1)
def test_cli_tune_vetoes_unauthorized_llm_stop(self) -> None:
    # The mocked LLM always answers with should_stop=True. The CLI summary
    # printed to stdout must show at least one vetoed stop and a final honored
    # stop attributed to "llm_after_veto_budget"; no trial may run.
    llm_endpoint = {
        "provider": "custom",
        "base_url": "http://localhost:9/v1",
        "model": "test-model",
        "api_key_env": "AITUNER_TEST_KEY",
    }
    stop_payload = json.dumps(
        {
            "observation": "looks done",
            "diagnosis": "agent thinks it converged",
            "config_patch": {"env_patch": {}, "flag_patch": {}},
            "expected_effects": ["stop"],
            "why_not_previous_failures": "n/a",
            "should_stop": True,
        }
    )
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        raw_spec = json.loads(spec_file.read_text(encoding="utf-8"))
        raw_spec["llm"]["endpoint"] = llm_endpoint
        spec_file.write_text(json.dumps(raw_spec), encoding="utf-8")
        store_root = root / "store"

        cli_args = [
            "study",
            "tune",
            "--spec",
            str(spec_file),
            "--store-root",
            str(store_root),
            "--skip-baseline",
            "--max-trials",
            "2",
        ]
        captured = io.StringIO()
        with mock.patch("aituner.cli.run_trial") as run_trial_mock, mock.patch(
            "aituner.cli.call_llm_for_proposal", return_value=stop_payload
        ), contextlib.redirect_stdout(captured):
            exit_code = cli_main(cli_args)

        self.assertEqual(exit_code, 0)
        run_trial_mock.assert_not_called()
        executed = json.loads(captured.getvalue())["executed_trials"]
        # The first unauthorized LLM stop is vetoed; a later one is honored
        # only once the veto budget has been spent.
        vetoed_flags = [entry.get("stop_vetoed") for entry in executed]
        self.assertTrue(any(vetoed_flags))
        honored = [entry for entry in executed if entry.get("stopped")]
        self.assertTrue(honored)
        self.assertEqual(honored[-1]["stop_authorized_by"], "llm_after_veto_budget")
def test_cli_tune_rejects_repeated_materialized_llm_config(self) -> None:
    # The saved state already contains trial-0002 with TP=2 / DP=4 /
    # max-num-seqs=160. The mocked LLM proposes only max-num-seqs=160; the CLI
    # must exit with code 2, run no trial, and name the duplicate on stderr.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "tensor-parallel-size": 4,
            "data-parallel-size": 2,
            "max-num-seqs": 64,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "max-num-seqs",
        ],
        "topology_constraints": {
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_tp_dp_products": [1, 2, 4, 8],
        },
    }
    llm_endpoint = {
        "provider": "custom",
        "base_url": "http://localhost:9/v1",
        "model": "test-model",
        "api_key_env": "AITUNER_TEST_KEY",
    }
    repeated_runtime_patch = json.dumps(
        {
            "observation": "Try the same runtime setting.",
            "diagnosis": "This is duplicate after topology inheritance.",
            "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}},
            "expected_effects": ["should be vetoed"],
            "why_not_previous_failures": "",
            "should_stop": False,
        }
    )
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root, engine_overrides=engine_overrides)

        # Disable the harness and point the spec at a dummy LLM endpoint.
        raw_spec = json.loads(spec_file.read_text(encoding="utf-8"))
        raw_spec["llm"]["use_harness"] = False
        raw_spec["llm"]["endpoint"] = llm_endpoint
        spec_file.write_text(json.dumps(raw_spec), encoding="utf-8")

        study = load_study_spec(spec_file)
        store_root = root / "store"
        store = StudyStore(store_root)
        store.init_study(spec_path=spec_file, study=study)

        tested_trial = TrialSummary(
            trial_id="trial-0002",
            status="completed",
            parallel_size=8,
            best_sampling_u=0.125,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.375,
            config_patch={
                "env_patch": {},
                "flag_patch": {
                    "tensor-parallel-size": 2,
                    "data-parallel-size": 4,
                    "max-num-seqs": 160,
                },
            },
        )
        seeded_state = StudyState(
            study_id=study.study_id,
            best_trial_id="trial-0002",
            best_parallel_size=8,
            best_sampling_u=0.125,
            best_request_rate=3.0,
            best_request_rate_per_gpu=0.375,
            next_trial_index=3,
            trials=[tested_trial],
        )
        store.save_state(seeded_state)

        cli_args = [
            "study",
            "tune",
            "--spec",
            str(spec_file),
            "--store-root",
            str(store_root),
            "--skip-baseline",
            "--max-trials",
            "3",
        ]
        captured_err = io.StringIO()
        with mock.patch("aituner.cli.run_trial") as run_trial_mock, mock.patch(
            "aituner.cli.call_llm_for_proposal", return_value=repeated_runtime_patch
        ), contextlib.redirect_stderr(captured_err):
            exit_code = cli_main(cli_args)

        self.assertEqual(exit_code, 2)
        run_trial_mock.assert_not_called()
        self.assertIn(
            "repeats an already tested effective full config", captured_err.getvalue()
        )
        self.assertIn("trial-0002", captured_err.getvalue())
def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
    # With four completed trials seeded into the state, `study tune` must stop
    # without calling the LLM or running a trial, and must persist the harness
    # stop proposal, the candidate-set snapshot and the stop metadata.
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        study = load_study_spec(spec_file)
        store_root = root / "store"
        store = StudyStore(store_root)
        store.init_study(spec_path=spec_file, study=study)

        history = [
            TrialSummary(
                trial_id="trial-0001",
                status="completed",
                parallel_size=8,
                best_request_rate=0.8,
                best_request_rate_per_gpu=0.1,
                config_patch={"env_patch": {}, "flag_patch": {}},
            ),
            TrialSummary(
                trial_id="trial-0002",
                status="completed",
                parallel_size=8,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 2,
                        "data-parallel-size": 4,
                    },
                },
            ),
            TrialSummary(
                trial_id="trial-0003",
                status="completed",
                parallel_size=8,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {
                        "tensor-parallel-size": 1,
                        "data-parallel-size": 8,
                    },
                },
            ),
            TrialSummary(
                trial_id="trial-0004",
                status="completed",
                parallel_size=8,
                config_patch={
                    "env_patch": {},
                    "flag_patch": {"max-num-seqs": 160},
                },
            ),
        ]
        store.save_state(
            StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_sampling_u=0.02,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                next_trial_index=5,
                trials=history,
            )
        )

        cli_args = [
            "study",
            "tune",
            "--spec",
            str(spec_file),
            "--store-root",
            str(store_root),
            "--max-trials",
            "5",
        ]
        with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock, mock.patch(
            "aituner.cli.run_trial"
        ) as run_trial_mock:
            exit_code = cli_main(cli_args)

        self.assertEqual(exit_code, 0)
        llm_mock.assert_not_called()
        run_trial_mock.assert_not_called()

        # Persisted stop proposal.
        study_root = store.study_root(study.study_id)
        stop_file = study_root / "proposals" / "harness-stop-0005.json"
        self.assertTrue(stop_file.exists())
        stop_proposal = json.loads(stop_file.read_text(encoding="utf-8"))
        self.assertTrue(stop_proposal["should_stop"])

        # Persisted candidate-set snapshot for the same iteration.
        snapshot_file = study_root / "harness" / "candidate-set-0005.json"
        self.assertTrue(snapshot_file.exists())
        snapshot = json.loads(snapshot_file.read_text(encoding="utf-8"))
        self.assertEqual(snapshot["schema_version"], 1)
        self.assertEqual(snapshot["iteration"], 5)
        for required_key in ("candidate_set_hash", "candidate_set"):
            self.assertIn(required_key, snapshot)
        for decision_key in ("harness_stop", "stop_authority"):
            self.assertIn(decision_key, snapshot["decisions"])

        # Stop metadata recorded on the study state.
        final_state = store.load_state(study.study_id)
        self.assertEqual(final_state.tuning_stop_reason, "harness_stop")
        stop_details = final_state.tuning_stop_details
        self.assertEqual(stop_details["proposal_name"], "harness-stop-0005")
        self.assertEqual(stop_details["proposal_source"], "harness")
        self.assertEqual(stop_details["stop_authorized_by"], "validator")
        self.assertTrue(final_state.tuning_stop_diagnosis)
def test_cli_tune_llm_first_skips_deterministic_harness_proposal(self) -> None:
    # Under --proposal-policy llm-first the LLM must be called exactly once and
    # its proposal stored as proposal-0002.json; no harness-proposal-0002.json
    # may appear, while the candidate-set snapshot is still written.
    llm_endpoint = {
        "provider": "custom",
        "base_url": "http://llm.example/v1",
        "wire_api": "chat.completions",
        "model": "test-model",
        "api_key_env": "OPENAI_API_KEY",
    }
    llm_payload = json.dumps(
        {
            "observation": "Use harness evidence but let the LLM choose.",
            "diagnosis": "Try higher admission concurrency.",
            "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
            "expected_effects": ["measure admission concurrency"],
            "why_not_previous_failures": "does not repeat a prior full config",
            "should_stop": False,
        }
    )

    def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
        # Stand-in for the worker: always reports a completed trial and writes
        # result.json into the trial's artifact directory.
        trial_payload = json.loads(trial_spec_path.read_text(encoding="utf-8"))
        artifact_dir = Path(trial_payload["artifact_dir"])
        outcome = {
            "study_id": trial_payload["study_id"],
            "trial_id": trial_payload["trial_id"],
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 1.0,
            "best_request_count": 2,
            "probes": [],
        }
        (artifact_dir / "result.json").write_text(json.dumps(outcome), encoding="utf-8")
        return outcome

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        raw_spec = json.loads(spec_file.read_text(encoding="utf-8"))
        raw_spec["llm"]["endpoint"] = llm_endpoint
        spec_file.write_text(json.dumps(raw_spec), encoding="utf-8")

        study = load_study_spec(spec_file)
        store_root = root / "store"
        store = StudyStore(store_root)
        store.init_study(spec_path=spec_file, study=study)

        baseline_summary = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=8,
            best_request_rate=1.0,
            best_request_rate_per_gpu=0.125,
            config_patch={"env_patch": {}, "flag_patch": {}},
        )
        store.save_state(
            StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=8,
                best_sampling_u=0.25,
                best_request_rate=1.0,
                best_request_rate_per_gpu=0.125,
                next_trial_index=2,
                trials=[baseline_summary],
            )
        )

        cli_args = [
            "study",
            "tune",
            "--spec",
            str(spec_file),
            "--store-root",
            str(store_root),
            "--skip-baseline",
            "--max-trials",
            "2",
            "--proposal-policy",
            "llm-first",
        ]
        with mock.patch(
            "aituner.cli.call_llm_for_proposal", return_value=llm_payload
        ) as llm_mock, mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial):
            exit_code = cli_main(cli_args)

        self.assertEqual(exit_code, 0)
        llm_mock.assert_called_once()
        study_root = store.study_root(study.study_id)
        proposals_dir = study_root / "proposals"
        self.assertTrue((proposals_dir / "proposal-0002.json").exists())
        self.assertFalse((proposals_dir / "harness-proposal-0002.json").exists())
        self.assertTrue((study_root / "harness" / "candidate-set-0002.json").exists())
def test_cli_tune_records_advisory_llm_out_of_set_candidate_family_gap(self) -> None:
    # The patched harness context offers only max-num-seqs=16, while the mocked
    # LLM proposes max-num-seqs=24. The trial must still run, be attributed as
    # "llm_out_of_set" under the "advisory" policy, and a candidate-family gap
    # record must be written for review.
    llm_endpoint = {
        "provider": "custom",
        "base_url": "http://llm.example/v1",
        "wire_api": "chat.completions",
        "model": "test-model",
        "api_key_env": "OPENAI_API_KEY",
    }
    harness_context = {
        "experiment_plan": {
            "planner_version": "test",
            "candidate_set": {
                "candidate_set_hash": "candidate-set-test",
                "eligible_candidates": [
                    {
                        "candidate_id": "cand-mns16",
                        "action_id": "coordinate_step:max-num-seqs:8->16",
                        "knob_family": "max-num-seqs",
                        "score": 0.8,
                        "effective_config_fingerprint": "not-the-llm-proposal",
                        "config_patch": {
                            "env_patch": {},
                            "flag_patch": {"max-num-seqs": 16},
                        },
                    }
                ],
                "blocked_candidates": [],
            },
            "next_action": None,
        }
    }
    llm_payload = json.dumps(
        {
            "observation": "Harness is in the right admission direction but too conservative.",
            "diagnosis": "Try a larger same-operator admission step.",
            "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 24}},
            "expected_effects": ["test whether admission capacity was underexplored"],
            "why_not_previous_failures": "new value and no launch failure evidence",
            "should_stop": False,
        }
    )

    def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
        # Stand-in for the worker: always reports a completed trial and writes
        # result.json into the trial's artifact directory.
        trial_payload = json.loads(trial_spec_path.read_text(encoding="utf-8"))
        artifact_dir = Path(trial_payload["artifact_dir"])
        outcome = {
            "study_id": trial_payload["study_id"],
            "trial_id": trial_payload["trial_id"],
            "status": "completed",
            "best_sampling_u": 0.5,
            "best_request_rate": 2.0,
            "best_pass_rate": 1.0,
            "best_request_count": 2,
            "probes": [],
        }
        (artifact_dir / "result.json").write_text(json.dumps(outcome), encoding="utf-8")
        return outcome

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        spec_file = _write_study_assets(root)
        raw_spec = json.loads(spec_file.read_text(encoding="utf-8"))
        raw_spec["llm"]["endpoint"] = llm_endpoint
        spec_file.write_text(json.dumps(raw_spec), encoding="utf-8")

        study = load_study_spec(spec_file)
        store_root = root / "store"
        store = StudyStore(store_root)
        store.init_study(spec_path=spec_file, study=study)

        baseline_summary = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=1,
            best_request_rate=1.0,
            best_request_rate_per_gpu=1.0,
            config_patch={
                "env_patch": {},
                "flag_patch": {"max-num-seqs": 8},
            },
        )
        store.save_state(
            StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=1,
                best_sampling_u=0.25,
                best_request_rate=1.0,
                best_request_rate_per_gpu=1.0,
                next_trial_index=2,
                trials=[baseline_summary],
            )
        )

        cli_args = [
            "study",
            "tune",
            "--spec",
            str(spec_file),
            "--store-root",
            str(store_root),
            "--skip-baseline",
            "--max-trials",
            "2",
            "--proposal-policy",
            "llm-first",
        ]
        captured = io.StringIO()
        # Contexts are entered in the same order as the original nested blocks
        # and unwound in reverse when the stack closes.
        with contextlib.ExitStack() as patches:
            patches.enter_context(
                mock.patch("aituner.cli.build_harness_context", return_value=harness_context)
            )
            patches.enter_context(
                mock.patch("aituner.llm.build_harness_context", return_value=harness_context)
            )
            patches.enter_context(
                mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload)
            )
            patches.enter_context(
                mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial)
            )
            patches.enter_context(contextlib.redirect_stdout(captured))
            exit_code = cli_main(cli_args)

        self.assertEqual(exit_code, 0)
        summary = json.loads(captured.getvalue())
        first_executed = summary["executed_trials"][0]
        self.assertEqual(first_executed["proposal_origin"], "llm_out_of_set")
        self.assertTrue(first_executed["candidate_family_gap_path"])

        # Attribution record written next to the study.
        attribution_file = (
            store.study_root(study.study_id) / "proposal_attributions" / "proposal-0002.json"
        )
        attribution = json.loads(attribution_file.read_text(encoding="utf-8"))
        self.assertEqual(attribution["proposal_origin"], "llm_out_of_set")
        self.assertEqual(attribution["harness_candidate_policy"], "advisory")

        # Gap record referenced from the CLI summary.
        gap_file = Path(first_executed["candidate_family_gap_path"])
        gap = json.loads(gap_file.read_text(encoding="utf-8"))
        self.assertEqual(gap["gap_type"], "same_operator_new_step")
        self.assertEqual(gap["review_status"], "pending")
        self.assertEqual(gap["changed_knobs"], ["flag:max-num-seqs"])
        self.assertEqual(gap["proposal_patch"]["flag_patch"]["max-num-seqs"], 24)
        nearest = gap["nearest_harness_candidates"][0]
        self.assertEqual(nearest["candidate_id"], "cand-mns16")
def test_cli_tune_strict_harness_policy_rejects_llm_out_of_set_proposal(self) -> None:
    """Strict candidate policy rejects an LLM proposal outside the harness set.

    The patched harness context only offers ``max-num-seqs=16`` while the
    mocked LLM answers with ``max-num-seqs=24``.  The test expects exit
    code 2, no call to ``run_trial`` and the policy name on stderr.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root)

        # Rewrite the study spec: strict candidate policy plus a stub endpoint.
        spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
        llm_section = spec_doc["llm"]
        llm_section["harness_candidate_policy"] = "strict"
        llm_section["endpoint"] = {
            "provider": "custom",
            "base_url": "http://llm.example/v1",
            "wire_api": "chat.completions",
            "model": "test-model",
            "api_key_env": "OPENAI_API_KEY",
        }
        study_path.write_text(json.dumps(spec_doc), encoding="utf-8")

        study = load_study_spec(study_path)
        store_root = root / "store"
        store = StudyStore(store_root)
        store.init_study(spec_path=study_path, study=study)

        # Seed the store with one completed trial so tuning resumes at index 2.
        finished_trial = TrialSummary(
            trial_id="trial-0001",
            status="completed",
            parallel_size=1,
            best_request_rate=1.0,
            best_request_rate_per_gpu=1.0,
            config_patch={"env_patch": {}, "flag_patch": {"max-num-seqs": 8}},
        )
        store.save_state(
            StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0001",
                best_parallel_size=1,
                best_request_rate=1.0,
                best_request_rate_per_gpu=1.0,
                next_trial_index=2,
                trials=[finished_trial],
            )
        )

        only_candidate = {
            "candidate_id": "cand-mns16",
            "effective_config_fingerprint": "not-the-llm-proposal",
            "config_patch": {
                "env_patch": {},
                "flag_patch": {"max-num-seqs": 16},
            },
        }
        harness_context = {
            "experiment_plan": {
                "candidate_set": {
                    "candidate_set_hash": "candidate-set-test",
                    "eligible_candidates": [only_candidate],
                }
            }
        }
        llm_payload = json.dumps(
            {
                "observation": "Try an out-of-set candidate.",
                "diagnosis": "strict mode should reject this.",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 24}},
                "expected_effects": ["should not run"],
                "why_not_previous_failures": "",
                "should_stop": False,
            }
        )
        argv = [
            "study",
            "tune",
            "--spec",
            str(study_path),
            "--store-root",
            str(store_root),
            "--skip-baseline",
            "--max-trials",
            "2",
            "--proposal-policy",
            "llm-first",
        ]

        captured_stderr = io.StringIO()
        with contextlib.ExitStack() as patches:
            # Same patch order as nested ``with`` blocks: outermost first.
            patches.enter_context(
                mock.patch("aituner.cli.build_harness_context", return_value=harness_context)
            )
            patches.enter_context(
                mock.patch("aituner.llm.build_harness_context", return_value=harness_context)
            )
            patches.enter_context(
                mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload)
            )
            run_trial_mock = patches.enter_context(mock.patch("aituner.cli.run_trial"))
            patches.enter_context(contextlib.redirect_stderr(captured_stderr))
            exit_code = cli_main(argv)

        self.assertEqual(exit_code, 2)
        run_trial_mock.assert_not_called()
        self.assertIn("llm.harness_candidate_policy=strict", captured_stderr.getvalue())
def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
    """``study tune`` records an empty-patch trial first, then the LLM's patch.

    With two trials allowed and no ``--skip-baseline``, the stored state is
    expected to hold an empty config patch for the first trial and the
    mocked LLM's ``max-num-seqs=64`` patch for the second.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root)

        spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
        spec_doc["llm"]["endpoint"] = {
            "provider": "custom",
            "base_url": "http://llm.example/v1",
            "wire_api": "chat.completions",
            "model": "test-model",
            "api_key_env": "OPENAI_API_KEY",
        }
        study_path.write_text(json.dumps(spec_doc), encoding="utf-8")
        store_root = root / "store"

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Write and return a canned feasible result for the given trial spec."""
            trial_doc = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            artifact_root = Path(trial_doc["artifact_dir"])
            outcome = {
                "study_id": trial_doc["study_id"],
                "trial_id": trial_doc["trial_id"],
                "status": "completed",
                "best_sampling_u": 0.25,
                "best_request_rate": 1.0,
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            result_file = artifact_root / "result.json"
            result_file.write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        llm_payload = json.dumps(
            {
                "observation": "baseline done",
                "diagnosis": "try more batching",
                "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 64}},
                "expected_effects": ["higher throughput"],
                "why_not_previous_failures": "",
                "should_stop": False,
            }
        )
        argv = [
            "study",
            "tune",
            "--spec",
            str(study_path),
            "--store-root",
            str(store_root),
            "--max-trials",
            "2",
        ]

        run_trial_patch = mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial)
        llm_patch = mock.patch("aituner.cli.call_llm_for_proposal", return_value=llm_payload)
        with run_trial_patch, llm_patch:
            exit_code = cli_main(argv)

        self.assertEqual(exit_code, 0)
        state = StudyStore(store_root).load_state("study-1")
        self.assertEqual(state.next_trial_index, 3)
        first_trial, second_trial = state.trials[0], state.trials[1]
        self.assertEqual(first_trial.config_patch, {"env_patch": {}, "flag_patch": {}})
        self.assertEqual(second_trial.config_patch["flag_patch"], {"max-num-seqs": 64})
def test_cli_tune_stops_when_baseline_is_all_infeasible(self) -> None:
    """Tuning halts after a baseline trial that has no feasible probe.

    The first CLI run is expected to record one trial, never consult the
    LLM and persist a ``baseline_all_infeasible`` stop reason with latency
    details.  A second run against the same store is expected to launch
    neither a trial nor an LLM call.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root)

        spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
        spec_doc["llm"]["endpoint"] = {
            "provider": "custom",
            "base_url": "http://llm.example/v1",
            "wire_api": "chat.completions",
            "model": "test-model",
            "api_key_env": "OPENAI_API_KEY",
        }
        study_path.write_text(json.dumps(spec_doc), encoding="utf-8")
        store_root = root / "store"

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Write and return a result in which every probe is infeasible."""
            trial_doc = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            artifact_root = Path(trial_doc["artifact_dir"])
            infeasible_probes = [
                {
                    "threshold": 0.5,
                    "feasible": False,
                    "payload": {"pass_rate": 0.0, "request_rate": 2.0},
                },
                {
                    "threshold": 0.25,
                    "feasible": False,
                    "payload": {"pass_rate": 0.5, "request_rate": 1.0},
                },
            ]
            latency_summary = {
                "ttft_ms": {
                    "count": 2,
                    "mean": 1200.0,
                    "p50": 1100.0,
                    "p95": 1900.0,
                    "p99": 1980.0,
                },
                "tpot_ms": {
                    "count": 2,
                    "mean": 35.0,
                    "p50": 32.0,
                    "p95": 48.0,
                    "p99": 49.0,
                },
            }
            outcome = {
                "study_id": trial_doc["study_id"],
                "trial_id": trial_doc["trial_id"],
                "status": "completed",
                "best_sampling_u": None,
                "best_request_rate": None,
                "best_pass_rate": None,
                "best_request_count": None,
                "probes": infeasible_probes,
                "all_infeasible_diagnostics": {
                    "threshold": 0.25,
                    "request_rate": 1.0,
                    "pass_rate": 0.5,
                    "early_stop_reason": "slo_pass_rate_unrecoverable",
                    "latency_summary": latency_summary,
                },
            }
            (artifact_root / "result.json").write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        # Both invocations use the same command line.
        argv = [
            "study",
            "tune",
            "--spec",
            str(study_path),
            "--store-root",
            str(store_root),
            "--max-trials",
            "3",
        ]

        # First run: the baseline is executed and found infeasible.
        run_trial_patch = mock.patch("aituner.cli.run_trial", side_effect=fake_run_trial)
        llm_patch = mock.patch("aituner.cli.call_llm_for_proposal")
        with run_trial_patch, llm_patch as llm_mock:
            exit_code = cli_main(list(argv))
        self.assertEqual(exit_code, 0)
        llm_mock.assert_not_called()

        state = StudyStore(store_root).load_state("study-1")
        self.assertEqual(state.next_trial_index, 2)
        self.assertEqual(len(state.trials), 1)
        self.assertEqual(state.tuning_stop_reason, "baseline_all_infeasible")
        self.assertIn("lowest_sampled_request_rate=1", state.tuning_stop_diagnosis)
        self.assertIn("lowest_probe_ttft_ms", state.tuning_stop_diagnosis)
        probe_latency = state.tuning_stop_details["lowest_probe_latency_ms"]
        self.assertEqual(probe_latency["ttft"]["p95"], 1900.0)
        self.assertEqual(probe_latency["tpot"]["p99"], 49.0)

        # Second run: the persisted stop must short-circuit everything.
        idle_run_trial_patch = mock.patch("aituner.cli.run_trial")
        idle_llm_patch = mock.patch("aituner.cli.call_llm_for_proposal")
        with idle_run_trial_patch as run_trial_mock, idle_llm_patch as llm_mock:
            exit_code = cli_main(list(argv))
        self.assertEqual(exit_code, 0)
        run_trial_mock.assert_not_called()
        llm_mock.assert_not_called()
def test_cli_tune_max_trials_is_total_budget_on_resume(self) -> None:
    """``--max-trials`` counts trials already stored, not only new ones.

    The store is seeded with two completed trials; running with
    ``--max-trials 2`` is expected to call neither the LLM nor
    ``run_trial`` and to leave ``next_trial_index`` at 3.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root)

        spec_doc = json.loads(study_path.read_text(encoding="utf-8"))
        spec_doc["llm"]["endpoint"] = {
            "provider": "custom",
            "base_url": "http://llm.example/v1",
            "wire_api": "chat.completions",
            "model": "test-model",
            "api_key_env": "OPENAI_API_KEY",
        }
        study_path.write_text(json.dumps(spec_doc), encoding="utf-8")

        store_root = root / "store"
        study = load_study_spec(study_path)
        store = StudyStore(store_root)
        store.init_study(spec_path=study_path, study=study)

        # Two finished trials already consume the whole budget.
        prior_trials = [
            TrialSummary(trial_id="trial-0001", status="completed"),
            TrialSummary(trial_id="trial-0002", status="completed"),
        ]
        store.save_state(
            StudyState(
                study_id=study.study_id,
                next_trial_index=3,
                trials=prior_trials,
            )
        )

        argv = [
            "study",
            "tune",
            "--spec",
            str(study_path),
            "--store-root",
            str(store_root),
            "--max-trials",
            "2",
        ]
        llm_patch = mock.patch("aituner.cli.call_llm_for_proposal")
        run_trial_patch = mock.patch("aituner.cli.run_trial")
        with llm_patch as llm_mock, run_trial_patch as run_trial_mock:
            exit_code = cli_main(argv)

        self.assertEqual(exit_code, 0)
        llm_mock.assert_not_called()
        run_trial_mock.assert_not_called()
        reloaded = store.load_state(study.study_id)
        self.assertEqual(reloaded.next_trial_index, 3)
def test_load_compare_spec_requires_window_selection(self) -> None:
    """A compare spec with no window selection raises ``SpecError``.

    The spec written here has neither ``window_ids`` nor
    ``window_selector``; the error message must mention both names.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root)

        empty_patch_candidate = {"config_patch": {"env_patch": {}, "flag_patch": {}}}
        compare_doc = {
            "compare_id": "compare-1",
            "study_spec_path": str(study_path),
            "baseline": empty_patch_candidate,
            "tuned": {"config_patch": {"env_patch": {}, "flag_patch": {}}},
        }
        compare_path = root / "compare.json"
        compare_path.write_text(json.dumps(compare_doc), encoding="utf-8")

        with self.assertRaisesRegex(SpecError, "window_ids or window_selector"):
            load_compare_spec(compare_path)
def test_run_compare_outputs_summary_and_report(self) -> None:
    """``run_compare`` over two windows yields a summary and report files.

    A second chat window is added to the fixture, ``run_trial`` is replaced
    by a stub whose rates favour the tuned candidate in both windows, and
    the test checks the aggregate win count plus the presence of
    ``summary.json`` and ``report.md`` in the output root.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root)

        # Add a trace file for the extra window.
        extra_request = {
            "request_id": "r4",
            "timestamp": 0.0,
            "sampling_u": 0.2,
            "messages": [{"role": "user", "content": "extra"}],
            "input_length": 3000,
            "output_length": 32,
        }
        trace_path = root / "trace_windows" / "traces" / "chat_w2.jsonl"
        trace_path.write_text(json.dumps(extra_request) + "\n", encoding="utf-8")

        # Register the window and annotate the existing one with slot metadata.
        windows_path = root / "trace_windows" / "windows.json"
        catalog = json.loads(windows_path.read_text(encoding="utf-8"))
        catalog["windows"].append(
            {
                "window_id": "chat_w2",
                "trace_type": "chat",
                "trace_file": "traces/chat_w2.jsonl",
                "window_start": 0.0,
                "window_end": 10.0,
                "date": "2026-03-12",
                "slot_token": "1000",
                "slot_label": "10:00-10:10",
            }
        )
        first_window = catalog["windows"][0]
        first_window["date"] = "2026-03-11"
        first_window["slot_token"] = "1000"
        first_window["slot_label"] = "10:00-10:10"
        windows_path.write_text(json.dumps(catalog), encoding="utf-8")

        compare_path = _write_compare_assets(
            root,
            study_path=study_path,
            window_ids=["chat_w1", "chat_w2"],
        )

        rates_by_window_and_trial = {
            ("chat_w1", "baseline"): 1.0,
            ("chat_w1", "tuned"): 3.0,
            ("chat_w2", "baseline"): 3.0,
            ("chat_w2", "tuned"): 7.0,
        }

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Return a canned result whose rate depends on window and candidate."""
            trial_doc = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            # The file at study_spec_path is read as text holding the path
            # of the actual study spec, which is then loaded as JSON.
            pointer_path = Path(trial_doc["study_spec_path"])
            real_spec_path = Path(pointer_path.read_text(encoding="utf-8").strip())
            study_doc = json.loads(real_spec_path.read_text(encoding="utf-8"))
            window_id = study_doc["trace"]["window_id"]
            trial_id = trial_doc["trial_id"]
            outcome = {
                "study_id": trial_doc["study_id"],
                "trial_id": trial_id,
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": rates_by_window_and_trial[(window_id, trial_id)],
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            Path(trial_doc["result_path"]).write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        output_root = root / ".compare"
        with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial):
            summary = run_compare(compare_path, output_root=root / ".compare")

        self.assertEqual(len(summary["windows"]), 2)
        self.assertEqual(summary["aggregate"]["wins"]["tuned"], 2)
        for artifact_name in ("summary.json", "report.md"):
            self.assertTrue((output_root / artifact_name).exists())
def test_compare_aggregate_counts_failed_and_no_feasible_windows(self) -> None:
    """``_aggregate_summary`` counts completed, failed and no-feasible windows.

    Two window rows are supplied: one where the tuned side completed with
    no rate, and one where the baseline failed.  The expected per-side
    counters are asserted individually.
    """

    def side(status: str, rate: float | None) -> dict[str, object]:
        # Both rate fields carry the same value in every fixture row.
        return {
            "status": status,
            "best_request_rate": rate,
            "best_request_rate_per_gpu": rate,
        }

    window_rows = [
        {
            "baseline": side("completed", 1.0),
            "tuned": side("completed", None),
            "delta": {"winner": "baseline"},
        },
        {
            "baseline": side("failed", None),
            "tuned": side("completed", 2.0),
            "delta": {"winner": "tuned"},
        },
    ]
    summary = _aggregate_summary(window_rows)

    expected_counts = (
        ("baseline_completed_window_count", 1),
        ("baseline_failed_window_count", 1),
        ("baseline_no_feasible_window_count", 1),
        ("tuned_completed_window_count", 2),
        ("tuned_failed_window_count", 0),
        ("tuned_no_feasible_window_count", 1),
    )
    for key, expected in expected_counts:
        self.assertEqual(summary[key], expected)
def test_run_compare_resolves_trial_ref_candidate(self) -> None:
    """A ``trial_ref`` baseline is resolved to the referenced trial's patch.

    A prior study directory with a ``trial_spec.json`` is fabricated; the
    compare run must report the baseline source kind as ``trial_ref`` and
    carry the referenced ``data-parallel-size`` flag into the window result.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root)

        # Fabricate the prior study's trial directory and spec file.
        prior_root = root / "prior-study"
        trial_dir = prior_root / "trials" / "trial-0002"
        trial_dir.mkdir(parents=True)
        prior_trial_spec = {
            "study_id": "prior-study",
            "trial_id": "trial-0002",
            "config_patch": {
                "env_patch": {},
                "flag_patch": {"data-parallel-size": 2},
            },
            "search": {
                "low": 0.0,
                "high": 1.0,
                "tolerance": 0.01,
                "max_probes": 8,
                "sample_seed": 20260325,
            },
            "study_spec_path": str(study_path),
            "artifact_dir": str(trial_dir),
            "probe_log_path": str(trial_dir / "probe_history.json"),
            "engine_log_path": str(trial_dir / "engine.log"),
            "result_path": str(trial_dir / "result.json"),
        }
        spec_file = trial_dir / "trial_spec.json"
        spec_file.write_text(json.dumps(prior_trial_spec), encoding="utf-8")

        baseline_ref = {
            "trial_ref": {
                "study_root": str(prior_root),
                "trial_id": "trial-0002",
            }
        }
        compare_path = _write_compare_assets(
            root,
            study_path=study_path,
            window_ids=["chat_w1"],
            baseline=baseline_ref,
        )

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Return a canned result; the rate depends on ``data-parallel-size``."""
            trial_doc = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            flag_patch = (trial_doc["config_patch"] or {}).get("flag_patch") or {}
            if flag_patch.get("data-parallel-size") == 2:
                best_rate = 5.0
            else:
                best_rate = 2.0
            outcome = {
                "study_id": trial_doc["study_id"],
                "trial_id": trial_doc["trial_id"],
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": best_rate,
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            Path(trial_doc["result_path"]).write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial):
            summary = run_compare(compare_path, output_root=root / ".compare")

        self.assertEqual(summary["baseline_source"]["kind"], "trial_ref")
        baseline_flags = summary["windows"][0]["baseline"]["config_patch"]["flag_patch"]
        self.assertEqual(baseline_flags["data-parallel-size"], 2)
def test_run_compare_window_selector_filters_windows(self) -> None:
    """``window_selector`` narrows the compared windows by type and date prefix.

    Two windows dated 2026-03-12 (one chat, one thinking) are added and the
    existing window is dated 2026-03-11.  Selecting
    ``trace_type=chat`` with ``date_prefix=2026-03-12`` must leave only
    ``chat_w2`` in the summary.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        study_path = _write_study_assets(root)

        # One single-request trace file per additional window.
        trace_dir = root / "trace_windows" / "traces"
        for file_name in ("chat_w2.jsonl", "thinking_w3.jsonl"):
            request_row = {
                "request_id": file_name,
                "timestamp": 0.0,
                "sampling_u": 0.2,
                "messages": [{"role": "user", "content": file_name}],
                "input_length": 3000,
                "output_length": 32,
            }
            (trace_dir / file_name).write_text(
                json.dumps(request_row) + "\n",
                encoding="utf-8",
            )

        windows_path = root / "trace_windows" / "windows.json"
        catalog = json.loads(windows_path.read_text(encoding="utf-8"))
        catalog["windows"][0]["date"] = "2026-03-11"
        catalog["windows"][0]["slot_token"] = "1000"
        for window_id, trace_type in (("chat_w2", "chat"), ("thinking_w3", "thinking")):
            catalog["windows"].append(
                {
                    "window_id": window_id,
                    "trace_type": trace_type,
                    "trace_file": f"traces/{window_id}.jsonl",
                    "window_start": 0.0,
                    "window_end": 10.0,
                    "date": "2026-03-12",
                    "slot_token": "1000",
                }
            )
        windows_path.write_text(json.dumps(catalog), encoding="utf-8")

        compare_path = _write_compare_assets(
            root,
            study_path=study_path,
            window_selector={"trace_type": "chat", "date_prefix": "2026-03-12"},
        )

        def fake_run_trial(trial_spec_path: Path) -> dict[str, object]:
            """Write and return an identical feasible result for every trial."""
            trial_doc = json.loads(trial_spec_path.read_text(encoding="utf-8"))
            outcome = {
                "study_id": trial_doc["study_id"],
                "trial_id": trial_doc["trial_id"],
                "status": "completed",
                "best_sampling_u": 0.5,
                "best_request_rate": 1.0,
                "best_pass_rate": 1.0,
                "best_request_count": 2,
                "probes": [],
            }
            Path(trial_doc["result_path"]).write_text(json.dumps(outcome), encoding="utf-8")
            return outcome

        with mock.patch("aituner.compare.run_trial", side_effect=fake_run_trial):
            summary = run_compare(compare_path, output_root=root / ".compare")

        selected_ids = [row["window_id"] for row in summary["windows"]]
        self.assertEqual(selected_ids, ["chat_w2"])
def test_proposal_expected_effects_accepts_string(self) -> None:
    """A bare string in ``expected_effects`` becomes a one-element list."""
    raw_proposal = {
        "observation": "obs",
        "diagnosis": "diag",
        "config_patch": {"env_patch": {}, "flag_patch": {}},
        "expected_effects": "higher throughput",
    }
    parsed = Proposal.from_dict(raw_proposal)
    self.assertEqual(parsed.expected_effects, ["higher throughput"])
def test_proposal_expected_effects_accepts_object(self) -> None:
    """A mapping in ``expected_effects`` becomes ``"key: value"`` strings."""
    effects_by_metric = {
        "throughput": "higher",
        "ttft": "lower",
    }
    raw_proposal = {
        "observation": "obs",
        "diagnosis": "diag",
        "config_patch": {"env_patch": {}, "flag_patch": {}},
        "expected_effects": effects_by_metric,
    }
    parsed = Proposal.from_dict(raw_proposal)
    expected = ["throughput: higher", "ttft: lower"]
    self.assertEqual(parsed.expected_effects, expected)
def test_proposal_observation_accepts_object(self) -> None:
    """A mapping passed as ``observation`` ends up as text containing its JSON.

    The parsed observation must contain the ``"incumbent_trial"`` pair
    rendered as JSON, and the diagnosis must pass through unchanged.
    """
    structured_observation = {
        "incumbent_trial": "trial-0002",
        "boundary_signal": "tpot cliff",
    }
    raw_proposal = {
        "observation": structured_observation,
        "diagnosis": "validate incumbent",
        "config_patch": {"env_patch": {}, "flag_patch": {"max-num-seqs": 160}},
        "expected_effects": ["more TPOT headroom"],
    }
    parsed = Proposal.from_dict(raw_proposal)
    self.assertIn('"incumbent_trial": "trial-0002"', parsed.observation)
    self.assertEqual(parsed.diagnosis, "validate incumbent")
def test_proposal_accepts_should_stop(self) -> None:
    """``should_stop: True`` in the payload is exposed on the parsed proposal."""
    raw_proposal = {
        "observation": "obs",
        "diagnosis": "converged",
        "config_patch": {"env_patch": {}, "flag_patch": {}},
        "expected_effects": ["avoid wasting another GPU trial"],
        "should_stop": True,
    }
    parsed = Proposal.from_dict(raw_proposal)
    self.assertTrue(parsed.should_stop)
def test_parse_proposal_text_accepts_wrapped_json(self) -> None:
    """``parse_proposal_text`` extracts JSON from prose plus a fenced block.

    The reply starts with a sentence and wraps the JSON object in a
    ```` ```json ```` fence; the parsed patch must carry ``max-num-seqs=32``.
    """
    # Keep the literal flush-left: its exact bytes are the test input.
    wrapped_reply = """Here is the proposal:
```json
{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
```"""
    with tempfile.TemporaryDirectory() as workdir:
        study_path = _write_study_assets(Path(workdir))
        study = load_study_spec(study_path)
        parsed = parse_proposal_text(wrapped_reply, study)
        flag_patch = parsed.config_patch.flag_patch
        self.assertEqual(flag_patch["max-num-seqs"], 32)
def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
    """Replay stops early once the target pass rate can no longer be met.

    Three requests are queued with concurrency 1 and the first one fails.
    The replay must report an early stop with reason
    ``slo_pass_rate_unrecoverable``, still return three outcomes and tag
    the second outcome's ``error`` with that reason.
    """
    requests = []
    for index in range(3):
        requests.append(
            TraceRequest(
                row_id=f"r{index}",
                arrival_s=0.0,
                sampling_u=0.1 * index,
                body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
                prompt_tokens_hint=8,
                completion_tokens_hint=4,
            )
        )

    # Only one scripted outcome exists; a second real request would raise.
    failed_first = RequestOutcome(
        request_id="r0",
        success=False,
        ttft_ms=None,
        tpot_ms=None,
        prompt_tokens=8,
        completion_tokens=4,
        error="request_failed",
    )
    scripted_outcomes = [failed_first]

    def fake_run_one_request(*args, **kwargs):
        return scripted_outcomes.pop(0)

    def fake_evaluate(outcome: RequestOutcome):
        # Minimal stand-in exposing only ``passed``.
        return type("Eval", (), {"passed": outcome.success})()

    with mock.patch("aituner.worker._run_one_request", side_effect=fake_run_one_request):
        replayed, early_stopped, reason = _replay_requests(
            requests,
            base_url="http://127.0.0.1:8000",
            timeout_s=1.0,
            max_concurrency=1,
            target_pass_rate=0.95,
            max_lag_s=None,
            max_elapsed_s=None,
            evaluate_outcome=fake_evaluate,
        )

    self.assertTrue(early_stopped)
    self.assertEqual(reason, "slo_pass_rate_unrecoverable")
    self.assertEqual(len(replayed), 3)
    second_outcome = replayed[1]
    self.assertEqual(second_outcome.error, "slo_pass_rate_unrecoverable")
def test_replay_requests_does_not_wait_for_inflight_after_early_stop(self) -> None:
    """With draining disabled, in-flight futures are not awaited after early stop.

    The executor and ``wait`` are replaced by fakes: ``r0`` resolves to a
    failed outcome while ``r1`` stays pending behind a future whose
    ``result()`` raises if called.  Both requests must be submitted, the
    replay must stop early, and ``r1`` must be reported with the stop reason.
    """

    def make_request(row_id: str, sampling_u: float, content: str):
        return TraceRequest(
            row_id=row_id,
            arrival_s=0.0,
            sampling_u=sampling_u,
            body={"model": "m", "messages": [{"role": "user", "content": content}]},
            prompt_tokens_hint=8,
            completion_tokens_hint=4,
        )

    requests = [make_request("r0", 0.1, "x"), make_request("r1", 0.2, "y")]

    class FakeFuture:
        def __init__(self, outcome=None, *, should_fail_if_waited=False):
            self._outcome = outcome
            self._should_fail_if_waited = should_fail_if_waited

        def result(self, timeout=None):
            # Trip the test if the code under test blocks on a pending future.
            if not self._should_fail_if_waited:
                return self._outcome
            raise AssertionError("in-flight future should not be awaited after early stop")

        def cancel(self):
            return True

    failed_outcome = RequestOutcome(
        request_id="r0",
        success=False,
        ttft_ms=None,
        tpot_ms=None,
        prompt_tokens=8,
        completion_tokens=4,
        error="request_failed",
    )
    done_future = FakeFuture(failed_outcome)
    inflight_future = FakeFuture(should_fail_if_waited=True)
    submitted = []

    class FakeExecutor:
        def __init__(self, max_workers):
            self.max_workers = max_workers

        def submit(self, fn, request, **kwargs):
            submitted.append(request.row_id)
            return done_future if request.row_id == "r0" else inflight_future

        def shutdown(self, wait=False, cancel_futures=True):
            return None

    def fake_wait(futures, timeout=None, return_when=None):
        # Both futures must already be submitted when the replay first waits.
        self.assertEqual(len(futures), 2)
        return {done_future}, {inflight_future}

    def fake_evaluate(outcome: RequestOutcome):
        return type("Eval", (), {"passed": outcome.success})()

    executor_patch = mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor)
    wait_patch = mock.patch("aituner.worker.wait", side_effect=fake_wait)
    with executor_patch, wait_patch:
        replayed, early_stopped, reason = _replay_requests(
            requests,
            base_url="http://127.0.0.1:8000",
            timeout_s=30.0,
            max_concurrency=2,
            target_pass_rate=0.95,
            max_lag_s=None,
            max_elapsed_s=None,
            evaluate_outcome=fake_evaluate,
            drain_inflight_on_early_stop=False,
        )

    self.assertEqual(submitted, ["r0", "r1"])
    self.assertTrue(early_stopped)
    self.assertEqual(reason, "slo_pass_rate_unrecoverable")
    self.assertEqual(len(replayed), 2)
    self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")
def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:
    """``max_elapsed_s`` caps the replay even while a request is in flight.

    The single request never completes (the fake ``wait`` always reports
    it pending) and the patched monotonic clock jumps past the 1.0 s
    budget.  The replay must stop with ``probe_elapsed_s>1.0``, mark the
    request with that error, and use a first wait timeout of at most 0.5.
    """
    requests = [
        TraceRequest(
            row_id="r0",
            arrival_s=0.0,
            sampling_u=0.1,
            body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
            prompt_tokens_hint=8,
            completion_tokens_hint=4,
        )
    ]

    class FakeFuture:
        def result(self, timeout=None):
            raise AssertionError("future should not be awaited after elapsed early stop")

        def cancel(self):
            return True

    submitted = []

    class FakeExecutor:
        def __init__(self, max_workers):
            self.max_workers = max_workers

        def submit(self, fn, request, **kwargs):
            submitted.append(request.row_id)
            return FakeFuture()

        def shutdown(self, wait=False, cancel_futures=True):
            return None

    wait_timeouts: list[float] = []

    def fake_wait(futures, timeout=None, return_when=None):
        # Record the requested timeout and report nothing as finished.
        wait_timeouts.append(timeout)
        return set(), set(futures)

    def fake_evaluate(outcome: RequestOutcome):
        return type("Eval", (), {"passed": outcome.success})()

    # Scripted clock readings; the last one exceeds the elapsed budget.
    monotonic_values = iter([0.0, 0.0, 0.4, 1.2])

    def fake_monotonic():
        return next(monotonic_values)

    executor_patch = mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor)
    wait_patch = mock.patch("aituner.worker.wait", side_effect=fake_wait)
    clock_patch = mock.patch("aituner.worker.time.monotonic", side_effect=fake_monotonic)
    with executor_patch, wait_patch, clock_patch:
        replayed, early_stopped, reason = _replay_requests(
            requests,
            base_url="http://127.0.0.1:8000",
            timeout_s=30.0,
            max_concurrency=1,
            target_pass_rate=0.95,
            max_lag_s=None,
            max_elapsed_s=1.0,
            evaluate_outcome=fake_evaluate,
            drain_inflight_on_early_stop=False,
        )

    self.assertEqual(submitted, ["r0"])
    self.assertTrue(early_stopped)
    self.assertEqual(reason, "probe_elapsed_s>1.0")
    self.assertEqual(len(replayed), 1)
    self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")
    self.assertTrue(wait_timeouts)
    self.assertLessEqual(wait_timeouts[0], 0.5)
def test_latency_summary_reports_quantiles_and_slo(self) -> None:
    """``_latency_summary`` reports counts, mode, latency stats and the SLO.

    Two successful outcomes (TTFT 100/200 ms, TPOT 10/20 ms) are evaluated
    against the fixture study's SLO; the summary must expose the observed
    count, the ``chat`` request mode, mean/p50/p99 values and the target
    pass rate.

    The study assets are written into a ``TemporaryDirectory`` so the
    directory is removed afterwards; a bare ``mkdtemp()`` would leave one
    orphaned directory behind on every run.
    """
    with tempfile.TemporaryDirectory() as tmp:
        study = load_study_spec(_write_study_assets(Path(tmp)))
        outcomes = [
            RequestOutcome(
                request_id="r1",
                success=True,
                ttft_ms=100.0,
                tpot_ms=10.0,
                prompt_tokens=100,
                completion_tokens=10,
            ),
            RequestOutcome(
                request_id="r2",
                success=True,
                ttft_ms=200.0,
                tpot_ms=20.0,
                prompt_tokens=5000,
                completion_tokens=10,
            ),
        ]
        evaluations = [evaluate_request(item, study.slo) for item in outcomes]
        # Stay inside the context in case the summary needs files on disk.
        summary = _latency_summary(outcomes=outcomes, evaluations=evaluations, study=study)

    self.assertEqual(summary["observed_request_count"], 2)
    self.assertEqual(summary["request_mode"], "chat")
    self.assertEqual(summary["ttft_ms"]["mean"], 150.0)
    self.assertEqual(summary["ttft_ms"]["p50"], 100.0)
    self.assertEqual(summary["ttft_ms"]["p99"], 200.0)
    self.assertEqual(summary["tpot_ms"]["mean"], 15.0)
    self.assertEqual(summary["slo"]["target_pass_rate"], 0.95)
def test_wait_for_server_or_exit_fails_fast_when_process_exits(self) -> None:
    """An already-exited engine process makes the readiness wait raise.

    ``poll()`` reports exit code 17, and the raised ``RuntimeError`` must
    include that code in its message.
    """
    exited_process = mock.Mock()
    exited_process.poll.return_value = 17
    expected_message = "engine_process_exited_before_ready exit_code=17"
    with self.assertRaisesRegex(RuntimeError, expected_message):
        _wait_for_server_or_exit(
            exited_process,
            base_url="http://127.0.0.1:8000",
            healthcheck_path="/v1/models",
            ready_timeout_s=10.0,
        )
def test_terminate_process_tree_kills_process_group(self) -> None:
    """A live process has its process group signalled with SIGTERM first.

    ``os.getpgid`` is patched to return 1234 and ``os.killpg`` to succeed
    once and then raise ``ProcessLookupError``.  The first ``killpg`` call
    must target group 1234 with ``signal.SIGTERM``; the named constant
    replaces the bare ``15`` so the assertion matches the sibling
    marker-process test.
    """
    process = mock.Mock()
    process.pid = 1234
    process.poll.return_value = None
    process.wait.return_value = 0
    with mock.patch("aituner.worker.os.getpgid", return_value=1234):
        with mock.patch(
            "aituner.worker.os.killpg",
            side_effect=[None, ProcessLookupError],
        ) as mock_killpg:
            _terminate_process_tree(process, timeout_s=1.0)
    first_call = mock_killpg.call_args_list[0]
    self.assertEqual(first_call.args[0], 1234)
    self.assertEqual(first_call.args[1], signal.SIGTERM)
def test_terminate_process_tree_kills_group_when_parent_already_exited(self) -> None:
    """The group is still signalled when the parent process has exited.

    ``poll()`` returns 0 and ``os.getpgid`` raises ``ProcessLookupError``;
    the first ``killpg`` call must use the pid (1234) as the group id and
    ``process.wait`` must never be called.
    """
    exited_parent = mock.Mock()
    exited_parent.pid = 1234
    exited_parent.poll.return_value = 0

    getpgid_patch = mock.patch("aituner.worker.os.getpgid", side_effect=ProcessLookupError)
    killpg_patch = mock.patch(
        "aituner.worker.os.killpg",
        side_effect=[None, ProcessLookupError],
    )
    with getpgid_patch, killpg_patch as mock_killpg:
        _terminate_process_tree(exited_parent, timeout_s=1.0)

    signalled_group = mock_killpg.call_args_list[0].args[0]
    self.assertEqual(signalled_group, 1234)
    exited_parent.wait.assert_not_called()
def test_terminate_process_tree_signals_marker_processes_when_group_missing(self) -> None:
    """With no process group left, marker-env processes are signalled instead.

    ``getpgid`` and ``killpg`` both raise ``ProcessLookupError``.  The pid
    lookup must be called with the marker environment, and the first
    signalling call must send ``SIGTERM`` to the pid it returned.
    """
    exited_parent = mock.Mock()
    exited_parent.pid = 1234
    exited_parent.poll.return_value = 0
    marker_env = {"AITUNER_TRIAL_ID": "trial-0001"}

    with contextlib.ExitStack() as patches:
        patches.enter_context(
            mock.patch("aituner.worker.os.getpgid", side_effect=ProcessLookupError)
        )
        patches.enter_context(
            mock.patch("aituner.worker.os.killpg", side_effect=ProcessLookupError)
        )
        # First lookup finds one marked pid, the second finds none.
        mock_pids = patches.enter_context(
            mock.patch("aituner.worker._pids_matching_env", side_effect=[[2222], []])
        )
        mock_signal = patches.enter_context(mock.patch("aituner.worker._signal_pids"))
        _terminate_process_tree(
            exited_parent,
            timeout_s=1.0,
            marker_env=marker_env,
        )

    first_lookup = mock_pids.call_args_list[0]
    first_signal = mock_signal.call_args_list[0]
    self.assertEqual(first_lookup.args[0], marker_env)
    self.assertEqual(first_signal.args, ([2222], signal.SIGTERM))
def test_openai_url_avoids_double_v1(self) -> None:
    """``_openai_url`` yields a single ``/v1`` whether or not the base has one."""
    expected_url = "http://example.com/v1/chat/completions"
    for base_url in ("http://example.com", "http://example.com/v1"):
        built_url = _openai_url(base_url, "/v1/chat/completions")
        self.assertEqual(built_url, expected_url)
def test_stream_chat_completion_handles_missing_usage_and_chunks(self) -> None:
    """A stream with empty choices and no usage yields ``None`` metrics.

    The fake response emits one chunk with an empty ``choices`` list and
    then ``[DONE]``; TTFT, TPOT and the completion token count must be
    ``None`` and the token source must be ``"none"``.
    """
    stream_lines = [b'data: {"choices": []}\n', b"data: [DONE]\n"]

    class FakeResponse:
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, traceback):
            return False

        def __iter__(self):
            return iter(list(stream_lines))

    request_body = {"model": "m", "messages": [{"role": "user", "content": "x"}]}
    with mock.patch("aituner.http_client._urlopen", return_value=FakeResponse()):
        metrics = stream_chat_completion(
            base_url="http://127.0.0.1:8000",
            body=request_body,
            timeout_s=1.0,
        )

    for missing_value in (metrics.ttft_ms, metrics.tpot_ms, metrics.completion_tokens):
        self.assertIsNone(missing_value)
    self.assertEqual(metrics.completion_tokens_source, "none")
def test_loopback_urls_bypass_proxy(self) -> None:
    """Loopback hosts bypass the proxy; other hosts do not."""
    loopback_urls = (
        "http://127.0.0.1:8000/v1/models",
        "http://localhost:8000/health",
    )
    for url in loopback_urls:
        self.assertTrue(_should_bypass_proxy(url))
    self.assertFalse(_should_bypass_proxy("http://example.com/v1/models"))
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()