Initial AITuner study orchestrator

gahow
2026-04-04 21:26:37 +08:00
commit cdcca1d9d7
24 changed files with 3357 additions and 0 deletions

5
.gitignore vendored Normal file

@@ -0,0 +1,5 @@
.aituner/
__pycache__/
*.pyc
infra/gpu_fleet/config/fleet.toml
infra/gpu_fleet/config/jobs.toml


@@ -0,0 +1,14 @@
{
"prefill_service_by_bucket": {
"4k": {
"tp4_ms": 320,
"tp8_ms": 240
}
},
"queueing_knee_by_bucket": {
"4k": {
"tp4_tok_s_per_gpu": 1000,
"tp8_tok_s_per_gpu": 1100
}
}
}


@@ -0,0 +1,96 @@
{
"study_id": "example-chat-window",
"hardware": {
"gpu_count": 8,
"gpu_model": "H20",
"host_candidates": ["dash0", "dash1"]
},
"model": {
"model_id": "qwen3-30b",
"served_model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507"
},
"engine": {
"engine_name": "vllm",
"engine_version": "0.x",
"exec_path": "/usr/local/bin/vllm",
"cwd": ".",
"host": "127.0.0.1",
"port": 8000,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 600,
"request_timeout_s": 600,
"launch_args": [
"serve",
"/path/to/model"
],
"base_envs": {},
"base_flags": {
"host": "127.0.0.1",
"port": 8000,
"served-model-name": "Qwen/Qwen3-30B-A3B-Instruct-2507"
},
"tunable_envs": [
"VLLM_ATTENTION_BACKEND",
"CUDA_GRAPH_MAX_BATCH_SIZE"
],
"tunable_flags": [
"tensor-parallel-size",
"data-parallel-size",
"pipeline-parallel-size",
"max-num-seqs",
"max-num-batched-tokens",
"gpu-memory-utilization",
"enable-prefix-caching",
"block-size"
],
"python_executable": "python3"
},
"trace": {
"windows_path": "trace_windows/windows.json",
"window_id": "chat_w_example_peak_0001",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 64
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 16384,
"threshold_ms": 4000
},
{
"threshold_ms": 8000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 120
}
},
"search": {
"low": 0.0,
"high": 1.0,
"tolerance": 0.01,
"max_probes": 8,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target.",
"max_history_trials": 8,
"endpoint": {
"base_url": "https://example-openai-compatible-endpoint",
"model": "gpt-4.1-mini",
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 120
}
},
"capability_profile_path": "capability.example.json"
}
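
A hedged sketch of consuming a spec like the one above (the file name is an assumption): spec.py parses every section into typed dataclasses and raises SpecError on the first invalid field.

from pathlib import Path
from aituner.spec import load_study_spec

# Load and validate the study spec; the file name is illustrative.
study = load_study_spec(Path("study.example.json"))
print(study.study_id)         # "example-chat-window"
print(study.engine.base_url)  # property derived from host/port: "http://127.0.0.1:8000"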


@@ -0,0 +1,3 @@
{"request_id":"example-1","timestamp":0.0,"sampling_u":0.10,"messages":[{"role":"user","content":"hello"}],"input_length":512,"output_length":16}
{"request_id":"example-2","timestamp":1.0,"sampling_u":0.50,"messages":[{"role":"user","content":"summarize this file"}],"input_length":2048,"output_length":64}
{"request_id":"example-3","timestamp":2.5,"sampling_u":0.90,"messages":[{"role":"user","content":"write a longer answer"}],"input_length":8192,"output_length":128}


@@ -0,0 +1,15 @@
{
"sample_seed": 20260325,
"u_field": "sampling_u",
"window_duration_seconds": 10.0,
"windows": [
{
"window_id": "chat_w_example_peak_0001",
"trace_type": "chat",
"trace_file": "traces/chat_w_example_peak_0001.jsonl",
"window_start": 0.0,
"window_end": 10.0,
"num_requests": 3
}
]
}


@@ -0,0 +1,59 @@
version = 1
[paths]
state_dir = ".aituner/gpu_fleet/state"
artifacts_dir = ".aituner/gpu_fleet/artifacts"
[ssh]
connect_timeout_sec = 10
[scheduler]
gpu_free_memory_mb = 1024
gpu_free_utilization_pct = 10
prefer_pack = true
[sync]
mode = "rsync"
local_path = "."
exclude = [
".git/",
".venv/",
".aituner/",
"__pycache__/",
"*.pyc",
]
[[hosts]]
name = "dash0"
ssh_alias = "dash0"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"
[[hosts]]
name = "dash1"
ssh_alias = "dash1"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"
[[hosts]]
name = "dash2"
ssh_alias = "dash2"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"
[[hosts]]
name = "dash3"
ssh_alias = "dash3"
enabled = true
sync_remote_path = "~/aituner"
fleet_root = "~/.aituner_gpu_fleet"
[[hosts]]
name = "dash5"
ssh_alias = "dash5"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"


@@ -0,0 +1,27 @@
# This file is an append-only queue source for the monitor.
# Each job name must stay unique and immutable once appended.
version = 1
[[jobs]]
name = "smoke-train-h20-1gpu"
gpus = 1
gpu_model = "H20"
hosts = ["dash0", "dash1", "dash2"]
command = "python train.py --config configs/smoke.toml"
artifacts = ["outputs/smoke-train-h20-1gpu"]
env = { WANDB_MODE = "offline" }
[[jobs]]
name = "eval-5090-4gpu"
gpus = 4
gpu_model = "5090"
hosts = ["dash5"]
command = "python eval.py --config configs/eval.toml"
artifacts = ["outputs/eval-5090-4gpu", "logs/eval-5090-4gpu.log"]
[[jobs]]
name = "special-dash3-run"
gpus = 2
hosts = ["dash3"]
command = "python benchmark.py --suite long-context"
artifacts = ["outputs/special-dash3-run"]


@@ -0,0 +1,8 @@
# One SSH alias per line.
# Lines starting with "#" are ignored.
dash0
dash1
dash2
dash3
dash5

1132
infra/gpu_fleet/gpu_fleet.py Executable file

File diff suppressed because it is too large.

19
pyproject.toml Normal file

@@ -0,0 +1,19 @@
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"
[project]
name = "aituner"
version = "0.1.0"
description = "AITuner study orchestrator for OpenAI-compatible serving engines"
requires-python = ">=3.11"
dependencies = []
[project.scripts]
aituner = "aituner.cli:main"
[tool.setuptools]
package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]

5
src/aituner/__init__.py Normal file

@@ -0,0 +1,5 @@
"""AITuner package."""
__all__ = [
"cli",
]

177
src/aituner/cli.py Normal file

@@ -0,0 +1,177 @@
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from .job import append_job, build_trial_job
from .llm import build_prompt, call_llm_for_proposal, load_capability_profile, parse_proposal_text
from .spec import Proposal, SpecError, load_study_spec
from .store import StudyStore
from .trace import load_trace_requests, summarize_window
from .worker import run_trial
def _study_source_path(study_root: Path) -> Path:
return Path((study_root / "study_spec.source").read_text(encoding="utf-8").strip())
def cmd_study_init(args: argparse.Namespace) -> int:
spec_path = Path(args.spec).resolve()
study = load_study_spec(spec_path)
store = StudyStore(Path(args.store_root) if args.store_root else None)
root = store.init_study(spec_path=spec_path, study=study)
print(root)
return 0
def cmd_study_prompt(args: argparse.Namespace) -> int:
store = StudyStore(Path(args.store_root) if args.store_root else None)
study_root = Path(args.study_root).resolve()
source_path = _study_source_path(study_root)
study = load_study_spec(source_path)
state = store.load_state(study.study_id)
capability_profile = load_capability_profile(study, study_spec_path=source_path)
window, requests = load_trace_requests(study, study_spec_path=source_path)
prompt = build_prompt(
study=study,
window_summary=summarize_window(requests, window),
state=state,
capability_profile=capability_profile,
)
prompt_name = args.prompt_name or f"prompt-{state.next_trial_index:04d}"
path = store.write_prompt(study.study_id, prompt_name, prompt)
print(path)
return 0
def cmd_study_llm_propose(args: argparse.Namespace) -> int:
store = StudyStore(Path(args.store_root) if args.store_root else None)
study_root = Path(args.study_root).resolve()
source_path = _study_source_path(study_root)
study = load_study_spec(source_path)
state = store.load_state(study.study_id)
capability_profile = load_capability_profile(study, study_spec_path=source_path)
window, requests = load_trace_requests(study, study_spec_path=source_path)
prompt = build_prompt(
study=study,
window_summary=summarize_window(requests, window),
state=state,
capability_profile=capability_profile,
)
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
proposal = parse_proposal_text(proposal_text, study)
name = args.proposal_name or f"proposal-{state.next_trial_index:04d}"
path = store.write_proposal(study.study_id, name, proposal)
print(path)
return 0
def cmd_study_register_proposal(args: argparse.Namespace) -> int:
store = StudyStore(Path(args.store_root) if args.store_root else None)
study_root = Path(args.study_root).resolve()
source_path = _study_source_path(study_root)
study = load_study_spec(source_path)
proposal = parse_proposal_text(Path(args.proposal_file).read_text(encoding="utf-8"), study)
name = args.proposal_name or Path(args.proposal_file).stem
path = store.write_proposal(study.study_id, name, proposal)
print(path)
return 0
def cmd_study_emit_job(args: argparse.Namespace) -> int:
store = StudyStore(Path(args.store_root) if args.store_root else None)
study_root = Path(args.study_root).resolve()
source_path = _study_source_path(study_root)
study = load_study_spec(source_path)
state = store.load_state(study.study_id)
proposal_text = Path(args.proposal_file).read_text(encoding="utf-8")
proposal = parse_proposal_text(proposal_text, study)
trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
repo_root = Path(__file__).resolve().parents[2]
job = build_trial_job(study=study, trial=trial, repo_root=repo_root)
append_job(Path(args.jobs_file).resolve(), job)
print(trial.trial_id)
return 0
def cmd_study_ingest(args: argparse.Namespace) -> int:
store = StudyStore(Path(args.store_root) if args.store_root else None)
study_root = Path(args.study_root).resolve()
study_id = study_root.name
state = store.ingest_trial_results(study_id)
print(json.dumps({"best_trial_id": state.best_trial_id, "best_request_rate": state.best_request_rate}))
return 0
def cmd_worker_run_trial(args: argparse.Namespace) -> int:
result = run_trial(Path(args.trial_spec).resolve())
print(json.dumps(result))
return 0
def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="AITuner CLI")
subparsers = parser.add_subparsers(dest="command", required=True)
study = subparsers.add_parser("study")
study_sub = study.add_subparsers(dest="study_command", required=True)
init = study_sub.add_parser("init")
init.add_argument("--spec", required=True)
init.add_argument("--store-root")
init.set_defaults(func=cmd_study_init)
prompt = study_sub.add_parser("prompt")
prompt.add_argument("--study-root", required=True)
prompt.add_argument("--store-root")
prompt.add_argument("--prompt-name")
prompt.set_defaults(func=cmd_study_prompt)
llm_propose = study_sub.add_parser("llm-propose")
llm_propose.add_argument("--study-root", required=True)
llm_propose.add_argument("--store-root")
llm_propose.add_argument("--proposal-name")
llm_propose.set_defaults(func=cmd_study_llm_propose)
register = study_sub.add_parser("register-proposal")
register.add_argument("--study-root", required=True)
register.add_argument("--store-root")
register.add_argument("--proposal-file", required=True)
register.add_argument("--proposal-name")
register.set_defaults(func=cmd_study_register_proposal)
emit = study_sub.add_parser("emit-job")
emit.add_argument("--study-root", required=True)
emit.add_argument("--store-root")
emit.add_argument("--proposal-file", required=True)
emit.add_argument("--jobs-file", required=True)
emit.set_defaults(func=cmd_study_emit_job)
ingest = study_sub.add_parser("ingest")
ingest.add_argument("--study-root", required=True)
ingest.add_argument("--store-root")
ingest.set_defaults(func=cmd_study_ingest)
worker = subparsers.add_parser("worker")
worker_sub = worker.add_subparsers(dest="worker_command", required=True)
run = worker_sub.add_parser("run-trial")
run.add_argument("--trial-spec", required=True)
run.set_defaults(func=cmd_worker_run_trial)
return parser
def main(argv: list[str] | None = None) -> int:
parser = build_parser()
args = parser.parse_args(argv)
try:
return int(args.func(args))
except SpecError as exc:
print(str(exc), file=sys.stderr)
return 2
if __name__ == "__main__":
raise SystemExit(main())
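
The subcommands chain into one study iteration. A minimal in-process driver sketch, assuming the default store root (.aituner/studies), the example spec file name, and that OPENAI_API_KEY is set for the llm-propose step; all paths here are illustrative.

from aituner.cli import main

main(["study", "init", "--spec", "study.example.json"])
main(["study", "prompt", "--study-root", ".aituner/studies/example-chat-window"])
main(["study", "llm-propose", "--study-root", ".aituner/studies/example-chat-window"])
main(["study", "emit-job",
      "--study-root", ".aituner/studies/example-chat-window",
      "--proposal-file", ".aituner/studies/example-chat-window/proposals/proposal-0001.json",
      "--jobs-file", "infra/gpu_fleet/config/jobs.toml"])
# After the fleet runs the trial and result.json lands:
main(["study", "ingest", "--study-root", ".aituner/studies/example-chat-window"])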

65
src/aituner/engine.py Normal file

@@ -0,0 +1,65 @@
from __future__ import annotations
import os
import shlex
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from .spec import ConfigPatch, EngineLaunchSpec
@dataclass(frozen=True)
class LaunchRecipe:
argv: list[str]
env: dict[str, str]
cwd: str | None
base_url: str
healthcheck_path: str
ready_timeout_s: float
request_timeout_s: float
def _normalize_flag_name(name: str) -> str:
return str(name).strip().replace("_", "-")
def _serialize_flag_parts(name: str, value: Any) -> list[str]:
flag = f"--{_normalize_flag_name(name)}"
if value is None:
return []
if isinstance(value, bool):
return [flag] if value else [f"--no-{_normalize_flag_name(name)}"]
if isinstance(value, list):
parts: list[str] = []
for item in value:
parts.extend([flag, str(item)])
return parts
return [flag, str(value)]
def build_launch_recipe(spec: EngineLaunchSpec, patch: ConfigPatch) -> LaunchRecipe:
env = dict(os.environ)
env.update(spec.base_envs)
env.update(patch.env_patch)
flags = dict(spec.base_flags)
flags.update(patch.flag_patch)
argv = [spec.exec_path, *spec.launch_args]
for key, value in flags.items():
argv.extend(_serialize_flag_parts(key, value))
cwd = None
if spec.cwd:
cwd = str(Path(spec.cwd).expanduser())
return LaunchRecipe(
argv=argv,
env={str(key): str(value) for key, value in env.items()},
cwd=cwd,
base_url=spec.base_url,
healthcheck_path=spec.healthcheck_path,
ready_timeout_s=spec.ready_timeout_s,
request_timeout_s=spec.request_timeout_s,
)
def shell_join(argv: list[str]) -> str:
return " ".join(shlex.quote(item) for item in argv)

147
src/aituner/http_client.py Normal file

@@ -0,0 +1,147 @@
from __future__ import annotations
import json
import os
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from typing import Any, Iterable
class HttpClientError(RuntimeError):
"""Raised for HTTP client failures."""
def _auth_headers(api_key_env: str | None) -> dict[str, str]:
headers = {"Content-Type": "application/json"}
if api_key_env:
api_key = os.environ.get(api_key_env)
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
return headers
def wait_for_server(base_url: str, path: str, timeout_s: float) -> None:
deadline = time.monotonic() + timeout_s
url = f"{base_url.rstrip('/')}{path}"
last_error = "server_not_ready"
while time.monotonic() < deadline:
try:
request = urllib.request.Request(url=url, headers=_auth_headers(None), method="GET")
with urllib.request.urlopen(request, timeout=5) as response:
if 200 <= response.status < 500:
return
except Exception as exc: # noqa: BLE001
last_error = str(exc)
time.sleep(1.0)
raise HttpClientError(f"Timed out waiting for {url}: {last_error}")
def chat_completion(
*,
base_url: str,
api_key_env: str | None,
model: str,
messages: list[dict[str, Any]],
timeout_s: float,
system_prompt: str = "",
) -> dict[str, Any]:
payload: dict[str, Any] = {"model": model, "messages": messages}
if system_prompt:
payload["messages"] = [{"role": "system", "content": system_prompt}, *messages]
data = json.dumps(payload).encode("utf-8")
request = urllib.request.Request(
url=f"{base_url.rstrip('/')}/v1/chat/completions",
headers=_auth_headers(api_key_env),
data=data,
method="POST",
)
try:
with urllib.request.urlopen(request, timeout=timeout_s) as response:
return json.loads(response.read().decode("utf-8"))
except urllib.error.HTTPError as exc:
detail = exc.read().decode("utf-8", errors="replace")
raise HttpClientError(f"chat_completion failed: {exc.code} {detail}") from exc
@dataclass(frozen=True)
class StreamMetrics:
ttft_ms: float | None
tpot_ms: float | None
completion_tokens: int | None
def stream_chat_completion(
*,
base_url: str,
body: dict[str, Any],
timeout_s: float,
) -> StreamMetrics:
data = json.dumps(body).encode("utf-8")
request = urllib.request.Request(
url=f"{base_url.rstrip('/')}/v1/chat/completions",
headers=_auth_headers(None),
data=data,
method="POST",
)
start = time.monotonic()
first_token_at: float | None = None
last_token_at: float | None = None
chunk_token_count = 0
completion_tokens: int | None = None
try:
with urllib.request.urlopen(request, timeout=timeout_s) as response:
for raw in _iter_sse_lines(response):
if raw == "[DONE]":
break
payload = json.loads(raw)
if not isinstance(payload, dict):
continue
usage = payload.get("usage")
if isinstance(usage, dict):
comp = usage.get("completion_tokens")
if isinstance(comp, int) and comp >= 0:
completion_tokens = comp
choices = payload.get("choices")
if not isinstance(choices, list) or not choices:
continue
delta = choices[0].get("delta", {})
if not isinstance(delta, dict):
continue
content = delta.get("content")
if isinstance(content, str) and content:
now = time.monotonic()
if first_token_at is None:
first_token_at = now
last_token_at = now
chunk_token_count += 1
except urllib.error.HTTPError as exc:
detail = exc.read().decode("utf-8", errors="replace")
raise HttpClientError(f"stream_chat_completion failed: {exc.code} {detail}") from exc
ttft_ms = None if first_token_at is None else (first_token_at - start) * 1000.0
used_tokens = completion_tokens if completion_tokens is not None else chunk_token_count
if (
first_token_at is None
or last_token_at is None
or used_tokens is None
or used_tokens <= 1
):
tpot_ms = None
else:
tpot_ms = ((last_token_at - first_token_at) / max(used_tokens - 1, 1)) * 1000.0
return StreamMetrics(
ttft_ms=ttft_ms,
tpot_ms=tpot_ms,
completion_tokens=used_tokens if used_tokens > 0 else None,
)
def _iter_sse_lines(response: Any) -> Iterable[str]:
for raw in response:
line = raw.decode("utf-8", errors="replace").strip()
if not line.startswith("data:"):
continue
payload = line[len("data:") :].strip()
if payload:
yield payload
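
A hedged usage sketch against a locally launched engine (URL and model name are placeholders, not fixed by this module): block until the health endpoint answers, then replay one streaming request and read the derived latency metrics.

from aituner.http_client import stream_chat_completion, wait_for_server

wait_for_server("http://127.0.0.1:8000", "/v1/models", timeout_s=60)
metrics = stream_chat_completion(
    base_url="http://127.0.0.1:8000",
    body={
        "model": "Qwen/Qwen3-30B-A3B-Instruct-2507",
        "messages": [{"role": "user", "content": "hello"}],
        "stream": True,
        "stream_options": {"include_usage": True},
    },
    timeout_s=120,
)
print(metrics.ttft_ms, metrics.tpot_ms, metrics.completion_tokens)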

75
src/aituner/job.py Normal file

@@ -0,0 +1,75 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from .spec import StudySpec, TrialSpec
@dataclass(frozen=True)
class InfraJob:
name: str
gpus: int
gpu_model: str | None
hosts: list[str]
command: str
artifacts: list[str]
env: dict[str, str]
def _toml_scalar(value: Any) -> str:
if isinstance(value, bool):
return "true" if value else "false"
if isinstance(value, int):
return str(value)
text = str(value).replace("\\", "\\\\").replace('"', '\\"')
return f'"{text}"'
def _toml_list(values: list[Any]) -> str:
return "[" + ", ".join(_toml_scalar(item) for item in values) + "]"
def _toml_inline_table(mapping: dict[str, str]) -> str:
parts = [f"{key} = {_toml_scalar(value)}" for key, value in sorted(mapping.items())]
return "{ " + ", ".join(parts) + " }"
def build_trial_job(*, study: StudySpec, trial: TrialSpec, repo_root: Path) -> InfraJob:
trial_path = Path(trial.artifact_dir) / "trial_spec.json"
rel_trial_path = trial_path.resolve().relative_to(repo_root.resolve())
rel_trial_dir = Path(trial.artifact_dir).resolve().relative_to(repo_root.resolve())
command = (
f"{study.engine.python_executable} -m aituner.cli worker run-trial "
f"--trial-spec {rel_trial_path}"
)
env = {"PYTHONPATH": "src"}
return InfraJob(
name=f"{study.study_id}-{trial.trial_id}",
gpus=study.hardware.gpu_count,
gpu_model=study.hardware.gpu_model,
hosts=list(study.hardware.host_candidates),
command=command,
artifacts=[str(rel_trial_dir)],
env=env,
)
def append_job(jobs_path: Path, job: InfraJob) -> None:
jobs_path.parent.mkdir(parents=True, exist_ok=True)
with jobs_path.open("a", encoding="utf-8") as handle:
if jobs_path.stat().st_size == 0:
handle.write("version = 1\n")
handle.write("\n[[jobs]]\n")
handle.write(f"name = {_toml_scalar(job.name)}\n")
handle.write(f"gpus = {job.gpus}\n")
if job.gpu_model:
handle.write(f"gpu_model = {_toml_scalar(job.gpu_model)}\n")
if job.hosts:
handle.write(f"hosts = {_toml_list(job.hosts)}\n")
handle.write(f"command = {_toml_scalar(job.command)}\n")
if job.artifacts:
handle.write(f"artifacts = {_toml_list(job.artifacts)}\n")
if job.env:
handle.write(f"env = {_toml_inline_table(job.env)}\n")

144
src/aituner/llm.py Normal file

@@ -0,0 +1,144 @@
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from .http_client import chat_completion
from .spec import LLMPolicySpec, Proposal, SpecError, StudySpec, StudyState
def build_prompt(
*,
study: StudySpec,
window_summary: dict[str, Any],
state: StudyState,
capability_profile: dict[str, Any] | None,
) -> str:
history = []
for trial in state.trials[-study.llm.max_history_trials :]:
history.append(
{
"trial_id": trial.trial_id,
"status": trial.status,
"best_sampling_u": trial.best_sampling_u,
"best_request_rate": trial.best_request_rate,
"best_pass_rate": trial.best_pass_rate,
"diagnosis": trial.diagnosis,
}
)
sections = [
"You are tuning an OpenAI-compatible serving engine.",
"Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures.",
"config_patch must contain env_patch and flag_patch.",
"Only use allowed tunable env keys and allowed tunable flag keys.",
"",
"Study stack:",
json.dumps(
{
"study_id": study.study_id,
"hardware": {
"gpu_count": study.hardware.gpu_count,
"gpu_model": study.hardware.gpu_model,
},
"model": {
"model_id": study.model.model_id,
"served_model_name": study.model.served_model_name,
},
"engine": {
"engine_name": study.engine.engine_name,
"engine_version": study.engine.engine_version,
"base_flags": study.engine.base_flags,
"base_envs": study.engine.base_envs,
"allowed_flag_keys": study.engine.tunable_flags,
"allowed_env_keys": study.engine.tunable_envs,
},
},
ensure_ascii=False,
indent=2,
),
"",
"Window summary:",
json.dumps(window_summary, ensure_ascii=False, indent=2),
"",
"SLO:",
json.dumps(
{
"target_pass_rate": study.slo.target_pass_rate,
"ttft_rule": study.slo.ttft_rule,
"tpot_rule": study.slo.tpot_rule,
},
default=lambda value: value.__dict__,
ensure_ascii=False,
indent=2,
),
"",
"Capability profile:",
json.dumps(capability_profile or {}, ensure_ascii=False, indent=2),
"",
"Trial history:",
json.dumps(history, ensure_ascii=False, indent=2),
"",
"The proposal should improve the maximum feasible sampling_u under the 95%+ SLO target.",
]
return "\n".join(sections)
def load_capability_profile(study: StudySpec, *, study_spec_path: Path) -> dict[str, Any] | None:
if not study.capability_profile_path:
return None
path = Path(study.capability_profile_path)
if not path.is_absolute():
path = (study_spec_path.parent / path).resolve()
return json.loads(path.read_text(encoding="utf-8"))
def validate_proposal(proposal: Proposal, study: StudySpec) -> Proposal:
unknown_envs = sorted(set(proposal.config_patch.env_patch) - set(study.engine.tunable_envs))
unknown_flags = sorted(
set(proposal.config_patch.flag_patch) - set(study.engine.tunable_flags)
)
if unknown_envs:
raise SpecError(f"Proposal uses unsupported env keys: {', '.join(unknown_envs)}")
if unknown_flags:
raise SpecError(f"Proposal uses unsupported flag keys: {', '.join(unknown_flags)}")
return proposal
def parse_proposal_text(text: str, study: StudySpec) -> Proposal:
payload = json.loads(text)
proposal = Proposal.from_dict(payload)
return validate_proposal(proposal, study)
def call_llm_for_proposal(
*,
policy: LLMPolicySpec,
prompt: str,
) -> str:
if policy.endpoint is None:
raise RuntimeError("study.llm.endpoint is not configured")
response = chat_completion(
base_url=policy.endpoint.base_url,
api_key_env=policy.endpoint.api_key_env,
model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt,
)
choices = response.get("choices")
if not isinstance(choices, list) or not choices:
raise RuntimeError("LLM response does not contain choices")
message = choices[0].get("message", {})
if not isinstance(message, dict):
raise RuntimeError("LLM response does not contain a valid message")
content = message.get("content")
if isinstance(content, str):
return content
if isinstance(content, list):
return "".join(
item.get("text", "")
for item in content
if isinstance(item, dict) and isinstance(item.get("text"), str)
)
raise RuntimeError("LLM response content is empty")

58
src/aituner/search.py Normal file

@@ -0,0 +1,58 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Callable, Generic, TypeVar
T = TypeVar("T")
@dataclass(frozen=True)
class ThresholdProbe(Generic[T]):
threshold: float
feasible: bool
payload: T
@dataclass(frozen=True)
class ThresholdSearchResult(Generic[T]):
best_threshold: float
best_feasible_payload: T | None
probes: list[ThresholdProbe[T]]
def binary_search_max_feasible(
*,
low: float,
high: float,
tolerance: float,
max_probes: int,
evaluator: Callable[[float], ThresholdProbe[T]],
) -> ThresholdSearchResult[T]:
probes: list[ThresholdProbe[T]] = []
cache: dict[float, ThresholdProbe[T]] = {}
best_threshold = low
best_payload: T | None = None
cur_low = low
cur_high = high
for _ in range(max_probes):
if cur_high - cur_low <= tolerance:
break
threshold = round((cur_low + cur_high) / 2.0, 12)
probe = cache.get(threshold)
if probe is None:
probe = evaluator(threshold)
cache[threshold] = probe
probes.append(probe)
if probe.feasible:
if threshold >= best_threshold:
best_threshold = threshold
best_payload = probe.payload
cur_low = threshold
else:
cur_high = threshold
return ThresholdSearchResult(
best_threshold=best_threshold,
best_feasible_payload=best_payload,
probes=probes,
)
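
The search bisects [low, high] for up to max_probes midpoints and keeps the highest feasible one; the payload rides along so the caller gets that probe's metrics back. A toy sketch where feasibility is a fixed cutoff:

from aituner.search import ThresholdProbe, binary_search_max_feasible

result = binary_search_max_feasible(
    low=0.0,
    high=1.0,
    tolerance=0.01,
    max_probes=8,
    evaluator=lambda t: ThresholdProbe(threshold=t, feasible=t <= 0.62, payload=t),
)
print(result.best_threshold)  # converges on 0.62 from below (0.6171875 here)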

80
src/aituner/slo.py Normal file

@@ -0,0 +1,80 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from .spec import SloSpec, ThresholdRule
@dataclass(frozen=True)
class RequestOutcome:
request_id: str
success: bool
ttft_ms: float | None
tpot_ms: float | None
prompt_tokens: int | None
completion_tokens: int | None
error: str = ""
@dataclass(frozen=True)
class RequestEvaluation:
request_id: str
passed: bool
reasons: list[str]
def _rule_threshold_ms(rule: ThresholdRule, prompt_tokens: int | None) -> float:
if rule.kind == "fixed_ms":
assert rule.threshold_ms is not None
return rule.threshold_ms
if rule.kind != "step_ms":
raise ValueError(f"Unsupported threshold rule: {rule.kind}")
prompt = float(prompt_tokens or 0)
for bucket in rule.buckets:
ceiling = bucket.get("max_input_tokens")
if ceiling is None or prompt <= ceiling:
return float(bucket["threshold_ms"])
return float(rule.buckets[-1]["threshold_ms"])
def evaluate_request(outcome: RequestOutcome, slo: SloSpec) -> RequestEvaluation:
reasons: list[str] = []
if not outcome.success:
reasons.append(outcome.error or "request_failed")
return RequestEvaluation(request_id=outcome.request_id, passed=False, reasons=reasons)
if slo.ttft_rule is not None:
if outcome.ttft_ms is None:
reasons.append("ttft_missing")
else:
threshold = _rule_threshold_ms(slo.ttft_rule, outcome.prompt_tokens)
if outcome.ttft_ms > threshold:
reasons.append(f"ttft_ms>{threshold}")
if slo.tpot_rule is not None:
if outcome.tpot_ms is None:
reasons.append("tpot_missing")
else:
threshold = _rule_threshold_ms(slo.tpot_rule, outcome.prompt_tokens)
if outcome.tpot_ms > threshold:
reasons.append(f"tpot_ms>{threshold}")
return RequestEvaluation(
request_id=outcome.request_id,
passed=not reasons,
reasons=reasons,
)
def summarize_evaluations(
outcomes: list[RequestOutcome], slo: SloSpec
) -> tuple[list[RequestEvaluation], dict[str, Any]]:
evaluations = [evaluate_request(item, slo) for item in outcomes]
total = len(evaluations)
passed = sum(1 for item in evaluations if item.passed)
pass_rate = (passed / total) if total else 0.0
return evaluations, {
"request_count": total,
"slo_pass_count": passed,
"slo_pass_rate": pass_rate,
"target_pass_rate": slo.target_pass_rate,
"feasible": pass_rate >= slo.target_pass_rate,
}
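
A minimal sketch wiring the two modules together: build an SloSpec from the same dict shape the study spec uses, then score one successful request (numbers illustrative).

from aituner.slo import RequestOutcome, summarize_evaluations
from aituner.spec import SloSpec

slo = SloSpec.from_dict({
    "target_pass_rate": 0.95,
    "ttft_rule": {"kind": "step_ms",
                  "buckets": [{"max_input_tokens": 4096, "threshold_ms": 2000},
                              {"threshold_ms": 8000}]},
    "tpot_rule": {"kind": "fixed_ms", "threshold_ms": 120},
})
outcome = RequestOutcome(request_id="r1", success=True, ttft_ms=800.0,
                         tpot_ms=95.0, prompt_tokens=512, completion_tokens=64)
evaluations, summary = summarize_evaluations([outcome], slo)
print(summary["slo_pass_rate"], summary["feasible"])  # 1.0 True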

440
src/aituner/spec.py Normal file

@@ -0,0 +1,440 @@
from __future__ import annotations
import json
import tomllib
from dataclasses import asdict, dataclass, field, is_dataclass
from pathlib import Path
from typing import Any, Mapping
class SpecError(ValueError):
"""Raised when a structured spec is invalid."""
def _require_mapping(value: Any, *, context: str) -> Mapping[str, Any]:
if not isinstance(value, Mapping):
raise SpecError(f"{context} must be an object.")
return value
def _require_str(value: Any, *, context: str) -> str:
if not isinstance(value, str) or not value.strip():
raise SpecError(f"{context} must be a non-empty string.")
return value.strip()
def _require_float(value: Any, *, context: str) -> float:
if isinstance(value, bool) or not isinstance(value, (int, float)):
raise SpecError(f"{context} must be numeric.")
return float(value)
def _require_int(value: Any, *, context: str) -> int:
if isinstance(value, bool) or not isinstance(value, int):
raise SpecError(f"{context} must be an integer.")
return value
def _coerce_str_map(value: Any, *, context: str) -> dict[str, str]:
mapping = _require_mapping(value or {}, context=context)
return {str(key): str(item) for key, item in mapping.items()}
def _coerce_any_map(value: Any, *, context: str) -> dict[str, Any]:
mapping = _require_mapping(value or {}, context=context)
return {str(key): item for key, item in mapping.items()}
def _coerce_str_list(value: Any, *, context: str) -> list[str]:
if value is None:
return []
if not isinstance(value, list):
raise SpecError(f"{context} must be a list.")
result: list[str] = []
for item in value:
result.append(_require_str(item, context=context))
return result
@dataclass(frozen=True)
class HardwareSpec:
gpu_count: int
gpu_model: str | None = None
host_candidates: list[str] = field(default_factory=list)
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "HardwareSpec":
return cls(
gpu_count=_require_int(data.get("gpu_count"), context="hardware.gpu_count"),
gpu_model=str(data["gpu_model"]).strip() if data.get("gpu_model") else None,
host_candidates=_coerce_str_list(
data.get("host_candidates"), context="hardware.host_candidates"
),
)
@dataclass(frozen=True)
class ModelSpec:
model_id: str
served_model_name: str
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "ModelSpec":
return cls(
model_id=_require_str(data.get("model_id"), context="model.model_id"),
served_model_name=_require_str(
data.get("served_model_name"), context="model.served_model_name"
),
)
@dataclass(frozen=True)
class EngineLaunchSpec:
engine_name: str
engine_version: str | None
exec_path: str
cwd: str | None
host: str
port: int
ready_timeout_s: float
request_timeout_s: float
healthcheck_path: str
launch_args: list[str]
base_envs: dict[str, str]
base_flags: dict[str, Any]
tunable_envs: list[str]
tunable_flags: list[str]
python_executable: str = "python3"
@property
def base_url(self) -> str:
return f"http://{self.host}:{self.port}"
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "EngineLaunchSpec":
return cls(
engine_name=_require_str(data.get("engine_name"), context="engine.engine_name"),
engine_version=str(data["engine_version"]).strip()
if data.get("engine_version")
else None,
exec_path=_require_str(data.get("exec_path"), context="engine.exec_path"),
cwd=str(data["cwd"]).strip() if data.get("cwd") else None,
host=str(data.get("host") or "127.0.0.1").strip(),
port=_require_int(data.get("port", 8000), context="engine.port"),
ready_timeout_s=_require_float(
data.get("ready_timeout_s", 600.0), context="engine.ready_timeout_s"
),
request_timeout_s=_require_float(
data.get("request_timeout_s", 600.0),
context="engine.request_timeout_s",
),
healthcheck_path=str(data.get("healthcheck_path") or "/v1/models").strip(),
launch_args=_coerce_str_list(data.get("launch_args"), context="engine.launch_args"),
base_envs=_coerce_str_map(data.get("base_envs"), context="engine.base_envs"),
base_flags=_coerce_any_map(data.get("base_flags"), context="engine.base_flags"),
tunable_envs=_coerce_str_list(
data.get("tunable_envs"), context="engine.tunable_envs"
),
tunable_flags=_coerce_str_list(
data.get("tunable_flags"), context="engine.tunable_flags"
),
python_executable=str(data.get("python_executable") or "python3").strip(),
)
@dataclass(frozen=True)
class TraceSpec:
windows_path: str
window_id: str
trace_file_override: str | None
u_field: str
timestamp_field: str
max_concurrency: int
max_requests_per_probe: int | None = None
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "TraceSpec":
max_requests = data.get("max_requests_per_probe")
return cls(
windows_path=_require_str(data.get("windows_path"), context="trace.windows_path"),
window_id=_require_str(data.get("window_id"), context="trace.window_id"),
trace_file_override=str(data["trace_file_override"]).strip()
if data.get("trace_file_override")
else None,
u_field=str(data.get("u_field") or "sampling_u").strip(),
timestamp_field=str(data.get("timestamp_field") or "timestamp").strip(),
max_concurrency=_require_int(
data.get("max_concurrency", 64), context="trace.max_concurrency"
),
max_requests_per_probe=int(max_requests) if max_requests is not None else None,
)
@dataclass(frozen=True)
class ThresholdRule:
kind: str
threshold_ms: float | None = None
buckets: list[dict[str, float]] = field(default_factory=list)
@classmethod
def from_dict(cls, data: Mapping[str, Any], *, context: str) -> "ThresholdRule":
kind = _require_str(data.get("kind"), context=f"{context}.kind")
if kind == "fixed_ms":
return cls(
kind=kind,
threshold_ms=_require_float(
data.get("threshold_ms"), context=f"{context}.threshold_ms"
),
)
if kind == "step_ms":
raw = data.get("buckets")
if not isinstance(raw, list) or not raw:
raise SpecError(f"{context}.buckets must be a non-empty list.")
buckets: list[dict[str, float]] = []
for idx, item in enumerate(raw):
mapping = _require_mapping(item, context=f"{context}.buckets[{idx}]")
bucket: dict[str, float] = {
"threshold_ms": _require_float(
mapping.get("threshold_ms"),
context=f"{context}.buckets[{idx}].threshold_ms",
)
}
if "max_input_tokens" in mapping and mapping["max_input_tokens"] is not None:
bucket["max_input_tokens"] = _require_float(
mapping["max_input_tokens"],
context=f"{context}.buckets[{idx}].max_input_tokens",
)
buckets.append(bucket)
return cls(kind=kind, buckets=buckets)
raise SpecError(f"Unsupported threshold rule kind: {kind}")
@dataclass(frozen=True)
class SloSpec:
target_pass_rate: float
ttft_rule: ThresholdRule | None
tpot_rule: ThresholdRule | None
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "SloSpec":
ttft_rule = (
ThresholdRule.from_dict(
_require_mapping(data["ttft_rule"], context="slo.ttft_rule"),
context="slo.ttft_rule",
)
if data.get("ttft_rule")
else None
)
tpot_rule = (
ThresholdRule.from_dict(
_require_mapping(data["tpot_rule"], context="slo.tpot_rule"),
context="slo.tpot_rule",
)
if data.get("tpot_rule")
else None
)
return cls(
target_pass_rate=_require_float(
data.get("target_pass_rate", 0.95), context="slo.target_pass_rate"
),
ttft_rule=ttft_rule,
tpot_rule=tpot_rule,
)
@dataclass(frozen=True)
class SamplingSearchSpec:
low: float
high: float
tolerance: float
max_probes: int
sample_seed: int
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "SamplingSearchSpec":
return cls(
low=_require_float(data.get("low", 0.0), context="search.low"),
high=_require_float(data.get("high", 1.0), context="search.high"),
tolerance=_require_float(
data.get("tolerance", 0.01), context="search.tolerance"
),
max_probes=_require_int(data.get("max_probes", 8), context="search.max_probes"),
sample_seed=_require_int(
data.get("sample_seed", 20260325), context="search.sample_seed"
),
)
@dataclass(frozen=True)
class LLMEndpointSpec:
base_url: str
model: str
api_key_env: str = "OPENAI_API_KEY"
timeout_s: float = 120.0
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "LLMEndpointSpec":
return cls(
base_url=_require_str(data.get("base_url"), context="llm.endpoint.base_url"),
model=_require_str(data.get("model"), context="llm.endpoint.model"),
api_key_env=str(data.get("api_key_env") or "OPENAI_API_KEY").strip(),
timeout_s=_require_float(
data.get("timeout_s", 120.0), context="llm.endpoint.timeout_s"
),
)
@dataclass(frozen=True)
class LLMPolicySpec:
endpoint: LLMEndpointSpec | None
system_prompt: str
max_history_trials: int
@classmethod
def from_dict(cls, data: Mapping[str, Any] | None) -> "LLMPolicySpec":
payload = _require_mapping(data or {}, context="llm")
endpoint = (
LLMEndpointSpec.from_dict(
_require_mapping(payload["endpoint"], context="llm.endpoint")
)
if payload.get("endpoint")
else None
)
return cls(
endpoint=endpoint,
system_prompt=str(payload.get("system_prompt") or "").strip(),
max_history_trials=_require_int(
payload.get("max_history_trials", 8), context="llm.max_history_trials"
),
)
@dataclass(frozen=True)
class StudySpec:
study_id: str
hardware: HardwareSpec
model: ModelSpec
engine: EngineLaunchSpec
trace: TraceSpec
slo: SloSpec
search: SamplingSearchSpec
llm: LLMPolicySpec
capability_profile_path: str | None = None
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "StudySpec":
return cls(
study_id=_require_str(data.get("study_id"), context="study_id"),
hardware=HardwareSpec.from_dict(
_require_mapping(data.get("hardware"), context="hardware")
),
model=ModelSpec.from_dict(_require_mapping(data.get("model"), context="model")),
engine=EngineLaunchSpec.from_dict(
_require_mapping(data.get("engine"), context="engine")
),
trace=TraceSpec.from_dict(_require_mapping(data.get("trace"), context="trace")),
slo=SloSpec.from_dict(_require_mapping(data.get("slo"), context="slo")),
search=SamplingSearchSpec.from_dict(
_require_mapping(data.get("search"), context="search")
),
llm=LLMPolicySpec.from_dict(data.get("llm")),
capability_profile_path=str(data["capability_profile_path"]).strip()
if data.get("capability_profile_path")
else None,
)
@dataclass(frozen=True)
class ConfigPatch:
env_patch: dict[str, str] = field(default_factory=dict)
flag_patch: dict[str, Any] = field(default_factory=dict)
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "ConfigPatch":
return cls(
env_patch=_coerce_str_map(data.get("env_patch"), context="config_patch.env_patch"),
flag_patch=_coerce_any_map(
data.get("flag_patch"), context="config_patch.flag_patch"
),
)
@dataclass(frozen=True)
class Proposal:
observation: str
diagnosis: str
config_patch: ConfigPatch
expected_effects: list[str]
why_not_previous_failures: str = ""
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "Proposal":
return cls(
observation=_require_str(data.get("observation"), context="proposal.observation"),
diagnosis=_require_str(data.get("diagnosis"), context="proposal.diagnosis"),
config_patch=ConfigPatch.from_dict(
_require_mapping(data.get("config_patch"), context="proposal.config_patch")
),
expected_effects=_coerce_str_list(
data.get("expected_effects"), context="proposal.expected_effects"
),
why_not_previous_failures=str(data.get("why_not_previous_failures") or "").strip(),
)
@dataclass(frozen=True)
class TrialSpec:
study_id: str
trial_id: str
config_patch: ConfigPatch
search: SamplingSearchSpec
study_spec_path: str
artifact_dir: str
probe_log_path: str
engine_log_path: str
result_path: str
@dataclass
class TrialSummary:
trial_id: str
status: str
best_sampling_u: float | None = None
best_request_rate: float | None = None
best_pass_rate: float | None = None
result_path: str | None = None
diagnosis: str = ""
@dataclass
class StudyState:
study_id: str
best_trial_id: str | None = None
best_request_rate: float | None = None
next_trial_index: int = 1
trials: list[TrialSummary] = field(default_factory=list)
def to_jsonable(value: Any) -> Any:
if is_dataclass(value):
return {key: to_jsonable(item) for key, item in asdict(value).items()}
if isinstance(value, dict):
return {str(key): to_jsonable(item) for key, item in value.items()}
if isinstance(value, list):
return [to_jsonable(item) for item in value]
return value
def load_structured_file(path: Path) -> Mapping[str, Any]:
suffix = path.suffix.lower()
if suffix == ".json":
payload = json.loads(path.read_text(encoding="utf-8"))
elif suffix in {".toml", ".tml"}:
payload = tomllib.loads(path.read_text(encoding="utf-8"))
else:
raise SpecError(f"Unsupported spec file type: {path}")
return _require_mapping(payload, context=str(path))
def load_study_spec(path: Path) -> StudySpec:
return StudySpec.from_dict(load_structured_file(path))
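
Validation failures surface as SpecError with a dotted-path context string, which the CLI maps to exit code 2. A tiny sketch:

from aituner.spec import SpecError, StudySpec

try:
    StudySpec.from_dict({"study_id": ""})
except SpecError as exc:
    print(exc)  # study_id must be a non-empty string.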

118
src/aituner/store.py Normal file

@@ -0,0 +1,118 @@
from __future__ import annotations
import json
from dataclasses import replace
from pathlib import Path
from typing import Any
from .spec import Proposal, StudySpec, StudyState, TrialSpec, TrialSummary, to_jsonable
class StudyStore:
def __init__(self, root: Path | None = None):
base = root or Path(".aituner") / "studies"
self.root = base.resolve()
def study_root(self, study_id: str) -> Path:
return self.root / study_id
def init_study(self, *, spec_path: Path, study: StudySpec) -> Path:
root = self.study_root(study.study_id)
for rel in ("prompts", "proposals", "trials", "results"):
(root / rel).mkdir(parents=True, exist_ok=True)
(root / "study_spec.source").write_text(str(spec_path.resolve()) + "\n", encoding="utf-8")
self.write_json(root / "study_spec.snapshot.json", to_jsonable(study))
if not (root / "state.json").exists():
self.write_json(root / "state.json", to_jsonable(StudyState(study_id=study.study_id)))
return root
def load_state(self, study_id: str) -> StudyState:
payload = json.loads((self.study_root(study_id) / "state.json").read_text(encoding="utf-8"))
trials = [TrialSummary(**item) for item in payload.get("trials", [])]
return StudyState(
study_id=str(payload["study_id"]),
best_trial_id=payload.get("best_trial_id"),
best_request_rate=payload.get("best_request_rate"),
next_trial_index=int(payload.get("next_trial_index", 1)),
trials=trials,
)
def save_state(self, state: StudyState) -> None:
self.write_json(self.study_root(state.study_id) / "state.json", to_jsonable(state))
def write_prompt(self, study_id: str, prompt_name: str, prompt_text: str) -> Path:
path = self.study_root(study_id) / "prompts" / f"{prompt_name}.txt"
path.write_text(prompt_text, encoding="utf-8")
return path
def write_proposal(self, study_id: str, proposal_name: str, proposal: Proposal) -> Path:
path = self.study_root(study_id) / "proposals" / f"{proposal_name}.json"
self.write_json(path, to_jsonable(proposal))
return path
def materialize_trial(
self,
*,
study: StudySpec,
state: StudyState,
proposal: Proposal,
) -> tuple[TrialSpec, StudyState]:
trial_id = f"trial-{state.next_trial_index:04d}"
trial_root = self.study_root(study.study_id) / "trials" / trial_id
trial_root.mkdir(parents=True, exist_ok=True)
spec = TrialSpec(
study_id=study.study_id,
trial_id=trial_id,
config_patch=proposal.config_patch,
search=study.search,
study_spec_path=str((self.study_root(study.study_id) / "study_spec.source").resolve()),
artifact_dir=str(trial_root),
probe_log_path=str(trial_root / "probe_history.json"),
engine_log_path=str(trial_root / "engine.log"),
result_path=str(trial_root / "result.json"),
)
self.write_json(trial_root / "trial_spec.json", to_jsonable(spec))
next_state = replace(state, next_trial_index=state.next_trial_index + 1)
next_state.trials.append(
TrialSummary(trial_id=trial_id, status="queued", diagnosis=proposal.diagnosis)
)
self.save_state(next_state)
return spec, next_state
def ingest_trial_results(self, study_id: str) -> StudyState:
state = self.load_state(study_id)
by_id = {item.trial_id: item for item in state.trials}
trials_dir = self.study_root(study_id) / "trials"
best_trial_id = state.best_trial_id
best_rate = state.best_request_rate
for trial_dir in sorted(trials_dir.glob("trial-*")):
result_path = trial_dir / "result.json"
if not result_path.exists():
continue
payload = json.loads(result_path.read_text(encoding="utf-8"))
trial_id = str(payload["trial_id"])
summary = by_id.get(trial_id)
if summary is None:
summary = TrialSummary(trial_id=trial_id, status="unknown")
state.trials.append(summary)
by_id[trial_id] = summary
summary.status = str(payload.get("status") or "completed")
summary.best_sampling_u = payload.get("best_sampling_u")
summary.best_request_rate = payload.get("best_request_rate")
summary.best_pass_rate = payload.get("best_pass_rate")
summary.result_path = str(result_path)
if (
isinstance(summary.best_request_rate, (int, float))
and (best_rate is None or summary.best_request_rate > best_rate)
):
best_rate = float(summary.best_request_rate)
best_trial_id = trial_id
state.best_request_rate = best_rate
state.best_trial_id = best_trial_id
self.save_state(state)
return state
@staticmethod
def write_json(path: Path, payload: Any) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
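
state.json is just StudyState pushed through to_jsonable, so a fresh study starts from exactly this shape:

from aituner.spec import StudyState, to_jsonable

print(to_jsonable(StudyState(study_id="example-chat-window")))
# {'study_id': 'example-chat-window', 'best_trial_id': None,
#  'best_request_rate': None, 'next_trial_index': 1, 'trials': []}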

178
src/aituner/trace.py Normal file

@@ -0,0 +1,178 @@
from __future__ import annotations
import json
import math
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Mapping
from .spec import StudySpec
class TraceError(ValueError):
"""Raised when trace assets are invalid."""
def _percentile(values: list[float], p: float) -> float:
if not values:
return 0.0
ordered = sorted(values)
idx = min(len(ordered) - 1, max(0, math.ceil((p / 100.0) * len(ordered)) - 1))
return ordered[idx]
@dataclass(frozen=True)
class WindowRecord:
window_id: str
trace_path: Path
trace_type: str
window_start: float
window_end: float
source_payload: dict[str, Any]
@dataclass(frozen=True)
class TraceRequest:
row_id: str
arrival_s: float
sampling_u: float
body: dict[str, Any]
prompt_tokens_hint: int | None
completion_tokens_hint: int | None
def resolve_window_record(study: StudySpec, *, study_spec_path: Path) -> WindowRecord:
windows_path = Path(study.trace.windows_path)
if not windows_path.is_absolute():
windows_path = (study_spec_path.parent / windows_path).resolve()
payload = json.loads(windows_path.read_text(encoding="utf-8"))
windows = payload["windows"] if isinstance(payload, Mapping) and "windows" in payload else payload
if not isinstance(windows, list):
raise TraceError(f"windows payload must contain a list: {windows_path}")
for item in windows:
if not isinstance(item, Mapping):
continue
if str(item.get("window_id") or "").strip() != study.trace.window_id:
continue
trace_file = study.trace.trace_file_override or str(item.get("trace_file") or "").strip()
if not trace_file:
raise TraceError(f"window {study.trace.window_id} does not define trace_file")
trace_path = Path(trace_file)
if not trace_path.is_absolute():
trace_path = (windows_path.parent / trace_path).resolve()
return WindowRecord(
window_id=study.trace.window_id,
trace_path=trace_path,
trace_type=str(item.get("trace_type") or "chat").strip(),
window_start=float(item.get("window_start") or 0.0),
window_end=float(item.get("window_end") or 0.0),
source_payload={str(key): value for key, value in item.items()},
)
raise TraceError(f"window_id not found: {study.trace.window_id}")
def _coerce_messages(row: Mapping[str, Any]) -> list[dict[str, Any]]:
messages = row.get("messages")
if isinstance(messages, list) and messages:
return [dict(item) for item in messages if isinstance(item, Mapping)]
prompt = row.get("prompt") or row.get("input") or row.get("text")
if isinstance(prompt, str) and prompt.strip():
return [{"role": "user", "content": prompt}]
raise TraceError("trace row is missing chat messages/prompt text")
def _coerce_completion_tokens(row: Mapping[str, Any]) -> int | None:
for key in ("max_completion_tokens", "max_tokens", "output_length", "completion_tokens"):
value = row.get(key)
if isinstance(value, bool):
continue
if isinstance(value, int) and value >= 0:
return value
if isinstance(value, float) and value >= 0:
return int(value)
return None
def _coerce_prompt_tokens(row: Mapping[str, Any]) -> int | None:
for key in ("input_length", "prompt_length", "prompt_len", "input_tokens"):
value = row.get(key)
if isinstance(value, bool):
continue
if isinstance(value, int) and value >= 0:
return value
if isinstance(value, float) and value >= 0:
return int(value)
return None
def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[WindowRecord, list[TraceRequest]]:
window = resolve_window_record(study, study_spec_path=study_spec_path)
requests: list[TraceRequest] = []
with window.trace_path.open("r", encoding="utf-8") as handle:
for idx, raw in enumerate(handle):
if not raw.strip():
continue
row = json.loads(raw)
if not isinstance(row, Mapping):
continue
timestamp = row.get(study.trace.timestamp_field)
if timestamp is None:
timestamp = row.get("arrival_time", row.get("timestamp"))
if isinstance(timestamp, bool) or not isinstance(timestamp, (int, float)):
raise TraceError(f"trace row {idx} is missing numeric timestamp")
sampling_u = row.get(study.trace.u_field, 1.0)
if isinstance(sampling_u, bool) or not isinstance(sampling_u, (int, float)):
raise TraceError(f"trace row {idx} is missing numeric {study.trace.u_field}")
body: dict[str, Any] = {
"model": study.model.served_model_name,
"messages": _coerce_messages(row),
"stream": True,
"stream_options": {"include_usage": True},
}
completion_tokens = _coerce_completion_tokens(row)
if completion_tokens is not None:
body["max_tokens"] = completion_tokens
temperature = row.get("temperature")
if isinstance(temperature, (int, float)) and not isinstance(temperature, bool):
body["temperature"] = temperature
requests.append(
TraceRequest(
row_id=str(row.get("request_id") or row.get("id") or idx),
arrival_s=float(timestamp),
sampling_u=float(sampling_u),
body=body,
prompt_tokens_hint=_coerce_prompt_tokens(row),
completion_tokens_hint=completion_tokens,
)
)
requests.sort(key=lambda item: item.arrival_s)
if study.trace.max_requests_per_probe is not None:
requests = requests[: study.trace.max_requests_per_probe]
return window, requests
def summarize_window(requests: list[TraceRequest], window: WindowRecord) -> dict[str, Any]:
prompt_tokens = [float(item.prompt_tokens_hint or 0) for item in requests]
completion_tokens = [float(item.completion_tokens_hint or 0) for item in requests]
duration = max(window.window_end - window.window_start, 0.0) or (
requests[-1].arrival_s - requests[0].arrival_s if len(requests) >= 2 else 0.0
)
qps = (len(requests) / duration) if duration > 0 else 0.0
return {
"window_id": window.window_id,
"trace_path": str(window.trace_path),
"trace_type": window.trace_type,
"request_count": len(requests),
"duration_s": duration,
"request_rate": qps,
"prompt_tokens_p50": _percentile(prompt_tokens, 50.0),
"prompt_tokens_p95": _percentile(prompt_tokens, 95.0),
"completion_tokens_p50": _percentile(completion_tokens, 50.0),
"completion_tokens_p95": _percentile(completion_tokens, 95.0),
}
def select_requests_for_threshold(
requests: list[TraceRequest], *, threshold: float
) -> list[TraceRequest]:
return [item for item in requests if item.sampling_u <= threshold]
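
Thresholding is what makes sampling_u bisectable: a probe at threshold t replays exactly the subset of the window with sampling_u <= t. A toy sketch (request bodies elided):

from aituner.trace import TraceRequest, select_requests_for_threshold

requests = [
    TraceRequest(row_id=str(i), arrival_s=float(i), sampling_u=u, body={},
                 prompt_tokens_hint=None, completion_tokens_hint=None)
    for i, u in enumerate([0.10, 0.50, 0.90])
]
print([r.row_id for r in select_requests_for_threshold(requests, threshold=0.5)])
# ['0', '1'] -- the 0.90 request only joins probes at higher thresholds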

215
src/aituner/worker.py Normal file

@@ -0,0 +1,215 @@
from __future__ import annotations
import json
import subprocess
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from .engine import build_launch_recipe
from .http_client import HttpClientError, stream_chat_completion, wait_for_server
from .search import ThresholdProbe, binary_search_max_feasible
from .slo import RequestOutcome, summarize_evaluations
from .spec import ConfigPatch, SamplingSearchSpec, TrialSpec, load_study_spec
from .trace import TraceRequest, load_trace_requests, select_requests_for_threshold
@dataclass(frozen=True)
class ProbePayload:
threshold: float
request_count: int
pass_rate: float
request_rate: float
feasible: bool
outcomes: list[dict[str, Any]]
def _trial_spec_from_json(path: Path) -> TrialSpec:
payload = json.loads(path.read_text(encoding="utf-8"))
return TrialSpec(
study_id=str(payload["study_id"]),
trial_id=str(payload["trial_id"]),
config_patch=ConfigPatch.from_dict(payload["config_patch"]),
search=SamplingSearchSpec.from_dict(payload["search"]),
study_spec_path=str(payload["study_spec_path"]),
artifact_dir=str(payload["artifact_dir"]),
probe_log_path=str(payload["probe_log_path"]),
engine_log_path=str(payload["engine_log_path"]),
result_path=str(payload["result_path"]),
)
def _run_one_request(
request: TraceRequest,
*,
base_url: str,
timeout_s: float,
) -> RequestOutcome:
try:
metrics = stream_chat_completion(base_url=base_url, body=request.body, timeout_s=timeout_s)
return RequestOutcome(
request_id=request.row_id,
success=True,
ttft_ms=metrics.ttft_ms,
tpot_ms=metrics.tpot_ms,
prompt_tokens=request.prompt_tokens_hint,
completion_tokens=metrics.completion_tokens or request.completion_tokens_hint,
)
except HttpClientError as exc:
return RequestOutcome(
request_id=request.row_id,
success=False,
ttft_ms=None,
tpot_ms=None,
prompt_tokens=request.prompt_tokens_hint,
completion_tokens=request.completion_tokens_hint,
error=str(exc),
)
def _replay_requests(
requests: list[TraceRequest],
*,
base_url: str,
timeout_s: float,
max_concurrency: int,
) -> list[RequestOutcome]:
outcomes_by_id: dict[str, RequestOutcome] = {}
lock = threading.Lock()
start = time.monotonic()
with ThreadPoolExecutor(max_workers=max_concurrency) as pool:
futures = []
for request in requests:
delay = max(0.0, request.arrival_s)
now = time.monotonic()
sleep_for = (start + delay) - now
if sleep_for > 0:
time.sleep(sleep_for)
futures.append(
pool.submit(
_run_one_request,
request,
base_url=base_url,
timeout_s=timeout_s,
)
)
for future in as_completed(futures):
outcome = future.result()
with lock:
outcomes_by_id[outcome.request_id] = outcome
return [outcomes_by_id[item.row_id] for item in requests if item.row_id in outcomes_by_id]
def run_trial(trial_spec_path: Path) -> dict[str, Any]:
from .store import StudyStore
trial = _trial_spec_from_json(trial_spec_path)
study_spec_path = Path(Path(trial.study_spec_path).read_text(encoding="utf-8").strip())
study = load_study_spec(study_spec_path)
window, requests = load_trace_requests(study, study_spec_path=study_spec_path)
recipe = build_launch_recipe(study.engine, trial.config_patch)
artifact_dir = Path(trial.artifact_dir)
artifact_dir.mkdir(parents=True, exist_ok=True)
engine_log_path = Path(trial.engine_log_path)
with engine_log_path.open("w", encoding="utf-8") as engine_log:
process = subprocess.Popen( # noqa: S603
recipe.argv,
cwd=recipe.cwd,
env=recipe.env,
stdout=engine_log,
stderr=subprocess.STDOUT,
text=True,
)
try:
wait_for_server(recipe.base_url, recipe.healthcheck_path, recipe.ready_timeout_s)
probe_history: list[dict[str, Any]] = []
def evaluator(threshold: float) -> ThresholdProbe[ProbePayload]:
selected = select_requests_for_threshold(requests, threshold=threshold)
outcomes = _replay_requests(
selected,
base_url=recipe.base_url,
timeout_s=recipe.request_timeout_s,
max_concurrency=study.trace.max_concurrency,
)
evaluations, summary = summarize_evaluations(outcomes, study.slo)
request_rate = (
len(selected) / max(window.window_end - window.window_start, 1e-9)
if selected
else 0.0
)
payload = ProbePayload(
threshold=threshold,
request_count=len(selected),
pass_rate=float(summary["slo_pass_rate"]),
request_rate=request_rate,
feasible=bool(summary["feasible"]),
outcomes=[
{
"request_id": outcome.request_id,
"success": outcome.success,
"ttft_ms": outcome.ttft_ms,
"tpot_ms": outcome.tpot_ms,
"prompt_tokens": outcome.prompt_tokens,
"completion_tokens": outcome.completion_tokens,
"evaluation": evaluation.passed,
"reasons": evaluation.reasons,
}
for outcome, evaluation in zip(outcomes, evaluations)
],
)
probe_record = {
"threshold": threshold,
"request_count": payload.request_count,
"pass_rate": payload.pass_rate,
"request_rate": payload.request_rate,
"feasible": payload.feasible,
}
probe_history.append(probe_record)
StudyStore.write_json(Path(trial.probe_log_path), probe_history)
return ThresholdProbe(
threshold=threshold,
feasible=payload.feasible,
payload=payload,
)
search = binary_search_max_feasible(
low=trial.search.low,
high=trial.search.high,
tolerance=trial.search.tolerance,
max_probes=trial.search.max_probes,
evaluator=evaluator,
)
best = search.best_feasible_payload
result = {
"study_id": trial.study_id,
"trial_id": trial.trial_id,
"status": "completed",
"best_sampling_u": search.best_threshold if best is not None else None,
"best_request_rate": best.request_rate if best is not None else None,
"best_pass_rate": best.pass_rate if best is not None else None,
"best_request_count": best.request_count if best is not None else None,
"probes": [
{
"threshold": probe.threshold,
"feasible": probe.feasible,
"payload": {
"request_count": probe.payload.request_count,
"pass_rate": probe.payload.pass_rate,
"request_rate": probe.payload.request_rate,
},
}
for probe in search.probes
],
}
StudyStore.write_json(Path(trial.result_path), result)
return result
finally:
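# Always tear the engine down: ask politely with terminate() first, then
# escalate to kill() if the process ignores the 30 s grace period.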
process.terminate()
try:
process.wait(timeout=30)
except subprocess.TimeoutExpired:
process.kill()
process.wait(timeout=30)
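For orientation, a minimal sketch of how a fleet worker could drive run_trial from the command line. This is an editor's illustration, not part of the commit: the subcommand shape mirrors the "python3 -m aituner.cli worker run-trial" string asserted in tests/test_core_flow.py below, while the aituner.worker module path and the exit-code convention are assumptions.

# Hypothetical wrapper; module path and argument handling are illustrative.
import json
import sys
from pathlib import Path

from aituner.worker import run_trial  # assumed home of run_trial


def main() -> int:
    result = run_trial(Path(sys.argv[1]))  # argv[1]: path to the trial spec JSON
    json.dump(result, sys.stdout, indent=2)
    return 0 if result.get("status") == "completed" else 1


if __name__ == "__main__":
    raise SystemExit(main())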

10
tests/conftest.py Normal file
View File

@@ -0,0 +1,10 @@
from __future__ import annotations
import sys
from pathlib import Path
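# Make the src/ layout importable when tests run from a source checkout.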
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))

267
tests/test_core_flow.py Normal file
View File

@@ -0,0 +1,267 @@
from __future__ import annotations
import json
import tempfile
import unittest
from pathlib import Path
from aituner.job import append_job, build_trial_job
from aituner.llm import build_prompt, parse_proposal_text
from aituner.search import ThresholdProbe, binary_search_max_feasible
from aituner.slo import RequestOutcome, summarize_evaluations
from aituner.spec import Proposal, load_study_spec
from aituner.store import StudyStore
from aituner.trace import load_trace_requests, summarize_window
def _write_study_assets(tmp_path: Path) -> Path:
trace_dir = tmp_path / "trace_windows" / "traces"
trace_dir.mkdir(parents=True)
trace_path = trace_dir / "chat_w1.jsonl"
rows = [
{
"request_id": "r1",
"timestamp": 0.0,
"sampling_u": 0.10,
"messages": [{"role": "user", "content": "hello"}],
"input_length": 1000,
"output_length": 16
},
{
"request_id": "r2",
"timestamp": 1.0,
"sampling_u": 0.50,
"messages": [{"role": "user", "content": "world"}],
"input_length": 5000,
"output_length": 32
},
{
"request_id": "r3",
"timestamp": 2.0,
"sampling_u": 0.90,
"messages": [{"role": "user", "content": "!"}],
"input_length": 20000,
"output_length": 64
}
]
with trace_path.open("w", encoding="utf-8") as handle:
for row in rows:
handle.write(json.dumps(row) + "\n")
windows_path = tmp_path / "trace_windows" / "windows.json"
windows_payload = {
"u_field": "sampling_u",
"windows": [
{
"window_id": "chat_w1",
"trace_type": "chat",
"trace_file": "traces/chat_w1.jsonl",
"window_start": 0.0,
"window_end": 10.0
}
]
}
windows_path.write_text(json.dumps(windows_payload), encoding="utf-8")
capability_path = tmp_path / "capability.json"
capability_path.write_text(
json.dumps({"prefill_service_by_bucket": {"4k": {"tp4_ms": 320, "tp8_ms": 240}}}),
encoding="utf-8",
)
study_path = tmp_path / "study.json"
study_payload = {
"study_id": "study-1",
"hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
"model": {
"model_id": "qwen",
"served_model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507"
},
"engine": {
"engine_name": "vllm",
"engine_version": "0.1",
"exec_path": "/usr/local/bin/vllm",
"cwd": str(tmp_path),
"host": "127.0.0.1",
"port": 8000,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 30,
"request_timeout_s": 30,
"launch_args": ["serve", "/models/qwen"],
"base_envs": {"BASE_ENV": "1"},
"base_flags": {"host": "127.0.0.1", "port": 8000},
"tunable_envs": ["VLLM_ATTENTION_BACKEND"],
"tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
"python_executable": "python3"
},
"trace": {
"windows_path": str(windows_path),
"window_id": "chat_w1",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 4
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{"max_input_tokens": 4096, "threshold_ms": 2000},
{"max_input_tokens": 16384, "threshold_ms": 5000},
{"threshold_ms": 9000}
]
},
"tpot_rule": {"kind": "fixed_ms", "threshold_ms": 120}
},
"search": {
"low": 0.0,
"high": 1.0,
"tolerance": 0.01,
"max_probes": 8,
"sample_seed": 20260325
},
"llm": {"system_prompt": "Tune it.", "max_history_trials": 8},
"capability_profile_path": str(capability_path)
}
study_path.write_text(json.dumps(study_payload), encoding="utf-8")
return study_path
class CoreFlowTests(unittest.TestCase):
def test_trace_and_prompt_flow(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
store = StudyStore(tmp_path / ".aituner" / "studies")
study_root = store.init_study(spec_path=study_path, study=study)
state = store.load_state(study.study_id)
window, requests = load_trace_requests(study, study_spec_path=study_path)
summary = summarize_window(requests, window)
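# The window spans 0-10 s and holds three rows, so the rate is 3 / 10 = 0.3 req/s.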
self.assertEqual(summary["request_count"], 3)
self.assertEqual(summary["request_rate"], 0.3)
prompt = build_prompt(
study=study,
window_summary=summary,
state=state,
capability_profile={"queueing_knee_by_bucket": {"4k": 1000}},
)
self.assertIn("allowed_flag_keys", prompt)
self.assertIn("study-1", prompt)
self.assertIn("queueing_knee_by_bucket", prompt)
self.assertTrue(study_root.exists())
def test_slo_evaluation_step_and_fixed_rules(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
study = load_study_spec(_write_study_assets(Path(tmp)))
outcomes = [
RequestOutcome(
request_id="r1",
success=True,
ttft_ms=1000,
tpot_ms=100,
prompt_tokens=1000,
completion_tokens=16,
),
RequestOutcome(
request_id="r2",
success=True,
ttft_ms=6000,
tpot_ms=100,
prompt_tokens=5000,
completion_tokens=16,
),
]
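# r1: 1000 prompt tokens fall in the <=4096 bucket (2000 ms TTFT budget),
# so 1000 ms TTFT and 100 ms TPOT both pass. r2: 5000 tokens fall in the
# <=16384 bucket (5000 ms budget), so 6000 ms TTFT fails. The pass rate of
# 0.5 sits below the 0.95 target, hence infeasible.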
evaluations, summary = summarize_evaluations(outcomes, study.slo)
self.assertTrue(evaluations[0].passed)
self.assertFalse(evaluations[1].passed)
self.assertEqual(summary["slo_pass_rate"], 0.5)
self.assertFalse(summary["feasible"])
def test_binary_search_max_feasible(self) -> None:
result = binary_search_max_feasible(
low=0.0,
high=1.0,
tolerance=0.01,
max_probes=8,
evaluator=lambda threshold: ThresholdProbe(
threshold=threshold,
feasible=threshold <= 0.625,
payload={"threshold": threshold},
),
)
self.assertLessEqual(result.best_threshold, 0.625)
self.assertGreaterEqual(result.best_threshold, 0.5)
self.assertIsNotNone(result.best_feasible_payload)
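# Assuming plain midpoint bisection, the probes walk 0.5 (feasible),
# 0.75 (infeasible), 0.625 (feasible), then keep shrinking the infeasible
# side, so the best feasible threshold converges into [0.5, 0.625].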
def test_proposal_validation_and_job_emission(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
store = StudyStore(tmp_path / ".aituner" / "studies")
store.init_study(spec_path=study_path, study=study)
state = store.load_state(study.study_id)
proposal_text = json.dumps(
{
"observation": "Current TTFT fails before TPOT.",
"diagnosis": "Prefill pressure dominates.",
"config_patch": {
"env_patch": {"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
"flag_patch": {"tensor-parallel-size": 4, "max-num-seqs": 64}
},
"expected_effects": ["lower TTFT", "raise feasible sampling_u"],
"why_not_previous_failures": "Avoids changing unsupported envs."
}
)
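# Both patched keys are declared tunable in the study spec
# (VLLM_ATTENTION_BACKEND in tunable_envs; tensor-parallel-size and
# max-num-seqs in tunable_flags), so parsing should accept the proposal.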
proposal = parse_proposal_text(proposal_text, study)
trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
job = build_trial_job(study=study, trial=trial, repo_root=tmp_path)
jobs_path = tmp_path / "jobs.toml"
append_job(jobs_path, job)
rendered = jobs_path.read_text(encoding="utf-8")
self.assertIn('name = "study-1-trial-0001"', rendered)
self.assertIn('command = "python3 -m aituner.cli worker run-trial', rendered)
self.assertIn('PYTHONPATH = "src"', rendered)
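# A plausible shape for the appended jobs.toml entry (illustrative; only
# the three fragments asserted above are guaranteed by this test):
#   [[jobs]]
#   name = "study-1-trial-0001"
#   command = "python3 -m aituner.cli worker run-trial <trial-spec-path>"
#   [jobs.env]
#   PYTHONPATH = "src"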
def test_ingest_trial_results_updates_best(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
store = StudyStore(tmp_path / ".aituner" / "studies")
store.init_study(spec_path=study_path, study=study)
state = store.load_state(study.study_id)
proposal = Proposal.from_dict(
{
"observation": "Obs",
"diagnosis": "Diag",
"config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
"expected_effects": ["raise rate"]
}
)
trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
Path(trial.result_path).write_text(
json.dumps(
{
"study_id": study.study_id,
"trial_id": trial.trial_id,
"status": "completed",
"best_sampling_u": 0.75,
"best_request_rate": 12.5,
"best_pass_rate": 0.97
}
),
encoding="utf-8",
)
next_state = store.ingest_trial_results(study.study_id)
self.assertEqual(next_state.best_trial_id, trial.trial_id)
self.assertEqual(next_state.best_request_rate, 12.5)
if __name__ == "__main__":
unittest.main()