Compare commits

..

60 Commits

Author SHA1 Message Date
adc4351e5d Report latency stats for infeasible baseline 2026-05-08 11:10:34 +08:00
eb137a0b62 Document TPOT40 baseline infeasible run 2026-05-08 02:57:03 +08:00
f212673f44 Stop tuning when baseline is infeasible 2026-05-08 01:07:36 +08:00
a7a5e9ad80 Make tune trial budget resumable 2026-05-07 17:18:06 +08:00
7263587cb6 clean: ci 2026-05-06 22:56:53 +08:00
d7df1ebdac Add open source project metadata 2026-05-06 21:18:21 +08:00
c1ff64381d Harden trial measurement accounting 2026-05-06 21:18:09 +08:00
871c4cfc02 Document qwen27b chat setup audit 2026-05-06 20:32:09 +08:00
98cd6dd81a Document qwen27b current config harness curve 2026-05-06 18:00:43 +08:00
f653af09a8 Stop harness when feasible probe reaches search high 2026-05-06 17:59:09 +08:00
5d96689ea6 Make harness runtime refinement memory safe 2026-05-06 17:37:31 +08:00
cf2e741550 Document high search rerun 2026-05-06 03:19:51 +08:00
0622e23817 Guide harness runtime refinement after TP 2026-05-06 02:46:07 +08:00
50067c926d Add harness guided first topology probe 2026-05-06 02:28:46 +08:00
915861b706 Document community vllm harness ablation 2026-05-02 11:17:24 +08:00
4c066c4e4e Stop harness when search high is saturated 2026-05-02 11:04:59 +08:00
ccbf24ac47 Use time-compressed community vllm ablation 2026-05-02 10:03:59 +08:00
d3d4c234f6 Bound community vllm ablation replay 2026-05-02 09:58:56 +08:00
4ef69cce78 Make harness stop conservative for ablation 2026-05-02 09:47:16 +08:00
664aeb49b2 Use local cache for qwen30b vllm runs 2026-05-02 08:47:16 +08:00
1880e859b5 Use vllm cu129 wheel on dash0 2026-05-02 08:28:23 +08:00
e215827503 Use uv auto torch backend for vllm 0.20 2026-05-02 08:21:27 +08:00
a7c9518ef6 Use local vllm venv for dash0 community run 2026-05-02 08:17:04 +08:00
1a3d628268 Add harness early stop ablation 2026-05-02 08:08:14 +08:00
6d3459c82d Document decode harness one-shot mechanism 2026-05-02 06:25:06 +08:00
9e5394b557 Inherit incumbent topology for runtime validation 2026-04-30 09:33:49 +08:00
f59919e21c Clarify base-relative validation patches 2026-04-30 06:52:09 +08:00
46e9040613 Record decode validation follow-up 2026-04-28 21:20:41 +08:00
38ff4380e5 Make strong incumbent trigger validation phase 2026-04-28 20:54:05 +08:00
68cdaf56a8 Summarize qwen235b decode harness result 2026-04-28 20:36:17 +08:00
f982395aad Record qwen235b decode harness launch 2026-04-28 07:02:13 +08:00
c9089cf4f0 Ignore non-SLO probe bookkeeping in bottleneck diagnosis 2026-04-28 06:58:38 +08:00
a9943e0240 Use probe sequence bottlenecks in harness 2026-04-28 06:57:45 +08:00
39aa47fbf1 Add generic decode-only harness guidance 2026-04-28 06:46:18 +08:00
71902b9fc2 Record qwen235b harness convergence test 2026-04-27 18:59:25 +08:00
bc884f6701 Document AITuner harness behavior 2026-04-27 16:34:19 +08:00
a962781b6c Document qwen27b harness convergence curve 2026-04-26 01:32:18 +08:00
29d0548e06 Stop after strong incumbent harness gains 2026-04-26 01:29:05 +08:00
a53445868e Make early-stop engine relaunch opt-in 2026-04-26 01:26:26 +08:00
d76ac49198 Relaunch engine after early-stopped probes 2026-04-26 00:32:39 +08:00
440f5b491b Record plateau guard verification 2026-04-25 18:50:23 +08:00
6bac389aae Add infeasible plateau guard to harness 2026-04-25 18:49:23 +08:00
6c04b9dbbc Evaluate baseline before LLM tuning 2026-04-25 17:14:05 +08:00
2d7ebe50ee Drain inflight requests after early stop 2026-04-25 16:57:01 +08:00
2dc2815620 Make harness verification portable 2026-04-25 16:37:13 +08:00
2c5e9af02a Add harness-guided tuning prompts 2026-04-25 16:35:33 +08:00
661db1e0c6 Document dash0 experiment workflow 2026-04-25 16:18:28 +08:00
dfe792ff6f docs: add q235b prefill 0-32k tight summary 2026-04-18 16:10:29 +08:00
d237fc2723 docs: expand qwen27b 0-8k compare summary 2026-04-17 20:45:24 +08:00
9919b9a7bd configs: add q235b prefill 1s 2s 0-32k study 2026-04-17 19:25:32 +08:00
34eb495b3e configs: add qwen235b prefill 0-32k study 2026-04-17 19:20:44 +08:00
bf286ef2a6 docs: add qwen235b prefill 7-day compare 2026-04-14 10:27:08 +08:00
26f3b46966 compare: add multi-candidate runner 2026-04-13 20:50:39 +08:00
18ff644b32 configs: add qwen235b prefill tight ttft 0323 study 2026-04-13 09:39:32 +08:00
bbecec4e9f docs: add qwen235b tight ttft prefill summary 2026-04-13 09:37:06 +08:00
ee9ec3c60b docs: add qwen235b decode 0323 summary 2026-04-13 09:33:02 +08:00
a1b96f7dd2 docs: update qwen27b 7-day compare 2026-04-13 09:16:31 +08:00
4625fba487 trace: make window materialization atomic 2026-04-12 23:09:30 +08:00
631a076498 trace: include weekend legacy windows 2026-04-12 22:43:02 +08:00
ade81b5549 docs: add qwen27b chat 0-8k compare summary 2026-04-12 22:39:57 +08:00
45 changed files with 7103 additions and 92 deletions

2
.gitignore vendored
View File

@@ -2,6 +2,8 @@
 .aituner-smoke/
 .aituner-decode/
 .aituner-tight/
+.aituner-prefill/
+.aituner-compare/
 .env
 __pycache__/
 *.pyc

15
AGENTS.md Normal file
View File

@@ -0,0 +1,15 @@
# Project Operating Notes
## Remote experiment host
- Default experiment machine: `dash0`.
- Hardware expectation: 8 NVIDIA H20 GPUs.
- SSH check: use `ssh dash0` before scheduling or debugging remote runs.
- Remote project path: `/home/admin/cpfs/wjh/aituner/aituner`.
## Local/remote sync workflow
- Treat this local repository and the `dash0` repository as the same project checkout.
- Synchronize code through Git using `commit`, `push`, and `pull`.
- For remote experiments, commit local changes, push to `origin`, then pull on `dash0` in `/home/admin/cpfs/wjh/aituner/aituner` before running.
- Do not ask for the remote host or project path again unless the user explicitly changes them.

23
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,23 @@
# Contributing
## Development Setup
```bash
python3 -m pip install -e .
PYTHONPATH=src python3 -m unittest discover -s tests -v
```
## Change Requirements
- Add or update tests for behavior changes.
- Keep experiment claims tied to reproducible artifacts: study spec, trial spec,
result JSON, probe history, and per-request probe details.
- Do not publish benchmark conclusions from bounded or time-compressed replays
without clearly labeling the replay controls.
- Keep example configs free of private credentials and prefer explicit,
reproducible endpoint settings.
## Commit Hygiene
Use small commits grouped by behavior: measurement integrity, orchestration
logic, documentation, or infrastructure.

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 AITuner contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

72
README.md Normal file
View File

@@ -0,0 +1,72 @@
# AITuner
AITuner is a small study orchestrator for OpenAI-compatible serving engines. It
replays trace windows, searches for the highest feasible offered load under
configured SLOs, and records enough trial context for LLM- or harness-guided
configuration proposals.
## Status
This repository is research tooling. Treat reported experiment numbers as valid
only when the matching study spec, trial artifacts, probe history, and
`probe_details.jsonl` files are available for audit.
## Install
```bash
python3 -m pip install -e .
```
## Test
The test suite uses the Python standard library `unittest` runner:
```bash
PYTHONPATH=src python3 -m unittest discover -s tests -v
```
If the package is installed in editable mode, `PYTHONPATH=src` is optional.
## Basic Workflow
Initialize a study:
```bash
aituner study init --spec configs/examples/study.example.json
```
Run a local tuning loop:
```bash
aituner study tune --spec configs/examples/study.example.json --max-trials 2
```
Run a compare:
```bash
aituner compare run --spec configs/examples/compare.example.json
```
Remote experiment notes for this checkout live in `AGENTS.md`. The default
remote host is `dash0`, and code should be synchronized through Git before
remote runs.
## Experiment Integrity
- Fixed-length replay requests are scored only when completion token usage is
verifiable and matches the trace expectation.
- Each trial writes aggregate probe history and per-request probe details.
- `request_rate_per_gpu` is the primary cross-topology metric:
  `best_feasible_request_rate / (tensor_parallel_size * data_parallel_size)`; see the sketch after this list.
- Compare reports include failed and no-feasible window counts; do not interpret
mean request rates without those counts.
- Bounded replays using `max_requests_per_probe`, `completion_tokens_override`,
or `replay_time_scale` are convergence tests for that bounded workload, not
production benchmarks.
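As a concrete illustration of the `request_rate_per_gpu` definition above, here is a minimal sketch; the function name and the example numbers are illustrative, not the project's API:
```python
def request_rate_per_gpu(best_feasible_request_rate: float,
                         tensor_parallel_size: int,
                         data_parallel_size: int) -> float:
    """Per-GPU throughput used to compare trials across topologies."""
    return best_feasible_request_rate / (tensor_parallel_size * data_parallel_size)

# A TP=2, DP=4 trial sustaining 3.30 req/s scores 3.30 / 8 = 0.4125 req/s/GPU,
# directly comparable with a TP=8, DP=1 trial on the same eight GPUs.
print(request_rate_per_gpu(3.30, tensor_parallel_size=2, data_parallel_size=4))
```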
## Configuration Notes
Example specs that use `llm.endpoint.provider=codex` resolve the endpoint from
the local Codex configuration unless `llm.endpoint.base_url` or
`AITUNER_CODEX_BASE_URL` is set. Public, reproducible examples should prefer an
explicit endpoint or omit the LLM endpoint and use proposal files.
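A minimal sketch of that resolution order; the helper name and the assumption that an explicit spec value wins over the environment override are illustrative, not the documented behavior:
```python
import os

def resolve_codex_base_url(spec_base_url: str | None) -> str | None:
    """Resolve the endpoint for llm.endpoint.provider=codex.

    Returns None when neither an explicit spec value nor the
    AITUNER_CODEX_BASE_URL override is set; the caller then falls back
    to the local Codex configuration."""
    if spec_base_url:
        return spec_base_url
    return os.environ.get("AITUNER_CODEX_BASE_URL") or None
```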

19
SECURITY.md Normal file
View File

@@ -0,0 +1,19 @@
# Security
AITuner launches local or remote serving engines and may replay trace payloads.
Do not commit secrets, API keys, private trace content, or private model access
tokens.
## Reporting
Report security issues privately to the project maintainers. If this repository
is mirrored to a public forge, use that forge's private vulnerability reporting
flow when available.
## Operational Guidance
- Keep `.env` files local; `.env.example` documents expected variable names.
- Review generated trial artifacts before publishing them, because request
payloads may contain trace text.
- Treat remote execution configs as sensitive when they include internal host
names, paths, or scheduler details.

View File

@@ -157,7 +157,8 @@
"max_concurrency": 128,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 180.0,
"early_stop_max_elapsed_s": 1200.0
"early_stop_max_elapsed_s": 1200.0,
"restart_engine_after_early_stop": true
},
"slo": {
"target_pass_rate": 0.95,

View File

@@ -167,7 +167,8 @@
"max_concurrency": 128,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 180.0,
"early_stop_max_elapsed_s": 1200.0
"early_stop_max_elapsed_s": 1200.0,
"restart_engine_after_early_stop": true
},
"slo": {
"target_pass_rate": 0.95,

View File

@@ -0,0 +1,208 @@
{
"study_id": "dash0-qwen235b-prefill-thinking-run3-ttft-tight-0323-topology",
"hardware": {
"gpu_count": 8,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "qwen3-235b-a22b-256k-0717-internal",
"served_model_name": "qwen3-235b-prefill"
},
"engine": {
"engine_name": "vllm",
"engine_version": "internal-on-dash0",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18127,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 1800,
"request_timeout_s": 1800,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZATION_LAYER_WISE": "1",
"VLLM_MOE_USE_DEEPEP": "0",
"VLLM_MOE_BALANCED_GATING": "0",
"VLLM_MOE_RANDOM_GATING": "0",
"VLLM_FUSED_MOE_CHUNK_SIZE": "4096",
"VLLM_DP_META_USE_CPU_GROUP": "0",
"VLLM_MLA_FP8_ATTENTION": "0",
"VLLM_MOE_EXPERTS_OVERLAP": "0",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "290",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_USE_DEEP_GEMM": "0",
"VLLM_PD_TRY_CONNECT_TIMEOUT_SECONDS": "120",
"VLLM_DEEP_GEMM_WARMUP": "skip",
"DEEPEP_LL_COMBINE_USE_FP8": "1",
"DEEPEP_LL_BUFFER_FP8_OPT": "1",
"DEEPEP_LL_DISPATCH_USE_NVL": "1",
"DEEPEP_LL_COMBINE_USE_NVL": "1",
"ACCL_LOW_LATENCY_OPTIMIZE": "2",
"ACCL_WRITEBATCH_OPT": "2",
"ACCL_IBV_MTU": "9000",
"ACCL_TX_DEPTH": "1024",
"ACCL_RETRANSMIT_TIMEOUT": "17",
"NVSHMEM_IBGDA_NUM_RC_PER_PE": "4",
"BLLM_KVTRANS_RDMA_SP": "2",
"NCCL_SOCKET_IFNAME": "eth1",
"NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME": "eth1",
"GLOO_SOCKET_IFNAME": "eth1"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18127,
"served-model-name": "qwen3-235b-prefill",
"tensor-parallel-size": 4,
"gpu-memory-utilization": 0.85,
"enable-prefix-caching": true,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-hybrid-kv-cache-manager": true,
"max-model-len": 262144,
"block-size": 64,
"max-num-seqs": 64,
"quantization": "fp8",
"cuda-graph-sizes": [
16,
32,
64,
96,
128,
160,
192,
224,
256,
288,
320,
352,
384,
416,
448,
480,
512,
544,
576,
608,
640,
672,
704,
736,
768,
800,
832,
864,
896,
928,
960,
992,
1024
],
"compilation-config": "{\"cudagraph_mode\":\"PIECEWISE\",\"use_inductor\":false,\"custom_ops\":[\"all\"],\"max_cudagraph_capture_size\":2048}",
"speculative-config": "{\"method\":\"eagle3\",\"num_speculative_tokens\":1,\"hf_overrides\":{\"rope_scaling\":{\"type\":\"yarn\",\"factor\":128,\"original_max_position_embeddings\":2048,\"semi_dynamic\":false,\"dynamic\":true},\"num_experts\":0},\"model\":\"/home/admin/resource/model/464482ce.qwen3-235b-a22b/0717-eagle-0820\"}",
"hf-overrides": "{\"architectures\":[\"Qwen3MoeForCausalLM\"],\"model_type\":\"qwen3_moe\"}",
"kv-cache-dtype": "fp8",
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ENABLE_TORCH_COMPILE"
],
"tunable_flags": [
"tensor-parallel-size",
"data-parallel-size",
"enable-expert-parallel",
"expert-parallel-size",
"gpu-memory-utilization",
"max-num-batched-tokens",
"max-num-seqs",
"block-size",
"enable-prefix-caching",
"enable-chunked-prefill"
],
"topology_constraints": {
"require_tp_dp_product_equals_gpu_count": false,
"require_ep_size_leq_tp_dp_product": true,
"require_ep_size_divides_tp_dp_product": true,
"require_enable_expert_parallel_when_ep_gt_one": true,
"validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
"allowed_tp_dp_products": [
4,
8
],
"allowed_tensor_parallel_sizes": [
4,
8
],
"allowed_data_parallel_sizes": [
1,
2
],
"allowed_expert_parallel_sizes": [
1,
2,
4,
8
]
},
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "thinking_w20260323_1000",
"request_mode": "chat",
"completion_tokens_override": 1,
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 64,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 180.0,
"early_stop_max_elapsed_s": 1200.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 8191,
"threshold_ms": 2000
},
{
"max_input_tokens": 32767,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
}
},
"search": {
"low": 0.0,
"high": 0.125,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "You are tuning a prefill-dominated vLLM serving stack. The trace replay forces completion length to exactly 1 token, so optimize for TTFT under the configured stepped SLO. Propose one launch-safe config patch that increases the maximum feasible sampling_u while respecting the topology constraints and avoiding known launch failures.",
"max_history_trials": 8,
"endpoint": {
"provider": "codex",
"model": "gpt-5.4",
"stream": true,
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 240
}
}
}

View File

@@ -0,0 +1,122 @@
{
"study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-harness",
"hardware": {
"gpu_count": 8,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "Qwen/Qwen3-30B-A3B",
"served_model_name": "qwen3-30b-a3b-community"
},
"engine": {
"engine_name": "vllm",
"engine_version": "0.20.0",
"exec_path": "/tmp/wjh/venvs/vllm-0.20.0-cu129/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18230,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
"HOME": "/tmp/wjh",
"XDG_CACHE_HOME": "/tmp/wjh/.cache"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18230,
"served-model-name": "qwen3-30b-a3b-community"
},
"tunable_envs": [],
"tunable_flags": [
"tensor-parallel-size",
"data-parallel-size",
"enable-expert-parallel",
"expert-parallel-size",
"gpu-memory-utilization",
"max-num-batched-tokens",
"max-num-seqs",
"block-size",
"enable-prefix-caching",
"enable-chunked-prefill"
],
"topology_constraints": {
"require_tp_dp_product_equals_gpu_count": false,
"require_ep_size_leq_tp_dp_product": true,
"require_ep_size_divides_tp_dp_product": true,
"require_enable_expert_parallel_when_ep_gt_one": true,
"validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
"allowed_tp_dp_products": [1, 2, 4, 8],
"allowed_tensor_parallel_sizes": [1, 2, 4, 8],
"allowed_data_parallel_sizes": [1, 2, 4, 8],
"allowed_expert_parallel_sizes": [1, 2, 4, 8]
},
"python_executable": "/tmp/wjh/venvs/vllm-0.20.0-cu129/bin/python"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"completion_tokens_override": 128,
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 64,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192
},
"max_requests_per_probe": 512,
"replay_time_scale": 0.1,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.125,
"tolerance": 0.001,
"max_probes": 4,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
"max_history_trials": 8,
"use_harness": true,
"endpoint": {
"provider": "codex",
"model": "gpt-5.4",
"stream": true,
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 240
}
}
}

View File

@@ -0,0 +1,122 @@
{
"study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-noharness",
"hardware": {
"gpu_count": 8,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "Qwen/Qwen3-30B-A3B",
"served_model_name": "qwen3-30b-a3b-community"
},
"engine": {
"engine_name": "vllm",
"engine_version": "0.20.0",
"exec_path": "/tmp/wjh/venvs/vllm-0.20.0-cu129/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18231,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
"HOME": "/tmp/wjh",
"XDG_CACHE_HOME": "/tmp/wjh/.cache"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18231,
"served-model-name": "qwen3-30b-a3b-community"
},
"tunable_envs": [],
"tunable_flags": [
"tensor-parallel-size",
"data-parallel-size",
"enable-expert-parallel",
"expert-parallel-size",
"gpu-memory-utilization",
"max-num-batched-tokens",
"max-num-seqs",
"block-size",
"enable-prefix-caching",
"enable-chunked-prefill"
],
"topology_constraints": {
"require_tp_dp_product_equals_gpu_count": false,
"require_ep_size_leq_tp_dp_product": true,
"require_ep_size_divides_tp_dp_product": true,
"require_enable_expert_parallel_when_ep_gt_one": true,
"validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
"allowed_tp_dp_products": [1, 2, 4, 8],
"allowed_tensor_parallel_sizes": [1, 2, 4, 8],
"allowed_data_parallel_sizes": [1, 2, 4, 8],
"allowed_expert_parallel_sizes": [1, 2, 4, 8]
},
"python_executable": "/tmp/wjh/venvs/vllm-0.20.0-cu129/bin/python"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"completion_tokens_override": 128,
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 64,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192
},
"max_requests_per_probe": 512,
"replay_time_scale": 0.1,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.125,
"tolerance": 0.001,
"max_probes": 4,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
"max_history_trials": 8,
"use_harness": false,
"endpoint": {
"provider": "codex",
"model": "gpt-5.4",
"stream": true,
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 240
}
}
}

View File

@@ -0,0 +1,52 @@
{
"compare_id": "dash1-qwen235b-prefill-thinking-7day-baseline-vs-0323-vs-0327",
"study_spec_path": "dash0_qwen235b_prefill_thinking_run3_ttft_tight_0323.json",
"output_root": "../../.aituner-compare/dash1-qwen235b-prefill-thinking-7day-baseline-vs-0323-vs-0327",
"window_ids": [
"thinking_w20260321_1000",
"thinking_w20260322_1000",
"thinking_w20260323_1000",
"thinking_w20260324_1000",
"thinking_w20260325_1000",
"thinking_w20260326_1000",
"thinking_w20260327_1000"
],
"candidates": [
{
"name": "baseline",
"phase": 1,
"config_patch": {
"env_patch": {},
"flag_patch": {}
},
"runtime": {
"cuda_visible_devices": "0,1,2,3",
"port": 18141
}
},
{
"name": "tuned_0323",
"phase": 1,
"trial_ref": {
"study_root": "../../.aituner-prefill/dash0-qwen235b-prefill-thinking-run3-ttft-tight-0323-topology",
"trial_id": "trial-0006"
},
"runtime": {
"cuda_visible_devices": "4,5,6,7",
"port": 18142
}
},
{
"name": "tuned_0327",
"phase": 2,
"trial_ref": {
"study_root": "../../.aituner-prefill/dash0-qwen235b-prefill-thinking-run2-ttft-tight-topology",
"trial_id": "trial-0012"
},
"runtime": {
"cuda_visible_devices": "0,1,2,3,4,5,6,7",
"port": 18143
}
}
]
}

View File

@@ -0,0 +1,212 @@
{
"study_id": "dash1-qwen235b-prefill-thinking-run4-ttft-tight-0-32k-topology",
"hardware": {
"gpu_count": 8,
"gpu_model": "H20",
"host_candidates": [
"dash1"
]
},
"model": {
"model_id": "qwen3-235b-a22b-256k-0717-internal",
"served_model_name": "qwen3-235b-prefill"
},
"engine": {
"engine_name": "vllm",
"engine_version": "internal-on-dash1",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18145,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 1800,
"request_timeout_s": 1800,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZATION_LAYER_WISE": "1",
"VLLM_MOE_USE_DEEPEP": "0",
"VLLM_MOE_BALANCED_GATING": "0",
"VLLM_MOE_RANDOM_GATING": "0",
"VLLM_FUSED_MOE_CHUNK_SIZE": "4096",
"VLLM_DP_META_USE_CPU_GROUP": "0",
"VLLM_MLA_FP8_ATTENTION": "0",
"VLLM_MOE_EXPERTS_OVERLAP": "0",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "290",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_USE_DEEP_GEMM": "0",
"VLLM_PD_TRY_CONNECT_TIMEOUT_SECONDS": "120",
"VLLM_DEEP_GEMM_WARMUP": "skip",
"DEEPEP_LL_COMBINE_USE_FP8": "1",
"DEEPEP_LL_BUFFER_FP8_OPT": "1",
"DEEPEP_LL_DISPATCH_USE_NVL": "1",
"DEEPEP_LL_COMBINE_USE_NVL": "1",
"ACCL_LOW_LATENCY_OPTIMIZE": "2",
"ACCL_WRITEBATCH_OPT": "2",
"ACCL_IBV_MTU": "9000",
"ACCL_TX_DEPTH": "1024",
"ACCL_RETRANSMIT_TIMEOUT": "17",
"NVSHMEM_IBGDA_NUM_RC_PER_PE": "4",
"BLLM_KVTRANS_RDMA_SP": "2",
"NCCL_SOCKET_IFNAME": "eth1",
"NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME": "eth1",
"GLOO_SOCKET_IFNAME": "eth1"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18145,
"served-model-name": "qwen3-235b-prefill",
"tensor-parallel-size": 4,
"gpu-memory-utilization": 0.85,
"enable-prefix-caching": true,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-hybrid-kv-cache-manager": true,
"max-model-len": 262144,
"block-size": 64,
"max-num-seqs": 64,
"quantization": "fp8",
"cuda-graph-sizes": [
16,
32,
64,
96,
128,
160,
192,
224,
256,
288,
320,
352,
384,
416,
448,
480,
512,
544,
576,
608,
640,
672,
704,
736,
768,
800,
832,
864,
896,
928,
960,
992,
1024
],
"compilation-config": "{\"cudagraph_mode\":\"PIECEWISE\",\"use_inductor\":false,\"custom_ops\":[\"all\"],\"max_cudagraph_capture_size\":2048}",
"speculative-config": "{\"method\":\"eagle3\",\"num_speculative_tokens\":1,\"hf_overrides\":{\"rope_scaling\":{\"type\":\"yarn\",\"factor\":128,\"original_max_position_embeddings\":2048,\"semi_dynamic\":false,\"dynamic\":true},\"num_experts\":0},\"model\":\"/home/admin/resource/model/464482ce.qwen3-235b-a22b/0717-eagle-0820\"}",
"hf-overrides": "{\"architectures\":[\"Qwen3MoeForCausalLM\"],\"model_type\":\"qwen3_moe\"}",
"kv-cache-dtype": "fp8",
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ENABLE_TORCH_COMPILE"
],
"tunable_flags": [
"tensor-parallel-size",
"data-parallel-size",
"enable-expert-parallel",
"expert-parallel-size",
"gpu-memory-utilization",
"max-num-batched-tokens",
"max-num-seqs",
"block-size",
"enable-prefix-caching",
"enable-chunked-prefill"
],
"topology_constraints": {
"require_tp_dp_product_equals_gpu_count": false,
"require_ep_size_leq_tp_dp_product": true,
"require_ep_size_divides_tp_dp_product": true,
"require_enable_expert_parallel_when_ep_gt_one": true,
"validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
"allowed_tp_dp_products": [
4,
8
],
"allowed_tensor_parallel_sizes": [
4,
8
],
"allowed_data_parallel_sizes": [
1,
2
],
"allowed_expert_parallel_sizes": [
1,
2,
4,
8
]
},
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "thinking_w20260327_1000",
"request_mode": "chat",
"completion_tokens_override": 1,
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 64,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 32768
},
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 180.0,
"early_stop_max_elapsed_s": 1200.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 8191,
"threshold_ms": 2000
},
{
"max_input_tokens": 32767,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
}
},
"search": {
"low": 0.0,
"high": 0.125,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "You are tuning a prefill-dominated vLLM serving stack on the 0~32k input-length bucket. The trace replay forces completion length to exactly 1 token, so optimize for TTFT under the configured stepped SLO. Propose one launch-safe config patch that increases the maximum feasible sampling_u while respecting the topology constraints and avoiding known launch failures.",
"max_history_trials": 8,
"endpoint": {
"provider": "codex",
"model": "gpt-5.4",
"stream": true,
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 240
}
}
}

View File

@@ -0,0 +1,212 @@
{
"study_id": "dash1-qwen235b-prefill-thinking-run5-ttft-1s-2s-0-32k-topology",
"hardware": {
"gpu_count": 8,
"gpu_model": "H20",
"host_candidates": [
"dash1"
]
},
"model": {
"model_id": "qwen3-235b-a22b-256k-0717-internal",
"served_model_name": "qwen3-235b-prefill"
},
"engine": {
"engine_name": "vllm",
"engine_version": "internal-on-dash1",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18146,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 1800,
"request_timeout_s": 1800,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZATION_LAYER_WISE": "1",
"VLLM_MOE_USE_DEEPEP": "0",
"VLLM_MOE_BALANCED_GATING": "0",
"VLLM_MOE_RANDOM_GATING": "0",
"VLLM_FUSED_MOE_CHUNK_SIZE": "4096",
"VLLM_DP_META_USE_CPU_GROUP": "0",
"VLLM_MLA_FP8_ATTENTION": "0",
"VLLM_MOE_EXPERTS_OVERLAP": "0",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "290",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_USE_DEEP_GEMM": "0",
"VLLM_PD_TRY_CONNECT_TIMEOUT_SECONDS": "120",
"VLLM_DEEP_GEMM_WARMUP": "skip",
"DEEPEP_LL_COMBINE_USE_FP8": "1",
"DEEPEP_LL_BUFFER_FP8_OPT": "1",
"DEEPEP_LL_DISPATCH_USE_NVL": "1",
"DEEPEP_LL_COMBINE_USE_NVL": "1",
"ACCL_LOW_LATENCY_OPTIMIZE": "2",
"ACCL_WRITEBATCH_OPT": "2",
"ACCL_IBV_MTU": "9000",
"ACCL_TX_DEPTH": "1024",
"ACCL_RETRANSMIT_TIMEOUT": "17",
"NVSHMEM_IBGDA_NUM_RC_PER_PE": "4",
"BLLM_KVTRANS_RDMA_SP": "2",
"NCCL_SOCKET_IFNAME": "eth1",
"NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME": "eth1",
"GLOO_SOCKET_IFNAME": "eth1"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18146,
"served-model-name": "qwen3-235b-prefill",
"tensor-parallel-size": 4,
"gpu-memory-utilization": 0.85,
"enable-prefix-caching": true,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-hybrid-kv-cache-manager": true,
"max-model-len": 262144,
"block-size": 64,
"max-num-seqs": 64,
"quantization": "fp8",
"cuda-graph-sizes": [
16,
32,
64,
96,
128,
160,
192,
224,
256,
288,
320,
352,
384,
416,
448,
480,
512,
544,
576,
608,
640,
672,
704,
736,
768,
800,
832,
864,
896,
928,
960,
992,
1024
],
"compilation-config": "{\"cudagraph_mode\":\"PIECEWISE\",\"use_inductor\":false,\"custom_ops\":[\"all\"],\"max_cudagraph_capture_size\":2048}",
"speculative-config": "{\"method\":\"eagle3\",\"num_speculative_tokens\":1,\"hf_overrides\":{\"rope_scaling\":{\"type\":\"yarn\",\"factor\":128,\"original_max_position_embeddings\":2048,\"semi_dynamic\":false,\"dynamic\":true},\"num_experts\":0},\"model\":\"/home/admin/resource/model/464482ce.qwen3-235b-a22b/0717-eagle-0820\"}",
"hf-overrides": "{\"architectures\":[\"Qwen3MoeForCausalLM\"],\"model_type\":\"qwen3_moe\"}",
"kv-cache-dtype": "fp8",
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ENABLE_TORCH_COMPILE"
],
"tunable_flags": [
"tensor-parallel-size",
"data-parallel-size",
"enable-expert-parallel",
"expert-parallel-size",
"gpu-memory-utilization",
"max-num-batched-tokens",
"max-num-seqs",
"block-size",
"enable-prefix-caching",
"enable-chunked-prefill"
],
"topology_constraints": {
"require_tp_dp_product_equals_gpu_count": false,
"require_ep_size_leq_tp_dp_product": true,
"require_ep_size_divides_tp_dp_product": true,
"require_enable_expert_parallel_when_ep_gt_one": true,
"validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
"allowed_tp_dp_products": [
4,
8
],
"allowed_tensor_parallel_sizes": [
4,
8
],
"allowed_data_parallel_sizes": [
1,
2
],
"allowed_expert_parallel_sizes": [
1,
2,
4,
8
]
},
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "thinking_w20260327_1000",
"request_mode": "chat",
"completion_tokens_override": 1,
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 64,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 32768
},
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 180.0,
"early_stop_max_elapsed_s": 1200.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 8191,
"threshold_ms": 1000
},
{
"max_input_tokens": 32767,
"threshold_ms": 2000
},
{
"threshold_ms": 2000
}
]
}
},
"search": {
"low": 0.0,
"high": 0.125,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "You are tuning a prefill-dominated vLLM serving stack on the 0~32k input-length bucket. The trace replay forces completion length to exactly 1 token, so optimize for TTFT under the configured stepped SLO. The SLO is stricter than the previous 2s/4s regime: <=8k prompts must hit 1s TTFT and <=32k prompts must hit 2s TTFT. Propose one launch-safe config patch that increases the maximum feasible sampling_u while respecting the topology constraints and avoiding known launch failures.",
"max_history_trials": 8,
"endpoint": {
"provider": "codex",
"model": "gpt-5.4",
"stream": true,
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 240
}
}
}

View File

@@ -0,0 +1,125 @@
# AITuner Harness Summary
## What The Harness Adds
The harness turns each LLM proposal from open-ended config search into a bottleneck-directed decision.
1. Workload profile
- Extracts L-C-A features from the trace window:
- L: prompt length percentiles and tail ratio.
- C: prefix/cache reuse estimates from `hash_ids` when available.
- A: request rate, burst ratio, and interarrival variation.
- These features are injected into the prompt as a structured `Harnesses` section.
2. Trial diagnostics
- Reads recent trial result JSON.
- Summarizes feasible probes, all-infeasible probes, pass rates, request rates, latency percentiles, and failed SLO reason counts.
- Classifies the active bottleneck as `ttft_prefill`, `decode_tpot`, `admission_or_queueing`, `launch_or_memory`, or unknown (a minimal sketch follows this list).
3. Knob-family harnesses
- Maps bottlenecks to a small number of plausible knob families.
- Current harness families:
- `tensor-parallel-size`: long-prompt TTFT/prefill bottlenecks.
- `max-num-batched-tokens`: prefill batching or fragmentation, with trust-region guards.
- `max-num-seqs`: cache-heavy or admission-limited workloads.
- `enable-chunked-prefill`: long-tail prompt blocking.
- `gpu-memory-utilization`: memory headroom after topology and batching are stable.
- Each family has `use_when`, `procedure`, `guards`, and `active_now` fields.
4. Proposal discipline and early stop
- The prompt requires the LLM to choose at most one primary knob family unless history proves a coupled change is needed.
- It must use adjacent legal topology choices and stay inside topology constraints.
- It receives tested config signatures, so it should not repeat already-tried configs.
- A deterministic harness stop can now emit `should_stop=true` before calling the LLM when completed validation evidence says another trial is not justified.
5. Baseline-first loop
- LLM-driven `study tune` now evaluates the initial engine config first unless `--skip-baseline` is passed.
- This aligns the loop with evaluate-then-search: the first LLM proposal sees measured bottleneck evidence rather than guessing from static config.
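A minimal sketch of the bottleneck classification from item 2 above. The labels match the list there and the `probe_elapsed_s>` exclusion matches the implementation notes, but the majority vote and the handling of other reason strings are assumptions, not the actual `harness.py` logic:
```python
from collections import Counter
from typing import Iterable

def classify_bottleneck(failed_reasons: Iterable[str], launch_failed: bool) -> str:
    """Vote a single bottleneck label from per-request SLO failure reasons."""
    if launch_failed:
        return "launch_or_memory"
    votes: Counter = Counter()
    for reason in failed_reasons:
        if reason.startswith("ttft_ms>"):
            votes["ttft_prefill"] += 1
        elif reason.startswith("tpot_ms>"):
            votes["decode_tpot"] += 1
        elif not reason.startswith("probe_elapsed_s>"):
            # Probe-budget markers are ignored for voting; any other reason is
            # treated here as a queueing/admission signal for illustration.
            votes["admission_or_queueing"] += 1
    return votes.most_common(1)[0][0] if votes else "unknown"

# TTFT-dominated failures classify as a prefill bottleneck, so the TP and
# max-num-batched-tokens families activate rather than generic queueing.
print(classify_bottleneck(["ttft_ms>2000.0", "ttft_ms>2000.0", "tpot_ms>50.0"],
                          launch_failed=False))
```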
## What Accelerates Convergence
The speedup comes from reducing wasted proposal families, not from changing the benchmark metric.
1. Topology-before-runtime on prefill bottlenecks
- For long-prompt, low-cache-reuse windows, the harness activates the TP harness before speculative runtime knobs.
- Example: qwen27b 0-8k chat reached `TP=2, DP=1` at iter 2 under harness replay, while the original run spent iter 2 on `DP=2` and iter 3 on `DP=4`.
2. Guarded stop after validation, not immediately after a strong incumbent
- If the newest trial is the incumbent and improves per-GPU throughput by at least `1.8x` over baseline, the harness requires direct evidence before trying runtime-only tweaks.
- It does not stop at the first large gain. It requires post-incumbent validation trials across nearby topology/runtime families, and stops only if those trials fail to produce a feasible per-GPU improvement.
- With the guard, `study tune` can write a `harness-stop-XXXX` proposal and exit without spending another GPU trial.
3. All-infeasible plateau detection
- When recent all-infeasible trials at the same sampling threshold stop improving pass rate and p95 TTFT, the harness blocks repeating the same primary knob family.
- This prevents continuing a direction such as DP-only scale-out after DP4 and DP8 plateau.
- Plateau alone does not trigger deterministic early stop; it forces either a different justified family or a later validation/convergence stop.
4. Cleaner early-stop handling
- Early-stopped probes no longer leave in-flight requests polluting the next probe.
- Default behavior drains in-flight requests for comparable production runs.
- Engine relaunch after early stop is available as opt-in for faster smoke studies, but it is not the default because it can change warm-state comparability.
5. Search-high saturation stop
- If the incumbent's highest measured probe is feasible and is within the configured binary-search resolution of `search.high`, the harness stops before asking the LLM for another proposal (see the sketch after this list). Individual request failures may still be present as long as the aggregate probe meets the configured pass-rate SLO.
- This is not a model-specific threshold. It means the workload search range, not the engine config, is currently the limiting measurement bound.
6. Deterministic first probes
- After the baseline trial shows a latency bottleneck, the harness can propose the adjacent legal TP increase before asking the LLM.
- After a TP incumbent improves per-GPU throughput, the harness keeps that topology and applies a same-topology runtime seed before trying DP/EP or broad runtime changes.
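A minimal sketch of the saturation check from item 5 above; the tolerance semantics follow the `search.low/high/tolerance` fields in the example specs, but the helper itself is illustrative:
```python
def saturates_search_high(best_feasible_u: float, search_high: float,
                          tolerance: float) -> bool:
    """True when the incumbent's best feasible probe sits within one
    binary-search resolution step of the configured search ceiling."""
    return best_feasible_u >= search_high - tolerance

# With search.high=0.125 and tolerance=0.001, a feasible probe at 0.1245
# saturates the range: the workload search bound, not the engine config,
# is the limiting measurement bound.
print(saturates_search_high(0.1245, search_high=0.125, tolerance=0.001))
```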
## qwen27b 0-8k Evidence
Source: `docs/qwen27b-chat-0-8k-harness-fig18.md`.
Metric: best-so-far feasible `request_rate_per_gpu`.
| Variant | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5-12 |
| --- | ---: | ---: | ---: | ---: | ---: |
| Before harness | 0.0350 | 0.0617 | 0.0617 | 0.2025 | 0.2025 |
| After harness strict replay | 0.0350 | 0.2025 | 0.2025 stop | 0.2025 | 0.2025 |
Result:
- Before harness reached the best value at iter 4.
- After harness reached the same value at iter 2 and stopped at iter 3.
- Iterations-to-best improved from `4` to `2`, a `2x` convergence speedup in this case.
- The harness also avoided eight post-best infeasible runtime-only probes.
## Current Risks
- The harness is still prompt-guided for choosing the next non-stop proposal. The deterministic stop path is hard-coded in `study tune`, but proposal-family blocking is not yet enforced by a separate validator.
- Strong-incumbent stopping is intentionally biased toward fewer GPU trials after validation evidence accumulates. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
- Full fresh reruns on large models are expensive. Strict replay is useful for measuring proposal-path improvements when the proposed configs already exist in prior measured runs, but publication-quality claims still need fresh no-relaunch runs when time allows.
## Qwen3-30B-A3B Community vLLM Evidence
Source: `docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md`.
Metric: best-so-far feasible `request_rate_per_gpu` on the bounded 0-8k chat replay with 128 output tokens and `replay_time_scale=0.1`.
Initial `search.high=0.125` result:
| Variant | Iter 1 | Iter 2 | Iter 3-12 |
| --- | ---: | ---: | ---: |
| no-harness | 1.0333 | 1.0333 | 1.0333 |
| harness | 1.0333 | 1.0333 stop | 1.0333 |
Result:
- Both variants found the same best measured config: the default community vLLM launch.
- Harness stopped at iter 2 because the incumbent saturated `search.high`; no LLM proposal or GPU trial was needed after baseline.
- No-harness spent the full 12-iteration budget: iter 2 was worse per GPU, and iter 3-12 were launch failures.
- This was a measurement-ceiling result, not proof of global optimality.
High `search.high=1.0` rerun:
| Variant | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8-12 |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| no-harness | 2.2000 | 3.2583 | 3.2583 | 3.2583 | 3.2583 | 3.3000 | 3.3500 | 3.3500 |
| harness-guided-v2 | 2.3833 | 3.2583 | 3.2833 | 3.3000 | 3.3000 stop | 3.3000 | 3.3000 | 3.3000 |
Result:
- Raising `search.high` showed that default vLLM was not actually optimal; the prior run was capped by workload range.
- Harness reached the same TP2/runtime config family in 4 iterations instead of 7 because its first TP probe and the follow-up same-topology runtime proposals were made deterministically rather than left to the LLM.
- The single-run best value differs by about 1.5% (`3.3000` vs `3.3500`) for the same config family, so this should be interpreted as faster convergence to the same region, not an exact single-run throughput win.

View File

@@ -0,0 +1,129 @@
# Harness-Guided AITuner Progress
## Goal
Improve AITuner convergence for the `dash0` internal vLLM + Qwen3.5-27B 0-8k chat study. The prior 12-iteration run can still propose worse configs after finding good ones. The new harness should make config proposals bottleneck-directed and stop spending GPU trials once no adjacent harness-guided probe is justified.
## Paper Alignment
- Prompt structure now includes an explicit `[Harnesses]` section aligned with paper Figure 12.
- The harness uses the paper's L-C-A workload model (a feature-extraction sketch follows the knob-rule list below):
- L: prompt length percentiles and tail ratio.
- C: prefix/KV-cache reuse estimated from repeated `hash_ids` blocks when available.
- A: request rate, 1-second QPS burst ratio, and interarrival CV.
- Knob rules follow the paper's Figure 13 style:
- map active bottleneck to a knob family;
- probe adjacent legal choices;
- enforce guard conditions to avoid harmful side effects;
- prefer stopping over weak exploratory proposals after convergence.
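A compact sketch of the L-C-A feature extraction used in the workload model above, assuming each trace request carries `input_tokens`, `timestamp`, and optional `hash_ids`; the field names and the returned shape are illustrative, not the actual `summarize_window` output:
```python
import statistics
from dataclasses import dataclass

@dataclass
class LCAProfile:
    p50_tokens: float            # L: prompt length percentiles
    p95_tokens: float
    tail_ratio: float            # L: p95 / p50 prompt length
    repeated_block_ratio: float  # C: share of hash_ids blocks seen before
    request_rate: float          # A: mean requests per second
    burst_ratio: float           # A: p95 1-second QPS / mean QPS

def lca_profile(requests: list[dict], window_s: float) -> LCAProfile:
    lengths = sorted(r["input_tokens"] for r in requests)
    cuts = statistics.quantiles(lengths, n=100)
    p50, p95 = cuts[49], cuts[94]
    # C: estimate prefix/KV-cache reuse from repeated hash_ids blocks.
    seen, repeated, total = set(), 0, 0
    for r in requests:
        for block in r.get("hash_ids", []):
            total += 1
            repeated += block in seen
            seen.add(block)
    # A: arrival profile from 1-second request buckets.
    per_second: dict[int, int] = {}
    for r in requests:
        second = int(r["timestamp"])
        per_second[second] = per_second.get(second, 0) + 1
    qps = sorted(per_second.values())
    mean_qps = len(requests) / window_s
    p95_qps = qps[int(0.95 * (len(qps) - 1))]
    return LCAProfile(p50, p95, p95 / p50,
                      repeated / total if total else 0.0,
                      mean_qps, p95_qps / mean_qps)
```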
## Local Implementation Log
- Added `src/aituner/harness.py`.
- Builds structured harness context for prompt injection.
- Adds TP, max-num-seqs, max-num-batched-tokens, chunked-prefill, and memory-utilization harnesses when those knobs are tunable.
- Extracts compact recent trial diagnostics from result JSON files.
- Adds a convergence guard based on recent completed trial performance.
- Adds an infeasible-progress guard: when recent all-infeasible trials at the same sampling threshold stop improving pass rate and p95 TTFT after changing one knob family, the next proposal must switch primary family or stop (a minimal sketch of this guard follows this log).
- Classifies `slo_pass_rate_unrecoverable` by latency failure counts first, and ignores probe-budget markers such as `probe_elapsed_s>` for bottleneck voting, so TTFT-heavy failures stay aligned to prefill/TP or batching harnesses instead of being treated as generic queueing.
- Extended `src/aituner/trace.py`.
- `summarize_window` now reports L-C-A features.
- `TraceRequest` now carries optional metadata for `hash_ids`, turn, parent chat id, and trace type.
- Extended `src/aituner/llm.py`.
- Prompt now includes tested config signatures and the structured harness section.
- Prompt schema now asks for `should_stop`.
- Extended `src/aituner/spec.py`.
- `Proposal` accepts optional `should_stop`.
- Extended `src/aituner/cli.py`.
- `study tune` honors `should_stop=true` by recording the proposal and not launching another GPU trial.
- Extended `tests/test_core_flow.py`.
- Prompt includes harness context.
- Trace summary includes new L-C-A fields.
- Proposal parsing accepts `should_stop`.
- CLI does not launch a trial for a stop proposal.
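A minimal sketch of the all-infeasible plateau guard referenced in the `harness.py` notes above; the trial record shape and the 2% relative-improvement threshold are assumptions, not the actual guard:
```python
from dataclasses import dataclass

@dataclass
class InfeasibleTrial:
    primary_family: str   # e.g. "data-parallel-size"
    sampling_u: float     # threshold at which the probe was scored
    pass_rate: float
    p95_ttft_ms: float

def plateaued(prev: InfeasibleTrial, curr: InfeasibleTrial,
              min_rel_improvement: float = 0.02) -> bool:
    """True when two all-infeasible trials at the same sampling threshold and
    knob family show no material pass-rate or p95-TTFT improvement, so the
    next proposal must switch primary family or stop."""
    if prev.primary_family != curr.primary_family:
        return False
    if abs(prev.sampling_u - curr.sampling_u) > 1e-9:
        return False
    pass_gain = (curr.pass_rate - prev.pass_rate) / max(prev.pass_rate, 1e-9)
    ttft_gain = (prev.p95_ttft_ms - curr.p95_ttft_ms) / max(prev.p95_ttft_ms, 1e-9)
    return pass_gain < min_rel_improvement and ttft_gain < min_rel_improvement

# Smoke v2 example: DP4 -> DP8 at sampling_u=0.0078125 kept pass rate 0.345 and
# p95 TTFT around 3820 ms, so the DP family is blocked for the next proposal.
dp4 = InfeasibleTrial("data-parallel-size", 0.0078125, 0.345, 3818.4)
dp8 = InfeasibleTrial("data-parallel-size", 0.0078125, 0.345, 3823.4)
print(plateaued(dp4, dp8))  # True
```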
## Local Verification
- `python3 -m compileall -q src tests`: passed.
- `PYTHONPATH=src python3 -m unittest tests.test_core_flow`: passed, 62 tests.
- `pytest -q` and `python3 -m pytest -q`: not runnable locally because `pytest` is not installed.
## Remote Experiment Log
### 2026-04-25 16:30-16:45 CST
- Pushed commit `2c5e9af` to `origin/main` and pulled it on `dash0`.
- Remote prompt check command:
- `PYTHONPATH=src python3 -m aituner.cli study prompt --study-root /tmp/aituner-harness-prompt-check/dash0-qwen27b-tight-slo-10min-run4-chat-0-8k --store-root /tmp/aituner-harness-prompt-check --prompt-name harness-check`
- Harness profile for `chat_w20260311_1000`, after applying the 0-8k filter:
- L: p50 1992 tokens, p95 7628, p99 8102, tail ratio 3.83, regime `moderate_tail_prefill_sensitive`.
- C: repeated token ratio estimate 0.191, repeated block ratio 0.189, multi-turn ratio 0.160, regime `low_prefix_reuse`.
- A: request rate 29.52 req/s, p95 1s QPS 40, burst ratio 1.36, regime `smooth`.
- Active harnesses: `tensor-parallel-size` and `max-num-batched-tokens`, which matches a TTFT/prefill-sensitive 0-8k chat workload.
- Remote `compileall` passed.
- Remote `unittest discover` initially exposed two pre-existing path-sensitive tests that hardcoded `/home/gahow/phd/aituner`; fixed them to derive `REPO_ROOT` from the test file path.
### 2026-04-25 16:38-16:58 CST
- Started real run in tmux session `aituner_harness_qwen27b_0_8k_20260425`.
- Store root: `.aituner/harness-studies-20260425`.
- First proposal followed the harness:
- proposal: `tensor-parallel-size: 2`;
- rationale: L profile is prefill-sensitive, prefix reuse is low, arrivals are smooth, so probe adjacent TP before runtime batching knobs.
- First high-load probe at `sampling_u=0.03125` was infeasible:
- request rate 0.895 req/s;
- pass rate 0.145;
- p95 TTFT 4063 ms and p95 TPOT 113 ms;
- failed reasons included `tpot_ms>50.0` and `slo_pass_rate_unrecoverable`.
- Important implementation issue found: after an early-stopped probe, the worker returned while in-flight HTTP requests could still occupy the engine, which could stall or pollute the next binary-search probe.
- Action: stopped the run and freed GPUs. Updating `worker._replay_requests` to drain in-flight requests after early stop before the next probe starts.
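A minimal asyncio sketch of the intended drain behavior, assuming in-flight requests are tracked as tasks; this is illustrative, not the actual `worker._replay_requests` code:
```python
import asyncio

async def replay_with_drain(send_request, schedule, should_early_stop):
    """Replay scheduled requests; on early stop, drain in-flight requests
    instead of returning immediately, so the next probe sees an idle engine."""
    tasks: list[asyncio.Task] = []
    for request in schedule:
        if should_early_stop():
            break  # stop issuing new requests, but do not abandon issued ones
        tasks.append(asyncio.create_task(send_request(request)))
    # Drain: wait for every already-issued request before the probe returns, so
    # lingering HTTP requests cannot occupy the engine during the next probe.
    return await asyncio.gather(*tasks, return_exceptions=True)
```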
### 2026-04-25 17:00-17:12 CST
- r2 confirmed that draining avoids immediate cross-probe pollution, but the first LLM trial still started from a speculative TP=2 edit without a measured incumbent.
- This is not aligned with the paper's agentic loop, which evaluates the initial configuration first and then searches from measured feedback.
- Action: update `study tune` so LLM-driven studies automatically materialize a baseline empty-patch trial first, unless `--skip-baseline` is passed. This should reduce early bad proposals because the first LLM edit will see real baseline bottleneck diagnostics and an incumbent request_rate_per_gpu.
### 2026-04-25 17:20-18:30 CST
- r3 started with baseline-first enabled, but the full 0-8k run was too slow for fast iteration with raw chat completions. Stopped it before using it as a convergence signal.
- A fast validation using `max_requests_per_probe=160` was invalid: the trace is downsampled before threshold selection, so lower thresholds can end up with `request_count=0`. Do not use that result for performance claims.
- Prefill smoke v1 used `completion_tokens_override=1` but kept the TPOT SLO. Single-token completions produce no TPOT measurement, so missing-TPOT failures dominated; the run was useful only for checking control flow, not for performance.
### 2026-04-25 18:30-20:10 CST
- Prefill smoke v2 used real dash0 internal vLLM, Qwen3.5-27B, the real 0-8k prompt distribution and arrivals, `completion_tokens_override=1`, and `tpot_rule=null`.
- Trial 0001 baseline TP1/DP1:
- sampling `0.0078125`: pass rate 0.270, mean TTFT 2033.9 ms, p95 TTFT 5656.7 ms, p99 TTFT 6832.8 ms.
- Trial 0002 TP1/DP2:
- sampling `0.0078125`: pass rate 0.277, mean TTFT 1766.9 ms, p95 TTFT 4215.3 ms, p99 TTFT 5801.7 ms.
- Trial 0003 TP1/DP4:
- sampling `0.0078125`: pass rate 0.345, mean TTFT 1668.9 ms, p95 TTFT 3818.4 ms, p99 TTFT 5804.9 ms.
- Trial 0004 TP1/DP8:
- sampling `0.0078125`: pass rate 0.345, mean TTFT 1675.7 ms, p95 TTFT 3823.4 ms.
- Interpretation:
- The harness improved directionality: after the measured baseline, proposals followed a consistent scale-out path and avoided random runtime-knob churn.
- The smoke result improved p95 TTFT by about 32% versus baseline at the low sampling threshold and improved pass rate from 0.270 to 0.345 within 3-4 trials.
- It did not reach the 95% pass-rate SLO in this smoke setting, so this is not a full proof of convergence to a good production config.
- DP8 did not improve over DP4, which exposed a gap: when every trial is infeasible, the prior convergence guard had no feasible incumbent and could not detect plateau.
### 2026-04-25 20:10 CST
- Added the all-infeasible plateau guard described above.
- Added unit coverage for:
- TTFT failure classification under `slo_pass_rate_unrecoverable`;
- blocking a repeat of the DP family after DP4 and DP8 show no material improvement at the same sampling threshold.
- Pulled the commit on `dash0` and reran remote verification:
- `python3 -m compileall -q src tests`: passed.
- `PYTHONPATH=src python3 -m unittest discover -s tests -p "test_*.py"`: passed, 62 tests.
- Regenerated a prompt against the real smoke v2 history:
- `convergence_guard.reason`: `data-parallel-size_plateau_on_infeasible_trials`.
- `should_stop_if_no_harness_can_justify_a_new_adjacent_probe`: `true`.
- blocked primary family: `data-parallel-size`.
- latest two active bottlenecks after ignoring `probe_elapsed_s>` for voting: `ttft_prefill`, `ttft_prefill`.
- Current status: the harness now has the mechanism needed to avoid continuing the exact DP-only direction seen in the smoke v2 plateau. The next real experiment should either switch to a bottleneck-justified mixed TP/DP candidate or return `should_stop=true`.
Remaining next steps:
1. Use the Fig18-style qwen27b 0-8k comparison in `docs/qwen27b-chat-0-8k-harness-fig18.md` as the current convergence evidence.
2. If a future full no-relaunch rerun is required for publication-quality reproduction, reserve a multi-hour dash0 window; the comparable full-chat evaluator keeps drain-based probe isolation and is much slower than prefill smoke.

View File

@@ -0,0 +1,85 @@
# qwen235b-thinking-decode-0323
qwen3-235b-a22b `thinking` trace, `decode_only` mode, internal vLLM (`/usr/local/bin/vllm`), tuned on `thinking_w20260323_1000` with `TPOT <= 40ms`.
## Setup
- Hardware: `dash2`, `8x H20`
- Model: `/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717`
- Engine: internal vLLM, decode-only mode with `--kv-transfer-config {"kv_connector":"DecodeBenchConnector","kv_role":"kv_both"}`
- Baseline topology: `TP=4, DP=2, EP=8`
- Trace: `thinking_w20260323_1000`
- Trace source: `trace_windows/traces/thinking_w20260323_1000.jsonl`
- Window duration: `600s` (`10:00-10:10`, `2026-03-23`)
- Request mode: `decode_only`
- SLO:
- pass target: `95%`
- `TPOT <= 40ms`
- `TTFT` not enforced
- Search:
- `sampling_u in [0, 0.125]`
- `max_probes = 6`
- `12` trials total
- Proposal model: `codex / gpt-5.4`
## Run assets
- Study root: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-decode/dash2-qwen235b-decode-thinking-run1-0323-tpot40-topology`
- State: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-decode/dash2-qwen235b-decode-thinking-run1-0323-tpot40-topology/state.json`
- Spec: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-decode/specs/dash2_qwen235b_decode_thinking_run1_0323_tpot40_topology.json`
## Best result
- Best trial: `trial-0007`
- Best config delta:
- `gpu-memory-utilization=0.86`
- Best topology stayed unchanged:
- `tensor-parallel-size=4`
- `data-parallel-size=2`
- `expert-parallel-size=8`
- Best `sampling_u`: `0.033736228943`
- Best request rate: `0.48333333333333334 req/s`
- Best request rate per GPU: `0.06041666666666667 req/s/gpu`
- Best pass rate: `0.9551724137931035`
Compared with baseline:
- `trial-0001`: `0.43333333333333335 req/s`, `0.05416666666666667 req/s/gpu`
- `trial-0007`: `0.48333333333333334 req/s`, `0.06041666666666667 req/s/gpu`
- Throughput gain: `1.12x`
Best-point latency:
- `TPOT mean/p50/p90/p95/p99 = 26.18 / 24.04 / 38.46 / 39.55 / 40.76 ms`
## 12-trial summary
| Trial | Proposed config delta | Result |
| --- | --- | --- |
| `trial-0001` | baseline `TP4/DP2/EP8` | `0.4333 req/s`, feasible |
| `trial-0002` | `EP=4` | launch fail |
| `trial-0003` | `gpu-memory-utilization=0.8`, `max-num-seqs=224` | infeasible |
| `trial-0004` | `gpu-memory-utilization=0.8` | infeasible |
| `trial-0005` | `gpu-memory-utilization=0.82` | `0.4650 req/s`, feasible |
| `trial-0006` | `gpu-memory-utilization=0.84` | infeasible |
| `trial-0007` | `gpu-memory-utilization=0.86` | `0.4833 req/s`, feasible, best |
| `trial-0008` | `gpu-memory-utilization=0.87` | infeasible |
| `trial-0009` | `gpu-memory-utilization=0.86`, `block-size=32` | launch fail |
| `trial-0010` | `gpu-memory-utilization=0.86`, `max-num-seqs=208` | infeasible |
| `trial-0011` | `gpu-memory-utilization=0.86`, `max-num-seqs=176` | infeasible |
| `trial-0012` | `gpu-memory-utilization=0.86`, `max-num-batched-tokens=896` | infeasible |
## Key insights
- This `0323` window did not produce a better topology than the baseline. The winning move was a small memory-headroom increase, not a TP/DP/EP change.
- `EP=4` was not viable under the current deployment shape and failed at launch, so the run quickly converged away from topology changes.
- The best point is very close to the SLO edge: `TPOT p95 ~= 39.55ms`, so the remaining headroom is small.
- Compared with the heavier `0327` decode-only tuning, `0323` is a milder window: baseline is already strong, and tuning only adds about `11.5%`.
## Recommendation
For a `0323`-like decode-only `thinking` window, keep the baseline `TP4/DP2/EP8` topology and use:
- `gpu-memory-utilization=0.86`
Do not treat this run as evidence that `0323` prefers the same topology changes as `0327`; this study mainly supports a small residency-headroom refinement.

View File

@@ -77,6 +77,7 @@ Best-point latency:
- `TP1/DP8/EP8` launched, but did not beat `TP2/DP4/EP8`.
- `EP4` under `TP2/DP4` failed at launch and should be treated as negative evidence for this stack.
- After topology settled at `TP2/DP4/EP8`, the useful runtime refinement was tighter decode batching: `max-num-seqs=128`, `max-num-batched-tokens=256`.
- Harness mechanism and ablation notes are in `one-shot-mechanism-ablation-20260502.md`.
## Current recommendation

View File

@@ -0,0 +1,129 @@
# Qwen235B Thinking Decode-Only Harness Run, 2026-04-28
## Goal
Run the qwen235b thinking decode-only tuning with the same harness-guided workflow used for the prefill-only test, while keeping the harness generic. The harness must use workload mode, configured SLOs, legal topology constraints, and measured trial history rather than testcase-specific throughput thresholds.
## Baseline Reference
The before-harness comparison run is `dash0-qwen235b-decode-thinking-run5-tpot40-topology`:
| Iter | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| before harness request/s | 0.1267 | 0.2450 | infeasible | launch fail | infeasible | infeasible | infeasible | infeasible | 0.2817 | infeasible | infeasible | infeasible |
Before harness, the best feasible config appeared at iter 9 with 0.2817 request/s.
## Harness Change
The decode-only harness now defaults to `decode_tpot` when `trace.request_mode=decode_only` and a TPOT SLO is configured. This avoids treating long decode-only prompt hints as a TTFT-prefill workload.
Active decode harness families are generic:
- `tensor-parallel-size`: legal TP/DP redistribution, judged by configured SLO pass rate and request_rate_per_gpu.
- `data-parallel-size`: legal replica topology changes for decode/admission bottlenecks.
- `max-num-seqs`: concurrency adjustment from observed TPOT failures or SLO headroom.
- `max-num-batched-tokens`: decode batching adjustment after topology is stable.
- `expert-parallel`: preserve known-valid EP topology, but change EP size only with EP-specific evidence.
No qwen235b-specific threshold or testcase-specific rule was added.
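A minimal sketch of the decode-mode default and family activation described above; the helper name and fallback order are assumptions, not the actual harness code:
```python
def default_bottleneck(request_mode: str, has_tpot_slo: bool,
                       has_ttft_slo: bool) -> str:
    """Starting bottleneck hypothesis before any trial evidence is available.

    Decode-only replays with a TPOT SLO default to decode_tpot so that long
    decode-only prompt hints are not misread as a TTFT-prefill workload."""
    if request_mode == "decode_only" and has_tpot_slo:
        return "decode_tpot"
    if has_ttft_slo:
        return "ttft_prefill"
    return "unknown"

# The qwen235b decode study (request_mode=decode_only, TPOT<=40ms, no TTFT rule)
# therefore starts from decode_tpot and activates decode-oriented families first.
print(default_bottleneck("decode_only", has_tpot_slo=True, has_ttft_slo=False))
```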
## Current Run
Started on dash0, 8x H20.
- Remote spec: `.aituner/harness-qwen235b-decode-20260428/dash0_qwen235b_decode_thinking_harness_20260428.json`
- Remote store: `.aituner/harness-qwen235b-decode-20260428/dash0-qwen235b-decode-thinking-harness-20260428`
- Remote tmux: `aituner_qwen235b_decode_harness_20260428`
- Remote log: `logs/qwen235b_decode_harness_20260428.log`
- Code commit: `39aa47f`
- Verification: local and dash0 both passed `PYTHONPATH=src python3 -m unittest discover -s tests`.
The first attempt started a duplicate `trial-0001` baseline. Because the identical baseline was already measured in run5 and the decode probe can run for many minutes, that duplicate run was stopped and GPUs were freed.
The active run is now seeded from the real run5 baseline and continues from `trial-0002`:
- Remote spec: `.aituner/harness-qwen235b-decode-20260428-seeded/dash0_qwen235b_decode_thinking_harness_seeded_20260428.json`
- Remote store: `.aituner/harness-qwen235b-decode-20260428-seeded/dash0-qwen235b-decode-thinking-harness-seeded-20260428`
- Seeded `trial-0001`: 0.1267 request/s, 0.0158 request/s/GPU, pass rate 0.9868.
- `proposal-0002`: legal adjacent decode topology move from `TP4/DP2/EP8` to `TP2/DP4/EP8`; no EP-size search and no testcase threshold.
- `trial-0002`: completed, 0.3767 request/s, 0.0471 request/s/GPU, pass rate 0.9779.
- `trial-0003`: completed with no feasible point for `TP1/DP8/EP8`.
- `trial-0004`: completed with no feasible point for `max-num-seqs=160`.
- Important caveat: `trial-0004` did not actually validate `TP2/DP4/EP8 + max-num-seqs=160`. AITuner applies `config_patch` relative to the study base config, and the proposal only patched `max-num-seqs`. The actual launch therefore used the base topology `TP4/DP2/EP8 + max-num-seqs=160`, so this is not evidence that same-topology refinement around `trial-0002` is exhausted.
- `trial-0005`: corrected same-topology validation, `TP2/DP4/EP8 + max-num-seqs=160`; completed with no feasible point.
The `trial-0002` proposal matches the first useful topology direction from the earlier before-harness run, but the new harness-controlled run measured substantially better throughput for that topology.
## Result Judgment
Fig-18-style raw throughput table:
| Run | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | ---: | ---: | --- | --- | --- | --- | --- | --- | ---: | --- | --- | --- |
| before harness request/s | 0.1267 | 0.2450 | infeasible | launch fail | infeasible | infeasible | infeasible | infeasible | 0.2817 | infeasible | infeasible | infeasible |
| harness request/s | 0.1267 | 0.3767 | infeasible | infeasible | infeasible | not run | not run | not run | not run | not run | not run | not run |
Per-GPU throughput table:
| Run | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | ---: | ---: | --- | --- | --- | --- | --- | --- | ---: | --- | --- | --- |
| before harness req/s/GPU | 0.0158 | 0.0306 | infeasible | launch fail | infeasible | infeasible | infeasible | infeasible | 0.0352 | infeasible | infeasible | infeasible |
| harness req/s/GPU | 0.0158 | 0.0471 | infeasible | infeasible | infeasible | not run | not run | not run | not run | not run | not run | not run |
Decision: the harness accelerated convergence on qwen235b decode-only, but this is not a proof of global optimality after one proposal. The before-harness run first reached its best observed throughput at iter 9 with 0.2817 request/s. The harness run exceeded that value at iter 2 with 0.3767 request/s, a 1.34x improvement over the before-harness 12-iter best and a 2.97x improvement over the baseline config.
The harness did not stop cleanly after finding the strong incumbent. It spent one additional trial on `TP1/DP8/EP8`, which found no feasible point. The next proposal intended same-topology runtime validation, but omitted the incumbent topology fields, so the materialized trial validated the base topology instead. This issue was corrected with `trial-0005`.
Important interpretation: `trial-0002` should be called the current best observed config, not a global optimum proof. The harness got there quickly because the decode-only harness biases the first proposal toward the most relevant adjacent topology redistribution, `TP4/DP2/EP8 -> TP2/DP4/EP8`, instead of spending trials on prefill-oriented runtime knobs. Later validation now supports local optimality against the tested adjacent topology and the tested same-topology `max-num-seqs=160` runtime refinement.
Follow-up implementation after this result:
- `strong_incumbent.guard_active` no longer directly contributes to `should_stop_if_no_harness_can_justify_a_new_adjacent_probe`.
- A strong incumbent now means "enter validation phase": run adjacent topology or same-topology runtime probes that could falsify the incumbent.
- The proposal rules now explicitly say not to stop solely because a strong incumbent appeared.
- Proposal parsing now accepts structured `observation`/`diagnosis` by converting them to text, so a usable validation proposal is not dropped only because the LLM used an object instead of a string.
After the implementation fix, the previously rejected `proposal-0004` was resumed as a validation trial:
- `trial-0004`: intended same-topology validation with `max-num-seqs=160`, but actually ran on base topology because the proposal omitted `TP2/DP4/EP8`.
- Remote tmux: `aituner_qwen235b_decode_harness_validate_20260428`.
- Result: completed with no feasible point. This is useful negative evidence for the base topology plus `max-num-seqs=160`, but not for the `trial-0002` incumbent topology.
A second validation trial was then launched with the full incumbent topology in the patch:
- `trial-0005` config: `TP2/DP4/EP8 + max-num-seqs=160`.
- Search range: low `0.017028808593`, high `0.125`, tolerance `0.001`, max probes `6`.
- Result: completed with no feasible point; `trial-0002` remained the best trial.
- Probe outcomes:
| Probe sampling_u | Request/s | Pass rate | Feasible | Early-stop reason |
| ---: | ---: | ---: | --- | --- |
| 0.0710144 | 1.7800 | 0.2818 | no | `slo_pass_rate_unrecoverable` |
| 0.0440216 | 1.0900 | 0.1789 | no | `slo_pass_rate_unrecoverable` |
| 0.0305252 | 0.7050 | 0.3002 | no | `slo_pass_rate_unrecoverable` |
| 0.0237770 | 0.5417 | 0.4092 | no | `slo_pass_rate_unrecoverable` |
| 0.0204029 | 0.4533 | 0.4890 | no | `slo_pass_rate_unrecoverable` |
| 0.0187159 | 0.4117 | 0.5466 | no | `slo_pass_rate_unrecoverable` |
This directly answers the one-iter-to-best concern for this refinement: the harness did not stop after `trial-0002`; it ran a corrected same-topology validation, and every tested point above the incumbent search floor failed the 95% TPOT SLO. Therefore `max-num-seqs=160` does not falsify `trial-0002` as the current best.
## Follow-up Fix
The seeded prompt exposed a generic diagnosis issue: if the best feasible probe had no latency failures, the harness could miss the prior infeasible probe that showed the real bottleneck at higher load. The harness now scans the probe sequence backward and uses the nearest non-trivial bottleneck before falling back to the best feasible probe. This keeps decode-only runs focused on `decode_tpot` after a feasible low-load point, without adding testcase thresholds.
A second generic diagnosis bug was fixed: non-SLO bookkeeping counts such as `probe_elapsed_s>...` no longer collapse to `ttft_prefill` when TTFT/TPOT/request failure counts are all zero.
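A minimal sketch of both fixes, with illustrative probe records rather than the real harness schema:
```python
# Illustrative sketch of the two diagnosis fixes above; field names are
# assumptions, not the actual AITuner probe schema.

BOOKKEEPING_PREFIXES = ("probe_elapsed_s>",)  # non-SLO counters to ignore

def slo_failures(probe: dict) -> dict:
    """Keep only real SLO failure counts (TTFT/TPOT/request errors)."""
    return {k: v for k, v in probe.get("failed_reasons", {}).items()
            if v and not k.startswith(BOOKKEEPING_PREFIXES)}

def diagnose_bottleneck(probes: list[dict], default: str) -> str:
    """Scan probes from the highest load backward and use the nearest probe
    that shows a real SLO bottleneck; otherwise fall back to the default."""
    for probe in reversed(probes):
        failures = slo_failures(probe)
        if failures:
            return max(failures, key=failures.get)
    return default

probes = [
    {"feasible": True, "failed_reasons": {}},  # low-load feasible probe, no failures
    {"feasible": False, "failed_reasons": {"decode_tpot": 412,
                                           "probe_elapsed_s>900": 1}},  # real bottleneck
]
print(diagnose_bottleneck(probes, default="decode_tpot"))  # -> decode_tpot
```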
## Follow-up Fix, 2026-04-30
The base-relative patch issue is now guarded in code, not only in the LLM prompt. When `StudyStore.materialize_trial` sees a runtime/env-only proposal after a non-base incumbent has been found, it inherits the incumbent topology patch into the trial spec unless the proposal explicitly provides a topology. This keeps same-topology runtime validation on the actual incumbent while preserving the ability to test the base topology by stating it explicitly.
Local verification at that commit: `PYTHONPATH=src python3 -m unittest discover -s tests` passed. The current repository suite has since grown; rerun the command rather than relying on this historical test count.
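A minimal sketch of that inheritance rule; names and shapes are illustrative, not the actual `StudyStore` code:
```python
# Sketch of the base-relative patch guard described above. The flag names
# mirror the docs; the function itself is an assumption, not StudyStore code.

TOPOLOGY_FLAGS = {"tensor-parallel-size", "data-parallel-size", "expert-parallel-size"}

def materialize_patch(proposal_patch: dict, incumbent_topology: dict | None) -> dict:
    """Inherit the incumbent topology for runtime/env-only proposals."""
    flags = dict(proposal_patch.get("flag_patch", {}))
    proposes_topology = bool(TOPOLOGY_FLAGS & flags.keys())
    if incumbent_topology and not proposes_topology:
        # Runtime-only proposal after a non-base incumbent: validate it on the
        # incumbent topology, not on the study base topology.
        flags = {**incumbent_topology, **flags}
    return {"flag_patch": flags}

incumbent = {"tensor-parallel-size": 2, "data-parallel-size": 4, "expert-parallel-size": 8}
print(materialize_patch({"flag_patch": {"max-num-seqs": 160}}, incumbent))
# -> TP2/DP4/EP8 plus max-num-seqs=160, matching the corrected trial-0005
```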
## Current Harness Judgment
For qwen235b decode-only, the harness still accelerates convergence: before harness, the best observed 12-iter result appeared at iter 9 with 0.2817 request/s; with harness, iter 2 reached 0.3767 request/s and later validation did not find a better adjacent or same-topology runtime point.
The remaining optimization is validation cost, not convergence quality. `trial-0005` took a long time because early-stopped decode-only probes still had to wait for in-flight long-output requests unless the engine is restarted after early stop. As of 2026-05-02, decode-only studies default to `trace.restart_engine_after_early_stop=true` when the field is not explicitly set, and the qwen235b decode examples set it explicitly.
See `docs/qwen235b-thinking-decode/one-shot-mechanism-ablation-20260502.md` for the detailed mechanism explanation and harness ablation.

View File

@@ -0,0 +1,150 @@
# qwen235b Decode Harness One-Shot Mechanism and Ablation, 2026-05-02
## Question
The harness run reached its best observed qwen235b decode-only config at iter 2:
`TP4/DP2/EP8 -> TP2/DP4/EP8`
This document explains why that happened, what information the harness added to the LLM prompt, what the LLM did with that information, and what the non-harness ablation shows.
## Short Answer
The iter-2 result is not magic and should not be described as a global optimum proof. It is a local topology sweet spot for this decode-only workload:
- Baseline `TP4/DP2/EP8` has only 2 data-parallel replicas and pays tensor-parallel communication on every decode step.
- `TP2/DP4/EP8` halves tensor-parallel width and doubles independent decode replicas while preserving the known-good EP8 MoE sharding.
- `TP1/DP8/EP8` goes too far: it was tested next and produced no feasible point.
- Same-topology `TP2/DP4/EP8 + max-num-seqs=160` was also tested later and produced no feasible point.
So the harness run's iter-2 config is the best observed because it hit the nearby topology balance point early, and the follow-up validation probes did not falsify it.
## Mechanism
The workload is `decode_only` with `TPOT <= 40ms` and no TTFT objective. In this regime, the critical cost is steady-state token generation rather than prompt prefill latency.
For a large MoE decode stack:
- Higher TP can reduce per-GPU model shard size, but it also adds tensor-parallel collectives to every decode step.
- Higher DP gives more independent serving replicas and absorbs bursty arrivals better, but each replica has less tensor parallelism.
- EP should not be changed without EP-specific evidence, because MoE expert sharding affects launch safety, memory layout, and expert dispatch.
The baseline shape `TP4/DP2/EP8` is therefore not obviously optimal for decode. The adjacent legal move `TP2/DP4/EP8` is the natural first test: reduce repeated per-token TP communication and increase replica count while keeping all 8 GPUs and EP8 fixed.
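For concreteness, a minimal sketch of the legal-adjacent-topology constraint on an 8-GPU node with EP held fixed (illustrative only, not the harness's candidate generator):
```python
# Illustrative sketch: legal TP/DP candidates on an 8-GPU node with EP fixed.
# Not the AITuner implementation, just the constraint described above.

GPUS = 8

def legal_topologies(ep: int) -> list[tuple[int, int, int]]:
    """All (TP, DP, EP) shapes that keep TP * DP == GPUS and EP unchanged."""
    return [(tp, GPUS // tp, ep) for tp in (1, 2, 4, 8)]

def adjacent_moves(tp: int, dp: int, ep: int) -> list[tuple[int, int, int]]:
    """Topologies one TP halving/doubling step away from the current shape."""
    return [(t, d, e) for (t, d, e) in legal_topologies(ep)
            if t in (tp // 2, tp * 2) and t >= 1]

# From the baseline TP4/DP2/EP8, the adjacent legal moves are
# TP2/DP4/EP8 (fewer per-token collectives, more replicas) and TP8/DP1/EP8.
print(adjacent_moves(4, 2, 8))
```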
The measured data supports this:
| Run | Config | Best sampling_u | Request/s | Pass rate |
| --- | --- | ---: | ---: | ---: |
| harness trial-0001 | `TP4/DP2/EP8` | 0.0058594 | 0.1267 | 0.9868 |
| harness trial-0002 | `TP2/DP4/EP8` | 0.0170288 | 0.3767 | 0.9779 |
| harness trial-0003 | `TP1/DP8/EP8` | none | infeasible | none |
| harness trial-0005 | `TP2/DP4/EP8 + max-num-seqs=160` | none | infeasible | none |
## Harness Information Added
The harness prompt added structured context that the non-harness prompt did not have:
| Harness field | Concrete value in this run | How it affected the proposal |
| --- | --- | --- |
| workload mode | `request_mode=decode_only`; TTFT not an objective | Avoid prefill-first reasoning; optimize TPOT and decode throughput. |
| active bottleneck | `decode_tpot` | Make TP/DP redistribution and decode batching relevant, not TTFT knobs. |
| L-C-A profile | prompt p50 1491, p95 19670, p99 29961; prefix reuse about 0.41; burst ratio about 1.40 | Treat the workload as long-tail, moderately cache-reused, moderately bursty decode. |
| current best | baseline request/s/GPU 0.0158, pass rate 0.9868 | Require proposals to improve per-GPU throughput under SLO. |
| legal topology candidates | TP/DP products constrained to 8 GPUs; candidate includes `TP2/DP4/EP8` and `TP1/DP8/EP8` | Restrict search to launch-plausible adjacent topologies. |
| knob harness rules | topology-first for `decode_tpot`; keep EP fixed without EP-specific evidence | Pick `TP2/DP4/EP8`, not EP changes or runtime-only knobs first. |
| tested signatures | only baseline tested at iter 2 | Avoid repeating baseline; choose first adjacent topology. |
The relevant LLM response for harness `proposal-0002` followed this structure:
```json
{
"diagnosis": "Follow the topology-first harness for decode_tpot. Because the incumbent already satisfies the TPOT SLO, the next justified adjacent probe is to trade some tensor parallelism for more data-parallel replicas, while keeping expert parallel fixed to avoid introducing an EP-specific variable without evidence. The adjacent legal move from TP4/DP2 is TP2/DP4 with EP8 preserved.",
"config_patch": {
"flag_patch": {
"tensor-parallel-size": 2,
"data-parallel-size": 4,
"expert-parallel-size": 8
}
}
}
```
The important behavior is not just "choose TP2/DP4"; it is "choose the adjacent topology, keep EP fixed, judge by request_rate_per_gpu and TPOT SLO, then validate nearby alternatives."
## Non-Harness Ablation
The before-harness run is `dash0-qwen235b-decode-thinking-run5-tpot40-topology`.
It is a useful ablation because it used the same trace, same model family, same baseline topology, same TPOT SLO, and same 12-trial budget, but did not include the structured harness context.
| Iter | Non-harness proposal | Result |
| ---: | --- | --- |
| 1 | baseline `TP4/DP2/EP8` | 0.1267 request/s |
| 2 | `TP2/DP4` | 0.2450 request/s |
| 3 | `TP1/DP8/EP8` | infeasible |
| 4 | `TP2/DP4/EP4` | launch fail |
| 5 | `gpu-memory-utilization=0.8`, `max-num-seqs=256` | infeasible |
| 6 | `max-num-seqs=128` | infeasible |
| 7 | `block-size=128` | infeasible |
| 8 | `max-num-batched-tokens=384` | infeasible |
| 9 | `TP2/DP4/EP8 + max-num-seqs=128 + max-num-batched-tokens=256` | 0.2817 request/s |
| 10 | trial 9 + `block-size=128` | infeasible |
| 11 | `TP1/DP8/EP8 + max-num-seqs=128 + max-num-batched-tokens=256` | infeasible |
| 12 | `TP2/DP4/EP8 + max-num-seqs=96 + max-num-batched-tokens=192` | infeasible |
The non-harness LLM also found `TP2/DP4` at iter 2, so we should not claim that the harness uniquely discovered the direction. The difference is that the non-harness prompt left the model to reason from raw history and raw topology candidates. After iter 2 it spent trials on EP changes, memory/concurrency changes, block size, and batch-token variants before finding its best observed point at iter 9.
## Full Harness Ablation
| Run | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Best observed |
| --- | ---: | ---: | --- | --- | --- | ---: |
| non-harness | 0.1267 | 0.2450 | infeasible | launch fail | infeasible | 0.2817 at iter 9 |
| harness | 0.1267 | 0.3767 | infeasible | infeasible | infeasible | 0.3767 at iter 2 |
The harness accelerated convergence in two ways:
- It made the first post-baseline trial a structured adjacent topology test with EP fixed.
- It converted later iterations into validation of local alternatives rather than broad, weakly justified search.
The measured validation points after iter 2:
| Trial | Config | Outcome |
| --- | --- | --- |
| trial-0003 | `TP1/DP8/EP8` | no feasible point |
| trial-0004 | intended `max-num-seqs=160`, but actually base topology due to the old base-relative patch issue | no feasible point, not valid incumbent validation |
| trial-0005 | `TP2/DP4/EP8 + max-num-seqs=160` | no feasible point |
For `trial-0005`, every probe above the incumbent floor failed:
| Probe sampling_u | Request/s | Pass rate | Feasible |
| ---: | ---: | ---: | --- |
| 0.0710144 | 1.7800 | 0.2818 | no |
| 0.0440216 | 1.0900 | 0.1789 | no |
| 0.0305252 | 0.7050 | 0.3002 | no |
| 0.0237770 | 0.5417 | 0.4092 | no |
| 0.0204029 | 0.4533 | 0.4890 | no |
| 0.0187159 | 0.4117 | 0.5466 | no |
This is the evidence that iter 2 was not just a premature stop. The harness continued probing nearby alternatives, and those alternatives did not beat the incumbent.
## Implementation Update
Long decode-only validation exposed a cost issue: once a probe became SLO-unrecoverable, the worker still waited for in-flight long-output requests unless the study explicitly enabled engine restart after early stop.
The implementation now makes this the default for decode-only studies:
- `trace.request_mode=decode_only` and no explicit `restart_engine_after_early_stop` means `restart_engine_after_early_stop=true`.
- An explicit `restart_engine_after_early_stop=false` is still honored.
- Chat/prefill studies keep the old default `false`.
- The LLM prompt now includes `early_stop_max_lag_s`, `early_stop_max_elapsed_s`, and `restart_engine_after_early_stop` in the trace block.
- qwen235b decode example specs now explicitly set `restart_engine_after_early_stop=true`.
This change does not alter the SLO decision for a probe. It changes the cost model after an already-unrecoverable probe: cancel in-flight requests, restart the engine cleanly, and move to the next probe instead of waiting for long decode tails.
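A minimal sketch of the default resolution, assuming a flat `trace` dict; the parsing itself is illustrative:
```python
# Sketch of the restart-after-early-stop default described above.
# Field names mirror the doc; the resolution function is an assumption.

def restart_after_early_stop(trace: dict) -> bool:
    """Resolve the effective restart behavior for a study trace block."""
    explicit = trace.get("restart_engine_after_early_stop")
    if explicit is not None:
        return explicit  # an explicit true/false is always honored
    # Decode-only studies default to restarting the engine so an
    # SLO-unrecoverable probe does not wait for long decode tails.
    return trace.get("request_mode") == "decode_only"

print(restart_after_early_stop({"request_mode": "decode_only"}))   # True
print(restart_after_early_stop({"request_mode": "chat"}))          # False
print(restart_after_early_stop({"request_mode": "decode_only",
                                "restart_engine_after_early_stop": False}))  # False
```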
## Interpretation
The correct claim is:
The harness did not prove global optimality in one iteration. It made the first post-baseline proposal land on the correct local topology neighborhood, and follow-up harness validation failed to find a better adjacent topology or tested same-topology runtime refinement. On this workload, that was enough for iter 2 to remain the best observed configuration.
The non-harness ablation shows that the model could guess the same topology direction, but without harness structure it spent the remaining budget exploring less controlled directions and only reached its best observed result at iter 9.

View File

@@ -0,0 +1,58 @@
# qwen235b Thinking Prefill Harness Test
## Setup
- Workload: `qwen3-235b-a22b` thinking trace, prefill-only replay with `min_tokens=max_tokens=1`.
- Window: `thinking_w20260327_1000`.
- SLO: 95% pass rate, stepped TTFT `3s/6s/9s`.
- Metric: best-so-far feasible `request_rate_per_gpu`.
- Before-harness source: actual 12-trial run
`.aituner-prefill/dash0-qwen235b-prefill-thinking-run1-ttft-topology`.
- Harness test source:
`.aituner/harness-qwen235b-prefill-20260427/dash0-qwen235b-prefill-thinking-harness-run1-20260427`.
## Result So Far
The harness run was stopped after establishing the convergence result and observing the next weak proposal. The useful comparison is already visible by iter 2.
| Variant | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| Before harness, actual run1 | 0.2029 | 0.2029 | 0.2029 | 0.2029 | 0.2029 | 0.3575 | 0.3575 | 0.3708 | 0.3708 | 0.3794 | 0.3794 | 0.3794 |
| Harness, actual 2026-04-27 run | 0.1892 | 0.3863 | 0.3863 | 0.3863 | n/a | n/a | n/a | n/a | n/a | n/a | n/a | n/a |
## Trial Details
| Variant | Iter | Config | Result |
| --- | ---: | --- | --- |
| Before harness | 1 | baseline `TP4/DP1/EP-off`, `MBT=8192` | `0.2029 req/s/gpu` |
| Before harness | 2 | `DP=2`, `MBT=4096` | runtime failure |
| Before harness | 3 | `DP=2`, `MBT=8192` | runtime failure |
| Before harness | 4 | `EP=4` | launch failure |
| Before harness | 6 | `TP8/DP1/EP-off`, `MBT=4096` | `0.3575 req/s/gpu` |
| Before harness | 10 | `TP8/DP1/EP-off`, `MBT=3712` | `0.3794 req/s/gpu`, best |
| Harness | 1 | baseline `TP4/DP1/EP-off`, `MBT=8192` | `0.1892 req/s/gpu` |
| Harness | 2 | `TP8/DP1/EP-off`, `MBT=8192` | `0.3863 req/s/gpu`, best so far |
| Harness | 3 | `TP8/DP1/EP=2` | launch failure |
The harness baseline was slightly lower than the original baseline (`0.1892` vs `0.2029 req/s/gpu`), but iter 2 still exceeded the original 12-trial best (`0.3863` vs `0.3794 req/s/gpu`).
## Convergence Judgment
- Before harness reached its best at iter 10.
- Harness reached a better result at iter 2.
- Iterations-to-best improved from `10` to `2`, a `5x` improvement on this run.
- The important behavior change is that the harness skipped the original failed DP2 and EP4 exploration and moved directly from baseline to `TP8/DP1`.
## Follow-Up Optimization
The run also exposed a remaining weakness: after reaching the strong `TP8/DP1` incumbent, the LLM proposed `EP=2`, which failed at launch. To address that, the harness was tightened after this test:
- strong-incumbent stop threshold changed from `3x` to `1.8x` over baseline;
- expert parallel is now explicitly guarded and should not be introduced for TTFT-prefill bottlenecks without direct positive EP evidence.
With the new guard, the intended behavior after this iter-2 result is `should_stop=true` unless a same-topology runtime harness has strong direct evidence.
## Run Status
- The 2026-04-27 harness run was stopped after collecting the iter-2 convergence result and the iter-3 EP failure.
- GPUs were freed after stopping the run.

View File

@@ -0,0 +1,79 @@
# qwen235b-thinking-prefill-7day-compare
qwen3-235b-a22b `thinking` trace, prefill-only replay with `output_length=1`, comparing 3 configs across 7 daily `10:00-10:10` windows by `request_rate_per_gpu`.
## Setup
- Hardware: `dash1`, `8x H20`
- Model: `/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717`
- Engine: internal vLLM, baseline aligned to `~/run_qwen235b.sh`
- Trace set: `thinking_w20260321_1000` to `thinking_w20260327_1000`
- Window duration: `600s` each
- Request mode: `chat`
- Replay override: `min_tokens=max_tokens=1`
- SLO:
- pass target: `95%`
- `TTFT <= 2000ms` for `<=8191` input tokens
- `TTFT <= 4000ms` for `<=32767` input tokens
- `TTFT <= 6000ms` for `>32767` input tokens
- Search:
- each candidate independently binary-searches its own `sampling_u`
- `sampling_u in [0, 0.125]`
- `max_probes = 6`
## Candidates
- `baseline`
- `TP=4, DP=1, EP=off`
- baseline `run_qwen235b.sh` shape
- `tuned_0323`
- tuned on `thinking_w20260323_1000`
- `TP=4, DP=1, EP=off`
- `max-num-batched-tokens=3072`
- `max-num-seqs=32`
- `tuned_0327`
- tuned on `thinking_w20260327_1000`
- `TP=8, DP=1, EP=off`
- `max-num-batched-tokens=6144`
- `max-num-seqs=48`
- `block-size=32`
## Run assets
- Compare root: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-compare/dash1-qwen235b-prefill-thinking-7day-baseline-vs-0323-vs-0327`
- Summary: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-compare/dash1-qwen235b-prefill-thinking-7day-baseline-vs-0323-vs-0327/summary.json`
- Report: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-compare/dash1-qwen235b-prefill-thinking-7day-baseline-vs-0323-vs-0327/report.md`
- Compare spec: `/home/admin/cpfs/wjh/aituner/aituner/configs/examples/dash1_qwen235b_prefill_thinking_7day_compare.json`
## Aggregate result
- Wins by `request_rate_per_gpu`:
- `tuned_0327`: `5 / 7`
- `baseline`: `2 / 7`
- `tuned_0323`: `0 / 7`
- Mean `request_rate_per_gpu`:
- `baseline`: `0.13845`
- `tuned_0323`: `0.12756`
- `tuned_0327`: `0.17232`
- Relative to baseline:
- `tuned_0323`: `0.92x` mean per-GPU throughput
- `tuned_0327`: `1.24x` mean per-GPU throughput
## Per-day result
| Date | baseline req/s/gpu | tuned_0323 req/s/gpu | tuned_0327 req/s/gpu | Winner |
| --- | ---: | ---: | ---: | --- |
| `2026-03-21` | `0.08500` | `0.03917` | `0.14375` | `tuned_0327` |
| `2026-03-22` | `0.10125` | `0.12083` | `0.15313` | `tuned_0327` |
| `2026-03-23` | `0.12792` | `0.12792` | `0.19167` | `tuned_0327` |
| `2026-03-24` | `0.09000` | `0.09583` | `0.11250` | `tuned_0327` |
| `2026-03-25` | `0.13792` | `0.13208` | `0.13146` | `baseline` |
| `2026-03-26` | `0.32000` | `0.25917` | `0.23375` | `baseline` |
| `2026-03-27` | `0.10708` | `0.11792` | `0.24000` | `tuned_0327` |
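The aggregate numbers can be reproduced from the per-day table; a minimal sketch with the values copied from above:
```python
# Recomputing wins and mean per-GPU throughput from the per-day table.

per_day = {  # date -> (baseline, tuned_0323, tuned_0327) req/s/gpu
    "2026-03-21": (0.08500, 0.03917, 0.14375),
    "2026-03-22": (0.10125, 0.12083, 0.15313),
    "2026-03-23": (0.12792, 0.12792, 0.19167),
    "2026-03-24": (0.09000, 0.09583, 0.11250),
    "2026-03-25": (0.13792, 0.13208, 0.13146),
    "2026-03-26": (0.32000, 0.25917, 0.23375),
    "2026-03-27": (0.10708, 0.11792, 0.24000),
}
names = ("baseline", "tuned_0323", "tuned_0327")
wins = {n: 0 for n in names}
for values in per_day.values():
    wins[names[values.index(max(values))]] += 1
means = {n: sum(v[i] for v in per_day.values()) / len(per_day)
         for i, n in enumerate(names)}
print(wins)                                    # tuned_0327: 5, baseline: 2
print({n: round(means[n], 5) for n in names})  # 0.13845 / 0.12756 / 0.17232
print(round(means["tuned_0327"] / means["baseline"], 2))  # ~1.24x
```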
## Key takeaways
- `tuned_0327` is the only candidate with clear cross-day value. It wins `5/7` windows and improves mean per-GPU throughput by about `24%`.
- `tuned_0323` does not generalize. It is slightly more conservative and keeps a high pass rate, but its mean per-GPU throughput is below baseline.
- The `0327` winner is not universal. On `2026-03-25` and especially `2026-03-26`, the 4-GPU baseline is more efficient per GPU than the `TP8` tuned shape.
- The practical reading is that prefill-only tuning has workload-regime sensitivity. `TP8 + 6144 + 48 + block-size=32` is a strong default candidate, but not a global static optimum across all days.

View File

@@ -0,0 +1,98 @@
# qwen235b-thinking-prefill-ttft-1s-2s-0-32k
qwen3-235b-a22b `thinking` trace, prefill-only replay with `output_length=1`, internal vLLM (`/usr/local/bin/vllm`), tuned on the `0~32k` input bucket under a stricter stepped TTFT SLO.
## Setup
- Hardware: `dash1`, `8x H20`
- Model: `/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717`
- Engine: internal vLLM, baseline aligned to `~/run_qwen235b.sh`
- Baseline topology: `TP=4, DP=1, EP=1`
- Trace: `thinking_w20260327_1000`
- Trace source: `trace_windows/traces/thinking_w20260327_1000.jsonl`
- Window duration: `600s` (`10:00-10:10`, `2026-03-27`)
- Request mode: `chat`
- Replay override: `min_tokens=max_tokens=1`
- Input bucket: `0 <= input_length <= 32768`
- SLO:
- pass target: `95%`
- `TTFT <= 1000ms` for `<=8191` input tokens
- `TTFT <= 2000ms` for `<=32767` input tokens
- `TTFT <= 2000ms` fallback bucket
- Search:
- `sampling_u in [0, 0.125]`
- `max_probes = 6`
- `12` trials total
- Proposal model: `codex / gpt-5.4`
## Run assets
- Study root: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-prefill/dash1-qwen235b-prefill-thinking-run5-ttft-1s-2s-0-32k-topology`
- State: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-prefill/dash1-qwen235b-prefill-thinking-run5-ttft-1s-2s-0-32k-topology/state.json`
- Log: `/home/admin/cpfs/wjh/aituner/aituner/logs/q235b_prefill_1s2s_0_32k.log`
- Spec: `/home/admin/cpfs/wjh/aituner/aituner/configs/examples/dash1_qwen235b_prefill_thinking_run5_ttft_1s_2s_0_32k.json`
## Best result
- Best trial: `trial-0011`
- Best config:
- `tensor-parallel-size=8`
- `data-parallel-size=1`
- `enable-expert-parallel=false`
- `max-num-batched-tokens=4096`
- `max-num-seqs=16`
- `VLLM_ENABLE_TORCH_COMPILE=0`
- Best `sampling_u`: `0.073767628521`
- Best request rate: `1.8516666666666666 req/s`
- Best request rate per GPU: `0.23145833333333332 req/s/gpu`
- Best pass rate: `0.9558955895589559`
Compared with baseline:
- `trial-0001`: `0.47 req/s`, `0.1175 req/s/gpu`
- `trial-0011`: `1.8516666666666666 req/s`, `0.23145833333333332 req/s/gpu`
- Raw throughput gain: `3.94x`
- Per-GPU throughput gain: `1.97x`
Best-point latency:
- baseline `trial-0001` TTFT mean/p50/p90/p95/p99 = `236.68 / 75.19 / 294.39 / 1378.79 / 3118.86 ms`
- best `trial-0011` TTFT mean/p50/p90/p95/p99 = `223.70 / 65.67 / 261.69 / 1065.31 / 3648.34 ms`
## 12-trial summary
| Trial | Proposed config delta | Result |
| --- | --- | --- |
| `trial-0001` | baseline `TP4/DP1/EP-off`, compile on | `0.4700 req/s`, feasible |
| `trial-0002` | `TP4/DP2`, `EP-off` | probe-search failure |
| `trial-0003` | `TP4/DP1/EP4`, `max-num-batched-tokens=4096` | launch fail |
| `trial-0004` | `VLLM_ENABLE_TORCH_COMPILE=0`, `max-num-batched-tokens=6144` | infeasible |
| `trial-0005` | compile off, `max-num-batched-tokens=4096` | infeasible |
| `trial-0006` | compile off, `max-num-seqs=32` | infeasible |
| `trial-0007` | compile off, `TP8/DP1/EP-off` | `1.3817 req/s`, feasible |
| `trial-0008` | `trial-0007 + max-num-seqs=32` | `1.5983 req/s`, feasible |
| `trial-0009` | `trial-0008 + max-num-batched-tokens=6144` | `1.8017 req/s`, feasible |
| `trial-0010` | `trial-0008 + max-num-batched-tokens=4096` | `1.8300 req/s`, feasible |
| `trial-0011` | `trial-0010 + max-num-seqs=16` | `1.8517 req/s`, feasible, best |
| `trial-0012` | `trial-0011 + max-num-batched-tokens=3072` | infeasible |
## Key insights
- Under the stricter `1s/2s` TTFT SLO, the main win still came from topology first: `TP4 -> TP8`.
- `TP4/DP2` and `EP4` remain negative evidence in this stack. The former failed in probe search; the latter failed at engine launch.
- Runtime-only tuning inside the 4-GPU topology did not beat baseline at all. The useful search space opened only after moving to `TP8/DP1`.
- After the `TP8` switch, the winning runtime shape became more conservative than the looser prefill studies: `max-num-batched-tokens=4096` and `max-num-seqs=16`.
- This run shows that even under a much tighter TTFT target, the `TP8` shape still improves both raw throughput and per-GPU throughput materially over baseline.
## Recommendation
For `qwen235b thinking prefill-only` on the `0~32k` bucket under the `1s/2s` stepped TTFT SLO, use:
- `tensor-parallel-size=8`
- `data-parallel-size=1`
- `enable-expert-parallel=false`
- `max-num-batched-tokens=4096`
- `max-num-seqs=16`
- `VLLM_ENABLE_TORCH_COMPILE=0`
Keep the rest of the `run_qwen235b.sh` baseline unchanged.

View File

@@ -0,0 +1,102 @@
# qwen235b-thinking-prefill-ttft-tight-0327
qwen3-235b-a22b `thinking` trace, prefill-only replay with `output_length=1`, internal vLLM (`/usr/local/bin/vllm`), tuned on `thinking_w20260327_1000` under tighter stepped TTFT SLO.
## Setup
- Hardware: `dash0`, `8x H20`
- Model: `/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717`
- Engine: internal vLLM, baseline aligned to `~/run_qwen235b.sh`
- Baseline topology: `TP=4, DP=1, EP=1`
- Trace: `thinking_w20260327_1000`
- Trace source: `trace_windows/traces/thinking_w20260327_1000.jsonl`
- Window duration: `600s` (`10:00-10:10`, `2026-03-27`)
- Request mode: `chat`
- Replay override: `min_tokens=max_tokens=1`
- SLO:
- pass target: `95%`
- `TTFT <= 2000ms` for `<=8191` input tokens
- `TTFT <= 4000ms` for `<=32767` input tokens
- `TTFT <= 6000ms` for `>32767` input tokens
- Search:
- `sampling_u in [0, 0.125]`
- `max_probes = 6`
- `12` trials total
- Proposal model: `codex / gpt-5.4`
## Run assets
- Study root: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-prefill/dash0-qwen235b-prefill-thinking-run2-ttft-tight-topology`
- State: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-prefill/dash0-qwen235b-prefill-thinking-run2-ttft-tight-topology/state.json`
- Spec: `/home/admin/cpfs/wjh/aituner/aituner/configs/examples/dash0_qwen235b_prefill_thinking_run2_ttft_tight.json`
## Best result
- Best trial: `trial-0012`
- Best config:
- `tensor-parallel-size=8`
- `data-parallel-size=1`
- `enable-expert-parallel=false`
- `max-num-batched-tokens=6144`
- `max-num-seqs=48`
- `block-size=32`
- Best `sampling_u`: `0.098106384277`
- Best request rate: `2.4966666666666666 req/s`
- Best request rate per GPU: `0.3120833333333333 req/s/gpu`
- Best pass rate: `0.9506008010680908`
Compared with baseline:
- `trial-0001`: `0.4716666666666667 req/s`, `0.11791666666666667 req/s/gpu`
- `trial-0012`: `2.4966666666666666 req/s`, `0.3120833333333333 req/s/gpu`
- Raw throughput gain: `5.29x`
- Per-GPU throughput gain: `2.65x`
Compared with the looser TTFT study on the same `2026-03-27` window:
- looser-SLO best: `3.035 req/s`, `0.379375 req/s/gpu`
- tighter-SLO best: `2.4966666666666666 req/s`, `0.3120833333333333 req/s/gpu`
- throughput retained: `82.26%`
- throughput drop: `17.74%`
Best-point latency:
- `TTFT mean/p50/p90/p95/p99 = 413.92 / 67.86 / 1456.32 / 2286.90 / 5326.23 ms`
## 12-trial summary
| Trial | Proposed config delta | Result |
| --- | --- | --- |
| `trial-0001` | baseline `TP4/DP1/EP-off`, `max-num-batched-tokens=8192` | `0.4717 req/s`, feasible |
| `trial-0002` | `TP4/DP2` | probe-search failure |
| `trial-0003` | `TP8/DP1/EP-off` | `1.9200 req/s`, feasible |
| `trial-0004` | `TP8/DP1/EP8` | launch fail |
| `trial-0005` | `trial-0003 + max-num-batched-tokens=6144` | `2.2517 req/s`, feasible |
| `trial-0006` | `trial-0003 + max-num-batched-tokens=4096` | infeasible |
| `trial-0007` | `trial-0003 + max-num-batched-tokens=5120` | infeasible |
| `trial-0008` | `trial-0003 + max-num-batched-tokens=5632` | infeasible |
| `trial-0009` | `trial-0005 + max-num-seqs=32` | infeasible |
| `trial-0010` | `trial-0005 + max-num-seqs=48` | infeasible |
| `trial-0011` | `trial-0005 + block-size=32` | infeasible |
| `trial-0012` | `trial-0005 + max-num-seqs=48, block-size=32` | `2.4967 req/s`, feasible, best |
## Key insights
- This tuning is also on `2026-03-27`, not a different day. The change is the tighter TTFT step SLO.
- The best topology still moved to `TP8/DP1/no-EP`; tighter TTFT did not change the topology conclusion.
- Tighter TTFT did change the runtime sweet spot. The best runtime shape is not the looser-study `3712` token batch, but `6144 + max-num-seqs=48 + block-size=32`.
- `DP2` and `EP` remained negative evidence in this stack: `TP4/DP2` failed during probing, and `TP8 + EP8` failed at launch.
- Relative to the looser TTFT study on the same day, stricter TTFT costs about `17.7%` throughput, but the tuned result still keeps a large margin over baseline.
## Recommendation
For the tighter stepped TTFT SLO on `thinking_w20260327_1000`, use:
- `tensor-parallel-size=8`
- `data-parallel-size=1`
- `enable-expert-parallel=false`
- `max-num-batched-tokens=6144`
- `max-num-seqs=48`
- `block-size=32`
Keep the rest of the `run_qwen235b.sh` baseline unchanged.

View File

@@ -0,0 +1,114 @@
# qwen27b-chat-0-8k-7day-compare
qwen3.5-27b `chat` trace, `0~8k` input bucket, tuned-best vs baseline cross-day compare on internal vLLM (`/usr/local/bin/vllm`), compared by `request_rate_per_gpu`.
## Setup
- Hardware: `dash1`, `8x H20`
- Model: `/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal`
- Engine: internal vLLM
- Baseline: empty patch over the study spec baseline, aligned to `~/run_qwen27b.sh` `TP=1, DP=1`
- Tuned best source: `trial-0004` from `dash0-qwen27b-tight-slo-10min-run9-chat-0-8k-codex-topology`
- Tuned best config:
- `tensor-parallel-size=2`
- `data-parallel-size=1`
- Trace family: `chat`
- Input bucket: `0 <= input_length <= 8192`
- Time range scanned: `2026-03-11` to `2026-03-17`
- Available windows in this slot: `7`
- `chat_w20260311_1000`
- `chat_w20260312_1000`
- `chat_w20260313_1000`
- `chat_w20260314_1000`
- `chat_w20260315_1000`
- `chat_w20260316_1000`
- `chat_w20260317_1000`
- Window duration: `600s` (`10:00-10:10`)
- Request mode: `chat`
- SLO:
- pass target: `95%`
- `TTFT <= 2000ms` for `<=4096` input tokens
- `TTFT <= 4000ms` for `<=32768` input tokens
- `TTFT <= 6000ms` for `>32768` input tokens
- `TPOT <= 50ms`
- Search:
- binary search on `sampling_u`
- `max_probes = 6`
- Proposal model for tuned source: `codex / gpt-5.4`
## Run assets
- Compare root: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-compare/dash1-qwen27b-chat-0-8k-7days-compare`
- Summary: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-compare/dash1-qwen27b-chat-0-8k-7days-compare/summary.json`
- Report: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-compare/dash1-qwen27b-chat-0-8k-7days-compare/report.md`
- Compare spec: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-compare/specs/qwen27b_chat_0_8k_compare_dash1.json`
- Tuned study root: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-tight/dash0-qwen27b-tight-slo-10min-run9-chat-0-8k-codex-topology`
## Tuned-source result
- Best trial: `trial-0004`
- Best config:
- `tensor-parallel-size=2`
- `data-parallel-size=1`
- Best `sampling_u`: `0.013061523438`
- Best request rate: `0.405 req/s`
- Best request rate per GPU: `0.2025 req/s/gpu`
- Best pass rate: `0.9629629629629629`
Compared with the single-day baseline on `chat_w20260311_1000`:
- `trial-0001`: `0.035 req/s`, `0.035 req/s/gpu`
- `trial-0004`: `0.405 req/s`, `0.2025 req/s/gpu`
- Raw throughput gain: `11.57x`
- Per-GPU throughput gain: `5.79x`
## 12-trial summary
| Trial | Proposed config delta | Result |
| --- | --- | --- |
| `trial-0001` | baseline `TP1/DP1` | `0.0350 req/s`, `0.0350 req/s/gpu`, feasible |
| `trial-0002` | `DP=2` | `0.1233 req/s`, `0.0617 req/s/gpu`, feasible |
| `trial-0003` | `DP=4` | `0.1567 req/s`, `0.0392 req/s/gpu`, feasible |
| `trial-0004` | `TP=2, DP=1` | `0.4050 req/s`, `0.2025 req/s/gpu`, feasible, best |
| `trial-0005` | `trial-0004 + max-num-batched-tokens=16384` | infeasible |
| `trial-0006` | `trial-0004 + max-num-seqs=24` | infeasible |
| `trial-0007` | `trial-0004 + max-num-batched-tokens=12288` | infeasible |
| `trial-0008` | `trial-0004 + block-size=32` | infeasible |
| `trial-0009` | `trial-0004 + gpu-memory-utilization=0.93` | infeasible |
| `trial-0010` | `trial-0004 + max-num-seqs=16, max-num-batched-tokens=6144` | infeasible |
| `trial-0011` | `trial-0004 + enable-prefix-caching=false` | infeasible |
| `trial-0012` | `trial-0004 + block-size=128` | infeasible |
## Aggregate result
- Comparable wins: tuned `5`, baseline `0`
- Incomparable windows: `2`
- Baseline mean request rate: `0.046 req/s`
- Tuned mean request rate: `0.4723809523809524 req/s`
- Baseline mean request rate per GPU: `0.046 req/s/gpu`
- Tuned mean request rate per GPU: `0.2361904761904762 req/s/gpu`
## Per-window result
| Window | Date | Baseline req/s/gpu | Tuned req/s/gpu | Winner |
| --- | --- | ---: | ---: | --- |
| `chat_w20260311_1000` | `2026-03-11` | `0.035` | `0.21416666666666667` | `tuned` |
| `chat_w20260312_1000` | `2026-03-12` | `None` | `0.28` | `incomparable` |
| `chat_w20260313_1000` | `2026-03-13` | `0.03166666666666667` | `0.265` | `tuned` |
| `chat_w20260314_1000` | `2026-03-14` | `0.021666666666666667` | `0.24083333333333334` | `tuned` |
| `chat_w20260315_1000` | `2026-03-15` | `0.12166666666666667` | `0.23083333333333333` | `tuned` |
| `chat_w20260316_1000` | `2026-03-16` | `0.02` | `0.2275` | `tuned` |
| `chat_w20260317_1000` | `2026-03-17` | `None` | `0.195` | `incomparable` |
## Key insights
- The tuned-source tuning itself was simple and topology-driven. The winning patch is only `TP1 -> TP2`; later runtime-only tweaks all failed to beat it.
- This compare does not support the conclusion that the tuned config lacks generalization. Across the full 7-day slice, tuned wins every directly comparable window.
- The two `incomparable` days are not execution failures. Baseline completed probing but never found a single feasible `sampling_u` under the target SLO, while tuned still found feasible operating points.
- The tuned `TP=2, DP=1` shape is materially more robust than the `TP=1, DP=1` baseline for this `0~8k` chat bucket.
- The weekend windows do not break the result. `2026-03-14` is another clear tuned win, and even on `2026-03-15`, where baseline is relatively stronger than other days, tuned still wins by about `1.90x` on `req/s/gpu`.
- The throughput gap remains large even after normalizing by GPU count, so this is not just a raw-card-count artifact.
## Recommendation
For `qwen27b chat 0~8k`, keep using the tuned `TP=2, DP=1` serving shape as the default candidate over the `TP=1, DP=1` baseline, and treat cross-day robustness as confirmed on the full 7-day window set.

View File

@@ -0,0 +1,173 @@
# qwen27b-chat-0-8k Current-Config Fig18 Plan
## Question
The earlier tables used best-so-far throughput. That is useful for deciding the
best deployable incumbent, but it hides bad proposals because the curve is
monotonic by construction. To judge whether the harness makes tuning more
directional, the primary curve must be each iteration's measured current config
performance.
## Why Final Performance Can Be Close
Harness and no-harness can converge to similar final throughput when the search
space contains one dominant simple family. In this setup the dominant family is
`TP=2, DP=1` over the `run_qwen27b.sh` baseline. The no-harness LLM can still
eventually discover that family within 12 iterations, so final best performance
can be close.
The difference the harness is expected to improve is not necessarily the final
12-iter maximum. It should improve:
- iterations-to-first-good-config;
- number of worse or infeasible proposals after an incumbent is found;
- measured-current config oscillation;
- early-stop behavior once adjacent harness probes no longer justify more GPU
trials.
## Metrics
- `measured-current`: each trial's own feasible `request_rate_per_gpu`.
Failed or no-feasible-point trials are recorded as `NA`.
- `accepted-incumbent`: best deployable value after each trial. This is the
standard best-so-far curve and is monotonic by definition.
- `iters-to-best`: first iteration where the final best value or equivalent
config family appears.
- `wasted-trials-after-best`: trials after first best that are worse, infeasible,
or no-feasible-point.
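A minimal sketch of how these curves and counters can be derived from per-iteration `request_rate_per_gpu` values (illustrative bookkeeping only; it keys on values, not on config-family equivalence):
```python
# Illustrative computation of the metrics above from per-iteration values;
# None marks failed or no-feasible-point trials.

def curves(measured: list[float | None]) -> list[float | None]:
    """Best deployable value after each trial (accepted-incumbent curve)."""
    accepted, best = [], None
    for value in measured:
        if value is not None and (best is None or value > best):
            best = value
        accepted.append(best)
    return accepted

run9 = [0.0350, 0.0617, 0.0392, 0.2025] + [None] * 8
accepted = curves(run9)
# iters-to-best by value only; config-family equivalence is not modeled here.
iters_to_best = accepted.index(max(v for v in run9 if v is not None)) + 1
wasted_after_best = sum(1 for v in run9[iters_to_best:]
                        if v is None or v < accepted[-1])
print(accepted[-1], iters_to_best, wasted_after_best)  # 0.2025, 4, 8
```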
## Historical Run9 Re-Read
Source:
`.aituner-tight/dash0-qwen27b-tight-slo-10min-run9-chat-0-8k-codex-topology`
on dash0.
| Variant | Curve | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| no-harness run9 | measured-current | 0.0350 | 0.0617 | 0.0392 | 0.2025 | NA | NA | NA | NA | NA | NA | NA | NA |
| no-harness run9 | accepted-incumbent | 0.0350 | 0.0617 | 0.0617 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 |
Interpretation: the no-harness current-config curve already has a regression at
iter 3 and then many no-feasible-point runtime probes. The monotonic curve only
shows the incumbent policy, not proposal quality.
## New Paired Test Plan
Run on dash0 with internal vLLM and the real `chat_w20260311_1000` 0-8k replay:
- Base spec: `configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json`.
- Model path:
`/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal`.
- Naming note: local configs and dash0 model directories expose this setup as
Qwen3.5-27B/Qwen35-27B, not `qwen32b`.
- Engine: `/usr/local/bin/vllm`, baseline aligned with `~/run_qwen27b.sh`.
- SLO: 95% pass, stepped TTFT `2s/4s/6s`, TPOT `<=50ms`.
- Search: `low=0`, `high=0.0625`, `max_probes=6`, `tolerance=0.001`.
- no-harness study:
`.aituner-tight/dash0-qwen27b-tight-slo-10min-run10-chat-0-8k-current-noharness`.
- harness study:
`.aituner-tight/dash0-qwen27b-tight-slo-10min-run10-chat-0-8k-current-harness`.
The result table will report both curves. The harness is considered successful
only if it reaches the same or better incumbent in fewer iterations and reduces
the measured-current regressions or replaces them with an explicit harness stop.
## Run Status
- 2026-05-06 07:05 CST: dash0 checked, 8 H20 GPUs idle.
- 2026-05-06 07:05 CST: generated paired specs under
`.aituner-tight/specs/`.
- 2026-05-06 07:05 CST: started no-harness full 12-iter run in tmux session
`qwen27b_run10_noharness_20260506`.
- 2026-05-06 07:18 CST: stopped the duplicate fresh no-harness run before
completion. Reason: run9 is already a completed real 12-iter no-harness run
for the same internal vLLM 0-8k setup, while the fresh full-chat run would
spend a multi-hour dash0 slot duplicating that curve.
- 2026-05-06 07:20 CST: seeded the harness study with the real run9 baseline
measurement as `trial-0001`, then started the harness run with
`--skip-baseline` in tmux session `qwen27b_run10_harness_skipbase_20260506`.
- 2026-05-06 07:20 CST: harness generated deterministic `trial-0002`:
`{"tensor-parallel-size": 2}`.
- 2026-05-06 08:11 CST: harness `trial-0002` completed:
`TP=2`, `0.2142 request_rate_per_gpu`.
- 2026-05-06 08:19 CST: harness `trial-0003` failed at engine launch.
Root cause: the old runtime refinement coupled `gpu-memory-utilization=0.95`
with larger `max-num-batched-tokens`, causing speculative sampler warmup OOM.
This is a generic harness safety bug; fixed locally by removing the automatic
memory-utilization bump from runtime refinement.
- 2026-05-06 09:24 CST: harness `trial-0004` completed:
`TP=4`, `0.4429 request_rate_per_gpu`. All six probes were feasible up to
`sampling_u=0.0615234375`, so this study is near the configured
`search.high=0.0625` ceiling.
- 2026-05-06 09:25 CST: old harness repeated the same unsafe runtime refinement
for TP4 and `trial-0005` failed at engine launch for the same OOM reason. The
old process was stopped before continuing.
- 2026-05-06 09:37 CST: pulled commit `5d96689` on dash0 and resumed. The
runtime-refinement OOM was fixed, but the stop guard was still too strict: it
did not treat a feasible high-edge probe with a small number of SLO failures
as saturation, even though the probe already met the 95% pass-rate target.
- 2026-05-06 09:50 CST: stopped the unnecessary product-8 validation. The queued
`trial-0006`/`trial-0007` are not used for convergence claims.
- 2026-05-06 09:56 CST: pulled commit `f653af0` on dash0. The fixed high-edge
stop guard produced `harness-stop-0008` without launching another GPU trial.
## Current Results
Unit: feasible `request_rate_per_gpu`. `NA` means the current trial did not
produce a feasible deployable config.
| Variant | Curve | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| no-harness run9 | measured-current | 0.0350 | 0.0617 | 0.0392 | 0.2025 | NA | NA | NA | NA | NA | NA | NA | NA |
| no-harness run9 | accepted-incumbent | 0.0350 | 0.0617 | 0.0617 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 |
| harness run10 | measured-current | 0.0350 | 0.2142 | NA | 0.4429 | NA | skipped | skipped | stop | | | | |
| harness run10 | accepted-incumbent | 0.0350 | 0.2142 | 0.2142 | 0.4429 | 0.4429 | 0.4429 | 0.4429 | 0.4429 stop | | | | |
The harness result is stronger than the earlier strict replay. It did not merely
reach the same TP2 region earlier; it then used the bottleneck/topology evidence
to validate TP4 and found a much higher current config.
## Interpretation
- Why both variants can look close when only best-so-far is shown: no-harness can
eventually find a good simple topology, and best-so-far hides every bad
proposal after that point.
- What the current-config curve shows: no-harness regresses at iter 3 and then
spends many iterations on no-feasible-point runtime probes. Harness reaches a
stronger TP2 config at iter 2 and a stronger TP4 config at iter 4.
- Why harness helped: the baseline diagnostics identify TTFT/prefill as the
active bottleneck on low-prefix-reuse long prompts. The harness maps that to
adjacent TP validation before DP/runtime exploration. The no-harness LLM chose
DP2 then DP4 first, which diluted per-GPU throughput and delayed TP.
- Defect fixed during the run: runtime refinement was too aggressive because it
combined larger MBT with higher memory utilization. It now changes batching
headroom without also raising memory pressure.
- Stop defect fixed during the run: high-edge probes can have a few individual
latency failures and still be feasible under the configured pass-rate SLO. The
stop guard now keys on `feasible=true` near `search.high`, not on an empty
failed-reason map.
- Search-high implication: TP4 reached `sampling_u=0.0615234375` with
`search.high=0.0625`, so the current spec is saturated for this topology. A
higher `search.high` would be required to distinguish whether TP4 can go even
higher in absolute throughput; it is not needed to show that harness converged
faster than no-harness under this spec.
## Mechanism
The harness contributes structured, non-testcase-specific information:
- Workload features: long-prompt 0-8k distribution, low prefix reuse, and smooth
arrivals.
- Bottleneck diagnosis from probes: baseline failures are TTFT/prefill-heavy, so
topology changes that reduce long-prefill latency should be tried before DP or
runtime batching.
- Topology adjacency: validate TP1 -> TP2 -> TP4 rather than jumping randomly or
repeating a failing runtime family.
- Stop condition: once the incumbent's feasible probe is within one binary-search
resolution of `search.high`, stop instead of spending more GPU trials.
Without the harness, the LLM response in run9 chose DP2 and DP4 before TP2. That
temporarily improved total request rate but reduced per-GPU efficiency, so the
measured-current curve dipped at iter 3 and reached the old best only at iter 4.
With the harness, the LLM receives the bottleneck/topology frame and chooses
TP-oriented validation; TP2 is reached at iter 2 and TP4 at iter 4.

View File

@@ -0,0 +1,58 @@
# qwen27b-chat-0-8k Harness Fig18
## Setup
- Workload: `qwen3.5-27b` chat, `0 <= input_length <= 8192`.
- Window: `chat_w20260311_1000`.
- Engine: dash0 internal vLLM, baseline aligned to `run_qwen27b.sh`.
- SLO: 95% pass rate, stepped TTFT `2s/4s/6s`, TPOT `<=50ms`.
- Search metric: best-so-far feasible `request_rate_per_gpu`.
- Before-harness source: actual 12-trial run
`.aituner-tight/dash0-qwen27b-tight-slo-10min-run9-chat-0-8k-codex-topology`.
- After-harness source: strict harness replay over already measured run9 configs:
- Iter 1 uses the measured baseline trial.
- Iter 2 uses the current harness proposal after seeing only iter 1 history. It proposes `TP=2, DP=1`, whose performance is the measured run9 `trial-0004` result for the same config and spec.
- Iter 3 uses the current harness proposal after seeing only baseline + `TP=2, DP=1`. With the strong-incumbent guard, it returns `should_stop=true`.
The replay is intentionally strict: the LLM prompt does not receive future `best_by_parallel_size` entries or later failed trials.
## Fig18-Style Best-So-Far Curve
Unit: feasible `request_rate_per_gpu`. Infeasible trials leave the best-so-far value unchanged.
| Variant | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| Before harness, actual run9 | 0.0350 | 0.0617 | 0.0617 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 |
| After harness, strict replay | 0.0350 | 0.2025 | 0.2025 stop | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 |
## Trial-Level Interpretation
| Variant | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5-12 |
| --- | --- | --- | --- | --- | --- |
| Before harness | baseline `TP1/DP1`, 0.0350 | `DP=2`, 0.0617 | `DP=4`, 0.0392, worse per GPU | `TP=2, DP=1`, 0.2025, best | runtime-only probes, all infeasible |
| After harness | baseline `TP1/DP1`, 0.0350 | `TP=2, DP=1`, 0.2025, best | `should_stop=true` | no GPU trial | no GPU trial |
## Convergence Judgment
- Before harness reaches the final best value at iter 4.
- After harness reaches the same best value at iter 2.
- The speedup is `2x` by iterations-to-best: `4 -> 2`.
- The harness also avoids the post-best weak proposals: before harness spent iters 5-12 on infeasible runtime-only probes; after harness stops at iter 3.
## Implementation Changes From This Check
- Added a strong-incumbent convergence guard:
- if the latest trial is the incumbent,
- and it improves `request_rate_per_gpu` by at least `3x` over the baseline,
- then runtime-only probes require direct same-topology evidence; otherwise the LLM should return `should_stop=true`.
- Strengthened the MBT harness guard:
- do not raise `max-num-batched-tokens` when incumbent MBT already covers prompt p99 unless same-topology evidence proves prefill fragmentation.
- Made early-stop engine relaunch opt-in. A real r2 run showed that default relaunch changes warm-state behavior and makes full-chat results incomparable with run9, so the default remains drain-based for comparable production measurements.
- Added LLM empty-response retry to avoid crashing `study tune` on a transient empty streamed response.
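A minimal sketch of the strong-incumbent guard from the first item above, with the `3x` threshold from this check; the helper name and arguments are illustrative, not the actual implementation:
```python
# Sketch of the strong-incumbent convergence guard; illustrative only.

def guard_should_stop(latest_is_incumbent: bool,
                      incumbent_rate_per_gpu: float,
                      baseline_rate_per_gpu: float,
                      has_same_topology_evidence: bool,
                      improvement_threshold: float = 3.0) -> bool:
    """Return True when only a should_stop=true proposal is justified."""
    if not latest_is_incumbent:
        return False
    strong = incumbent_rate_per_gpu >= improvement_threshold * baseline_rate_per_gpu
    # With a strong incumbent, runtime-only probes need direct same-topology
    # evidence; without it the expected proposal is should_stop=true.
    return strong and not has_same_topology_evidence

# Replay situation: baseline 0.0350, incumbent 0.2025 (about 5.8x), no evidence.
print(guard_should_stop(True, 0.2025, 0.0350, False))  # True
```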
## Remote Checks
- Local: `python3 -m compileall -q src tests` passed.
- Local: `PYTHONPATH=src python3 -m unittest tests.test_core_flow` passed, 63 tests.
- dash0: `python3 -m compileall -q src tests` passed.
- dash0: `PYTHONPATH=src python3 -m unittest discover -s tests -p "test_*.py"` passed, 63 tests.

View File

@@ -0,0 +1,169 @@
# qwen27b-chat-0-8k Setup and Result Audit
## Purpose
This note audits the 2026-05-06 qwen27b chat 0-8k harness result because the
new best `0.4429 request_rate_per_gpu` is much higher than the previous
no-harness best `0.2025`.
## Setup
- Host: `dash0`.
- Hardware: 8 NVIDIA H20 GPUs.
- Engine: internal vLLM at `/usr/local/bin/vllm`.
- Model:
`/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal`.
- Served model name: `qwen35-27b-aituner`.
- Workload window: `chat_w20260311_1000`.
- Trace file source: `trace_windows/windows.json`.
- Request mode: `chat`.
- Input bucket: `0 <= input_length <= 8192`.
- Replay scale: `1.0`.
- Max concurrency: `32`.
- Max requests per probe: unset, so full selected trace subset is replayed.
- Search field: `sampling_u`.
- Search range: `low=0.0`, `high=0.0625`.
- Search probes: `max_probes=6`, `tolerance=0.001`.
- Sampling seed: `20260325`.
The local configs and dash0 model directories name this setup Qwen3.5-27B /
Qwen35-27B. I did not find a `qwen32b` model/config for this internal chat
0-8k setup.
## SLO
- Target pass rate: `0.95`.
- TTFT rule: stepped by input length.
| Input tokens | TTFT threshold |
| ---: | ---: |
| `<=4096` | `2000 ms` |
| `<=32768` | `4000 ms` |
| otherwise | `6000 ms` |
- TPOT rule: fixed `<=50 ms`.
A probe is feasible when its pass rate is at least `0.95`. Individual requests
may still fail TTFT/TPOT while the whole probe remains feasible.
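A minimal sketch of this feasibility rule with the thresholds above; the request fields are assumptions, not the worker's actual result schema:
```python
# Sketch of per-request SLO checks aggregated into probe feasibility.

TTFT_STEPS_MS = [(4096, 2000.0), (32768, 4000.0)]  # (max input tokens, TTFT bound)
TTFT_FALLBACK_MS = 6000.0
TPOT_MS = 50.0
TARGET_PASS_RATE = 0.95

def ttft_bound(input_tokens: int) -> float:
    for max_tokens, bound in TTFT_STEPS_MS:
        if input_tokens <= max_tokens:
            return bound
    return TTFT_FALLBACK_MS

def request_passes(req: dict) -> bool:
    return (req["ttft_ms"] <= ttft_bound(req["input_tokens"])
            and req["tpot_ms"] <= TPOT_MS)

def probe_feasible(requests: list[dict]) -> bool:
    passed = sum(request_passes(r) for r in requests)
    return passed / len(requests) >= TARGET_PASS_RATE

# 96 of 100 requests passing keeps the probe feasible (pass rate 0.96),
# even though 4 individual requests failed their TTFT threshold.
reqs = ([{"input_tokens": 1200, "ttft_ms": 300.0, "tpot_ms": 40.0}] * 96
        + [{"input_tokens": 1200, "ttft_ms": 2500.0, "tpot_ms": 40.0}] * 4)
print(probe_feasible(reqs))  # True
```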
## Compared Studies
| Variant | Study root | Notes |
| --- | --- | --- |
| no-harness | `.aituner-tight/dash0-qwen27b-tight-slo-10min-run9-chat-0-8k-codex-topology` | completed 12-trial historical run |
| harness | `.aituner-tight/dash0-qwen27b-tight-slo-10min-run10-chat-0-8k-current-harness` | seeded with run9 baseline, then ran real harness trials |
The harness run reused the real run9 baseline as `trial-0001` to avoid
duplicating a multi-hour cold-start baseline measurement. Later harness trials
were real dash0 runs.
## Metric
The reported metric is `request_rate_per_gpu`:
```text
request_rate_per_gpu = best_feasible_request_rate / parallel_size
parallel_size = tensor_parallel_size * data_parallel_size
```
The result JSON stores `best_request_rate`; `StudyStore.ingest_trial_results`
derives `best_request_rate_per_gpu` from the trial spec topology.
## Result Table
Unit: feasible `request_rate_per_gpu`.
| Variant | Curve | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 |
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| no-harness run9 | measured-current | 0.0350 | 0.0617 | 0.0392 | 0.2025 | NA | NA | NA | NA |
| no-harness run9 | accepted-incumbent | 0.0350 | 0.0617 | 0.0617 | 0.2025 | 0.2025 | 0.2025 | 0.2025 | 0.2025 |
| harness run10 | measured-current | 0.0350 | 0.2142 | NA | 0.4429 | NA | skipped | skipped | stop |
| harness run10 | accepted-incumbent | 0.0350 | 0.2142 | 0.2142 | 0.4429 | 0.4429 | 0.4429 | 0.4429 | 0.4429 stop |
## Why `0.4429` Is Plausible
The new value is not the old TP2 config suddenly doubling. The comparable TP2
results are close:
| Study | Trial | Config | Best request rate | Parallel size | request_rate_per_gpu |
| --- | --- | --- | ---: | ---: | ---: |
| run9 | `trial-0004` | `TP=2, DP=1` | 0.4050 | 2 | 0.2025 |
| run10 | `trial-0002` | `TP=2` | 0.4283 | 2 | 0.2142 |
The large jump comes from a new topology that run9 did not evaluate:
| Study | Trial | Config | Best request rate | Parallel size | request_rate_per_gpu |
| --- | --- | --- | ---: | ---: | ---: |
| run10 | `trial-0004` | `TP=4` | 1.7717 | 4 | 0.4429 |
At the winning TP4 probe:
- `sampling_u=0.0615234375`;
- request count `1063`;
- request rate `1.7717 req/s`;
- pass rate `0.9680`;
- p95 TTFT `1476.9 ms`;
- p95 TPOT `44.4 ms`.
This satisfies the configured SLO and is within one binary-search resolution of
`search.high=0.0625`.
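A minimal sketch of how six feasible bisection probes end up one step below `search.high`; the probe schedule here is an assumption about the search, but it reproduces the reported `sampling_u`:
```python
# Sketch of a feasibility-driven bisection over sampling_u; the loop is an
# assumed probe schedule, the bounds and budget come from this study's spec.

def feasible_binary_search(low, high, max_probes, is_feasible):
    """Return the highest feasible sampling_u found within the probe budget."""
    best = None
    for _ in range(max_probes):
        mid = (low + high) / 2
        if is_feasible(mid):
            best, low = mid, mid   # feasible: push toward higher load
        else:
            high = mid             # infeasible: back off
    return best

# run10 trial-0004 (TP=4): every probe up to the last one was feasible.
best = feasible_binary_search(0.0, 0.0625, 6, lambda u: True)
print(best)           # 0.0615234375, as reported above
print(0.0625 - best)  # 0.0009765625, i.e. one bisection step below search.high
```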
## Correctness Audit
The following fields match between run9 and run10 except for intentionally
different identity fields such as study id and port:
- model path and served model name;
- internal vLLM executable;
- base launch flags other than port;
- trace window `chat_w20260311_1000`;
- input-length filter `0-8192`;
- replay scale `1.0`;
- max concurrency `32`;
- full selected trace replay, no `max_requests_per_probe`;
- SLO target and TTFT/TPOT thresholds;
- search `high=0.0625`, `max_probes=6`, `tolerance=0.001`, seed `20260325`;
- metric definition `best_request_rate / (TP * DP)`.
Checked differences and their impact:
- Port differs: run9 used `18087`, run10 used `18082`; this should not affect
measured throughput.
- run10 has explicit `restart_engine_after_early_stop=false`; chat studies
default to the same behavior.
- run10 has explicit `completion_tokens_override=null`; equivalent to run9's
absent field.
- run9 `trial-0004` search floor was `0.00390625` because it reused the
incumbent for the same parallel-size group. run10 `trial-0004` search floor
was `0.0` because pure `TP=4` had not been tried. Both have the same high and
probe budget; this does not explain the higher result.
No metric-code logic error was found in the audit. The result JSONs store raw
request rate, and the state computes per-GPU throughput by dividing by
`TP*DP`. For run10 TP4, `1.7716666667 / 4 = 0.4429166667`.
## Issues Found During The Test
Two harness bugs were found and fixed:
- Runtime refinement coupled larger `max-num-batched-tokens` with
`gpu-memory-utilization=0.95`, which caused launch-time OOM. Fixed in commit
`5d96689`.
- The search-high stop guard incorrectly required no individual SLO failures at
a feasible high-edge probe. Fixed in commit `f653af0`; feasibility already
means the probe passed the configured pass-rate SLO.
The queued product-8 (`TP*DP=8`) `trial-0006` and `trial-0007` were stopped after
the stop guard fix and are not used in the convergence claim.
## Conclusion
The `0.4429` result is compared under the same workload, SLO, search range, and
metric definition as the previous `0.2025` result. The reason it is
higher is that no-harness run9 did not evaluate pure `TP=4`; the harness guided
the search from the TTFT/prefill bottleneck to adjacent TP validation and found
that topology by iter 4.
Because TP4 nearly saturates the configured `search.high`, a follow-up run with
a higher `search.high` is needed to measure the absolute ceiling. That follow-up
is separate from the current convergence comparison.

View File

@@ -0,0 +1,131 @@
# Qwen27B Chat 0-8k TPOT 40ms Baseline Infeasible Run
Date: 2026-05-07
## Goal
Re-run the internal vLLM + Qwen3.5-27B chat 0-8k tuning comparison after adding a study-level guard:
- if the automatic baseline trial has no feasible probe;
- and the lowest sampled request rate still fails the SLO target pass rate;
- then AITuner stops the whole study and reports that the SLO is too tight for the current setup.
This prevents spending the remaining tuning budget on LLM or harness proposals when the baseline itself demonstrates that the workload/SLO is infeasible at the search floor.
## Implementation
Commit: `f212673 Stop tuning when baseline is infeasible`
Changed behavior:
- `study tune` now persists `tuning_stop_reason` and `tuning_stop_diagnosis` in `state.json`.
- `study tune` also persists `tuning_stop_details`, including the lowest sampled probe's TTFT/TPOT mean, p50, p95, and p99.
- After the automatic baseline trial is ingested, AITuner checks the worker result (a minimal sketch follows this list):
- `status == completed`
- `best_request_rate is None`
- at least one probe exists
- all probes are infeasible
- If true, AITuner stops before asking the LLM or harness for any proposal.
- Re-running the same study respects the persisted stop state and does not resume tuning.
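A minimal sketch of that check, using the result fields named above (the shipped version additionally builds a diagnosis string with the lowest probe's rate, pass rate, and latency percentiles):
```python
def baseline_all_infeasible(result: dict) -> bool:
    """True when the automatic baseline trial shows the SLO cannot be met at any sampled rate."""
    probes = result.get("probes") or []
    return (
        result.get("status") == "completed"
        and result.get("best_request_rate") is None
        and len(probes) > 0
        and not any(probe.get("feasible") for probe in probes)
    )
```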
Validation:
```bash
python3 -m compileall -q src tests
PYTHONPATH=src python3 -m unittest tests.test_core_flow
```
Local and `dash0` both passed.
## Setup
Host: `dash0`
Remote repo: `/home/admin/cpfs/wjh/aituner/aituner`
Base spec: `configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json`
Model: `/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal`
Workload: chat, 0-8k input window
SLO:
- TTFT: existing step rule from the base spec
- TPOT: fixed `40ms`
- target pass rate: `0.95`
Run protocol:
- Direct AITuner command: `python3 -m aituner.cli study tune ... --max-trials 12`
- No manual proposal/state edits during either run.
- Both variants used the same `CUDA_VISIBLE_DEVICES=0,1,2,4,5,6,7` (GPU 3 excluded).
- The two specs were verified equal after normalizing only `study_id` and `llm.use_harness`.
Specs:
- no-harness: `.aituner-tight/specs/dash0-qwen27b-chat-0-8k-tpot40-gpu3skip-12iter-noharness-20260507.json`
- harness: `.aituner-tight/specs/dash0-qwen27b-chat-0-8k-tpot40-gpu3skip-12iter-harness-20260507.json`
## Commands
No harness:
```bash
PYTHONPATH=src python3 -m aituner.cli study tune \
--spec .aituner-tight/specs/dash0-qwen27b-chat-0-8k-tpot40-gpu3skip-12iter-noharness-20260507.json \
--store-root .aituner-tight \
--max-trials 12
```
Harness:
```bash
PYTHONPATH=src python3 -m aituner.cli study tune \
--spec .aituner-tight/specs/dash0-qwen27b-chat-0-8k-tpot40-gpu3skip-12iter-harness-20260507.json \
--store-root .aituner-tight \
--max-trials 12
```
## Results
Both runs stopped after the baseline trial. No LLM or harness proposal was evaluated because the baseline had no feasible probe.
| Variant | Trials executed | Best request rate | Best request rate / GPU | Stop reason |
| --- | ---: | ---: | ---: | --- |
| no-harness | 1 | - | - | `baseline_all_infeasible` |
| harness | 1 | - | - | `baseline_all_infeasible` |
Baseline probe curve:
| sampling_u | request rate | pass rate | feasible | early stop reason |
| ---: | ---: | ---: | --- | --- |
| 0.03125 | 0.895 | 0.000000 | false | `slo_pass_rate_unrecoverable` |
| 0.015625 | 0.483333 | 0.137931 | false | `slo_pass_rate_unrecoverable` |
| 0.0078125 | 0.246667 | 0.236486 | false | `slo_pass_rate_unrecoverable` |
| 0.00390625 | 0.123333 | 0.189189 | false | `slo_pass_rate_unrecoverable` |
| 0.001953125 | 0.065000 | 0.205128 | false | `slo_pass_rate_unrecoverable` |
| 0.0009765625 | 0.035000 | 0.142857 | false | `slo_pass_rate_unrecoverable` |
Latency summary at the lowest sampled request rate:
| Variant | request rate | pass rate | TTFT mean | TTFT p50 | TTFT p95 | TTFT p99 | TPOT mean | TPOT p50 | TPOT p95 | TPOT p99 |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| no-harness | 0.035000 | 0.142857 | 1288.953ms | 446.586ms | 3011.814ms | 3011.814ms | 12.661ms | 13.141ms | 15.097ms | 15.097ms |
| harness | 0.035000 | 0.142857 | 1268.090ms | 445.274ms | 2889.080ms | 2889.080ms | 12.658ms | 13.170ms | 15.102ms | 15.102ms |
This shows that the TPOT threshold of `40ms` is not the binding constraint at the lowest sampled rate. The observed TPOT p99 is about `15.1ms`; failures are driven by TTFT and by the unrecoverable-pass-rate early stop after too many requests have already failed or been skipped.
Final diagnosis written by AITuner:
```text
Baseline configuration has no feasible probe under the current SLO. Stopping tuning because even the lowest sampled request rate did not meet the target pass rate. lowest_sampled_request_rate=0.035 lowest_sampling_u=0.000976562 lowest_probe_pass_rate=0.142857 early_stop_reason=slo_pass_rate_unrecoverable
```
## Interpretation
This run does not measure harness acceleration. It proves that the TPOT 40ms setup is infeasible for the current baseline and search floor: even at `0.035` aggregate request rate, only `14.29%` of requests pass the SLO, far below the `95%` target.
The correct behavior is to stop the study early and report SLO infeasibility instead of spending the remaining 11 trial slots. Harness cannot accelerate convergence when there is no feasible baseline point and no incumbent for guided tuning.
For a Fig. 18-style convergence comparison, the next setup must first have at least one feasible baseline or feasible low-rate point under the same metric definitions.

View File

@@ -0,0 +1,175 @@
# Qwen3-30B-A3B Community vLLM Harness Ablation, 2026-05-02
## Goal
Run a fresh dash0 experiment on the latest community vLLM release with the local community model:
`/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`
The comparison is:
| Variant | Spec | Harness |
| --- | --- | --- |
| no-harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json` | disabled via `llm.use_harness=false` |
| harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json` | enabled, including deterministic stop proposal |
Both specs start from the same base vLLM configuration. The base contains only serving access fields: `host`, `port`, and `served-model-name`. It does not set performance flags such as TP, DP, EP, max model length, prefix cache, chunked prefill, max-num-seqs, max-num-batched-tokens, or gpu-memory-utilization. The first trial therefore measures community vLLM defaults for this model.
The launch environment sets `HOME=/tmp/wjh` and `XDG_CACHE_HOME=/tmp/wjh/.cache` so vLLM, torch.compile, and FlashInfer build caches land on dash0 local storage instead of CPFS. This is a startup/cache placement choice, not a vLLM performance flag.
## vLLM Install
PyPI reported `vllm==0.20.0` as the current community release when checked on 2026-05-02. The dash0 runtime venv lives on local rootfs rather than CPFS because installing torch/CUDA wheels into CPFS was I/O-bound:
`/tmp/wjh/venvs/vllm-0.20.0-cu129`
The first plain `pip install vllm==0.20.0` smoke test pulled `torch 2.11.0+cu130` and failed on dash0's driver (`570.133.20`, CUDA 12.9). The active install uses the vLLM 0.20.0 GitHub release `+cu129` wheel and the PyTorch CUDA 12.9 index, matching vLLM's documented CUDA 12.9 install path for this driver.
Install log:
`/home/admin/cpfs/wjh/aituner/aituner/logs/install_vllm_0.20.0_20260502.log`
## Workload
The experiment reuses the 0-8k chat window that has already been used for qwen27b harness work:
| Field | Value |
| --- | --- |
| window | `chat_w20260311_1000` |
| source rows | 32606 |
| input filter | 0 to 8192 tokens |
| completion tokens | fixed 128 via `trace.completion_tokens_override` |
| max requests per probe | 512 |
| replay time scale | 0.1 |
| target pass rate | 0.95 |
| TTFT SLO | 2s up to 4k, 4s up to 32k, 6s above |
| TPOT SLO | 50ms |
| search high | 0.125 sampling_u |
| max probes per trial | 4 |
The `max_requests_per_probe=512` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and binary-search threshold probe. A trace-only count check gives 31 to 65 selected requests across the six binary-search thresholds, avoiding the invalid low-cap case where early thresholds can select zero requests.
The first full-output attempt showed why a bounded replay is needed for a 12-iteration ablation: at the first threshold (`0.0625`), 31 selected requests contained 14,849 output tokens with `out_max=2981`. That makes one probe too slow to finish a full no-harness/harness pair. The first out128 attempt with `replay_time_scale=1.0` was still bounded by real window time, so each probe waited close to the original window duration. The active ablation therefore fixes output length at 128 tokens, uses `replay_time_scale=0.1`, and limits each trial to four binary-search probes. `load_trace_requests` scales both request arrivals and the window duration, so reported request rates are the actual compressed replay request rates. This changes the load/decode mix, so the result should be interpreted as a community-vLLM harness convergence test under a bounded, time-compressed chat replay, not as a full-output production benchmark.
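A minimal sketch of that scaling behavior, with illustrative field names (the real `load_trace_requests` works on the study's trace spec, not a plain list):
```python
def compress_replay(requests: list[dict], window_seconds: float, scale: float):
    """Scale request arrival offsets and the window duration by the same factor,
    so the reported request rate is the actual compressed replay rate."""
    scaled = [{**req, "arrival_s": req["arrival_s"] * scale} for req in requests]
    scaled_window = window_seconds * scale
    return scaled, scaled_window, len(scaled) / scaled_window

# 65 selected requests in a 600 s window at replay_time_scale=0.1
# are replayed over 60 s, i.e. about 1.08 req/s.
_, _, rate = compress_replay([{"arrival_s": i * 9.0} for i in range(65)], 600.0, 0.1)
print(round(rate, 4))  # 1.0833
```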
## Harness Update Under Test
This run tests a stricter early-stop harness:
- The harness still injects L-C-A workload features, recent trial diagnostics, active bottleneck, legal topology candidates, tested signatures, and knob-family rules.
- A strong incumbent no longer means immediate stop. It means "validate nearby alternatives".
- Deterministic stop is allowed only after completed validation evidence says continuing is unlikely to be useful:
- the incumbent beats baseline by a generic large-gain ratio,
- at least two post-incumbent validation trials have run,
- those validation trials did not produce a feasible per-GPU improvement,
- the validation covered topology and runtime families, or accumulated at least three post-incumbent validation attempts.
- If the stop guard fires, `study tune` writes `harness-stop-XXXX` and exits without spending another GPU trial or asking the LLM for another proposal.
- A single-family all-infeasible plateau is not enough to stop deterministically. It only blocks repeating that family; the LLM must either justify a different family or later satisfy the validation/convergence stop rule.
- A search-high saturation guard (sketched below) stops immediately when the incumbent's highest measured probe is feasible and is within the configured binary-search resolution of `search.high`. A feasible probe may still contain individual SLO failures as long as it meets the configured pass-rate target. In that case the current study cannot measure a better config without increasing the workload search range, so more config proposals only waste tuning iterations.
This is a generic harness rule, not a testcase-specific threshold. It does not depend on qwen27b, qwen235b, qwen30b, a fixed TP/DP value, or a hardcoded SLO number.
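A minimal sketch of the saturation guard, again assuming one interval halving per probe (variable names are illustrative; feasibility here means the probe met the pass-rate target even if individual requests failed their SLO):
```python
def saturates_search_high(best_probe_u: float, best_probe_feasible: bool,
                          search_high: float, max_probes: int) -> bool:
    """Stop when the incumbent's highest feasible probe is within one
    binary-search resolution of the configured search high."""
    resolution = search_high / 2 ** max_probes
    return best_probe_feasible and (search_high - best_probe_u) <= resolution

# This ablation: search.high=0.125, max_probes=4, feasible probe at 0.1171875.
print(saturates_search_high(0.1171875, True, 0.125, 4))  # True
```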
## Unit Tests
Local test command:
```bash
PYTHONPATH=src python3 -m unittest tests.test_core_flow -q
```
Result at the time of this note: passed. The current repository test count may be higher; use the command above as the source of truth.
The added coverage checks:
| Test | Purpose |
| --- | --- |
| `test_harness_does_not_stop_immediately_after_strong_incumbent` | strong incumbent requires validation first |
| `test_harness_stop_after_post_incumbent_validation_is_exhausted` | deterministic stop after validation exhaustion |
| `test_cli_tune_uses_harness_stop_before_llm` | `study tune` can stop without calling the LLM or launching another GPU trial |
| `test_prompt_can_disable_harness_for_ablation` | no-harness prompt removes structured harness context |
| `test_harness_stop_when_incumbent_saturates_search_high` | deterministic stop when the incumbent saturates the configured workload search high |
| `test_harness_guided_first_tp_probe_for_latency_bottleneck` | deterministic first TP probe after baseline latency bottleneck evidence |
| `test_harness_guided_runtime_seed_preserves_tp_incumbent` | deterministic same-topology runtime refinement after a TP incumbent |
## Experiment Tracking
Completed dash0 runs:
| Variant | tmux session | Log | Study root |
| --- | --- | --- | --- |
| no-harness | `qwen30b_vllm020_noharness_out128_scale01_20260502` | `logs/qwen30b_vllm020_noharness_out128_scale01_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-noharness` |
| harness | `qwen30b_vllm020_harness_highstop_gpu4_7_20260502` | `logs/qwen30b_vllm020_harness_highstop_gpu4_7_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-harness-highstop-gpu4-7` |
The harness run should be judged by best-so-far `request_rate_per_gpu` per tuning iteration, plus whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation exposes whether the early-stop harness saves iterations without hiding a later better point.
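The best-so-far rows in the tables below are a running maximum over each iteration's measured outcome; a minimal sketch (failed or infeasible iterations contribute nothing):
```python
from itertools import accumulate

measured = [1.0333, 0.5167, None, None]  # None: launch failure or infeasible trial
best_so_far = list(accumulate((m if m is not None else 0.0 for m in measured), max))
print(best_so_far)  # [1.0333, 1.0333, 1.0333, 1.0333]
```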
## Results
Metric: best-so-far `request_rate_per_gpu` under the bounded, time-compressed replay.
| Variant | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| no-harness | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 |
| harness | 1.0333 | 1.0333 stop | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 | 1.0333 |
Actual per-iteration outcomes:
| Variant | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| no-harness | 1.0333 | 0.5167 | fail | fail | fail | fail | fail | fail | fail | fail | fail | fail |
| harness | 1.0333 | stop | stop | stop | stop | stop | stop | stop | stop | stop | stop | stop |
Interpretation:
- The best config is the default community vLLM config for this bounded replay. It reaches the configured search high: the last baseline probe at `sampling_u=0.1171875` is feasible, has pass rate `1.0`, and has no TTFT/TPOT SLO failures. With `search.high=0.125` and `max_probes=4`, this is exactly one binary-search resolution below the configured high.
- The harness stops at iter 2 without calling the LLM or launching another GPU trial. This is not a claim that the engine is globally optimal; it is a claim that the current study cannot measure an improvement until `search.high` is increased.
- No-harness spends all 12 tuning iterations anyway. Iter 2 changes to DP=2 and halves per-GPU throughput (`0.5167`). Iter 3-12 are launch failures from unguided or weakly guided proposals.
- The harness therefore reaches the best measured config in one executed GPU trial and saves 11 tuning iterations on this setup.
Operational note:
- The no-harness run left driver-side orphan GPU memory on GPU0/1 after repeated launch failures. An earlier pre-high-stop harness attempt left the same kind of residue on GPU2/3. The final harness run was executed on dash0 GPU4-7 via a runtime-derived spec to avoid this contamination. Its executed GPU trial used a single H20, matching the no-harness best trial's single-GPU default configuration.
## High=1.0 Rerun
The `search.high=0.125` run answered only "can this config handle up to about 1.08 req/s in the compressed replay?" It could not answer "which config is best?" because the default config already reached the measurement ceiling.
Trace request counts after raising `search.high` show the difference:
| search.high | Near-top selected requests | Near-top request rate |
| ---: | ---: | ---: |
| 0.125 | 65 | 1.0833 req/s |
| 0.25 | 141 | 2.3500 req/s |
| 0.5 | 269 | 4.4833 req/s |
| 1.0 | 502 | 8.3667 req/s |
The high=1.0 run used the same bounded replay (`completion_tokens_override=128`, `replay_time_scale=0.1`, `max_requests_per_probe=512`) but set `search.high=1.0` and `max_probes=6`.
Completed dash0 high=1.0 runs:
| Variant | tmux session | Study root |
| --- | --- | --- |
| no-harness | `qwen30b_vllm020_noharness_high1_20260506` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-high1-noharness` |
| harness-guided-v2 | `qwen30b_vllm020_harness_high1_guided_v2_20260506` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-high1-harness-guided-v2` |
Metric: best-so-far `request_rate_per_gpu`.
| Variant | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8 | Iter 9 | Iter 10 | Iter 11 | Iter 12 |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| no-harness | 2.2000 | 3.2583 | 3.2583 | 3.2583 | 3.2583 | 3.3000 | 3.3500 | 3.3500 | 3.3500 | 3.3500 | 3.3500 | 3.3500 |
| harness-guided-v2 | 2.3833 | 3.2583 | 3.2833 | 3.3000 | 3.3000 stop | 3.3000 | 3.3000 | 3.3000 | 3.3000 | 3.3000 | 3.3000 | 3.3000 |
Actual per-iteration outcomes:
| Variant | Iter 1 | Iter 2 | Iter 3 | Iter 4 | Iter 5 | Iter 6 | Iter 7 | Iter 8-12 |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |
| no-harness | 2.2000 | 3.2583 | launch fail | infeasible | 1.1042 | 3.3000 | 3.3500 | infeasible |
| harness-guided-v2 | 2.3833 | 3.2583 | 3.2833 | 3.3000 | stop | stop | stop | stop |
Interpretation:
- Raising `search.high` was necessary. The default config was not optimal under the expanded workload range; `TP=2` immediately improved per-GPU throughput from about `2.2` to `3.2583`.
- The updated harness now provides deterministic proposals, not just early stop:
- iter2: adjacent TP probe (`tensor-parallel-size=2`),
- iter3: same-topology runtime seed (`gpu-memory-utilization=0.95`, chunked prefill, `max-num-batched-tokens=16384`),
- iter4: controlled MBT growth to `24576`.
- No-harness reached the same config family at iter7, after an EP launch failure, an infeasible DP probe, a poor TP/DP probe, and then runtime refinement.
- Harness reached the same config family at iter4 and stopped at iter5. Its measured best was `3.3000`, while no-harness measured `3.3500` for the same `TP=2 + MBT=24576` family; the 1.5% gap is within the observed boundary/noise of repeated high-load replay. The convergence claim is therefore "same config family in fewer iterations", not an exact higher single-run number.

View File

@@ -0,0 +1,59 @@
# Repo Audit Repair Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Repair the audit findings that affect measurement integrity, state correctness, documentation accuracy, and open-source readiness.
**Architecture:** Keep changes localized to the existing stdlib-only Python package. Measurement validation lives at the HTTP/worker boundary, study state fixes remain in `StudyStore`, compare reporting gains explicit failed/no-feasible accounting, and project metadata/docs are added at repo root.
**Tech Stack:** Python 3.11+ stdlib, `unittest`, setuptools `pyproject.toml`.
---
### Task 1: Measurement Integrity
**Files:**
- Modify: `src/aituner/http_client.py`
- Modify: `src/aituner/slo.py`
- Modify: `src/aituner/worker.py`
- Test: `tests/test_core_flow.py`
- [ ] Write failing tests for completion token source/mismatch failures and persisted per-request probe details.
- [ ] Run the targeted tests and confirm they fail for the expected reason.
- [ ] Add token source metadata to streamed metrics and request outcomes.
- [ ] Fail requests when configured completion length cannot be verified from usage or differs from expected.
- [ ] Persist probe outcome details under each trial artifact directory.
- [ ] Run targeted tests and the full unittest suite.
### Task 2: State, Spec, And Compare Guards
**Files:**
- Modify: `src/aituner/spec.py`
- Modify: `src/aituner/store.py`
- Modify: `src/aituner/compare.py`
- Modify: `scripts/run_multi_compare.py`
- Test: `tests/test_core_flow.py`
- [ ] Write failing tests for state list isolation, invalid trace numeric bounds, and compare aggregate failure accounting.
- [ ] Run targeted tests and confirm expected failures.
- [ ] Deep-copy/replace trial lists when materializing trials.
- [ ] Validate positive trace controls in `TraceSpec.from_dict`.
- [ ] Report failed/no-feasible counts in compare aggregates without changing existing winner semantics.
- [ ] Run targeted tests and the full unittest suite.
### Task 3: Docs And Open-Source Readiness
**Files:**
- Create: `README.md`
- Create: `LICENSE`
- Create: `CONTRIBUTING.md`
- Create: `SECURITY.md`
- Modify: `pyproject.toml`
- Modify: selected docs under `docs/`
- [ ] Add concise repo usage, verification, and experiment integrity guidance.
- [ ] Add MIT license and contribution/security notes.
- [ ] Add project metadata and optional test extra.
- [ ] Update stale docs about high-stop behavior and current test count.
- [ ] Run JSON validation and full unittest suite.
- [ ] Commit changes in logical groups.

View File

@@ -6,8 +6,23 @@ build-backend = "setuptools.build_meta"
name = "aituner"
version = "0.1.0"
description = "AITuner study orchestrator for OpenAI-compatible serving engines"
readme = "README.md"
requires-python = ">=3.11"
license = {text = "MIT"}
authors = [{name = "AITuner contributors"}]
dependencies = []
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
[project.optional-dependencies]
test = []
[project.scripts]
aituner = "aituner.cli:main"

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import argparse
import hashlib
import json
import os
from pathlib import Path
from typing import Any
from dataclasses import dataclass
@@ -16,7 +17,15 @@ DEFAULT_THINKING_SOURCE = Path(
"/home/admin/cpfs/wjh/bailian-trace/qwen-trace-260321-260327-formatted"
)
DEFAULT_OUTPUT_ROOT = REPO_ROOT / "trace_windows"
LEGACY_TARGET_DATES = ["2026-03-11", "2026-03-12", "2026-03-13", "2026-03-16", "2026-03-17"]
LEGACY_TARGET_DATES = [
"2026-03-11",
"2026-03-12",
"2026-03-13",
"2026-03-14",
"2026-03-15",
"2026-03-16",
"2026-03-17",
]
THINKING_WINDOWS = [
("2026-03-21", "1000"),
("2026-03-22", "1000"),
@@ -214,10 +223,19 @@ def materialize_windows(
stats_by_window = {str(window["window_id"]): WindowStats() for window in windows}
handles: dict[str, Any] = {}
final_paths: dict[str, Path] = {}
temp_paths: dict[str, Path] = {}
completed = False
try:
for window in windows:
window_id = str(window["window_id"])
handles[window_id] = (traces_dir / f"{window_id}.jsonl").open("w", encoding="utf-8")
final_path = traces_dir / f"{window_id}.jsonl"
temp_path = traces_dir / f".{window_id}.jsonl.tmp.{os.getpid()}"
if temp_path.exists():
temp_path.unlink()
final_paths[window_id] = final_path
temp_paths[window_id] = temp_path
handles[window_id] = temp_path.open("w", encoding="utf-8")
for trace_path, prompt_path in sorted(grouped.keys()):
bucket = grouped[(trace_path, prompt_path)]
@@ -262,9 +280,17 @@ def materialize_windows(
f"materialized {trace_path.name} -> matched_rows={matched_rows}",
flush=True,
)
completed = True
finally:
for handle in handles.values():
handle.close()
if completed:
for window_id, temp_path in temp_paths.items():
os.replace(temp_path, final_paths[window_id])
else:
for temp_path in temp_paths.values():
if temp_path.exists():
temp_path.unlink()
return stats_by_window
@@ -334,10 +360,17 @@ def main() -> int:
"window_duration_seconds": 600.0,
"windows": rendered_windows,
}
(output_root / "windows.json").write_text(
json.dumps(windows_payload, ensure_ascii=False, indent=2) + "\n",
encoding="utf-8",
)
windows_path = output_root / "windows.json"
windows_tmp_path = output_root / f".windows.json.tmp.{os.getpid()}"
try:
windows_tmp_path.write_text(
json.dumps(windows_payload, ensure_ascii=False, indent=2) + "\n",
encoding="utf-8",
)
os.replace(windows_tmp_path, windows_path)
finally:
if windows_tmp_path.exists():
windows_tmp_path.unlink()
print(output_root)
print(f"windows={len(rendered_windows)}")
return 0

View File

@@ -0,0 +1,587 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, replace
from pathlib import Path
from typing import Any
from aituner.spec import (
CompareCandidateSpec,
ConfigPatch,
SpecError,
TrialSpec,
load_study_spec,
load_structured_file,
to_jsonable,
)
from aituner.store import StudyStore
from aituner.worker import run_trial
@dataclass(frozen=True)
class RuntimeOverride:
cuda_visible_devices: str
port: int
host: str = "127.0.0.1"
@classmethod
def from_dict(cls, data: dict[str, Any], *, context: str) -> "RuntimeOverride":
cuda_visible_devices = str(data.get("cuda_visible_devices") or "").strip()
if not cuda_visible_devices:
raise SpecError(f"{context}.cuda_visible_devices must be a non-empty string.")
port_value = data.get("port")
if isinstance(port_value, bool) or not isinstance(port_value, int):
raise SpecError(f"{context}.port must be an integer.")
host = str(data.get("host") or "127.0.0.1").strip()
if not host:
raise SpecError(f"{context}.host must be a non-empty string.")
return cls(
cuda_visible_devices=cuda_visible_devices,
port=port_value,
host=host,
)
@dataclass(frozen=True)
class MultiCompareCandidate:
name: str
phase: int
candidate: CompareCandidateSpec
runtime: RuntimeOverride
@classmethod
def from_dict(cls, data: dict[str, Any], *, context: str) -> "MultiCompareCandidate":
name = str(data.get("name") or "").strip()
if not name:
raise SpecError(f"{context}.name must be a non-empty string.")
phase_value = data.get("phase", 1)
if isinstance(phase_value, bool) or not isinstance(phase_value, int) or phase_value < 1:
raise SpecError(f"{context}.phase must be a positive integer.")
candidate = CompareCandidateSpec.from_dict(data, context=context)
runtime = RuntimeOverride.from_dict(
dict(data.get("runtime") or {}),
context=f"{context}.runtime",
)
return cls(name=name, phase=phase_value, candidate=candidate, runtime=runtime)
@dataclass(frozen=True)
class MultiCompareSpec:
compare_id: str
study_spec_path: str
output_root: str | None
window_ids: list[str]
candidates: list[MultiCompareCandidate]
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "MultiCompareSpec":
compare_id = str(data.get("compare_id") or "").strip()
if not compare_id:
raise SpecError("compare_id must be a non-empty string.")
study_spec_path = str(data.get("study_spec_path") or "").strip()
if not study_spec_path:
raise SpecError("study_spec_path must be a non-empty string.")
raw_window_ids = data.get("window_ids")
if not isinstance(raw_window_ids, list) or not raw_window_ids:
raise SpecError("window_ids must be a non-empty list.")
window_ids = [str(item).strip() for item in raw_window_ids if str(item).strip()]
if not window_ids:
raise SpecError("window_ids must contain at least one non-empty string.")
raw_candidates = data.get("candidates")
if not isinstance(raw_candidates, list) or not raw_candidates:
raise SpecError("candidates must be a non-empty list.")
candidates = [
MultiCompareCandidate.from_dict(dict(item), context=f"candidates[{idx}]")
for idx, item in enumerate(raw_candidates)
if isinstance(item, dict)
]
if len(candidates) != len(raw_candidates):
raise SpecError("Every candidates entry must be an object.")
names = [item.name for item in candidates]
if len(names) != len(set(names)):
raise SpecError("candidates names must be unique.")
return cls(
compare_id=compare_id,
study_spec_path=study_spec_path,
output_root=str(data.get("output_root") or "").strip() or None,
window_ids=window_ids,
candidates=candidates,
)
def _resolve_path(raw_path: str, *, base_dir: Path) -> Path:
path = Path(raw_path)
if not path.is_absolute():
path = (base_dir / path).resolve()
return path
def _load_windows_payload(study: Any, *, study_spec_path: Path) -> list[dict[str, Any]]:
windows_path = Path(study.trace.windows_path)
if not windows_path.is_absolute():
windows_path = (study_spec_path.parent / windows_path).resolve()
payload = json.loads(windows_path.read_text(encoding="utf-8"))
raw_windows = payload.get("windows") if isinstance(payload, dict) else payload
if not isinstance(raw_windows, list):
raise SpecError(f"windows payload must contain a list: {windows_path}")
return [
{str(key): value for key, value in item.items()}
for item in raw_windows
if isinstance(item, dict)
]
def _select_windows(spec: MultiCompareSpec, *, study: Any, study_spec_path: Path) -> list[dict[str, Any]]:
windows = _load_windows_payload(study, study_spec_path=study_spec_path)
indexed = {str(item.get("window_id") or "").strip(): item for item in windows}
selected: list[dict[str, Any]] = []
for window_id in spec.window_ids:
item = indexed.get(window_id)
if item is None:
raise SpecError(f"window_id not found in windows payload: {window_id}")
selected.append(item)
return selected
def _load_config_patch(
candidate: MultiCompareCandidate,
*,
spec_path: Path,
) -> tuple[ConfigPatch, dict[str, Any]]:
if candidate.candidate.config_patch is not None:
config_patch = candidate.candidate.config_patch
return config_patch, {
"kind": "config_patch",
"config_patch": {
"env_patch": dict(config_patch.env_patch),
"flag_patch": dict(config_patch.flag_patch),
},
}
assert candidate.candidate.trial_ref is not None
study_root = _resolve_path(candidate.candidate.trial_ref.study_root, base_dir=spec_path.parent)
trial_spec_path = study_root / "trials" / candidate.candidate.trial_ref.trial_id / "trial_spec.json"
if not trial_spec_path.exists():
raise SpecError(f"trial_ref target not found: {trial_spec_path}")
payload = json.loads(trial_spec_path.read_text(encoding="utf-8"))
config_patch = ConfigPatch.from_dict(payload.get("config_patch") or {})
return config_patch, {
"kind": "trial_ref",
"study_root": str(study_root),
"trial_id": candidate.candidate.trial_ref.trial_id,
"config_patch": {
"env_patch": dict(config_patch.env_patch),
"flag_patch": dict(config_patch.flag_patch),
},
}
def _parse_int_like(value: Any, *, default: int = 1) -> int:
if value is None:
return default
if isinstance(value, bool):
raise SpecError("Topology values must be integers.")
if isinstance(value, int):
return value
if isinstance(value, float) and value.is_integer():
return int(value)
if isinstance(value, str) and value.strip():
return int(value.strip())
raise SpecError(f"Unable to parse integer topology value: {value!r}")
def _parallel_size_for_candidate(*, study: Any, patch: ConfigPatch) -> int:
flags = dict(study.engine.base_flags)
flags.update(patch.flag_patch)
tp = _parse_int_like(flags.get("tensor-parallel-size"), default=1)
dp = _parse_int_like(flags.get("data-parallel-size"), default=1)
return tp * dp
def _trial_snapshot(trial: TrialSpec) -> dict[str, Any]:
return to_jsonable(trial)
def _study_snapshot(study: Any) -> dict[str, Any]:
return to_jsonable(study)
def _run_candidate_for_window(
*,
compare_id: str,
compare_root: Path,
study: Any,
study_spec_path: Path,
window_id: str,
candidate: MultiCompareCandidate,
config_patch: ConfigPatch,
source: dict[str, Any],
) -> dict[str, Any]:
run_root = compare_root / "runs" / window_id / candidate.name
run_root.mkdir(parents=True, exist_ok=True)
result_path = run_root / "result.json"
if result_path.exists():
result = json.loads(result_path.read_text(encoding="utf-8"))
parallel_size = _parallel_size_for_candidate(study=study, patch=config_patch)
best_rate = result.get("best_request_rate")
best_rate_per_gpu = (
float(best_rate) / float(parallel_size)
if isinstance(best_rate, (int, float)) and parallel_size > 0
else None
)
return {
"candidate": candidate.name,
"source": source,
"parallel_size": parallel_size,
"runtime": {
"cuda_visible_devices": candidate.runtime.cuda_visible_devices,
"port": candidate.runtime.port,
"host": candidate.runtime.host,
},
"config_patch": {
"env_patch": dict(config_patch.env_patch),
"flag_patch": dict(config_patch.flag_patch),
},
"status": result.get("status"),
"best_sampling_u": result.get("best_sampling_u"),
"best_request_rate": best_rate,
"best_request_rate_per_gpu": best_rate_per_gpu,
"best_pass_rate": result.get("best_pass_rate"),
"best_request_count": result.get("best_request_count"),
"failure_stage": result.get("failure_stage"),
"failure_reason": result.get("failure_reason"),
"artifact_dir": str(run_root),
"result_path": str(result_path),
"probe_log_path": str(run_root / "probe_history.json"),
"engine_log_path": str(run_root / "engine.log"),
"resumed": True,
}
engine_envs = dict(study.engine.base_envs)
engine_envs["CUDA_VISIBLE_DEVICES"] = candidate.runtime.cuda_visible_devices
engine_flags = dict(study.engine.base_flags)
engine_flags["port"] = candidate.runtime.port
runtime_study = replace(
study,
trace=replace(study.trace, window_id=window_id),
engine=replace(
study.engine,
host=candidate.runtime.host,
port=candidate.runtime.port,
base_envs=engine_envs,
base_flags=engine_flags,
),
)
actual_study_path = run_root / "study_spec.json"
source_path = run_root / "study_spec.source"
actual_study_path.write_text(
json.dumps(_study_snapshot(runtime_study), ensure_ascii=False, indent=2) + "\n",
encoding="utf-8",
)
source_path.write_text(str(actual_study_path) + "\n", encoding="utf-8")
trial = TrialSpec(
study_id=compare_id,
trial_id=candidate.name,
config_patch=config_patch,
search=runtime_study.search,
study_spec_path=str(source_path),
artifact_dir=str(run_root),
probe_log_path=str(run_root / "probe_history.json"),
engine_log_path=str(run_root / "engine.log"),
result_path=str(result_path),
)
StudyStore.write_json(run_root / "trial_spec.json", _trial_snapshot(trial))
result = run_trial(run_root / "trial_spec.json")
parallel_size = _parallel_size_for_candidate(study=runtime_study, patch=config_patch)
best_rate = result.get("best_request_rate")
best_rate_per_gpu = (
float(best_rate) / float(parallel_size)
if isinstance(best_rate, (int, float)) and parallel_size > 0
else None
)
return {
"candidate": candidate.name,
"source": source,
"parallel_size": parallel_size,
"runtime": {
"cuda_visible_devices": candidate.runtime.cuda_visible_devices,
"port": candidate.runtime.port,
"host": candidate.runtime.host,
},
"config_patch": {
"env_patch": dict(config_patch.env_patch),
"flag_patch": dict(config_patch.flag_patch),
},
"status": result.get("status"),
"best_sampling_u": result.get("best_sampling_u"),
"best_request_rate": best_rate,
"best_request_rate_per_gpu": best_rate_per_gpu,
"best_pass_rate": result.get("best_pass_rate"),
"best_request_count": result.get("best_request_count"),
"failure_stage": result.get("failure_stage"),
"failure_reason": result.get("failure_reason"),
"artifact_dir": str(run_root),
"result_path": str(result_path),
"probe_log_path": str(run_root / "probe_history.json"),
"engine_log_path": str(run_root / "engine.log"),
"resumed": False,
}
def _winner(candidates: dict[str, dict[str, Any]]) -> str:
scored = [
(name, float(result["best_request_rate_per_gpu"]))
for name, result in candidates.items()
if isinstance(result.get("best_request_rate_per_gpu"), (int, float))
]
if not scored:
return "incomparable"
scored.sort(key=lambda item: item[1], reverse=True)
if len(scored) > 1 and scored[0][1] == scored[1][1]:
return "tie"
return scored[0][0]
def _aggregate(rows: list[dict[str, Any]], candidates: list[MultiCompareCandidate]) -> dict[str, Any]:
candidate_names = [item.name for item in candidates]
wins = {name: 0 for name in candidate_names}
wins["tie"] = 0
wins["incomparable"] = 0
means: dict[str, dict[str, Any]] = {}
for name in candidate_names:
rates = [
float(row["candidates"][name]["best_request_rate"])
for row in rows
if isinstance(row["candidates"][name].get("best_request_rate"), (int, float))
]
rates_per_gpu = [
float(row["candidates"][name]["best_request_rate_per_gpu"])
for row in rows
if isinstance(row["candidates"][name].get("best_request_rate_per_gpu"), (int, float))
]
pass_rates = [
float(row["candidates"][name]["best_pass_rate"])
for row in rows
if isinstance(row["candidates"][name].get("best_pass_rate"), (int, float))
]
means[name] = {
"mean_request_rate": (sum(rates) / len(rates)) if rates else None,
"mean_request_rate_per_gpu": (sum(rates_per_gpu) / len(rates_per_gpu))
if rates_per_gpu
else None,
"mean_pass_rate": (sum(pass_rates) / len(pass_rates)) if pass_rates else None,
**_candidate_result_counts(rows, name),
}
for row in rows:
wins[row["winner"]] = wins.get(row["winner"], 0) + 1
return {
"window_count": len(rows),
"wins": wins,
"candidates": means,
}
def _candidate_result_counts(rows: list[dict[str, Any]], name: str) -> dict[str, int]:
counts = {
"completed_window_count": 0,
"failed_window_count": 0,
"no_feasible_window_count": 0,
}
for row in rows:
result = row.get("candidates", {}).get(name)
if not isinstance(result, dict):
continue
status = str(result.get("status") or "")
if status == "completed":
counts["completed_window_count"] += 1
elif status == "failed":
counts["failed_window_count"] += 1
if not isinstance(result.get("best_request_rate_per_gpu"), (int, float)):
counts["no_feasible_window_count"] += 1
return counts
def _render_report(summary: dict[str, Any], candidates: list[MultiCompareCandidate]) -> str:
candidate_names = [item.name for item in candidates]
lines = [
f"# {summary['compare_id']}",
"",
"## Setup",
"",
f"- Study spec: `{summary['study_spec_path']}`",
f"- Compare root: `{summary['compare_root']}`",
f"- Windows: `{len(summary['windows'])}`",
"",
"## Candidates",
"",
]
for item in candidates:
lines.append(
f"- `{item.name}`: phase=`{item.phase}`, gpus=`{item.runtime.cuda_visible_devices}`, port=`{item.runtime.port}`"
)
lines.extend(
[
"",
"## Aggregate",
"",
f"- Wins: `{json.dumps(summary['aggregate']['wins'], ensure_ascii=False)}`",
]
)
for name in candidate_names:
aggregate = summary["aggregate"]["candidates"][name]
lines.append(
f"- `{name}` mean req/s=`{aggregate['mean_request_rate']}`, mean req/s/gpu=`{aggregate['mean_request_rate_per_gpu']}`, mean pass_rate=`{aggregate['mean_pass_rate']}`"
)
lines.append(
f" completed/failed/no-feasible windows=`{aggregate['completed_window_count']}`/`{aggregate['failed_window_count']}`/`{aggregate['no_feasible_window_count']}`"
)
header = ["Window", "Date"]
for name in candidate_names:
header.extend([f"{name} req/s", f"{name} req/s/gpu"])
header.append("Winner")
lines.extend(
[
"",
"## Per Window",
"",
"| " + " | ".join(header) + " |",
"| " + " | ".join(["---"] * len(header)) + " |",
]
)
for row in summary["windows"]:
cells = [f"`{row['window_id']}`", f"`{row.get('date') or ''}`"]
for name in candidate_names:
candidate = row["candidates"][name]
cells.append(f"`{candidate.get('best_request_rate')}`")
cells.append(f"`{candidate.get('best_request_rate_per_gpu')}`")
cells.append(f"`{row['winner']}`")
lines.append("| " + " | ".join(cells) + " |")
lines.append("")
return "\n".join(lines)
def run_multi_compare(spec_path: Path) -> dict[str, Any]:
spec_path = spec_path.resolve()
spec = MultiCompareSpec.from_dict(dict(load_structured_file(spec_path)))
study_spec_path = _resolve_path(spec.study_spec_path, base_dir=spec_path.parent)
study = load_study_spec(study_spec_path)
compare_root = (
_resolve_path(spec.output_root, base_dir=spec_path.parent)
if spec.output_root
else (Path(".aituner-compare") / spec.compare_id).resolve()
)
compare_root.mkdir(parents=True, exist_ok=True)
windows = _select_windows(spec, study=study, study_spec_path=study_spec_path)
candidate_payloads = []
resolved_candidates: dict[str, tuple[MultiCompareCandidate, ConfigPatch, dict[str, Any]]] = {}
for candidate in spec.candidates:
config_patch, source = _load_config_patch(candidate, spec_path=spec_path)
resolved_candidates[candidate.name] = (candidate, config_patch, source)
candidate_payloads.append(
{
"name": candidate.name,
"phase": candidate.phase,
"runtime": {
"cuda_visible_devices": candidate.runtime.cuda_visible_devices,
"port": candidate.runtime.port,
"host": candidate.runtime.host,
},
"source": source,
}
)
snapshot = {
"compare_id": spec.compare_id,
"study_spec_path": str(study_spec_path),
"window_ids": spec.window_ids,
"candidates": candidate_payloads,
}
StudyStore.write_json(compare_root / "compare_spec.snapshot.json", snapshot)
phases = sorted({item.phase for item in spec.candidates})
per_window: list[dict[str, Any]] = []
for window in windows:
window_id = str(window["window_id"])
row = {
"window_id": window_id,
"trace_type": window.get("trace_type"),
"date": window.get("date"),
"slot_token": window.get("slot_token"),
"slot_label": window.get("slot_label"),
"window_start": window.get("window_start"),
"window_end": window.get("window_end"),
"candidates": {},
}
for phase in phases:
phase_candidates = [item for item in spec.candidates if item.phase == phase]
with ThreadPoolExecutor(max_workers=len(phase_candidates)) as executor:
future_map = {
executor.submit(
_run_candidate_for_window,
compare_id=spec.compare_id,
compare_root=compare_root,
study=study,
study_spec_path=study_spec_path,
window_id=window_id,
candidate=item,
config_patch=resolved_candidates[item.name][1],
source=resolved_candidates[item.name][2],
): item.name
for item in phase_candidates
}
for future in as_completed(future_map):
result = future.result()
row["candidates"][result["candidate"]] = result
row["winner"] = _winner(row["candidates"])
per_window.append(row)
partial_summary = {
"compare_id": spec.compare_id,
"study_spec_path": str(study_spec_path),
"compare_root": str(compare_root),
"windows": per_window,
"aggregate": _aggregate(per_window, spec.candidates),
}
StudyStore.write_json(compare_root / "summary.json", partial_summary)
(compare_root / "report.md").write_text(
_render_report(partial_summary, spec.candidates),
encoding="utf-8",
)
summary = {
"compare_id": spec.compare_id,
"study_spec_path": str(study_spec_path),
"compare_root": str(compare_root),
"windows": per_window,
"aggregate": _aggregate(per_window, spec.candidates),
}
StudyStore.write_json(compare_root / "summary.json", summary)
(compare_root / "report.md").write_text(
_render_report(summary, spec.candidates),
encoding="utf-8",
)
return summary
def main() -> int:
parser = argparse.ArgumentParser(description="Run a multi-candidate compare over trace windows.")
parser.add_argument("--spec", required=True)
args = parser.parse_args()
summary = run_multi_compare(Path(args.spec))
print(
json.dumps(
{
"compare_id": summary["compare_id"],
"compare_root": summary["compare_root"],
"window_count": summary["aggregate"]["window_count"],
"wins": summary["aggregate"]["wins"],
},
ensure_ascii=False,
indent=2,
)
)
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -6,14 +6,101 @@ import sys
from pathlib import Path
from .compare import run_compare
from .harness import (
build_harness_context,
build_harness_guided_proposal,
build_harness_stop_proposal,
)
from .job import append_job, build_trial_job
from .llm import build_prompt, call_llm_for_proposal, load_capability_profile, parse_proposal_text
from .spec import Proposal, SpecError, load_study_spec
from .spec import Proposal, SpecError, load_study_spec, to_jsonable
from .store import StudyStore
from .trace import load_trace_requests, summarize_window
from .worker import run_trial
def _is_empty_config_patch(proposal: Proposal) -> bool:
return not proposal.config_patch.env_patch and not proposal.config_patch.flag_patch
def _latency_percentiles(summary: object, metric: str) -> dict[str, float]:
if not isinstance(summary, dict):
return {}
payload = summary.get(metric)
if not isinstance(payload, dict):
return {}
selected: dict[str, float] = {}
for key in ("mean", "p50", "p95", "p99"):
value = payload.get(key)
if isinstance(value, (int, float)):
selected[key] = float(value)
return selected
def _format_latency_percentiles(metric: str, values: dict[str, float]) -> str:
if not values:
return ""
ordered = ", ".join(
f"{key}={values[key]:.3f}"
for key in ("mean", "p50", "p95", "p99")
if key in values
)
return f"{metric}({ordered})"
def _baseline_all_infeasible_stop(result: dict[str, object]) -> tuple[str, dict[str, object]] | None:
if result.get("status") != "completed":
return None
if isinstance(result.get("best_request_rate"), (int, float)):
return None
probes = result.get("probes")
if not isinstance(probes, list) or not probes:
return None
if any(isinstance(probe, dict) and probe.get("feasible") for probe in probes):
return None
diagnostics = result.get("all_infeasible_diagnostics")
if not isinstance(diagnostics, dict):
diagnostics = {}
lowest_rate = diagnostics.get("request_rate")
lowest_threshold = diagnostics.get("threshold")
pass_rate = diagnostics.get("pass_rate")
early_stop_reason = str(diagnostics.get("early_stop_reason") or "").strip()
latency_summary = diagnostics.get("latency_summary")
ttft = _latency_percentiles(latency_summary, "ttft_ms")
tpot = _latency_percentiles(latency_summary, "tpot_ms")
details: dict[str, object] = {
"lowest_sampled_request_rate": lowest_rate,
"lowest_sampling_u": lowest_threshold,
"lowest_probe_pass_rate": pass_rate,
"early_stop_reason": early_stop_reason,
"lowest_probe_latency_ms": {
"ttft": ttft,
"tpot": tpot,
},
"lowest_probe_latency_summary": latency_summary if isinstance(latency_summary, dict) else {},
}
pieces = [
"Baseline configuration has no feasible probe under the current SLO.",
"Stopping tuning because even the lowest sampled request rate did not meet the target pass rate.",
]
if isinstance(lowest_rate, (int, float)):
pieces.append(f"lowest_sampled_request_rate={float(lowest_rate):.6g}")
if isinstance(lowest_threshold, (int, float)):
pieces.append(f"lowest_sampling_u={float(lowest_threshold):.6g}")
if isinstance(pass_rate, (int, float)):
pieces.append(f"lowest_probe_pass_rate={float(pass_rate):.6g}")
if early_stop_reason:
pieces.append(f"early_stop_reason={early_stop_reason}")
for item in (
_format_latency_percentiles("lowest_probe_ttft_ms", ttft),
_format_latency_percentiles("lowest_probe_tpot_ms", tpot),
):
if item:
pieces.append(item)
return " ".join(pieces), details
def _study_source_path(study_root: Path) -> Path:
return Path((study_root / "study_spec.source").read_text(encoding="utf-8").strip())
@@ -113,48 +200,149 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
study_root = store.init_study(spec_path=spec_path, study=study)
capability_profile = load_capability_profile(study, study_spec_path=spec_path)
proposal_files = [Path(item).resolve() for item in (args.proposal_file or [])]
max_trials = args.max_trials or (len(proposal_files) if proposal_files else 1)
max_trials = args.max_trials or (len(proposal_files) if proposal_files else 2)
if max_trials <= 0:
raise SpecError("max_trials must be positive")
if proposal_files and max_trials > len(proposal_files):
max_trials = len(proposal_files)
if not proposal_files and study.llm.endpoint is None:
raise SpecError("No proposal files provided and study.llm.endpoint is not configured")
executed: list[dict[str, object]] = []
for idx in range(max_trials):
state = store.load_state(study.study_id)
if state.tuning_stop_reason:
executed.append(
{
"trial_id": None,
"stopped": True,
"reason": state.tuning_stop_reason,
"diagnosis": state.tuning_stop_diagnosis,
"details": state.tuning_stop_details,
"state_best_trial_id": state.best_trial_id,
"state_best_request_rate": state.best_request_rate,
}
)
break
if state.next_trial_index > max_trials:
break
window, requests = load_trace_requests(study, study_spec_path=spec_path)
window_summary = summarize_window(requests, window)
harness_context = (
build_harness_context(
study=study,
window_summary=window_summary,
state=state,
)
if study.llm.use_harness
else None
)
prompt = build_prompt(
study=study,
window_summary=summarize_window(requests, window),
window_summary=window_summary,
state=state,
capability_profile=capability_profile,
)
prompt_name = f"prompt-{state.next_trial_index:04d}"
store.write_prompt(study.study_id, prompt_name, prompt)
if proposal_files:
proposal_source = proposal_files[idx]
if (
not proposal_files
and not args.skip_baseline
and state.next_trial_index == 1
and not state.trials
):
proposal_source = None
proposal_name = "baseline-0001"
proposal_text = json.dumps(
{
"observation": "Evaluate the study's initial engine configuration before LLM-guided edits.",
"diagnosis": "Baseline trial aligned with the AITuner evaluate-then-search loop.",
"config_patch": {"env_patch": {}, "flag_patch": {}},
"expected_effects": [
"establish incumbent performance",
"provide bottleneck evidence for harness-guided proposals",
],
"why_not_previous_failures": "No config changes are applied.",
"should_stop": False,
},
ensure_ascii=False,
)
elif proposal_files:
proposal_index = state.next_trial_index - 1
if proposal_index >= len(proposal_files):
break
proposal_source = proposal_files[proposal_index]
proposal_text = proposal_source.read_text(encoding="utf-8")
proposal_name = proposal_source.stem
else:
proposal_source = None
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
proposal_name = f"proposal-{state.next_trial_index:04d}"
stop_proposal = (
build_harness_stop_proposal(harness_context)
if harness_context is not None
else None
)
if stop_proposal is not None:
proposal_text = json.dumps(to_jsonable(stop_proposal), ensure_ascii=False)
proposal_name = f"harness-stop-{state.next_trial_index:04d}"
else:
guided_proposal = (
build_harness_guided_proposal(harness_context)
if harness_context is not None
else None
)
if guided_proposal is not None:
proposal_text = json.dumps(
to_jsonable(guided_proposal),
ensure_ascii=False,
)
proposal_name = f"harness-proposal-{state.next_trial_index:04d}"
else:
if study.llm.endpoint is None:
raise SpecError(
"No proposal files provided, study.llm.endpoint is not configured, "
"and the harness stop guard did not fire."
)
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
proposal_name = f"proposal-{state.next_trial_index:04d}"
raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
raw_proposal_path.write_text(proposal_text, encoding="utf-8")
proposal = parse_proposal_text(proposal_text, study)
store.write_proposal(study.study_id, proposal_name, proposal)
if proposal.should_stop:
if proposal_name.startswith("harness-stop-"):
proposal_source_label = "harness"
else:
proposal_source_label = str(proposal_source) if proposal_source else "llm"
executed.append(
{
"trial_id": None,
"proposal_name": proposal_name,
"proposal_source": proposal_source_label,
"stopped": True,
"diagnosis": proposal.diagnosis,
"state_best_trial_id": state.best_trial_id,
"state_best_request_rate": state.best_request_rate,
}
)
break
is_auto_baseline = (
not proposal_files
and not args.skip_baseline
and state.next_trial_index == 1
and not state.trials
and _is_empty_config_patch(proposal)
)
trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
trial_spec_path = Path(trial.artifact_dir) / "trial_spec.json"
result = run_trial(trial_spec_path)
state = store.ingest_trial_results(study.study_id)
executed.append(
{
"trial_id": trial.trial_id,
"proposal_name": proposal_name,
"proposal_source": str(proposal_source) if proposal_source else "llm",
{
"trial_id": trial.trial_id,
"proposal_name": proposal_name,
"proposal_source": (
"harness"
if proposal_name.startswith("harness-proposal-")
else str(proposal_source) if proposal_source else "llm"
),
"best_sampling_u": result.get("best_sampling_u"),
"best_request_rate": result.get("best_request_rate"),
"best_pass_rate": result.get("best_pass_rate"),
@@ -162,6 +350,26 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
"state_best_request_rate": state.best_request_rate,
}
)
if is_auto_baseline:
stop = _baseline_all_infeasible_stop(result)
if stop is not None:
diagnosis, details = stop
state.tuning_stop_reason = "baseline_all_infeasible"
state.tuning_stop_diagnosis = diagnosis
state.tuning_stop_details = details
store.save_state(state)
executed.append(
{
"trial_id": None,
"stopped": True,
"reason": state.tuning_stop_reason,
"diagnosis": diagnosis,
"details": details,
"state_best_trial_id": state.best_trial_id,
"state_best_request_rate": state.best_request_rate,
}
)
break
final_state = store.load_state(study.study_id)
print(
@@ -171,6 +379,9 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
"executed_trials": executed,
"best_trial_id": final_state.best_trial_id,
"best_request_rate": final_state.best_request_rate,
"tuning_stop_reason": final_state.tuning_stop_reason,
"tuning_stop_diagnosis": final_state.tuning_stop_diagnosis,
"tuning_stop_details": final_state.tuning_stop_details,
},
ensure_ascii=False,
)
@@ -251,6 +462,11 @@ def build_parser() -> argparse.ArgumentParser:
tune.add_argument("--store-root")
tune.add_argument("--proposal-file", action="append")
tune.add_argument("--max-trials", type=int)
tune.add_argument(
"--skip-baseline",
action="store_true",
help="Do not automatically evaluate the initial config before LLM proposals.",
)
tune.set_defaults(func=cmd_study_tune)
worker = subparsers.add_parser("worker")

View File

@@ -382,6 +382,8 @@ def _aggregate_summary(rows: list[dict[str, Any]]) -> dict[str, Any]:
wins = {"baseline": 0, "tuned": 0, "tie": 0, "incomparable": 0}
for row in rows:
wins[row["delta"]["winner"]] += 1
baseline_counts = _candidate_result_counts(rows, "baseline")
tuned_counts = _candidate_result_counts(rows, "tuned")
return {
"window_count": len(rows),
"wins": wins,
@@ -389,9 +391,31 @@ def _aggregate_summary(rows: list[dict[str, Any]]) -> dict[str, Any]:
"tuned_mean_request_rate": _mean_or_none(tuned_rates),
"baseline_mean_request_rate_per_gpu": _mean_or_none(baseline_per_gpu),
"tuned_mean_request_rate_per_gpu": _mean_or_none(tuned_per_gpu),
"baseline_completed_window_count": baseline_counts["completed"],
"baseline_failed_window_count": baseline_counts["failed"],
"baseline_no_feasible_window_count": baseline_counts["no_feasible"],
"tuned_completed_window_count": tuned_counts["completed"],
"tuned_failed_window_count": tuned_counts["failed"],
"tuned_no_feasible_window_count": tuned_counts["no_feasible"],
}
def _candidate_result_counts(rows: list[dict[str, Any]], name: str) -> dict[str, int]:
counts = {"completed": 0, "failed": 0, "no_feasible": 0}
for row in rows:
result = row.get(name)
if not isinstance(result, dict):
continue
status = str(result.get("status") or "")
if status == "completed":
counts["completed"] += 1
elif status == "failed":
counts["failed"] += 1
if not isinstance(result.get("best_request_rate_per_gpu"), (int, float)):
counts["no_feasible"] += 1
return counts
def _mean_or_none(values: list[float]) -> float | None:
if not values:
return None
@@ -417,6 +441,8 @@ def _render_report(summary: dict[str, Any]) -> str:
f"- Tuned mean request rate: `{summary['aggregate']['tuned_mean_request_rate']}`",
f"- Baseline mean request rate per GPU: `{summary['aggregate']['baseline_mean_request_rate_per_gpu']}`",
f"- Tuned mean request rate per GPU: `{summary['aggregate']['tuned_mean_request_rate_per_gpu']}`",
f"- Baseline completed/failed/no-feasible windows: `{summary['aggregate']['baseline_completed_window_count']}`/`{summary['aggregate']['baseline_failed_window_count']}`/`{summary['aggregate']['baseline_no_feasible_window_count']}`",
f"- Tuned completed/failed/no-feasible windows: `{summary['aggregate']['tuned_completed_window_count']}`/`{summary['aggregate']['tuned_failed_window_count']}`/`{summary['aggregate']['tuned_no_feasible_window_count']}`",
"",
"## Per Window",
"",

src/aituner/harness.py (new file, 1230 lines): file diff suppressed because it is too large.

View File

@@ -240,6 +240,8 @@ class StreamMetrics:
ttft_ms: float | None
tpot_ms: float | None
completion_tokens: int | None
completion_tokens_source: str = "usage"
streamed_chunk_count: int = 0
def stream_chat_completion(
@@ -260,6 +262,7 @@ def stream_chat_completion(
last_token_at: float | None = None
chunk_token_count = 0
completion_tokens: int | None = None
completion_tokens_source = "none"
try:
with _urlopen(request, timeout=timeout_s) as response:
for raw in _iter_sse_lines(response):
@@ -273,6 +276,7 @@ def stream_chat_completion(
comp = usage.get("completion_tokens")
if isinstance(comp, int) and comp >= 0:
completion_tokens = comp
completion_tokens_source = "usage"
choices = payload.get("choices")
if not isinstance(choices, list) or not choices:
continue
@@ -290,7 +294,10 @@ def stream_chat_completion(
detail = exc.read().decode("utf-8", errors="replace")
raise HttpClientError(f"stream_chat_completion failed: {exc.code} {detail}") from exc
ttft_ms = None if first_token_at is None else (first_token_at - start) * 1000.0
used_tokens = completion_tokens if completion_tokens is not None else chunk_token_count
if completion_tokens is None and chunk_token_count > 0:
completion_tokens = chunk_token_count
completion_tokens_source = "stream_chunks"
used_tokens = completion_tokens
if (
first_token_at is None
or last_token_at is None
@@ -304,6 +311,8 @@ def stream_chat_completion(
ttft_ms=ttft_ms,
tpot_ms=tpot_ms,
completion_tokens=used_tokens if used_tokens > 0 else None,
completion_tokens_source=completion_tokens_source,
streamed_chunk_count=chunk_token_count,
)
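The fallback order for the streamed token count is: the server-reported usage field first, then the number of streamed content chunks, otherwise no verified count. A minimal standalone sketch of that precedence, with hypothetical inputs:

def resolve_completion_tokens(usage_tokens, chunk_count):
    # Mirrors the precedence in stream_chat_completion: prefer usage.completion_tokens,
    # fall back to counted stream chunks, otherwise report no count at all.
    if isinstance(usage_tokens, int) and usage_tokens >= 0:
        return usage_tokens, "usage"
    if chunk_count > 0:
        return chunk_count, "stream_chunks"
    return None, "none"

print(resolve_completion_tokens(128, 127))   # (128, 'usage')
print(resolve_completion_tokens(None, 127))  # (127, 'stream_chunks')
print(resolve_completion_tokens(None, 0))    # (None, 'none')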

View File

@@ -1,9 +1,11 @@
from __future__ import annotations
import json
import time
from pathlib import Path
from typing import Any
from .harness import build_harness_context, render_harness_context
from .http_client import chat_completion, stream_text_completion
from .spec import LLMPolicySpec, Proposal, SpecError, StudySpec, StudyState
@@ -212,9 +214,10 @@ def build_prompt(
parallel_candidates = _enumerate_parallel_candidates(study)
sections = [
"You are tuning an OpenAI-compatible serving engine.",
"Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures.",
"Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures, should_stop.",
"config_patch must contain env_patch and flag_patch.",
"expected_effects must be a JSON array of short strings, not an object.",
"should_stop must be a boolean. Use true only when the harness convergence guard says another adjacent probe is not justified.",
"Only use allowed tunable env keys and allowed tunable flag keys.",
"Do not wrap the JSON in markdown fences or any extra text.",
"Do not repeat a config that previously failed to launch unless the new patch explicitly removes the failing knob.",
@@ -249,6 +252,9 @@ def build_prompt(
"window_id": study.trace.window_id,
"request_mode": study.trace.request_mode,
"completion_tokens_override": study.trace.completion_tokens_override,
"early_stop_max_lag_s": study.trace.early_stop_max_lag_s,
"early_stop_max_elapsed_s": study.trace.early_stop_max_elapsed_s,
"restart_engine_after_early_stop": study.trace.restart_engine_after_early_stop,
"input_length_filter": (
{
"min_input_tokens": study.trace.input_length_filter.min_input_tokens,
@@ -305,15 +311,70 @@ def build_prompt(
"Parallel space candidates:",
json.dumps(parallel_candidates, ensure_ascii=False, indent=2),
"",
"Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
]
if study.llm.use_harness:
sections.extend(
[
"",
"Harnesses:",
render_harness_context(
build_harness_context(
study=study,
window_summary=window_summary,
state=state,
)
),
"",
]
)
else:
sections.extend(
[
"",
"Harnesses:",
"Disabled by llm.use_harness=false for ablation.",
"",
]
)
sections.extend(
[
"The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
"The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.",
"The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.",
"If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.",
"Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
(
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
if study.llm.use_harness
else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
),
]
)
return "\n".join(sections)
def _tested_config_signatures(state: StudyState) -> list[dict[str, Any]]:
signatures: list[dict[str, Any]] = []
seen: set[str] = set()
for trial in state.trials:
config_patch = trial.config_patch or {}
signature = json.dumps(config_patch, sort_keys=True, ensure_ascii=False)
if signature in seen:
continue
seen.add(signature)
signatures.append(
{
"trial_id": trial.trial_id,
"status": trial.status,
"best_request_rate_per_gpu": trial.best_request_rate_per_gpu,
"config_patch": config_patch,
}
)
return signatures
def load_capability_profile(study: StudySpec, *, study_spec_path: Path) -> dict[str, Any] | None:
if not study.capability_profile_path:
return None
@@ -543,27 +604,41 @@ def call_llm_for_proposal(
) -> str:
if policy.endpoint is None:
raise RuntimeError("study.llm.endpoint is not configured")
if policy.endpoint.stream:
return stream_text_completion(
base_url=policy.endpoint.base_url,
api_key_env=policy.endpoint.api_key_env,
provider=policy.endpoint.provider,
wire_api=policy.endpoint.wire_api,
model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort,
)
response = chat_completion(
base_url=policy.endpoint.base_url,
api_key_env=policy.endpoint.api_key_env,
provider=policy.endpoint.provider,
wire_api=policy.endpoint.wire_api,
model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort,
)
return _extract_response_text(response)
last_error: Exception | None = None
max_attempts = 4
for attempt in range(max_attempts):
try:
if policy.endpoint.stream:
text = stream_text_completion(
base_url=policy.endpoint.base_url,
api_key_env=policy.endpoint.api_key_env,
provider=policy.endpoint.provider,
wire_api=policy.endpoint.wire_api,
model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort,
)
else:
response = chat_completion(
base_url=policy.endpoint.base_url,
api_key_env=policy.endpoint.api_key_env,
provider=policy.endpoint.provider,
wire_api=policy.endpoint.wire_api,
model=policy.endpoint.model,
messages=[{"role": "user", "content": prompt}],
timeout_s=policy.endpoint.timeout_s,
system_prompt=policy.system_prompt,
reasoning_effort=policy.endpoint.reasoning_effort,
)
text = _extract_response_text(response)
if text.strip():
return text
last_error = RuntimeError("LLM response content is empty")
except Exception as exc: # noqa: BLE001
last_error = exc
if attempt < max_attempts - 1:
time.sleep(min(30.0, 2.0 * (2**attempt)))
continue
raise RuntimeError(f"LLM proposal failed after retry: {last_error}") from last_error

View File

@@ -15,6 +15,7 @@ class RequestOutcome:
prompt_tokens: int | None
completion_tokens: int | None
error: str = ""
completion_tokens_source: str = ""
@dataclass(frozen=True)

View File

@@ -63,6 +63,16 @@ def _coerce_str_list(value: Any, *, context: str) -> list[str]:
return result
def _coerce_text(value: Any, *, context: str) -> str:
if isinstance(value, str) and value.strip():
return value.strip()
if isinstance(value, Mapping) or isinstance(value, list):
text = json.dumps(value, ensure_ascii=False, sort_keys=True)
if text:
return text
raise SpecError(f"{context} must be a non-empty string.")
def _coerce_int_list(value: Any, *, context: str) -> list[int]:
if value is None:
return []
@@ -327,6 +337,7 @@ class TraceSpec:
replay_time_scale: float = 1.0
early_stop_max_lag_s: float | None = None
early_stop_max_elapsed_s: float | None = None
restart_engine_after_early_stop: bool = False
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "TraceSpec":
@@ -343,6 +354,33 @@ class TraceSpec:
)
if completion_tokens_override < 0:
raise SpecError("trace.completion_tokens_override must be >= 0.")
max_requests_value = (
_require_int(max_requests, context="trace.max_requests_per_probe")
if max_requests is not None
else None
)
if max_requests_value is not None and max_requests_value <= 0:
raise SpecError("trace.max_requests_per_probe must be > 0.")
synthetic_prompt_cap_value = (
_require_int(
synthetic_prompt_cap,
context="trace.synthetic_prompt_cap_tokens",
)
if synthetic_prompt_cap is not None
else None
)
if synthetic_prompt_cap_value is not None and synthetic_prompt_cap_value < 0:
raise SpecError("trace.synthetic_prompt_cap_tokens must be >= 0.")
replay_time_scale = _require_float(
data.get("replay_time_scale", 1.0), context="trace.replay_time_scale"
)
if replay_time_scale <= 0:
raise SpecError("trace.replay_time_scale must be > 0.")
max_concurrency = _require_int(
data.get("max_concurrency", 64), context="trace.max_concurrency"
)
if max_concurrency <= 0:
raise SpecError("trace.max_concurrency must be > 0.")
return cls(
windows_path=_require_str(data.get("windows_path"), context="trace.windows_path"),
window_id=_require_str(data.get("window_id"), context="trace.window_id"),
@@ -353,9 +391,7 @@ class TraceSpec:
completion_tokens_override=completion_tokens_override,
u_field=str(data.get("u_field") or "sampling_u").strip(),
timestamp_field=str(data.get("timestamp_field") or "timestamp").strip(),
max_concurrency=_require_int(
data.get("max_concurrency", 64), context="trace.max_concurrency"
),
max_concurrency=max_concurrency,
input_length_filter=(
InputLengthFilterSpec.from_dict(
_require_mapping(
@@ -367,13 +403,9 @@ class TraceSpec:
if data.get("input_length_filter") is not None
else None
),
max_requests_per_probe=int(max_requests) if max_requests is not None else None,
synthetic_prompt_cap_tokens=(
int(synthetic_prompt_cap) if synthetic_prompt_cap is not None else None
),
replay_time_scale=_require_float(
data.get("replay_time_scale", 1.0), context="trace.replay_time_scale"
),
max_requests_per_probe=max_requests_value,
synthetic_prompt_cap_tokens=synthetic_prompt_cap_value,
replay_time_scale=replay_time_scale,
early_stop_max_lag_s=(
_require_float(
data.get("early_stop_max_lag_s"), context="trace.early_stop_max_lag_s"
@@ -389,6 +421,14 @@ class TraceSpec:
if data.get("early_stop_max_elapsed_s") is not None
else None
),
restart_engine_after_early_stop=(
_require_bool(
data.get("restart_engine_after_early_stop"),
context="trace.restart_engine_after_early_stop",
)
if data.get("restart_engine_after_early_stop") is not None
else request_mode == "decode_only"
),
)
@@ -557,6 +597,7 @@ class LLMPolicySpec:
endpoint: LLMEndpointSpec | None
system_prompt: str
max_history_trials: int
use_harness: bool = True
@classmethod
def from_dict(cls, data: Mapping[str, Any] | None) -> "LLMPolicySpec":
@@ -574,6 +615,11 @@ class LLMPolicySpec:
max_history_trials=_require_int(
payload.get("max_history_trials", 8), context="llm.max_history_trials"
),
use_harness=(
_require_bool(payload.get("use_harness"), context="llm.use_harness")
if payload.get("use_harness") is not None
else True
),
)
@@ -642,6 +688,7 @@ class Proposal:
config_patch: ConfigPatch
expected_effects: list[str]
why_not_previous_failures: str = ""
should_stop: bool = False
@classmethod
def from_dict(cls, data: Mapping[str, Any]) -> "Proposal":
@@ -664,13 +711,18 @@ class Proposal:
expected_effects, context="proposal.expected_effects"
)
return cls(
observation=_require_str(data.get("observation"), context="proposal.observation"),
diagnosis=_require_str(data.get("diagnosis"), context="proposal.diagnosis"),
observation=_coerce_text(data.get("observation"), context="proposal.observation"),
diagnosis=_coerce_text(data.get("diagnosis"), context="proposal.diagnosis"),
config_patch=ConfigPatch.from_dict(
_require_mapping(data.get("config_patch"), context="proposal.config_patch")
),
expected_effects=expected_effects_value,
why_not_previous_failures=str(data.get("why_not_previous_failures") or "").strip(),
should_stop=(
_require_bool(data.get("should_stop"), context="proposal.should_stop")
if data.get("should_stop") is not None
else False
),
)
@@ -712,6 +764,9 @@ class StudyState:
best_request_rate: float | None = None
best_request_rate_per_gpu: float | None = None
next_trial_index: int = 1
tuning_stop_reason: str = ""
tuning_stop_diagnosis: str = ""
tuning_stop_details: dict[str, Any] = field(default_factory=dict)
best_by_parallel_size: dict[str, dict[str, Any]] = field(default_factory=dict)
trials: list[TrialSummary] = field(default_factory=list)

View File

@@ -5,7 +5,15 @@ from dataclasses import replace
from pathlib import Path
from typing import Any
from .spec import Proposal, StudySpec, StudyState, TrialSpec, TrialSummary, to_jsonable
from .spec import ConfigPatch, Proposal, StudySpec, StudyState, TrialSpec, TrialSummary, to_jsonable
_TOPOLOGY_FLAG_KEYS = {
"tensor-parallel-size",
"data-parallel-size",
"expert-parallel-size",
"enable-expert-parallel",
}
class StudyStore:
@@ -37,6 +45,9 @@ class StudyStore:
best_request_rate=payload.get("best_request_rate"),
best_request_rate_per_gpu=payload.get("best_request_rate_per_gpu"),
next_trial_index=int(payload.get("next_trial_index", 1)),
tuning_stop_reason=str(payload.get("tuning_stop_reason") or ""),
tuning_stop_diagnosis=str(payload.get("tuning_stop_diagnosis") or ""),
tuning_stop_details=dict(payload.get("tuning_stop_details") or {}),
best_by_parallel_size={
str(key): value
for key, value in (payload.get("best_by_parallel_size") or {}).items()
@@ -65,6 +76,11 @@ class StudyStore:
state: StudyState,
proposal: Proposal,
) -> tuple[TrialSpec, StudyState]:
proposal = _inherit_incumbent_topology_for_runtime_patch(
study=study,
state=state,
proposal=proposal,
)
trial_id = f"trial-{state.next_trial_index:04d}"
trial_root = self.study_root(study.study_id) / "trials" / trial_id
trial_root.mkdir(parents=True, exist_ok=True)
@@ -85,8 +101,7 @@ class StudyStore:
result_path=str(trial_root / "result.json"),
)
self.write_json(trial_root / "trial_spec.json", to_jsonable(spec))
next_state = replace(state, next_trial_index=state.next_trial_index + 1)
next_state.trials.append(
next_trial = (
TrialSummary(
trial_id=trial_id,
status="queued",
@@ -95,6 +110,11 @@ class StudyStore:
config_patch=to_jsonable(proposal.config_patch),
)
)
next_state = replace(
state,
next_trial_index=state.next_trial_index + 1,
trials=[*state.trials, next_trial],
)
self.save_state(next_state)
return spec, next_state
@@ -225,6 +245,47 @@ def _parallel_size_for_proposal(*, study: StudySpec, proposal: Proposal) -> int:
return _parallel_size_for_config(study=study, flag_patch=proposal.config_patch.flag_patch)
def _inherit_incumbent_topology_for_runtime_patch(
*,
study: StudySpec,
state: StudyState,
proposal: Proposal,
) -> Proposal:
flag_patch = dict(proposal.config_patch.flag_patch)
env_patch = dict(proposal.config_patch.env_patch)
if not flag_patch and not env_patch:
return proposal
if _TOPOLOGY_FLAG_KEYS.intersection(flag_patch):
return proposal
if not state.best_trial_id:
return proposal
incumbent = next(
(trial for trial in state.trials if trial.trial_id == state.best_trial_id),
None,
)
if incumbent is None or not isinstance(incumbent.config_patch, dict):
return proposal
incumbent_patch = incumbent.config_patch.get("flag_patch")
if not isinstance(incumbent_patch, dict):
return proposal
inherited_topology = {
key: value
for key, value in incumbent_patch.items()
if key in _TOPOLOGY_FLAG_KEYS and study.engine.base_flags.get(key) != value
}
if not inherited_topology:
return proposal
merged_flag_patch = dict(inherited_topology)
merged_flag_patch.update(flag_patch)
return replace(
proposal,
config_patch=ConfigPatch(
env_patch=env_patch,
flag_patch=merged_flag_patch,
),
)
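Concretely, a runtime-only patch inherits the incumbent's non-default topology flags before a trial is created, and the proposal's own keys still win on conflict. A standalone walk-through of the merge rule with hypothetical flag values:

TOPOLOGY_FLAG_KEYS = {
    "tensor-parallel-size",
    "data-parallel-size",
    "expert-parallel-size",
    "enable-expert-parallel",
}

# Hypothetical inputs: the study's base flags, the incumbent trial's patch,
# and a new runtime-only proposal that never mentions topology.
base_flags = {"tensor-parallel-size": "1"}
incumbent_flag_patch = {"tensor-parallel-size": "4", "max-num-seqs": "256"}
proposal_flag_patch = {"gpu-memory-utilization": "0.92"}

inherited = {
    key: value
    for key, value in incumbent_flag_patch.items()
    if key in TOPOLOGY_FLAG_KEYS and base_flags.get(key) != value
}
merged = {**inherited, **proposal_flag_patch}
print(merged)  # {'tensor-parallel-size': '4', 'gpu-memory-utilization': '0.92'}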
def _parallel_size_for_trial_id(*, study: StudySpec, study_root: Path, trial_id: str) -> int | None:
trial_spec_path = study_root / "trials" / trial_id / "trial_spec.json"
if not trial_spec_path.exists():

View File

@@ -2,7 +2,8 @@ from __future__ import annotations
import json
import math
from dataclasses import dataclass
import statistics
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Mapping
@@ -39,6 +40,7 @@ class TraceRequest:
body: dict[str, Any]
prompt_tokens_hint: int | None
completion_tokens_hint: int | None
metadata: dict[str, Any] = field(default_factory=dict)
def resolve_window_record(study: StudySpec, *, study_spec_path: Path) -> WindowRecord:
@@ -223,6 +225,12 @@ def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[Win
body=body,
prompt_tokens_hint=prompt_tokens_hint,
completion_tokens_hint=completion_tokens,
metadata={
"hash_ids": row.get("hash_ids") if isinstance(row.get("hash_ids"), list) else None,
"turn": row.get("turn"),
"parent_chat_id": row.get("parent_chat_id"),
"type": row.get("type"),
},
)
)
requests.sort(key=lambda item: item.arrival_s)
@@ -241,6 +249,23 @@ def summarize_window(requests: list[TraceRequest], window: WindowRecord) -> dict
requests[-1].arrival_s - requests[0].arrival_s if len(requests) >= 2 else 0.0
)
qps = (len(requests) / duration) if duration > 0 else 0.0
interarrivals = [
max(0.0, requests[idx].arrival_s - requests[idx - 1].arrival_s)
for idx in range(1, len(requests))
]
mean_interarrival = statistics.fmean(interarrivals) if interarrivals else 0.0
stdev_interarrival = statistics.pstdev(interarrivals) if len(interarrivals) >= 2 else 0.0
interarrival_cv = (
float(stdev_interarrival / mean_interarrival) if mean_interarrival > 0 else 0.0
)
one_second_bins: dict[int, int] = {}
for request in requests:
bin_id = int(math.floor(request.arrival_s))
one_second_bins[bin_id] = one_second_bins.get(bin_id, 0) + 1
one_second_counts = [float(value) for value in one_second_bins.values()]
cache_summary = _cache_summary(requests, window)
p50_prompt = _percentile(prompt_tokens, 50.0)
p95_prompt = _percentile(prompt_tokens, 95.0)
return {
"window_id": window.window_id,
"trace_path": str(window.trace_path),
@@ -248,10 +273,66 @@ def summarize_window(requests: list[TraceRequest], window: WindowRecord) -> dict
"request_count": len(requests),
"duration_s": duration,
"request_rate": qps,
"prompt_tokens_p50": _percentile(prompt_tokens, 50.0),
"prompt_tokens_p95": _percentile(prompt_tokens, 95.0),
"prompt_tokens_p50": p50_prompt,
"prompt_tokens_p95": p95_prompt,
"prompt_tokens_p99": _percentile(prompt_tokens, 99.0),
"prompt_tail_ratio_p95_p50": (
float(p95_prompt / max(p50_prompt, 1.0)) if prompt_tokens else 0.0
),
"completion_tokens_p50": _percentile(completion_tokens, 50.0),
"completion_tokens_p95": _percentile(completion_tokens, 95.0),
"arrival_interarrival_cv": interarrival_cv,
"arrival_qps_1s_p50": _percentile(one_second_counts, 50.0),
"arrival_qps_1s_p95": _percentile(one_second_counts, 95.0),
"arrival_burst_ratio_p95_to_mean": (
float(_percentile(one_second_counts, 95.0) / max(qps, 1e-9))
if one_second_counts and qps > 0
else 0.0
),
"prefix_cache": cache_summary,
}
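To see what the new burstiness fields measure, here is a standalone sketch over a tiny hypothetical arrival list (four requests across roughly three seconds); it uses the max over one-second bins as a stand-in for the p95 helper used above.

import math
import statistics

arrivals = [0.0, 0.2, 0.3, 2.9]  # hypothetical arrival offsets in seconds

interarrivals = [max(0.0, b - a) for a, b in zip(arrivals, arrivals[1:])]
mean_gap = statistics.fmean(interarrivals)
cv = statistics.pstdev(interarrivals) / mean_gap if mean_gap > 0 else 0.0

bins = {}
for t in arrivals:
    second = math.floor(t)
    bins[second] = bins.get(second, 0) + 1
counts = sorted(bins.values())

duration = arrivals[-1] - arrivals[0]
qps = len(arrivals) / duration
print(round(cv, 2), counts, round(max(counts) / qps, 2))  # ~1.2 [1, 3] ~2.17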
def _cache_summary(requests: list[TraceRequest], window: WindowRecord) -> dict[str, Any]:
block_size = int(window.source_payload.get("block_size") or 1)
seen_hashes: set[Any] = set()
repeated_blocks = 0
total_blocks = 0
repeated_token_estimate = 0
total_token_estimate = 0
multi_turn_count = 0
rows_with_hashes = 0
for request in requests:
prompt_tokens = int(request.prompt_tokens_hint or 0)
total_token_estimate += prompt_tokens
turn = request.metadata.get("turn")
if isinstance(turn, (int, float)) and turn > 1:
multi_turn_count += 1
hash_ids = request.metadata.get("hash_ids")
if not isinstance(hash_ids, list):
continue
rows_with_hashes += 1
request_repeated_blocks = 0
for hash_id in hash_ids:
total_blocks += 1
if hash_id in seen_hashes:
repeated_blocks += 1
request_repeated_blocks += 1
else:
seen_hashes.add(hash_id)
repeated_token_estimate += min(prompt_tokens, request_repeated_blocks * block_size)
return {
"block_size": block_size,
"rows_with_hash_ids": rows_with_hashes,
"multi_turn_request_ratio": (
float(multi_turn_count / len(requests)) if requests else 0.0
),
"repeated_block_ratio": float(repeated_blocks / total_blocks) if total_blocks else 0.0,
"repeated_token_ratio_estimate": (
float(repeated_token_estimate / total_token_estimate)
if total_token_estimate
else 0.0
),
}
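The repeated-block accounting is order-dependent: a block hash only counts as repeated once it has already been seen earlier in the window. A standalone sketch with hypothetical hash ids:

# Per-request block hashes within one window, in arrival order (hypothetical).
requests_hash_ids = [
    ["a", "b", "c"],  # first request: every block is new
    ["a", "b", "d"],  # second request: "a" and "b" repeat
    ["a", "e"],       # third request: "a" repeats again
]

seen = set()
repeated = total = 0
for hash_ids in requests_hash_ids:
    for hash_id in hash_ids:
        total += 1
        if hash_id in seen:
            repeated += 1
        else:
            seen.add(hash_id)

print(repeated, total, repeated / total)  # 3 8 0.375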

View File

@@ -105,13 +105,49 @@ def _run_one_request(
) -> RequestOutcome:
try:
metrics = stream_chat_completion(base_url=base_url, body=request.body, timeout_s=timeout_s)
expected_completion_tokens = request.completion_tokens_hint
actual_completion_tokens = metrics.completion_tokens
completion_tokens_source = getattr(metrics, "completion_tokens_source", "")
if expected_completion_tokens is not None:
if completion_tokens_source != "usage":
return RequestOutcome(
request_id=request.row_id,
success=False,
ttft_ms=metrics.ttft_ms,
tpot_ms=metrics.tpot_ms,
prompt_tokens=request.prompt_tokens_hint,
completion_tokens=actual_completion_tokens,
error=(
"completion_tokens_unverified "
f"source={completion_tokens_source or 'unknown'} "
f"expected={expected_completion_tokens} "
f"actual={actual_completion_tokens}"
),
completion_tokens_source=completion_tokens_source,
)
if actual_completion_tokens != expected_completion_tokens:
return RequestOutcome(
request_id=request.row_id,
success=False,
ttft_ms=metrics.ttft_ms,
tpot_ms=metrics.tpot_ms,
prompt_tokens=request.prompt_tokens_hint,
completion_tokens=actual_completion_tokens,
error=(
"completion_tokens_mismatch "
f"expected={expected_completion_tokens} "
f"actual={actual_completion_tokens}"
),
completion_tokens_source=completion_tokens_source,
)
return RequestOutcome(
request_id=request.row_id,
success=True,
ttft_ms=metrics.ttft_ms,
tpot_ms=metrics.tpot_ms,
prompt_tokens=request.prompt_tokens_hint,
completion_tokens=metrics.completion_tokens or request.completion_tokens_hint,
completion_tokens=actual_completion_tokens or request.completion_tokens_hint,
completion_tokens_source=completion_tokens_source,
)
except HttpClientError as exc:
return RequestOutcome(
@@ -125,6 +161,53 @@ def _run_one_request(
)
def _probe_outcome_details(
*,
threshold: float,
selected: list[TraceRequest],
outcomes: list[RequestOutcome],
evaluations: list[Any],
early_stopped: bool,
early_stop_reason: str,
) -> dict[str, Any]:
selected_by_id = {request.row_id: request for request in selected}
return {
"threshold": threshold,
"early_stopped": early_stopped,
"early_stop_reason": early_stop_reason,
"outcomes": [
{
"request_id": outcome.request_id,
"sampling_u": (
selected_by_id[outcome.request_id].sampling_u
if outcome.request_id in selected_by_id
else None
),
"arrival_s": (
selected_by_id[outcome.request_id].arrival_s
if outcome.request_id in selected_by_id
else None
),
"success": outcome.success,
"ttft_ms": outcome.ttft_ms,
"tpot_ms": outcome.tpot_ms,
"prompt_tokens": outcome.prompt_tokens,
"expected_completion_tokens": (
selected_by_id[outcome.request_id].completion_tokens_hint
if outcome.request_id in selected_by_id
else None
),
"completion_tokens": outcome.completion_tokens,
"completion_tokens_source": outcome.completion_tokens_source,
"error": outcome.error,
"evaluation": evaluation.passed,
"reasons": evaluation.reasons,
}
for outcome, evaluation in zip(outcomes, evaluations)
],
}
def _replay_requests(
requests: list[TraceRequest],
*,
@@ -135,6 +218,7 @@ def _replay_requests(
max_lag_s: float | None,
max_elapsed_s: float | None,
evaluate_outcome: Callable[[RequestOutcome], Any],
drain_inflight_on_early_stop: bool = True,
) -> tuple[list[RequestOutcome], bool, str]:
outcomes_by_id: dict[str, RequestOutcome] = {}
lock = threading.Lock()
@@ -209,18 +293,47 @@ def _replay_requests(
if sleep_for > 0:
time.sleep(min(sleep_for, 0.1))
if early_stopped:
for future in list(futures_by_request):
future.cancel()
for request in futures_by_request.values():
outcomes_by_id[request.row_id] = RequestOutcome(
request_id=request.row_id,
success=False,
ttft_ms=None,
tpot_ms=None,
prompt_tokens=request.prompt_tokens_hint,
completion_tokens=request.completion_tokens_hint,
error=early_stop_reason or "probe_early_stopped",
)
if drain_inflight_on_early_stop and futures_by_request:
done, not_done = wait(list(futures_by_request), timeout=timeout_s)
for future in done:
request = futures_by_request[future]
try:
outcomes_by_id[request.row_id] = future.result(timeout=0)
except Exception: # noqa: BLE001
outcomes_by_id[request.row_id] = RequestOutcome(
request_id=request.row_id,
success=False,
ttft_ms=None,
tpot_ms=None,
prompt_tokens=request.prompt_tokens_hint,
completion_tokens=request.completion_tokens_hint,
error=early_stop_reason or "probe_early_stopped",
)
for future in not_done:
future.cancel()
request = futures_by_request[future]
outcomes_by_id[request.row_id] = RequestOutcome(
request_id=request.row_id,
success=False,
ttft_ms=None,
tpot_ms=None,
prompt_tokens=request.prompt_tokens_hint,
completion_tokens=request.completion_tokens_hint,
error=early_stop_reason or "probe_early_stopped",
)
else:
for future in list(futures_by_request):
future.cancel()
for request in futures_by_request.values():
outcomes_by_id[request.row_id] = RequestOutcome(
request_id=request.row_id,
success=False,
ttft_ms=None,
tpot_ms=None,
prompt_tokens=request.prompt_tokens_hint,
completion_tokens=request.completion_tokens_hint,
error=early_stop_reason or "probe_early_stopped",
)
for request in requests:
if request.row_id in submitted_ids:
continue
@@ -310,16 +423,22 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
artifact_dir = Path(trial.artifact_dir)
artifact_dir.mkdir(parents=True, exist_ok=True)
engine_log_path = Path(trial.engine_log_path)
probe_details_path = artifact_dir / "probe_details.jsonl"
if probe_details_path.exists():
probe_details_path.unlink()
with engine_log_path.open("w", encoding="utf-8") as engine_log:
process = subprocess.Popen( # noqa: S603
recipe.argv,
cwd=recipe.cwd,
env=recipe.env,
stdout=engine_log,
stderr=subprocess.STDOUT,
text=True,
start_new_session=True,
)
def launch_process() -> subprocess.Popen[str]:
return subprocess.Popen( # noqa: S603
recipe.argv,
cwd=recipe.cwd,
env=recipe.env,
stdout=engine_log,
stderr=subprocess.STDOUT,
text=True,
start_new_session=True,
)
process = launch_process()
probe_history: list[dict[str, Any]] = []
failure_stage = "engine_launch"
try:
@@ -332,7 +451,9 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
failure_stage = "probe_search"
def evaluator(threshold: float) -> ThresholdProbe[ProbePayload]:
nonlocal process
selected = select_requests_for_threshold(requests, threshold=threshold)
restart_after_early_stop = study.trace.restart_engine_after_early_stop
outcomes, early_stopped, early_stop_reason = _replay_requests(
selected,
base_url=recipe.base_url,
@@ -342,8 +463,21 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
max_lag_s=study.trace.early_stop_max_lag_s,
max_elapsed_s=study.trace.early_stop_max_elapsed_s,
evaluate_outcome=lambda outcome: evaluate_request(outcome, study.slo),
drain_inflight_on_early_stop=not restart_after_early_stop,
)
evaluations, summary = summarize_evaluations(outcomes, study.slo)
probe_details = _probe_outcome_details(
threshold=threshold,
selected=selected,
outcomes=outcomes,
evaluations=evaluations,
early_stopped=early_stopped,
early_stop_reason=early_stop_reason,
)
with probe_details_path.open("a", encoding="utf-8") as details_handle:
details_handle.write(
json.dumps(probe_details, ensure_ascii=False) + "\n"
)
request_rate = (
len(selected) / max(window.window_end - window.window_start, 1e-9)
if selected
@@ -370,6 +504,7 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
"tpot_ms": outcome.tpot_ms,
"prompt_tokens": outcome.prompt_tokens,
"completion_tokens": outcome.completion_tokens,
"completion_tokens_source": outcome.completion_tokens_source,
"evaluation": evaluation.passed,
"reasons": evaluation.reasons,
}
@@ -388,6 +523,15 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
}
probe_history.append(probe_record)
StudyStore.write_json(Path(trial.probe_log_path), probe_history)
if early_stopped and restart_after_early_stop:
_terminate_process_tree(process, timeout_s=30.0)
process = launch_process()
_wait_for_server_or_exit(
process,
base_url=recipe.base_url,
healthcheck_path=recipe.healthcheck_path,
ready_timeout_s=recipe.ready_timeout_s,
)
return ThresholdProbe(
threshold=threshold,
feasible=payload.feasible,

File diff suppressed because it is too large.