Add trace length bucket tuning support

2026-04-07 11:03:16 +08:00
parent e9b5e9b957
commit 46ed688ace
12 changed files with 922 additions and 14 deletions

View File

@@ -0,0 +1,28 @@
{
"observation": "The incumbent should start from the known launch-safe qwen3.5-27b serving recipe on dash0 before asking the LLM to optimize throughput above that baseline.",
"diagnosis": "This model uses a long-context hybrid stack and fp8 quantization. The safest first measurement is to preserve the existing warmup, hybrid-model, chunked-prefill, and prefix-caching behavior from run_qwen27b.sh, while keeping a conservative sequence cap.",
"config_patch": {
"env_patch": {
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1"
},
"flag_patch": {
"tensor-parallel-size": 4,
"gpu-memory-utilization": 0.9,
"block-size": 64,
"max-num-batched-tokens": 8192,
"max-num-seqs": 16,
"enable-prefix-caching": true,
"enable-chunked-prefill": true,
"disable-cascade-attn": true
}
},
"expected_effects": [
"Launch-safe baseline aligned with the current hand-tuned qwen27b recipe while using all 4 visible H20 GPUs",
"Reliable first incumbent under the tighter TTFT and TPOT SLO",
"Clear trial history for the LLM to propose a higher-throughput follow-up patch"
],
"why_not_previous_failures": "This baseline intentionally avoids speculative new kernels or batching spikes before we have an incumbent under the new SLO."
}
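For reference, a config_patch like the one above is an overlay on the study's base_envs/base_flags. The short Python sketch below illustrates that intent; the merge-by-key semantics, the helper names, and the bare-boolean CLI rendering are assumptions for illustration, not the repository's actual API.

from typing import Any


def apply_config_patch(
    base_envs: dict[str, str],
    base_flags: dict[str, Any],
    env_patch: dict[str, str],
    flag_patch: dict[str, Any],
) -> tuple[dict[str, str], dict[str, Any]]:
    # Later keys win, so the patch overrides the base recipe key by key.
    return {**base_envs, **env_patch}, {**base_flags, **flag_patch}


def flags_to_cli(flags: dict[str, Any]) -> list[str]:
    # Render flag entries as vllm serve arguments; booleans are assumed to be
    # emitted bare when true and dropped when false.
    args: list[str] = []
    for key, value in flags.items():
        if isinstance(value, bool):
            if value:
                args.append(f"--{key}")
        else:
            args.extend([f"--{key}", str(value)])
    return args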

View File

@@ -0,0 +1,147 @@
{
"study_id": "dash0-qwen27b-tight-slo-10min-run1",
"hardware": {
"gpu_count": 4,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "qwen3.5-27b-256k-0223-internal",
"served_model_name": "qwen35-27b-aituner"
},
"engine": {
"engine_name": "vllm",
"engine_version": "latest-release-on-dash0",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18082,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
"VLLM_DISABLE_COMPILE_CACHE": "1",
"DS_LLM_IGNORE_WARMUP": "1",
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
"PARAM_TOTAL_MAX": "262144",
"PARAM_IN_LENGTH_MAX": "262144",
"PARAM_MAX_LENGTH_MAX": "131072",
"DS_LLM_MAX_THINK_TOKENS": "81920",
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_GDN_USE_BLADNN": "0",
"VLLM_USE_V1": "1",
"VLLM_IS_HYBRID_MODEL": "1",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "300",
"VLLM_LOG_REQ_KV_LENS": "1",
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18082,
"served-model-name": "qwen35-27b-aituner",
"trust-remote-code": true,
"dtype": "bfloat16",
"gpu-memory-utilization": 0.9,
"enable-prefix-caching": true,
"mamba-cache-mode": "light",
"distributed-executor-backend": "mp",
"block-size": 64,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-cascade-attn": true,
"max-model-len": 262144,
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
"mm-processor-cache-gb": 0,
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
"mamba-cache-dtype": "float32",
"skip-mm-profiling": true,
"quantization": "fp8",
"tensor-parallel-size": 1,
"max-num-seqs": 16,
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ATTENTION_BACKEND",
"VLLM_ENABLE_TORCH_COMPILE",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
],
"tunable_flags": [
"tensor-parallel-size",
"gpu-memory-utilization",
"block-size",
"max-num-batched-tokens",
"max-num-seqs",
"enable-prefix-caching",
"enable-chunked-prefill",
"disable-cascade-attn"
],
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 32,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 1.0,
"tolerance": 0.01,
"max_probes": 8,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
"max_history_trials": 8,
"endpoint": {
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
"model": "gpt-5.4",
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 180
}
}
}
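The step_ms TTFT rule above is a bucketed threshold: the first bucket whose max_input_tokens covers the request supplies the limit, and the final bucket with no bound is the catch-all. A minimal sketch of that reading, using the thresholds from this spec (function names are illustrative, not aituner's API):

TTFT_BUCKETS = [
    {"max_input_tokens": 4096, "threshold_ms": 2000},
    {"max_input_tokens": 32768, "threshold_ms": 4000},
    {"threshold_ms": 6000},  # catch-all for longer prompts
]


def ttft_threshold_ms(input_tokens: int) -> float:
    for bucket in TTFT_BUCKETS:
        limit = bucket.get("max_input_tokens")
        if limit is None or input_tokens <= limit:
            return float(bucket["threshold_ms"])
    raise ValueError("no catch-all TTFT bucket defined")


def request_passes_slo(input_tokens: int, ttft_ms: float, tpot_ms: float) -> bool:
    # tpot_rule is a flat 50 ms per output token regardless of input length.
    return ttft_ms <= ttft_threshold_ms(input_tokens) and tpot_ms <= 50.0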

View File

@@ -0,0 +1,147 @@
{
"study_id": "dash0-qwen27b-tight-slo-10min-run2",
"hardware": {
"gpu_count": 4,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "qwen3.5-27b-256k-0223-internal",
"served_model_name": "qwen35-27b-aituner"
},
"engine": {
"engine_name": "vllm",
"engine_version": "latest-release-on-dash0",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18082,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
"VLLM_DISABLE_COMPILE_CACHE": "1",
"DS_LLM_IGNORE_WARMUP": "1",
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
"PARAM_TOTAL_MAX": "262144",
"PARAM_IN_LENGTH_MAX": "262144",
"PARAM_MAX_LENGTH_MAX": "131072",
"DS_LLM_MAX_THINK_TOKENS": "81920",
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_GDN_USE_BLADNN": "0",
"VLLM_USE_V1": "1",
"VLLM_IS_HYBRID_MODEL": "1",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "300",
"VLLM_LOG_REQ_KV_LENS": "1",
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18082,
"served-model-name": "qwen35-27b-aituner",
"trust-remote-code": true,
"dtype": "bfloat16",
"gpu-memory-utilization": 0.9,
"enable-prefix-caching": true,
"mamba-cache-mode": "light",
"distributed-executor-backend": "mp",
"block-size": 64,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-cascade-attn": true,
"max-model-len": 262144,
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
"mm-processor-cache-gb": 0,
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
"mamba-cache-dtype": "float32",
"skip-mm-profiling": true,
"quantization": "fp8",
"tensor-parallel-size": 4,
"max-num-seqs": 16,
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ATTENTION_BACKEND",
"VLLM_ENABLE_TORCH_COMPILE",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
],
"tunable_flags": [
"tensor-parallel-size",
"gpu-memory-utilization",
"block-size",
"max-num-batched-tokens",
"max-num-seqs",
"enable-prefix-caching",
"enable-chunked-prefill",
"disable-cascade-attn"
],
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 32,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.0625,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
"max_history_trials": 8,
"endpoint": {
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
"model": "gpt-5.4",
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 180
}
}
}
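run2 narrows the search range to [0.0, 0.0625] with a finer tolerance, so fewer probes are needed to converge. The intent of low/high/tolerance/max_probes is a bisection over sampling_u that keeps the highest value whose probe still met the SLO; binary_search_max_feasible does exist in aituner.search (it is imported by the tests below), but its signature is not shown in this commit, so the following is only a sketch of the idea:

from typing import Callable, Optional


def bisect_max_feasible(
    low: float,
    high: float,
    tolerance: float,
    max_probes: int,
    is_feasible: Callable[[float], bool],
) -> Optional[float]:
    best: Optional[float] = None
    for _ in range(max_probes):
        if high - low <= tolerance:
            break
        mid = (low + high) / 2.0
        if is_feasible(mid):  # a probe replays the trace window at sampling_u = mid
            best, low = mid, mid
        else:
            high = mid
    return best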

View File

@@ -0,0 +1,147 @@
{
"study_id": "dash0-qwen27b-tight-slo-10min-run3",
"hardware": {
"gpu_count": 4,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "qwen3.5-27b-256k-0223-internal",
"served_model_name": "qwen35-27b-aituner"
},
"engine": {
"engine_name": "vllm",
"engine_version": "latest-release-on-dash0",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18082,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
"VLLM_DISABLE_COMPILE_CACHE": "1",
"DS_LLM_IGNORE_WARMUP": "1",
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
"PARAM_TOTAL_MAX": "262144",
"PARAM_IN_LENGTH_MAX": "262144",
"PARAM_MAX_LENGTH_MAX": "131072",
"DS_LLM_MAX_THINK_TOKENS": "81920",
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_GDN_USE_BLADNN": "0",
"VLLM_USE_V1": "1",
"VLLM_IS_HYBRID_MODEL": "1",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "300",
"VLLM_LOG_REQ_KV_LENS": "1",
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18082,
"served-model-name": "qwen35-27b-aituner",
"trust-remote-code": true,
"dtype": "bfloat16",
"gpu-memory-utilization": 0.9,
"enable-prefix-caching": true,
"mamba-cache-mode": "light",
"distributed-executor-backend": "mp",
"block-size": 64,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-cascade-attn": true,
"max-model-len": 262144,
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
"mm-processor-cache-gb": 0,
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
"mamba-cache-dtype": "float32",
"skip-mm-profiling": true,
"quantization": "fp8",
"tensor-parallel-size": 4,
"max-num-seqs": 16,
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ATTENTION_BACKEND",
"VLLM_ENABLE_TORCH_COMPILE",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
],
"tunable_flags": [
"tensor-parallel-size",
"gpu-memory-utilization",
"block-size",
"max-num-batched-tokens",
"max-num-seqs",
"enable-prefix-caching",
"enable-chunked-prefill",
"disable-cascade-attn"
],
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 32,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.0625,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
"max_history_trials": 8,
"endpoint": {
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
"model": "gpt-5.4",
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 180
}
}
}

View File

@@ -0,0 +1,151 @@
{
"study_id": "dash0-qwen27b-tight-slo-10min-run4-chat-0-8k",
"hardware": {
"gpu_count": 4,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "qwen3.5-27b-256k-0223-internal",
"served_model_name": "qwen35-27b-aituner"
},
"engine": {
"engine_name": "vllm",
"engine_version": "latest-release-on-dash0",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18082,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
"VLLM_DISABLE_COMPILE_CACHE": "1",
"DS_LLM_IGNORE_WARMUP": "1",
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
"PARAM_TOTAL_MAX": "262144",
"PARAM_IN_LENGTH_MAX": "262144",
"PARAM_MAX_LENGTH_MAX": "131072",
"DS_LLM_MAX_THINK_TOKENS": "81920",
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_GDN_USE_BLADNN": "0",
"VLLM_USE_V1": "1",
"VLLM_IS_HYBRID_MODEL": "1",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "300",
"VLLM_LOG_REQ_KV_LENS": "1",
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18082,
"served-model-name": "qwen35-27b-aituner",
"trust-remote-code": true,
"dtype": "bfloat16",
"gpu-memory-utilization": 0.9,
"enable-prefix-caching": true,
"mamba-cache-mode": "light",
"distributed-executor-backend": "mp",
"block-size": 64,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-cascade-attn": true,
"max-model-len": 262144,
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
"mm-processor-cache-gb": 0,
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
"mamba-cache-dtype": "float32",
"skip-mm-profiling": true,
"quantization": "fp8",
"tensor-parallel-size": 4,
"max-num-seqs": 16,
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ATTENTION_BACKEND",
"VLLM_ENABLE_TORCH_COMPILE",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
],
"tunable_flags": [
"tensor-parallel-size",
"gpu-memory-utilization",
"block-size",
"max-num-batched-tokens",
"max-num-seqs",
"enable-prefix-caching",
"enable-chunked-prefill",
"disable-cascade-attn"
],
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 32,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192
},
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.0625,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
"max_history_trials": 8,
"endpoint": {
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
"model": "gpt-5.4",
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 180
}
}
}
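The new input_length_filter block is the only difference from run2/run3: only trace rows whose prompt length falls within [0, 8192] tokens are replayed. Under that filter every retained request lands in one of the first two TTFT buckets, so the 6000 ms catch-all never applies. A one-line illustration (the real filtering is implemented in trace.py later in this commit):

def ttft_threshold_for_filtered_request(prompt_tokens: int) -> int:
    # Valid only inside the run4 window: 0 <= prompt_tokens <= 8192.
    return 2000 if prompt_tokens <= 4096 else 4000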

View File

@@ -50,7 +50,11 @@
"window_id": "chat_w_example_0001", "window_id": "chat_w_example_0001",
"u_field": "sampling_u", "u_field": "sampling_u",
"timestamp_field": "timestamp", "timestamp_field": "timestamp",
"max_concurrency": 64 "max_concurrency": 64,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192
}
}, },
"slo": { "slo": {
"target_pass_rate": 0.95, "target_pass_rate": 0.95,

View File

@@ -42,6 +42,11 @@ def build_prompt(
json.dumps(
{
"study_id": study.study_id,
"current_best": {
"trial_id": state.best_trial_id,
"best_sampling_u": state.best_sampling_u,
"best_request_rate": state.best_request_rate,
},
"hardware": {
"gpu_count": study.hardware.gpu_count,
"gpu_model": study.hardware.gpu_model,
@@ -50,6 +55,17 @@ def build_prompt(
"model_id": study.model.model_id,
"served_model_name": study.model.served_model_name,
},
"trace": {
"window_id": study.trace.window_id,
"input_length_filter": (
{
"min_input_tokens": study.trace.input_length_filter.min_input_tokens,
"max_input_tokens": study.trace.input_length_filter.max_input_tokens,
}
if study.trace.input_length_filter is not None
else None
),
},
"engine": { "engine": {
"engine_name": study.engine.engine_name, "engine_name": study.engine.engine_name,
"engine_version": study.engine.engine_version, "engine_version": study.engine.engine_version,
@@ -84,6 +100,8 @@ def build_prompt(
"Trial history:", "Trial history:",
json.dumps(history, ensure_ascii=False, indent=2), json.dumps(history, ensure_ascii=False, indent=2),
"", "",
"The proposal must beat the current incumbent. Do not propose a config that is only likely to be feasible below the current best_sampling_u/request_rate.",
"The evaluator for a new trial will start searching from the current best feasible sampling_u and only look for improvements above it.",
"The proposal should improve the maximum feasible sampling_u under the 95%+ SLO target.", "The proposal should improve the maximum feasible sampling_u under the 95%+ SLO target.",
] ]
return "\n".join(sections) return "\n".join(sections)
@@ -110,8 +128,22 @@ def validate_proposal(proposal: Proposal, study: StudySpec) -> Proposal:
return proposal
def _parse_json_object_text(text: str) -> dict[str, Any]:
try:
payload = json.loads(text)
except json.JSONDecodeError:
start = text.find("{")
end = text.rfind("}")
if start < 0 or end < start:
raise
payload = json.loads(text[start : end + 1])
if not isinstance(payload, dict):
raise SpecError("proposal payload must be a JSON object")
return payload
def parse_proposal_text(text: str, study: StudySpec) -> Proposal:
-payload = json.loads(text)
+payload = _parse_json_object_text(text)
proposal = Proposal.from_dict(payload)
return validate_proposal(proposal, study)
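The fallback above makes proposal parsing tolerant of LLM replies that wrap the JSON in prose or a code fence: when json.loads fails on the raw text, the outermost {...} span is extracted and parsed instead. A small illustration of what that slice recovers (the reply text is made up):

import json

reply = 'Here is the proposal:\n```json\n{"observation": "obs", "diagnosis": "diag"}\n```'
start, end = reply.find("{"), reply.rfind("}")
payload = json.loads(reply[start : end + 1])  # what the fallback hands to Proposal.from_dict
assert payload["observation"] == "obs"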

View File

@@ -142,6 +142,42 @@ class EngineLaunchSpec:
)
@dataclass(frozen=True)
class InputLengthFilterSpec:
min_input_tokens: int | None = None
max_input_tokens: int | None = None
@classmethod
def from_dict(cls, data: Mapping[str, Any], *, context: str) -> "InputLengthFilterSpec":
min_input_tokens = data.get("min_input_tokens")
max_input_tokens = data.get("max_input_tokens")
spec = cls(
min_input_tokens=(
_require_int(min_input_tokens, context=f"{context}.min_input_tokens")
if min_input_tokens is not None
else None
),
max_input_tokens=(
_require_int(max_input_tokens, context=f"{context}.max_input_tokens")
if max_input_tokens is not None
else None
),
)
if spec.min_input_tokens is None and spec.max_input_tokens is None:
raise SpecError(
f"{context} must define at least one of min_input_tokens/max_input_tokens."
)
if (
spec.min_input_tokens is not None
and spec.max_input_tokens is not None
and spec.min_input_tokens > spec.max_input_tokens
):
raise SpecError(
f"{context}.min_input_tokens must be <= {context}.max_input_tokens."
)
return spec
@dataclass(frozen=True)
class TraceSpec:
windows_path: str
@@ -150,6 +186,7 @@ class TraceSpec:
u_field: str
timestamp_field: str
max_concurrency: int
input_length_filter: InputLengthFilterSpec | None = None
max_requests_per_probe: int | None = None
synthetic_prompt_cap_tokens: int | None = None
replay_time_scale: float = 1.0
@@ -171,6 +208,17 @@ class TraceSpec:
max_concurrency=_require_int(
data.get("max_concurrency", 64), context="trace.max_concurrency"
),
input_length_filter=(
InputLengthFilterSpec.from_dict(
_require_mapping(
data.get("input_length_filter"),
context="trace.input_length_filter",
),
context="trace.input_length_filter",
)
if data.get("input_length_filter") is not None
else None
),
max_requests_per_probe=int(max_requests) if max_requests is not None else None,
synthetic_prompt_cap_tokens=(
int(synthetic_prompt_cap) if synthetic_prompt_cap is not None else None
@@ -454,6 +502,7 @@ class TrialSummary:
class StudyState:
study_id: str
best_trial_id: str | None = None
best_sampling_u: float | None = None
best_request_rate: float | None = None
next_trial_index: int = 1
trials: list[TrialSummary] = field(default_factory=list)
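InputLengthFilterSpec enforces the two invariants exercised by the new tests: at least one bound must be present, and min_input_tokens must not exceed max_input_tokens. A quick usage sketch, assuming the class is exported from aituner.spec like the other spec types:

from aituner.spec import InputLengthFilterSpec  # assumed export path

spec = InputLengthFilterSpec.from_dict(
    {"min_input_tokens": 0, "max_input_tokens": 8192},
    context="trace.input_length_filter",
)
assert (spec.min_input_tokens, spec.max_input_tokens) == (0, 8192)
# Swapping the bounds, or passing an empty mapping, raises SpecError instead.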

View File

@@ -32,6 +32,7 @@ class StudyStore:
return StudyState(
study_id=str(payload["study_id"]),
best_trial_id=payload.get("best_trial_id"),
best_sampling_u=payload.get("best_sampling_u"),
best_request_rate=payload.get("best_request_rate"),
next_trial_index=int(payload.get("next_trial_index", 1)),
trials=trials,
@@ -64,7 +65,18 @@ class StudyStore:
study_id=study.study_id,
trial_id=trial_id,
config_patch=proposal.config_patch,
-search=study.search,
+search=replace(
study.search,
low=min(
study.search.high,
max(
study.search.low,
float(state.best_sampling_u)
if isinstance(state.best_sampling_u, (int, float))
else study.search.low,
),
),
),
study_spec_path=str((self.study_root(study.study_id) / "study_spec.source").resolve()),
artifact_dir=str(trial_root),
probe_log_path=str(trial_root / "probe_history.json"),
@@ -89,6 +101,7 @@ class StudyStore:
by_id = {item.trial_id: item for item in state.trials}
trials_dir = self.study_root(study_id) / "trials"
best_trial_id = state.best_trial_id
best_sampling_u = state.best_sampling_u
best_rate = state.best_request_rate
for trial_dir in sorted(trials_dir.glob("trial-*")):
result_path = trial_dir / "result.json"
@@ -112,7 +125,13 @@ class StudyStore:
and (best_rate is None or summary.best_request_rate > best_rate)
):
best_rate = float(summary.best_request_rate)
best_sampling_u = (
float(summary.best_sampling_u)
if isinstance(summary.best_sampling_u, (int, float))
else None
)
best_trial_id = trial_id
state.best_sampling_u = best_sampling_u
state.best_request_rate = best_rate
state.best_trial_id = best_trial_id
self.save_state(state)
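The replace(...) call above clamps each new trial's search floor to the incumbent: the low bound becomes min(search.high, max(search.low, best_sampling_u)), so later trials only look for improvements above the current best. A worked example with the values used in the new test:

low, high = 0.0, 1.0          # study.search bounds
best_sampling_u = 0.375       # incumbent from trial-0001
new_low = min(high, max(low, best_sampling_u))
assert new_low == 0.375       # an incumbent above high would fall back to high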

View File

@@ -132,6 +132,25 @@ def _downsample_requests(
return [requests[idx] for idx in indexes]
def _matches_input_length_filter(study: StudySpec, *, prompt_tokens_hint: int | None) -> bool:
length_filter = study.trace.input_length_filter
if length_filter is None:
return True
if prompt_tokens_hint is None:
return False
if (
length_filter.min_input_tokens is not None
and prompt_tokens_hint < length_filter.min_input_tokens
):
return False
if (
length_filter.max_input_tokens is not None
and prompt_tokens_hint > length_filter.max_input_tokens
):
return False
return True
def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[WindowRecord, list[TraceRequest]]:
window = resolve_window_record(study, study_spec_path=study_spec_path)
time_scale = float(study.trace.replay_time_scale)
@@ -163,6 +182,8 @@ def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[Win
if isinstance(sampling_u, bool) or not isinstance(sampling_u, (int, float)):
raise TraceError(f"trace row {idx} is missing numeric {study.trace.u_field}")
prompt_tokens_hint = _coerce_prompt_tokens(row)
if not _matches_input_length_filter(study, prompt_tokens_hint=prompt_tokens_hint):
continue
try:
messages = _coerce_messages(row)
except TraceError:

View File

@@ -177,14 +177,19 @@ def _replay_requests(
if early_stopped:
break
if futures_by_request:
-timeout = None
+timeout = 0.5
if next_index < len(requests):
-timeout = max(0.0, requests[next_index].arrival_s - elapsed)
+timeout = min(timeout, max(0.0, requests[next_index].arrival_s - elapsed))
if max_elapsed_s is not None:
remaining_elapsed = max(0.0, max_elapsed_s - elapsed)
timeout = min(timeout, remaining_elapsed)
done, _ = wait(
list(futures_by_request),
timeout=timeout,
return_when=FIRST_COMPLETED,
)
if not done:
continue
for future in done:
request = futures_by_request.pop(future)
outcome = future.result()
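Previously wait() used timeout=None once all pending arrivals were submitted, so a probe could block on a slow in-flight request well past early_stop_max_elapsed_s. With this change the loop wakes at least every 0.5 s, or sooner if the next arrival or the remaining elapsed budget is closer. A worked example of the new timeout computation (values illustrative):

elapsed, next_arrival_s, max_elapsed_s = 0.8, 3.0, 1.0
timeout = 0.5
timeout = min(timeout, max(0.0, next_arrival_s - elapsed))  # next arrival is 2.2 s away
timeout = min(timeout, max(0.0, max_elapsed_s - elapsed))   # only 0.2 s of budget left
assert abs(timeout - 0.2) < 1e-9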

View File

@@ -13,7 +13,7 @@ from aituner.job import append_job, build_trial_job
from aituner.llm import build_prompt, parse_proposal_text
from aituner.search import ThresholdProbe, binary_search_max_feasible
from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
-from aituner.spec import Proposal, StudyState, TrialSummary, load_study_spec
+from aituner.spec import Proposal, SpecError, StudyState, TrialSummary, load_study_spec
from aituner.store import StudyStore
from aituner.trace import load_trace_requests, summarize_window
from aituner.worker import (
@@ -25,7 +25,9 @@ from aituner.worker import (
from aituner.trace import TraceRequest
-def _write_study_assets(tmp_path: Path) -> Path:
+def _write_study_assets(
tmp_path: Path, *, trace_overrides: dict[str, object] | None = None
) -> Path:
trace_dir = tmp_path / "trace_windows" / "traces"
trace_dir.mkdir(parents=True)
trace_path = trace_dir / "chat_w1.jsonl"
@@ -81,6 +83,16 @@ def _write_study_assets(tmp_path: Path) -> Path:
)
study_path = tmp_path / "study.json"
trace_payload: dict[str, object] = {
"windows_path": str(windows_path),
"window_id": "chat_w1",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 4,
}
if trace_overrides:
trace_payload.update(trace_overrides)
study_payload = {
"study_id": "study-1",
"hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
@@ -105,13 +117,7 @@ def _write_study_assets(tmp_path: Path) -> Path:
"tunable_flags": ["tensor-parallel-size", "max-num-seqs"], "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
"python_executable": "python3" "python_executable": "python3"
}, },
"trace": { "trace": trace_payload,
"windows_path": str(windows_path),
"window_id": "chat_w1",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 4
},
"slo": { "slo": {
"target_pass_rate": 0.95, "target_pass_rate": 0.95,
"ttft_rule": { "ttft_rule": {
@@ -161,9 +167,53 @@ class CoreFlowTests(unittest.TestCase):
)
self.assertIn("allowed_flag_keys", prompt)
self.assertIn("study-1", prompt)
self.assertIn('"current_best"', prompt)
self.assertIn("queueing_knee_by_bucket", prompt)
self.assertTrue(study_root.exists())
def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(
tmp_path,
trace_overrides={
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192,
}
},
)
study = load_study_spec(study_path)
window, requests = load_trace_requests(study, study_spec_path=study_path)
summary = summarize_window(requests, window)
self.assertEqual(len(requests), 2)
self.assertEqual([item.prompt_tokens_hint for item in requests], [1000, 5000])
self.assertEqual(summary["request_count"], 2)
self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
prompt = build_prompt(
study=study,
window_summary=summary,
state=StudyState(study_id=study.study_id),
capability_profile=None,
)
self.assertIn('"input_length_filter"', prompt)
self.assertIn('"max_input_tokens": 8192', prompt)
def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(
tmp_path,
trace_overrides={
"input_length_filter": {
"min_input_tokens": 8193,
"max_input_tokens": 8192,
}
},
)
with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
load_study_spec(study_path)
def test_prompt_includes_failed_trial_context(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -652,8 +702,36 @@ class CoreFlowTests(unittest.TestCase):
)
next_state = store.ingest_trial_results(study.study_id)
self.assertEqual(next_state.best_trial_id, trial.trial_id)
self.assertEqual(next_state.best_sampling_u, 0.75)
self.assertEqual(next_state.best_request_rate, 12.5)
def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
store = StudyStore(tmp_path / ".aituner" / "studies")
store.init_study(spec_path=study_path, study=study)
state = StudyState(
study_id=study.study_id,
best_trial_id="trial-0001",
best_sampling_u=0.375,
best_request_rate=3.0,
next_trial_index=2,
trials=[],
)
proposal = Proposal.from_dict(
{
"observation": "Obs",
"diagnosis": "Diag",
"config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
"expected_effects": ["raise rate"],
}
)
trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
self.assertEqual(trial.search.low, 0.375)
self.assertEqual(trial.search.high, 1.0)
def test_ingest_trial_results_records_failure_reason(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -764,6 +842,7 @@ class CoreFlowTests(unittest.TestCase):
store = StudyStore(store_root)
state = store.load_state("study-1")
self.assertEqual(state.best_trial_id, "trial-0002")
self.assertEqual(state.best_sampling_u, 0.75)
self.assertEqual(state.best_request_rate, 2.0)
self.assertEqual(state.next_trial_index, 3)
@@ -795,6 +874,20 @@ class CoreFlowTests(unittest.TestCase):
["throughput: higher", "ttft: lower"], ["throughput: higher", "ttft: lower"],
) )
def test_parse_proposal_text_accepts_wrapped_json(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
proposal = parse_proposal_text(
"""Here is the proposal:
```json
{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
```""",
study,
)
self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32)
def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
requests = [
TraceRequest(
@@ -929,6 +1022,71 @@ class CoreFlowTests(unittest.TestCase):
self.assertEqual(len(replayed), 2)
self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")
def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:
requests = [
TraceRequest(
row_id="r0",
arrival_s=0.0,
sampling_u=0.1,
body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
prompt_tokens_hint=8,
completion_tokens_hint=4,
)
]
class FakeFuture:
def result(self, timeout=None):
raise AssertionError("future should not be awaited after elapsed early stop")
def cancel(self):
return True
submitted = []
class FakeExecutor:
def __init__(self, max_workers):
self.max_workers = max_workers
def submit(self, fn, request, **kwargs):
submitted.append(request.row_id)
return FakeFuture()
def shutdown(self, wait=False, cancel_futures=True):
return None
wait_timeouts: list[float] = []
def fake_wait(futures, timeout=None, return_when=None):
wait_timeouts.append(timeout)
return set(), set(futures)
def fake_evaluate(outcome: RequestOutcome):
return type("Eval", (), {"passed": outcome.success})()
monotonic_values = iter([0.0, 0.0, 0.4, 1.2])
with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor):
with mock.patch("aituner.worker.wait", side_effect=fake_wait):
with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)):
replayed, early_stopped, reason = _replay_requests(
requests,
base_url="http://127.0.0.1:8000",
timeout_s=30.0,
max_concurrency=1,
target_pass_rate=0.95,
max_lag_s=None,
max_elapsed_s=1.0,
evaluate_outcome=fake_evaluate,
)
self.assertEqual(submitted, ["r0"])
self.assertTrue(early_stopped)
self.assertEqual(reason, "probe_elapsed_s>1.0")
self.assertEqual(len(replayed), 1)
self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")
self.assertTrue(wait_timeouts)
self.assertLessEqual(wait_timeouts[0], 0.5)
def test_latency_summary_reports_quantiles_and_slo(self) -> None:
study = load_study_spec(_write_study_assets(Path(tempfile.mkdtemp())))
outcomes = [