Add trace length bucket tuning support
This commit is contained in:
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"observation": "The incumbent should start from the known launch-safe qwen3.5-27b serving recipe on dash0 before asking the LLM to optimize throughput above that baseline.",
|
||||
"diagnosis": "This model uses a long-context hybrid stack and fp8 quantization. The safest first measurement is to preserve the existing warmup, hybrid-model, chunked-prefill, and prefix-caching behavior from run_qwen27b.sh, while keeping a conservative sequence cap.",
|
||||
"config_patch": {
|
||||
"env_patch": {
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
|
||||
"VLLM_ENABLE_TORCH_COMPILE": "1",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER": "0",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1"
|
||||
},
|
||||
"flag_patch": {
|
||||
"tensor-parallel-size": 4,
|
||||
"gpu-memory-utilization": 0.9,
|
||||
"block-size": 64,
|
||||
"max-num-batched-tokens": 8192,
|
||||
"max-num-seqs": 16,
|
||||
"enable-prefix-caching": true,
|
||||
"enable-chunked-prefill": true,
|
||||
"disable-cascade-attn": true
|
||||
}
|
||||
},
|
||||
"expected_effects": [
|
||||
"Launch-safe baseline aligned with the current hand-tuned qwen27b recipe while using all 4 visible H20 GPUs",
|
||||
"Reliable first incumbent under the tighter TTFT and TPOT SLO",
|
||||
"Clear trial history for the LLM to propose a higher-throughput follow-up patch"
|
||||
],
|
||||
"why_not_previous_failures": "This baseline intentionally avoids speculative new kernels or batching spikes before we have an incumbent under the new SLO."
|
||||
}
|
||||
147
configs/examples/dash0_qwen27b_tight_slo_run1.json
Normal file
147
configs/examples/dash0_qwen27b_tight_slo_run1.json
Normal file
@@ -0,0 +1,147 @@
|
||||
{
|
||||
"study_id": "dash0-qwen27b-tight-slo-10min-run1",
|
||||
"hardware": {
|
||||
"gpu_count": 4,
|
||||
"gpu_model": "H20",
|
||||
"host_candidates": [
|
||||
"dash0"
|
||||
]
|
||||
},
|
||||
"model": {
|
||||
"model_id": "qwen3.5-27b-256k-0223-internal",
|
||||
"served_model_name": "qwen35-27b-aituner"
|
||||
},
|
||||
"engine": {
|
||||
"engine_name": "vllm",
|
||||
"engine_version": "latest-release-on-dash0",
|
||||
"exec_path": "/usr/local/bin/vllm",
|
||||
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
|
||||
"host": "127.0.0.1",
|
||||
"port": 18082,
|
||||
"healthcheck_path": "/v1/models",
|
||||
"ready_timeout_s": 900,
|
||||
"request_timeout_s": 900,
|
||||
"launch_args": [
|
||||
"serve",
|
||||
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
|
||||
],
|
||||
"base_envs": {
|
||||
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
|
||||
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
||||
"DS_LLM_IGNORE_WARMUP": "1",
|
||||
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
|
||||
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
|
||||
"PARAM_TOTAL_MAX": "262144",
|
||||
"PARAM_IN_LENGTH_MAX": "262144",
|
||||
"PARAM_MAX_LENGTH_MAX": "131072",
|
||||
"DS_LLM_MAX_THINK_TOKENS": "81920",
|
||||
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
|
||||
"VLLM_FP8_USE_BLADNN": "1",
|
||||
"VLLM_MOE_USE_BLADNN": "1",
|
||||
"VLLM_GDN_USE_BLADNN": "0",
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_IS_HYBRID_MODEL": "1",
|
||||
"VLLM_ENABLE_TORCH_COMPILE": "1",
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
|
||||
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER": "0",
|
||||
"VLLM_RESPONSE_TIMEOUT": "300",
|
||||
"VLLM_LOG_REQ_KV_LENS": "1",
|
||||
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
|
||||
},
|
||||
"base_flags": {
|
||||
"host": "127.0.0.1",
|
||||
"port": 18082,
|
||||
"served-model-name": "qwen35-27b-aituner",
|
||||
"trust-remote-code": true,
|
||||
"dtype": "bfloat16",
|
||||
"gpu-memory-utilization": 0.9,
|
||||
"enable-prefix-caching": true,
|
||||
"mamba-cache-mode": "light",
|
||||
"distributed-executor-backend": "mp",
|
||||
"block-size": 64,
|
||||
"enable-chunked-prefill": true,
|
||||
"max-num-batched-tokens": 8192,
|
||||
"disable-cascade-attn": true,
|
||||
"max-model-len": 262144,
|
||||
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
|
||||
"mm-processor-cache-gb": 0,
|
||||
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
|
||||
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
|
||||
"mamba-cache-dtype": "float32",
|
||||
"skip-mm-profiling": true,
|
||||
"quantization": "fp8",
|
||||
"tensor-parallel-size": 1,
|
||||
"max-num-seqs": 16,
|
||||
"disable-log-requests": true
|
||||
},
|
||||
"tunable_envs": [
|
||||
"VLLM_ATTENTION_BACKEND",
|
||||
"VLLM_ENABLE_TORCH_COMPILE",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
|
||||
],
|
||||
"tunable_flags": [
|
||||
"tensor-parallel-size",
|
||||
"gpu-memory-utilization",
|
||||
"block-size",
|
||||
"max-num-batched-tokens",
|
||||
"max-num-seqs",
|
||||
"enable-prefix-caching",
|
||||
"enable-chunked-prefill",
|
||||
"disable-cascade-attn"
|
||||
],
|
||||
"python_executable": "python3"
|
||||
},
|
||||
"trace": {
|
||||
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
|
||||
"window_id": "chat_w20260311_1000",
|
||||
"u_field": "sampling_u",
|
||||
"timestamp_field": "timestamp",
|
||||
"max_concurrency": 32,
|
||||
"replay_time_scale": 1.0,
|
||||
"early_stop_max_lag_s": 120.0,
|
||||
"early_stop_max_elapsed_s": 900.0
|
||||
},
|
||||
"slo": {
|
||||
"target_pass_rate": 0.95,
|
||||
"ttft_rule": {
|
||||
"kind": "step_ms",
|
||||
"buckets": [
|
||||
{
|
||||
"max_input_tokens": 4096,
|
||||
"threshold_ms": 2000
|
||||
},
|
||||
{
|
||||
"max_input_tokens": 32768,
|
||||
"threshold_ms": 4000
|
||||
},
|
||||
{
|
||||
"threshold_ms": 6000
|
||||
}
|
||||
]
|
||||
},
|
||||
"tpot_rule": {
|
||||
"kind": "fixed_ms",
|
||||
"threshold_ms": 50
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"low": 0.0,
|
||||
"high": 1.0,
|
||||
"tolerance": 0.01,
|
||||
"max_probes": 8,
|
||||
"sample_seed": 20260325
|
||||
},
|
||||
"llm": {
|
||||
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
|
||||
"max_history_trials": 8,
|
||||
"endpoint": {
|
||||
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
|
||||
"model": "gpt-5.4",
|
||||
"api_key_env": "OPENAI_API_KEY",
|
||||
"timeout_s": 180
|
||||
}
|
||||
}
|
||||
}
|
||||
147
configs/examples/dash0_qwen27b_tight_slo_run2.json
Normal file
147
configs/examples/dash0_qwen27b_tight_slo_run2.json
Normal file
@@ -0,0 +1,147 @@
|
||||
{
|
||||
"study_id": "dash0-qwen27b-tight-slo-10min-run2",
|
||||
"hardware": {
|
||||
"gpu_count": 4,
|
||||
"gpu_model": "H20",
|
||||
"host_candidates": [
|
||||
"dash0"
|
||||
]
|
||||
},
|
||||
"model": {
|
||||
"model_id": "qwen3.5-27b-256k-0223-internal",
|
||||
"served_model_name": "qwen35-27b-aituner"
|
||||
},
|
||||
"engine": {
|
||||
"engine_name": "vllm",
|
||||
"engine_version": "latest-release-on-dash0",
|
||||
"exec_path": "/usr/local/bin/vllm",
|
||||
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
|
||||
"host": "127.0.0.1",
|
||||
"port": 18082,
|
||||
"healthcheck_path": "/v1/models",
|
||||
"ready_timeout_s": 900,
|
||||
"request_timeout_s": 900,
|
||||
"launch_args": [
|
||||
"serve",
|
||||
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
|
||||
],
|
||||
"base_envs": {
|
||||
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
|
||||
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
||||
"DS_LLM_IGNORE_WARMUP": "1",
|
||||
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
|
||||
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
|
||||
"PARAM_TOTAL_MAX": "262144",
|
||||
"PARAM_IN_LENGTH_MAX": "262144",
|
||||
"PARAM_MAX_LENGTH_MAX": "131072",
|
||||
"DS_LLM_MAX_THINK_TOKENS": "81920",
|
||||
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
|
||||
"VLLM_FP8_USE_BLADNN": "1",
|
||||
"VLLM_MOE_USE_BLADNN": "1",
|
||||
"VLLM_GDN_USE_BLADNN": "0",
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_IS_HYBRID_MODEL": "1",
|
||||
"VLLM_ENABLE_TORCH_COMPILE": "1",
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
|
||||
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER": "0",
|
||||
"VLLM_RESPONSE_TIMEOUT": "300",
|
||||
"VLLM_LOG_REQ_KV_LENS": "1",
|
||||
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
|
||||
},
|
||||
"base_flags": {
|
||||
"host": "127.0.0.1",
|
||||
"port": 18082,
|
||||
"served-model-name": "qwen35-27b-aituner",
|
||||
"trust-remote-code": true,
|
||||
"dtype": "bfloat16",
|
||||
"gpu-memory-utilization": 0.9,
|
||||
"enable-prefix-caching": true,
|
||||
"mamba-cache-mode": "light",
|
||||
"distributed-executor-backend": "mp",
|
||||
"block-size": 64,
|
||||
"enable-chunked-prefill": true,
|
||||
"max-num-batched-tokens": 8192,
|
||||
"disable-cascade-attn": true,
|
||||
"max-model-len": 262144,
|
||||
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
|
||||
"mm-processor-cache-gb": 0,
|
||||
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
|
||||
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
|
||||
"mamba-cache-dtype": "float32",
|
||||
"skip-mm-profiling": true,
|
||||
"quantization": "fp8",
|
||||
"tensor-parallel-size": 4,
|
||||
"max-num-seqs": 16,
|
||||
"disable-log-requests": true
|
||||
},
|
||||
"tunable_envs": [
|
||||
"VLLM_ATTENTION_BACKEND",
|
||||
"VLLM_ENABLE_TORCH_COMPILE",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
|
||||
],
|
||||
"tunable_flags": [
|
||||
"tensor-parallel-size",
|
||||
"gpu-memory-utilization",
|
||||
"block-size",
|
||||
"max-num-batched-tokens",
|
||||
"max-num-seqs",
|
||||
"enable-prefix-caching",
|
||||
"enable-chunked-prefill",
|
||||
"disable-cascade-attn"
|
||||
],
|
||||
"python_executable": "python3"
|
||||
},
|
||||
"trace": {
|
||||
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
|
||||
"window_id": "chat_w20260311_1000",
|
||||
"u_field": "sampling_u",
|
||||
"timestamp_field": "timestamp",
|
||||
"max_concurrency": 32,
|
||||
"replay_time_scale": 1.0,
|
||||
"early_stop_max_lag_s": 120.0,
|
||||
"early_stop_max_elapsed_s": 900.0
|
||||
},
|
||||
"slo": {
|
||||
"target_pass_rate": 0.95,
|
||||
"ttft_rule": {
|
||||
"kind": "step_ms",
|
||||
"buckets": [
|
||||
{
|
||||
"max_input_tokens": 4096,
|
||||
"threshold_ms": 2000
|
||||
},
|
||||
{
|
||||
"max_input_tokens": 32768,
|
||||
"threshold_ms": 4000
|
||||
},
|
||||
{
|
||||
"threshold_ms": 6000
|
||||
}
|
||||
]
|
||||
},
|
||||
"tpot_rule": {
|
||||
"kind": "fixed_ms",
|
||||
"threshold_ms": 50
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"low": 0.0,
|
||||
"high": 0.0625,
|
||||
"tolerance": 0.001,
|
||||
"max_probes": 6,
|
||||
"sample_seed": 20260325
|
||||
},
|
||||
"llm": {
|
||||
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
|
||||
"max_history_trials": 8,
|
||||
"endpoint": {
|
||||
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
|
||||
"model": "gpt-5.4",
|
||||
"api_key_env": "OPENAI_API_KEY",
|
||||
"timeout_s": 180
|
||||
}
|
||||
}
|
||||
}
|
||||
147
configs/examples/dash0_qwen27b_tight_slo_run3.json
Normal file
147
configs/examples/dash0_qwen27b_tight_slo_run3.json
Normal file
@@ -0,0 +1,147 @@
|
||||
{
|
||||
"study_id": "dash0-qwen27b-tight-slo-10min-run3",
|
||||
"hardware": {
|
||||
"gpu_count": 4,
|
||||
"gpu_model": "H20",
|
||||
"host_candidates": [
|
||||
"dash0"
|
||||
]
|
||||
},
|
||||
"model": {
|
||||
"model_id": "qwen3.5-27b-256k-0223-internal",
|
||||
"served_model_name": "qwen35-27b-aituner"
|
||||
},
|
||||
"engine": {
|
||||
"engine_name": "vllm",
|
||||
"engine_version": "latest-release-on-dash0",
|
||||
"exec_path": "/usr/local/bin/vllm",
|
||||
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
|
||||
"host": "127.0.0.1",
|
||||
"port": 18082,
|
||||
"healthcheck_path": "/v1/models",
|
||||
"ready_timeout_s": 900,
|
||||
"request_timeout_s": 900,
|
||||
"launch_args": [
|
||||
"serve",
|
||||
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
|
||||
],
|
||||
"base_envs": {
|
||||
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
|
||||
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
||||
"DS_LLM_IGNORE_WARMUP": "1",
|
||||
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
|
||||
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
|
||||
"PARAM_TOTAL_MAX": "262144",
|
||||
"PARAM_IN_LENGTH_MAX": "262144",
|
||||
"PARAM_MAX_LENGTH_MAX": "131072",
|
||||
"DS_LLM_MAX_THINK_TOKENS": "81920",
|
||||
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
|
||||
"VLLM_FP8_USE_BLADNN": "1",
|
||||
"VLLM_MOE_USE_BLADNN": "1",
|
||||
"VLLM_GDN_USE_BLADNN": "0",
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_IS_HYBRID_MODEL": "1",
|
||||
"VLLM_ENABLE_TORCH_COMPILE": "1",
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
|
||||
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER": "0",
|
||||
"VLLM_RESPONSE_TIMEOUT": "300",
|
||||
"VLLM_LOG_REQ_KV_LENS": "1",
|
||||
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
|
||||
},
|
||||
"base_flags": {
|
||||
"host": "127.0.0.1",
|
||||
"port": 18082,
|
||||
"served-model-name": "qwen35-27b-aituner",
|
||||
"trust-remote-code": true,
|
||||
"dtype": "bfloat16",
|
||||
"gpu-memory-utilization": 0.9,
|
||||
"enable-prefix-caching": true,
|
||||
"mamba-cache-mode": "light",
|
||||
"distributed-executor-backend": "mp",
|
||||
"block-size": 64,
|
||||
"enable-chunked-prefill": true,
|
||||
"max-num-batched-tokens": 8192,
|
||||
"disable-cascade-attn": true,
|
||||
"max-model-len": 262144,
|
||||
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
|
||||
"mm-processor-cache-gb": 0,
|
||||
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
|
||||
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
|
||||
"mamba-cache-dtype": "float32",
|
||||
"skip-mm-profiling": true,
|
||||
"quantization": "fp8",
|
||||
"tensor-parallel-size": 4,
|
||||
"max-num-seqs": 16,
|
||||
"disable-log-requests": true
|
||||
},
|
||||
"tunable_envs": [
|
||||
"VLLM_ATTENTION_BACKEND",
|
||||
"VLLM_ENABLE_TORCH_COMPILE",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
|
||||
],
|
||||
"tunable_flags": [
|
||||
"tensor-parallel-size",
|
||||
"gpu-memory-utilization",
|
||||
"block-size",
|
||||
"max-num-batched-tokens",
|
||||
"max-num-seqs",
|
||||
"enable-prefix-caching",
|
||||
"enable-chunked-prefill",
|
||||
"disable-cascade-attn"
|
||||
],
|
||||
"python_executable": "python3"
|
||||
},
|
||||
"trace": {
|
||||
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
|
||||
"window_id": "chat_w20260311_1000",
|
||||
"u_field": "sampling_u",
|
||||
"timestamp_field": "timestamp",
|
||||
"max_concurrency": 32,
|
||||
"replay_time_scale": 1.0,
|
||||
"early_stop_max_lag_s": 120.0,
|
||||
"early_stop_max_elapsed_s": 900.0
|
||||
},
|
||||
"slo": {
|
||||
"target_pass_rate": 0.95,
|
||||
"ttft_rule": {
|
||||
"kind": "step_ms",
|
||||
"buckets": [
|
||||
{
|
||||
"max_input_tokens": 4096,
|
||||
"threshold_ms": 2000
|
||||
},
|
||||
{
|
||||
"max_input_tokens": 32768,
|
||||
"threshold_ms": 4000
|
||||
},
|
||||
{
|
||||
"threshold_ms": 6000
|
||||
}
|
||||
]
|
||||
},
|
||||
"tpot_rule": {
|
||||
"kind": "fixed_ms",
|
||||
"threshold_ms": 50
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"low": 0.0,
|
||||
"high": 0.0625,
|
||||
"tolerance": 0.001,
|
||||
"max_probes": 6,
|
||||
"sample_seed": 20260325
|
||||
},
|
||||
"llm": {
|
||||
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
|
||||
"max_history_trials": 8,
|
||||
"endpoint": {
|
||||
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
|
||||
"model": "gpt-5.4",
|
||||
"api_key_env": "OPENAI_API_KEY",
|
||||
"timeout_s": 180
|
||||
}
|
||||
}
|
||||
}
|
||||
151
configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json
Normal file
151
configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json
Normal file
@@ -0,0 +1,151 @@
|
||||
{
|
||||
"study_id": "dash0-qwen27b-tight-slo-10min-run4-chat-0-8k",
|
||||
"hardware": {
|
||||
"gpu_count": 4,
|
||||
"gpu_model": "H20",
|
||||
"host_candidates": [
|
||||
"dash0"
|
||||
]
|
||||
},
|
||||
"model": {
|
||||
"model_id": "qwen3.5-27b-256k-0223-internal",
|
||||
"served_model_name": "qwen35-27b-aituner"
|
||||
},
|
||||
"engine": {
|
||||
"engine_name": "vllm",
|
||||
"engine_version": "latest-release-on-dash0",
|
||||
"exec_path": "/usr/local/bin/vllm",
|
||||
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
|
||||
"host": "127.0.0.1",
|
||||
"port": 18082,
|
||||
"healthcheck_path": "/v1/models",
|
||||
"ready_timeout_s": 900,
|
||||
"request_timeout_s": 900,
|
||||
"launch_args": [
|
||||
"serve",
|
||||
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
|
||||
],
|
||||
"base_envs": {
|
||||
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
|
||||
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
||||
"DS_LLM_IGNORE_WARMUP": "1",
|
||||
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
|
||||
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
|
||||
"PARAM_TOTAL_MAX": "262144",
|
||||
"PARAM_IN_LENGTH_MAX": "262144",
|
||||
"PARAM_MAX_LENGTH_MAX": "131072",
|
||||
"DS_LLM_MAX_THINK_TOKENS": "81920",
|
||||
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
|
||||
"VLLM_FP8_USE_BLADNN": "1",
|
||||
"VLLM_MOE_USE_BLADNN": "1",
|
||||
"VLLM_GDN_USE_BLADNN": "0",
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_IS_HYBRID_MODEL": "1",
|
||||
"VLLM_ENABLE_TORCH_COMPILE": "1",
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
|
||||
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER": "0",
|
||||
"VLLM_RESPONSE_TIMEOUT": "300",
|
||||
"VLLM_LOG_REQ_KV_LENS": "1",
|
||||
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
|
||||
},
|
||||
"base_flags": {
|
||||
"host": "127.0.0.1",
|
||||
"port": 18082,
|
||||
"served-model-name": "qwen35-27b-aituner",
|
||||
"trust-remote-code": true,
|
||||
"dtype": "bfloat16",
|
||||
"gpu-memory-utilization": 0.9,
|
||||
"enable-prefix-caching": true,
|
||||
"mamba-cache-mode": "light",
|
||||
"distributed-executor-backend": "mp",
|
||||
"block-size": 64,
|
||||
"enable-chunked-prefill": true,
|
||||
"max-num-batched-tokens": 8192,
|
||||
"disable-cascade-attn": true,
|
||||
"max-model-len": 262144,
|
||||
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
|
||||
"mm-processor-cache-gb": 0,
|
||||
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
|
||||
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
|
||||
"mamba-cache-dtype": "float32",
|
||||
"skip-mm-profiling": true,
|
||||
"quantization": "fp8",
|
||||
"tensor-parallel-size": 4,
|
||||
"max-num-seqs": 16,
|
||||
"disable-log-requests": true
|
||||
},
|
||||
"tunable_envs": [
|
||||
"VLLM_ATTENTION_BACKEND",
|
||||
"VLLM_ENABLE_TORCH_COMPILE",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
|
||||
],
|
||||
"tunable_flags": [
|
||||
"tensor-parallel-size",
|
||||
"gpu-memory-utilization",
|
||||
"block-size",
|
||||
"max-num-batched-tokens",
|
||||
"max-num-seqs",
|
||||
"enable-prefix-caching",
|
||||
"enable-chunked-prefill",
|
||||
"disable-cascade-attn"
|
||||
],
|
||||
"python_executable": "python3"
|
||||
},
|
||||
"trace": {
|
||||
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
|
||||
"window_id": "chat_w20260311_1000",
|
||||
"u_field": "sampling_u",
|
||||
"timestamp_field": "timestamp",
|
||||
"max_concurrency": 32,
|
||||
"input_length_filter": {
|
||||
"min_input_tokens": 0,
|
||||
"max_input_tokens": 8192
|
||||
},
|
||||
"replay_time_scale": 1.0,
|
||||
"early_stop_max_lag_s": 120.0,
|
||||
"early_stop_max_elapsed_s": 900.0
|
||||
},
|
||||
"slo": {
|
||||
"target_pass_rate": 0.95,
|
||||
"ttft_rule": {
|
||||
"kind": "step_ms",
|
||||
"buckets": [
|
||||
{
|
||||
"max_input_tokens": 4096,
|
||||
"threshold_ms": 2000
|
||||
},
|
||||
{
|
||||
"max_input_tokens": 32768,
|
||||
"threshold_ms": 4000
|
||||
},
|
||||
{
|
||||
"threshold_ms": 6000
|
||||
}
|
||||
]
|
||||
},
|
||||
"tpot_rule": {
|
||||
"kind": "fixed_ms",
|
||||
"threshold_ms": 50
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"low": 0.0,
|
||||
"high": 0.0625,
|
||||
"tolerance": 0.001,
|
||||
"max_probes": 6,
|
||||
"sample_seed": 20260325
|
||||
},
|
||||
"llm": {
|
||||
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
|
||||
"max_history_trials": 8,
|
||||
"endpoint": {
|
||||
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
|
||||
"model": "gpt-5.4",
|
||||
"api_key_env": "OPENAI_API_KEY",
|
||||
"timeout_s": 180
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -50,7 +50,11 @@
|
||||
"window_id": "chat_w_example_0001",
|
||||
"u_field": "sampling_u",
|
||||
"timestamp_field": "timestamp",
|
||||
"max_concurrency": 64
|
||||
"max_concurrency": 64,
|
||||
"input_length_filter": {
|
||||
"min_input_tokens": 0,
|
||||
"max_input_tokens": 8192
|
||||
}
|
||||
},
|
||||
"slo": {
|
||||
"target_pass_rate": 0.95,
|
||||
|
||||
@@ -42,6 +42,11 @@ def build_prompt(
|
||||
json.dumps(
|
||||
{
|
||||
"study_id": study.study_id,
|
||||
"current_best": {
|
||||
"trial_id": state.best_trial_id,
|
||||
"best_sampling_u": state.best_sampling_u,
|
||||
"best_request_rate": state.best_request_rate,
|
||||
},
|
||||
"hardware": {
|
||||
"gpu_count": study.hardware.gpu_count,
|
||||
"gpu_model": study.hardware.gpu_model,
|
||||
@@ -50,6 +55,17 @@ def build_prompt(
|
||||
"model_id": study.model.model_id,
|
||||
"served_model_name": study.model.served_model_name,
|
||||
},
|
||||
"trace": {
|
||||
"window_id": study.trace.window_id,
|
||||
"input_length_filter": (
|
||||
{
|
||||
"min_input_tokens": study.trace.input_length_filter.min_input_tokens,
|
||||
"max_input_tokens": study.trace.input_length_filter.max_input_tokens,
|
||||
}
|
||||
if study.trace.input_length_filter is not None
|
||||
else None
|
||||
),
|
||||
},
|
||||
"engine": {
|
||||
"engine_name": study.engine.engine_name,
|
||||
"engine_version": study.engine.engine_version,
|
||||
@@ -84,6 +100,8 @@ def build_prompt(
|
||||
"Trial history:",
|
||||
json.dumps(history, ensure_ascii=False, indent=2),
|
||||
"",
|
||||
"The proposal must beat the current incumbent. Do not propose a config that is only likely to be feasible below the current best_sampling_u/request_rate.",
|
||||
"The evaluator for a new trial will start searching from the current best feasible sampling_u and only look for improvements above it.",
|
||||
"The proposal should improve the maximum feasible sampling_u under the 95%+ SLO target.",
|
||||
]
|
||||
return "\n".join(sections)
|
||||
@@ -110,8 +128,22 @@ def validate_proposal(proposal: Proposal, study: StudySpec) -> Proposal:
|
||||
return proposal
|
||||
|
||||
|
||||
def parse_proposal_text(text: str, study: StudySpec) -> Proposal:
|
||||
def _parse_json_object_text(text: str) -> dict[str, Any]:
|
||||
try:
|
||||
payload = json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
start = text.find("{")
|
||||
end = text.rfind("}")
|
||||
if start < 0 or end < start:
|
||||
raise
|
||||
payload = json.loads(text[start : end + 1])
|
||||
if not isinstance(payload, dict):
|
||||
raise SpecError("proposal payload must be a JSON object")
|
||||
return payload
|
||||
|
||||
|
||||
def parse_proposal_text(text: str, study: StudySpec) -> Proposal:
    """Turn raw proposal text into a ``Proposal`` validated against ``study``.

    The text is decoded with ``_parse_json_object_text``, converted with
    ``Proposal.from_dict``, and then passed through ``validate_proposal``.

    Args:
        text: Raw text containing the proposal JSON object.
        study: Study specification the proposal is validated against.

    Returns:
        The value returned by ``validate_proposal`` for the parsed proposal.
    """
    payload = _parse_json_object_text(text)
    proposal = Proposal.from_dict(payload)
    return validate_proposal(proposal, study)
|
||||
|
||||
|
||||
@@ -142,6 +142,42 @@ class EngineLaunchSpec:
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class InputLengthFilterSpec:
|
||||
min_input_tokens: int | None = None
|
||||
max_input_tokens: int | None = None
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: Mapping[str, Any], *, context: str) -> "InputLengthFilterSpec":
|
||||
min_input_tokens = data.get("min_input_tokens")
|
||||
max_input_tokens = data.get("max_input_tokens")
|
||||
spec = cls(
|
||||
min_input_tokens=(
|
||||
_require_int(min_input_tokens, context=f"{context}.min_input_tokens")
|
||||
if min_input_tokens is not None
|
||||
else None
|
||||
),
|
||||
max_input_tokens=(
|
||||
_require_int(max_input_tokens, context=f"{context}.max_input_tokens")
|
||||
if max_input_tokens is not None
|
||||
else None
|
||||
),
|
||||
)
|
||||
if spec.min_input_tokens is None and spec.max_input_tokens is None:
|
||||
raise SpecError(
|
||||
f"{context} must define at least one of min_input_tokens/max_input_tokens."
|
||||
)
|
||||
if (
|
||||
spec.min_input_tokens is not None
|
||||
and spec.max_input_tokens is not None
|
||||
and spec.min_input_tokens > spec.max_input_tokens
|
||||
):
|
||||
raise SpecError(
|
||||
f"{context}.min_input_tokens must be <= {context}.max_input_tokens."
|
||||
)
|
||||
return spec
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class TraceSpec:
|
||||
windows_path: str
|
||||
@@ -150,6 +186,7 @@ class TraceSpec:
|
||||
u_field: str
|
||||
timestamp_field: str
|
||||
max_concurrency: int
|
||||
input_length_filter: InputLengthFilterSpec | None = None
|
||||
max_requests_per_probe: int | None = None
|
||||
synthetic_prompt_cap_tokens: int | None = None
|
||||
replay_time_scale: float = 1.0
|
||||
@@ -171,6 +208,17 @@ class TraceSpec:
|
||||
max_concurrency=_require_int(
|
||||
data.get("max_concurrency", 64), context="trace.max_concurrency"
|
||||
),
|
||||
input_length_filter=(
|
||||
InputLengthFilterSpec.from_dict(
|
||||
_require_mapping(
|
||||
data.get("input_length_filter"),
|
||||
context="trace.input_length_filter",
|
||||
),
|
||||
context="trace.input_length_filter",
|
||||
)
|
||||
if data.get("input_length_filter") is not None
|
||||
else None
|
||||
),
|
||||
max_requests_per_probe=int(max_requests) if max_requests is not None else None,
|
||||
synthetic_prompt_cap_tokens=(
|
||||
int(synthetic_prompt_cap) if synthetic_prompt_cap is not None else None
|
||||
@@ -454,6 +502,7 @@ class TrialSummary:
|
||||
class StudyState:
|
||||
study_id: str
|
||||
best_trial_id: str | None = None
|
||||
best_sampling_u: float | None = None
|
||||
best_request_rate: float | None = None
|
||||
next_trial_index: int = 1
|
||||
trials: list[TrialSummary] = field(default_factory=list)
|
||||
|
||||
@@ -32,6 +32,7 @@ class StudyStore:
|
||||
return StudyState(
|
||||
study_id=str(payload["study_id"]),
|
||||
best_trial_id=payload.get("best_trial_id"),
|
||||
best_sampling_u=payload.get("best_sampling_u"),
|
||||
best_request_rate=payload.get("best_request_rate"),
|
||||
next_trial_index=int(payload.get("next_trial_index", 1)),
|
||||
trials=trials,
|
||||
@@ -64,7 +65,18 @@ class StudyStore:
|
||||
study_id=study.study_id,
|
||||
trial_id=trial_id,
|
||||
config_patch=proposal.config_patch,
|
||||
search=study.search,
|
||||
search=replace(
|
||||
study.search,
|
||||
low=min(
|
||||
study.search.high,
|
||||
max(
|
||||
study.search.low,
|
||||
float(state.best_sampling_u)
|
||||
if isinstance(state.best_sampling_u, (int, float))
|
||||
else study.search.low,
|
||||
),
|
||||
),
|
||||
),
|
||||
study_spec_path=str((self.study_root(study.study_id) / "study_spec.source").resolve()),
|
||||
artifact_dir=str(trial_root),
|
||||
probe_log_path=str(trial_root / "probe_history.json"),
|
||||
@@ -89,6 +101,7 @@ class StudyStore:
|
||||
by_id = {item.trial_id: item for item in state.trials}
|
||||
trials_dir = self.study_root(study_id) / "trials"
|
||||
best_trial_id = state.best_trial_id
|
||||
best_sampling_u = state.best_sampling_u
|
||||
best_rate = state.best_request_rate
|
||||
for trial_dir in sorted(trials_dir.glob("trial-*")):
|
||||
result_path = trial_dir / "result.json"
|
||||
@@ -112,7 +125,13 @@ class StudyStore:
|
||||
and (best_rate is None or summary.best_request_rate > best_rate)
|
||||
):
|
||||
best_rate = float(summary.best_request_rate)
|
||||
best_sampling_u = (
|
||||
float(summary.best_sampling_u)
|
||||
if isinstance(summary.best_sampling_u, (int, float))
|
||||
else None
|
||||
)
|
||||
best_trial_id = trial_id
|
||||
state.best_sampling_u = best_sampling_u
|
||||
state.best_request_rate = best_rate
|
||||
state.best_trial_id = best_trial_id
|
||||
self.save_state(state)
|
||||
|
||||
@@ -132,6 +132,25 @@ def _downsample_requests(
|
||||
return [requests[idx] for idx in indexes]
|
||||
|
||||
|
||||
def _matches_input_length_filter(study: StudySpec, *, prompt_tokens_hint: int | None) -> bool:
|
||||
length_filter = study.trace.input_length_filter
|
||||
if length_filter is None:
|
||||
return True
|
||||
if prompt_tokens_hint is None:
|
||||
return False
|
||||
if (
|
||||
length_filter.min_input_tokens is not None
|
||||
and prompt_tokens_hint < length_filter.min_input_tokens
|
||||
):
|
||||
return False
|
||||
if (
|
||||
length_filter.max_input_tokens is not None
|
||||
and prompt_tokens_hint > length_filter.max_input_tokens
|
||||
):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[WindowRecord, list[TraceRequest]]:
|
||||
window = resolve_window_record(study, study_spec_path=study_spec_path)
|
||||
time_scale = float(study.trace.replay_time_scale)
|
||||
@@ -163,6 +182,8 @@ def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[Win
|
||||
if isinstance(sampling_u, bool) or not isinstance(sampling_u, (int, float)):
|
||||
raise TraceError(f"trace row {idx} is missing numeric {study.trace.u_field}")
|
||||
prompt_tokens_hint = _coerce_prompt_tokens(row)
|
||||
if not _matches_input_length_filter(study, prompt_tokens_hint=prompt_tokens_hint):
|
||||
continue
|
||||
try:
|
||||
messages = _coerce_messages(row)
|
||||
except TraceError:
|
||||
|
||||
@@ -177,14 +177,19 @@ def _replay_requests(
|
||||
if early_stopped:
|
||||
break
|
||||
if futures_by_request:
|
||||
timeout = None
|
||||
timeout = 0.5
|
||||
if next_index < len(requests):
|
||||
timeout = max(0.0, requests[next_index].arrival_s - elapsed)
|
||||
timeout = min(timeout, max(0.0, requests[next_index].arrival_s - elapsed))
|
||||
if max_elapsed_s is not None:
|
||||
remaining_elapsed = max(0.0, max_elapsed_s - elapsed)
|
||||
timeout = min(timeout, remaining_elapsed)
|
||||
done, _ = wait(
|
||||
list(futures_by_request),
|
||||
timeout=timeout,
|
||||
return_when=FIRST_COMPLETED,
|
||||
)
|
||||
if not done:
|
||||
continue
|
||||
for future in done:
|
||||
request = futures_by_request.pop(future)
|
||||
outcome = future.result()
|
||||
|
||||
@@ -13,7 +13,7 @@ from aituner.job import append_job, build_trial_job
|
||||
from aituner.llm import build_prompt, parse_proposal_text
|
||||
from aituner.search import ThresholdProbe, binary_search_max_feasible
|
||||
from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
|
||||
from aituner.spec import Proposal, StudyState, TrialSummary, load_study_spec
|
||||
from aituner.spec import Proposal, SpecError, StudyState, TrialSummary, load_study_spec
|
||||
from aituner.store import StudyStore
|
||||
from aituner.trace import load_trace_requests, summarize_window
|
||||
from aituner.worker import (
|
||||
@@ -25,7 +25,9 @@ from aituner.worker import (
|
||||
from aituner.trace import TraceRequest
|
||||
|
||||
|
||||
def _write_study_assets(tmp_path: Path) -> Path:
|
||||
def _write_study_assets(
|
||||
tmp_path: Path, *, trace_overrides: dict[str, object] | None = None
|
||||
) -> Path:
|
||||
trace_dir = tmp_path / "trace_windows" / "traces"
|
||||
trace_dir.mkdir(parents=True)
|
||||
trace_path = trace_dir / "chat_w1.jsonl"
|
||||
@@ -81,6 +83,16 @@ def _write_study_assets(tmp_path: Path) -> Path:
|
||||
)
|
||||
|
||||
study_path = tmp_path / "study.json"
|
||||
trace_payload: dict[str, object] = {
|
||||
"windows_path": str(windows_path),
|
||||
"window_id": "chat_w1",
|
||||
"u_field": "sampling_u",
|
||||
"timestamp_field": "timestamp",
|
||||
"max_concurrency": 4,
|
||||
}
|
||||
if trace_overrides:
|
||||
trace_payload.update(trace_overrides)
|
||||
|
||||
study_payload = {
|
||||
"study_id": "study-1",
|
||||
"hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
|
||||
@@ -105,13 +117,7 @@ def _write_study_assets(tmp_path: Path) -> Path:
|
||||
"tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
|
||||
"python_executable": "python3"
|
||||
},
|
||||
"trace": {
|
||||
"windows_path": str(windows_path),
|
||||
"window_id": "chat_w1",
|
||||
"u_field": "sampling_u",
|
||||
"timestamp_field": "timestamp",
|
||||
"max_concurrency": 4
|
||||
},
|
||||
"trace": trace_payload,
|
||||
"slo": {
|
||||
"target_pass_rate": 0.95,
|
||||
"ttft_rule": {
|
||||
@@ -161,9 +167,53 @@ class CoreFlowTests(unittest.TestCase):
|
||||
)
|
||||
self.assertIn("allowed_flag_keys", prompt)
|
||||
self.assertIn("study-1", prompt)
|
||||
self.assertIn('"current_best"', prompt)
|
||||
self.assertIn("queueing_knee_by_bucket", prompt)
|
||||
self.assertTrue(study_root.exists())
|
||||
|
||||
def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
study_path = _write_study_assets(
|
||||
tmp_path,
|
||||
trace_overrides={
|
||||
"input_length_filter": {
|
||||
"min_input_tokens": 0,
|
||||
"max_input_tokens": 8192,
|
||||
}
|
||||
},
|
||||
)
|
||||
study = load_study_spec(study_path)
|
||||
window, requests = load_trace_requests(study, study_spec_path=study_path)
|
||||
summary = summarize_window(requests, window)
|
||||
self.assertEqual(len(requests), 2)
|
||||
self.assertEqual([item.prompt_tokens_hint for item in requests], [1000, 5000])
|
||||
self.assertEqual(summary["request_count"], 2)
|
||||
self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
|
||||
prompt = build_prompt(
|
||||
study=study,
|
||||
window_summary=summary,
|
||||
state=StudyState(study_id=study.study_id),
|
||||
capability_profile=None,
|
||||
)
|
||||
self.assertIn('"input_length_filter"', prompt)
|
||||
self.assertIn('"max_input_tokens": 8192', prompt)
|
||||
|
||||
def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
study_path = _write_study_assets(
|
||||
tmp_path,
|
||||
trace_overrides={
|
||||
"input_length_filter": {
|
||||
"min_input_tokens": 8193,
|
||||
"max_input_tokens": 8192,
|
||||
}
|
||||
},
|
||||
)
|
||||
with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
|
||||
load_study_spec(study_path)
|
||||
|
||||
def test_prompt_includes_failed_trial_context(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
@@ -652,8 +702,36 @@ class CoreFlowTests(unittest.TestCase):
|
||||
)
|
||||
next_state = store.ingest_trial_results(study.study_id)
|
||||
self.assertEqual(next_state.best_trial_id, trial.trial_id)
|
||||
self.assertEqual(next_state.best_sampling_u, 0.75)
|
||||
self.assertEqual(next_state.best_request_rate, 12.5)
|
||||
|
||||
def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
study_path = _write_study_assets(tmp_path)
|
||||
study = load_study_spec(study_path)
|
||||
store = StudyStore(tmp_path / ".aituner" / "studies")
|
||||
store.init_study(spec_path=study_path, study=study)
|
||||
state = StudyState(
|
||||
study_id=study.study_id,
|
||||
best_trial_id="trial-0001",
|
||||
best_sampling_u=0.375,
|
||||
best_request_rate=3.0,
|
||||
next_trial_index=2,
|
||||
trials=[],
|
||||
)
|
||||
proposal = Proposal.from_dict(
|
||||
{
|
||||
"observation": "Obs",
|
||||
"diagnosis": "Diag",
|
||||
"config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
|
||||
"expected_effects": ["raise rate"],
|
||||
}
|
||||
)
|
||||
trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
|
||||
self.assertEqual(trial.search.low, 0.375)
|
||||
self.assertEqual(trial.search.high, 1.0)
|
||||
|
||||
def test_ingest_trial_results_records_failure_reason(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
@@ -764,6 +842,7 @@ class CoreFlowTests(unittest.TestCase):
|
||||
store = StudyStore(store_root)
|
||||
state = store.load_state("study-1")
|
||||
self.assertEqual(state.best_trial_id, "trial-0002")
|
||||
self.assertEqual(state.best_sampling_u, 0.75)
|
||||
self.assertEqual(state.best_request_rate, 2.0)
|
||||
self.assertEqual(state.next_trial_index, 3)
|
||||
|
||||
@@ -795,6 +874,20 @@ class CoreFlowTests(unittest.TestCase):
|
||||
["throughput: higher", "ttft: lower"],
|
||||
)
|
||||
|
||||
def test_parse_proposal_text_accepts_wrapped_json(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
study_path = _write_study_assets(tmp_path)
|
||||
study = load_study_spec(study_path)
|
||||
proposal = parse_proposal_text(
|
||||
"""Here is the proposal:
|
||||
```json
|
||||
{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
|
||||
```""",
|
||||
study,
|
||||
)
|
||||
self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32)
|
||||
|
||||
def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
|
||||
requests = [
|
||||
TraceRequest(
|
||||
@@ -929,6 +1022,71 @@ class CoreFlowTests(unittest.TestCase):
|
||||
self.assertEqual(len(replayed), 2)
|
||||
self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")
|
||||
|
||||
def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:
|
||||
requests = [
|
||||
TraceRequest(
|
||||
row_id="r0",
|
||||
arrival_s=0.0,
|
||||
sampling_u=0.1,
|
||||
body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
|
||||
prompt_tokens_hint=8,
|
||||
completion_tokens_hint=4,
|
||||
)
|
||||
]
|
||||
|
||||
class FakeFuture:
|
||||
def result(self, timeout=None):
|
||||
raise AssertionError("future should not be awaited after elapsed early stop")
|
||||
|
||||
def cancel(self):
|
||||
return True
|
||||
|
||||
submitted = []
|
||||
|
||||
class FakeExecutor:
|
||||
def __init__(self, max_workers):
|
||||
self.max_workers = max_workers
|
||||
|
||||
def submit(self, fn, request, **kwargs):
|
||||
submitted.append(request.row_id)
|
||||
return FakeFuture()
|
||||
|
||||
def shutdown(self, wait=False, cancel_futures=True):
|
||||
return None
|
||||
|
||||
wait_timeouts: list[float] = []
|
||||
|
||||
def fake_wait(futures, timeout=None, return_when=None):
|
||||
wait_timeouts.append(timeout)
|
||||
return set(), set(futures)
|
||||
|
||||
def fake_evaluate(outcome: RequestOutcome):
|
||||
return type("Eval", (), {"passed": outcome.success})()
|
||||
|
||||
monotonic_values = iter([0.0, 0.0, 0.4, 1.2])
|
||||
|
||||
with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor):
|
||||
with mock.patch("aituner.worker.wait", side_effect=fake_wait):
|
||||
with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)):
|
||||
replayed, early_stopped, reason = _replay_requests(
|
||||
requests,
|
||||
base_url="http://127.0.0.1:8000",
|
||||
timeout_s=30.0,
|
||||
max_concurrency=1,
|
||||
target_pass_rate=0.95,
|
||||
max_lag_s=None,
|
||||
max_elapsed_s=1.0,
|
||||
evaluate_outcome=fake_evaluate,
|
||||
)
|
||||
|
||||
self.assertEqual(submitted, ["r0"])
|
||||
self.assertTrue(early_stopped)
|
||||
self.assertEqual(reason, "probe_elapsed_s>1.0")
|
||||
self.assertEqual(len(replayed), 1)
|
||||
self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")
|
||||
self.assertTrue(wait_timeouts)
|
||||
self.assertLessEqual(wait_timeouts[0], 0.5)
|
||||
|
||||
def test_latency_summary_reports_quantiles_and_slo(self) -> None:
|
||||
study = load_study_spec(_write_study_assets(Path(tempfile.mkdtemp())))
|
||||
outcomes = [
|
||||
|
||||
Reference in New Issue
Block a user