diff --git a/configs/examples/dash0_qwen27b_tight_slo_baseline_proposal.json b/configs/examples/dash0_qwen27b_tight_slo_baseline_proposal.json
new file mode 100644
index 0000000..662a30d
--- /dev/null
+++ b/configs/examples/dash0_qwen27b_tight_slo_baseline_proposal.json
@@ -0,0 +1,28 @@
+{
+  "observation": "The incumbent should start from the known launch-safe qwen3.5-27b serving recipe on dash0 before asking the LLM to optimize throughput above that baseline.",
+  "diagnosis": "This model uses a long-context hybrid stack and fp8 quantization. The safest first measurement is to preserve the existing warmup, hybrid-model, chunked-prefill, and prefix-caching behavior from run_qwen27b.sh, while keeping a conservative sequence cap.",
+  "config_patch": {
+    "env_patch": {
+      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+      "VLLM_ENABLE_TORCH_COMPILE": "1",
+      "VLLM_USE_FLASHINFER_SAMPLER": "0",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1"
+    },
+    "flag_patch": {
+      "tensor-parallel-size": 4,
+      "gpu-memory-utilization": 0.9,
+      "block-size": 64,
+      "max-num-batched-tokens": 8192,
+      "max-num-seqs": 16,
+      "enable-prefix-caching": true,
+      "enable-chunked-prefill": true,
+      "disable-cascade-attn": true
+    }
+  },
+  "expected_effects": [
+    "Launch-safe baseline aligned with the current hand-tuned qwen27b recipe while using all 4 visible H20 GPUs",
+    "Reliable first incumbent under the tighter TTFT and TPOT SLO",
+    "Clear trial history for the LLM to propose a higher-throughput follow-up patch"
+  ],
+  "why_not_previous_failures": "This baseline intentionally avoids speculative new kernels or batching spikes before we have an incumbent under the new SLO."
+}
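
The patch above is a delta, not a full launch spec: the runner overlays env_patch on the study's base_envs and flag_patch on base_flags before launching the engine. A minimal sketch of that overlay, assuming plain last-writer-wins dict merging; the helper name apply_config_patch is illustrative and not part of this diff:

    # Illustrative only: overlay a proposal's config_patch onto the base launch spec.
    def apply_config_patch(
        base_envs: dict[str, str],
        env_patch: dict[str, str],
        base_flags: dict[str, object],
        flag_patch: dict[str, object],
    ) -> tuple[dict[str, str], dict[str, object]]:
        # Patched keys win; every other base key passes through unchanged.
        return {**base_envs, **env_patch}, {**base_flags, **flag_patch}

Under this reading, the baseline patch's "tensor-parallel-size": 4 overrides run1's base value of 1 while every other run1 flag stays as configured below.
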
"trust-remote-code": true, + "dtype": "bfloat16", + "gpu-memory-utilization": 0.9, + "enable-prefix-caching": true, + "mamba-cache-mode": "light", + "distributed-executor-backend": "mp", + "block-size": 64, + "enable-chunked-prefill": true, + "max-num-batched-tokens": 8192, + "disable-cascade-attn": true, + "max-model-len": 262144, + "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}", + "mm-processor-cache-gb": 0, + "limit-mm-per-prompt": "{\"image\":256,\"video\":64}", + "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}", + "mamba-cache-dtype": "float32", + "skip-mm-profiling": true, + "quantization": "fp8", + "tensor-parallel-size": 1, + "max-num-seqs": 16, + "disable-log-requests": true + }, + "tunable_envs": [ + "VLLM_ATTENTION_BACKEND", + "VLLM_ENABLE_TORCH_COMPILE", + "VLLM_USE_FLASHINFER_SAMPLER", + "VLLM_ENABLE_MODEL_RUNNER_WARMUP" + ], + "tunable_flags": [ + "tensor-parallel-size", + "gpu-memory-utilization", + "block-size", + "max-num-batched-tokens", + "max-num-seqs", + "enable-prefix-caching", + "enable-chunked-prefill", + "disable-cascade-attn" + ], + "python_executable": "python3" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "chat_w20260311_1000", + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 32, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 120.0, + "early_stop_max_elapsed_s": 900.0 + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 4096, + "threshold_ms": 2000 + }, + { + "max_input_tokens": 32768, + "threshold_ms": 4000 + }, + { + "threshold_ms": 6000 + } + ] + }, + "tpot_rule": { + "kind": "fixed_ms", + "threshold_ms": 50 + } + }, + "search": { + "low": 0.0, + "high": 1.0, + "tolerance": 0.01, + "max_probes": 8, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. 
diff --git a/configs/examples/dash0_qwen27b_tight_slo_run2.json b/configs/examples/dash0_qwen27b_tight_slo_run2.json
new file mode 100644
index 0000000..58a968b
--- /dev/null
+++ b/configs/examples/dash0_qwen27b_tight_slo_run2.json
@@ -0,0 +1,147 @@
+{
+  "study_id": "dash0-qwen27b-tight-slo-10min-run2",
+  "hardware": {
+    "gpu_count": 4,
+    "gpu_model": "H20",
+    "host_candidates": [
+      "dash0"
+    ]
+  },
+  "model": {
+    "model_id": "qwen3.5-27b-256k-0223-internal",
+    "served_model_name": "qwen35-27b-aituner"
+  },
+  "engine": {
+    "engine_name": "vllm",
+    "engine_version": "latest-release-on-dash0",
+    "exec_path": "/usr/local/bin/vllm",
+    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
+    "host": "127.0.0.1",
+    "port": 18082,
+    "healthcheck_path": "/v1/models",
+    "ready_timeout_s": 900,
+    "request_timeout_s": 900,
+    "launch_args": [
+      "serve",
+      "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
+    ],
+    "base_envs": {
+      "CUDA_VISIBLE_DEVICES": "4,5,6,7",
+      "VLLM_DISABLE_COMPILE_CACHE": "1",
+      "DS_LLM_IGNORE_WARMUP": "1",
+      "DS_LLM_IGNORE_CHECK_WARMUP": "1",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
+      "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
+      "PARAM_TOTAL_MAX": "262144",
+      "PARAM_IN_LENGTH_MAX": "262144",
+      "PARAM_MAX_LENGTH_MAX": "131072",
+      "DS_LLM_MAX_THINK_TOKENS": "81920",
+      "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
+      "VLLM_FP8_USE_BLADNN": "1",
+      "VLLM_MOE_USE_BLADNN": "1",
+      "VLLM_GDN_USE_BLADNN": "0",
+      "VLLM_USE_V1": "1",
+      "VLLM_IS_HYBRID_MODEL": "1",
+      "VLLM_ENABLE_TORCH_COMPILE": "1",
+      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+      "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
+      "VLLM_USE_FLASHINFER_SAMPLER": "0",
+      "VLLM_RESPONSE_TIMEOUT": "300",
+      "VLLM_LOG_REQ_KV_LENS": "1",
+      "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
+    },
+    "base_flags": {
+      "host": "127.0.0.1",
+      "port": 18082,
+      "served-model-name": "qwen35-27b-aituner",
+      "trust-remote-code": true,
+      "dtype": "bfloat16",
+      "gpu-memory-utilization": 0.9,
+      "enable-prefix-caching": true,
+      "mamba-cache-mode": "light",
+      "distributed-executor-backend": "mp",
+      "block-size": 64,
+      "enable-chunked-prefill": true,
+      "max-num-batched-tokens": 8192,
+      "disable-cascade-attn": true,
+      "max-model-len": 262144,
+      "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
+      "mm-processor-cache-gb": 0,
+      "limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
+      "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
+      "mamba-cache-dtype": "float32",
+      "skip-mm-profiling": true,
+      "quantization": "fp8",
+      "tensor-parallel-size": 4,
+      "max-num-seqs": 16,
+      "disable-log-requests": true
+    },
+    "tunable_envs": [
+      "VLLM_ATTENTION_BACKEND",
+      "VLLM_ENABLE_TORCH_COMPILE",
+      "VLLM_USE_FLASHINFER_SAMPLER",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP"
+    ],
+    "tunable_flags": [
+      "tensor-parallel-size",
+      "gpu-memory-utilization",
+      "block-size",
+      "max-num-batched-tokens",
+      "max-num-seqs",
+      "enable-prefix-caching",
+      "enable-chunked-prefill",
+      "disable-cascade-attn"
+    ],
+    "python_executable": "python3"
+  },
+  "trace": {
+    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
+    "window_id": "chat_w20260311_1000",
+    "u_field": "sampling_u",
+    "timestamp_field": "timestamp",
+    "max_concurrency": 32,
+    "replay_time_scale": 1.0,
+    "early_stop_max_lag_s": 120.0,
+    "early_stop_max_elapsed_s": 900.0
+  },
+  "slo": {
+    "target_pass_rate": 0.95,
+    "ttft_rule": {
+      "kind": "step_ms",
+      "buckets": [
+        {
+          "max_input_tokens": 4096,
+          "threshold_ms": 2000
+        },
+        {
+          "max_input_tokens": 32768,
+          "threshold_ms": 4000
+        },
+        {
+          "threshold_ms": 6000
+        }
+      ]
+    },
+    "tpot_rule": {
+      "kind": "fixed_ms",
+      "threshold_ms": 50
+    }
+  },
+  "search": {
+    "low": 0.0,
+    "high": 0.0625,
+    "tolerance": 0.001,
+    "max_probes": 6,
+    "sample_seed": 20260325
+  },
+  "llm": {
+    "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
+    "max_history_trials": 8,
+    "endpoint": {
+      "base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
+      "model": "gpt-5.4",
+      "api_key_env": "OPENAI_API_KEY",
+      "timeout_s": 180
+    }
+  }
+}
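
Relative to run1, run2 shrinks the search interval from [0.0, 1.0] to [0.0, 0.0625] and tightens tolerance to 0.001; six halvings bring the interval just under tolerance (0.0625 / 2^6 = 0.00098), which is why max_probes drops to 6. A sketch of the feasibility bisection this budget assumes, where probe(u) replays the window at sampling_u = u and returns whether the SLO pass rate held; this mirrors binary_search_max_feasible in src/aituner/search.py only in spirit, since the real signature is not shown in this diff:

    from typing import Callable

    def max_feasible_u(
        probe: Callable[[float], bool],
        low: float,
        high: float,
        tolerance: float,
        max_probes: int,
    ) -> float | None:
        best = None
        for _ in range(max_probes):
            if high - low <= tolerance:
                break
            mid = (low + high) / 2.0
            if probe(mid):
                best, low = mid, mid  # feasible: keep it and search above
            else:
                high = mid  # infeasible: search below
        return best
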
"/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "chat_w20260311_1000", + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 32, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 120.0, + "early_stop_max_elapsed_s": 900.0 + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 4096, + "threshold_ms": 2000 + }, + { + "max_input_tokens": 32768, + "threshold_ms": 4000 + }, + { + "threshold_ms": 6000 + } + ] + }, + "tpot_rule": { + "kind": "fixed_ms", + "threshold_ms": 50 + } + }, + "search": { + "low": 0.0, + "high": 0.0625, + "tolerance": 0.001, + "max_probes": 6, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.", + "max_history_trials": 8, + "endpoint": { + "base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1", + "model": "gpt-5.4", + "api_key_env": "OPENAI_API_KEY", + "timeout_s": 180 + } + } +} diff --git a/configs/examples/dash0_qwen27b_tight_slo_run3.json b/configs/examples/dash0_qwen27b_tight_slo_run3.json new file mode 100644 index 0000000..64aaa3e --- /dev/null +++ b/configs/examples/dash0_qwen27b_tight_slo_run3.json @@ -0,0 +1,147 @@ +{ + "study_id": "dash0-qwen27b-tight-slo-10min-run3", + "hardware": { + "gpu_count": 4, + "gpu_model": "H20", + "host_candidates": [ + "dash0" + ] + }, + "model": { + "model_id": "qwen3.5-27b-256k-0223-internal", + "served_model_name": "qwen35-27b-aituner" + }, + "engine": { + "engine_name": "vllm", + "engine_version": "latest-release-on-dash0", + "exec_path": "/usr/local/bin/vllm", + "cwd": "/home/admin/cpfs/wjh/aituner/aituner", + "host": "127.0.0.1", + "port": 18082, + "healthcheck_path": "/v1/models", + "ready_timeout_s": 900, + "request_timeout_s": 900, + "launch_args": [ + "serve", + "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal" + ], + "base_envs": { + "CUDA_VISIBLE_DEVICES": "4,5,6,7", + "VLLM_DISABLE_COMPILE_CACHE": "1", + "DS_LLM_IGNORE_WARMUP": "1", + "DS_LLM_IGNORE_CHECK_WARMUP": "1", + "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1", + "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0", + "PARAM_TOTAL_MAX": "262144", + "PARAM_IN_LENGTH_MAX": "262144", + "PARAM_MAX_LENGTH_MAX": "131072", + "DS_LLM_MAX_THINK_TOKENS": "81920", + "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600", + "VLLM_FP8_USE_BLADNN": "1", + "VLLM_MOE_USE_BLADNN": "1", + "VLLM_GDN_USE_BLADNN": "0", + "VLLM_USE_V1": "1", + "VLLM_IS_HYBRID_MODEL": "1", + "VLLM_ENABLE_TORCH_COMPILE": "1", + "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", + "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1", + "VLLM_USE_FLASHINFER_SAMPLER": "0", + "VLLM_RESPONSE_TIMEOUT": "300", + "VLLM_LOG_REQ_KV_LENS": "1", + "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600" + }, + "base_flags": { + "host": "127.0.0.1", + "port": 18082, + "served-model-name": "qwen35-27b-aituner", + "trust-remote-code": true, + "dtype": "bfloat16", + "gpu-memory-utilization": 0.9, + "enable-prefix-caching": true, + "mamba-cache-mode": "light", + "distributed-executor-backend": "mp", + "block-size": 64, + "enable-chunked-prefill": true, + "max-num-batched-tokens": 8192, + "disable-cascade-attn": true, + "max-model-len": 262144, + "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}", + "mm-processor-cache-gb": 0, + 
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}", + "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}", + "mamba-cache-dtype": "float32", + "skip-mm-profiling": true, + "quantization": "fp8", + "tensor-parallel-size": 4, + "max-num-seqs": 16, + "disable-log-requests": true + }, + "tunable_envs": [ + "VLLM_ATTENTION_BACKEND", + "VLLM_ENABLE_TORCH_COMPILE", + "VLLM_USE_FLASHINFER_SAMPLER", + "VLLM_ENABLE_MODEL_RUNNER_WARMUP" + ], + "tunable_flags": [ + "tensor-parallel-size", + "gpu-memory-utilization", + "block-size", + "max-num-batched-tokens", + "max-num-seqs", + "enable-prefix-caching", + "enable-chunked-prefill", + "disable-cascade-attn" + ], + "python_executable": "python3" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "chat_w20260311_1000", + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 32, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 120.0, + "early_stop_max_elapsed_s": 900.0 + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 4096, + "threshold_ms": 2000 + }, + { + "max_input_tokens": 32768, + "threshold_ms": 4000 + }, + { + "threshold_ms": 6000 + } + ] + }, + "tpot_rule": { + "kind": "fixed_ms", + "threshold_ms": 50 + } + }, + "search": { + "low": 0.0, + "high": 0.0625, + "tolerance": 0.001, + "max_probes": 6, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.", + "max_history_trials": 8, + "endpoint": { + "base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1", + "model": "gpt-5.4", + "api_key_env": "OPENAI_API_KEY", + "timeout_s": 180 + } + } +} diff --git a/configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json b/configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json new file mode 100644 index 0000000..a59decb --- /dev/null +++ b/configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json @@ -0,0 +1,151 @@ +{ + "study_id": "dash0-qwen27b-tight-slo-10min-run4-chat-0-8k", + "hardware": { + "gpu_count": 4, + "gpu_model": "H20", + "host_candidates": [ + "dash0" + ] + }, + "model": { + "model_id": "qwen3.5-27b-256k-0223-internal", + "served_model_name": "qwen35-27b-aituner" + }, + "engine": { + "engine_name": "vllm", + "engine_version": "latest-release-on-dash0", + "exec_path": "/usr/local/bin/vllm", + "cwd": "/home/admin/cpfs/wjh/aituner/aituner", + "host": "127.0.0.1", + "port": 18082, + "healthcheck_path": "/v1/models", + "ready_timeout_s": 900, + "request_timeout_s": 900, + "launch_args": [ + "serve", + "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal" + ], + "base_envs": { + "CUDA_VISIBLE_DEVICES": "4,5,6,7", + "VLLM_DISABLE_COMPILE_CACHE": "1", + "DS_LLM_IGNORE_WARMUP": "1", + "DS_LLM_IGNORE_CHECK_WARMUP": "1", + "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1", + "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0", + "PARAM_TOTAL_MAX": "262144", + "PARAM_IN_LENGTH_MAX": "262144", + "PARAM_MAX_LENGTH_MAX": "131072", + "DS_LLM_MAX_THINK_TOKENS": "81920", + "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600", + "VLLM_FP8_USE_BLADNN": "1", + "VLLM_MOE_USE_BLADNN": "1", + "VLLM_GDN_USE_BLADNN": "0", + 
"VLLM_USE_V1": "1", + "VLLM_IS_HYBRID_MODEL": "1", + "VLLM_ENABLE_TORCH_COMPILE": "1", + "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", + "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1", + "VLLM_USE_FLASHINFER_SAMPLER": "0", + "VLLM_RESPONSE_TIMEOUT": "300", + "VLLM_LOG_REQ_KV_LENS": "1", + "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600" + }, + "base_flags": { + "host": "127.0.0.1", + "port": 18082, + "served-model-name": "qwen35-27b-aituner", + "trust-remote-code": true, + "dtype": "bfloat16", + "gpu-memory-utilization": 0.9, + "enable-prefix-caching": true, + "mamba-cache-mode": "light", + "distributed-executor-backend": "mp", + "block-size": 64, + "enable-chunked-prefill": true, + "max-num-batched-tokens": 8192, + "disable-cascade-attn": true, + "max-model-len": 262144, + "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}", + "mm-processor-cache-gb": 0, + "limit-mm-per-prompt": "{\"image\":256,\"video\":64}", + "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}", + "mamba-cache-dtype": "float32", + "skip-mm-profiling": true, + "quantization": "fp8", + "tensor-parallel-size": 4, + "max-num-seqs": 16, + "disable-log-requests": true + }, + "tunable_envs": [ + "VLLM_ATTENTION_BACKEND", + "VLLM_ENABLE_TORCH_COMPILE", + "VLLM_USE_FLASHINFER_SAMPLER", + "VLLM_ENABLE_MODEL_RUNNER_WARMUP" + ], + "tunable_flags": [ + "tensor-parallel-size", + "gpu-memory-utilization", + "block-size", + "max-num-batched-tokens", + "max-num-seqs", + "enable-prefix-caching", + "enable-chunked-prefill", + "disable-cascade-attn" + ], + "python_executable": "python3" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "chat_w20260311_1000", + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 32, + "input_length_filter": { + "min_input_tokens": 0, + "max_input_tokens": 8192 + }, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 120.0, + "early_stop_max_elapsed_s": 900.0 + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 4096, + "threshold_ms": 2000 + }, + { + "max_input_tokens": 32768, + "threshold_ms": 4000 + }, + { + "threshold_ms": 6000 + } + ] + }, + "tpot_rule": { + "kind": "fixed_ms", + "threshold_ms": 50 + } + }, + "search": { + "low": 0.0, + "high": 0.0625, + "tolerance": 0.001, + "max_probes": 6, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. 
diff --git a/src/aituner/llm.py b/src/aituner/llm.py
index 21ebed8..6f76dd6 100644
--- a/src/aituner/llm.py
+++ b/src/aituner/llm.py
@@ -42,6 +42,11 @@ def build_prompt(
         json.dumps(
             {
                 "study_id": study.study_id,
+                "current_best": {
+                    "trial_id": state.best_trial_id,
+                    "best_sampling_u": state.best_sampling_u,
+                    "best_request_rate": state.best_request_rate,
+                },
                 "hardware": {
                     "gpu_count": study.hardware.gpu_count,
                     "gpu_model": study.hardware.gpu_model,
@@ -50,6 +55,17 @@
                     "model_id": study.model.model_id,
                     "served_model_name": study.model.served_model_name,
                 },
+                "trace": {
+                    "window_id": study.trace.window_id,
+                    "input_length_filter": (
+                        {
+                            "min_input_tokens": study.trace.input_length_filter.min_input_tokens,
+                            "max_input_tokens": study.trace.input_length_filter.max_input_tokens,
+                        }
+                        if study.trace.input_length_filter is not None
+                        else None
+                    ),
+                },
                 "engine": {
                     "engine_name": study.engine.engine_name,
                     "engine_version": study.engine.engine_version,
@@ -84,6 +100,8 @@
         "Trial history:",
         json.dumps(history, ensure_ascii=False, indent=2),
         "",
+        "The proposal must beat the current incumbent. Do not propose a config that is only likely to be feasible below the current best_sampling_u/request_rate.",
+        "The evaluator for a new trial will start searching from the current best feasible sampling_u and only look for improvements above it.",
         "The proposal should improve the maximum feasible sampling_u under the 95%+ SLO target.",
     ]
     return "\n".join(sections)
@@ -110,8 +128,22 @@ def validate_proposal(proposal: Proposal, study: StudySpec) -> Proposal:
     return proposal


+def _parse_json_object_text(text: str) -> dict[str, Any]:
+    try:
+        payload = json.loads(text)
+    except json.JSONDecodeError:
+        start = text.find("{")
+        end = text.rfind("}")
+        if start < 0 or end < start:
+            raise
+        payload = json.loads(text[start : end + 1])
+    if not isinstance(payload, dict):
+        raise SpecError("proposal payload must be a JSON object")
+    return payload
+
+
 def parse_proposal_text(text: str, study: StudySpec) -> Proposal:
-    payload = json.loads(text)
+    payload = _parse_json_object_text(text)
     proposal = Proposal.from_dict(payload)
     return validate_proposal(proposal, study)
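
With _parse_json_object_text in place, proposal parsing tolerates models that wrap their JSON in prose or a code fence: strict json.loads is tried first, and on failure the span from the first "{" to the last "}" is parsed instead, so only output with no extractable object remains a hard error. A quick illustration of the fallback path:

    # json.loads fails on the wrapper, so the outermost {...} span is parsed instead.
    text = 'Here is the proposal:\n```json\n{"observation": "obs"}\n```'
    payload = _parse_json_object_text(text)
    assert payload == {"observation": "obs"}
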
diff --git a/src/aituner/spec.py b/src/aituner/spec.py
index 2048818..aa37115 100644
--- a/src/aituner/spec.py
+++ b/src/aituner/spec.py
@@ -142,6 +142,42 @@ class EngineLaunchSpec:
         )


+@dataclass(frozen=True)
+class InputLengthFilterSpec:
+    min_input_tokens: int | None = None
+    max_input_tokens: int | None = None
+
+    @classmethod
+    def from_dict(cls, data: Mapping[str, Any], *, context: str) -> "InputLengthFilterSpec":
+        min_input_tokens = data.get("min_input_tokens")
+        max_input_tokens = data.get("max_input_tokens")
+        spec = cls(
+            min_input_tokens=(
+                _require_int(min_input_tokens, context=f"{context}.min_input_tokens")
+                if min_input_tokens is not None
+                else None
+            ),
+            max_input_tokens=(
+                _require_int(max_input_tokens, context=f"{context}.max_input_tokens")
+                if max_input_tokens is not None
+                else None
+            ),
+        )
+        if spec.min_input_tokens is None and spec.max_input_tokens is None:
+            raise SpecError(
+                f"{context} must define at least one of min_input_tokens/max_input_tokens."
+            )
+        if (
+            spec.min_input_tokens is not None
+            and spec.max_input_tokens is not None
+            and spec.min_input_tokens > spec.max_input_tokens
+        ):
+            raise SpecError(
+                f"{context}.min_input_tokens must be <= {context}.max_input_tokens."
+            )
+        return spec
+
+
 @dataclass(frozen=True)
 class TraceSpec:
     windows_path: str
@@ -150,6 +186,7 @@
     u_field: str
     timestamp_field: str
     max_concurrency: int
+    input_length_filter: InputLengthFilterSpec | None = None
     max_requests_per_probe: int | None = None
     synthetic_prompt_cap_tokens: int | None = None
     replay_time_scale: float = 1.0
@@ -171,6 +208,17 @@
             max_concurrency=_require_int(
                 data.get("max_concurrency", 64), context="trace.max_concurrency"
             ),
+            input_length_filter=(
+                InputLengthFilterSpec.from_dict(
+                    _require_mapping(
+                        data.get("input_length_filter"),
+                        context="trace.input_length_filter",
+                    ),
+                    context="trace.input_length_filter",
+                )
+                if data.get("input_length_filter") is not None
+                else None
+            ),
             max_requests_per_probe=int(max_requests) if max_requests is not None else None,
             synthetic_prompt_cap_tokens=(
                 int(synthetic_prompt_cap) if synthetic_prompt_cap is not None else None
@@ -454,6 +502,7 @@ class TrialSummary:
 class StudyState:
     study_id: str
     best_trial_id: str | None = None
+    best_sampling_u: float | None = None
     best_request_rate: float | None = None
     next_trial_index: int = 1
     trials: list[TrialSummary] = field(default_factory=list)
diff --git a/src/aituner/store.py b/src/aituner/store.py
index ca2dd18..f2983e7 100644
--- a/src/aituner/store.py
+++ b/src/aituner/store.py
@@ -32,6 +32,7 @@ class StudyStore:
         return StudyState(
             study_id=str(payload["study_id"]),
             best_trial_id=payload.get("best_trial_id"),
+            best_sampling_u=payload.get("best_sampling_u"),
             best_request_rate=payload.get("best_request_rate"),
             next_trial_index=int(payload.get("next_trial_index", 1)),
             trials=trials,
@@ -64,7 +65,18 @@
             study_id=study.study_id,
             trial_id=trial_id,
             config_patch=proposal.config_patch,
-            search=study.search,
+            search=replace(
+                study.search,
+                low=min(
+                    study.search.high,
+                    max(
+                        study.search.low,
+                        float(state.best_sampling_u)
+                        if isinstance(state.best_sampling_u, (int, float))
+                        else study.search.low,
+                    ),
+                ),
+            ),
             study_spec_path=str((self.study_root(study.study_id) / "study_spec.source").resolve()),
             artifact_dir=str(trial_root),
             probe_log_path=str(trial_root / "probe_history.json"),
@@ -89,6 +101,7 @@
         by_id = {item.trial_id: item for item in state.trials}
         trials_dir = self.study_root(study_id) / "trials"
         best_trial_id = state.best_trial_id
+        best_sampling_u = state.best_sampling_u
         best_rate = state.best_request_rate
         for trial_dir in sorted(trials_dir.glob("trial-*")):
             result_path = trial_dir / "result.json"
@@ -112,7 +125,13 @@
                 and (best_rate is None or summary.best_request_rate > best_rate)
             ):
                 best_rate = float(summary.best_request_rate)
+                best_sampling_u = (
+                    float(summary.best_sampling_u)
+                    if isinstance(summary.best_sampling_u, (int, float))
+                    else None
+                )
                 best_trial_id = trial_id
+        state.best_sampling_u = best_sampling_u
         state.best_request_rate = best_rate
         state.best_trial_id = best_trial_id
         self.save_state(state)
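
materialize_trial now floors each new trial's search range at the incumbent, clamped into the study's bounds: low = min(high, max(low, best_sampling_u)). Worked through with the numbers used by the regression test added in tests/test_core_flow.py (study bounds [0.0, 1.0], incumbent 0.375):

    low = min(1.0, max(0.0, 0.375))  # -> 0.375, so the trial searches [0.375, 1.0]

If best_sampling_u is unset or non-numeric, the expression falls back to the study's own low, and an incumbent above high clamps to high rather than inverting the interval.
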
diff --git a/src/aituner/trace.py b/src/aituner/trace.py
index 36c7905..a318a4c 100644
--- a/src/aituner/trace.py
+++ b/src/aituner/trace.py
@@ -132,6 +132,25 @@ def _downsample_requests(
     return [requests[idx] for idx in indexes]


+def _matches_input_length_filter(study: StudySpec, *, prompt_tokens_hint: int | None) -> bool:
+    length_filter = study.trace.input_length_filter
+    if length_filter is None:
+        return True
+    if prompt_tokens_hint is None:
+        return False
+    if (
+        length_filter.min_input_tokens is not None
+        and prompt_tokens_hint < length_filter.min_input_tokens
+    ):
+        return False
+    if (
+        length_filter.max_input_tokens is not None
+        and prompt_tokens_hint > length_filter.max_input_tokens
+    ):
+        return False
+    return True
+
+
 def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[WindowRecord, list[TraceRequest]]:
     window = resolve_window_record(study, study_spec_path=study_spec_path)
     time_scale = float(study.trace.replay_time_scale)
@@ -163,6 +182,8 @@
         if isinstance(sampling_u, bool) or not isinstance(sampling_u, (int, float)):
             raise TraceError(f"trace row {idx} is missing numeric {study.trace.u_field}")
         prompt_tokens_hint = _coerce_prompt_tokens(row)
+        if not _matches_input_length_filter(study, prompt_tokens_hint=prompt_tokens_hint):
+            continue
         try:
             messages = _coerce_messages(row)
         except TraceError:
diff --git a/src/aituner/worker.py b/src/aituner/worker.py
index 2902d89..e569a13 100644
--- a/src/aituner/worker.py
+++ b/src/aituner/worker.py
@@ -177,14 +177,19 @@ def _replay_requests(
             if early_stopped:
                 break
             if futures_by_request:
-                timeout = None
+                timeout = 0.5
                 if next_index < len(requests):
-                    timeout = max(0.0, requests[next_index].arrival_s - elapsed)
+                    timeout = min(timeout, max(0.0, requests[next_index].arrival_s - elapsed))
+                if max_elapsed_s is not None:
+                    remaining_elapsed = max(0.0, max_elapsed_s - elapsed)
+                    timeout = min(timeout, remaining_elapsed)
                 done, _ = wait(
                     list(futures_by_request),
                     timeout=timeout,
                     return_when=FIRST_COMPLETED,
                 )
+                if not done:
+                    continue
                 for future in done:
                     request = futures_by_request.pop(future)
                     outcome = future.result()
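
Before this change, wait() could block indefinitely once every remaining request was in flight, so early_stop_max_elapsed_s could not fire until some future completed. The timeout is now capped at a 0.5 s poll and further bounded by both the next scheduled arrival and the remaining elapsed budget; an empty done set simply re-enters the loop, where the early-stop checks run. For example, with the next arrival 0.3 s out and 0.2 s of elapsed budget left:

    timeout = 0.5
    timeout = min(timeout, max(0.0, 0.3))  # next-arrival bound    -> 0.3
    timeout = min(timeout, max(0.0, 0.2))  # elapsed-budget bound  -> 0.2
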
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 4d24d6b..d084318 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -13,7 +13,7 @@ from aituner.job import append_job, build_trial_job
 from aituner.llm import build_prompt, parse_proposal_text
 from aituner.search import ThresholdProbe, binary_search_max_feasible
 from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
-from aituner.spec import Proposal, StudyState, TrialSummary, load_study_spec
+from aituner.spec import Proposal, SpecError, StudyState, TrialSummary, load_study_spec
 from aituner.store import StudyStore
 from aituner.trace import load_trace_requests, summarize_window
 from aituner.worker import (
@@ -25,7 +25,9 @@
 from aituner.trace import TraceRequest


-def _write_study_assets(tmp_path: Path) -> Path:
+def _write_study_assets(
+    tmp_path: Path, *, trace_overrides: dict[str, object] | None = None
+) -> Path:
     trace_dir = tmp_path / "trace_windows" / "traces"
     trace_dir.mkdir(parents=True)
     trace_path = trace_dir / "chat_w1.jsonl"
@@ -81,6 +83,16 @@
     )

     study_path = tmp_path / "study.json"
+    trace_payload: dict[str, object] = {
+        "windows_path": str(windows_path),
+        "window_id": "chat_w1",
+        "u_field": "sampling_u",
+        "timestamp_field": "timestamp",
+        "max_concurrency": 4,
+    }
+    if trace_overrides:
+        trace_payload.update(trace_overrides)
+
     study_payload = {
         "study_id": "study-1",
         "hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
@@ -105,13 +117,7 @@
             "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
             "python_executable": "python3"
         },
-        "trace": {
-            "windows_path": str(windows_path),
-            "window_id": "chat_w1",
-            "u_field": "sampling_u",
-            "timestamp_field": "timestamp",
-            "max_concurrency": 4
-        },
+        "trace": trace_payload,
         "slo": {
             "target_pass_rate": 0.95,
             "ttft_rule": {
@@ -161,9 +167,53 @@
         )
         self.assertIn("allowed_flag_keys", prompt)
         self.assertIn("study-1", prompt)
+        self.assertIn('"current_best"', prompt)
         self.assertIn("queueing_knee_by_bucket", prompt)
         self.assertTrue(study_root.exists())

+    def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={
+                    "input_length_filter": {
+                        "min_input_tokens": 0,
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            study = load_study_spec(study_path)
+            window, requests = load_trace_requests(study, study_spec_path=study_path)
+            summary = summarize_window(requests, window)
+            self.assertEqual(len(requests), 2)
+            self.assertEqual([item.prompt_tokens_hint for item in requests], [1000, 5000])
+            self.assertEqual(summary["request_count"], 2)
+            self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
+            prompt = build_prompt(
+                study=study,
+                window_summary=summary,
+                state=StudyState(study_id=study.study_id),
+                capability_profile=None,
+            )
+            self.assertIn('"input_length_filter"', prompt)
+            self.assertIn('"max_input_tokens": 8192', prompt)
+
+    def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={
+                    "input_length_filter": {
+                        "min_input_tokens": 8193,
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
+                load_study_spec(study_path)
+
     def test_prompt_includes_failed_trial_context(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -652,8 +702,36 @@
         )
         next_state = store.ingest_trial_results(study.study_id)
         self.assertEqual(next_state.best_trial_id, trial.trial_id)
+        self.assertEqual(next_state.best_sampling_u, 0.75)
         self.assertEqual(next_state.best_request_rate, 12.5)

+    def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0001",
+                best_sampling_u=0.375,
+                best_request_rate=3.0,
+                next_trial_index=2,
+                trials=[],
+            )
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "Obs",
+                    "diagnosis": "Diag",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
+                    "expected_effects": ["raise rate"],
+                }
+            )
+            trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
+            self.assertEqual(trial.search.low, 0.375)
+            self.assertEqual(trial.search.high, 1.0)
+
     def test_ingest_trial_results_records_failure_reason(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -764,6 +842,7 @@
         store = StudyStore(store_root)
         state = store.load_state("study-1")
         self.assertEqual(state.best_trial_id, "trial-0002")
+        self.assertEqual(state.best_sampling_u, 0.75)
         self.assertEqual(state.best_request_rate, 2.0)
         self.assertEqual(state.next_trial_index, 3)

@@ -795,6 +874,20 @@
             ["throughput: higher", "ttft: lower"],
         )

+    def test_parse_proposal_text_accepts_wrapped_json(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            proposal = parse_proposal_text(
+                """Here is the proposal:
+```json
+{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
+```""",
+                study,
+            )
+            self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32)
+
     def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
         requests = [
             TraceRequest(
@@ -929,6 +1022,71 @@
         self.assertEqual(len(replayed), 2)
         self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")

+    def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:
+        requests = [
+            TraceRequest(
+                row_id="r0",
+                arrival_s=0.0,
+                sampling_u=0.1,
+                body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
+                prompt_tokens_hint=8,
+                completion_tokens_hint=4,
+            )
+        ]
+
+        class FakeFuture:
+            def result(self, timeout=None):
+                raise AssertionError("future should not be awaited after elapsed early stop")
+
+            def cancel(self):
+                return True
+
+        submitted = []
+
+        class FakeExecutor:
+            def __init__(self, max_workers):
+                self.max_workers = max_workers
+
+            def submit(self, fn, request, **kwargs):
+                submitted.append(request.row_id)
+                return FakeFuture()
+
+            def shutdown(self, wait=False, cancel_futures=True):
+                return None
+
+        wait_timeouts: list[float] = []
+
+        def fake_wait(futures, timeout=None, return_when=None):
+            wait_timeouts.append(timeout)
+            return set(), set(futures)
+
+        def fake_evaluate(outcome: RequestOutcome):
+            return type("Eval", (), {"passed": outcome.success})()
+
+        monotonic_values = iter([0.0, 0.0, 0.4, 1.2])
+
+        with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor):
+            with mock.patch("aituner.worker.wait", side_effect=fake_wait):
+                with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)):
+                    replayed, early_stopped, reason = _replay_requests(
+                        requests,
+                        base_url="http://127.0.0.1:8000",
+                        timeout_s=30.0,
+                        max_concurrency=1,
+                        target_pass_rate=0.95,
+                        max_lag_s=None,
+                        max_elapsed_s=1.0,
+                        evaluate_outcome=fake_evaluate,
+                    )
+
+        self.assertEqual(submitted, ["r0"])
+        self.assertTrue(early_stopped)
+        self.assertEqual(reason, "probe_elapsed_s>1.0")
+        self.assertEqual(len(replayed), 1)
+        self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")
+        self.assertTrue(wait_timeouts)
+        self.assertLessEqual(wait_timeouts[0], 0.5)
+
     def test_latency_summary_reports_quantiles_and_slo(self) -> None:
         study = load_study_spec(_write_study_assets(Path(tempfile.mkdtemp())))
         outcomes = [