{ "study_id": "dash0-qwen27b-tight-slo-10min-run4-chat-0-8k", "hardware": { "gpu_count": 4, "gpu_model": "H20", "host_candidates": [ "dash0" ] }, "model": { "model_id": "qwen3.5-27b-256k-0223-internal", "served_model_name": "qwen35-27b-aituner" }, "engine": { "engine_name": "vllm", "engine_version": "latest-release-on-dash0", "exec_path": "/usr/local/bin/vllm", "cwd": "/home/admin/cpfs/wjh/aituner/aituner", "host": "127.0.0.1", "port": 18082, "healthcheck_path": "/v1/models", "ready_timeout_s": 900, "request_timeout_s": 900, "launch_args": [ "serve", "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal" ], "base_envs": { "CUDA_VISIBLE_DEVICES": "4,5,6,7", "VLLM_DISABLE_COMPILE_CACHE": "1", "DS_LLM_IGNORE_WARMUP": "1", "DS_LLM_IGNORE_CHECK_WARMUP": "1", "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1", "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0", "PARAM_TOTAL_MAX": "262144", "PARAM_IN_LENGTH_MAX": "262144", "PARAM_MAX_LENGTH_MAX": "131072", "DS_LLM_MAX_THINK_TOKENS": "81920", "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600", "VLLM_FP8_USE_BLADNN": "1", "VLLM_MOE_USE_BLADNN": "1", "VLLM_GDN_USE_BLADNN": "0", "VLLM_USE_V1": "1", "VLLM_IS_HYBRID_MODEL": "1", "VLLM_ENABLE_TORCH_COMPILE": "1", "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1", "VLLM_USE_FLASHINFER_SAMPLER": "0", "VLLM_RESPONSE_TIMEOUT": "300", "VLLM_LOG_REQ_KV_LENS": "1", "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600" }, "base_flags": { "host": "127.0.0.1", "port": 18082, "served-model-name": "qwen35-27b-aituner", "trust-remote-code": true, "dtype": "bfloat16", "gpu-memory-utilization": 0.9, "enable-prefix-caching": true, "mamba-cache-mode": "light", "distributed-executor-backend": "mp", "block-size": 64, "enable-chunked-prefill": true, "max-num-batched-tokens": 8192, "disable-cascade-attn": true, "max-model-len": 262144, "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}", "mm-processor-cache-gb": 0, "limit-mm-per-prompt": "{\"image\":256,\"video\":64}", "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}", "mamba-cache-dtype": "float32", "skip-mm-profiling": true, "quantization": "fp8", "tensor-parallel-size": 4, "max-num-seqs": 16, "disable-log-requests": true }, "tunable_envs": [ "VLLM_ATTENTION_BACKEND", "VLLM_ENABLE_TORCH_COMPILE", "VLLM_USE_FLASHINFER_SAMPLER", "VLLM_ENABLE_MODEL_RUNNER_WARMUP" ], "tunable_flags": [ "tensor-parallel-size", "gpu-memory-utilization", "block-size", "max-num-batched-tokens", "max-num-seqs", "enable-prefix-caching", "enable-chunked-prefill", "disable-cascade-attn" ], "python_executable": "python3" }, "trace": { "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", "window_id": "chat_w20260311_1000", "u_field": "sampling_u", "timestamp_field": "timestamp", "max_concurrency": 32, "input_length_filter": { "min_input_tokens": 0, "max_input_tokens": 8192 }, "replay_time_scale": 1.0, "early_stop_max_lag_s": 120.0, "early_stop_max_elapsed_s": 900.0 }, "slo": { "target_pass_rate": 0.95, "ttft_rule": { "kind": "step_ms", "buckets": [ { "max_input_tokens": 4096, "threshold_ms": 2000 }, { "max_input_tokens": 32768, "threshold_ms": 4000 }, { "threshold_ms": 6000 } ] }, "tpot_rule": { "kind": "fixed_ms", "threshold_ms": 50 } }, "search": { "low": 0.0, "high": 0.0625, "tolerance": 0.001, "max_probes": 6, "sample_seed": 20260325 }, "llm": { "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.", "max_history_trials": 8, "endpoint": { "provider": "codex", "model": "gpt-5.4", "api_key_env": "OPENAI_API_KEY", "timeout_s": 180 } } }