{ "study_id": "dash0-qwen235b-prefill-thinking-run2-ttft-tight-topology", "hardware": { "gpu_count": 8, "gpu_model": "H20", "host_candidates": [ "dash0" ] }, "model": { "model_id": "qwen3-235b-a22b-256k-0717-internal", "served_model_name": "qwen3-235b-prefill" }, "engine": { "engine_name": "vllm", "engine_version": "internal-on-dash0", "exec_path": "/usr/local/bin/vllm", "cwd": "/home/admin/cpfs/wjh/aituner/aituner", "host": "127.0.0.1", "port": 18125, "healthcheck_path": "/v1/models", "ready_timeout_s": 1800, "request_timeout_s": 1800, "launch_args": [ "serve", "/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717" ], "base_envs": { "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", "VLLM_USE_V1": "1", "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", "VLLM_QUANTIZATION_LAYER_WISE": "1", "VLLM_MOE_USE_DEEPEP": "0", "VLLM_MOE_BALANCED_GATING": "0", "VLLM_MOE_RANDOM_GATING": "0", "VLLM_FUSED_MOE_CHUNK_SIZE": "4096", "VLLM_DP_META_USE_CPU_GROUP": "0", "VLLM_MLA_FP8_ATTENTION": "0", "VLLM_MOE_EXPERTS_OVERLAP": "0", "VLLM_USE_FLASHINFER_SAMPLER": "0", "VLLM_RESPONSE_TIMEOUT": "290", "VLLM_FP8_USE_BLADNN": "1", "VLLM_MOE_USE_BLADNN": "1", "VLLM_USE_DEEP_GEMM": "0", "VLLM_PD_TRY_CONNECT_TIMEOUT_SECONDS": "120", "VLLM_DEEP_GEMM_WARMUP": "skip", "DEEPEP_LL_COMBINE_USE_FP8": "1", "DEEPEP_LL_BUFFER_FP8_OPT": "1", "DEEPEP_LL_DISPATCH_USE_NVL": "1", "DEEPEP_LL_COMBINE_USE_NVL": "1", "ACCL_LOW_LATENCY_OPTIMIZE": "2", "ACCL_WRITEBATCH_OPT": "2", "ACCL_IBV_MTU": "9000", "ACCL_TX_DEPTH": "1024", "ACCL_RETRANSMIT_TIMEOUT": "17", "NVSHMEM_IBGDA_NUM_RC_PER_PE": "4", "BLLM_KVTRANS_RDMA_SP": "2", "NCCL_SOCKET_IFNAME": "eth1", "NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME": "eth1", "GLOO_SOCKET_IFNAME": "eth1" }, "base_flags": { "host": "127.0.0.1", "port": 18125, "served-model-name": "qwen3-235b-prefill", "tensor-parallel-size": 4, "gpu-memory-utilization": 0.85, "enable-prefix-caching": true, "enable-chunked-prefill": true, "max-num-batched-tokens": 8192, "disable-hybrid-kv-cache-manager": true, "max-model-len": 262144, "block-size": 64, "max-num-seqs": 64, "quantization": "fp8", "cuda-graph-sizes": [ 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024 ], "compilation-config": "{\"cudagraph_mode\":\"PIECEWISE\",\"use_inductor\":false,\"custom_ops\":[\"all\"],\"max_cudagraph_capture_size\":2048}", "speculative-config": "{\"method\":\"eagle3\",\"num_speculative_tokens\":1,\"hf_overrides\":{\"rope_scaling\":{\"type\":\"yarn\",\"factor\":128,\"original_max_position_embeddings\":2048,\"semi_dynamic\":false,\"dynamic\":true},\"num_experts\":0},\"model\":\"/home/admin/resource/model/464482ce.qwen3-235b-a22b/0717-eagle-0820\"}", "hf-overrides": "{\"architectures\":[\"Qwen3MoeForCausalLM\"],\"model_type\":\"qwen3_moe\"}", "kv-cache-dtype": "fp8", "disable-log-requests": true }, "tunable_envs": [ "VLLM_ENABLE_TORCH_COMPILE" ], "tunable_flags": [ "tensor-parallel-size", "data-parallel-size", "enable-expert-parallel", "expert-parallel-size", "gpu-memory-utilization", "max-num-batched-tokens", "max-num-seqs", "block-size", "enable-prefix-caching", "enable-chunked-prefill" ], "topology_constraints": { "require_tp_dp_product_equals_gpu_count": false, "require_ep_size_leq_tp_dp_product": true, "require_ep_size_divides_tp_dp_product": true, "require_enable_expert_parallel_when_ep_gt_one": true, "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true, "allowed_tp_dp_products": [ 4, 8 ], "allowed_tensor_parallel_sizes": [ 4, 8 ], "allowed_data_parallel_sizes": [ 1, 2 ], "allowed_expert_parallel_sizes": [ 1, 2, 4, 8 ] }, "python_executable": "python3" }, "trace": { "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", "window_id": "thinking_w20260327_1000", "request_mode": "chat", "completion_tokens_override": 1, "u_field": "sampling_u", "timestamp_field": "timestamp", "max_concurrency": 64, "replay_time_scale": 1.0, "early_stop_max_lag_s": 180.0, "early_stop_max_elapsed_s": 1200.0 }, "slo": { "target_pass_rate": 0.95, "ttft_rule": { "kind": "step_ms", "buckets": [ { "max_input_tokens": 8191, "threshold_ms": 2000 }, { "max_input_tokens": 32767, "threshold_ms": 4000 }, { "threshold_ms": 6000 } ] } }, "search": { "low": 0.0, "high": 0.125, "tolerance": 0.001, "max_probes": 6, "sample_seed": 20260325 }, "llm": { "system_prompt": "You are tuning a prefill-dominated vLLM serving stack. The trace replay forces completion length to exactly 1 token, so optimize for TTFT under the configured stepped SLO. Propose one launch-safe config patch that increases the maximum feasible sampling_u while respecting the topology constraints and avoiding known launch failures.", "max_history_trials": 8, "endpoint": { "provider": "codex", "model": "gpt-5.4", "stream": true, "api_key_env": "OPENAI_API_KEY", "timeout_s": 240 } } }