Add topology-aware qwen27b 0-8k tuning
This commit is contained in:
@@ -46,6 +46,7 @@
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
|
||||
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER": "0",
|
||||
"VLLM_DP_MASTER_PORT": "9528",
|
||||
"VLLM_RESPONSE_TIMEOUT": "300",
|
||||
"VLLM_LOG_REQ_KV_LENS": "1",
|
||||
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
|
||||
@@ -73,25 +74,36 @@
|
||||
"skip-mm-profiling": true,
|
||||
"quantization": "fp8",
|
||||
"tensor-parallel-size": 4,
|
||||
"data-parallel-size": 1,
|
||||
"expert-parallel-size": 1,
|
||||
"max-num-seqs": 16,
|
||||
"disable-log-requests": true
|
||||
},
|
||||
"tunable_envs": [
|
||||
"VLLM_ATTENTION_BACKEND",
|
||||
"VLLM_ENABLE_TORCH_COMPILE",
|
||||
"VLLM_USE_FLASHINFER_SAMPLER",
|
||||
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
|
||||
"VLLM_ENABLE_TORCH_COMPILE"
|
||||
],
|
||||
"tunable_flags": [
|
||||
"tensor-parallel-size",
|
||||
"data-parallel-size",
|
||||
"expert-parallel-size",
|
||||
"gpu-memory-utilization",
|
||||
"block-size",
|
||||
"max-num-batched-tokens",
|
||||
"max-num-seqs",
|
||||
"enable-prefix-caching",
|
||||
"enable-chunked-prefill",
|
||||
"disable-cascade-attn"
|
||||
"enable-chunked-prefill"
|
||||
],
|
||||
"topology_constraints": {
|
||||
"require_tp_dp_product_equals_gpu_count": false,
|
||||
"require_ep_size_leq_tp_dp_product": true,
|
||||
"require_ep_size_divides_tp_dp_product": true,
|
||||
"require_enable_expert_parallel_when_ep_gt_one": true,
|
||||
"validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
|
||||
"allowed_tp_dp_products": [1, 2, 4, 8],
|
||||
"allowed_tensor_parallel_sizes": [1, 2, 4],
|
||||
"allowed_data_parallel_sizes": [1, 2, 4, 8],
|
||||
"allowed_expert_parallel_sizes": [1]
|
||||
},
|
||||
"python_executable": "python3"
|
||||
},
|
||||
"trace": {
|
||||
@@ -144,6 +156,7 @@
|
||||
"endpoint": {
|
||||
"provider": "codex",
|
||||
"model": "gpt-5.4",
|
||||
"stream": true,
|
||||
"api_key_env": "OPENAI_API_KEY",
|
||||
"timeout_s": 180
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user