Prioritize topology exploration in decode tuning
This commit is contained in:
@@ -135,10 +135,7 @@
|
||||
"disable-log-requests": true
|
||||
},
|
||||
"tunable_envs": [
|
||||
-"VLLM_ENABLE_TORCH_COMPILE",
|
||||
-"VLLM_ENABLE_TBO_OPT",
|
||||
-"VLLM_USE_FLASHINFER_SAMPLER",
|
||||
-"CUDA_DEVICE_MAX_CONNECTIONS"
|
||||
+"VLLM_ENABLE_TORCH_COMPILE"
|
||||
],
|
||||
"tunable_flags": [
|
||||
"tensor-parallel-size",
|
||||
|
||||
Reference in New Issue
Block a user