Add topology-aware tuning constraints

2026-04-09 21:07:51 +08:00
parent 7371d6635c
commit ef78fe7eb5
6 changed files with 506 additions and 2 deletions
--- a/configs/examples/dash0_qwen235b_decode_thinking_run2_tpot40.json
+++ b/configs/examples/dash0_qwen235b_decode_thinking_run2_tpot40.json
@@ -141,11 +141,24 @@
      "CUDA_DEVICE_MAX_CONNECTIONS"
    ],
    "tunable_flags": [
+      "tensor-parallel-size",
+      "data-parallel-size",
+      "expert-parallel-size",
      "gpu-memory-utilization",
      "max-num-batched-tokens",
      "max-num-seqs",
      "block-size"
    ],
+    "topology_constraints": {
+      "require_tp_dp_product_equals_gpu_count": true,
+      "require_ep_size_leq_tp_dp_product": true,
+      "require_ep_size_divides_tp_dp_product": true,
+      "require_enable_expert_parallel_when_ep_gt_one": true,
+      "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
+      "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
+      "allowed_data_parallel_sizes": [1, 2, 4, 8],
+      "allowed_expert_parallel_sizes": [1, 2, 4, 8]
+    },
    "python_executable": "python3"
  },
  "trace": {