Add topology-aware qwen27b 0-8k tuning

This commit is contained in:
2026-04-10 17:41:54 +08:00
parent b960607d8f
commit 8d0777e5e2
4 changed files with 144 additions and 8 deletions

View File

@@ -46,6 +46,7 @@
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN", "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1", "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0", "VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_DP_MASTER_PORT": "9528",
"VLLM_RESPONSE_TIMEOUT": "300", "VLLM_RESPONSE_TIMEOUT": "300",
"VLLM_LOG_REQ_KV_LENS": "1", "VLLM_LOG_REQ_KV_LENS": "1",
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600" "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
@@ -73,25 +74,36 @@
"skip-mm-profiling": true, "skip-mm-profiling": true,
"quantization": "fp8", "quantization": "fp8",
"tensor-parallel-size": 4, "tensor-parallel-size": 4,
"data-parallel-size": 1,
"expert-parallel-size": 1,
"max-num-seqs": 16, "max-num-seqs": 16,
"disable-log-requests": true "disable-log-requests": true
}, },
"tunable_envs": [ "tunable_envs": [
"VLLM_ATTENTION_BACKEND", "VLLM_ENABLE_TORCH_COMPILE"
"VLLM_ENABLE_TORCH_COMPILE",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
], ],
"tunable_flags": [ "tunable_flags": [
"tensor-parallel-size", "tensor-parallel-size",
"data-parallel-size",
"expert-parallel-size",
"gpu-memory-utilization", "gpu-memory-utilization",
"block-size", "block-size",
"max-num-batched-tokens", "max-num-batched-tokens",
"max-num-seqs", "max-num-seqs",
"enable-prefix-caching", "enable-prefix-caching",
"enable-chunked-prefill", "enable-chunked-prefill"
"disable-cascade-attn"
], ],
"topology_constraints": {
"require_tp_dp_product_equals_gpu_count": false,
"require_ep_size_leq_tp_dp_product": true,
"require_ep_size_divides_tp_dp_product": true,
"require_enable_expert_parallel_when_ep_gt_one": true,
"validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
"allowed_tp_dp_products": [1, 2, 4, 8],
"allowed_tensor_parallel_sizes": [1, 2, 4],
"allowed_data_parallel_sizes": [1, 2, 4, 8],
"allowed_expert_parallel_sizes": [1]
},
"python_executable": "python3" "python_executable": "python3"
}, },
"trace": { "trace": {
@@ -144,6 +156,7 @@
"endpoint": { "endpoint": {
"provider": "codex", "provider": "codex",
"model": "gpt-5.4", "model": "gpt-5.4",
"stream": true,
"api_key_env": "OPENAI_API_KEY", "api_key_env": "OPENAI_API_KEY",
"timeout_s": 180 "timeout_s": 180
} }

View File

@@ -124,7 +124,15 @@ def _enumerate_parallel_candidates(study: StudySpec) -> list[dict[str, int | boo
for tp in sorted(set(tp_values)): for tp in sorted(set(tp_values)):
for dp in sorted(set(dp_values)): for dp in sorted(set(dp_values)):
tp_dp_product = tp * dp tp_dp_product = tp * dp
if tp_dp_product > study.hardware.gpu_count: if (
constraints.allowed_tp_dp_products
and tp_dp_product not in constraints.allowed_tp_dp_products
):
continue
if (
not constraints.allowed_tp_dp_products
and tp_dp_product > study.hardware.gpu_count
):
continue continue
if ( if (
constraints.require_tp_dp_product_equals_gpu_count constraints.require_tp_dp_product_equals_gpu_count
@@ -342,7 +350,19 @@ def validate_proposal(proposal: Proposal, study: StudySpec) -> Proposal:
f"expert-parallel-size={effective_ep} not in {constraints.allowed_expert_parallel_sizes}." f"expert-parallel-size={effective_ep} not in {constraints.allowed_expert_parallel_sizes}."
) )
tp_dp_product = topology["tp_dp_product"] tp_dp_product = topology["tp_dp_product"]
if tp_dp_product > study.hardware.gpu_count: if (
constraints.allowed_tp_dp_products
and tp_dp_product not in constraints.allowed_tp_dp_products
):
raise SpecError(
"Proposal violates topology constraints: "
f"tensor-parallel-size * data-parallel-size = {tp_dp_product} not in "
f"{constraints.allowed_tp_dp_products}."
)
if (
not constraints.allowed_tp_dp_products
and tp_dp_product > study.hardware.gpu_count
):
raise SpecError( raise SpecError(
"Proposal violates topology constraints: " "Proposal violates topology constraints: "
f"tensor-parallel-size * data-parallel-size = {tp_dp_product} exceeds " f"tensor-parallel-size * data-parallel-size = {tp_dp_product} exceeds "

View File

@@ -225,6 +225,7 @@ class TopologyConstraintSpec:
require_ep_size_divides_tp_dp_product: bool = False require_ep_size_divides_tp_dp_product: bool = False
require_enable_expert_parallel_when_ep_gt_one: bool = True require_enable_expert_parallel_when_ep_gt_one: bool = True
validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter: bool = True validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter: bool = True
allowed_tp_dp_products: list[int] = field(default_factory=list)
allowed_tensor_parallel_sizes: list[int] = field(default_factory=list) allowed_tensor_parallel_sizes: list[int] = field(default_factory=list)
allowed_data_parallel_sizes: list[int] = field(default_factory=list) allowed_data_parallel_sizes: list[int] = field(default_factory=list)
allowed_expert_parallel_sizes: list[int] = field(default_factory=list) allowed_expert_parallel_sizes: list[int] = field(default_factory=list)
@@ -255,6 +256,10 @@ class TopologyConstraintSpec:
), ),
context="engine.topology_constraints.validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter", context="engine.topology_constraints.validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter",
), ),
allowed_tp_dp_products=_coerce_int_list(
data.get("allowed_tp_dp_products"),
context="engine.topology_constraints.allowed_tp_dp_products",
),
allowed_tensor_parallel_sizes=_coerce_int_list( allowed_tensor_parallel_sizes=_coerce_int_list(
data.get("allowed_tensor_parallel_sizes"), data.get("allowed_tensor_parallel_sizes"),
context="engine.topology_constraints.allowed_tensor_parallel_sizes", context="engine.topology_constraints.allowed_tensor_parallel_sizes",

View File

@@ -1279,6 +1279,104 @@ class CoreFlowTests(unittest.TestCase):
validated = validate_proposal(proposal, study) validated = validate_proposal(proposal, study)
self.assertEqual(validated.config_patch.flag_patch["tensor-parallel-size"], 2) self.assertEqual(validated.config_patch.flag_patch["tensor-parallel-size"], 2)
def test_validate_proposal_accepts_allowed_tp_dp_product_above_gpu_count(self) -> None:
    # Scenario: the study whitelists TP*DP products explicitly, and the
    # proposal requests tensor-parallel-size=4 with data-parallel-size=2
    # (product 8). Validation is expected to succeed and keep the patch.
    # NOTE(review): the test name implies the fixture's GPU count is below 8;
    # that default lives in _write_study_assets -- confirm there.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": False,
            "tensor-parallel-size": 4,
            "data-parallel-size": 1,
            "expert-parallel-size": 1,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": False,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tp_dp_products": [1, 2, 4, 8],
            "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 4, 8],
            "allowed_expert_parallel_sizes": [1],
        },
    }
    proposal_payload = {
        "observation": "Obs",
        "diagnosis": "Allow product 8",
        "config_patch": {
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 4,
                "data-parallel-size": 2,
                "expert-parallel-size": 1,
            },
        },
        "expected_effects": ["explore larger topology"],
    }

    # Everything runs while the temporary directory still exists, since the
    # loaded study is built from assets written into it.
    with tempfile.TemporaryDirectory() as workdir:
        spec_path = _write_study_assets(
            Path(workdir),
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(spec_path)
        candidate = Proposal.from_dict(proposal_payload)
        accepted = validate_proposal(candidate, study)
        patched_flags = accepted.config_patch.flag_patch
        self.assertEqual(patched_flags["data-parallel-size"], 2)
def test_validate_proposal_rejects_tp_dp_product_outside_allowed_set(self) -> None:
    # Scenario: tensor-parallel-size=3 and data-parallel-size=2 are each
    # individually whitelisted, but their product (6) is absent from
    # allowed_tp_dp_products, so validation must raise SpecError.
    engine_overrides = {
        "base_flags": {
            "host": "127.0.0.1",
            "port": 8000,
            "enable-expert-parallel": False,
            "tensor-parallel-size": 4,
            "data-parallel-size": 1,
            "expert-parallel-size": 1,
        },
        "tunable_flags": [
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        ],
        "topology_constraints": {
            "require_tp_dp_product_equals_gpu_count": False,
            "require_ep_size_leq_tp_dp_product": True,
            "require_ep_size_divides_tp_dp_product": True,
            "allowed_tp_dp_products": [1, 2, 4, 8],
            "allowed_tensor_parallel_sizes": [1, 2, 3, 4, 8],
            "allowed_data_parallel_sizes": [1, 2, 3, 4, 8],
            "allowed_expert_parallel_sizes": [1],
        },
    }
    proposal_payload = {
        "observation": "Obs",
        "diagnosis": "Invalid product",
        "config_patch": {
            "env_patch": {},
            "flag_patch": {
                "tensor-parallel-size": 3,
                "data-parallel-size": 2,
                "expert-parallel-size": 1,
            },
        },
        "expected_effects": ["explore invalid topology"],
    }
    # The pattern pins the product whitelist echoed in the error text; the
    # per-dimension lists above contain 3 and therefore cannot match it.
    expected_message = "not in \\[1, 2, 4, 8\\]"

    with tempfile.TemporaryDirectory() as workdir:
        spec_path = _write_study_assets(
            Path(workdir),
            engine_overrides=engine_overrides,
        )
        study = load_study_spec(spec_path)
        candidate = Proposal.from_dict(proposal_payload)
        with self.assertRaisesRegex(SpecError, expected_message):
            validate_proposal(candidate, study)
def test_cli_tune_runs_multiple_manual_proposals(self) -> None: def test_cli_tune_runs_multiple_manual_proposals(self) -> None:
with tempfile.TemporaryDirectory() as tmp: with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp) tmp_path = Path(tmp)