From 8d0777e5e238e60b39d1d4835fc98d15f71cb449 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Fri, 10 Apr 2026 17:41:54 +0800 Subject: [PATCH] Add topology-aware qwen27b 0-8k tuning --- .../dash0_qwen27b_tight_slo_run4_0_8k.json | 25 +++-- src/aituner/llm.py | 24 ++++- src/aituner/spec.py | 5 + tests/test_core_flow.py | 98 +++++++++++++++++++ 4 files changed, 144 insertions(+), 8 deletions(-) diff --git a/configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json b/configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json index 34d15c2..00c394b 100644 --- a/configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json +++ b/configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json @@ -46,6 +46,7 @@ "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1", "VLLM_USE_FLASHINFER_SAMPLER": "0", + "VLLM_DP_MASTER_PORT": "9528", "VLLM_RESPONSE_TIMEOUT": "300", "VLLM_LOG_REQ_KV_LENS": "1", "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600" @@ -73,25 +74,36 @@ "skip-mm-profiling": true, "quantization": "fp8", "tensor-parallel-size": 4, + "data-parallel-size": 1, + "expert-parallel-size": 1, "max-num-seqs": 16, "disable-log-requests": true }, "tunable_envs": [ - "VLLM_ATTENTION_BACKEND", - "VLLM_ENABLE_TORCH_COMPILE", - "VLLM_USE_FLASHINFER_SAMPLER", - "VLLM_ENABLE_MODEL_RUNNER_WARMUP" + "VLLM_ENABLE_TORCH_COMPILE" ], "tunable_flags": [ "tensor-parallel-size", + "data-parallel-size", + "expert-parallel-size", "gpu-memory-utilization", "block-size", "max-num-batched-tokens", "max-num-seqs", "enable-prefix-caching", - "enable-chunked-prefill", - "disable-cascade-attn" + "enable-chunked-prefill" ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": false, + "require_ep_size_leq_tp_dp_product": true, + "require_ep_size_divides_tp_dp_product": true, + "require_enable_expert_parallel_when_ep_gt_one": true, + "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true, + "allowed_tp_dp_products": [1, 2, 4, 8], + "allowed_tensor_parallel_sizes": [1, 2, 4], + "allowed_data_parallel_sizes": [1, 2, 4, 8], + "allowed_expert_parallel_sizes": [1] + }, "python_executable": "python3" }, "trace": { @@ -144,6 +156,7 @@ "endpoint": { "provider": "codex", "model": "gpt-5.4", + "stream": true, "api_key_env": "OPENAI_API_KEY", "timeout_s": 180 } diff --git a/src/aituner/llm.py b/src/aituner/llm.py index 028bad1..6658a2e 100644 --- a/src/aituner/llm.py +++ b/src/aituner/llm.py @@ -124,7 +124,15 @@ def _enumerate_parallel_candidates(study: StudySpec) -> list[dict[str, int | boo for tp in sorted(set(tp_values)): for dp in sorted(set(dp_values)): tp_dp_product = tp * dp - if tp_dp_product > study.hardware.gpu_count: + if ( + constraints.allowed_tp_dp_products + and tp_dp_product not in constraints.allowed_tp_dp_products + ): + continue + if ( + not constraints.allowed_tp_dp_products + and tp_dp_product > study.hardware.gpu_count + ): continue if ( constraints.require_tp_dp_product_equals_gpu_count @@ -342,7 +350,19 @@ def validate_proposal(proposal: Proposal, study: StudySpec) -> Proposal: f"expert-parallel-size={effective_ep} not in {constraints.allowed_expert_parallel_sizes}." ) tp_dp_product = topology["tp_dp_product"] - if tp_dp_product > study.hardware.gpu_count: + if ( + constraints.allowed_tp_dp_products + and tp_dp_product not in constraints.allowed_tp_dp_products + ): + raise SpecError( + "Proposal violates topology constraints: " + f"tensor-parallel-size * data-parallel-size = {tp_dp_product} not in " + f"{constraints.allowed_tp_dp_products}." + ) + if ( + not constraints.allowed_tp_dp_products + and tp_dp_product > study.hardware.gpu_count + ): raise SpecError( "Proposal violates topology constraints: " f"tensor-parallel-size * data-parallel-size = {tp_dp_product} exceeds " diff --git a/src/aituner/spec.py b/src/aituner/spec.py index 7648a8c..da5acb2 100644 --- a/src/aituner/spec.py +++ b/src/aituner/spec.py @@ -225,6 +225,7 @@ class TopologyConstraintSpec: require_ep_size_divides_tp_dp_product: bool = False require_enable_expert_parallel_when_ep_gt_one: bool = True validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter: bool = True + allowed_tp_dp_products: list[int] = field(default_factory=list) allowed_tensor_parallel_sizes: list[int] = field(default_factory=list) allowed_data_parallel_sizes: list[int] = field(default_factory=list) allowed_expert_parallel_sizes: list[int] = field(default_factory=list) @@ -255,6 +256,10 @@ class TopologyConstraintSpec: ), context="engine.topology_constraints.validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter", ), + allowed_tp_dp_products=_coerce_int_list( + data.get("allowed_tp_dp_products"), + context="engine.topology_constraints.allowed_tp_dp_products", + ), allowed_tensor_parallel_sizes=_coerce_int_list( data.get("allowed_tensor_parallel_sizes"), context="engine.topology_constraints.allowed_tensor_parallel_sizes", diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py index 3eb0ac4..770d8f0 100644 --- a/tests/test_core_flow.py +++ b/tests/test_core_flow.py @@ -1279,6 +1279,104 @@ class CoreFlowTests(unittest.TestCase): validated = validate_proposal(proposal, study) self.assertEqual(validated.config_patch.flag_patch["tensor-parallel-size"], 2) + def test_validate_proposal_accepts_allowed_tp_dp_product_above_gpu_count(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets( + tmp_path, + engine_overrides={ + "base_flags": { + "host": "127.0.0.1", + "port": 8000, + "enable-expert-parallel": False, + "tensor-parallel-size": 4, + "data-parallel-size": 1, + "expert-parallel-size": 1, + }, + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "expert-parallel-size", + ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": False, + "require_ep_size_leq_tp_dp_product": True, + "require_ep_size_divides_tp_dp_product": True, + "allowed_tp_dp_products": [1, 2, 4, 8], + "allowed_tensor_parallel_sizes": [1, 2, 4, 8], + "allowed_data_parallel_sizes": [1, 2, 4, 8], + "allowed_expert_parallel_sizes": [1], + }, + }, + ) + study = load_study_spec(study_path) + proposal = Proposal.from_dict( + { + "observation": "Obs", + "diagnosis": "Allow product 8", + "config_patch": { + "env_patch": {}, + "flag_patch": { + "tensor-parallel-size": 4, + "data-parallel-size": 2, + "expert-parallel-size": 1, + }, + }, + "expected_effects": ["explore larger topology"], + } + ) + validated = validate_proposal(proposal, study) + self.assertEqual(validated.config_patch.flag_patch["data-parallel-size"], 2) + + def test_validate_proposal_rejects_tp_dp_product_outside_allowed_set(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets( + tmp_path, + engine_overrides={ + "base_flags": { + "host": "127.0.0.1", + "port": 8000, + "enable-expert-parallel": False, + "tensor-parallel-size": 4, + "data-parallel-size": 1, + "expert-parallel-size": 1, + }, + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "expert-parallel-size", + ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": False, + "require_ep_size_leq_tp_dp_product": True, + "require_ep_size_divides_tp_dp_product": True, + "allowed_tp_dp_products": [1, 2, 4, 8], + "allowed_tensor_parallel_sizes": [1, 2, 3, 4, 8], + "allowed_data_parallel_sizes": [1, 2, 3, 4, 8], + "allowed_expert_parallel_sizes": [1], + }, + }, + ) + study = load_study_spec(study_path) + proposal = Proposal.from_dict( + { + "observation": "Obs", + "diagnosis": "Invalid product", + "config_patch": { + "env_patch": {}, + "flag_patch": { + "tensor-parallel-size": 3, + "data-parallel-size": 2, + "expert-parallel-size": 1, + }, + }, + "expected_effects": ["explore invalid topology"], + } + ) + with self.assertRaisesRegex(SpecError, "not in \\[1, 2, 4, 8\\]"): + validate_proposal(proposal, study) + def test_cli_tune_runs_multiple_manual_proposals(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp)