diff --git a/configs/examples/dash0_qwen30b_a3b_stopA_on.json b/configs/examples/dash0_qwen30b_a3b_stopA_on.json new file mode 100644 index 0000000..5db891c --- /dev/null +++ b/configs/examples/dash0_qwen30b_a3b_stopA_on.json @@ -0,0 +1,147 @@ +{ + "study_id": "dash0-qwen30b-a3b-stopA-on-chat-0-8k", + "hardware": { + "gpu_count": 8, + "gpu_model": "H20", + "host_candidates": [ + "dash0" + ] + }, + "model": { + "model_id": "Qwen/Qwen3-30B-A3B", + "served_model_name": "qwen3-30b-a3b-community" + }, + "engine": { + "engine_name": "vllm", + "engine_version": "0.20.0", + "exec_path": "/usr/local/bin/vllm", + "cwd": "/home/admin/cpfs/wjh/aituner/aituner", + "host": "127.0.0.1", + "port": 18230, + "healthcheck_path": "/v1/models", + "ready_timeout_s": 900, + "request_timeout_s": 900, + "launch_args": [ + "serve", + "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B" + ], + "base_envs": { + "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "HOME": "/tmp/wjh", + "XDG_CACHE_HOME": "/tmp/wjh/.cache" + }, + "base_flags": { + "host": "127.0.0.1", + "port": 18230, + "served-model-name": "qwen3-30b-a3b-community", + "gpu-memory-utilization": 0.9, + "max-model-len": 16384, + "trust-remote-code": true, + "enable-prefix-caching": true + }, + "tunable_envs": [], + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "enable-expert-parallel", + "expert-parallel-size", + "gpu-memory-utilization", + "max-num-batched-tokens", + "max-num-seqs", + "block-size", + "enable-prefix-caching", + "enable-chunked-prefill" + ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": false, + "require_ep_size_leq_tp_dp_product": true, + "require_ep_size_divides_tp_dp_product": true, + "require_enable_expert_parallel_when_ep_gt_one": true, + "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true, + "allowed_tp_dp_products": [ + 1, + 2, + 4, + 8 + ], + "allowed_tensor_parallel_sizes": [ + 1, + 2, + 4, + 8 + ], + "allowed_data_parallel_sizes": [ + 1, + 2, + 4, + 8 + ], + "allowed_expert_parallel_sizes": [ + 1, + 2, + 4, + 8 + ] + }, + "python_executable": "/tmp/wjh/venvs/vllm-0.20.0-cu129/bin/python" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "chat_w20260311_1000", + "completion_tokens_override": 128, + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 64, + "input_length_filter": { + "min_input_tokens": 0, + "max_input_tokens": 8192 + }, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 120.0, + "early_stop_max_elapsed_s": 900.0, + "adaptive_stop": { + "enabled": true, + "tau": 0.9, + "tau_c": 0.9, + "stable_checks": 3, + "max_checks": 20, + "min_fraction": 0.1, + "boundary_delta": 0.02 + } + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 4096, + "threshold_ms": 2000 + }, + { + "max_input_tokens": 32768, + "threshold_ms": 4000 + }, + { + "threshold_ms": 6000 + } + ] + }, + "tpot_rule": { + "kind": "fixed_ms", + "threshold_ms": 50 + } + }, + "search": { + "low": 0.0, + "high": 0.125, + "tolerance": 0.001, + "max_probes": 4, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.", + "max_history_trials": 8, + "use_harness": false + } +} \ No newline at end of file