diff --git a/configs/examples/dash0_qwen27b_tp_ab.json b/configs/examples/dash0_qwen27b_tp_ab.json new file mode 100644 index 0000000..9cb15f0 --- /dev/null +++ b/configs/examples/dash0_qwen27b_tp_ab.json @@ -0,0 +1,186 @@ +{ + "study_id": "dash0-qwen27b-tp-ab-chat-0-8k", + "hardware": { + "gpu_count": 8, + "gpu_model": "H20", + "host_candidates": [ + "dash0" + ] + }, + "model": { + "model_id": "qwen3.5-27b-256k-0223-internal", + "served_model_name": "qwen35-27b-aituner" + }, + "engine": { + "engine_name": "vllm", + "engine_version": "latest-release-on-dash0", + "exec_path": "/usr/local/bin/vllm", + "cwd": "/home/admin/cpfs/wjh/aituner/aituner", + "host": "127.0.0.1", + "port": 18082, + "healthcheck_path": "/v1/models", + "ready_timeout_s": 900, + "request_timeout_s": 900, + "launch_args": [ + "serve", + "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal" + ], + "base_envs": { + "VLLM_DISABLE_COMPILE_CACHE": "1", + "DS_LLM_IGNORE_WARMUP": "1", + "DS_LLM_IGNORE_CHECK_WARMUP": "1", + "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1", + "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0", + "PARAM_TOTAL_MAX": "262144", + "PARAM_IN_LENGTH_MAX": "262144", + "PARAM_MAX_LENGTH_MAX": "131072", + "DS_LLM_MAX_THINK_TOKENS": "81920", + "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600", + "VLLM_FP8_USE_BLADNN": "1", + "VLLM_MOE_USE_BLADNN": "1", + "VLLM_GDN_USE_BLADNN": "0", + "VLLM_USE_V1": "1", + "VLLM_IS_HYBRID_MODEL": "1", + "VLLM_ENABLE_TORCH_COMPILE": "1", + "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", + "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1", + "VLLM_USE_FLASHINFER_SAMPLER": "0", + "VLLM_DP_MASTER_PORT": "9528", + "VLLM_RESPONSE_TIMEOUT": "300", + "VLLM_LOG_REQ_KV_LENS": "1", + "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600" + }, + "base_flags": { + "host": "127.0.0.1", + "port": 18082, + "served-model-name": "qwen35-27b-aituner", + "trust-remote-code": true, + "dtype": "bfloat16", + "gpu-memory-utilization": 0.9, + "enable-prefix-caching": true, + "mamba-cache-mode": "light", + "distributed-executor-backend": "mp", + "block-size": 64, + "enable-chunked-prefill": true, + "max-num-batched-tokens": 8192, + "disable-cascade-attn": true, + "max-model-len": 262144, + "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}", + "mm-processor-cache-gb": 0, + "limit-mm-per-prompt": "{\"image\":256,\"video\":64}", + "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}", + "mamba-cache-dtype": "float32", + "skip-mm-profiling": true, + "quantization": "fp8", + "tensor-parallel-size": 1, + "disable-log-requests": true + }, + "tunable_envs": [ + "VLLM_ENABLE_TORCH_COMPILE" + ], + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "expert-parallel-size", + "gpu-memory-utilization", + "block-size", + "max-num-batched-tokens", + "max-num-seqs", + "enable-prefix-caching", + "enable-chunked-prefill" + ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": false, + "require_ep_size_leq_tp_dp_product": true, + "require_ep_size_divides_tp_dp_product": true, + "require_enable_expert_parallel_when_ep_gt_one": true, + "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true, + "allowed_tp_dp_products": [ + 1, + 2, + 4, + 8 + ], + "allowed_tensor_parallel_sizes": [ + 1, + 2, + 4, + 8 + ], + "allowed_data_parallel_sizes": [ + 1, + 2, + 4, + 8 + ], + "allowed_expert_parallel_sizes": [ + 1 + ] + }, + "python_executable": "python3" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "chat_w20260311_1000", + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 32, + "input_length_filter": { + "min_input_tokens": 0, + "max_input_tokens": 8192 + }, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 120.0, + "early_stop_max_elapsed_s": 900.0, + "adaptive_stop": { + "enabled": true, + "tau": 0.9, + "tau_c": 0.9, + "stable_checks": 3, + "max_checks": 20, + "min_fraction": 0.1, + "boundary_delta": 0.02 + } + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 4096, + "threshold_ms": 2000 + }, + { + "max_input_tokens": 32768, + "threshold_ms": 4000 + }, + { + "threshold_ms": 6000 + } + ] + }, + "tpot_rule": { + "kind": "fixed_ms", + "threshold_ms": 50 + } + }, + "search": { + "low": 0.0, + "high": 0.25, + "tolerance": 0.001, + "max_probes": 7, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.", + "max_history_trials": 8, + "endpoint": { + "provider": "codex", + "model": "gpt-5.4", + "stream": true, + "api_key_env": "OPENAI_API_KEY", + "timeout_s": 180 + } + } +} \ No newline at end of file diff --git a/configs/examples/stopb_27b_ab/p1_tp1.json b/configs/examples/stopb_27b_ab/p1_tp1.json new file mode 100644 index 0000000..bdb5778 --- /dev/null +++ b/configs/examples/stopb_27b_ab/p1_tp1.json @@ -0,0 +1,13 @@ +{ + "observation": "baseline TP1 (deployed flags)", + "diagnosis": "deterministic TP A/B point", + "config_patch": { + "env_patch": {}, + "flag_patch": {} + }, + "expected_effects": [ + "measure peak request_rate_per_gpu at this topology" + ], + "why_not_previous_failures": "n/a", + "should_stop": false +} \ No newline at end of file diff --git a/configs/examples/stopb_27b_ab/p2_tp2.json b/configs/examples/stopb_27b_ab/p2_tp2.json new file mode 100644 index 0000000..1aa7bea --- /dev/null +++ b/configs/examples/stopb_27b_ab/p2_tp2.json @@ -0,0 +1,15 @@ +{ + "observation": "TP2", + "diagnosis": "deterministic TP A/B point", + "config_patch": { + "env_patch": {}, + "flag_patch": { + "tensor-parallel-size": 2 + } + }, + "expected_effects": [ + "measure peak request_rate_per_gpu at this topology" + ], + "why_not_previous_failures": "n/a", + "should_stop": false +} \ No newline at end of file diff --git a/configs/examples/stopb_27b_ab/p3_tp4.json b/configs/examples/stopb_27b_ab/p3_tp4.json new file mode 100644 index 0000000..7e721c6 --- /dev/null +++ b/configs/examples/stopb_27b_ab/p3_tp4.json @@ -0,0 +1,15 @@ +{ + "observation": "TP4", + "diagnosis": "deterministic TP A/B point", + "config_patch": { + "env_patch": {}, + "flag_patch": { + "tensor-parallel-size": 4 + } + }, + "expected_effects": [ + "measure peak request_rate_per_gpu at this topology" + ], + "why_not_previous_failures": "n/a", + "should_stop": false +} \ No newline at end of file