From 4a64196a994f97f85e2f9e05ae4c9b098487c349 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Tue, 16 Jun 2026 09:08:46 +0800 Subject: [PATCH] Add 27B Stop-B agentic-loop config (harness-driven, GPUs 2-7) Co-Authored-By: Claude Opus 4.8 --- .../examples/dash0_qwen27b_stopB_loop.json | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 configs/examples/dash0_qwen27b_stopB_loop.json diff --git a/configs/examples/dash0_qwen27b_stopB_loop.json b/configs/examples/dash0_qwen27b_stopB_loop.json new file mode 100644 index 0000000..3169526 --- /dev/null +++ b/configs/examples/dash0_qwen27b_stopB_loop.json @@ -0,0 +1,177 @@ +{ + "study_id": "dash0-qwen27b-stopB-loop-chat-0-8k", + "hardware": { + "gpu_count": 8, + "gpu_model": "H20", + "host_candidates": [ + "dash0" + ] + }, + "model": { + "model_id": "qwen3.5-27b-256k-0223-internal", + "served_model_name": "qwen35-27b-aituner" + }, + "engine": { + "engine_name": "vllm", + "engine_version": "latest-release-on-dash0", + "exec_path": "/usr/local/bin/vllm", + "cwd": "/home/admin/cpfs/wjh/aituner/aituner", + "host": "127.0.0.1", + "port": 18082, + "healthcheck_path": "/v1/models", + "ready_timeout_s": 900, + "request_timeout_s": 180, + "launch_args": [ + "serve", + "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal" + ], + "base_envs": { + "VLLM_DISABLE_COMPILE_CACHE": "1", + "DS_LLM_IGNORE_WARMUP": "1", + "DS_LLM_IGNORE_CHECK_WARMUP": "1", + "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1", + "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0", + "PARAM_TOTAL_MAX": "262144", + "PARAM_IN_LENGTH_MAX": "262144", + "PARAM_MAX_LENGTH_MAX": "131072", + "DS_LLM_MAX_THINK_TOKENS": "81920", + "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600", + "VLLM_FP8_USE_BLADNN": "1", + "VLLM_MOE_USE_BLADNN": "1", + "VLLM_GDN_USE_BLADNN": "0", + "VLLM_USE_V1": "1", + "VLLM_IS_HYBRID_MODEL": "1", + "VLLM_ENABLE_TORCH_COMPILE": "1", + "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", + "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1", + "VLLM_USE_FLASHINFER_SAMPLER": "0", + "VLLM_DP_MASTER_PORT": "9528", + "VLLM_RESPONSE_TIMEOUT": "300", + "VLLM_LOG_REQ_KV_LENS": "1", + "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600", + "CUDA_VISIBLE_DEVICES": "2,3,4,5,6,7" + }, + "base_flags": { + "host": "127.0.0.1", + "port": 18082, + "served-model-name": "qwen35-27b-aituner", + "trust-remote-code": true, + "dtype": "bfloat16", + "gpu-memory-utilization": 0.9, + "enable-prefix-caching": true, + "mamba-cache-mode": "light", + "distributed-executor-backend": "mp", + "block-size": 64, + "enable-chunked-prefill": true, + "max-num-batched-tokens": 8192, + "disable-cascade-attn": true, + "max-model-len": 262144, + "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}", + "mm-processor-cache-gb": 0, + "limit-mm-per-prompt": "{\"image\":256,\"video\":64}", + "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}", + "mamba-cache-dtype": "float32", + "skip-mm-profiling": true, + "quantization": "fp8", + "tensor-parallel-size": 1, + "disable-log-requests": true + }, + "tunable_envs": [ + "VLLM_ENABLE_TORCH_COMPILE" + ], + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "expert-parallel-size", + "gpu-memory-utilization", + "block-size", + "max-num-batched-tokens", + "max-num-seqs", + "enable-prefix-caching", + "enable-chunked-prefill" + ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": false, + "require_ep_size_leq_tp_dp_product": true, + "require_ep_size_divides_tp_dp_product": true, + "require_enable_expert_parallel_when_ep_gt_one": true, + "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true, + "allowed_tp_dp_products": [ + 1, + 2, + 4, + 8 + ], + "allowed_tensor_parallel_sizes": [ + 1, + 2, + 4, + 8 + ], + "allowed_data_parallel_sizes": [ + 1, + 2, + 4, + 8 + ], + "allowed_expert_parallel_sizes": [ + 1 + ] + }, + "python_executable": "python3" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "chat_w20260311_1000", + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 32, + "input_length_filter": { + "min_input_tokens": 0, + "max_input_tokens": 8192 + }, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 120.0, + "early_stop_max_elapsed_s": 900.0, + "adaptive_stop": { + "enabled": true, + "tau": 0.9, + "tau_c": 0.9, + "stable_checks": 3, + "max_checks": 20, + "min_fraction": 0.1, + "boundary_delta": 0.02 + } + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "linear_ms", + "intercept_ms": 4000, + "per_token_ms": 0.125 + }, + "tpot_rule": { + "kind": "fixed_ms", + "threshold_ms": 50 + } + }, + "search": { + "low": 0.0, + "high": 0.25, + "tolerance": 0.001, + "max_probes": 6, + "sample_seed": 20260325, + "inherit_incumbent_floor": true + }, + "llm": { + "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.", + "max_history_trials": 8, + "endpoint": { + "provider": "codex", + "model": "gpt-5.4", + "stream": true, + "api_key_env": "OPENAI_API_KEY", + "timeout_s": 180 + } + } +} \ No newline at end of file