From edfd61a696af9144a21291ce2b02d67c8507a65f Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Sun, 12 Apr 2026 11:24:23 +0800 Subject: [PATCH] Add qwen235b prefill docs and tight TTFT spec --- ...235b_prefill_thinking_run2_ttft_tight.json | 208 ++++++++++++++++++ docs/qwen235b-thinking-prefill/README.md | 91 ++++++++ scripts/run_baseline_then_llm.py | 142 ++++++++++++ 3 files changed, 441 insertions(+) create mode 100644 configs/examples/dash0_qwen235b_prefill_thinking_run2_ttft_tight.json create mode 100644 docs/qwen235b-thinking-prefill/README.md create mode 100644 scripts/run_baseline_then_llm.py diff --git a/configs/examples/dash0_qwen235b_prefill_thinking_run2_ttft_tight.json b/configs/examples/dash0_qwen235b_prefill_thinking_run2_ttft_tight.json new file mode 100644 index 0000000..3475c50 --- /dev/null +++ b/configs/examples/dash0_qwen235b_prefill_thinking_run2_ttft_tight.json @@ -0,0 +1,208 @@ +{ + "study_id": "dash0-qwen235b-prefill-thinking-run2-ttft-tight-topology", + "hardware": { + "gpu_count": 8, + "gpu_model": "H20", + "host_candidates": [ + "dash0" + ] + }, + "model": { + "model_id": "qwen3-235b-a22b-256k-0717-internal", + "served_model_name": "qwen3-235b-prefill" + }, + "engine": { + "engine_name": "vllm", + "engine_version": "internal-on-dash0", + "exec_path": "/usr/local/bin/vllm", + "cwd": "/home/admin/cpfs/wjh/aituner/aituner", + "host": "127.0.0.1", + "port": 18125, + "healthcheck_path": "/v1/models", + "ready_timeout_s": 1800, + "request_timeout_s": 1800, + "launch_args": [ + "serve", + "/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717" + ], + "base_envs": { + "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "VLLM_USE_V1": "1", + "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", + "VLLM_QUANTIZATION_LAYER_WISE": "1", + "VLLM_MOE_USE_DEEPEP": "0", + "VLLM_MOE_BALANCED_GATING": "0", + "VLLM_MOE_RANDOM_GATING": "0", + "VLLM_FUSED_MOE_CHUNK_SIZE": "4096", + "VLLM_DP_META_USE_CPU_GROUP": "0", + "VLLM_MLA_FP8_ATTENTION": "0", + "VLLM_MOE_EXPERTS_OVERLAP": "0", + "VLLM_USE_FLASHINFER_SAMPLER": "0", + "VLLM_RESPONSE_TIMEOUT": "290", + "VLLM_FP8_USE_BLADNN": "1", + "VLLM_MOE_USE_BLADNN": "1", + "VLLM_USE_DEEP_GEMM": "0", + "VLLM_PD_TRY_CONNECT_TIMEOUT_SECONDS": "120", + "VLLM_DEEP_GEMM_WARMUP": "skip", + "DEEPEP_LL_COMBINE_USE_FP8": "1", + "DEEPEP_LL_BUFFER_FP8_OPT": "1", + "DEEPEP_LL_DISPATCH_USE_NVL": "1", + "DEEPEP_LL_COMBINE_USE_NVL": "1", + "ACCL_LOW_LATENCY_OPTIMIZE": "2", + "ACCL_WRITEBATCH_OPT": "2", + "ACCL_IBV_MTU": "9000", + "ACCL_TX_DEPTH": "1024", + "ACCL_RETRANSMIT_TIMEOUT": "17", + "NVSHMEM_IBGDA_NUM_RC_PER_PE": "4", + "BLLM_KVTRANS_RDMA_SP": "2", + "NCCL_SOCKET_IFNAME": "eth1", + "NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME": "eth1", + "GLOO_SOCKET_IFNAME": "eth1" + }, + "base_flags": { + "host": "127.0.0.1", + "port": 18125, + "served-model-name": "qwen3-235b-prefill", + "tensor-parallel-size": 4, + "gpu-memory-utilization": 0.85, + "enable-prefix-caching": true, + "enable-chunked-prefill": true, + "max-num-batched-tokens": 8192, + "disable-hybrid-kv-cache-manager": true, + "max-model-len": 262144, + "block-size": 64, + "max-num-seqs": 64, + "quantization": "fp8", + "cuda-graph-sizes": [ + 16, + 32, + 64, + 96, + 128, + 160, + 192, + 224, + 256, + 288, + 320, + 352, + 384, + 416, + 448, + 480, + 512, + 544, + 576, + 608, + 640, + 672, + 704, + 736, + 768, + 800, + 832, + 864, + 896, + 928, + 960, + 992, + 1024 + ], + "compilation-config": "{\"cudagraph_mode\":\"PIECEWISE\",\"use_inductor\":false,\"custom_ops\":[\"all\"],\"max_cudagraph_capture_size\":2048}", + 
"speculative-config": "{\"method\":\"eagle3\",\"num_speculative_tokens\":1,\"hf_overrides\":{\"rope_scaling\":{\"type\":\"yarn\",\"factor\":128,\"original_max_position_embeddings\":2048,\"semi_dynamic\":false,\"dynamic\":true},\"num_experts\":0},\"model\":\"/home/admin/resource/model/464482ce.qwen3-235b-a22b/0717-eagle-0820\"}", + "hf-overrides": "{\"architectures\":[\"Qwen3MoeForCausalLM\"],\"model_type\":\"qwen3_moe\"}", + "kv-cache-dtype": "fp8", + "disable-log-requests": true + }, + "tunable_envs": [ + "VLLM_ENABLE_TORCH_COMPILE" + ], + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "enable-expert-parallel", + "expert-parallel-size", + "gpu-memory-utilization", + "max-num-batched-tokens", + "max-num-seqs", + "block-size", + "enable-prefix-caching", + "enable-chunked-prefill" + ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": false, + "require_ep_size_leq_tp_dp_product": true, + "require_ep_size_divides_tp_dp_product": true, + "require_enable_expert_parallel_when_ep_gt_one": true, + "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true, + "allowed_tp_dp_products": [ + 4, + 8 + ], + "allowed_tensor_parallel_sizes": [ + 4, + 8 + ], + "allowed_data_parallel_sizes": [ + 1, + 2 + ], + "allowed_expert_parallel_sizes": [ + 1, + 2, + 4, + 8 + ] + }, + "python_executable": "python3" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "thinking_w20260327_1000", + "request_mode": "chat", + "completion_tokens_override": 1, + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 64, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 180.0, + "early_stop_max_elapsed_s": 1200.0 + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 8191, + "threshold_ms": 2000 + }, + { + "max_input_tokens": 32767, + "threshold_ms": 4000 + }, + { + "threshold_ms": 6000 + } + ] + } + }, + "search": { + "low": 0.0, + "high": 0.125, + "tolerance": 0.001, + "max_probes": 6, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "You are tuning a prefill-dominated vLLM serving stack. The trace replay forces completion length to exactly 1 token, so optimize for TTFT under the configured stepped SLO. Propose one launch-safe config patch that increases the maximum feasible sampling_u while respecting the topology constraints and avoiding known launch failures.", + "max_history_trials": 8, + "endpoint": { + "provider": "codex", + "model": "gpt-5.4", + "stream": true, + "api_key_env": "OPENAI_API_KEY", + "timeout_s": 240 + } + } +} diff --git a/docs/qwen235b-thinking-prefill/README.md b/docs/qwen235b-thinking-prefill/README.md new file mode 100644 index 0000000..0063474 --- /dev/null +++ b/docs/qwen235b-thinking-prefill/README.md @@ -0,0 +1,91 @@ +# qwen235b-thinking-prefill + +qwen3-235b-a22b `thinking` trace, prefill-only replay with `output_length=1`, internal vLLM (`/usr/local/bin/vllm`), compared by `request_rate_per_gpu`. 
+ +## Setup + +- Hardware: `dash0`, `8x H20` +- Model: `/home/admin/resource/model/464482ce.qwen3-235b-a22b/256k-0717` +- Engine: internal vLLM, baseline aligned to `~/run_qwen235b.sh` +- Baseline topology: `TP=4, DP=1, EP=1` +- Trace: `thinking_w20260327_1000` +- Trace source: `trace_windows/traces/thinking_w20260327_1000.jsonl` +- Window duration: `600s` (`10:00-10:10`, `2026-03-27`) +- Request mode: `chat` +- Replay override: `min_tokens=max_tokens=1` +- SLO: + - pass target: `95%` + - `TTFT <= 3000ms` for `<=4096` input tokens + - `TTFT <= 6000ms` for `<=32768` input tokens + - `TTFT <= 9000ms` for `>32768` input tokens +- Search: + - `sampling_u in [0, 0.125]` + - `max_probes = 6` + - `12` trials total +- Proposal model: `codex / gpt-5.4` + +## Run assets + +- Study root: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-prefill/dash0-qwen235b-prefill-thinking-run1-ttft-topology` +- State: `/home/admin/cpfs/wjh/aituner/aituner/.aituner-prefill/dash0-qwen235b-prefill-thinking-run1-ttft-topology/state.json` +- Log: `/home/admin/cpfs/wjh/aituner/aituner/logs/dash0_qwen235b_prefill_thinking_run1_ttft_topology.log` +- Spec: `/home/admin/cpfs/wjh/aituner/aituner/configs/examples/dash0_qwen235b_prefill_thinking_run1_ttft.json` + +## Best result + +- Best trial: `trial-0010` +- Best config: + - `tensor-parallel-size=8` + - `data-parallel-size=1` + - `enable-expert-parallel=false` + - `max-num-batched-tokens=3712` +- Best `sampling_u`: `0.120422363281` +- Best request rate: `3.035 req/s` +- Best request rate per GPU: `0.379375 req/s/gpu` +- Best pass rate: `0.9533223503569467` + +Compared with baseline: + +- `trial-0001`: `0.8116666666666666 req/s`, `0.20291666666666666 req/s/gpu` +- `trial-0010`: `3.035 req/s`, `0.379375 req/s/gpu` +- Raw throughput gain: `3.74x` +- Per-GPU throughput gain: `1.87x` + +Best-point latency: + +- `TTFT mean/p50/p90/p95/p99 = 863.84 / 253.58 / 2392.48 / 3154.26 / 5377.00 ms` + +## 12-trial summary + +| Trial | Proposed config delta | Result | +| --- | --- | --- | +| `trial-0001` | baseline `TP4/DP1/EP-off`, `max-num-batched-tokens=8192` | `0.8117 req/s`, feasible | +| `trial-0002` | `DP=2`, `max-num-batched-tokens=4096` | probe-time runtime failure | +| `trial-0003` | `DP=2`, `max-num-batched-tokens=8192` | probe-time runtime failure | +| `trial-0004` | `EP=4`, `enable-expert-parallel=true` | launch fail | +| `trial-0005` | `max-num-batched-tokens=4096` | infeasible | +| `trial-0006` | `TP=8, DP=1`, `max-num-batched-tokens=4096` | `2.8600 req/s`, feasible | +| `trial-0007` | `trial-0006 + max-num-batched-tokens=3072` | infeasible | +| `trial-0008` | `trial-0006 + max-num-batched-tokens=3584` | `2.9667 req/s`, feasible | +| `trial-0009` | `trial-0006 + max-num-batched-tokens=3328` | infeasible | +| `trial-0010` | `trial-0006 + max-num-batched-tokens=3712` | `3.0350 req/s`, feasible, best | +| `trial-0011` | `trial-0010 + max-num-batched-tokens=3840` | infeasible | +| `trial-0012` | `trial-0010 + max-num-batched-tokens=3776` | infeasible | + +## Key insights + +- The main win came from topology first, then local batch-shape refinement. `TP4 -> TP8` was the key change. +- `TP4/DP2` was not just suboptimal; it was unstable at runtime under probing and should be treated as negative evidence for this stack. +- `EP=4` on the baseline `TP4/DP1` path failed at launch with `group_gemm's contiguous kernel requires deepgemm`, so EP is currently not a viable direction here. 
+- After switching to `TP8/DP1/EP-off`, the remaining gain came from tightening `max-num-batched-tokens` from `4096` into a narrow sweet spot around `3584~3712`. +- Too-small prefill batches (`3072`, `3328`) and too-large ones (`3776`, `3840`) both hurt the TTFT tail enough to lose the `95%` target. + +## Current recommendation + +Use `trial-0010` as the default serving shape for this workload: + +- `tensor-parallel-size=8` +- `data-parallel-size=1` +- `enable-expert-parallel=false` +- `max-num-batched-tokens=3712` +- keep the rest of the `run_qwen235b.sh` baseline unchanged diff --git a/scripts/run_baseline_then_llm.py b/scripts/run_baseline_then_llm.py new file mode 100644 index 0000000..68d753e --- /dev/null +++ b/scripts/run_baseline_then_llm.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +from aituner.llm import ( + build_prompt, + call_llm_for_proposal, + load_capability_profile, + parse_proposal_text, +) +from aituner.spec import load_study_spec +from aituner.store import StudyStore +from aituner.trace import load_trace_requests, summarize_window +from aituner.worker import run_trial + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Run one baseline trial followed by LLM-proposed trials." + ) + parser.add_argument("--spec", required=True) + parser.add_argument("--store-root", required=True) + parser.add_argument("--baseline-proposal", required=True) + parser.add_argument("--total-trials", type=int, default=12) + return parser + + +def main() -> int: + args = build_parser().parse_args() + spec_path = Path(args.spec).resolve() + store_root = Path(args.store_root).resolve() + baseline_path = Path(args.baseline_proposal).resolve() + if args.total_trials <= 0: + raise SystemExit("--total-trials must be positive") + + study = load_study_spec(spec_path) + store = StudyStore(store_root) + study_root = store.init_study(spec_path=spec_path, study=study) + capability_profile = load_capability_profile(study, study_spec_path=spec_path) + + print( + json.dumps( + { + "event": "study_initialized", + "study_root": str(study_root), + "study_id": study.study_id, + "total_trials": args.total_trials, + }, + ensure_ascii=False, + ), + flush=True, + ) + + state = store.load_state(study.study_id) + baseline_text = baseline_path.read_text(encoding="utf-8") + baseline = parse_proposal_text(baseline_text, study) + baseline_name = baseline_path.stem + store.write_proposal(study.study_id, baseline_name, baseline) + trial, _ = store.materialize_trial(study=study, state=state, proposal=baseline) + result = run_trial(Path(trial.artifact_dir) / "trial_spec.json") + state = store.ingest_trial_results(study.study_id) + print( + json.dumps( + { + "event": "trial_completed", + "trial_id": trial.trial_id, + "source": baseline_name, + "status": result.get("status"), + "best_sampling_u": result.get("best_sampling_u"), + "best_request_rate": result.get("best_request_rate"), + "best_pass_rate": result.get("best_pass_rate"), + "state_best_trial_id": state.best_trial_id, + "state_best_request_rate": state.best_request_rate, + "state_best_request_rate_per_gpu": state.best_request_rate_per_gpu, + }, + ensure_ascii=False, + ), + flush=True, + ) + + remaining_trials = args.total_trials - 1 + for _ in range(max(0, remaining_trials)): + state = store.load_state(study.study_id) + window, requests = load_trace_requests(study, study_spec_path=spec_path) + prompt = build_prompt( + study=study, + 
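+            # Prompt context assembled here: the parsed study spec, a summary of
+            # the replayed trace window, the study state (prior trials and the
+            # current best), and the capability profile loaded at startup.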
window_summary=summarize_window(requests, window), + state=state, + capability_profile=capability_profile, + ) + prompt_name = f"prompt-{state.next_trial_index:04d}" + store.write_prompt(study.study_id, prompt_name, prompt) + proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt) + proposal_name = f"proposal-{state.next_trial_index:04d}" + raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt" + raw_proposal_path.write_text(proposal_text, encoding="utf-8") + proposal = parse_proposal_text(proposal_text, study) + store.write_proposal(study.study_id, proposal_name, proposal) + trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal) + result = run_trial(Path(trial.artifact_dir) / "trial_spec.json") + state = store.ingest_trial_results(study.study_id) + print( + json.dumps( + { + "event": "trial_completed", + "trial_id": trial.trial_id, + "source": proposal_name, + "status": result.get("status"), + "best_sampling_u": result.get("best_sampling_u"), + "best_request_rate": result.get("best_request_rate"), + "best_pass_rate": result.get("best_pass_rate"), + "state_best_trial_id": state.best_trial_id, + "state_best_request_rate": state.best_request_rate, + "state_best_request_rate_per_gpu": state.best_request_rate_per_gpu, + }, + ensure_ascii=False, + ), + flush=True, + ) + + final_state = store.load_state(study.study_id) + print( + json.dumps( + { + "event": "study_finished", + "study_root": str(study_root), + "best_trial_id": final_state.best_trial_id, + "best_request_rate": final_state.best_request_rate, + "best_request_rate_per_gpu": final_state.best_request_rate_per_gpu, + "trial_count": len(final_state.trials), + }, + ensure_ascii=False, + ), + flush=True, + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())
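+
+# Example invocation against the spec added in this patch (the store root and
+# baseline proposal paths below are illustrative, not fixed by this repo):
+#
+#   python3 scripts/run_baseline_then_llm.py \
+#     --spec configs/examples/dash0_qwen235b_prefill_thinking_run2_ttft_tight.json \
+#     --store-root .aituner-prefill \
+#     --baseline-proposal proposals/baseline_tp4_dp1.json \
+#     --total-trials 12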