diff --git a/configs/examples/dash0_qwen235b_prefill_thinking_baseline.json b/configs/examples/dash0_qwen235b_prefill_thinking_baseline.json
new file mode 100644
index 0000000..fbf7e1d
--- /dev/null
+++ b/configs/examples/dash0_qwen235b_prefill_thinking_baseline.json
@@ -0,0 +1,13 @@
+{
+  "observation": "This is the prefill-only baseline aligned to run_qwen235b.sh. Keep the internal vLLM launch shape unchanged and replay the thinking trace with completion forced to 1 token.",
+  "diagnosis": "A baseline measurement is required before proposing prefill-only TTFT improvements. Preserve all current envs and flags from run_qwen235b.sh to establish the first feasible sampling_u/request_rate point under the TTFT-only SLO.",
+  "config_patch": {
+    "env_patch": {},
+    "flag_patch": {}
+  },
+  "expected_effects": [
+    "Establish a launch-safe prefill-only baseline for qwen3-235b thinking traffic",
+    "Seed later trials from the first feasible sampling_u if one exists"
+  ],
+  "why_not_previous_failures": "No previous failures in this study."
+}
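Note: the empty "env_patch"/"flag_patch" above is exactly what makes this trial a baseline. A minimal sketch of the assumed merge semantics (apply_config_patch is a hypothetical helper for illustration; the repository's actual patch application is not shown in this diff):

    # Hypothetical: shallow-merge a trial's config_patch onto the base launch
    # settings; an empty patch therefore reproduces the baseline launch exactly.
    def apply_config_patch(base_envs: dict, base_flags: dict, config_patch: dict):
        envs = {**base_envs, **config_patch.get("env_patch", {})}
        flags = {**base_flags, **config_patch.get("flag_patch", {})}
        return envs, flags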
+ "cuda-graph-sizes": [ + 16, + 32, + 64, + 96, + 128, + 160, + 192, + 224, + 256, + 288, + 320, + 352, + 384, + 416, + 448, + 480, + 512, + 544, + 576, + 608, + 640, + 672, + 704, + 736, + 768, + 800, + 832, + 864, + 896, + 928, + 960, + 992, + 1024 + ], + "compilation-config": "{\"cudagraph_mode\":\"PIECEWISE\",\"use_inductor\":false,\"custom_ops\":[\"all\"],\"max_cudagraph_capture_size\":2048}", + "speculative-config": "{\"method\":\"eagle3\",\"num_speculative_tokens\":1,\"hf_overrides\":{\"rope_scaling\":{\"type\":\"yarn\",\"factor\":128,\"original_max_position_embeddings\":2048,\"semi_dynamic\":false,\"dynamic\":true},\"num_experts\":0},\"model\":\"/home/admin/resource/model/464482ce.qwen3-235b-a22b/0717-eagle-0820\"}", + "hf-overrides": "{\"architectures\":[\"Qwen3MoeForCausalLM\"],\"model_type\":\"qwen3_moe\"}", + "kv-cache-dtype": "fp8", + "disable-log-requests": true + }, + "tunable_envs": [ + "VLLM_ENABLE_TORCH_COMPILE" + ], + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "enable-expert-parallel", + "expert-parallel-size", + "gpu-memory-utilization", + "max-num-batched-tokens", + "max-num-seqs", + "block-size", + "enable-prefix-caching", + "enable-chunked-prefill" + ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": false, + "require_ep_size_leq_tp_dp_product": true, + "require_ep_size_divides_tp_dp_product": true, + "require_enable_expert_parallel_when_ep_gt_one": true, + "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true, + "allowed_tp_dp_products": [ + 4, + 8 + ], + "allowed_tensor_parallel_sizes": [ + 4, + 8 + ], + "allowed_data_parallel_sizes": [ + 1, + 2 + ], + "allowed_expert_parallel_sizes": [ + 1, + 2, + 4, + 8 + ] + }, + "python_executable": "python3" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "thinking_w20260327_1000", + "request_mode": "chat", + "completion_tokens_override": 1, + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 64, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 180.0, + "early_stop_max_elapsed_s": 1200.0 + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 4096, + "threshold_ms": 3000 + }, + { + "max_input_tokens": 32768, + "threshold_ms": 6000 + }, + { + "threshold_ms": 9000 + } + ] + } + }, + "search": { + "low": 0.0, + "high": 0.125, + "tolerance": 0.001, + "max_probes": 6, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "You are tuning a prefill-dominated vLLM serving stack. The trace replay forces completion length to exactly 1 token, so optimize for TTFT under the configured stepped SLO. Propose one launch-safe config patch that increases the maximum feasible sampling_u while respecting the topology constraints and avoiding known launch failures.", + "max_history_trials": 8, + "endpoint": { + "provider": "codex", + "model": "gpt-5.4", + "stream": true, + "api_key_env": "OPENAI_API_KEY", + "timeout_s": 240 + } + } +} diff --git a/src/aituner/llm.py b/src/aituner/llm.py index 79ac9ea..f27be86 100644 --- a/src/aituner/llm.py +++ b/src/aituner/llm.py @@ -182,6 +182,11 @@ def build_prompt( objective_notes.append( "This study is decode-only. The engine uses a KV decode benchmark connector, so TTFT is informational only unless an explicit TTFT rule is configured." 
diff --git a/src/aituner/llm.py b/src/aituner/llm.py
index 79ac9ea..f27be86 100644
--- a/src/aituner/llm.py
+++ b/src/aituner/llm.py
@@ -182,6 +182,11 @@ def build_prompt(
         objective_notes.append(
             "This study is decode-only. The engine uses a KV decode benchmark connector, so TTFT is informational only unless an explicit TTFT rule is configured."
         )
+    if study.trace.completion_tokens_override is not None:
+        objective_notes.append(
+            "The trace replay overrides completion length for every request. "
+            f"min_tokens=max_tokens={study.trace.completion_tokens_override}."
+        )
     if study.slo.ttft_rule is None:
         objective_notes.append("There is no TTFT SLO for this study.")
     if study.slo.tpot_rule is None:
@@ -243,6 +248,7 @@ def build_prompt(
         "trace": {
             "window_id": study.trace.window_id,
             "request_mode": study.trace.request_mode,
+            "completion_tokens_override": study.trace.completion_tokens_override,
             "input_length_filter": (
                 {
                     "min_input_tokens": study.trace.input_length_filter.min_input_tokens,
diff --git a/src/aituner/spec.py b/src/aituner/spec.py
index d42c561..24393ab 100644
--- a/src/aituner/spec.py
+++ b/src/aituner/spec.py
@@ -317,6 +317,7 @@ class TraceSpec:
     window_id: str
     trace_file_override: str | None
     request_mode: str
+    completion_tokens_override: int | None
     u_field: str
     timestamp_field: str
     max_concurrency: int
@@ -331,9 +332,17 @@ class TraceSpec:
     def from_dict(cls, data: Mapping[str, Any]) -> "TraceSpec":
         max_requests = data.get("max_requests_per_probe")
         synthetic_prompt_cap = data.get("synthetic_prompt_cap_tokens")
+        completion_tokens_override = data.get("completion_tokens_override")
         request_mode = str(data.get("request_mode") or "chat").strip().lower()
         if request_mode not in {"chat", "decode_only"}:
             raise SpecError("trace.request_mode must be one of: chat, decode_only.")
+        if completion_tokens_override is not None:
+            completion_tokens_override = _require_int(
+                completion_tokens_override,
+                context="trace.completion_tokens_override",
+            )
+            if completion_tokens_override < 0:
+                raise SpecError("trace.completion_tokens_override must be >= 0.")
         return cls(
             windows_path=_require_str(data.get("windows_path"), context="trace.windows_path"),
             window_id=_require_str(data.get("window_id"), context="trace.window_id"),
@@ -341,6 +350,7 @@ class TraceSpec:
             if data.get("trace_file_override")
             else None,
             request_mode=request_mode,
+            completion_tokens_override=completion_tokens_override,
             u_field=str(data.get("u_field") or "sampling_u").strip(),
             timestamp_field=str(data.get("timestamp_field") or "timestamp").strip(),
             max_concurrency=_require_int(
diff --git a/src/aituner/trace.py b/src/aituner/trace.py
index a318a4c..7d03a80 100644
--- a/src/aituner/trace.py
+++ b/src/aituner/trace.py
@@ -204,7 +204,11 @@ def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[Win
             "stream": True,
             "stream_options": {"include_usage": True},
         }
-        completion_tokens = _coerce_completion_tokens(row)
+        completion_tokens = (
+            study.trace.completion_tokens_override
+            if study.trace.completion_tokens_override is not None
+            else _coerce_completion_tokens(row)
+        )
         if completion_tokens is not None:
             body["min_tokens"] = completion_tokens
             body["max_tokens"] = completion_tokens
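With completion_tokens_override set, the replayed chat body pins min_tokens and max_tokens to the same value, so a prefill study with an override of 1 measures pure prefill plus first-token latency. Roughly, the body assembled above ends up shaped like this (model/messages and the other fields built elsewhere in load_trace_requests are omitted; this is an illustration, not the exact payload):

    body = {
        "stream": True,
        "stream_options": {"include_usage": True},
        "min_tokens": 1,  # override applies to every request...
        "max_tokens": 1,  # ...forcing exactly one completion token.
    }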
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index e8677f7..4272640 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -666,6 +666,44 @@ class CoreFlowTests(unittest.TestCase):
         self.assertFalse(evaluations[1].passed)
         self.assertEqual(summary["slo_pass_rate"], 0.5)
 
+    def test_trace_completion_tokens_override_forces_min_and_max_tokens(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            study_path = _write_study_assets(
+                Path(tmp),
+                trace_overrides={"completion_tokens_override": 1},
+            )
+            study = load_study_spec(study_path)
+            _, requests = load_trace_requests(study, study_spec_path=study_path)
+            self.assertEqual(len(requests), 3)
+            self.assertEqual(requests[0].completion_tokens_hint, 1)
+            self.assertEqual(requests[1].completion_tokens_hint, 1)
+            self.assertEqual(requests[2].completion_tokens_hint, 1)
+            self.assertEqual(requests[0].body["min_tokens"], 1)
+            self.assertEqual(requests[0].body["max_tokens"], 1)
+            self.assertEqual(requests[2].body["min_tokens"], 1)
+            self.assertEqual(requests[2].body["max_tokens"], 1)
+
+    def test_build_prompt_mentions_completion_tokens_override(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            study_path = _write_study_assets(
+                Path(tmp),
+                trace_overrides={"completion_tokens_override": 1},
+                slo_overrides={"tpot_rule": None},
+            )
+            study = load_study_spec(study_path)
+            store = StudyStore(Path(tmp) / ".aituner")
+            store.init_study(spec_path=study_path, study=study)
+            state = store.load_state(study.study_id)
+            window, requests = load_trace_requests(study, study_spec_path=study_path)
+            prompt = build_prompt(
+                study=study,
+                window_summary=summarize_window(requests, window),
+                state=state,
+                capability_profile=None,
+            )
+            self.assertIn('"completion_tokens_override": 1', prompt)
+            self.assertIn("min_tokens=max_tokens=1", prompt)
+
     def test_slo_evaluation_supports_tpot_only_95_percent_target(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             study = load_study_spec(