Add trace length bucket tuning support

2026-04-07 11:03:16 +08:00
parent e9b5e9b957
commit 46ed688ace
12 changed files with 922 additions and 14 deletions
--- a/configs/examples/dash0_qwen27b_tight_slo_baseline_proposal.json
+++ b/configs/examples/dash0_qwen27b_tight_slo_baseline_proposal.json
@@ -0,0 +1,28 @@
+{
+  "observation": "The incumbent should start from the known launch-safe qwen3.5-27b serving recipe on dash0 before asking the LLM to optimize throughput above that baseline.",
+  "diagnosis": "This model uses a long-context hybrid stack and fp8 quantization. The safest first measurement is to preserve the existing warmup, hybrid-model, chunked-prefill, and prefix-caching behavior from run_qwen27b.sh, while keeping a conservative sequence cap.",
+  "config_patch": {
+    "env_patch": {
+      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+      "VLLM_ENABLE_TORCH_COMPILE": "1",
+      "VLLM_USE_FLASHINFER_SAMPLER": "0",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1"
+    },
+    "flag_patch": {
+      "tensor-parallel-size": 4,
+      "gpu-memory-utilization": 0.9,
+      "block-size": 64,
+      "max-num-batched-tokens": 8192,
+      "max-num-seqs": 16,
+      "enable-prefix-caching": true,
+      "enable-chunked-prefill": true,
+      "disable-cascade-attn": true
+    }
+  },
+  "expected_effects": [
+    "Launch-safe baseline aligned with the current hand-tuned qwen27b recipe while using all 4 visible H20 GPUs",
+    "Reliable first incumbent under the tighter TTFT and TPOT SLO",
+    "Clear trial history for the LLM to propose a higher-throughput follow-up patch"
+  ],
+  "why_not_previous_failures": "This baseline intentionally avoids speculative new kernels or batching spikes before we have an incumbent under the new SLO."
+}
--- a/configs/examples/dash0_qwen27b_tight_slo_run1.json
+++ b/configs/examples/dash0_qwen27b_tight_slo_run1.json
@@ -0,0 +1,147 @@
+{
+  "study_id": "dash0-qwen27b-tight-slo-10min-run1",
+  "hardware": {
+    "gpu_count": 4,
+    "gpu_model": "H20",
+    "host_candidates": [
+      "dash0"
+    ]
+  },
+  "model": {
+    "model_id": "qwen3.5-27b-256k-0223-internal",
+    "served_model_name": "qwen35-27b-aituner"
+  },
+  "engine": {
+    "engine_name": "vllm",
+    "engine_version": "latest-release-on-dash0",
+    "exec_path": "/usr/local/bin/vllm",
+    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
+    "host": "127.0.0.1",
+    "port": 18082,
+    "healthcheck_path": "/v1/models",
+    "ready_timeout_s": 900,
+    "request_timeout_s": 900,
+    "launch_args": [
+      "serve",
+      "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
+    ],
+    "base_envs": {
+      "CUDA_VISIBLE_DEVICES": "4,5,6,7",
+      "VLLM_DISABLE_COMPILE_CACHE": "1",
+      "DS_LLM_IGNORE_WARMUP": "1",
+      "DS_LLM_IGNORE_CHECK_WARMUP": "1",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
+      "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
+      "PARAM_TOTAL_MAX": "262144",
+      "PARAM_IN_LENGTH_MAX": "262144",
+      "PARAM_MAX_LENGTH_MAX": "131072",
+      "DS_LLM_MAX_THINK_TOKENS": "81920",
+      "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
+      "VLLM_FP8_USE_BLADNN": "1",
+      "VLLM_MOE_USE_BLADNN": "1",
+      "VLLM_GDN_USE_BLADNN": "0",
+      "VLLM_USE_V1": "1",
+      "VLLM_IS_HYBRID_MODEL": "1",
+      "VLLM_ENABLE_TORCH_COMPILE": "1",
+      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+      "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
+      "VLLM_USE_FLASHINFER_SAMPLER": "0",
+      "VLLM_RESPONSE_TIMEOUT": "300",
+      "VLLM_LOG_REQ_KV_LENS": "1",
+      "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
+    },
+    "base_flags": {
+      "host": "127.0.0.1",
+      "port": 18082,
+      "served-model-name": "qwen35-27b-aituner",
+      "trust-remote-code": true,
+      "dtype": "bfloat16",
+      "gpu-memory-utilization": 0.9,
+      "enable-prefix-caching": true,
+      "mamba-cache-mode": "light",
+      "distributed-executor-backend": "mp",
+      "block-size": 64,
+      "enable-chunked-prefill": true,
+      "max-num-batched-tokens": 8192,
+      "disable-cascade-attn": true,
+      "max-model-len": 262144,
+      "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
+      "mm-processor-cache-gb": 0,
+      "limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
+      "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
+      "mamba-cache-dtype": "float32",
+      "skip-mm-profiling": true,
+      "quantization": "fp8",
+      "tensor-parallel-size": 1,
+      "max-num-seqs": 16,
+      "disable-log-requests": true
+    },
+    "tunable_envs": [
+      "VLLM_ATTENTION_BACKEND",
+      "VLLM_ENABLE_TORCH_COMPILE",
+      "VLLM_USE_FLASHINFER_SAMPLER",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP"
+    ],
+    "tunable_flags": [
+      "tensor-parallel-size",
+      "gpu-memory-utilization",
+      "block-size",
+      "max-num-batched-tokens",
+      "max-num-seqs",
+      "enable-prefix-caching",
+      "enable-chunked-prefill",
+      "disable-cascade-attn"
+    ],
+    "python_executable": "python3"
+  },
+  "trace": {
+    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
+    "window_id": "chat_w20260311_1000",
+    "u_field": "sampling_u",
+    "timestamp_field": "timestamp",
+    "max_concurrency": 32,
+    "replay_time_scale": 1.0,
+    "early_stop_max_lag_s": 120.0,
+    "early_stop_max_elapsed_s": 900.0
+  },
+  "slo": {
+    "target_pass_rate": 0.95,
+    "ttft_rule": {
+      "kind": "step_ms",
+      "buckets": [
+        {
+          "max_input_tokens": 4096,
+          "threshold_ms": 2000
+        },
+        {
+          "max_input_tokens": 32768,
+          "threshold_ms": 4000
+        },
+        {
+          "threshold_ms": 6000
+        }
+      ]
+    },
+    "tpot_rule": {
+      "kind": "fixed_ms",
+      "threshold_ms": 50
+    }
+  },
+  "search": {
+    "low": 0.0,
+    "high": 1.0,
+    "tolerance": 0.01,
+    "max_probes": 8,
+    "sample_seed": 20260325
+  },
+  "llm": {
+    "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
+    "max_history_trials": 8,
+    "endpoint": {
+      "base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
+      "model": "gpt-5.4",
+      "api_key_env": "OPENAI_API_KEY",
+      "timeout_s": 180
+    }
+  }
+}
--- a/configs/examples/dash0_qwen27b_tight_slo_run2.json
+++ b/configs/examples/dash0_qwen27b_tight_slo_run2.json
@@ -0,0 +1,147 @@
+{
+  "study_id": "dash0-qwen27b-tight-slo-10min-run2",
+  "hardware": {
+    "gpu_count": 4,
+    "gpu_model": "H20",
+    "host_candidates": [
+      "dash0"
+    ]
+  },
+  "model": {
+    "model_id": "qwen3.5-27b-256k-0223-internal",
+    "served_model_name": "qwen35-27b-aituner"
+  },
+  "engine": {
+    "engine_name": "vllm",
+    "engine_version": "latest-release-on-dash0",
+    "exec_path": "/usr/local/bin/vllm",
+    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
+    "host": "127.0.0.1",
+    "port": 18082,
+    "healthcheck_path": "/v1/models",
+    "ready_timeout_s": 900,
+    "request_timeout_s": 900,
+    "launch_args": [
+      "serve",
+      "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
+    ],
+    "base_envs": {
+      "CUDA_VISIBLE_DEVICES": "4,5,6,7",
+      "VLLM_DISABLE_COMPILE_CACHE": "1",
+      "DS_LLM_IGNORE_WARMUP": "1",
+      "DS_LLM_IGNORE_CHECK_WARMUP": "1",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
+      "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
+      "PARAM_TOTAL_MAX": "262144",
+      "PARAM_IN_LENGTH_MAX": "262144",
+      "PARAM_MAX_LENGTH_MAX": "131072",
+      "DS_LLM_MAX_THINK_TOKENS": "81920",
+      "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
+      "VLLM_FP8_USE_BLADNN": "1",
+      "VLLM_MOE_USE_BLADNN": "1",
+      "VLLM_GDN_USE_BLADNN": "0",
+      "VLLM_USE_V1": "1",
+      "VLLM_IS_HYBRID_MODEL": "1",
+      "VLLM_ENABLE_TORCH_COMPILE": "1",
+      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+      "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
+      "VLLM_USE_FLASHINFER_SAMPLER": "0",
+      "VLLM_RESPONSE_TIMEOUT": "300",
+      "VLLM_LOG_REQ_KV_LENS": "1",
+      "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
+    },
+    "base_flags": {
+      "host": "127.0.0.1",
+      "port": 18082,
+      "served-model-name": "qwen35-27b-aituner",
+      "trust-remote-code": true,
+      "dtype": "bfloat16",
+      "gpu-memory-utilization": 0.9,
+      "enable-prefix-caching": true,
+      "mamba-cache-mode": "light",
+      "distributed-executor-backend": "mp",
+      "block-size": 64,
+      "enable-chunked-prefill": true,
+      "max-num-batched-tokens": 8192,
+      "disable-cascade-attn": true,
+      "max-model-len": 262144,
+      "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
+      "mm-processor-cache-gb": 0,
+      "limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
+      "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
+      "mamba-cache-dtype": "float32",
+      "skip-mm-profiling": true,
+      "quantization": "fp8",
+      "tensor-parallel-size": 4,
+      "max-num-seqs": 16,
+      "disable-log-requests": true
+    },
+    "tunable_envs": [
+      "VLLM_ATTENTION_BACKEND",
+      "VLLM_ENABLE_TORCH_COMPILE",
+      "VLLM_USE_FLASHINFER_SAMPLER",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP"
+    ],
+    "tunable_flags": [
+      "tensor-parallel-size",
+      "gpu-memory-utilization",
+      "block-size",
+      "max-num-batched-tokens",
+      "max-num-seqs",
+      "enable-prefix-caching",
+      "enable-chunked-prefill",
+      "disable-cascade-attn"
+    ],
+    "python_executable": "python3"
+  },
+  "trace": {
+    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
+    "window_id": "chat_w20260311_1000",
+    "u_field": "sampling_u",
+    "timestamp_field": "timestamp",
+    "max_concurrency": 32,
+    "replay_time_scale": 1.0,
+    "early_stop_max_lag_s": 120.0,
+    "early_stop_max_elapsed_s": 900.0
+  },
+  "slo": {
+    "target_pass_rate": 0.95,
+    "ttft_rule": {
+      "kind": "step_ms",
+      "buckets": [
+        {
+          "max_input_tokens": 4096,
+          "threshold_ms": 2000
+        },
+        {
+          "max_input_tokens": 32768,
+          "threshold_ms": 4000
+        },
+        {
+          "threshold_ms": 6000
+        }
+      ]
+    },
+    "tpot_rule": {
+      "kind": "fixed_ms",
+      "threshold_ms": 50
+    }
+  },
+  "search": {
+    "low": 0.0,
+    "high": 0.0625,
+    "tolerance": 0.001,
+    "max_probes": 6,
+    "sample_seed": 20260325
+  },
+  "llm": {
+    "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
+    "max_history_trials": 8,
+    "endpoint": {
+      "base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
+      "model": "gpt-5.4",
+      "api_key_env": "OPENAI_API_KEY",
+      "timeout_s": 180
+    }
+  }
+}
--- a/configs/examples/dash0_qwen27b_tight_slo_run3.json
+++ b/configs/examples/dash0_qwen27b_tight_slo_run3.json
@@ -0,0 +1,147 @@
+{
+  "study_id": "dash0-qwen27b-tight-slo-10min-run3",
+  "hardware": {
+    "gpu_count": 4,
+    "gpu_model": "H20",
+    "host_candidates": [
+      "dash0"
+    ]
+  },
+  "model": {
+    "model_id": "qwen3.5-27b-256k-0223-internal",
+    "served_model_name": "qwen35-27b-aituner"
+  },
+  "engine": {
+    "engine_name": "vllm",
+    "engine_version": "latest-release-on-dash0",
+    "exec_path": "/usr/local/bin/vllm",
+    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
+    "host": "127.0.0.1",
+    "port": 18082,
+    "healthcheck_path": "/v1/models",
+    "ready_timeout_s": 900,
+    "request_timeout_s": 900,
+    "launch_args": [
+      "serve",
+      "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
+    ],
+    "base_envs": {
+      "CUDA_VISIBLE_DEVICES": "4,5,6,7",
+      "VLLM_DISABLE_COMPILE_CACHE": "1",
+      "DS_LLM_IGNORE_WARMUP": "1",
+      "DS_LLM_IGNORE_CHECK_WARMUP": "1",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
+      "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
+      "PARAM_TOTAL_MAX": "262144",
+      "PARAM_IN_LENGTH_MAX": "262144",
+      "PARAM_MAX_LENGTH_MAX": "131072",
+      "DS_LLM_MAX_THINK_TOKENS": "81920",
+      "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
+      "VLLM_FP8_USE_BLADNN": "1",
+      "VLLM_MOE_USE_BLADNN": "1",
+      "VLLM_GDN_USE_BLADNN": "0",
+      "VLLM_USE_V1": "1",
+      "VLLM_IS_HYBRID_MODEL": "1",
+      "VLLM_ENABLE_TORCH_COMPILE": "1",
+      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+      "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
+      "VLLM_USE_FLASHINFER_SAMPLER": "0",
+      "VLLM_RESPONSE_TIMEOUT": "300",
+      "VLLM_LOG_REQ_KV_LENS": "1",
+      "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
+    },
+    "base_flags": {
+      "host": "127.0.0.1",
+      "port": 18082,
+      "served-model-name": "qwen35-27b-aituner",
+      "trust-remote-code": true,
+      "dtype": "bfloat16",
+      "gpu-memory-utilization": 0.9,
+      "enable-prefix-caching": true,
+      "mamba-cache-mode": "light",
+      "distributed-executor-backend": "mp",
+      "block-size": 64,
+      "enable-chunked-prefill": true,
+      "max-num-batched-tokens": 8192,
+      "disable-cascade-attn": true,
+      "max-model-len": 262144,
+      "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
+      "mm-processor-cache-gb": 0,
+      "limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
+      "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
+      "mamba-cache-dtype": "float32",
+      "skip-mm-profiling": true,
+      "quantization": "fp8",
+      "tensor-parallel-size": 4,
+      "max-num-seqs": 16,
+      "disable-log-requests": true
+    },
+    "tunable_envs": [
+      "VLLM_ATTENTION_BACKEND",
+      "VLLM_ENABLE_TORCH_COMPILE",
+      "VLLM_USE_FLASHINFER_SAMPLER",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP"
+    ],
+    "tunable_flags": [
+      "tensor-parallel-size",
+      "gpu-memory-utilization",
+      "block-size",
+      "max-num-batched-tokens",
+      "max-num-seqs",
+      "enable-prefix-caching",
+      "enable-chunked-prefill",
+      "disable-cascade-attn"
+    ],
+    "python_executable": "python3"
+  },
+  "trace": {
+    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
+    "window_id": "chat_w20260311_1000",
+    "u_field": "sampling_u",
+    "timestamp_field": "timestamp",
+    "max_concurrency": 32,
+    "replay_time_scale": 1.0,
+    "early_stop_max_lag_s": 120.0,
+    "early_stop_max_elapsed_s": 900.0
+  },
+  "slo": {
+    "target_pass_rate": 0.95,
+    "ttft_rule": {
+      "kind": "step_ms",
+      "buckets": [
+        {
+          "max_input_tokens": 4096,
+          "threshold_ms": 2000
+        },
+        {
+          "max_input_tokens": 32768,
+          "threshold_ms": 4000
+        },
+        {
+          "threshold_ms": 6000
+        }
+      ]
+    },
+    "tpot_rule": {
+      "kind": "fixed_ms",
+      "threshold_ms": 50
+    }
+  },
+  "search": {
+    "low": 0.0,
+    "high": 0.0625,
+    "tolerance": 0.001,
+    "max_probes": 6,
+    "sample_seed": 20260325
+  },
+  "llm": {
+    "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
+    "max_history_trials": 8,
+    "endpoint": {
+      "base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
+      "model": "gpt-5.4",
+      "api_key_env": "OPENAI_API_KEY",
+      "timeout_s": 180
+    }
+  }
+}
--- a/configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json
+++ b/configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json
@@ -0,0 +1,151 @@
+{
+  "study_id": "dash0-qwen27b-tight-slo-10min-run4-chat-0-8k",
+  "hardware": {
+    "gpu_count": 4,
+    "gpu_model": "H20",
+    "host_candidates": [
+      "dash0"
+    ]
+  },
+  "model": {
+    "model_id": "qwen3.5-27b-256k-0223-internal",
+    "served_model_name": "qwen35-27b-aituner"
+  },
+  "engine": {
+    "engine_name": "vllm",
+    "engine_version": "latest-release-on-dash0",
+    "exec_path": "/usr/local/bin/vllm",
+    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
+    "host": "127.0.0.1",
+    "port": 18082,
+    "healthcheck_path": "/v1/models",
+    "ready_timeout_s": 900,
+    "request_timeout_s": 900,
+    "launch_args": [
+      "serve",
+      "/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
+    ],
+    "base_envs": {
+      "CUDA_VISIBLE_DEVICES": "4,5,6,7",
+      "VLLM_DISABLE_COMPILE_CACHE": "1",
+      "DS_LLM_IGNORE_WARMUP": "1",
+      "DS_LLM_IGNORE_CHECK_WARMUP": "1",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
+      "VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
+      "PARAM_TOTAL_MAX": "262144",
+      "PARAM_IN_LENGTH_MAX": "262144",
+      "PARAM_MAX_LENGTH_MAX": "131072",
+      "DS_LLM_MAX_THINK_TOKENS": "81920",
+      "DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
+      "VLLM_FP8_USE_BLADNN": "1",
+      "VLLM_MOE_USE_BLADNN": "1",
+      "VLLM_GDN_USE_BLADNN": "0",
+      "VLLM_USE_V1": "1",
+      "VLLM_IS_HYBRID_MODEL": "1",
+      "VLLM_ENABLE_TORCH_COMPILE": "1",
+      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+      "VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
+      "VLLM_USE_FLASHINFER_SAMPLER": "0",
+      "VLLM_RESPONSE_TIMEOUT": "300",
+      "VLLM_LOG_REQ_KV_LENS": "1",
+      "DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
+    },
+    "base_flags": {
+      "host": "127.0.0.1",
+      "port": 18082,
+      "served-model-name": "qwen35-27b-aituner",
+      "trust-remote-code": true,
+      "dtype": "bfloat16",
+      "gpu-memory-utilization": 0.9,
+      "enable-prefix-caching": true,
+      "mamba-cache-mode": "light",
+      "distributed-executor-backend": "mp",
+      "block-size": 64,
+      "enable-chunked-prefill": true,
+      "max-num-batched-tokens": 8192,
+      "disable-cascade-attn": true,
+      "max-model-len": 262144,
+      "speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
+      "mm-processor-cache-gb": 0,
+      "limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
+      "compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
+      "mamba-cache-dtype": "float32",
+      "skip-mm-profiling": true,
+      "quantization": "fp8",
+      "tensor-parallel-size": 4,
+      "max-num-seqs": 16,
+      "disable-log-requests": true
+    },
+    "tunable_envs": [
+      "VLLM_ATTENTION_BACKEND",
+      "VLLM_ENABLE_TORCH_COMPILE",
+      "VLLM_USE_FLASHINFER_SAMPLER",
+      "VLLM_ENABLE_MODEL_RUNNER_WARMUP"
+    ],
+    "tunable_flags": [
+      "tensor-parallel-size",
+      "gpu-memory-utilization",
+      "block-size",
+      "max-num-batched-tokens",
+      "max-num-seqs",
+      "enable-prefix-caching",
+      "enable-chunked-prefill",
+      "disable-cascade-attn"
+    ],
+    "python_executable": "python3"
+  },
+  "trace": {
+    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
+    "window_id": "chat_w20260311_1000",
+    "u_field": "sampling_u",
+    "timestamp_field": "timestamp",
+    "max_concurrency": 32,
+    "input_length_filter": {
+      "min_input_tokens": 0,
+      "max_input_tokens": 8192
+    },
+    "replay_time_scale": 1.0,
+    "early_stop_max_lag_s": 120.0,
+    "early_stop_max_elapsed_s": 900.0
+  },
+  "slo": {
+    "target_pass_rate": 0.95,
+    "ttft_rule": {
+      "kind": "step_ms",
+      "buckets": [
+        {
+          "max_input_tokens": 4096,
+          "threshold_ms": 2000
+        },
+        {
+          "max_input_tokens": 32768,
+          "threshold_ms": 4000
+        },
+        {
+          "threshold_ms": 6000
+        }
+      ]
+    },
+    "tpot_rule": {
+      "kind": "fixed_ms",
+      "threshold_ms": 50
+    }
+  },
+  "search": {
+    "low": 0.0,
+    "high": 0.0625,
+    "tolerance": 0.001,
+    "max_probes": 6,
+    "sample_seed": 20260325
+  },
+  "llm": {
+    "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
+    "max_history_trials": 8,
+    "endpoint": {
+      "base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
+      "model": "gpt-5.4",
+      "api_key_env": "OPENAI_API_KEY",
+      "timeout_s": 180
+    }
+  }
+}
--- a/configs/examples/study.example.json
+++ b/configs/examples/study.example.json
@@ -50,7 +50,11 @@
    "window_id": "chat_w_example_0001",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
-    "max_concurrency": 64
+    "max_concurrency": 64,
+    "input_length_filter": {
+      "min_input_tokens": 0,
+      "max_input_tokens": 8192
+    }
  },
  "slo": {
    "target_pass_rate": 0.95,
--- a/src/aituner/llm.py
+++ b/src/aituner/llm.py
@@ -42,6 +42,11 @@ def build_prompt(
        json.dumps(
            {
                "study_id": study.study_id,
+                "current_best": {
+                    "trial_id": state.best_trial_id,
+                    "best_sampling_u": state.best_sampling_u,
+                    "best_request_rate": state.best_request_rate,
+                },
                "hardware": {
                    "gpu_count": study.hardware.gpu_count,
                    "gpu_model": study.hardware.gpu_model,
@@ -50,6 +55,17 @@ def build_prompt(
                    "model_id": study.model.model_id,
                    "served_model_name": study.model.served_model_name,
                },
+                "trace": {
+                    "window_id": study.trace.window_id,
+                    "input_length_filter": (
+                        {
+                            "min_input_tokens": study.trace.input_length_filter.min_input_tokens,
+                            "max_input_tokens": study.trace.input_length_filter.max_input_tokens,
+                        }
+                        if study.trace.input_length_filter is not None
+                        else None
+                    ),
+                },
                "engine": {
                    "engine_name": study.engine.engine_name,
                    "engine_version": study.engine.engine_version,
@@ -84,6 +100,8 @@ def build_prompt(
        "Trial history:",
        json.dumps(history, ensure_ascii=False, indent=2),
        "",
+        "The proposal must beat the current incumbent. Do not propose a config that is only likely to be feasible below the current best_sampling_u/request_rate.",
+        "The evaluator for a new trial will start searching from the current best feasible sampling_u and only look for improvements above it.",
        "The proposal should improve the maximum feasible sampling_u under the 95%+ SLO target.",
    ]
    return "\n".join(sections)
@@ -110,8 +128,22 @@ def validate_proposal(proposal: Proposal, study: StudySpec) -> Proposal:
    return proposal


-def parse_proposal_text(text: str, study: StudySpec) -> Proposal:
+def _parse_json_object_text(text: str) -> dict[str, Any]:
+    """Parse ``text`` as a JSON object, tolerating prose around the object.
+
+    The text is first parsed strictly. If that fails, decoding restarts at the
+    first ``{`` and stops at the end of the first complete JSON value, so
+    leading prose, Markdown code fences and trailing text (even trailing text
+    that itself contains ``}``) are ignored.
+
+    Raises:
+        json.JSONDecodeError: if the text has no ``{`` followed by a ``}``
+            (the strict-parse error is re-raised), or if no JSON value can be
+            decoded starting at the first ``{``.
+        SpecError: if the decoded value is not a JSON object.
+    """
+    try:
        payload = json.loads(text)
+    except json.JSONDecodeError:
+        start = text.find("{")
+        end = text.rfind("}")
+        if start < 0 or end < start:
+            raise
+        # raw_decode() returns after the first complete value instead of
+        # rejecting trailing data, so a stray "}" in the text that follows
+        # the object can no longer break the extraction.
+        payload, _ = json.JSONDecoder().raw_decode(text[start : end + 1])
+    if not isinstance(payload, dict):
+        raise SpecError("proposal payload must be a JSON object")
+    return payload
+
+
+def parse_proposal_text(text: str, study: StudySpec) -> Proposal:
+    """Parse raw proposal text into a ``Proposal`` and validate it against ``study``."""
+    payload = _parse_json_object_text(text)
    proposal = Proposal.from_dict(payload)
    return validate_proposal(proposal, study)

--- a/src/aituner/spec.py
+++ b/src/aituner/spec.py
@@ -142,6 +142,42 @@ class EngineLaunchSpec:
        )


+@dataclass(frozen=True)
+class InputLengthFilterSpec:
+    """Optional lower/upper bounds on a trace request's input token count."""
+
+    # A bound left as None is not set.
+    min_input_tokens: int | None = None
+    max_input_tokens: int | None = None
+
+    @classmethod
+    def from_dict(cls, data: Mapping[str, Any], *, context: str) -> "InputLengthFilterSpec":
+        """Build a validated filter from a parsed config mapping.
+
+        Args:
+            data: Mapping that may hold ``min_input_tokens`` and/or
+                ``max_input_tokens``; a missing or ``None`` value leaves that
+                bound unset. Other values are passed through ``_require_int``.
+            context: Config path used as the prefix of error messages.
+
+        Raises:
+            SpecError: if neither bound is set, if a bound is negative, or if
+                ``min_input_tokens`` is greater than ``max_input_tokens``.
+        """
+        min_input_tokens = data.get("min_input_tokens")
+        max_input_tokens = data.get("max_input_tokens")
+        spec = cls(
+            min_input_tokens=(
+                _require_int(min_input_tokens, context=f"{context}.min_input_tokens")
+                if min_input_tokens is not None
+                else None
+            ),
+            max_input_tokens=(
+                _require_int(max_input_tokens, context=f"{context}.max_input_tokens")
+                if max_input_tokens is not None
+                else None
+            ),
+        )
+        if spec.min_input_tokens is None and spec.max_input_tokens is None:
+            raise SpecError(
+                f"{context} must define at least one of min_input_tokens/max_input_tokens."
+            )
+        # Token counts cannot be negative; a negative upper bound in particular
+        # would reject every request, so fail at load time instead.
+        for bound_name, bound_value in (
+            ("min_input_tokens", spec.min_input_tokens),
+            ("max_input_tokens", spec.max_input_tokens),
+        ):
+            if bound_value is not None and bound_value < 0:
+                raise SpecError(f"{context}.{bound_name} must be >= 0.")
+        if (
+            spec.min_input_tokens is not None
+            and spec.max_input_tokens is not None
+            and spec.min_input_tokens > spec.max_input_tokens
+        ):
+            raise SpecError(
+                f"{context}.min_input_tokens must be <= {context}.max_input_tokens."
+            )
+        return spec
+
+
@dataclass(frozen=True)
 class TraceSpec:
    windows_path: str
@@ -150,6 +186,7 @@ class TraceSpec:
    u_field: str
    timestamp_field: str
    max_concurrency: int
+    input_length_filter: InputLengthFilterSpec | None = None
    max_requests_per_probe: int | None = None
    synthetic_prompt_cap_tokens: int | None = None
    replay_time_scale: float = 1.0
@@ -171,6 +208,17 @@ class TraceSpec:
            max_concurrency=_require_int(
                data.get("max_concurrency", 64), context="trace.max_concurrency"
            ),
+            input_length_filter=(
+                InputLengthFilterSpec.from_dict(
+                    _require_mapping(
+                        data.get("input_length_filter"),
+                        context="trace.input_length_filter",
+                    ),
+                    context="trace.input_length_filter",
+                )
+                if data.get("input_length_filter") is not None
+                else None
+            ),
            max_requests_per_probe=int(max_requests) if max_requests is not None else None,
            synthetic_prompt_cap_tokens=(
                int(synthetic_prompt_cap) if synthetic_prompt_cap is not None else None
@@ -454,6 +502,7 @@ class TrialSummary:
 class StudyState:
    study_id: str
    best_trial_id: str | None = None
+    best_sampling_u: float | None = None
    best_request_rate: float | None = None
    next_trial_index: int = 1
    trials: list[TrialSummary] = field(default_factory=list)
--- a/src/aituner/store.py
+++ b/src/aituner/store.py
@@ -32,6 +32,7 @@ class StudyStore:
        return StudyState(
            study_id=str(payload["study_id"]),
            best_trial_id=payload.get("best_trial_id"),
+            best_sampling_u=payload.get("best_sampling_u"),
            best_request_rate=payload.get("best_request_rate"),
            next_trial_index=int(payload.get("next_trial_index", 1)),
            trials=trials,
@@ -64,7 +65,18 @@ class StudyStore:
            study_id=study.study_id,
            trial_id=trial_id,
            config_patch=proposal.config_patch,
-            search=study.search,
+            search=replace(
+                study.search,
+                low=min(
+                    study.search.high,
+                    max(
+                        study.search.low,
+                        float(state.best_sampling_u)
+                        if isinstance(state.best_sampling_u, (int, float))
+                        else study.search.low,
+                    ),
+                ),
+            ),
            study_spec_path=str((self.study_root(study.study_id) / "study_spec.source").resolve()),
            artifact_dir=str(trial_root),
            probe_log_path=str(trial_root / "probe_history.json"),
@@ -89,6 +101,7 @@ class StudyStore:
        by_id = {item.trial_id: item for item in state.trials}
        trials_dir = self.study_root(study_id) / "trials"
        best_trial_id = state.best_trial_id
+        best_sampling_u = state.best_sampling_u
        best_rate = state.best_request_rate
        for trial_dir in sorted(trials_dir.glob("trial-*")):
            result_path = trial_dir / "result.json"
@@ -112,7 +125,13 @@ class StudyStore:
                and (best_rate is None or summary.best_request_rate > best_rate)
            ):
                best_rate = float(summary.best_request_rate)
+                best_sampling_u = (
+                    float(summary.best_sampling_u)
+                    if isinstance(summary.best_sampling_u, (int, float))
+                    else None
+                )
                best_trial_id = trial_id
+        state.best_sampling_u = best_sampling_u
        state.best_request_rate = best_rate
        state.best_trial_id = best_trial_id
        self.save_state(state)
--- a/src/aituner/trace.py
+++ b/src/aituner/trace.py
@@ -132,6 +132,25 @@ def _downsample_requests(
    return [requests[idx] for idx in indexes]


+def _matches_input_length_filter(study: StudySpec, *, prompt_tokens_hint: int | None) -> bool:
+    """Report whether a trace row passes the study's input-length filter.
+
+    With no filter configured every row passes. With a filter, a row whose
+    prompt token count is unknown (``None``) is rejected; otherwise the count
+    must not be below ``min_input_tokens`` nor above ``max_input_tokens``,
+    where a bound set to ``None`` is not enforced.
+    """
+    bounds = study.trace.input_length_filter
+    if bounds is None:
+        return True
+    if prompt_tokens_hint is None:
+        return False
+    lower, upper = bounds.min_input_tokens, bounds.max_input_tokens
+    below_lower = lower is not None and prompt_tokens_hint < lower
+    above_upper = upper is not None and prompt_tokens_hint > upper
+    return not (below_lower or above_upper)
+
+
 def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[WindowRecord, list[TraceRequest]]:
    window = resolve_window_record(study, study_spec_path=study_spec_path)
    time_scale = float(study.trace.replay_time_scale)
@@ -163,6 +182,8 @@ def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[Win
            if isinstance(sampling_u, bool) or not isinstance(sampling_u, (int, float)):
                raise TraceError(f"trace row {idx} is missing numeric {study.trace.u_field}")
            prompt_tokens_hint = _coerce_prompt_tokens(row)
+            if not _matches_input_length_filter(study, prompt_tokens_hint=prompt_tokens_hint):
+                continue
            try:
                messages = _coerce_messages(row)
            except TraceError:
--- a/src/aituner/worker.py
+++ b/src/aituner/worker.py
@@ -177,14 +177,19 @@ def _replay_requests(
            if early_stopped:
                break
            if futures_by_request:
-                timeout = None
+                timeout = 0.5
                if next_index < len(requests):
-                    timeout = max(0.0, requests[next_index].arrival_s - elapsed)
+                    timeout = min(timeout, max(0.0, requests[next_index].arrival_s - elapsed))
+                if max_elapsed_s is not None:
+                    remaining_elapsed = max(0.0, max_elapsed_s - elapsed)
+                    timeout = min(timeout, remaining_elapsed)
                done, _ = wait(
                    list(futures_by_request),
                    timeout=timeout,
                    return_when=FIRST_COMPLETED,
                )
+                if not done:
+                    continue
                for future in done:
                    request = futures_by_request.pop(future)
                    outcome = future.result()
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -13,7 +13,7 @@ from aituner.job import append_job, build_trial_job
 from aituner.llm import build_prompt, parse_proposal_text
 from aituner.search import ThresholdProbe, binary_search_max_feasible
 from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
-from aituner.spec import Proposal, StudyState, TrialSummary, load_study_spec
+from aituner.spec import Proposal, SpecError, StudyState, TrialSummary, load_study_spec
 from aituner.store import StudyStore
 from aituner.trace import load_trace_requests, summarize_window
 from aituner.worker import (
@@ -25,7 +25,9 @@ from aituner.worker import (
 from aituner.trace import TraceRequest


-def _write_study_assets(tmp_path: Path) -> Path:
+def _write_study_assets(
+    tmp_path: Path, *, trace_overrides: dict[str, object] | None = None
+) -> Path:
    trace_dir = tmp_path / "trace_windows" / "traces"
    trace_dir.mkdir(parents=True)
    trace_path = trace_dir / "chat_w1.jsonl"
@@ -81,6 +83,16 @@ def _write_study_assets(tmp_path: Path) -> Path:
    )

    study_path = tmp_path / "study.json"
+    trace_payload: dict[str, object] = {
+        "windows_path": str(windows_path),
+        "window_id": "chat_w1",
+        "u_field": "sampling_u",
+        "timestamp_field": "timestamp",
+        "max_concurrency": 4,
+    }
+    if trace_overrides:
+        trace_payload.update(trace_overrides)
+
    study_payload = {
        "study_id": "study-1",
        "hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
@@ -105,13 +117,7 @@ def _write_study_assets(tmp_path: Path) -> Path:
            "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
            "python_executable": "python3"
        },
-        "trace": {
-            "windows_path": str(windows_path),
-            "window_id": "chat_w1",
-            "u_field": "sampling_u",
-            "timestamp_field": "timestamp",
-            "max_concurrency": 4
-        },
+        "trace": trace_payload,
        "slo": {
            "target_pass_rate": 0.95,
            "ttft_rule": {
@@ -161,9 +167,53 @@ class CoreFlowTests(unittest.TestCase):
            )
            self.assertIn("allowed_flag_keys", prompt)
            self.assertIn("study-1", prompt)
+            self.assertIn('"current_best"', prompt)
            self.assertIn("queueing_knee_by_bucket", prompt)
            self.assertTrue(study_root.exists())

+    def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:  # filter applies at trace load and is echoed in the prompt
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={  # merged into the study's "trace" payload by the fixture helper
+                    "input_length_filter": {
+                        "min_input_tokens": 0,
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            study = load_study_spec(study_path)
+            window, requests = load_trace_requests(study, study_spec_path=study_path)
+            summary = summarize_window(requests, window)  # summary is built from the filtered requests only
+            self.assertEqual(len(requests), 2)  # exactly two fixture rows are expected to pass the 0..8192 filter
+            self.assertEqual([item.prompt_tokens_hint for item in requests], [1000, 5000])
+            self.assertEqual(summary["request_count"], 2)
+            self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
+            prompt = build_prompt(
+                study=study,
+                window_summary=summary,
+                state=StudyState(study_id=study.study_id),  # fresh state: no trial history
+                capability_profile=None,
+            )
+            self.assertIn('"input_length_filter"', prompt)  # the filter config must appear in the built prompt
+            self.assertIn('"max_input_tokens": 8192', prompt)
+
+    def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:  # min > max must fail spec loading
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={
+                    "input_length_filter": {
+                        "min_input_tokens": 8193,  # deliberately one above the max below
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):  # rejected when the spec is loaded
+                load_study_spec(study_path)
+
    def test_prompt_includes_failed_trial_context(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
@@ -652,8 +702,36 @@ class CoreFlowTests(unittest.TestCase):
            )
            next_state = store.ingest_trial_results(study.study_id)
            self.assertEqual(next_state.best_trial_id, trial.trial_id)
+            self.assertEqual(next_state.best_sampling_u, 0.75)
            self.assertEqual(next_state.best_request_rate, 12.5)

+    def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:  # new trial's search range starts at the incumbent's sampling_u
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = StudyState(  # hand-built state that already has an incumbent trial
+                study_id=study.study_id,
+                best_trial_id="trial-0001",
+                best_sampling_u=0.375,  # expected to become the lower search bound
+                best_request_rate=3.0,
+                next_trial_index=2,
+                trials=[],
+            )
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "Obs",
+                    "diagnosis": "Diag",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
+                    "expected_effects": ["raise rate"],
+                }
+            )
+            trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
+            self.assertEqual(trial.search.low, 0.375)  # floor equals the incumbent's best_sampling_u
+            self.assertEqual(trial.search.high, 1.0)
+
    def test_ingest_trial_results_records_failure_reason(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
@@ -764,6 +842,7 @@ class CoreFlowTests(unittest.TestCase):
            store = StudyStore(store_root)
            state = store.load_state("study-1")
            self.assertEqual(state.best_trial_id, "trial-0002")
+            self.assertEqual(state.best_sampling_u, 0.75)
            self.assertEqual(state.best_request_rate, 2.0)
            self.assertEqual(state.next_trial_index, 3)

@@ -795,6 +874,20 @@ class CoreFlowTests(unittest.TestCase):
            ["throughput: higher", "ttft: lower"],
        )

+    def test_parse_proposal_text_accepts_wrapped_json(self) -> None:  # JSON wrapped in prose and a code fence must still parse
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            proposal = parse_proposal_text(  # input: a prose preamble followed by a fenced json block
+                """Here is the proposal:
+```json
+{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
+```""",
+                study,
+            )
+            self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32)  # value recovered from inside the fence
+
    def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
        requests = [
            TraceRequest(
@@ -929,6 +1022,71 @@ class CoreFlowTests(unittest.TestCase):
        self.assertEqual(len(replayed), 2)
        self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")

+    def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:  # elapsed budget must stop the replay even when nothing completes
+        requests = [
+            TraceRequest(
+                row_id="r0",
+                arrival_s=0.0,
+                sampling_u=0.1,
+                body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
+                prompt_tokens_hint=8,
+                completion_tokens_hint=4,
+            )
+        ]
+
+        class FakeFuture:  # stand-in future that fails the test if its result is ever read
+            def result(self, timeout=None):
+                raise AssertionError("future should not be awaited after elapsed early stop")
+
+            def cancel(self):
+                return True
+
+        submitted = []  # row ids handed to the fake executor, in submit order
+
+        class FakeExecutor:  # replaces ThreadPoolExecutor; never runs the submitted callable
+            def __init__(self, max_workers):
+                self.max_workers = max_workers
+
+            def submit(self, fn, request, **kwargs):
+                submitted.append(request.row_id)
+                return FakeFuture()
+
+            def shutdown(self, wait=False, cancel_futures=True):
+                return None
+
+        wait_timeouts: list[float] = []  # timeout passed to each wait() call
+
+        def fake_wait(futures, timeout=None, return_when=None):
+            wait_timeouts.append(timeout)
+            return set(), set(futures)  # report nothing done; every future is still pending
+
+        def fake_evaluate(outcome: RequestOutcome):
+            return type("Eval", (), {"passed": outcome.success})()  # minimal object exposing only .passed
+
+        monotonic_values = iter([0.0, 0.0, 0.4, 1.2])  # scripted clock; 1.2 is past the 1.0s budget used below
+
+        with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor):
+            with mock.patch("aituner.worker.wait", side_effect=fake_wait):
+                with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)):
+                    replayed, early_stopped, reason = _replay_requests(
+                        requests,
+                        base_url="http://127.0.0.1:8000",
+                        timeout_s=30.0,
+                        max_concurrency=1,
+                        target_pass_rate=0.95,
+                        max_lag_s=None,
+                        max_elapsed_s=1.0,  # the budget under test
+                        evaluate_outcome=fake_evaluate,
+                    )
+
+        self.assertEqual(submitted, ["r0"])
+        self.assertTrue(early_stopped)
+        self.assertEqual(reason, "probe_elapsed_s>1.0")
+        self.assertEqual(len(replayed), 1)
+        self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")  # the in-flight request is recorded with the stop reason
+        self.assertTrue(wait_timeouts)  # wait() must have been called at least once
+        self.assertLessEqual(wait_timeouts[0], 0.5)  # first wait stays within the 0.5s polling cap
+
    def test_latency_summary_reports_quantiles_and_slo(self) -> None:
        study = load_study_spec(_write_study_assets(Path(tempfile.mkdtemp())))
        outcomes = [