Add trace length bucket tuning support

2026-04-07 11:03:16 +08:00
parent e9b5e9b957
commit 46ed688ace
12 changed files with 922 additions and 14 deletions

View File

@@ -0,0 +1,28 @@
{
"observation": "The incumbent should start from the known launch-safe qwen3.5-27b serving recipe on dash0 before asking the LLM to optimize throughput above that baseline.",
"diagnosis": "This model uses a long-context hybrid stack and fp8 quantization. The safest first measurement is to preserve the existing warmup, hybrid-model, chunked-prefill, and prefix-caching behavior from run_qwen27b.sh, while keeping a conservative sequence cap.",
"config_patch": {
"env_patch": {
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1"
},
"flag_patch": {
"tensor-parallel-size": 4,
"gpu-memory-utilization": 0.9,
"block-size": 64,
"max-num-batched-tokens": 8192,
"max-num-seqs": 16,
"enable-prefix-caching": true,
"enable-chunked-prefill": true,
"disable-cascade-attn": true
}
},
"expected_effects": [
"Launch-safe baseline aligned with the current hand-tuned qwen27b recipe while using all 4 visible H20 GPUs",
"Reliable first incumbent under the tighter TTFT and TPOT SLO",
"Clear trial history for the LLM to propose a higher-throughput follow-up patch"
],
"why_not_previous_failures": "This baseline intentionally avoids speculative new kernels or batching spikes before we have an incumbent under the new SLO."
}
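For reference, a config_patch like the one above is an overlay on the study's base_envs/base_flags. The short Python sketch below illustrates that intent; the merge-by-key semantics, the helper names, and the bare-boolean CLI rendering are assumptions for illustration, not the repository's actual API.

from typing import Any


def apply_config_patch(
    base_envs: dict[str, str],
    base_flags: dict[str, Any],
    env_patch: dict[str, str],
    flag_patch: dict[str, Any],
) -> tuple[dict[str, str], dict[str, Any]]:
    # Later keys win, so the patch overrides the base recipe key by key.
    return {**base_envs, **env_patch}, {**base_flags, **flag_patch}


def flags_to_cli(flags: dict[str, Any]) -> list[str]:
    # Render flag entries as vllm serve arguments; booleans are assumed to be
    # emitted bare when true and dropped when false.
    args: list[str] = []
    for key, value in flags.items():
        if isinstance(value, bool):
            if value:
                args.append(f"--{key}")
        else:
            args.extend([f"--{key}", str(value)])
    return args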

View File

@@ -0,0 +1,147 @@
{
"study_id": "dash0-qwen27b-tight-slo-10min-run1",
"hardware": {
"gpu_count": 4,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "qwen3.5-27b-256k-0223-internal",
"served_model_name": "qwen35-27b-aituner"
},
"engine": {
"engine_name": "vllm",
"engine_version": "latest-release-on-dash0",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18082,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
"VLLM_DISABLE_COMPILE_CACHE": "1",
"DS_LLM_IGNORE_WARMUP": "1",
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
"PARAM_TOTAL_MAX": "262144",
"PARAM_IN_LENGTH_MAX": "262144",
"PARAM_MAX_LENGTH_MAX": "131072",
"DS_LLM_MAX_THINK_TOKENS": "81920",
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_GDN_USE_BLADNN": "0",
"VLLM_USE_V1": "1",
"VLLM_IS_HYBRID_MODEL": "1",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "300",
"VLLM_LOG_REQ_KV_LENS": "1",
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18082,
"served-model-name": "qwen35-27b-aituner",
"trust-remote-code": true,
"dtype": "bfloat16",
"gpu-memory-utilization": 0.9,
"enable-prefix-caching": true,
"mamba-cache-mode": "light",
"distributed-executor-backend": "mp",
"block-size": 64,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-cascade-attn": true,
"max-model-len": 262144,
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
"mm-processor-cache-gb": 0,
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
"mamba-cache-dtype": "float32",
"skip-mm-profiling": true,
"quantization": "fp8",
"tensor-parallel-size": 1,
"max-num-seqs": 16,
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ATTENTION_BACKEND",
"VLLM_ENABLE_TORCH_COMPILE",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
],
"tunable_flags": [
"tensor-parallel-size",
"gpu-memory-utilization",
"block-size",
"max-num-batched-tokens",
"max-num-seqs",
"enable-prefix-caching",
"enable-chunked-prefill",
"disable-cascade-attn"
],
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 32,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 1.0,
"tolerance": 0.01,
"max_probes": 8,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
"max_history_trials": 8,
"endpoint": {
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
"model": "gpt-5.4",
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 180
}
}
}
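The step_ms TTFT rule above is a bucketed threshold: the first bucket whose max_input_tokens covers the request supplies the limit, and the final bucket with no bound is the catch-all. A minimal sketch of that reading, using the thresholds from this spec (function names are illustrative, not aituner's API):

TTFT_BUCKETS = [
    {"max_input_tokens": 4096, "threshold_ms": 2000},
    {"max_input_tokens": 32768, "threshold_ms": 4000},
    {"threshold_ms": 6000},  # catch-all for longer prompts
]


def ttft_threshold_ms(input_tokens: int) -> float:
    for bucket in TTFT_BUCKETS:
        limit = bucket.get("max_input_tokens")
        if limit is None or input_tokens <= limit:
            return float(bucket["threshold_ms"])
    raise ValueError("no catch-all TTFT bucket defined")


def request_passes_slo(input_tokens: int, ttft_ms: float, tpot_ms: float) -> bool:
    # tpot_rule is a flat 50 ms per output token regardless of input length.
    return ttft_ms <= ttft_threshold_ms(input_tokens) and tpot_ms <= 50.0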

View File

@@ -0,0 +1,147 @@
{
"study_id": "dash0-qwen27b-tight-slo-10min-run2",
"hardware": {
"gpu_count": 4,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "qwen3.5-27b-256k-0223-internal",
"served_model_name": "qwen35-27b-aituner"
},
"engine": {
"engine_name": "vllm",
"engine_version": "latest-release-on-dash0",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18082,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
"VLLM_DISABLE_COMPILE_CACHE": "1",
"DS_LLM_IGNORE_WARMUP": "1",
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
"PARAM_TOTAL_MAX": "262144",
"PARAM_IN_LENGTH_MAX": "262144",
"PARAM_MAX_LENGTH_MAX": "131072",
"DS_LLM_MAX_THINK_TOKENS": "81920",
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_GDN_USE_BLADNN": "0",
"VLLM_USE_V1": "1",
"VLLM_IS_HYBRID_MODEL": "1",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "300",
"VLLM_LOG_REQ_KV_LENS": "1",
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18082,
"served-model-name": "qwen35-27b-aituner",
"trust-remote-code": true,
"dtype": "bfloat16",
"gpu-memory-utilization": 0.9,
"enable-prefix-caching": true,
"mamba-cache-mode": "light",
"distributed-executor-backend": "mp",
"block-size": 64,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-cascade-attn": true,
"max-model-len": 262144,
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
"mm-processor-cache-gb": 0,
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
"mamba-cache-dtype": "float32",
"skip-mm-profiling": true,
"quantization": "fp8",
"tensor-parallel-size": 4,
"max-num-seqs": 16,
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ATTENTION_BACKEND",
"VLLM_ENABLE_TORCH_COMPILE",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
],
"tunable_flags": [
"tensor-parallel-size",
"gpu-memory-utilization",
"block-size",
"max-num-batched-tokens",
"max-num-seqs",
"enable-prefix-caching",
"enable-chunked-prefill",
"disable-cascade-attn"
],
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 32,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.0625,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
"max_history_trials": 8,
"endpoint": {
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
"model": "gpt-5.4",
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 180
}
}
}
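run2 narrows the search range to [0.0, 0.0625] with a finer tolerance, so fewer probes are needed to converge. The intent of low/high/tolerance/max_probes is a bisection over sampling_u that keeps the highest value whose probe still met the SLO; binary_search_max_feasible does exist in aituner.search (it is imported by the tests below), but its signature is not shown in this commit, so the following is only a sketch of the idea:

from typing import Callable, Optional


def bisect_max_feasible(
    low: float,
    high: float,
    tolerance: float,
    max_probes: int,
    is_feasible: Callable[[float], bool],
) -> Optional[float]:
    best: Optional[float] = None
    for _ in range(max_probes):
        if high - low <= tolerance:
            break
        mid = (low + high) / 2.0
        if is_feasible(mid):  # a probe replays the trace window at sampling_u = mid
            best, low = mid, mid
        else:
            high = mid
    return best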

View File

@@ -0,0 +1,147 @@
{
"study_id": "dash0-qwen27b-tight-slo-10min-run3",
"hardware": {
"gpu_count": 4,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "qwen3.5-27b-256k-0223-internal",
"served_model_name": "qwen35-27b-aituner"
},
"engine": {
"engine_name": "vllm",
"engine_version": "latest-release-on-dash0",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18082,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
"VLLM_DISABLE_COMPILE_CACHE": "1",
"DS_LLM_IGNORE_WARMUP": "1",
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
"PARAM_TOTAL_MAX": "262144",
"PARAM_IN_LENGTH_MAX": "262144",
"PARAM_MAX_LENGTH_MAX": "131072",
"DS_LLM_MAX_THINK_TOKENS": "81920",
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_GDN_USE_BLADNN": "0",
"VLLM_USE_V1": "1",
"VLLM_IS_HYBRID_MODEL": "1",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "300",
"VLLM_LOG_REQ_KV_LENS": "1",
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18082,
"served-model-name": "qwen35-27b-aituner",
"trust-remote-code": true,
"dtype": "bfloat16",
"gpu-memory-utilization": 0.9,
"enable-prefix-caching": true,
"mamba-cache-mode": "light",
"distributed-executor-backend": "mp",
"block-size": 64,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-cascade-attn": true,
"max-model-len": 262144,
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
"mm-processor-cache-gb": 0,
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
"mamba-cache-dtype": "float32",
"skip-mm-profiling": true,
"quantization": "fp8",
"tensor-parallel-size": 4,
"max-num-seqs": 16,
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ATTENTION_BACKEND",
"VLLM_ENABLE_TORCH_COMPILE",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
],
"tunable_flags": [
"tensor-parallel-size",
"gpu-memory-utilization",
"block-size",
"max-num-batched-tokens",
"max-num-seqs",
"enable-prefix-caching",
"enable-chunked-prefill",
"disable-cascade-attn"
],
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 32,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.0625,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
"max_history_trials": 8,
"endpoint": {
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
"model": "gpt-5.4",
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 180
}
}
}

View File

@@ -0,0 +1,151 @@
{
"study_id": "dash0-qwen27b-tight-slo-10min-run4-chat-0-8k",
"hardware": {
"gpu_count": 4,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "qwen3.5-27b-256k-0223-internal",
"served_model_name": "qwen35-27b-aituner"
},
"engine": {
"engine_name": "vllm",
"engine_version": "latest-release-on-dash0",
"exec_path": "/usr/local/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18082,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/resource/model/464482ce/qwen3.5-27b/256k-0223-internal"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
"VLLM_DISABLE_COMPILE_CACHE": "1",
"DS_LLM_IGNORE_WARMUP": "1",
"DS_LLM_IGNORE_CHECK_WARMUP": "1",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP": "1",
"VLLM_GDN_USE_FUSED_QKVZBA_KERNEL": "0",
"PARAM_TOTAL_MAX": "262144",
"PARAM_IN_LENGTH_MAX": "262144",
"PARAM_MAX_LENGTH_MAX": "131072",
"DS_LLM_MAX_THINK_TOKENS": "81920",
"DS_LLM_GRACEFUL_SHUTDOWN_WAIT_SECONDS": "600",
"VLLM_FP8_USE_BLADNN": "1",
"VLLM_MOE_USE_BLADNN": "1",
"VLLM_GDN_USE_BLADNN": "0",
"VLLM_USE_V1": "1",
"VLLM_IS_HYBRID_MODEL": "1",
"VLLM_ENABLE_TORCH_COMPILE": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_QUANTIZE_ROUTED_EXPERTS_ONLY": "1",
"VLLM_USE_FLASHINFER_SAMPLER": "0",
"VLLM_RESPONSE_TIMEOUT": "300",
"VLLM_LOG_REQ_KV_LENS": "1",
"DS_LLM_GRACEFUL_SHUTDOWN_KEEP_SECONDS": "600"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18082,
"served-model-name": "qwen35-27b-aituner",
"trust-remote-code": true,
"dtype": "bfloat16",
"gpu-memory-utilization": 0.9,
"enable-prefix-caching": true,
"mamba-cache-mode": "light",
"distributed-executor-backend": "mp",
"block-size": 64,
"enable-chunked-prefill": true,
"max-num-batched-tokens": 8192,
"disable-cascade-attn": true,
"max-model-len": 262144,
"speculative-config": "{\"method\":\"qwen3_next_vl_mtp\",\"num_speculative_tokens\":3}",
"mm-processor-cache-gb": 0,
"limit-mm-per-prompt": "{\"image\":256,\"video\":64}",
"compilation-config": "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor\":false,\"pass_config\":{\"fuse_norm_quant\":false,\"fuse_act_quant\":false,\"fuse_attn_quant\":false}}",
"mamba-cache-dtype": "float32",
"skip-mm-profiling": true,
"quantization": "fp8",
"tensor-parallel-size": 4,
"max-num-seqs": 16,
"disable-log-requests": true
},
"tunable_envs": [
"VLLM_ATTENTION_BACKEND",
"VLLM_ENABLE_TORCH_COMPILE",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_ENABLE_MODEL_RUNNER_WARMUP"
],
"tunable_flags": [
"tensor-parallel-size",
"gpu-memory-utilization",
"block-size",
"max-num-batched-tokens",
"max-num-seqs",
"enable-prefix-caching",
"enable-chunked-prefill",
"disable-cascade-attn"
],
"python_executable": "python3"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 32,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192
},
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.0625,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target. Favor launch-safe changes grounded in the incumbent result and only propose knobs that plausibly improve throughput above the incumbent request rate.",
"max_history_trials": 8,
"endpoint": {
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
"model": "gpt-5.4",
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 180
}
}
}
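The new input_length_filter block is the only difference from run2/run3: only trace rows whose prompt length falls within [0, 8192] tokens are replayed. Under that filter every retained request lands in one of the first two TTFT buckets, so the 6000 ms catch-all never applies. A one-line illustration (the real filtering is implemented in trace.py later in this commit):

def ttft_threshold_for_filtered_request(prompt_tokens: int) -> int:
    # Valid only inside the run4 window: 0 <= prompt_tokens <= 8192.
    return 2000 if prompt_tokens <= 4096 else 4000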

View File

@@ -50,7 +50,11 @@
"window_id": "chat_w_example_0001", "window_id": "chat_w_example_0001",
"u_field": "sampling_u", "u_field": "sampling_u",
"timestamp_field": "timestamp", "timestamp_field": "timestamp",
"max_concurrency": 64 "max_concurrency": 64,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192
}
}, },
"slo": { "slo": {
"target_pass_rate": 0.95, "target_pass_rate": 0.95,

View File

@@ -42,6 +42,11 @@ def build_prompt(
json.dumps(
{
"study_id": study.study_id,
"current_best": {
"trial_id": state.best_trial_id,
"best_sampling_u": state.best_sampling_u,
"best_request_rate": state.best_request_rate,
},
"hardware": {
"gpu_count": study.hardware.gpu_count,
"gpu_model": study.hardware.gpu_model,
@@ -50,6 +55,17 @@ def build_prompt(
"model_id": study.model.model_id,
"served_model_name": study.model.served_model_name,
},
"trace": {
"window_id": study.trace.window_id,
"input_length_filter": (
{
"min_input_tokens": study.trace.input_length_filter.min_input_tokens,
"max_input_tokens": study.trace.input_length_filter.max_input_tokens,
}
if study.trace.input_length_filter is not None
else None
),
},
"engine": { "engine": {
"engine_name": study.engine.engine_name, "engine_name": study.engine.engine_name,
"engine_version": study.engine.engine_version, "engine_version": study.engine.engine_version,
@@ -84,6 +100,8 @@ def build_prompt(
"Trial history:", "Trial history:",
json.dumps(history, ensure_ascii=False, indent=2), json.dumps(history, ensure_ascii=False, indent=2),
"", "",
"The proposal must beat the current incumbent. Do not propose a config that is only likely to be feasible below the current best_sampling_u/request_rate.",
"The evaluator for a new trial will start searching from the current best feasible sampling_u and only look for improvements above it.",
"The proposal should improve the maximum feasible sampling_u under the 95%+ SLO target.", "The proposal should improve the maximum feasible sampling_u under the 95%+ SLO target.",
] ]
return "\n".join(sections) return "\n".join(sections)
@@ -110,8 +128,22 @@ def validate_proposal(proposal: Proposal, study: StudySpec) -> Proposal:
return proposal
def _parse_json_object_text(text: str) -> dict[str, Any]:
try:
payload = json.loads(text)
except json.JSONDecodeError:
start = text.find("{")
end = text.rfind("}")
if start < 0 or end < start:
raise
payload = json.loads(text[start : end + 1])
if not isinstance(payload, dict):
raise SpecError("proposal payload must be a JSON object")
return payload
def parse_proposal_text(text: str, study: StudySpec) -> Proposal:
-payload = json.loads(text)
+payload = _parse_json_object_text(text)
proposal = Proposal.from_dict(payload)
return validate_proposal(proposal, study)
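The fallback above makes proposal parsing tolerant of LLM replies that wrap the JSON in prose or a code fence: when json.loads fails on the raw text, the outermost {...} span is extracted and parsed instead. A small illustration of what that slice recovers (the reply text is made up):

import json

reply = 'Here is the proposal:\n```json\n{"observation": "obs", "diagnosis": "diag"}\n```'
start, end = reply.find("{"), reply.rfind("}")
payload = json.loads(reply[start : end + 1])  # what the fallback hands to Proposal.from_dict
assert payload["observation"] == "obs"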

View File

@@ -142,6 +142,42 @@ class EngineLaunchSpec:
)
@dataclass(frozen=True)
class InputLengthFilterSpec:
min_input_tokens: int | None = None
max_input_tokens: int | None = None
@classmethod
def from_dict(cls, data: Mapping[str, Any], *, context: str) -> "InputLengthFilterSpec":
min_input_tokens = data.get("min_input_tokens")
max_input_tokens = data.get("max_input_tokens")
spec = cls(
min_input_tokens=(
_require_int(min_input_tokens, context=f"{context}.min_input_tokens")
if min_input_tokens is not None
else None
),
max_input_tokens=(
_require_int(max_input_tokens, context=f"{context}.max_input_tokens")
if max_input_tokens is not None
else None
),
)
if spec.min_input_tokens is None and spec.max_input_tokens is None:
raise SpecError(
f"{context} must define at least one of min_input_tokens/max_input_tokens."
)
if (
spec.min_input_tokens is not None
and spec.max_input_tokens is not None
and spec.min_input_tokens > spec.max_input_tokens
):
raise SpecError(
f"{context}.min_input_tokens must be <= {context}.max_input_tokens."
)
return spec
@dataclass(frozen=True)
class TraceSpec:
windows_path: str
@@ -150,6 +186,7 @@ class TraceSpec:
u_field: str
timestamp_field: str
max_concurrency: int
input_length_filter: InputLengthFilterSpec | None = None
max_requests_per_probe: int | None = None
synthetic_prompt_cap_tokens: int | None = None
replay_time_scale: float = 1.0
@@ -171,6 +208,17 @@ class TraceSpec:
max_concurrency=_require_int(
data.get("max_concurrency", 64), context="trace.max_concurrency"
),
input_length_filter=(
InputLengthFilterSpec.from_dict(
_require_mapping(
data.get("input_length_filter"),
context="trace.input_length_filter",
),
context="trace.input_length_filter",
)
if data.get("input_length_filter") is not None
else None
),
max_requests_per_probe=int(max_requests) if max_requests is not None else None,
synthetic_prompt_cap_tokens=(
int(synthetic_prompt_cap) if synthetic_prompt_cap is not None else None
@@ -454,6 +502,7 @@ class TrialSummary:
class StudyState:
study_id: str
best_trial_id: str | None = None
best_sampling_u: float | None = None
best_request_rate: float | None = None
next_trial_index: int = 1
trials: list[TrialSummary] = field(default_factory=list)
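InputLengthFilterSpec enforces the two invariants exercised by the new tests: at least one bound must be present, and min_input_tokens must not exceed max_input_tokens. A quick usage sketch, assuming the class is exported from aituner.spec like the other spec types:

from aituner.spec import InputLengthFilterSpec  # assumed export path

spec = InputLengthFilterSpec.from_dict(
    {"min_input_tokens": 0, "max_input_tokens": 8192},
    context="trace.input_length_filter",
)
assert (spec.min_input_tokens, spec.max_input_tokens) == (0, 8192)
# Swapping the bounds, or passing an empty mapping, raises SpecError instead.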

View File

@@ -32,6 +32,7 @@ class StudyStore:
return StudyState(
study_id=str(payload["study_id"]),
best_trial_id=payload.get("best_trial_id"),
best_sampling_u=payload.get("best_sampling_u"),
best_request_rate=payload.get("best_request_rate"),
next_trial_index=int(payload.get("next_trial_index", 1)),
trials=trials,
@@ -64,7 +65,18 @@ class StudyStore:
study_id=study.study_id,
trial_id=trial_id,
config_patch=proposal.config_patch,
-search=study.search,
+search=replace(
study.search,
low=min(
study.search.high,
max(
study.search.low,
float(state.best_sampling_u)
if isinstance(state.best_sampling_u, (int, float))
else study.search.low,
),
),
),
study_spec_path=str((self.study_root(study.study_id) / "study_spec.source").resolve()),
artifact_dir=str(trial_root),
probe_log_path=str(trial_root / "probe_history.json"),
@@ -89,6 +101,7 @@ class StudyStore:
by_id = {item.trial_id: item for item in state.trials}
trials_dir = self.study_root(study_id) / "trials"
best_trial_id = state.best_trial_id
best_sampling_u = state.best_sampling_u
best_rate = state.best_request_rate
for trial_dir in sorted(trials_dir.glob("trial-*")):
result_path = trial_dir / "result.json"
@@ -112,7 +125,13 @@ class StudyStore:
and (best_rate is None or summary.best_request_rate > best_rate)
):
best_rate = float(summary.best_request_rate)
best_sampling_u = (
float(summary.best_sampling_u)
if isinstance(summary.best_sampling_u, (int, float))
else None
)
best_trial_id = trial_id
state.best_sampling_u = best_sampling_u
state.best_request_rate = best_rate
state.best_trial_id = best_trial_id
self.save_state(state)
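The replace(...) call above clamps each new trial's search floor to the incumbent: the low bound becomes min(search.high, max(search.low, best_sampling_u)), so later trials only look for improvements above the current best. A worked example with the values used in the new test:

low, high = 0.0, 1.0          # study.search bounds
best_sampling_u = 0.375       # incumbent from trial-0001
new_low = min(high, max(low, best_sampling_u))
assert new_low == 0.375       # an incumbent above high would fall back to high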

View File

@@ -132,6 +132,25 @@ def _downsample_requests(
return [requests[idx] for idx in indexes]
def _matches_input_length_filter(study: StudySpec, *, prompt_tokens_hint: int | None) -> bool:
length_filter = study.trace.input_length_filter
if length_filter is None:
return True
if prompt_tokens_hint is None:
return False
if (
length_filter.min_input_tokens is not None
and prompt_tokens_hint < length_filter.min_input_tokens
):
return False
if (
length_filter.max_input_tokens is not None
and prompt_tokens_hint > length_filter.max_input_tokens
):
return False
return True
def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[WindowRecord, list[TraceRequest]]:
window = resolve_window_record(study, study_spec_path=study_spec_path)
time_scale = float(study.trace.replay_time_scale)
@@ -163,6 +182,8 @@ def load_trace_requests(study: StudySpec, *, study_spec_path: Path) -> tuple[Win
if isinstance(sampling_u, bool) or not isinstance(sampling_u, (int, float)):
raise TraceError(f"trace row {idx} is missing numeric {study.trace.u_field}")
prompt_tokens_hint = _coerce_prompt_tokens(row)
if not _matches_input_length_filter(study, prompt_tokens_hint=prompt_tokens_hint):
continue
try:
messages = _coerce_messages(row)
except TraceError:

View File

@@ -177,14 +177,19 @@ def _replay_requests(
if early_stopped:
break
if futures_by_request:
-timeout = None
+timeout = 0.5
if next_index < len(requests):
-timeout = max(0.0, requests[next_index].arrival_s - elapsed)
+timeout = min(timeout, max(0.0, requests[next_index].arrival_s - elapsed))
if max_elapsed_s is not None:
remaining_elapsed = max(0.0, max_elapsed_s - elapsed)
timeout = min(timeout, remaining_elapsed)
done, _ = wait(
list(futures_by_request),
timeout=timeout,
return_when=FIRST_COMPLETED,
)
if not done:
continue
for future in done:
request = futures_by_request.pop(future)
outcome = future.result()
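Previously wait() used timeout=None once all pending arrivals were submitted, so a probe could block on a slow in-flight request well past early_stop_max_elapsed_s. With this change the loop wakes at least every 0.5 s, or sooner if the next arrival or the remaining elapsed budget is closer. A worked example of the new timeout computation (values illustrative):

elapsed, next_arrival_s, max_elapsed_s = 0.8, 3.0, 1.0
timeout = 0.5
timeout = min(timeout, max(0.0, next_arrival_s - elapsed))  # next arrival is 2.2 s away
timeout = min(timeout, max(0.0, max_elapsed_s - elapsed))   # only 0.2 s of budget left
assert abs(timeout - 0.2) < 1e-9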

View File

@@ -13,7 +13,7 @@ from aituner.job import append_job, build_trial_job
from aituner.llm import build_prompt, parse_proposal_text
from aituner.search import ThresholdProbe, binary_search_max_feasible
from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
-from aituner.spec import Proposal, StudyState, TrialSummary, load_study_spec
+from aituner.spec import Proposal, SpecError, StudyState, TrialSummary, load_study_spec
from aituner.store import StudyStore
from aituner.trace import load_trace_requests, summarize_window
from aituner.worker import (
@@ -25,7 +25,9 @@ from aituner.worker import (
from aituner.trace import TraceRequest
-def _write_study_assets(tmp_path: Path) -> Path:
+def _write_study_assets(
tmp_path: Path, *, trace_overrides: dict[str, object] | None = None
) -> Path:
trace_dir = tmp_path / "trace_windows" / "traces"
trace_dir.mkdir(parents=True)
trace_path = trace_dir / "chat_w1.jsonl"
@@ -81,6 +83,16 @@ def _write_study_assets(tmp_path: Path) -> Path:
)
study_path = tmp_path / "study.json"
trace_payload: dict[str, object] = {
"windows_path": str(windows_path),
"window_id": "chat_w1",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 4,
}
if trace_overrides:
trace_payload.update(trace_overrides)
study_payload = {
"study_id": "study-1",
"hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
@@ -105,13 +117,7 @@ def _write_study_assets(tmp_path: Path) -> Path:
"tunable_flags": ["tensor-parallel-size", "max-num-seqs"], "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
"python_executable": "python3" "python_executable": "python3"
}, },
"trace": { "trace": trace_payload,
"windows_path": str(windows_path),
"window_id": "chat_w1",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 4
},
"slo": { "slo": {
"target_pass_rate": 0.95, "target_pass_rate": 0.95,
"ttft_rule": { "ttft_rule": {
@@ -161,9 +167,53 @@ class CoreFlowTests(unittest.TestCase):
)
self.assertIn("allowed_flag_keys", prompt)
self.assertIn("study-1", prompt)
self.assertIn('"current_best"', prompt)
self.assertIn("queueing_knee_by_bucket", prompt)
self.assertTrue(study_root.exists())
def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(
tmp_path,
trace_overrides={
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192,
}
},
)
study = load_study_spec(study_path)
window, requests = load_trace_requests(study, study_spec_path=study_path)
summary = summarize_window(requests, window)
self.assertEqual(len(requests), 2)
self.assertEqual([item.prompt_tokens_hint for item in requests], [1000, 5000])
self.assertEqual(summary["request_count"], 2)
self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
prompt = build_prompt(
study=study,
window_summary=summary,
state=StudyState(study_id=study.study_id),
capability_profile=None,
)
self.assertIn('"input_length_filter"', prompt)
self.assertIn('"max_input_tokens": 8192', prompt)
def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(
tmp_path,
trace_overrides={
"input_length_filter": {
"min_input_tokens": 8193,
"max_input_tokens": 8192,
}
},
)
with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
load_study_spec(study_path)
def test_prompt_includes_failed_trial_context(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -652,8 +702,36 @@ class CoreFlowTests(unittest.TestCase):
)
next_state = store.ingest_trial_results(study.study_id)
self.assertEqual(next_state.best_trial_id, trial.trial_id)
self.assertEqual(next_state.best_sampling_u, 0.75)
self.assertEqual(next_state.best_request_rate, 12.5)
def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
store = StudyStore(tmp_path / ".aituner" / "studies")
store.init_study(spec_path=study_path, study=study)
state = StudyState(
study_id=study.study_id,
best_trial_id="trial-0001",
best_sampling_u=0.375,
best_request_rate=3.0,
next_trial_index=2,
trials=[],
)
proposal = Proposal.from_dict(
{
"observation": "Obs",
"diagnosis": "Diag",
"config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
"expected_effects": ["raise rate"],
}
)
trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
self.assertEqual(trial.search.low, 0.375)
self.assertEqual(trial.search.high, 1.0)
def test_ingest_trial_results_records_failure_reason(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -764,6 +842,7 @@ class CoreFlowTests(unittest.TestCase):
store = StudyStore(store_root)
state = store.load_state("study-1")
self.assertEqual(state.best_trial_id, "trial-0002")
self.assertEqual(state.best_sampling_u, 0.75)
self.assertEqual(state.best_request_rate, 2.0)
self.assertEqual(state.next_trial_index, 3)
@@ -795,6 +874,20 @@ class CoreFlowTests(unittest.TestCase):
["throughput: higher", "ttft: lower"], ["throughput: higher", "ttft: lower"],
) )
def test_parse_proposal_text_accepts_wrapped_json(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
proposal = parse_proposal_text(
"""Here is the proposal:
```json
{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
```""",
study,
)
self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32)
def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
requests = [
TraceRequest(
@@ -929,6 +1022,71 @@ class CoreFlowTests(unittest.TestCase):
self.assertEqual(len(replayed), 2)
self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")
def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:
requests = [
TraceRequest(
row_id="r0",
arrival_s=0.0,
sampling_u=0.1,
body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
prompt_tokens_hint=8,
completion_tokens_hint=4,
)
]
class FakeFuture:
def result(self, timeout=None):
raise AssertionError("future should not be awaited after elapsed early stop")
def cancel(self):
return True
submitted = []
class FakeExecutor:
def __init__(self, max_workers):
self.max_workers = max_workers
def submit(self, fn, request, **kwargs):
submitted.append(request.row_id)
return FakeFuture()
def shutdown(self, wait=False, cancel_futures=True):
return None
wait_timeouts: list[float] = []
def fake_wait(futures, timeout=None, return_when=None):
wait_timeouts.append(timeout)
return set(), set(futures)
def fake_evaluate(outcome: RequestOutcome):
return type("Eval", (), {"passed": outcome.success})()
monotonic_values = iter([0.0, 0.0, 0.4, 1.2])
with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor):
with mock.patch("aituner.worker.wait", side_effect=fake_wait):
with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)):
replayed, early_stopped, reason = _replay_requests(
requests,
base_url="http://127.0.0.1:8000",
timeout_s=30.0,
max_concurrency=1,
target_pass_rate=0.95,
max_lag_s=None,
max_elapsed_s=1.0,
evaluate_outcome=fake_evaluate,
)
self.assertEqual(submitted, ["r0"])
self.assertTrue(early_stopped)
self.assertEqual(reason, "probe_elapsed_s>1.0")
self.assertEqual(len(replayed), 1)
self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")
self.assertTrue(wait_timeouts)
self.assertLessEqual(wait_timeouts[0], 0.5)
def test_latency_summary_reports_quantiles_and_slo(self) -> None:
study = load_study_spec(_write_study_assets(Path(tempfile.mkdtemp())))
outcomes = [