Add ReplayServe Frontier vLLM alignment report

2026-06-25 17:10:30 +08:00
commit a99bd00782
63 changed files with 17033 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,6 @@
 __pycache__/
 *.pyc
 .cache/
 .deps/
 .venv/
 runs/
--- a/README.md
+++ b/README.md
@@ -0,0 +1,109 @@
 # ReplayServe
 ReplayServe is a small trace-replay workspace for reproducing real LLM serving
 traces in open simulators. The first target is Frontier trace replay with
 timestamp, prompt length, decode length, session, and prefix block reuse
 preserved from the Qwen Bailian anonymized JSONL traces.
 RS0 only bootstraps the repository, documents source versions, implements the
 Qwen JSONL to Frontier CSV adapter semantics, and creates canonical fixtures.
 It does not run the Frontier simulator; RS1 owns simulator smoke runs.
 ## First Frontier Smoke Point
 The first Frontier smoke is fixed to this plumbing-only configuration:
 - `simulation_mode=online`
 - `sys_arch=co-location`
 - `replica_scheduler=vllm_v1`
 - `device=a800`
 - `model_name=Qwen/Qwen3-32B`
 - `attn_tensor_parallel_size=2`
 - dummy execution predictor
 - analytical communication backend
 - `trace_request_generator_config_max_tokens=32768`
 - prefix cache enabled
 - block size 16
 - chunked prefill enabled
 - batch cap 128
 - max batch tokens 32768
 - KV capacity estimated by Frontier memory planner
 Frontier currently has A800 network profiles, but the checked public A800
 compute profiles do not include dense `Qwen/Qwen3-32B`. RS1 latency and
 throughput numbers from this point are therefore plumbing smoke only, not
 profile-faithful performance conclusions.
 ## Real vLLM GPU Baseline
 RS4 starts a real-backend baseline on dash2 H20 with
 `/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`. The runner
 `tools/vllm_synthetic_replay.py` uses synthetic `prompt_token_ids` derived from
 the Qwen block hashes, so equal trace blocks become equal token blocks and vLLM
 prefix-cache hits can be observed directly.
 See `docs/rs4_vllm_gpu_smoke.md` for the first TP=1 and TP=2 smoke results and
 the reliability boundary.
 ## Adapter Semantics
 `tools/qwen_to_frontier.py` converts Qwen JSONL rows to Frontier CSV rows and a
 ReplayServe sidecar JSONL.
 Field mapping:
 | Qwen JSONL | Frontier CSV | Notes |
 |---|---|---|
 | `timestamp` | `arrived_at` | Trace-relative seconds. |
 | `input_length` | `num_prefill_tokens` | Already post chat-template serving input. |
 | `output_length` | `num_decode_tokens` | Generation length. |
 | `chat_id` | `session_id` | Preserved for session-aware analysis. |
 | `hash_ids` | `block_hash_ids` | Joined with `\|` for Frontier. |
 The Qwen trace uses 16-token salted SipHash blocks. The adapter asserts
 `len(hash_ids) == ceil(input_length / block_size)`. The final block can be a
 padded partial block; its true token count is `input_length % block_size`, or
 `block_size` when the prompt length is divisible by the block size. The sidecar
 records `block_token_counts` so downstream analyses can compute token-weighted
 prefix-cache accounting while Frontier replays the original block hashes.
 Overflow handling is intentionally explicit. The adapter never clips prompt or
 decode tokens. With `--fail-on-overflow`, the adapter exits with an error
 before publishing any output files if any row has
 `input_length + output_length > --max-tokens`.
 Example:
 ```bash
 python3 tools/qwen_to_frontier.py \
  --input /home/gahow/phd/qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl \
  --frontier-csv traces/fixtures/coder_100/frontier.csv \
  --sidecar-jsonl traces/fixtures/coder_100/sidecar.jsonl \
  --source-jsonl traces/fixtures/coder_100/source.jsonl \
  --manifest-json traces/fixtures/coder_100/manifest.json \
  --fixture-name coder_100 \
  --limit 100 \
  --max-tokens 32768 \
  --block-size 16 \
  --fail-on-overflow
 ```
 Validate fixtures:
 ```bash
 python3 tools/validate_fixtures.py \
  traces/fixtures/coder_100 \
  traces/fixtures/coder_2000 \
  --max-tokens 32768 \
  --block-size 16
 ```
 ## Fixture Layout
 Each fixture directory under `traces/fixtures/` contains:
 - `source.jsonl`: the original Qwen JSONL slice.
 - `frontier.csv`: Frontier trace replay CSV.
 - `sidecar.jsonl`: ReplayServe metadata with original fields and block token
  counts.
 - `manifest.json`: generation parameters and basic stats.
--- a/configs/rs10_frontier_h20_tp1_profile_full32k_coder200_ts2_ts3.json
+++ b/configs/rs10_frontier_h20_tp1_profile_full32k_coder200_ts2_ts3.json
@@ -0,0 +1,60 @@
 {
  "suite_id": "rs10_frontier_h20_tp1_profile_full32k_coder200_ts2_ts3",
  "sim": "frontier_h20_tp1_profile_full32k",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patches": [
      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
    ],
    "profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
  },
  "fixtures": [
    "coder_200_ts2",
    "coder_200_ts3"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 1,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "explicit",
    "num_blocks": 15281,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "decode_cuda_graph_mode": "none",
    "enable_dummy_mode": false,
    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
    "prediction_max_prefill_chunk_size": 18000,
    "prediction_max_batch_size": 128,
    "prediction_max_tokens_per_request": 32768,
    "skip_cpu_overhead_modeling": true,
    "no_cache": true,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "vllm_kv_15281_profile_full32k",
      "description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; 200-request replay with timestamps scaled by 2 and 3."
    }
  ]
 }
--- a/configs/rs11_frontier_h20_tp2_profile_full32k_coder200_ts2_ts3.json
+++ b/configs/rs11_frontier_h20_tp2_profile_full32k_coder200_ts2_ts3.json
@@ -0,0 +1,61 @@
 {
  "suite_id": "rs11_frontier_h20_tp2_profile_full32k_coder200_ts2_ts3",
  "sim": "frontier_h20_tp2_profile_full32k",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patches": [
      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
    ],
    "profile_source": "dash1:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp2_tp4_qwen3_30ba3b_full32k_20260625",
    "profile_note": "Timing rows include H20 TP2 and TP4 for attention/MoE and TP1/TP2/TP4 for linear ops; TP2 runs use explicit TP2 vLLM KV capacity."
  },
  "fixtures": [
    "coder_200_ts2",
    "coder_200_ts3"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 2,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 2,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "explicit",
    "num_blocks": 69055,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "decode_cuda_graph_mode": "none",
    "enable_dummy_mode": false,
    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_tp2_tp4_full32k.csv",
    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_tp2_tp4_combined.csv",
    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_tp2_tp4_full32k.csv",
    "prediction_max_prefill_chunk_size": 18000,
    "prediction_max_batch_size": 128,
    "prediction_max_tokens_per_request": 32768,
    "skip_cpu_overhead_modeling": true,
    "no_cache": true,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "vllm_kv_69055_profile_full32k",
      "description": "H20 TP2 Qwen3-30B-A3B with explicit vLLM TP2 KV blocks and H20 TP2 CUDA_EVENT profile timing."
    }
  ]
 }
--- a/configs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3.json
+++ b/configs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3.json
@@ -0,0 +1,74 @@
 {
  "suite_id": "rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3",
  "sim": "frontier_h20_tp2_tp4_profile_full32k",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patches": [
      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
    ],
    "profile_source": "dash1:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp2_tp4_qwen3_30ba3b_full32k_20260625_true_mixed"
  },
  "fixtures": [
    "coder_200_ts2",
    "coder_200_ts3"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 1,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "explicit",
    "num_blocks": 0,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "decode_cuda_graph_mode": "none",
    "enable_dummy_mode": false,
    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_tp2_tp4_full32k.csv",
    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_tp2_tp4_combined.csv",
    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_tp2_tp4_full32k.csv",
    "prediction_max_prefill_chunk_size": 18000,
    "prediction_max_batch_size": 128,
    "prediction_max_tokens_per_request": 32768,
    "skip_cpu_overhead_modeling": true,
    "no_cache": false,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "tp2_vllm_kv_69055_profile_full32k",
      "description": "H20 TP2 Qwen3-30B-A3B with explicit vLLM TP2 KV blocks and H20 TP2 CUDA_EVENT profile timing.",
      "overrides": {
        "attn_tensor_parallel_size": 2,
        "moe_tensor_parallel_size": 2,
        "num_blocks": 69055
      }
    },
    {
      "id": "tp4_vllm_kv_177077_profile_full32k",
      "description": "H20 TP4 Qwen3-30B-A3B with explicit vLLM TP4 KV blocks and H20 TP4 CUDA_EVENT profile timing.",
      "overrides": {
        "attn_tensor_parallel_size": 4,
        "moe_tensor_parallel_size": 4,
        "num_blocks": 177077
      }
    }
  ]
 }
--- a/configs/rs3_tiny_sweep.json
+++ b/configs/rs3_tiny_sweep.json
@@ -0,0 +1,51 @@
 {
  "suite_id": "rs3_tiny_smoke",
  "sim": "frontier_patched",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patch": "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch"
  },
  "fixtures": [
    "coder_100"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "Qwen/Qwen3-32B",
    "device": "a800",
    "network_device": "a800_dgx",
    "attn_tensor_parallel_size": 2,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 128,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 64,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "memory_planner",
    "gpu_memory_utilization": 0.9,
    "non_kv_cache_overhead_bytes": 0,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "fixed_prefix_on",
      "description": "RS1 fixed config on patched Frontier scratch."
    },
    {
      "id": "prefix_cache_off",
      "description": "Diagnosis/control config with prefix cache disabled; all other fixed scheduler knobs unchanged.",
      "overrides": {
        "enable_prefix_caching": false
      }
    }
  ]
 }
--- a/configs/rs4_frontier_h20_tp1.json
+++ b/configs/rs4_frontier_h20_tp1.json
@@ -0,0 +1,53 @@
 {
  "suite_id": "rs4_frontier_h20_tp1",
  "sim": "frontier_h20_tp1",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patch": "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch"
  },
  "fixtures": [
    "coder_100"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 1,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "memory_planner",
    "num_blocks": null,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "planner_kv",
      "description": "H20 TP1 Qwen3-30B-A3B with Frontier memory planner KV capacity."
    },
    {
      "id": "vllm_kv_15281",
      "description": "H20 TP1 Qwen3-30B-A3B with explicit KV blocks matching real vLLM TP1 on dash2.",
      "overrides": {
        "num_blocks_mode": "explicit",
        "num_blocks": 15281
      }
    }
  ]
 }
--- a/configs/rs5_frontier_h20_tp1_profile.json
+++ b/configs/rs5_frontier_h20_tp1_profile.json
@@ -0,0 +1,59 @@
 {
  "suite_id": "rs5_frontier_h20_tp1_profile",
  "sim": "frontier_h20_tp1_profile",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patches": [
      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
    ],
    "profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_20260624"
  },
  "fixtures": [
    "coder_100"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 1,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "explicit",
    "num_blocks": 15281,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "decode_cuda_graph_mode": "none",
    "enable_dummy_mode": false,
    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op.csv",
    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_vllm_fused.csv",
    "prediction_max_prefill_chunk_size": 18000,
    "prediction_max_batch_size": 128,
    "prediction_max_tokens_per_request": 32768,
    "skip_cpu_overhead_modeling": true,
    "no_cache": true,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "vllm_kv_15281_profile",
      "description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing."
    }
  ]
 }
--- a/configs/rs6_frontier_h20_tp1_profile_full32k.json
+++ b/configs/rs6_frontier_h20_tp1_profile_full32k.json
@@ -0,0 +1,59 @@
 {
  "suite_id": "rs6_frontier_h20_tp1_profile_full32k",
  "sim": "frontier_h20_tp1_profile_full32k",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patches": [
      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
    ],
    "profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
  },
  "fixtures": [
    "coder_100"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 1,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "explicit",
    "num_blocks": 15281,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "decode_cuda_graph_mode": "none",
    "enable_dummy_mode": false,
    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
    "prediction_max_prefill_chunk_size": 18000,
    "prediction_max_batch_size": 128,
    "prediction_max_tokens_per_request": 32768,
    "skip_cpu_overhead_modeling": true,
    "no_cache": true,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "vllm_kv_15281_profile_full32k",
      "description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; linear and MoE coverage extended to 32768 tokens."
    }
  ]
 }
--- a/configs/rs8_frontier_h20_tp1_profile_full32k_coder500.json
+++ b/configs/rs8_frontier_h20_tp1_profile_full32k_coder500.json
@@ -0,0 +1,59 @@
 {
  "suite_id": "rs8_frontier_h20_tp1_profile_full32k_coder500",
  "sim": "frontier_h20_tp1_profile_full32k",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patches": [
      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
    ],
    "profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
  },
  "fixtures": [
    "coder_500"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 1,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "explicit",
    "num_blocks": 15281,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "decode_cuda_graph_mode": "none",
    "enable_dummy_mode": false,
    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
    "prediction_max_prefill_chunk_size": 18000,
    "prediction_max_batch_size": 128,
    "prediction_max_tokens_per_request": 32768,
    "skip_cpu_overhead_modeling": true,
    "no_cache": true,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "vllm_kv_15281_profile_full32k",
      "description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; 500-request replay stress."
    }
  ]
 }
--- a/configs/rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667.json
+++ b/configs/rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667.json
@@ -0,0 +1,59 @@
 {
  "suite_id": "rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667",
  "sim": "frontier_h20_tp1_profile_full32k",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patches": [
      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
    ],
    "profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
  },
  "fixtures": [
    "coder_200_ts0667"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 1,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "explicit",
    "num_blocks": 15281,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "decode_cuda_graph_mode": "none",
    "enable_dummy_mode": false,
    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
    "prediction_max_prefill_chunk_size": 18000,
    "prediction_max_batch_size": 128,
    "prediction_max_tokens_per_request": 32768,
    "skip_cpu_overhead_modeling": true,
    "no_cache": true,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "vllm_kv_15281_profile_full32k",
      "description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; 200-request replay with timestamps scaled by 2/3."
    }
  ]
 }
--- a/docs/assets/frontier_vllm_alignment/completion_prefix.png
+++ b/docs/assets/frontier_vllm_alignment/completion_prefix.png
--- a/docs/assets/frontier_vllm_alignment/frontier_vllm_alignment.csv
+++ b/docs/assets/frontier_vllm_alignment/frontier_vllm_alignment.csv
@@ -0,0 +1,10 @@
 run_id,label,tp,request_count,scale_label,scale_value,fixture,kv_blocks,frontier_completed,frontier_total,frontier_complete,vllm_completed,vllm_total,frontier_preemptions,vllm_preemptions,frontier_prefix_hit,vllm_prefix_hit,prefix_hit_delta,frontier_rps,vllm_rps,rps_ratio,frontier_total_tps,vllm_total_tps,total_tps_ratio,frontier_decode_tps,vllm_decode_tps,decode_tps_ratio,frontier_ttft_p50_s,vllm_ttft_p50_s,ttft_p50_ratio,frontier_ttft_p95_s,vllm_ttft_p95_s,ttft_p95_ratio,frontier_tpot_p50_s,vllm_tpot_p50_s,tpot_p50_ratio,frontier_tpot_p95_s,vllm_tpot_p95_s,tpot_p95_ratio,frontier_e2e_p50_s,vllm_e2e_p50_s,e2e_p50_ratio,frontier_e2e_p95_s,vllm_e2e_p95_s,e2e_p95_ratio,notes
 tp1_n100_scale1,TP1 N100 raw,1,100,raw,1,coder_100,15281,96,100,false,100,100,0,8,0.2487845616,0.2510820686,-0.002297507075,0.4048148795,0.6879880691,0.588403924,2348.908821,3832.320581,0.6129207541,347.7992338,567.4456795,0.6129207541,0.9087481136,4.503025495,0.201808343,12.76295815,29.06046906,0.4391862402,0.05688966428,0.06608134396,0.8609035603,0.1456880793,0.6211491471,0.2345460505,30.93928316,41.84076733,0.7394530534,119.6361376,97.36622969,1.228723121,Frontier incomplete before lifecycle fix; included as TP1 100-request baseline.
 tp1_n500_scale1,TP1 N500 raw,1,500,raw,1,coder_500,15281,439,500,false,500,500,0,63,0.1192374692,0.3868498695,-0.2676124002,0.660990472,0.8401719451,0.7867323776,4733.748762,5282.903731,0.896050544,656.2204998,732.3476384,0.896050544,136.7755789,185.6581683,0.7367064976,340.2371222,375.8950067,0.9051387119,0.05643274739,0.04975253624,1.134268756,0.08942839773,0.0918798539,0.9733188935,177.7998574,224.2697872,0.7927945162,397.29145,417.3562933,0.9519239469,Frontier incomplete; useful as high-pressure stress signal.
 tp1_n200_scale0667,TP1 N200 scale 0.667,1,200,0.667,0.6666666667,coder_200_ts0667,15281,176,200,false,200,200,0,26,0.170276008,0.2697549478,-0.09947893984,0.5830903706,0.8236788215,0.7079098737,3913.437526,4864.778909,0.8044430383,593.287826,737.51378,0.8044430383,20.58014532,34.56323652,0.595434554,96.71793818,120.8039818,0.800618794,0.05837096651,0.05145431897,1.13442307,0.235894569,0.2534757496,0.9306395954,73.20731169,83.6219905,0.875455263,189.2402903,183.726977,1.030008186,Dense-arrival run; Frontier incomplete before lifecycle fix.
 tp1_n200_scale2,TP1 N200 scale 2,1,200,2,2,coder_200_ts2,15281,200,200,true,200,200,33,43,0.23134169,0.2697549478,-0.03841325784,0.5936627655,0.8029813635,0.7393232178,3506.267279,4742.53641,0.7393232178,531.5597036,718.9814831,0.7393232178,9.595321274,9.216767096,1.041072338,77.50341053,69.21141595,1.119806747,0.05421362546,0.04970337519,1.09074334,0.06653162646,0.06863309532,0.9693811149,61.45769412,55.00248734,1.117362088,174.4840836,142.3375087,1.225847531,After Frontier decode-preemption lifecycle fix.
 tp1_n200_scale3,TP1 N200 scale 3,1,200,3,3,coder_200_ts3,15281,200,200,true,200,200,20,16,0.2176751278,0.2697549478,-0.05207982007,0.5739781652,0.7802265504,0.735655772,3390.00688,4608.142843,0.735655772,513.9343094,698.607051,0.735655772,1.001474116,1.166151478,0.8587856162,45.9466567,32.25842447,1.424330464,0.05339333437,0.04616159714,1.156661331,0.06861254671,0.0713836296,0.9611804148,44.76058145,33.21267588,1.34769573,154.5483135,122.7887113,1.258652459,After Frontier decode-preemption lifecycle fix.
 tp2_n200_scale2,TP2 N200 scale 2,2,200,2,2,coder_200_ts2,69055,200,200,true,200,200,0,0,0.2697549478,0.2697549478,0,0.7756823572,1.277818683,0.607036325,4581.304111,7547.001591,0.607036325,694.5382258,1144.14607,0.607036325,0.2690959621,0.225119116,1.195349231,6.744624223,0.715071776,9.432094022,0.04295527658,0.03004499679,1.429698158,0.05288764732,0.04340382318,1.218502046,26.05122482,16.44861007,1.583794905,106.7591651,72.5347179,1.471835394,Uses true-mixed TP2/TP4 attention profile.
 tp2_n200_scale3,TP2 N200 scale 3,2,200,3,3,coder_200_ts3,69055,200,200,true,200,200,0,0,0.2697549478,0.2697549478,0,0.6877705321,1.088050278,0.6321128225,4062.082806,6426.199028,0.6321128225,615.8228567,974.2293382,0.6321128225,0.1341535495,0.153530943,0.8737883511,0.5741378218,0.6270455511,0.9156237864,0.03937896849,0.01905767256,2.06630523,0.04670767225,0.02799082097,1.668678182,21.78596494,9.956003374,2.188223941,101.5918393,53.98348621,1.881905864,Uses true-mixed TP2/TP4 attention profile.
 tp4_n200_scale2,TP4 N200 scale 2,4,200,2,2,coder_200_ts2,177077,200,200,true,200,200,0,0,0.2697549478,0.2697549478,0,0.8525337931,1.536203537,0.5549614829,5035.200987,9073.063884,0.5549614829,763.350233,1375.501285,0.5549614829,0.09755515041,0.1704972619,0.5721801589,0.3856872342,1.419861408,0.2716372401,0.03366585047,0.01634437735,2.059781767,0.03838265621,0.02831690026,1.355468143,18.65216282,9.260885488,2.014079846,84.93775414,43.62188903,1.947136083,Uses true-mixed TP2/TP4 attention profile.
 tp4_n200_scale3,TP4 N200 scale 3,4,200,3,3,coder_200_ts3,177077,200,200,true,200,200,0,0,0.2697549478,0.2697549478,0,0.7373665172,1.253504493,0.5882440162,4355.004629,7403.398096,0.5882440162,660.2306059,1122.375388,0.5882440162,0.08859749135,0.100106278,0.885034317,0.3458954617,0.3184188101,1.086290919,0.03106778109,0.009410284212,3.301471071,0.03578285082,0.01279276668,2.79711588,16.90291941,5.54948732,3.045852424,83.00995365,27.86907583,2.978568581,Uses true-mixed TP2/TP4 attention profile.
--- a/docs/assets/frontier_vllm_alignment/frontier_vllm_alignment.json
+++ b/docs/assets/frontier_vllm_alignment/frontier_vllm_alignment.json
@@ -0,0 +1,434 @@
 [
  {
    "decode_tps_ratio": 0.6129207541251469,
    "e2e_p50_ratio": 0.7394530533987747,
    "e2e_p95_ratio": 1.2287231205931113,
    "fixture": "coder_100",
    "frontier_complete": false,
    "frontier_completed": 96,
    "frontier_decode_tps": 347.79923381681954,
    "frontier_e2e_p50_s": 30.939283157873398,
    "frontier_e2e_p95_s": 119.6361375789676,
    "frontier_preemptions": 0,
    "frontier_prefix_hit": 0.24878456156190046,
    "frontier_rps": 0.4048148795016268,
    "frontier_total": 100,
    "frontier_total_tps": 2348.908820556559,
    "frontier_tpot_p50_s": 0.056889664283438265,
    "frontier_tpot_p95_s": 0.14568807925543142,
    "frontier_ttft_p50_s": 0.9087481136376141,
    "frontier_ttft_p95_s": 12.762958146117297,
    "kv_blocks": 15281,
    "label": "TP1 N100 raw",
    "notes": "Frontier incomplete before lifecycle fix; included as TP1 100-request baseline.",
    "prefix_hit_delta": -0.0022975070751777016,
    "request_count": 100,
    "rps_ratio": 0.5884039239601411,
    "run_id": "tp1_n100_scale1",
    "scale_label": "raw",
    "scale_value": 1.0,
    "total_tps_ratio": 0.6129207541251469,
    "tp": 1,
    "tpot_p50_ratio": 0.8609035602986401,
    "tpot_p95_ratio": 0.23454605053898236,
    "ttft_p50_ratio": 0.20180834300191677,
    "ttft_p95_ratio": 0.439186240241972,
    "vllm_completed": 100,
    "vllm_decode_tps": 567.445679520595,
    "vllm_e2e_p50_s": 41.84076732886024,
    "vllm_e2e_p95_s": 97.36622968502343,
    "vllm_preemptions": 8,
    "vllm_prefix_hit": 0.25108206863707816,
    "vllm_rps": 0.6879880691092217,
    "vllm_total": 100,
    "vllm_total_tps": 3832.3205810011714,
    "vllm_tpot_p50_s": 0.06608134395878643,
    "vllm_tpot_p95_s": 0.6211491471318447,
    "vllm_ttft_p50_s": 4.503025494981557,
    "vllm_ttft_p95_s": 29.060469059972093
  },
  {
    "decode_tps_ratio": 0.8960505440100501,
    "e2e_p50_ratio": 0.7927945162118318,
    "e2e_p95_ratio": 0.951923946910999,
    "fixture": "coder_500",
    "frontier_complete": false,
    "frontier_completed": 439,
    "frontier_decode_tps": 656.2204997652797,
    "frontier_e2e_p50_s": 177.7998574092898,
    "frontier_e2e_p95_s": 397.29145000151055,
    "frontier_preemptions": 0,
    "frontier_prefix_hit": 0.11923746923408568,
    "frontier_rps": 0.6609904720097601,
    "frontier_total": 500,
    "frontier_total_tps": 4733.748762075876,
    "frontier_tpot_p50_s": 0.05643274739314083,
    "frontier_tpot_p95_s": 0.08942839772817235,
    "frontier_ttft_p50_s": 136.77557892500107,
    "frontier_ttft_p95_s": 340.237122196321,
    "kv_blocks": 15281,
    "label": "TP1 N500 raw",
    "notes": "Frontier incomplete; useful as high-pressure stress signal.",
    "prefix_hit_delta": -0.2676124002320734,
    "request_count": 500,
    "rps_ratio": 0.786732377640824,
    "run_id": "tp1_n500_scale1",
    "scale_label": "raw",
    "scale_value": 1.0,
    "total_tps_ratio": 0.8960505440100502,
    "tp": 1,
    "tpot_p50_ratio": 1.134268756171416,
    "tpot_p95_ratio": 0.9733188934802052,
    "ttft_p50_ratio": 0.736706497600023,
    "ttft_p95_ratio": 0.9051387119015346,
    "vllm_completed": 500,
    "vllm_decode_tps": 732.3476383692921,
    "vllm_e2e_p50_s": 224.26978715602309,
    "vllm_e2e_p95_s": 417.3562933159992,
    "vllm_preemptions": 63,
    "vllm_prefix_hit": 0.38684986946615907,
    "vllm_rps": 0.8401719451179492,
    "vllm_total": 500,
    "vllm_total_tps": 5282.903730956031,
    "vllm_tpot_p50_s": 0.049752536236317216,
    "vllm_tpot_p95_s": 0.09187985389702198,
    "vllm_ttft_p50_s": 185.6581683079712,
    "vllm_ttft_p95_s": 375.8950067239348
  },
  {
    "decode_tps_ratio": 0.8044430383408974,
    "e2e_p50_ratio": 0.8754552629577944,
    "e2e_p95_ratio": 1.030008185534932,
    "fixture": "coder_200_ts0667",
    "frontier_complete": false,
    "frontier_completed": 176,
    "frontier_decode_tps": 593.287826008356,
    "frontier_e2e_p50_s": 73.20731168652793,
    "frontier_e2e_p95_s": 189.24029025053343,
    "frontier_preemptions": 0,
    "frontier_prefix_hit": 0.17027600800712456,
    "frontier_rps": 0.5830903705506575,
    "frontier_total": 200,
    "frontier_total_tps": 3913.43752605849,
    "frontier_tpot_p50_s": 0.05837096651496554,
    "frontier_tpot_p95_s": 0.23589456903741046,
    "frontier_ttft_p50_s": 20.58014532403832,
    "frontier_ttft_p95_s": 96.7179381828816,
    "kv_blocks": 15281,
    "label": "TP1 N200 scale 0.667",
    "notes": "Dense-arrival run; Frontier incomplete before lifecycle fix.",
    "prefix_hit_delta": -0.09947893983522305,
    "request_count": 200,
    "rps_ratio": 0.7079098737399896,
    "run_id": "tp1_n200_scale0667",
    "scale_label": "0.667",
    "scale_value": 0.6666666666666666,
    "total_tps_ratio": 0.8044430383408974,
    "tp": 1,
    "tpot_p50_ratio": 1.1344230703885074,
    "tpot_p95_ratio": 0.930639595403931,
    "ttft_p50_ratio": 0.5954345540217358,
    "ttft_p95_ratio": 0.800618794003408,
    "vllm_completed": 200,
    "vllm_decode_tps": 737.5137800085473,
    "vllm_e2e_p50_s": 83.62199050490744,
    "vllm_e2e_p95_s": 183.7269770358689,
    "vllm_preemptions": 26,
    "vllm_prefix_hit": 0.2697549478423476,
    "vllm_rps": 0.8236788215286605,
    "vllm_total": 200,
    "vllm_total_tps": 4864.778908559713,
    "vllm_tpot_p50_s": 0.051454318973762715,
    "vllm_tpot_p95_s": 0.2534757495838373,
    "vllm_ttft_p50_s": 34.563236522022635,
    "vllm_ttft_p95_s": 120.80398175423034
  },
  {
    "decode_tps_ratio": 0.7393232177681209,
    "e2e_p50_ratio": 1.1173620884379967,
    "e2e_p95_ratio": 1.2258475306262637,
    "fixture": "coder_200_ts2",
    "frontier_complete": true,
    "frontier_completed": 200,
    "frontier_decode_tps": 531.5597035900641,
    "frontier_e2e_p50_s": 61.45769412455945,
    "frontier_e2e_p95_s": 174.48408358603848,
    "frontier_preemptions": 33,
    "frontier_prefix_hit": 0.23134168999974056,
    "frontier_rps": 0.5936627654877362,
    "frontier_total": 200,
    "frontier_total_tps": 3506.267279013048,
    "frontier_tpot_p50_s": 0.054213625462090735,
    "frontier_tpot_p95_s": 0.06653162646338621,
    "frontier_ttft_p50_s": 9.595321273711544,
    "frontier_ttft_p95_s": 77.50341053197451,
    "kv_blocks": 15281,
    "label": "TP1 N200 scale 2",
    "notes": "After Frontier decode-preemption lifecycle fix.",
    "prefix_hit_delta": -0.038413257842607046,
    "request_count": 200,
    "rps_ratio": 0.7393232177681209,
    "run_id": "tp1_n200_scale2",
    "scale_label": "2",
    "scale_value": 2.0,
    "total_tps_ratio": 0.7393232177681209,
    "tp": 1,
    "tpot_p50_ratio": 1.0907433399899442,
    "tpot_p95_ratio": 0.9693811149298648,
    "ttft_p50_ratio": 1.0410723384685256,
    "ttft_p95_ratio": 1.1198067467817787,
    "vllm_completed": 200,
    "vllm_decode_tps": 718.9814830849542,
    "vllm_e2e_p50_s": 55.002487340942025,
    "vllm_e2e_p95_s": 142.3375087250024,
    "vllm_preemptions": 43,
    "vllm_prefix_hit": 0.2697549478423476,
    "vllm_rps": 0.8029813635231063,
    "vllm_total": 200,
    "vllm_total_tps": 4742.53640998563,
    "vllm_tpot_p50_s": 0.049703375188695206,
    "vllm_tpot_p95_s": 0.06863309532102842,
    "vllm_ttft_p50_s": 9.216767095960677,
    "vllm_ttft_p95_s": 69.2114159471821
  },
  {
    "decode_tps_ratio": 0.7356557719569122,
    "e2e_p50_ratio": 1.3476957295017153,
    "e2e_p95_ratio": 1.258652459348984,
    "fixture": "coder_200_ts3",
    "frontier_complete": true,
    "frontier_completed": 200,
    "frontier_decode_tps": 513.9343093668691,
    "frontier_e2e_p50_s": 44.76058145123308,
    "frontier_e2e_p95_s": 154.54831351855702,
    "frontier_preemptions": 20,
    "frontier_prefix_hit": 0.21767512777477313,
    "frontier_rps": 0.573978165231764,
    "frontier_total": 200,
    "frontier_total_tps": 3390.0068803652352,
    "frontier_tpot_p50_s": 0.053393334371887605,
    "frontier_tpot_p95_s": 0.06861254670772189,
    "frontier_ttft_p50_s": 1.0014741156186515,
    "frontier_ttft_p95_s": 45.94665669959886,
    "kv_blocks": 15281,
    "label": "TP1 N200 scale 3",
    "notes": "After Frontier decode-preemption lifecycle fix.",
    "prefix_hit_delta": -0.05207982006757447,
    "request_count": 200,
    "rps_ratio": 0.7356557719569123,
    "run_id": "tp1_n200_scale3",
    "scale_label": "3",
    "scale_value": 3.0,
    "total_tps_ratio": 0.7356557719569123,
    "tp": 1,
    "tpot_p50_ratio": 1.1566613307426805,
    "tpot_p95_ratio": 0.9611804148017213,
    "ttft_p50_ratio": 0.8587856162345445,
    "ttft_p95_ratio": 1.4243304641052532,
    "vllm_completed": 200,
    "vllm_decode_tps": 698.607050957755,
    "vllm_e2e_p50_s": 33.2126758818049,
    "vllm_e2e_p95_s": 122.78871134808287,
    "vllm_preemptions": 16,
    "vllm_prefix_hit": 0.2697549478423476,
    "vllm_rps": 0.7802265503945264,
    "vllm_total": 200,
    "vllm_total_tps": 4608.1428428781355,
    "vllm_tpot_p50_s": 0.04616159713544178,
    "vllm_tpot_p95_s": 0.07138362959869063,
    "vllm_ttft_p50_s": 1.1661514779552817,
    "vllm_ttft_p95_s": 32.25842447206378
  },
  {
    "decode_tps_ratio": 0.6070363250137228,
    "e2e_p50_ratio": 1.5837949050918096,
    "e2e_p95_ratio": 1.4718353941122981,
    "fixture": "coder_200_ts2",
    "frontier_complete": true,
    "frontier_completed": 200,
    "frontier_decode_tps": 694.538225813865,
    "frontier_e2e_p50_s": 26.05122481685102,
    "frontier_e2e_p95_s": 106.75916510714146,
    "frontier_preemptions": 0,
    "frontier_prefix_hit": 0.2697549478423476,
    "frontier_rps": 0.7756823572006221,
    "frontier_total": 200,
    "frontier_total_tps": 4581.304110804026,
    "frontier_tpot_p50_s": 0.042955276577521156,
    "frontier_tpot_p95_s": 0.05288764732371923,
    "frontier_ttft_p50_s": 0.2690959621493789,
    "frontier_ttft_p95_s": 6.744624223172184,
    "kv_blocks": 69055,
    "label": "TP2 N200 scale 2",
    "notes": "Uses true-mixed TP2/TP4 attention profile.",
    "prefix_hit_delta": 0.0,
    "request_count": 200,
    "rps_ratio": 0.6070363250137228,
    "run_id": "tp2_n200_scale2",
    "scale_label": "2",
    "scale_value": 2.0,
    "total_tps_ratio": 0.6070363250137228,
    "tp": 2,
    "tpot_p50_ratio": 1.4296981582601855,
    "tpot_p95_ratio": 1.218502045500008,
    "ttft_p50_ratio": 1.1953492307083635,
    "ttft_p95_ratio": 9.432094021900193,
    "vllm_completed": 200,
    "vllm_decode_tps": 1144.1460703330465,
    "vllm_e2e_p50_s": 16.448610065039247,
    "vllm_e2e_p95_s": 72.53471789998002,
    "vllm_preemptions": 0,
    "vllm_prefix_hit": 0.2697549478423476,
    "vllm_rps": 1.2778186827338327,
    "vllm_total": 200,
    "vllm_total_tps": 7547.001591215254,
    "vllm_tpot_p50_s": 0.030044996791346416,
    "vllm_tpot_p95_s": 0.043403823177019754,
    "vllm_ttft_p50_s": 0.22511911601759493,
    "vllm_ttft_p95_s": 0.7150717759504914
  },
  {
    "decode_tps_ratio": 0.6321128225155744,
    "e2e_p50_ratio": 2.1882239414176055,
    "e2e_p95_ratio": 1.8819058641979227,
    "fixture": "coder_200_ts3",
    "frontier_complete": true,
    "frontier_completed": 200,
    "frontier_decode_tps": 615.822856748031,
    "frontier_e2e_p50_s": 21.785964943721574,
    "frontier_e2e_p95_s": 101.59183927019191,
    "frontier_preemptions": 0,
    "frontier_prefix_hit": 0.2697549478423476,
    "frontier_rps": 0.6877705321122985,
    "frontier_total": 200,
    "frontier_total_tps": 4062.0828059403734,
    "frontier_tpot_p50_s": 0.0393789684875167,
    "frontier_tpot_p95_s": 0.04670767224504207,
    "frontier_ttft_p50_s": 0.13415354950526392,
    "frontier_ttft_p95_s": 0.574137821753455,
    "kv_blocks": 69055,
    "label": "TP2 N200 scale 3",
    "notes": "Uses true-mixed TP2/TP4 attention profile.",
    "prefix_hit_delta": 0.0,
    "request_count": 200,
    "rps_ratio": 0.6321128225155745,
    "run_id": "tp2_n200_scale3",
    "scale_label": "3",
    "scale_value": 3.0,
    "total_tps_ratio": 0.6321128225155745,
    "tp": 2,
    "tpot_p50_ratio": 2.066305230245682,
    "tpot_p95_ratio": 1.668678182045304,
    "ttft_p50_ratio": 0.8737883511042303,
    "ttft_p95_ratio": 0.9156237864420547,
    "vllm_completed": 200,
    "vllm_decode_tps": 974.229338201501,
    "vllm_e2e_p50_s": 9.956003373954445,
    "vllm_e2e_p95_s": 53.98348621092737,
    "vllm_preemptions": 0,
    "vllm_prefix_hit": 0.2697549478423476,
    "vllm_rps": 1.0880502777577379,
    "vllm_total": 200,
    "vllm_total_tps": 6426.199028481642,
    "vllm_tpot_p50_s": 0.01905767256023186,
    "vllm_tpot_p95_s": 0.02799082096692385,
    "vllm_ttft_p50_s": 0.15353094297461212,
    "vllm_ttft_p95_s": 0.6270455510821193
  },
  {
    "decode_tps_ratio": 0.554961482872708,
    "e2e_p50_ratio": 2.0140798462106178,
    "e2e_p95_ratio": 1.9471360828275543,
    "fixture": "coder_200_ts2",
    "frontier_complete": true,
    "frontier_completed": 200,
    "frontier_decode_tps": 763.3502329676248,
    "frontier_e2e_p50_s": 18.65216281946347,
    "frontier_e2e_p95_s": 84.93775413567799,
    "frontier_preemptions": 0,
    "frontier_prefix_hit": 0.2697549478423476,
    "frontier_rps": 0.8525337930595883,
    "frontier_total": 200,
    "frontier_total_tps": 5035.200987216818,
    "frontier_tpot_p50_s": 0.03366585046876145,
    "frontier_tpot_p95_s": 0.03838265621202119,
    "frontier_ttft_p50_s": 0.09755515041058871,
    "frontier_ttft_p95_s": 0.3856872342439675,
    "kv_blocks": 177077,
    "label": "TP4 N200 scale 2",
    "notes": "Uses true-mixed TP2/TP4 attention profile.",
    "prefix_hit_delta": 0.0,
    "request_count": 200,
    "rps_ratio": 0.5549614828727081,
    "run_id": "tp4_n200_scale2",
    "scale_label": "2",
    "scale_value": 2.0,
    "total_tps_ratio": 0.5549614828727081,
    "tp": 4,
    "tpot_p50_ratio": 2.0597817670263323,
    "tpot_p95_ratio": 1.3554681431066735,
    "ttft_p50_ratio": 0.5721801588631308,
    "ttft_p95_ratio": 0.27163724014492546,
    "vllm_completed": 200,
    "vllm_decode_tps": 1375.5012852715674,
    "vllm_e2e_p50_s": 9.26088548800908,
    "vllm_e2e_p95_s": 43.621889032190666,
    "vllm_preemptions": 0,
    "vllm_prefix_hit": 0.2697549478423476,
    "vllm_rps": 1.5362035373095158,
    "vllm_total": 200,
    "vllm_total_tps": 9073.06388391597,
    "vllm_tpot_p50_s": 0.016344377354773947,
    "vllm_tpot_p95_s": 0.02831690025857032,
    "vllm_ttft_p50_s": 0.1704972619190812,
    "vllm_ttft_p95_s": 1.4198614079505205
  },
  {
    "decode_tps_ratio": 0.5882440161960838,
    "e2e_p50_ratio": 3.045852424279607,
    "e2e_p95_ratio": 2.9785685814353515,
    "fixture": "coder_200_ts3",
    "frontier_complete": true,
    "frontier_completed": 200,
    "frontier_decode_tps": 660.2306058712501,
    "frontier_e2e_p50_s": 16.902919407154563,
    "frontier_e2e_p95_s": 83.00995364867583,
    "frontier_preemptions": 0,
    "frontier_prefix_hit": 0.2697549478423476,
    "frontier_rps": 0.7373665172396945,
    "frontier_total": 200,
    "frontier_total_tps": 4355.004629460394,
    "frontier_tpot_p50_s": 0.031067781092248118,
    "frontier_tpot_p95_s": 0.035782850818878296,
    "frontier_ttft_p50_s": 0.08859749134958328,
    "frontier_ttft_p95_s": 0.3458954617429286,
    "kv_blocks": 177077,
    "label": "TP4 N200 scale 3",
    "notes": "Uses true-mixed TP2/TP4 attention profile.",
    "prefix_hit_delta": 0.0,
    "request_count": 200,
    "rps_ratio": 0.5882440161960838,
    "run_id": "tp4_n200_scale3",
    "scale_label": "3",
    "scale_value": 3.0,
    "total_tps_ratio": 0.5882440161960839,
    "tp": 4,
    "tpot_p50_ratio": 3.301471070786272,
    "tpot_p95_ratio": 2.7971158799197804,
    "ttft_p50_ratio": 0.8850343170011207,
    "ttft_p95_ratio": 1.086290918512101,
    "vllm_completed": 200,
    "vllm_decode_tps": 1122.3753879226379,
    "vllm_e2e_p50_s": 5.549487320007756,
    "vllm_e2e_p95_s": 27.869075825903565,
    "vllm_preemptions": 0,
    "vllm_prefix_hit": 0.2697549478423476,
    "vllm_rps": 1.2535044929278167,
    "vllm_total": 200,
    "vllm_total_tps": 7403.398095950554,
    "vllm_tpot_p50_s": 0.00941028421153152,
    "vllm_tpot_p95_s": 0.01279276667647553,
    "vllm_ttft_p50_s": 0.1001062779687345,
    "vllm_ttft_p95_s": 0.3184188101440668
  }
 ]
--- a/docs/assets/frontier_vllm_alignment/latency_ratios.png
+++ b/docs/assets/frontier_vllm_alignment/latency_ratios.png
--- a/docs/assets/frontier_vllm_alignment/throughput_ratio.png
+++ b/docs/assets/frontier_vllm_alignment/throughput_ratio.png
--- a/docs/assets/frontier_vllm_alignment/tp_scaling_total_tps.png
+++ b/docs/assets/frontier_vllm_alignment/tp_scaling_total_tps.png
--- a/docs/comparison.md
+++ b/docs/comparison.md
@@ -0,0 +1,253 @@
 # RS2 Simulator Comparison
 Checked on 2026-06-24. RS2 compares simulator capabilities and first local
 ReplayServe results. It does not start the RS3 sweep and does not make
 performance-quality claims.
 ## Sources
 | Source | Local path | Commit / HEAD | RS2 use |
 |---|---|---:|---|
 | ReplayServe | `/home/gahow/phd/replayserve` | local RS0/RS1/RS1B artifacts | Adapter, fixtures, runs, postprocess summaries |
 | Qwen trace | `/home/gahow/phd/qwen-bailian-usagetraces-anon` | `5f7439c51ec248a0c585f7d90a41a6f57773b912` | Source `qwen_coder_blksz_16.jsonl` |
 | Frontier canonical | `/tmp/toc-llm-sim-research/Frontier` | `d9cfeb6d8791fbf2f295dd9744c56a666171776e` | RS1 fixed config and source inspection |
 | Frontier patched scratch | `/tmp/replayserve-frontier-rs1b` | base `d9cfeb6...` plus `patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch` | RS1B unblock verification |
 | Vidur | `/tmp/toc-llm-sim-research/vidur` | `8383d2935bc62723a212090baa9f98ada206fc14` | Source inspection for baseline capability |
 | AIConfigurator | `/tmp/toc-llm-sim-research/aiconfigurator` | `e46ece7510e727fafefb8212e5846172145a30ea` | Source/docs inspection for config-estimator capability |
 Key local evidence:
 - Frontier trace replay: `/tmp/toc-llm-sim-research/Frontier/frontier/request_generator/trace_replay_request_generator.py`
 - Frontier prefix-cache validation: `/tmp/toc-llm-sim-research/Frontier/frontier/scheduler/cluster_scheduler/base_cluster_scheduler.py`
 - Frontier prefix-cache request metrics: `/tmp/toc-llm-sim-research/Frontier/frontier/metrics/metrics_store.py`
 - Vidur trace replay: `/tmp/toc-llm-sim-research/vidur/vidur/request_generator/trace_replay_request_generator.py`
 - Vidur request entity: `/tmp/toc-llm-sim-research/vidur/vidur/entities/request.py`
 - AIConfigurator CLI/docs: `/tmp/toc-llm-sim-research/aiconfigurator/README.md` and `/tmp/toc-llm-sim-research/aiconfigurator/src/aiconfigurator/cli/main.py`
 ## Capability Matrix
 | Capability | Frontier | Vidur | AIConfigurator |
 |---|---|---|---|
 | Per-request timestamp replay | Yes. `trace_replay` consumes `arrived_at` and RS1 runs `simulation_mode=online`. | Yes. `TraceReplayRequestGenerator` consumes `arrived_at`. | No per-request replay. CLI consumes workload summaries such as `--isl`, `--osl`, and SLA targets. |
 | Input/output length replay | Yes. Consumes `num_prefill_tokens` and `num_decode_tokens`. Frontier can clip overflows internally, so ReplayServe adapter validates before run. | Yes. Consumes `num_prefill_tokens` and `num_decode_tokens`; current code clips prefill length if total exceeds max tokens. | Only summary lengths, not per-request traces. |
 | Explicit `block_hash_ids` / prefix KV reuse replay | Yes. Current Frontier parses `session_id` and `block_hash_ids`, validates they are present when prefix caching is enabled, and applies prefix-cache accounting in vLLM v1. RS1B needs a patch for prefix cache plus chunked prefill under pressure. | Not in this checkout. Request objects carry arrival/prefill/decode lengths and processed-token state, but no `session_id`, block hashes, or explicit prefix-reuse replay. The Vidur README says prefix-caching work lives on a canary branch with sharp edges, not this main checkout. | No. `--prefix` is an aggregate prefix length/workload parameter, not an explicit hash/session replay model. |
 | Online arrival pattern | Yes. RS1 fixed config uses online mode and trace replay. | Yes for trace replay baseline. | No event-level online replay. It estimates candidate deployments from summary workload/SLA inputs. |
 | Prefix-cache hit-ratio output | Yes. Frontier emits request metrics including cached prefill tokens, query blocks, and hit blocks when present. ReplayServe postprocess adds token-weighted hit ratio using sidecar partial-block counts. | No native prefix-hit ratio in current main because no explicit prefix replay. | No prefix-hit replay metric. |
 | TTFT / TPOT / E2E / throughput output | Yes. Request and system metrics are emitted under Frontier metrics dirs. RS1 uses dummy execution predictor, so values are plumbing-only. | Yes. Vidur metrics include request E2E, prefill/TTFT-style, decode-normalized, and system metrics. Fidelity depends on matching profiles. | Yes as estimates: best throughput, per-GPU throughput, per-user throughput, TTFT, TPOT, and request latency. These are configuration-search outputs, not replay observations. |
 | TP / EP / DP / config knobs | Yes. Frontier has model, device, network device, attention TP/DP, MoE TP/EP, PP, scheduler, block, batch, prefix-cache, chunked-prefill, and memory-planner knobs. | Partial. Vidur exposes model, device, network device, tensor parallel size, pipeline stages, scheduler and batch/KV knobs. The inspected checkout is not a faithful EP/DP prefix-replay candidate. | Strong for config search. Supports TP/PP/DP and expert TP/EP style search across supported backends/systems. |
 | Arbitrary model/hardware/config boundary | Not arbitrary. Model/device configs may exist, but reliable latency/throughput requires compute/network profiles, scheduler support, matching parallel semantics, and bug-free code paths. RS1 Qwen3-32B on A800 uses dummy predictor because public A800 dense Qwen3-32B compute profiles are absent. | Not arbitrary. README lists supported model/device/profile combinations; docs say profiling on actual GPUs is needed for new model/hardware fidelity. Current public device/profile coverage does not match A800 Qwen3-32B. | Not arbitrary. It depends on supported model families, backend/system databases, and estimate modes. The support matrix includes H100/H200/B200/GB200/A100 variants; no A800 built-in silicon database was found. |
 | Needs profile/calibration | Yes for performance claims. Dummy predictor plus analytical comm is only a smoke. | Yes for performance claims. New model/device requires compute, network, and CPU-overhead style profiling. | Yes for production-quality estimates. It relies on collected silicon/perf databases or rough estimate modes; README warns memory/results need validation. |
 ## First Results
 ### Frontier Canonical
 Fixed RS1 config:
 - `simulation_mode=online`
 - `sys_arch=co-location`
 - `replica_scheduler=vllm_v1`
 - `device=a800`
 - `network_device=a800_dgx`
 - `model_name=Qwen/Qwen3-32B`
 - `attn_tensor_parallel_size=2`
 - dummy execution predictor
 - analytical communication backend
 - `trace_request_generator_config_max_tokens=32768`
 - prefix caching enabled
 - block size 16
 - chunked prefill enabled
 - batch cap 128
 - max batch tokens 32768
 - KV capacity from Frontier memory planner with `gpu_memory_utilization=0.9` and `non_kv_cache_overhead_bytes=0`
 Results:
 | Run | Result | Evidence | Notes |
 |---|---|---|---|
 | `coder_100` | Pass | `runs/rs1/coder_100/` | Frontier block hit ratio `0.04948661841440835`; ReplayServe token-weighted hit ratio `0.04956232588915065`; no preemptions. |
 | `coder_2000` | Fail | `runs/rs1/coder_2000/` | Exit code 1 after 4 seconds with `ValueError: Request 194 already scheduled.` The traceback ends in Frontier's vLLM v1 waiting-path scheduling, at the call to `request.on_cache_hit(prefix_cached_tokens)`. |
 The canonical failure was minimized in `docs/rs1_frontier_blocker.md`: the
 first-N probe with `N=192` passes, `N=193` fails with `Request 192 already
 scheduled`, and larger fixed-config probes fail around the same preempted
 prefix-cache path. Turning prefix caching off, turning chunked prefill off, or
 using a high long-prefill threshold avoids the failure, so the failure is not
 caused by a bad Qwen trace row.
 ### Frontier Patched Scratch
 Patch:
 - File: `patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch`
 - Documentation: `docs/rs1_frontier_patch.md`
 - Scratch checkout only: `/tmp/replayserve-frontier-rs1b`
 The patch resets a preempted request's scheduler and cache-hit admission state
 before the request re-enters the waiting path. As of 2026-06-25 it also replays
 decode-phase preemption by moving already-produced tokens into the next prefill
 segment, preserves user-facing lengths for metrics, and fails fast if
 sequential simulation drains before all generated requests complete. The patch
 is applied only to the scratch checkout, so the canonical Frontier checkout
 stays clean.
 | Run | Result | Evidence | Hit ratios | Preemption | Memory planner facts |
 |---|---|---|---|---|---|
 | `N=193` fixed config | Pass | `runs/rs1b/patched/n193_fixed_v2/` | Frontier block `0.12458971786194112`; ReplayServe token-weighted `0.12476981408429115` | 5 total events, 1 request | `num_blocks=36902`, `gpu_memory_utilization=0.9`, non-KV overhead `0`, weight shard estimate `26.953125 GiB` |
 | `coder_100` fixed config | Pass | `runs/rs1b/patched/coder_100/` | Frontier block `0.04948661841440835`; ReplayServe token-weighted `0.04956232588915065` | 0 | same derived memory planner point |
 | `coder_2000` fixed config | Pass | `runs/rs1b/patched/coder_2000/` | Frontier block `0.12318930248025924`; ReplayServe token-weighted `0.12332978217090633` | 35940 total events, 1061 requests | same derived memory planner point |
 Metrics caveats:
 - These are plumbing-smoke metrics. The run uses dummy 1 ms execution time and
  analytical communication, not calibrated Qwen3-32B A800 compute profiles.
 - `coder_2000` produced `request_metrics.csv` with 2000 rows, but 745 rows have
  blank request-level prefix-cache fields. ReplayServe token-weighted hit ratio
  therefore uses the 1255 rows with complete cache metrics. Frontier's aggregate
  prefix-cache statistics in the same summary also report 1255 requests with
  cache metrics. This is acceptable for blocker removal evidence, but it is not
  a final metric-quality result.
 - No allocation/OOM pressure log lines were found in the postprocess summaries.
 ### Vidur
 No Vidur baseline run was executed for RS2. Based on source inspection, Vidur is
 useful as an arrival-and-length baseline candidate, but it cannot faithfully
 compare ReplayServe prefix reuse without additional code:
 - `vidur/request_generator/trace_replay_request_generator.py` consumes
  `arrived_at`, `num_prefill_tokens`, and `num_decode_tokens`.
 - `vidur/entities/request.py` stores arrival, prefill length, decode length,
  processed tokens, schedule/completion timestamps, and preemption state.
 - The inspected request path does not carry `session_id`, `block_hash_ids`, or
  sidecar block-token accounting.
 - Current Vidur trace replay clips prefill lengths when total tokens exceed
  `max_tokens`; ReplayServe must keep its own hard-fail validation if Vidur is
  used later as a length-only baseline.
 Conclusion for Vidur in RS2: it can likely replay `coder_100`/`coder_2000`
 arrival and length after a simple CSV compatibility conversion, but it would
 measure a different workload because prefix KV reuse is absent.
 ### AIConfigurator
 No AIConfigurator run was executed for RS2 because it is not a per-request
 replay simulator. Source/docs show it is a deployment/config search estimator:
 - CLI examples take workload summaries such as `--isl`, `--osl`, `--prefix`,
  `--ttft`, `--tpot`, `--total-gpus`, `--system`, and model path.
 - Outputs are best throughput, per-GPU throughput, per-user throughput, TTFT,
  TPOT, request latency, concurrency, and parallel deployment choices.
 - It models operations and searches aggregated/disaggregated serving
  configurations using collected or estimated performance data.
 Conclusion for AIConfigurator in RS2: it is useful for config candidates and
 reference sizing assumptions. It cannot directly compare faithful per-request
 prefix-hit replay on Qwen trace fixtures.
 ## Metric Definitions
 `TTFT`:
 Time from request arrival to first generated token / prefill completion. Frontier
 and Vidur both have request-level prefill/first-token style timing fields, but
 RS1 Frontier values are not performance claims because the execution predictor is
 dummy.
 `TPOT`:
 Decode time per output token. Tools differ on whether they report total decode
 normalized by output tokens, inter-token latency, or a configured SLA target.
 Use each tool's native field only within that tool unless calibrated against the
 same serving definition.
 `E2E latency`:
 Completion time minus arrival time for one request.
 `Throughput`:
 Completed tokens or requests per unit time. AIConfigurator reports estimated
 tokens/s style capacity; Frontier/Vidur report simulated metrics. RS1 Frontier
 throughput is plumbing-only because compute is dummy.
 `KV-cache hit ratio`:
 - Frontier native block-level ratio:
  `sum(request_prefix_cache_hit_blocks) / sum(request_prefix_cache_query_blocks)`.
 - ReplayServe token-weighted ratio:
  use sidecar `block_token_counts` and count the first
  `request_prefix_cache_hit_blocks` blocks by true token count, so a partial
  final block contributes its actual token count instead of always 16.
 For `coder_2000` patched, both ratios are computed only for request rows with
 complete cache fields because 745 request metric rows have blank cache fields.
 This is a metrics completeness caveat, not evidence that the trace has invalid
 hashes.
 ## Non-Comparable Items
 - Frontier canonical and patched scratch are not equivalent artifacts. The
  patched result demonstrates an RS1 unblock path; it is not an upstream
  Frontier release.
 - Frontier/Vidur simulator timings and AIConfigurator estimator timings are not
  directly comparable without shared profiles, calibration, and metric
  definitions.
 - Prefix-reuse fidelity is not comparable across all three tools. Only Frontier
  currently consumes explicit block hash traces in the inspected checkouts.
 - AIConfigurator's `prefix` workload parameter is not the same as ReplayServe
  `block_hash_ids`; it cannot recover session-level sharing or partial-block
  token accounting.
 ## Conclusions
 None of the inspected open-source implementations satisfies ReplayServe's
 target out of the box.
 Frontier is the closest because it supports online trace replay, prefix-cache
 metadata, vLLM v1 style scheduler controls, memory planning, and request/system
 metrics. It still does not work out of the box for ReplayServe: RS0 needed an
 adapter, RS1B needed a local patch or upstream fix for prefix cache plus chunked
 prefill, and performance-quality claims need Qwen3-32B A800 profiles/calibration.
 Vidur can be a useful arrival-plus-length baseline, but not a faithful prefix KV
 reuse replay engine in the inspected checkout.
 AIConfigurator can guide candidate deployment/config choices, but it is a
 workload-summary estimator rather than a per-request simulator.
 Frontier also does not support arbitrary model plus arbitrary hardware plus
 arbitrary config in a performance-reliable sense. A model/device config may be
 accepted syntactically, but fidelity depends on compute profiles, network
 profiles, scheduler support, parallelism semantics, memory-planner assumptions,
 and bug surface. For RS1, A800 network profiles exist, but the public checkout
 does not provide dense Qwen3-32B A800 compute profiles, so latency/throughput
 remain plumbing smoke.
 ## Next Steps
 RS3 sweep prerequisites:
 - Decide whether RS3 uses the local RS1B patch, waits for upstream Frontier, or
  carries both canonical and patched modes explicitly.
 - Keep fixed-config smoke and any sweep configs separate from performance claims.
 - Add a small run manifest/check script that records Frontier commit, patch
  status, fixture, command, and metric completeness.
 - Treat the `coder_2000` blank cache fields as a metrics issue to investigate
  before using request-level hit ratios as a headline metric.
 RS4 calibration prerequisites:
 - Collect or obtain dense `Qwen/Qwen3-32B` A800 compute profiles for the Frontier
  predictor path.
 - Verify the A800 network profile and node SKU semantics match the target
  deployment.
 - Add non-KV memory overhead assumptions from a real serving stack instead of
  using `0`.
 - Validate simulator TTFT/TPOT/E2E/throughput against measured vLLM runs before
  making performance conclusions.
 Patch path recommendation:
 - Keep `patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch` pinned in
  ReplayServe until an upstream Frontier fix is available; it now covers both
  the original prefix-cache/chunked-prefill preemption bug and the RS10
  decode-phase preemption lifecycle bug.
 - Open an upstream issue or PR with the RS1B minimal repro (`N=193`) and
  evidence from `docs/rs1_frontier_blocker.md`.
 - Re-run `coder_100`, `N=193`, and `coder_2000` when changing Frontier commit or
  patch status.
--- a/docs/frontier_vllm_alignment_summary_20260625.md
+++ b/docs/frontier_vllm_alignment_summary_20260625.md
@@ -0,0 +1,177 @@
 # Frontier vs vLLM H20 Alignment Summary
 Date: 2026-06-25
 This document summarizes the current ReplayServe comparison between Frontier
 simulation and real vLLM runs on H20 for Qwen3-30B-A3B. It covers TP=1/2/4,
 different timestamp scales, and 100/200/500-request windows from
 `qwen_coder_blksz_16.jsonl`.
 The source data and plots are generated by:
 ```bash
 ~/.venv/plot/bin/python tools/build_frontier_vllm_alignment_report.py
 ```
 Generated artifacts:
 - `docs/assets/frontier_vllm_alignment/frontier_vllm_alignment.csv`
 - `docs/assets/frontier_vllm_alignment/frontier_vllm_alignment.json`
 - `docs/assets/frontier_vllm_alignment/throughput_ratio.png`
 - `docs/assets/frontier_vllm_alignment/latency_ratios.png`
 - `docs/assets/frontier_vllm_alignment/tp_scaling_total_tps.png`
 - `docs/assets/frontier_vllm_alignment/completion_prefix.png`
 ## Bottom Line
 Functional replay is now usable for the clean 200-request runs:
 - TP1 scale 2/3 after the Frontier lifecycle fix: `200/200` completed.
 - TP2/TP4 scale 2/3: `200/200` completed, no preemption on either side, matched
  vLLM KV block counts, and exact trace-side prefix reuse ratio.
 Performance is not fully calibrated:
 - TP1 scale 2/3 is the closest current operating point: Frontier throughput is
  about `0.74x` vLLM and TPOT p50/p95 is close.
 - TP2/TP4 is functionally aligned, but Frontier is slower than vLLM: Frontier
  throughput is only `0.55-0.63x` vLLM, and TP4 TPOT is too pessimistic.
 - Frontier underestimates the TP2->TP4 speedup. vLLM improves total throughput
  by `1.15-1.20x`; Frontier improves by only `1.07-1.10x`.
 Current use: acceptable for integration work and rough qualitative trends, not
 yet acceptable as a calibrated absolute performance predictor.
 ## Run Matrix
 All vLLM runs use vLLM 0.11.1, H20, Qwen3-30B-A3B,
 `max_model_len=32768`, `max_num_seqs=64`,
 `max_num_batched_tokens=32768`, `gpu_memory_utilization=0.85`, prefix caching,
 and chunked prefill.
 | run | Frontier rows | preempt F/V | prefix hit F/V | total tok/s F/V | ratio | TPOT p50 F/V | E2E p95 F/V |
 |---|---:|---:|---:|---:|---:|---:|---:|
 | TP1 N100 raw | 96/100 | 0/8 | 0.249/0.251 | 2349/3832 | 0.61 | 0.0569/0.0661s | 119.6/97.4s |
 | TP1 N500 raw | 439/500 | 0/63 | 0.119/0.387 | 4734/5283 | 0.90 | 0.0564/0.0498s | 397.3/417.4s |
 | TP1 N200 scale 0.667 | 176/200 | 0/26 | 0.170/0.270 | 3913/4865 | 0.80 | 0.0584/0.0515s | 189.2/183.7s |
 | TP1 N200 scale 2 | 200/200 | 33/43 | 0.231/0.270 | 3506/4743 | 0.74 | 0.0542/0.0497s | 174.5/142.3s |
 | TP1 N200 scale 3 | 200/200 | 20/16 | 0.218/0.270 | 3390/4608 | 0.74 | 0.0534/0.0462s | 154.5/122.8s |
 | TP2 N200 scale 2 | 200/200 | 0/0 | 0.270/0.270 | 4581/7547 | 0.61 | 0.0430/0.0300s | 106.8/72.5s |
 | TP2 N200 scale 3 | 200/200 | 0/0 | 0.270/0.270 | 4062/6426 | 0.63 | 0.0394/0.0191s | 101.6/54.0s |
 | TP4 N200 scale 2 | 200/200 | 0/0 | 0.270/0.270 | 5035/9073 | 0.55 | 0.0337/0.0163s | 84.9/43.6s |
 | TP4 N200 scale 3 | 200/200 | 0/0 | 0.270/0.270 | 4355/7403 | 0.59 | 0.0311/0.0094s | 83.0/27.9s |
 Important prefix caveat: the vLLM prefix-hit column in this table is the
 trace-side synthetic estimate from the vLLM summaries. For TP1 runs with
 preemption and finite KV pressure, the observed vLLM scheduler `computed:`
 signal is the better comparator. Earlier analysis in
 `docs/rs4_frontier_h20_tp1_alignment.md` records those finite-cache comparisons.
 For TP2/TP4, no preemption occurs and the trace-side prefix ratio matches
 Frontier exactly.
 ## Plots
 ![Throughput ratio](assets/frontier_vllm_alignment/throughput_ratio.png)
 ![Latency ratios](assets/frontier_vllm_alignment/latency_ratios.png)
 ![TP scaling](assets/frontier_vllm_alignment/tp_scaling_total_tps.png)
 ![Completion and prefix reuse](assets/frontier_vllm_alignment/completion_prefix.png)
 ## Interpretation
 ### TP1
 The early TP1 100/500/scale-0.667 runs are still useful as historical stress
 points, but they were run before the decode-preemption lifecycle fix. Frontier
 therefore missed rows in those runs:
 - `96/100` for N100 raw
 - `439/500` for N500 raw
 - `176/200` for N200 scale 0.667
 After the lifecycle fix, TP1 scale 2 and scale 3 both complete `200/200`.
 Preemption counts are now of the same order of magnitude as vLLM:
 - scale 2: Frontier 33 vs vLLM 43
 - scale 3: Frontier 20 vs vLLM 16
 TP1 timing is currently the closest match to vLLM, although it is not yet
 calibrated. Throughput is about `0.74x` vLLM, TPOT p50/p95 are close, and E2E
 p95 is about `1.23-1.26x` vLLM.
 This is not perfect, but it is usable for integration-level trend checks.
 ### TP2 and TP4
 The TP2/TP4 runs are functionally cleaner than TP1:
 - `200/200` completed for all four runs.
 - Frontier and vLLM both report no preemption.
 - Frontier uses explicit vLLM KV capacities:
  - TP2: 69,055 blocks
  - TP4: 177,077 blocks
 - Prefix hit ratio matches exactly: `0.2697549478`.
 We did profile TP2/TP4 true-mixed attention. The active RS12 profile includes:
 - `attention_tp2_tp4_combined.csv`: 36,163 rows, including 1,260 true-mixed
  prefill+decode rows for TP2/TP4.
 - `linear_op_tp2_tp4_full32k.csv`: covers up to 32,768 tokens.
 - `moe_tp2_tp4_full32k.csv`: covers up to 32,768 tokens.
 Without the true-mixed rows, Frontier fails with missing
 `attn_decode_in_mixed` predictions. With them, all RS12 runs complete.
 The remaining TP2/TP4 gap is therefore not a missing-profile blocker. It is a
 timing-model gap:
 - TP2 throughput is `0.61-0.63x` vLLM.
 - TP4 throughput is `0.55-0.59x` vLLM.
 - TP4 TPOT p50 is `2.06-3.30x` vLLM.
 ## Scaling
 For the same first-200 request fixtures:
 | fixture | metric | Frontier TP4/TP2 | vLLM TP4/TP2 |
 |---|---|---:|---:|
 | scale 2 | total tok/s | 1.10 | 1.20 |
 | scale 2 | decode tok/s | 1.10 | 1.20 |
 | scale 2 | TPOT p50 | 0.78 | 0.54 |
 | scale 3 | total tok/s | 1.07 | 1.15 |
 | scale 3 | decode tok/s | 1.07 | 1.15 |
 | scale 3 | TPOT p50 | 0.79 | 0.49 |
 Frontier sees some TP4 improvement, but much less than real vLLM. This is the
 clearest current evidence that the simulator is not yet modeling vLLM's
 TP-dependent decode execution path well enough.
 ## Likely Gap Sources
 The main unresolved issues are:
 - CPU/scheduler overhead is still skipped (`skip_cpu_overhead_modeling=true`).
 - Decode CUDA graph behavior is not modeled in the current Frontier runs
  (`decode_cuda_graph_mode=none`).
 - Random-forest predictors interpolate over profile grids, while real online
  mixed batches may concentrate on shapes not directly sampled.
 - Some TP4 predictor fit errors are nontrivial, for example
  `attn_kv_cache_save` MAPE around 11% in the TP4 profile log.
 - Frontier's scheduler and preemption behavior is close but not identical for
  TP1 under finite KV pressure.
 ## ReplayServe TODO
 1. Rerun the 500-request TP1 stress after the decode-preemption lifecycle fix,
   so the 500-row result is no longer mixed with the old incomplete behavior.
 2. Record vLLM observed scheduler prefix/preemption metrics in machine-readable
   summaries, not only in docs, especially first-start and last-start
   `computed:` ratios.
 3. Add a shape-ledger analysis: compare Frontier's actual online batch shapes
   against the profile grid and identify hot shapes that are interpolated.
 4. Profile or import vLLM CPU overhead and test
   `skip_cpu_overhead_modeling=false`.
 5. Collect kernel-only / decode-CUDA-graph timing profiles before enabling a
   Frontier CUDA-graph decode mode.
 6. Calibrate TP2/TP4 timing only after the above, because current functional
   replay is aligned but the TP scaling is not.
--- a/docs/rs1_frontier_blocker.md
+++ b/docs/rs1_frontier_blocker.md
@@ -0,0 +1,199 @@
 # RS1 Frontier Blocker: Prefix Cache + Chunked Prefill
 This note narrows the RS1 `coder_2000` failure into a small Frontier repro.
 It does not change the RS1 fixed config or make performance claims.
 ## Status
 - Frontier repo: `/tmp/toc-llm-sim-research/Frontier`
 - Frontier HEAD: `d9cfeb6d8791fbf2f295dd9744c56a666171776e`
 - ReplayServe canonical fixtures were not changed.
 - Frontier source was not modified.
 - Diagnostic artifacts live under `runs/rs1/blocker_request_194/`.
 The original `coder_2000` run failed with:
 ```text
 ValueError: Request 194 already scheduled.
 ```
 First-N probing shows that the blocker is not a single malformed row 194.
 The smallest observed first-N failure is `N=193`, which fails as:
 ```text
 ValueError: Request 192 already scheduled.
 ```
 `N=192` passes under the same fixed config.
 ## Repro Commands
 Generate diagnostic slices:
 ```bash
 cd /home/gahow/phd/replayserve
 for n in 190 191 192 193 194 195 200; do
  out="runs/rs1/blocker_request_194/fixtures/coder_${n}"
  mkdir -p "$out"
  python3 tools/qwen_to_frontier.py \
    --input /home/gahow/phd/qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl \
    --frontier-csv "$out/frontier.csv" \
    --sidecar-jsonl "$out/sidecar.jsonl" \
    --source-jsonl "$out/source.jsonl" \
    --manifest-json "$out/manifest.json" \
    --fixture-name "blocker_coder_${n}" \
    --limit "$n" \
    --max-tokens 32768 \
    --block-size 16 \
    --fail-on-overflow
 done
 ```
 Minimal failing command:
 ```bash
 cd /home/gahow/phd/replayserve
 scripts/run_frontier_blocker_probe.sh \
  n193_default \
  runs/rs1/blocker_request_194/fixtures/coder_193
 ```
 The exact Frontier CLI for every probe is preserved in each
 `runs/rs1/blocker_request_194/probes/<name>/command.txt`.
 ## First-N Matrix
 All default rows use the RS1 fixed config: prefix caching on, chunked prefill
 on, `long_prefill_token_threshold=64`, batch cap 128, max batch tokens 32768.
 | probe | rows | prefix cache | chunked prefill | threshold | exit | result |
 |---|---:|---|---|---:|---:|---|
 | `n190_default` | 190 | on | on | 64 | 0 | pass |
 | `n191_default` | 191 | on | on | 64 | 0 | pass |
 | `n192_default` | 192 | on | on | 64 | 0 | pass |
 | `n193_default` | 193 | on | on | 64 | 1 | `Request 192 already scheduled` |
 | `n194_default` | 194 | on | on | 64 | 1 | `Request 192 already scheduled` |
 | `n195_default` | 195 | on | on | 64 | 1 | `Request 194 already scheduled` |
 | `n200_default` | 200 | on | on | 64 | 1 | `Request 194 already scheduled` |
 ## Diagnostic Variants
 These are diagnosis only. They are not replacements for the RS1 fixed config.
 | probe | rows | prefix cache | chunked prefill | threshold | exit | result |
 |---|---:|---|---|---:|---:|---|
 | `n193_prefix_off` | 193 | off | on | 64 | 0 | pass |
 | `n193_chunked_off` | 193 | on | off | 64 | 1 | Frontier config rejects this combination |
 | `n193_chunked_off_threshold_0` | 193 | on | off | 0 | 0 | pass |
 | `n193_threshold_32768` | 193 | on | on | 32768 | 0 | pass |
 | `n195_prefix_off` | 195 | off | on | 64 | 0 | pass |
 | `n195_chunked_off` | 195 | on | off | 64 | 1 | Frontier config rejects this combination |
 | `n195_chunked_off_threshold_0` | 195 | on | off | 0 | 0 | pass |
 | `n195_threshold_32768` | 195 | on | on | 32768 | 0 | pass |
 | `n200_prefix_off` | 200 | off | on | 64 | 0 | pass |
 | `n200_chunked_off_threshold_0` | 200 | on | off | 0 | 0 | pass |
 | `n200_threshold_32768` | 200 | on | on | 32768 | 0 | pass |
 Frontier enforces:
 ```text
 VllmV1SchedulerConfig.long_prefill_token_threshold > 0 requires enable_chunked_prefill=True
 ```
 So a valid chunked-off diagnostic also sets
 `LONG_PREFILL_TOKEN_THRESHOLD=0`.
 ## Local Trace Analysis
 Generated files:
 - `runs/rs1/blocker_request_194/analysis/request_192_analysis.json`
 - `runs/rs1/blocker_request_194/analysis/request_192_analysis.md`
 - `runs/rs1/blocker_request_194/analysis/request_194_analysis.json`
 - `runs/rs1/blocker_request_194/analysis/request_194_analysis.md`
 Request 192, the minimal first-N failure target:
 - `timestamp=43.406`
 - `chat_id=192`, `parent_chat_id=-1`, `turn=1`, `type=coder`
 - `input_length=13436`, `output_length=1425`, total `14861`
 - `hash_count=840`
 - partial final block: yes, final block token count `12`
 - top prior prefix overlap: 7 blocks, 112 tokens
 - no parent candidate in the sidecar
 Request 194, the original `coder_2000` failing request:
 - `timestamp=43.931`
 - `chat_id=194`, `parent_chat_id=-1`, `turn=1`, `type=coder`
 - `input_length=2064`, `output_length=2278`, total `4342`
 - `hash_count=129`
 - partial final block: no, final block token count `16`
 - top prior prefix overlap: 1 block, 16 tokens
 - no parent candidate in the sidecar
 Interpretation:
 - The failing requests are independent first turns, not child turns in a chat.
 - Request 192 has a partial final block, but its observed prior prefix overlap
  is only the first 7 full blocks.
 - Request 194 has no partial final block and only a 1-block prefix overlap.
 - The failure is therefore not explained by a malformed partial final block,
  deep shared-prefix trace structure, or a parent/child chat mismatch.
 - Fixture validation confirms monotonic timestamps, max-token compliance,
  sidecar hash lengths, and block token counts.
 ## Frontier Source Localization
 Relevant Frontier files:
 - `/tmp/toc-llm-sim-research/Frontier/frontier/scheduler/replica_scheduler/vllm_v1_engine_replica_scheduler.py`
 - `/tmp/toc-llm-sim-research/Frontier/frontier/entities/request.py`
 - `/tmp/toc-llm-sim-research/Frontier/frontier/config/config.py`
 Key path:
 - `VllmV1EngineReplicaScheduler._prepare_prefix_cache_admission`
  at `vllm_v1_engine_replica_scheduler.py:1178` calls
  `kv_cache_manager.get_computed_blocks(request)` and returns
  `prefix_cached_tokens`.
 - `_schedule_waiting_requests` at
  `vllm_v1_engine_replica_scheduler.py:3075` runs prefix-cache admission for
  any waiting request with prefix caching enabled and incomplete prefill.
 - The same waiting path allocates KV and then calls
  `request.on_cache_hit(prefix_cached_tokens)` at
  `vllm_v1_engine_replica_scheduler.py:3179`.
 - `Request.on_cache_hit` at `request.py:503` raises if `_scheduled` is already
  true.
 - `Request.on_batch_schedule` at `request.py:1058` sets `_scheduled=True`.
 - Chunked-prefill continuations run through `_schedule_running_requests`
  around `vllm_v1_engine_replica_scheduler.py:2696`, with long-prefill
  capping applied around `:2826`.
 - Valid chunked-off CLI requires `long_prefill_token_threshold=0`; otherwise
  `config.py:714` rejects the configuration.
 The evidence points to a Frontier scheduler state issue: with prefix caching
 enabled and chunked prefill active, a request that has already been scheduled
 can later reach waiting-admission prefix-cache handling and receive
 `on_cache_hit` again. That violates `Request.on_cache_hit`'s current invariant.
 This is more consistent with a repeated cache-hit application or scheduled
 request re-admission path than with bad ReplayServe trace/hash data.
 ## Suggested Next Steps
 1. Add temporary Frontier instrumentation around `_schedule_waiting_requests`
   before `request.on_cache_hit` to log `request.id`, `_scheduled`,
   `_preempted`, `is_prefill_complete`, `num_processed_tokens`,
   `prefix_cached_tokens`, and whether the request came from
   `_preempted_requests` or `_request_queue`.
 2. Decide Frontier semantics for prefix-cache hits after a request has already
   been scheduled once. A likely fix is to apply `on_cache_hit` only for a
   first admission with `_scheduled=False` and `num_processed_tokens=0`, or to
   reset/request-restart state before re-admission if that is the intended
   vLLM parity behavior.
 3. Keep RS1 fixed config blocked for `coder_2000` until Frontier behavior is
   patched or a documented upstream-compatible workaround is selected.
 4. Do not use the passing diagnosis variants as RS1 performance evidence:
   prefix-off, chunked-off, and threshold-32768 change the fixed config.
--- a/docs/rs1_frontier_patch.md
+++ b/docs/rs1_frontier_patch.md
@@ -0,0 +1,150 @@
 # RS1B Frontier Patch
 This document records the scratch Frontier patch used to unblock RS1 fixed
 config replay. It is not applied to the canonical Frontier checkout.
 ## Patch
 - Patch file:
  `patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch`
 - Canonical Frontier checkout:
  `/tmp/toc-llm-sim-research/Frontier`
 - Scratch Frontier checkout:
  `/tmp/replayserve-frontier-rs1b`
 - Frontier base HEAD:
  `d9cfeb6d8791fbf2f295dd9744c56a666171776e`
 Apply from a Frontier checkout at the same base commit:
 ```bash
 cd /path/to/Frontier
 git apply /home/gahow/phd/replayserve/patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch
 ```
 Check applicability without modifying a checkout:
 ```bash
 cd /path/to/Frontier
 git apply --check /home/gahow/phd/replayserve/patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch
 ```
 ## Root Cause
 Instrumentation in the scratch checkout showed the minimal `N=193` failure
 has two admissions for request 192:
 ```text
 req=192 source=request_queue scheduled=False preempted=False prefill_complete=False num_processed_tokens=0 prefix_cached_tokens=112 num_new_tokens=64
 req=192 source=preempted_requests scheduled=True preempted=True prefill_complete=False num_processed_tokens=0 prefix_cached_tokens=1232 num_new_tokens=64
 ```
 The second admission comes from `_preempted_requests`. Frontier preemption
 resets `victim._num_processed_tokens` and removes the explicit scheduler
 frontier, but it leaves `victim._scheduled=True`. The request then re-enters
 waiting admission, prefix-cache admission finds cached blocks, and
 `request.on_cache_hit(prefix_cached_tokens)` raises because `on_cache_hit`
 requires `_scheduled=False`.
 The failure is therefore a Frontier runtime-state reset issue for preempted
 chunked-prefill requests with prefix caching enabled, not bad ReplayServe
 trace data.
 ## Patch Rationale
 The first patch reset two request runtime fields in
 `VllmV1EngineReplicaScheduler._preempt_request`:
 ```python
 victim._num_prefill_tokens_cached = 0
 victim._scheduled = False
 ```
 This matches the existing preemption intent in the same block: computed tokens
 are reset and the request is re-entered into a waiting queue for recomputation.
 After that reset, waiting admission can apply prefix-cache hit state through
 the existing `Request.on_cache_hit` path before the request is scheduled again.
 An earlier conservative experiment skipped `on_cache_hit` for already scheduled
 requests and advanced only the scheduler frontier. That avoided the immediate
 exception but left request 192 incomplete at simulation shutdown, because the
 request object's processed-token state never reflected the cached prefix.
 The 2026-06-25 RS10 debug runs exposed a second lifecycle bug. Missing request
 metrics for `coder_200_ts2` and `coder_200_ts3` were not postprocess artifacts:
 Frontier drained with `completed_requests < total_requests`. Missing requests
 had this state pattern:
 ```text
 preempted=True
 is_prefill_complete=True
 num_processed_tokens=0
 scheduled=False
 completed=False
 ```
 They had been preempted after entering decode. Frontier cleared processed
 tokens but kept the request in prefill-complete state. The next waiting
 admission therefore computed `num_new_tokens=0` and dropped the request from
 the waiting queue.
 The current patch now also:
 - replays decode-phase preemption by turning already-produced tokens into the
  next prefill segment and leaving the remaining tokens as decode work;
 - preserves user-facing prompt/output lengths for metrics after runtime token
  splitting;
 - preserves unfinished zero-token waiting requests instead of silently dropping
  them;
 - makes sequential simulation fail fast if the event queue drains before all
  generated requests complete, with per-request debug snapshots.
 ## Verification Matrix
 All patched runs used RS1 fixed config unless explicitly stated otherwise:
 online, co-location, vLLM v1, A800, Qwen/Qwen3-32B, TP2, dummy predictor,
 analytical communication backend, `max_tokens=32768`, prefix cache on, block
 size 16, chunked prefill on, batch cap 128, max batch tokens 32768, memory
 planner KV capacity.
 | run | Frontier root | result | runtime | notes |
 |---|---|---:|---:|---|
 | `runs/rs1b/instrumentation/n193_instrumented_print` | scratch instrumentation | fail | 4s | Proved request 192 re-entered from `_preempted_requests` with `_scheduled=True`. |
 | `runs/rs1b/patched/n193_fixed_v2` | patched scratch | pass | 11s | `N=193` fixed config passed. |
 | `runs/rs1b/patched/coder_100` | patched scratch | pass | 8s | Prefix hit ratios matched original RS1 `coder_100`. |
 | `runs/rs1b/patched/coder_2000` | patched scratch | pass | 87s | Full fixed config run completed. |
 | `runs/rs10_preemption_replay_fix_ts2/.../coder_200_ts2` | patched scratch | pass | 462s | RS10 H20 TP1 full32K profile; completion `200/200`; 33 preemption events. |
 | `runs/rs10_preemption_replay_fix_ts3/.../coder_200_ts3` | patched scratch | pass | 465s | RS10 H20 TP1 full32K profile; completion `200/200`; 20 preemption events. |
 Prefix cache summaries:
 | run | Frontier block hit ratio | ReplayServe token-weighted hit ratio | preemption events |
 |---|---:|---:|---:|
 | original `runs/rs1/coder_100` | 0.0494866184 | 0.0495623259 | 0 |
 | patched `runs/rs1b/patched/coder_100` | 0.0494866184 | 0.0495623259 | 0 |
 | patched `runs/rs1b/patched/n193_fixed_v2` | 0.1245897179 | 0.1247698141 | 5 |
 | patched `runs/rs1b/patched/coder_2000` | 0.1231893025 | 0.1233297822 | 35940 |
 | patched `runs/rs10_preemption_replay_fix_ts2/.../coder_200_ts2` | 0.2310157359 | 0.2313416900 | 33 |
 | patched `runs/rs10_preemption_replay_fix_ts3/.../coder_200_ts3` | 0.2173684294 | 0.2176751278 | 20 |
 For `coder_2000`, ReplayServe postprocess skipped 745 request rows whose
 Frontier request metrics had blank prefix-cache fields. The run still completed
 and produced `system_metrics.json` and `request_metrics.csv`.
 ## Risks
 - The patch touches Frontier private `Request` fields from scheduler code,
  matching existing local style but still relying on internal state layout.
 - Resetting `_scheduled` during preemption may affect request scheduling
  accounting outside this RS1 path. It does not clear `_scheduled_at`, so
  schedule history remains present, but downstream assumptions about the
  boolean should be reviewed upstream.
 - Resetting `_num_prefill_tokens_cached` means request-level cached-prefill
  metrics reflect the current post-preemption admission rather than stale
  pre-preemption state. This is necessary for the existing `on_cache_hit` path
  to model cached-prefix progress, but metrics semantics should be confirmed
  with Frontier maintainers.
 - The decode-phase preemption replay mutates Frontier private request token
  fields. Metrics are explicitly anchored to user-facing prompt/output lengths,
  but upstream should review whether this should become a public Request method.
 - The patched `coder_2000` run has many preemptions. RS1 remains a plumbing
  smoke; latency and throughput should not be treated as performance evidence.
--- a/docs/rs1_frontier_smoke.md
+++ b/docs/rs1_frontier_smoke.md
@@ -0,0 +1,163 @@
 # RS1 Frontier Smoke
 RS1 runs Frontier trace replay as a plumbing smoke for the Qwen coder fixtures
 generated in RS0. It checks that Frontier can consume ReplayServe's Frontier CSV,
 preserve online arrivals, run vLLM v1 prefix caching, and emit request/system
 metrics. It does not make latency or throughput claims.
 ## Fixed Configuration
 - `simulation_mode=online`
 - `sys_arch=co-location`
 - `cluster_scheduler=sticky_round_robin`
 - `replica_scheduler=vllm_v1`
 - `device=a800`
 - `network_device=a800_dgx`
 - `model_name=Qwen/Qwen3-32B`
 - `attn_tensor_parallel_size=2`
 - dummy execution predictor, 1 ms per model execution
 - analytical communication backend
 - `trace_request_generator_config_max_tokens=32768`
 - prefix caching enabled
 - block size 16
 - chunked prefill enabled
 - batch cap 128
 - max batch tokens 32768
 - `num_blocks_mode=memory_planner`
 - `gpu_memory_utilization=0.9`
 - `non_kv_cache_overhead_bytes=0`
 The memory planner point uses Frontier's A800 device config
 (`total_memory_gb=80`) and analytical parameter memory. The non-KV overhead is
 set to 0 for this smoke, so the derived KV block count is a permissive plumbing
 budget, not a calibrated serving budget.
 Frontier also ships an `a800_pairwise_nvlink` network profile, but
 `replica_config_network_device` is used to construct a node SKU in the current
 co-location path. This checkout has `A800_DGX` as a node SKU and does not have an
 `A800_PAIRWISE_NVLINK` node SKU, so RS1 uses `a800_dgx`.
 ## Reproduce
 From `/home/gahow/phd/replayserve`:
 ```bash
 PIP_CACHE_DIR=/home/gahow/phd/replayserve/.cache/pip python3 -m pip install \
  --target /home/gahow/phd/replayserve/.deps/python \
  'ddsketch>=3.0,<4' 'fasteners>=0.19,<1' 'numpy>=1.23' 'pandas>=1.5' \
  'plotly>=5.0' 'pyyaml>=6.0' 'scikit-learn>=1.1' 'scipy>=1.9' 'tqdm>=4.64'
 scripts/run_frontier_smoke.sh coder_100
 scripts/run_frontier_smoke.sh coder_2000
 ```
 Each run writes:
 - `runs/rs1/<fixture>/command.txt`
 - `runs/rs1/<fixture>/stdout.log`
 - `runs/rs1/<fixture>/stderr.log`
 - `runs/rs1/<fixture>/exit_code.txt`
 - `runs/rs1/<fixture>/runtime_seconds.txt`
 - `runs/rs1/<fixture>/frontier_metrics/.../config.json`
 - `runs/rs1/<fixture>/frontier_metrics/.../system_metrics.json`
 - `runs/rs1/<fixture>/frontier_metrics/.../request_metrics.csv`
 - `runs/rs1/<fixture>/postprocess_summary.json`
 - `runs/rs1/<fixture>/postprocess_summary.md`
 ## Current Results
 Initial local attempt with `network_device=a800_pairwise_nvlink` failed during
 config reconstruction:
 ```text
 ValueError: [BaseNodeSKUConfig] Invalid type string: a800_pairwise_nvlink
 ```
 The preserved failed run context is under
 `runs/rs1/coder_100_failed_a800_pairwise_nvlink/`.
 The first `a800_dgx` attempt failed because the base Python environment lacked
 Frontier runtime dependencies:
 ```text
 ModuleNotFoundError: No module named 'plotly'
 ```
 Dependencies were installed into ReplayServe-local `.deps/python` with pip
 `--target`; Frontier source was not installed or modified.
 ### coder_100
 Status: passed.
 - Run dir: `runs/rs1/coder_100/`
 - Runtime: 7 seconds
 - Metrics dir:
  `runs/rs1/coder_100/frontier_metrics/qwen_qwen3_32b/online_serving/rs1_coder_100/`
 - Frontier block-level prefix hit ratio: `0.04948661841440835`
 - ReplayServe token-weighted prefix hit ratio: `0.04956232588915065`
 - Frontier total query blocks: `29705`
 - Frontier total hit blocks: `1470`
 - ReplayServe total query tokens: `474554`
 - ReplayServe total hit tokens: `23520`
 - Memory planner mode: `memory_planner`
 - GPU memory utilization: `0.9`
 - A800 memory budget: `80 GiB * 0.9 = 77309411328 bytes`
 - Qwen3-32B TP2 analytical weight shard estimate:
  `28940697600 bytes` (`26.953125 GiB`)
 - Non-KV overhead assumption: `0 bytes`
 - Available KV budget under this smoke assumption: `48368713728 bytes`
 - Derived KV blocks: `36902`
 - Preemption events: `0`
 - Allocation/preemption/OOM log lines: `0`
 The derived KV block count is recomputed by ReplayServe postprocess with the
 same formula as Frontier `MemoryPlanner.get_num_blocks` because this run did
 not emit Frontier's `[MEMORY_STATE]` line in stdout/stderr.
 ### coder_2000
 Status: blocked by Frontier runtime error under the fixed RS1 configuration.
 - Run dir: `runs/rs1/coder_2000/`
 - Runtime: 4 seconds
 - Config:
  `runs/rs1/coder_2000/frontier_metrics/qwen_qwen3_32b/online_serving/rs1_coder_2000/config.json`
 - Failure summary: `runs/rs1/coder_2000/failure_summary.md`
 Frontier failed during vLLM v1 prefix-cache scheduling:
 ```text
 ValueError: Request 194 already scheduled.
 ```
 The traceback reaches
 `vllm_v1_engine_replica_scheduler.py:3185`, where the scheduler calls
 `request.on_cache_hit(prefix_cached_tokens)`, and then
 `request.py:505`, where `Request.on_cache_hit()` rejects cache-hit updates after
 the request has already been scheduled.
 No Frontier source changes were made. RS1 stops here rather than changing
 scheduler knobs, because disabling prefix caching or chunked prefill would no
 longer match the fixed smoke point.
 ## Metric Semantics
 Frontier reports prefix-cache hits at block granularity. ReplayServe postprocess
 uses `sidecar.jsonl` to weight each request's first `hit_blocks` by
 `block_token_counts`, so a hit on a partial final block contributes the true
 partial token count rather than 16 tokens.
 If Frontier omits `request_cached_prefill_tokens`,
 `request_prefix_cache_query_blocks`, or `request_prefix_cache_hit_blocks` from
 `request_metrics.csv`, ReplayServe cannot compute token-weighted hit ratio from
 that run without additional simulator instrumentation.
 ## Limitations
 - Frontier's public A800 compute profiles in the checked source do not include a
  dense `Qwen/Qwen3-32B` profile.
 - Dummy execution predictor is enabled, so TTFT, TPOT, E2E latency, and
  throughput are only pipeline smoke outputs.
 - Memory planner uses analytical parameter memory and a 0-byte non-KV overhead
  assumption. The derived KV capacity must be replaced by calibrated overhead or
  runtime profiling before interpreting capacity pressure.
--- a/docs/rs3_sweep_harness.md
+++ b/docs/rs3_sweep_harness.md
@@ -0,0 +1,143 @@
 # RS3 Sweep Harness
 RS3 adds a reproducible Frontier sweep harness and a tiny smoke. This is not the
 full TP/EP/DP/config scan.
 ## Files
 - Config: `configs/rs3_tiny_sweep.json`
 - Runner: `tools/run_frontier_sweep.py`
 - Aggregator: `tools/aggregate_runs.py`
 - Tiny smoke outputs: `runs/rs3_tiny_smoke_20260624/`
 The output layout is:
 ```text
 runs/<suite>/<sim>/<fixture>/<config_id>/
  command.txt
  env.txt
  run_manifest.json
  run_status.json
  stdout.log
  stderr.log
  exit_code.txt
  runtime_seconds.txt
  frontier_metrics/...
  postprocess_summary.json
  postprocess_summary.md
 runs/<suite>/summary.csv
 runs/<suite>/summary.md
 ```
 ## Config Scheme
 `configs/rs3_tiny_sweep.json` is intentionally small JSON:
 - `suite_id`: output suite under `runs/`.
 - `sim`: simulator/mode name used in the run path.
 - `frontier`: Frontier checkout metadata. The tiny smoke points at patched
  scratch `/tmp/replayserve-frontier-rs1b`, not canonical Frontier.
 - `fixtures`: fixture names under `traces/fixtures/`.
 - `defaults`: fixed Frontier knobs shared by each config.
 - `configs`: named variants with optional `overrides`.
 The exposed Frontier knobs include:
 - parallelism: `attn_tensor_parallel_size`, `attn_data_parallel_size`,
  `moe_tensor_parallel_size`, `moe_expert_parallel_size`,
  `num_pipeline_stages`, `num_replicas`
 - scheduler: `batch_size_cap` / max-num-seqs equivalent,
  `max_tokens_in_batch` / max-batch-tokens equivalent, `block_size`,
  `enable_prefix_caching`, `enable_chunked_prefill`,
  `long_prefill_token_threshold`
 - fixed smoke context: model, device, network device, trace max tokens,
  memory-planner mode, GPU memory utilization, non-KV overhead, and dummy
  execution time
 For dense `Qwen/Qwen3-32B`, the EP-like knobs stay at `1` in the tiny smoke.
 They are present so later MoE configs can be represented without changing the
 harness schema.
 ## Run Commands
 From `/home/gahow/phd/replayserve`:
 ```bash
 python3 tools/run_frontier_sweep.py \
  --config configs/rs3_tiny_sweep.json \
  --suite-id rs3_tiny_smoke_20260624
 python3 tools/aggregate_runs.py runs/rs3_tiny_smoke_20260624
 ```
 The runner refuses to replace an existing selected run directory unless
 `--force` is passed. Use `--dry-run` to emit commands/manifests without running
 Frontier, and `--only-config` / `--only-fixture` to narrow the selected matrix.
 ## Frontier Mode
 The RS3 tiny smoke uses:
 - `frontier.root=/tmp/replayserve-frontier-rs1b`
 - `frontier.mode=patched_scratch`
 - patch file `patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch`
 The canonical checkout `/tmp/toc-llm-sim-research/Frontier` remains clean and is
 not modified by the harness. `summary.csv` records `frontier_dirty=true` for the
 patched scratch because the local patch is applied there; that is expected.
 To run canonical mode for a safe config, copy the JSON config, set
 `frontier.root` to `/tmp/toc-llm-sim-research/Frontier`, change `sim`, and run a
 small selected config. Do not run the fixed-config `coder_2000` fixture against
 the canonical checkout until the prefix-cache/chunked-prefill bug is fixed
 upstream.
 ## Tiny Smoke Results
 Command:
 ```bash
 python3 tools/run_frontier_sweep.py \
  --config configs/rs3_tiny_sweep.json \
  --suite-id rs3_tiny_smoke_20260624
 python3 tools/aggregate_runs.py runs/rs3_tiny_smoke_20260624
 ```
 Results:
 | config | status | runtime | prefix cache | chunked prefill | Frontier block hit ratio | ReplayServe token hit ratio | preemptions |
 |---|---:|---:|---:|---:|---:|---:|---:|
 | `fixed_prefix_on` | pass | 8s | on | on | `0.049486618` | `0.049562326` | 0 |
 | `prefix_cache_off` | pass | 7s | off | on | n/a | n/a | 0 |
 Aggregated files:
 - `runs/rs3_tiny_smoke_20260624/summary.csv`
 - `runs/rs3_tiny_smoke_20260624/summary.md`
 The prefix-off run does not have Frontier cache columns in `request_metrics.csv`;
 `summary.csv` records `cache_metrics_available=false` and the missing-column
 reason.
 TTFT/TPOT/E2E/throughput fields are aggregated from Frontier `system_metrics.json`
 when present. In this tiny smoke they are dummy-predictor plumbing outputs, not
 performance results.
 ## Not Yet Run
 - No `coder_2000` sweep was run in RS3.
 - No TP/DP/EP matrix was swept.
 - No batch cap, max batch tokens, block size, chunked-prefill, or threshold
  matrix was swept beyond the two-config smoke.
 - No canonical Frontier patched-vs-unpatched comparison was rerun.
 - No Vidur or AIConfigurator run is part of this harness yet.
 ## Next Harness Work
 - Add a small checked-in config for a real RS3 candidate grid only after deciding
  the patch/upstream policy.
 - Add guardrails for invalid dense/MoE parallelism combinations before launching
  larger matrices.
 - Investigate `coder_2000` missing request-level cache fields before using
  request-level hit ratio as a headline sweep metric.
 - Keep latency/throughput result tables clearly separated by predictor/profile
  mode: dummy smoke, profiled Frontier, or calibrated run.
--- a/docs/rs4_frontier_h20_tp1_alignment.md
+++ b/docs/rs4_frontier_h20_tp1_alignment.md
@@ -0,0 +1,740 @@
 # RS4 Frontier H20 TP1 Alignment
 This note compares Frontier H20 TP1 against the real vLLM TP1 run on dash2 for
 `coder_100`.
 ## Setup
 Real vLLM:
 - Runtime: vLLM 0.11.1
 - Host/GPU: dash2, NVIDIA H20
 - Model: `/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`
 - TP: 1
 - KV capacity: 244,496 tokens = 15,281 blocks at block size 16
 - Run: `runs/vllm_gpu_smoke_20260624/tp1_coder100_uncapped`
 Frontier:
 - Frontier root: `/tmp/replayserve-frontier-rs1b`
 - Frontier commit: `d9cfeb6d8791fbf2f295dd9744c56a666171776e`
 - Model config name: `qwen3-a3b-30b-moe`
 - Device: `h20`
 - Network node SKU: `h20_dgx`
 - TP: `attn_tensor_parallel_size=1`, `moe_tensor_parallel_size=1`,
  `moe_expert_parallel_size=1`
 - `max_tokens_in_batch=32768`, `batch_size_cap=64`, block size 16
 - Prefix cache on, chunked prefill on
 - `long_prefill_token_threshold=32768`
 - Config: `configs/rs4_frontier_h20_tp1.json`
 - Run: `runs/rs4_frontier_h20_tp1_20260624`
 The high long-prefill threshold is deliberate. An earlier Frontier run with
 `long_prefill_token_threshold=64` under-counted prefix hits because long
 prompts were admitted in 64-token chunks, unlike the current real vLLM run.
 ## KV Capacity
 | run | KV blocks | KV tokens | note |
 |---|---:|---:|---|
 | Frontier `planner_kv` | 17,385 | 278,160 | Frontier H20 memory planner, no non-KV overhead |
 | Frontier `vllm_kv_15281` | 15,281 | 244,496 | Explicitly matched to real vLLM TP1 |
 | vLLM TP1 | 15,281 | 244,496 | From vLLM memory profiling |
 So only `vllm_kv_15281` has the same KV block count as real vLLM TP1.
 ## Results
 | run | completed | prefix hit tokens / ratio | preemptions | TTFT p50/p95 | TPOT p50/p95 | E2E p50/p95 | decode tok/s |
 |---|---:|---:|---:|---:|---:|---:|---:|
 | Frontier `planner_kv` | 96/100 | 110,608 / 0.240691 | 0 | 0.986/128.991s | 0.582/0.582s | 279.092/1706.675s | 19.4 |
 | Frontier `vllm_kv_15281` | 92/100 | 103,168 / 0.242542 | 0 | 0.964/182.639s | 0.582/0.582s | 305.290/1765.347s | 19.4 |
 | vLLM TP1 real | 100/100 | 119,152 / 0.251082 sidecar estimate | 8 | 4.503/29.060s | 0.066/0.621s | 41.841/97.366s | 567.4 |
 The latency/throughput rows are not calibrated. Frontier still uses dummy
 execution timing, so TPOT is a constant simulator artifact.
 ## Prefix Admission Check
 For TP1, real vLLM has preemption. Therefore the sidecar theoretical prefix-hit
 estimate is not the right observed comparator for every request. The observed
 vLLM scheduler signal is the first `computed:` value in `stdout.log` for each
 request start.
 Using first-start `computed:` tokens:
 | Frontier run | compared rows | Frontier computed sum | vLLM first-start computed sum | mismatch |
 |---|---:|---:|---:|---:|
 | `planner_kv` | 96 | 110,608 | 108,208 | one request differs |
 | `vllm_kv_15281` | 92 | 103,168 | 103,168 | exact match |
 So with the KV block count explicitly matched, Frontier's prefix-cache admission
 matches real vLLM TP1 for every row where Frontier emits complete cache metrics.
 ## Current Alignment Judgment
 Aligned:
 - H20 device and Qwen3-30B-A3B structural model config can run in Frontier.
 - TP1 scheduler knobs can be matched.
 - KV block count can be matched explicitly at 15,281 blocks.
 - First-admission prefix-cache hit tokens match real vLLM TP1 on completed rows
  when KV blocks are explicit.
 Not aligned:
 - Frontier emits complete request/cache metrics for only 92/100 requests in the
  explicit-KV run, while vLLM completes 100/100.
 - Frontier reports 0 preemptions; real vLLM TP1 reports 8 preemptions across 5
  repeated-start requests.
 - Frontier timing is not comparable because it still uses dummy execution
  prediction. The current latency/throughput gap is expected and not a
  calibrated simulator error.
 Next work:
 - Treat RS6 (the 32K-profile run reported in the follow-up sections below) as
  the current profiled baseline and investigate why it omits complete
  latency/cache metrics for requests `70`, `77`, `88`, and `90`.
 - Instrument Frontier's vLLM V1 scheduler around KV block allocation, free-block
  count, and preemption victim selection. Real vLLM TP1 has 8 preemptions, while
  Frontier still reports 0 with the same explicit 15,281-block capacity.
 - Add a per-request Frontier/vLLM comparator that reports TTFT/TPOT/E2E ratios,
  prefix hits, and completion/preemption status on the same request ids.
 - Calibrate CPU/scheduler/CUDA-graph effects separately from op profile timing;
  RS6 removed the 4096-token linear/MoE extrapolation as the primary explanation
  for the remaining gap.
 ## Performance Gap
 Use Frontier `vllm_kv_15281` as the current aligned-KV simulator point. This
 matches the real vLLM TP1 KV block count, but it still uses Frontier dummy
 execution timing.
 | metric | Frontier H20 TP1 explicit KV | real vLLM H20 TP1 | gap |
 |---|---:|---:|---:|
 | completed requests | 92/100 | 100/100 | not aligned |
 | TTFT p50 | 0.964s | 4.503s | Frontier 0.21x real |
 | TTFT p95 | 182.639s | 29.060s | Frontier 6.28x real |
 | TPOT p50 | 0.582s | 0.066s | Frontier 8.81x real |
 | TPOT p95 | 0.582s | 0.621s | Frontier 0.94x real |
 | E2E p50 | 305.290s | 41.841s | Frontier 7.30x real |
 | E2E p95 | 1765.347s | 97.366s | Frontier 18.13x real |
 | RPS | 0.0217 | 0.6880 | vLLM 31.74x Frontier |
 | decode tok/s | 19.4 | 567.4 | vLLM 29.20x Frontier |
 Interpretation:
 - The prefix admission path is close after explicit KV matching, but performance
  is not calibrated.
 - Frontier uses dummy execution timing; its TPOT is nearly constant at 582 ms,
  while real vLLM TP1 has p50 TPOT 66 ms and p95 TPOT 621 ms.
 - Frontier does not reproduce real vLLM's TP1 preemption behavior: real vLLM had
  8 preemptions, while Frontier reported 0.
 - Frontier emits complete request/cache metrics for only 92 rows in this run,
  so p95 and throughput are not yet on the same request set.
 - The direction of the TTFT error is mixed: Frontier p50 TTFT is too optimistic
  (0.21x real), but p95 TTFT is far too pessimistic (6.28x real). This is
  consistent with uncalibrated execution timing plus different
  queue/preemption dynamics.
 ## RS5 Profiled Frontier Timing
 Frontier does support replacing dummy timing with real CSV profiles through the
 random-forest execution-time predictor. The required non-dummy flags are wired
 in `tools/run_frontier_sweep.py`, and the active profiled config is
 `configs/rs5_frontier_h20_tp1_profile.json`.
 Profile data collected on dash2 H20 TP1:
 - Linear ops: `linear_op.csv`, CUDA event, max tokens 4096.
 - Attention: `attention_combined.csv`, CUDA event, max sequence/chunk 18000,
  with 15417 standard rows plus 612 true-mixed rows. Online replay needs the
  true-mixed rows to train `attn_prefill_mixed` and `attn_decode_in_mixed`.
 - MoE: `moe_vllm_fused.csv`, CUDA event, max tokens 4096, vLLM fused MoE
  backend.
 Frontier vLLM 0.11.1 profiling needed local compatibility patches in
 `patches/frontier-vllm-0.11.1-profiling-compat.patch`:
 - RoPE helper fallback when vLLM 0.11.1 `get_rope()` no longer accepts the
  legacy `rotary_dim` keyword.
 - `_get_config_dtype_str` fallback for vLLM fused MoE config dtype.
 - `ReplicatedLinear(disable_tp=True)` fallback to torch `Linear` when vLLM TP
  group is not initialized in standalone profiling.
 - `fused_topk()` variable-return handling.
 - `invoke_fused_moe_kernel()` 0.11.1 signature compatibility.
 The first profiled MoE attempt used Frontier's `frontier_loop` backend and was
 not faithful to vLLM serving. It predicted `moe_grouped_gemm` at about 16 ms for
 24 tokens and 19 ms for 1024 tokens, causing TPOT around 0.93 s. The vLLM fused
 MoE profile predicts about 0.32 ms for 24 tokens and 0.87 ms for 1024 tokens.
 | run | completed | prefix hit ratio | TTFT p50/p95 | TPOT p50/p95 | E2E p50/p95 | total tok/s | decode tok/s |
 |---|---:|---:|---:|---:|---:|---:|---:|
 | Frontier dummy `vllm_kv_15281` | 92/100 | 0.2422 | 0.964/182.639s | 0.582/0.582s | 305.290/1765.347s | 131.3 | 19.4 |
 | Frontier profiled `frontier_loop` MoE | 93/100 | 0.2492 | 3.320/310.235s | 0.930/1.767s | 492.097/2038.538s | 165.9 | 24.6 |
 | Frontier profiled vLLM fused MoE | 97/100 | 0.2376 | 0.355/13.695s | 0.056/0.098s | 27.032/119.019s | 2056.7 | 304.5 |
 | Frontier profiled vLLM fused MoE, linear/MoE 32K | 96/100 | 0.2484 | 0.909/12.763s | 0.057/0.146s | 30.939/119.636s | 2348.9 | 347.8 |
 | vLLM TP1 real | 100/100 | 0.2511 | 4.503/29.060s | 0.066/0.621s | 41.841/97.366s | 3832.3 | 567.4 |
 Current judgment:
 - The profiled vLLM fused MoE run is the first useful timing baseline. TPOT p50
  is close to real vLLM, but throughput is still about 54% of real vLLM and
  TTFT/E2E tails do not align.
 - After extending linear and MoE profiles to 32768 tokens and adding
  `prefill_hot` MoE rows, the prefix hit ratio in the table above is nearly
  aligned (0.2484 vs vLLM 0.2511), throughput improves to about 61% of real
  vLLM, and TTFT p50 moves from 0.08x to 0.20x of real vLLM. This confirms that
  the 4096-token profile ceiling was a real source of error.
 - Prefix/cache accounting remains close but not exact: the 32K profiled run
  emits complete cache metrics for 96/100 requests, with token hit ratio
  0.2488 vs vLLM's sidecar estimate 0.2511.
 - Frontier still reports zero preemptions, while real vLLM TP1 had 8 preemption
  events. This affects completion set, TTFT tail, and E2E tail.
 - The remaining gaps are no longer explained by the linear/MoE 4096-token
  extrapolation alone. The 32K run still has TTFT p50 at 0.20x, TTFT p95 at
  0.44x, TPOT p95 at 0.23x, and throughput at 0.61x of real vLLM. This points
  to missing CPU/scheduler/CUDA-graph modeling plus Frontier's scheduler and
  completion/preemption fidelity.
 - The 32K run still completes only 96/100 requests in latency/cache metrics
  (`70`, `77`, `88`, `90` missing), while real vLLM completes 100/100. This is
  a Frontier lifecycle/metrics or scheduler-fidelity issue to debug separately.
 ## 2026-06-24 Follow-Up
 Handled in the ReplayServe harness:
 - `tools/run_frontier_sweep.py` now passes an absolute metrics output path into
  Frontier. Frontier runs with `cwd=/tmp/replayserve-frontier-rs1b`; relative
  metrics paths can otherwise be written under the Frontier scratch instead of
  ReplayServe's run directory.
 - `tools/postprocess_frontier_smoke.py` now emits a `completion` block with
  `completed_requests`, `total_requests`, and `missing_latency_request_ids`.
 - `tools/aggregate_runs.py` now marks a run as `incomplete` when postprocess
  reports missing latency rows. The latest RS6 summary is therefore incomplete,
  not a clean pass.
 Latest RS6 vs real vLLM TP1 after the 32K profile and harness fixes:
 | metric | Frontier RS6 32K profile | real vLLM TP1 | Frontier / vLLM |
 |---|---:|---:|---:|
 | completed requests | 96/100 | 100/100 | 0.96 |
 | prefix token hit ratio | 0.2488 | 0.2511 | 0.99 |
 | preemption events | 0 | 8 | 0.00 |
 | TTFT p50 | 0.909s | 4.503s | 0.20 |
 | TTFT p95 | 12.763s | 29.060s | 0.44 |
 | TPOT p50 | 0.0569s | 0.0661s | 0.86 |
 | TPOT p95 | 0.146s | 0.621s | 0.23 |
 | E2E p50 | 30.939s | 41.841s | 0.74 |
 | E2E p95 | 119.636s | 97.366s | 1.23 |
 | total tok/s | 2348.9 | 3832.3 | 0.61 |
 | decode tok/s | 347.8 | 567.4 | 0.61 |
 Preemption experiment:
 - A local trial enabled waiting-admission preemption in Frontier Phase 2. It did
  produce preemption events, but it was not a valid alignment improvement:
  Frontier completed only 79/100 requests and amplified the early-decode
  disappearance pattern. That config was removed from `configs/`.
 - This means the remaining preemption gap is not just "turn on preemption in
  Phase 2". Frontier's batch/runtime-epoch lifecycle needs a deeper fix before
  its preemption behavior can be considered faithful to vLLM TP1.
 Current interpretation:
 - Prefix/cache replay is close: token-weighted prefix hit ratio is within about
  1% relative of the vLLM synthetic replay estimate.
 - Completion/preemption is not aligned. Requests `70`, `77`, `88`, and `90`
  begin decode in RS6 but never reach completion metrics; vLLM completes all
  100 requests and logs 8 preemption events.
 - Timing is partially useful but not fully calibrated. Linear and MoE profiles
  now cover the trace's long-prefill range up to 32768 tokens, so the old 4096
  extrapolation is no longer the main explanation. The remaining TTFT/TPOT/E2E
  gap likely comes from missing CPU/scheduler overhead, decode CUDA graph
  modeling, and Frontier scheduler lifecycle differences.
 ## 2026-06-25 500-Request Stress
 Generated `traces/fixtures/coder_500` from the first 500 rows of
 `qwen_coder_blksz_16.jsonl`:
 - `row_count=500`
 - `max_total_tokens=21318`
 - `overflow_count=0`
 - `partial_final_block_rows=466`
 Frontier RS8 used the same H20 TP1 Qwen3-30B-A3B full32K profile and explicit
 KV block count as RS6:
 - Config:
  `configs/rs8_frontier_h20_tp1_profile_full32k_coder500.json`
 - Run:
  `runs/rs8_frontier_h20_tp1_profile_full32k_coder500_20260625`
 - Runtime: 492 seconds
 - Status: incomplete
 | metric | Frontier RS6 100 reqs | Frontier RS8 500 reqs |
 |---|---:|---:|
 | completed requests | 96/100 | 439/500 |
 | missing latency/cache rows | 4 | 61 |
 | prefix token hit ratio | 0.2488 | 0.1192 |
 | preemption events | 0 | 0 |
 | TTFT p50/p95 | 0.909/12.763s | 136.776/340.237s |
 | TPOT p50/p95 | 0.0569/0.146s | 0.0564/0.0894s |
 | E2E p50/p95 | 30.939/119.636s | 177.800/397.291s |
 | total tok/s | 2348.9 | 4733.7 |
 | decode tok/s | 347.8 | 656.2 |
 Missing request ids in RS8:
 ```text
 70,77,88,90,103,106,134,135,142,143,153,154,176,178,183,184,186,188,210,211,216,222,245,246,263,272,274,278,291,298,299,300,320,325,334,335,347,348,363,367,373,374,393,399,403,409,412,413,414,433,434,437,439,450,453,460,469,475,476,479,497
 ```
 The incomplete-row issue clearly scales: 4/100 missing in RS6 becomes 61/500
 missing in RS8. This makes RS8 invalid for final performance claims, but useful
 as a stress signal for Frontier lifecycle/metrics fidelity.
 The lower prefix hit ratio is not by itself proof of adapter failure. The
 unbounded trace-side trie estimate for `coder_500` is 0.3868 token hit ratio,
 but the H20 TP1 configuration has finite KV capacity (`num_blocks=15281`, about
 244K tokens). The 500-request window has 2.7M prompt tokens, so KV eviction can
 substantially reduce real prefix hits. The dash1 vLLM run below is the current
 finite-cache comparator for whether Frontier's behavior is faithful.
 Real vLLM TP1 500 was first attempted on dash2 with the same settings as
 `tp1_coder100_uncapped` (`max_num_seqs=64`, `max_num_batched_tokens=32768`,
 `gpu_memory_utilization=0.85`, `CUDA_VISIBLE_DEVICES=0`), but did not start
 because dash2 was already occupied by eight existing `agentic-kvc` vLLM serve
 processes on ports 8000-8007. Each H20 had about 89GB allocated, and vLLM failed
 with free memory below the required 0.85 utilization target. Those processes
 were not killed; the temporary ReplayServe GPU lock was released.
 A replacement vLLM TP1 500 run completed on dash1:
 - Run:
  `runs/vllm_gpu_smoke_20260625_dash1/tp1_coder500_uncapped`
 - Runtime: vLLM 0.11.1
 - Host/GPU: dash1, one NVIDIA H20 via `CUDA_VISIBLE_DEVICES=0`
 - Model: `/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`
 - Command knobs: `TP=1`, `max_model_len=32768`, `max_num_seqs=64`,
  `max_num_batched_tokens=32768`, `gpu_memory_utilization=0.85`,
  prefix caching on, chunked prefill on
 - vLLM profiled KV capacity: 244,496 tokens = 15,281 blocks at block size 16
 - Replay wall time after engine startup: 595.116 seconds
 - Process elapsed including model load/startup: 2026-06-25T03:08:18Z to
  2026-06-25T03:19:41Z
 | metric | Frontier RS8 500 reqs | vLLM TP1 500 reqs | vLLM / Frontier |
 |---|---:|---:|---:|
 | completed requests | 439/500 | 500/500 | not aligned |
 | preemption events | 0 | 63 | not aligned |
 | repeated/preempted request ids | 0 | 57 | not aligned |
 | TTFT p50 | 136.776s | 185.658s | 1.36 |
 | TTFT p95 | 340.237s | 375.895s | 1.10 |
 | TPOT p50 | 0.0564s | 0.0498s | 0.88 |
 | TPOT p95 | 0.0894s | 0.0919s | 1.03 |
 | E2E p50 | 177.800s | 224.270s | 1.26 |
 | E2E p95 | 397.291s | 417.356s | 1.05 |
 | requests/s | 0.661 | 0.840 | 1.27 |
 | total tok/s | 4733.7 | 5282.9 | 1.12 |
 | decode tok/s | 656.2 | 732.3 | 1.12 |
 Because Frontier emits latency/cache rows for only 439 requests, the latency
 comparison above mixes Frontier's completed subset with vLLM's complete 500-row
 run. Restricting vLLM to the same 439 request ids gives:
 | metric | Frontier RS8 439 rows | vLLM same 439 ids | vLLM / Frontier |
 |---|---:|---:|---:|
 | TTFT p50 | 136.776s | 169.968s | 1.24 |
 | TTFT p95 | 340.237s | 375.760s | 1.10 |
 | TPOT p50 | 0.0564s | 0.0498s | 0.88 |
 | TPOT p95 | 0.0894s | 0.1071s | 1.20 |
 | E2E p50 | 177.800s | 218.606s | 1.23 |
 | E2E p95 | 397.291s | 416.110s | 1.05 |
 Prefix/cache comparison needs careful metric naming:
 - The unbounded ReplayServe trie estimate for all 500 rows is 1,047,632 hit
  tokens / 2,708,110 prompt tokens = 0.3868 token hit ratio.
 - vLLM's finite-cache scheduler log is much lower under this pressure:
  first-start `computed:` ratio is 0.0979, last-start ratio is 0.1643, and
  max-per-request ratio is 0.1655.
 - On the same 439 request ids where Frontier emits complete metrics, vLLM's
  first-start `computed:` ratio is 0.1050, last-start ratio is 0.1665, and
  max-per-request ratio is 0.1679.
 - Frontier RS8 reports `replayserve_token_hit_ratio=0.1192` and
  `frontier_block_hit_ratio=0.1191`, which is in the same range as vLLM's
  finite-cache scheduler signal but far below the unbounded trace-side estimate.
 Current 500-request judgment:
 - Frontier's timing profile is now in the right broad range for this stressed
  H20 TP1 run: TPOT p50/p95 and E2E p95 are close to vLLM, and aggregate token
  throughput is within about 12%.
 - The run is still not a faithful simulator result because completion and
  preemption diverge: Frontier drops 61 latency/cache rows and reports zero
  preemptions, while real vLLM completes all 500 requests and logs 63
  preemption events across 57 request ids.
 - The 500-request trace invalidates the earlier use of the unbounded sidecar
  prefix estimate as the primary comparator. Finite KV capacity, eviction, and
  preemption must be part of the prefix-cache replay metric.
 ReplayServe TODO:
 - Treat incomplete Frontier runs as invalid for final performance claims unless
  the comparison explicitly reports the missing request set.
 - Keep the focused Frontier debug guard in the local patch: sequential mode now
  fails if `completed_requests < total_requests` at drain time and reports the
  missing request state.
 - Add a comparator that reports both unbounded trace-side prefix reuse and
  finite-cache observed reuse from vLLM scheduler logs; do not compare
  Frontier's finite-cache hit ratio directly to the unbounded trie estimate.
 - Profile or import vLLM CPU overhead records for H20 TP1 before turning on CPU
  overhead modeling with `skip_cpu_overhead_modeling=false`; without those
  records Frontier falls back to zero CPU overhead.
 - Collect kernel-only/decode-CUDA-graph timing profiles before using
  `decode_cuda_graph_mode=full_decode_only`; the current RS6 profile is CUDA
  event/eager timing.
 ## 2026-06-25 200-Request Timestamp Scale 2/3
 Generated `traces/fixtures/coder_200_ts0667` from the first 200 rows of
 `qwen_coder_blksz_16.jsonl`, with each timestamp multiplied by `2/3` in the
 fixture files:
 - `row_count=200`
 - `timestamp_scale=0.6666666666666666`
 - `last_timestamp=30.711333333333332`
 - `max_total_tokens=18985`
 - `partial_final_block_rows=182`
 Important: in the current replay semantics, smaller timestamp scale makes
 arrivals denser. It reduces the arrival window from about 46.1s to 30.7s for the
 first 200 requests. This does not reduce queue pressure relative to the same
 200 requests at scale 1.0; it only reduces the request count relative to the
 500-request stress.
 Frontier RS9:
 - Config:
  `configs/rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667.json`
 - Run:
  `runs/rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667`
 - Runtime: 460 seconds
 - Status: incomplete
 vLLM dash1 TP1:
 - Run:
  `runs/vllm_gpu_smoke_20260625_dash1/tp1_coder200_ts0667_uncapped`
 - Runtime: vLLM 0.11.1
 - Host/GPU: dash1, one NVIDIA H20 via `CUDA_VISIBLE_DEVICES=0`
 - vLLM profiled KV capacity: 244,496 tokens = 15,281 blocks at block size 16
 - Replay wall time after engine startup: 242.813 seconds
 | metric | Frontier RS9 200 ts=2/3 | vLLM TP1 200 ts=2/3 | vLLM / Frontier |
 |---|---:|---:|---:|
 | completed requests | 176/200 | 200/200 | not aligned |
 | preemption events | 0 | 26 | not aligned |
 | TTFT p50 | 20.580s | 34.563s | 1.68 |
 | TTFT p95 | 96.718s | 120.804s | 1.25 |
 | TPOT p50 | 0.0584s | 0.0515s | 0.88 |
 | TPOT p95 | 0.2359s | 0.2535s | 1.07 |
 | E2E p50 | 73.207s | 83.622s | 1.14 |
 | E2E p95 | 189.240s | 183.727s | 0.97 |
 | requests/s | 0.583 | 0.824 | 1.41 |
 | total tok/s | 3913.4 | 4864.8 | 1.24 |
 | decode tok/s | 593.3 | 737.5 | 1.24 |
 Restricting vLLM to the same 176 request ids where Frontier emits complete
 metrics gives:
 | metric | Frontier RS9 176 rows | vLLM same 176 ids | vLLM / Frontier |
 |---|---:|---:|---:|
 | TTFT p50 | 20.580s | 27.896s | 1.36 |
 | TTFT p95 | 96.718s | 120.804s | 1.25 |
 | TPOT p50 | 0.0584s | 0.0520s | 0.89 |
 | TPOT p95 | 0.2359s | 0.2539s | 1.08 |
 | E2E p50 | 73.207s | 82.645s | 1.13 |
 | E2E p95 | 189.240s | 183.727s | 0.97 |
 Prefix/cache comparison:
 - The unbounded ReplayServe trie estimate for all 200 rows is 270,336 hit
  tokens / 1,002,154 prompt tokens = 0.2698 token hit ratio.
 - vLLM finite-cache scheduler signal for all 200 rows: first-start `computed:`
  ratio 0.1392, last-start ratio 0.2126, max-per-request ratio 0.2129.
 - On the same 176 request ids where Frontier emits complete metrics, vLLM
  first-start ratio is 0.1487, last-start ratio is 0.1926, and max-per-request
  ratio is 0.1927.
 - Frontier RS9 reports `replayserve_token_hit_ratio=0.1703` and
  `frontier_block_hit_ratio=0.1700`, again between vLLM first-start and
  last/max finite-cache scheduler signals.
 Missing request ids in RS9:
 ```text
 70,78,80,86,87,89,96,101,102,105,125,126,131,132,135,144,145,146,147,148,149,150,151,198
 ```
 Current 200-request judgment:
 - Reducing the request count from 500 to 200 substantially reduces TTFT and E2E
  tails, but `scale=2/3` is still a dense-arrival stress test. vLLM TTFT p95 is
  still 120.8s.
 - Frontier timing is closer than the old 100-request dummy/profile baselines:
  TPOT p50/p95 and E2E p50/p95 are broadly aligned.
 - Completion/preemption remains the blocking fidelity issue: Frontier drops 24
  rows and reports zero preemptions; vLLM completes all 200 and logs 26
  preemptions across 22 repeated-start request ids.
 - To actually reduce queue pressure for the same first 200 requests, use a
  timestamp scale greater than 1. The follow-up scale 2 and 3 runs below do
  this.
 ## 2026-06-25 200-Request Timestamp Scale 2 and 3
 Generated two more first-200 fixtures from `qwen_coder_blksz_16.jsonl`:
 | fixture | timestamp scale | last timestamp | max total tokens |
 |---|---:|---:|---:|
 | `coder_200_ts2` | 2.0 | 92.134s | 18,985 |
 | `coder_200_ts3` | 3.0 | 138.201s | 18,985 |
 These are the intended lower-arrival-pressure runs. The request payloads are the
 same first 200 rows as `coder_200_ts0667`; only timestamps differ.
 Frontier RS10:
 - Config:
  `configs/rs10_frontier_h20_tp1_profile_full32k_coder200_ts2_ts3.json`
 - Run:
  `runs/rs10_frontier_h20_tp1_profile_full32k_coder200_ts2_ts3`
 - Status: incomplete for both fixtures
 vLLM dash1 TP1:
 - Runs:
  `runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts2_uncapped`
  and `runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts3_uncapped`
 - Runtime: vLLM 0.11.1
 - Host/GPU: dash1, one NVIDIA H20 via `CUDA_VISIBLE_DEVICES=0`
 - vLLM profiled KV capacity: 244,496 tokens = 15,281 blocks at block size 16
 Run-level comparison:
 | metric | Frontier scale 2 | vLLM scale 2 | Frontier scale 3 | vLLM scale 3 |
 |---|---:|---:|---:|---:|
 | completed requests | 182/200 | 200/200 | 184/200 | 200/200 |
 | preemption events | 0 | 43 | 0 | 16 |
 | TTFT p50 | 8.118s | 9.217s | 0.779s | 1.166s |
 | TTFT p95 | 67.850s | 69.211s | 35.918s | 32.258s |
 | TPOT p50 | 0.0544s | 0.0497s | 0.0544s | 0.0462s |
 | TPOT p95 | 0.0747s | 0.0686s | 0.0773s | 0.0714s |
 | E2E p50 | 51.118s | 55.002s | 40.641s | 33.213s |
 | E2E p95 | 162.607s | 142.338s | 158.434s | 122.789s |
 | requests/s | 0.593 | 0.803 | 0.544 | 0.780 |
 | total tok/s | 3846.1 | 4742.5 | 3490.6 | 4608.1 |
 | decode tok/s | 583.1 | 719.0 | 529.2 | 698.6 |
 Restricting vLLM to the same request ids where Frontier emits complete metrics:
 | metric | Frontier scale 2 182 rows | vLLM same 182 ids | Frontier scale 3 184 rows | vLLM same 184 ids |
 |---|---:|---:|---:|---:|
 | TTFT p50 | 8.118s | 8.574s | 0.779s | 0.945s |
 | TTFT p95 | 67.850s | 68.934s | 35.918s | 32.258s |
 | TPOT p50 | 0.0544s | 0.0501s | 0.0544s | 0.0461s |
 | TPOT p95 | 0.0747s | 0.0686s | 0.0773s | 0.0679s |
 | E2E p50 | 51.118s | 53.263s | 40.641s | 33.213s |
 | E2E p95 | 162.607s | 141.264s | 158.434s | 122.789s |
 Prefix/cache comparison:
 | metric | scale 2 | scale 3 |
 |---|---:|---:|
 | unbounded trace-side token hit ratio | 0.2698 | 0.2698 |
 | vLLM first-start `computed:` ratio | 0.1433 | 0.1471 |
 | vLLM last-start `computed:` ratio | 0.2382 | 0.1968 |
 | vLLM max-per-request `computed:` ratio | 0.2383 | 0.1998 |
 | Frontier `replayserve_token_hit_ratio` | 0.1448 | 0.1523 |
 | Frontier `frontier_block_hit_ratio` | 0.1446 | 0.1521 |
 Current scale 2 and 3 judgment:
 - The intended lower-pressure `scale=2` and `scale=3` runs do reduce queueing.
  vLLM TTFT p95 drops from 120.8s at `scale=2/3` to 69.2s at `scale=2` and
  32.3s at `scale=3`.
 - `scale=3` is the first run where vLLM p50 TTFT is near 1s. The p95 is still
  high because long prompts and KV pressure remain, but the severe all-request
  queueing seen in the 500-request run is much reduced.
 - Frontier timing is now close on TTFT and TPOT for the completed-row subset,
  especially at `scale=2`. However, Frontier still misses completion/cache rows
  and still reports zero preemptions.
 - Completion/preemption is therefore still the main Frontier fidelity blocker:
  `scale=2` misses 18 rows and vLLM logs 43 preemptions; `scale=3` misses 16 rows
  and vLLM logs 16 preemptions.
 ## 2026-06-25 Frontier Lifecycle Fix For RS10
 The missing-row root cause was Frontier lifecycle handling after decode-phase
 preemption. Missing requests were preempted after prefill/decode had started,
 then left in this inconsistent state:
 ```text
 preempted=True
 is_prefill_complete=True
 num_processed_tokens=0
 scheduled=False
 completed=False
 ```
 The next waiting admission computed `num_new_tokens=0` and removed the request
 from the queue, so sequential simulation drained with fewer completed requests
 but no remaining scheduler work.
 The updated ReplayServe Frontier patch now:
 - replays decode-phase preemption by treating already-produced tokens as the
  next prefill segment and the remaining tokens as decode work;
 - preserves unfinished zero-token waiting requests instead of silently dropping
  them;
 - reports metrics against user-facing trace prompt/output lengths after runtime
  token splitting;
 - fails fast if sequential mode drains before all generated requests complete.
 Verification runs:
 | run | old completion | fixed completion | Frontier preemptions | prefix token hit ratio | status |
 |---|---:|---:|---:|---:|---|
 | `coder_200_ts2` | 182/200 | 200/200 | 33 | 0.2313 | pass |
 | `coder_200_ts3` | 184/200 | 200/200 | 20 | 0.2177 | pass |
 Fixed-run paths:
 - `runs/rs10_preemption_replay_fix_ts2/frontier_h20_tp1_profile_full32k/coder_200_ts2/vllm_kv_15281_profile_full32k`
 - `runs/rs10_preemption_replay_fix_ts3/frontier_h20_tp1_profile_full32k/coder_200_ts3/vllm_kv_15281_profile_full32k`
 Updated run-level comparison:
 | metric | Frontier scale 2 fixed | vLLM scale 2 | Frontier scale 3 fixed | vLLM scale 3 |
 |---|---:|---:|---:|---:|
 | completed requests | 200/200 | 200/200 | 200/200 | 200/200 |
 | preemption events | 33 | 43 | 20 | 16 |
 | TTFT p50 | 9.595s | 9.217s | 1.001s | 1.166s |
 | TTFT p95 | 77.503s | 69.211s | 45.947s | 32.258s |
 | TPOT p50 | 0.0542s | 0.0497s | 0.0534s | 0.0462s |
 | TPOT p95 | 0.0665s | 0.0686s | 0.0686s | 0.0714s |
 | E2E p50 | 61.458s | 55.002s | 44.761s | 33.213s |
 | E2E p95 | 174.484s | 142.338s | 154.548s | 122.789s |
 | requests/s | 0.594 | 0.803 | 0.574 | 0.780 |
 | total tok/s | 3506.3 | 4742.5 | 3390.0 | 4608.1 |
 | decode tok/s | 531.6 | 719.0 | 513.9 | 698.6 |
 Current judgment after the fix:
 - The completion/preemption lifecycle blocker for RS10 is fixed: both scale 2
  and scale 3 now emit 200 request rows and complete postprocess.
 - Frontier's preemption count is now of the same order of magnitude as vLLM's,
  but not exact: scale 2 is 33 vs 43 events, scale 3 is 20 vs 16 events.
 - Prefix hit ratio changed materially because preempted requests now replay and
  re-enter prefix-cache admission instead of disappearing. It is no longer valid
  to compare the old incomplete RS10 prefix ratios against vLLM.
 - Timing remains close in TPOT, but Frontier is still slower in aggregate
  throughput, about 0.74x of vLLM total/decode token throughput for both scale 2
  and scale 3. Frontier's TTFT/E2E tails also remain worse than vLLM's now that
  the completion set is complete.
 - Remaining gap is no longer "missing metrics rows"; it is scheduler/preemption
  fidelity plus CPU/scheduler/CUDA-graph timing calibration.
 ## 2026-06-25 H20 TP2/TP4 Comparison
 The TP2/TP4 comparison uses the same first-200 `coder_200_ts2` and
 `coder_200_ts3` fixtures. The vLLM runs are on dash1 with
 `/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`, vLLM 0.11.1,
 `max_model_len=32768`, `max_num_seqs=64`,
 `max_num_batched_tokens=32768`, `gpu_memory_utilization=0.85`,
 prefix caching on, and chunked prefill on.
 vLLM measured KV capacity:
 | TP | KV tokens | KV blocks |
 |---:|---:|---:|
 | 2 | 1,104,880 | 69,055 |
 | 4 | 2,833,232 | 177,077 |
 Frontier RS12 uses explicit matching KV blocks and fresh H20 TP2/TP4 profiles:
 - Config:
  `configs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3.json`
 - Run:
  `runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3`
 - Profile source:
  `dash1:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp2_tp4_qwen3_30ba3b_full32k_20260625_true_mixed`
 - Linear/MoE profiles cover TP2/TP4 up to 32768 tokens.
 - Attention profile covers TP2/TP4 standard attention plus 1260 true-mixed
  prefill+decode rows. The true-mixed rows are required; standard attention
  alone fails with missing `attn_decode_in_mixed` predictions.
 All four Frontier runs completed 200/200 request rows. Neither Frontier nor the
 vLLM TP2/TP4 logs reported preemption events. Prefix token hit ratio is exactly
 the same in Frontier postprocess and vLLM's trace-side synthetic estimate:
 0.2697549478.
 Run-level comparison:
 | TP | fixture | metric | Frontier | vLLM | Frontier / vLLM |
 |---:|---|---|---:|---:|---:|
 | 2 | `coder_200_ts2` | requests/s | 0.776 | 1.278 | 0.61 |
 | 2 | `coder_200_ts2` | total tok/s | 4581 | 7547 | 0.61 |
 | 2 | `coder_200_ts2` | decode tok/s | 695 | 1144 | 0.61 |
 | 2 | `coder_200_ts2` | TTFT p50/p95 | 0.269/6.745s | 0.225/0.715s | 1.20/9.43 |
 | 2 | `coder_200_ts2` | TPOT p50/p95 | 0.0430/0.0529s | 0.0300/0.0434s | 1.43/1.22 |
 | 2 | `coder_200_ts2` | E2E p50/p95 | 26.05/106.76s | 16.45/72.53s | 1.58/1.47 |
 | 4 | `coder_200_ts2` | requests/s | 0.853 | 1.536 | 0.55 |
 | 4 | `coder_200_ts2` | total tok/s | 5035 | 9073 | 0.55 |
 | 4 | `coder_200_ts2` | decode tok/s | 763 | 1376 | 0.55 |
 | 4 | `coder_200_ts2` | TTFT p50/p95 | 0.098/0.386s | 0.170/1.420s | 0.57/0.27 |
 | 4 | `coder_200_ts2` | TPOT p50/p95 | 0.0337/0.0384s | 0.0163/0.0283s | 2.06/1.36 |
 | 4 | `coder_200_ts2` | E2E p50/p95 | 18.65/84.94s | 9.26/43.62s | 2.01/1.95 |
 | 2 | `coder_200_ts3` | requests/s | 0.688 | 1.088 | 0.63 |
 | 2 | `coder_200_ts3` | total tok/s | 4062 | 6426 | 0.63 |
 | 2 | `coder_200_ts3` | decode tok/s | 616 | 974 | 0.63 |
 | 2 | `coder_200_ts3` | TTFT p50/p95 | 0.134/0.574s | 0.154/0.627s | 0.87/0.92 |
 | 2 | `coder_200_ts3` | TPOT p50/p95 | 0.0394/0.0467s | 0.0191/0.0280s | 2.07/1.67 |
 | 2 | `coder_200_ts3` | E2E p50/p95 | 21.79/101.59s | 9.96/53.98s | 2.19/1.88 |
 | 4 | `coder_200_ts3` | requests/s | 0.737 | 1.254 | 0.59 |
 | 4 | `coder_200_ts3` | total tok/s | 4355 | 7403 | 0.59 |
 | 4 | `coder_200_ts3` | decode tok/s | 660 | 1122 | 0.59 |
 | 4 | `coder_200_ts3` | TTFT p50/p95 | 0.089/0.346s | 0.100/0.318s | 0.89/1.09 |
 | 4 | `coder_200_ts3` | TPOT p50/p95 | 0.0311/0.0358s | 0.0094/0.0128s | 3.30/2.80 |
 | 4 | `coder_200_ts3` | E2E p50/p95 | 16.90/83.01s | 5.55/27.87s | 3.05/2.98 |
 TP scaling comparison:
 | fixture | metric | Frontier TP4 / TP2 | vLLM TP4 / TP2 |
 |---|---|---:|---:|
 | `coder_200_ts2` | total tok/s speedup | 1.10 | 1.20 |
 | `coder_200_ts2` | decode tok/s speedup | 1.10 | 1.20 |
 | `coder_200_ts2` | TPOT p50 reduction | 0.78 | 0.54 |
 | `coder_200_ts3` | total tok/s speedup | 1.07 | 1.15 |
 | `coder_200_ts3` | decode tok/s speedup | 1.07 | 1.15 |
 | `coder_200_ts3` | TPOT p50 reduction | 0.79 | 0.49 |
 Current TP2/TP4 judgment:
 - Functional replay is aligned for this setting: same request rows, same
  trace-side prefix reuse ratio, matched vLLM KV block counts, and no
  preemption on either side.
 - Absolute performance is not aligned. Frontier reports only 55-63% of vLLM
  total/decode throughput across TP2/TP4, and TPOT is especially pessimistic at
  TP4.
 - Relative TP scaling is also under-estimated. vLLM's TP4 improves TPOT p50 by
  about 46-51% over TP2, while Frontier improves by only about 21-22%.
 - The remaining gap is therefore not caused by missing rows, prefix-cache
  mismatch, or KV capacity mismatch in these runs. It points to timing model
  limitations: missing CPU/scheduler/CUDA-graph modeling, random-forest profile
  interpolation error, and imperfect modeling of vLLM's TP-dependent decode
  execution path.
 - These RS12 results are acceptable for continuing ReplayServe integration and
  rough qualitative trends. They are not yet acceptable as calibrated absolute
  performance predictions.
--- a/docs/rs4_vllm_gpu_smoke.md
+++ b/docs/rs4_vllm_gpu_smoke.md
@@ -0,0 +1,138 @@
 # RS4 vLLM GPU Smoke
 RS4 starts a real serving baseline for ReplayServe. This is separate from the
 Frontier dummy/patched simulator smoke: it checks whether the Qwen block-hash
 trace can drive a real vLLM engine with the intended arrival, prompt length,
 decode length, and prefix reuse patterns.
 ## Setup
 - Host: `dash2`
 - GPU: NVIDIA H20
 - Model: `/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`
 - Runtime: Python 3.12.3, vLLM 0.11.1
 - Fixture: `traces/fixtures/coder_100`
 - Runner: `tools/vllm_synthetic_replay.py`
 - Replay mode: online, trace-relative timestamps preserved
 - Prompt mode: `prompt_token_ids`, generated synthetically from trace block
  hashes
 - Common vLLM knobs: `max_model_len=32768`, `block_size=16`,
  `max_num_batched_tokens=32768`, `gpu_memory_utilization=0.85`,
  prefix caching on, chunked prefill on
 The Qwen trace does not expose original token IDs or text. The runner maps each
 block hash deterministically to one stable synthetic token block. Equal block
 hashes therefore produce equal token blocks, preserving arrival, length, and
 block-prefix sharing patterns, but not original text semantics.
 ## Runs
 The first smoke used single-request runs for engine bring-up, 32-request capped
 runs for prefix-cache validation, 32-request uncapped runs for a first
 real-output baseline, and full `coder_100` uncapped runs for the first useful
 TP=1/2 comparison.
 | run | TP | rows | prompt toks | gen toks | wall s | RPS | prompt tok/s | gen tok/s | TTFT p50/p95 (s) | TPOT p50/p95 (s) | E2E p50/p95 (s) |
 |---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
 | `tp1_limit1` | 1 | 1 | 1008 | 4 | 1.861 | 0.537 | 541.5 | 2.1 | 1.255/1.255 | 0.007/0.007 | 1.274/1.274 |
 | `tp2_limit1` | 2 | 1 | 1008 | 4 | 2.269 | 0.441 | 444.3 | 1.8 | 1.317/1.317 | 0.008/0.008 | 1.340/1.340 |
 | `tp1_limit32_o8` | 1 | 32 | 120813 | 253 | 11.244 | 2.846 | 10744.4 | 22.5 | 3.974/5.051 | 0.387/1.081 | 7.157/9.817 |
 | `tp2_limit32_o8` | 2 | 32 | 120813 | 253 | 9.071 | 3.528 | 13318.2 | 27.9 | 1.881/3.324 | 0.285/0.727 | 4.368/7.043 |
 | `tp1_limit32_uncapped` | 1 | 32 | 120813 | 22209 | 41.874 | 0.764 | 2885.1 | 530.4 | 1.276/1.842 | 0.024/0.102 | 14.366/29.523 |
 | `tp2_limit32_uncapped` | 2 | 32 | 120813 | 22209 | 33.588 | 0.953 | 3596.9 | 661.2 | 0.961/1.700 | 0.017/0.071 | 10.786/21.570 |
 | `tp1_coder100_uncapped` | 1 | 100 | 474554 | 82479 | 145.351 | 0.688 | 3264.9 | 567.4 | 4.503/29.060 | 0.066/0.621 | 41.841/97.366 |
 | `tp2_coder100_uncapped` | 2 | 100 | 474554 | 82479 | 102.001 | 0.980 | 4652.5 | 808.6 | 1.951/10.355 | 0.049/0.262 | 25.678/61.971 |
 Artifacts were copied back from dash2 to:
 ```text
 runs/vllm_gpu_smoke_20260624/
 ```
 That directory is ignored by git. Each run contains `summary.json` and
 `request_metrics.csv`; the 32-request runs also keep `stdout.log`.
 ## KV Capacity
 vLLM estimated KV capacity from actual H20 memory profiling:
 | TP | weights memory | available KV memory | GPU KV cache size | max concurrency at 32768 tokens/request |
 |---:|---:|---:|---:|---:|
 | 1 | 56.93 GiB | 22.39 GiB | 244,512 tokens | 7.46x |
 | 2 | 28.50 GiB/rank | 50.58 GiB/rank | 1,104,880 tokens | 33.72x |
 This satisfies the RS4 requirement that KV capacity comes from the real GPU
 memory planner rather than a manually fixed block count.
 ## Prefix-Cache Check
 For the first 32 coder requests, ReplayServe estimated:
 - query blocks: 7,564
 - hit blocks: 1,786
 - block hit ratio: 0.236118456
 - query tokens: 120,813
 - hit tokens: 28,576
 - token hit ratio: 0.236530837
 The vLLM scheduler logs for both TP=1 and TP=2 reported exactly 32 request
 starts and `computed:` token sums of 28,576 in both capped and uncapped runs,
 equal to the trace-side hit-token estimate above.
 The largest single hit was 11,552 tokens. Examples include:
 ```text
 Request 16 started running, prompt: 12296, computed: 11552
 Request 26 started running, prompt: 5836, computed: 4336
 Request 30 started running, prompt: 11017, computed: 10768
 ```
 So this smoke validates the core ReplayServe invariant: identical Qwen block
 hash prefixes become identical synthetic token prefixes, and vLLM's prefix cache
 actually reuses them.
 For full `coder_100`, ReplayServe estimated:
 - query blocks: 29,705
 - hit blocks: 7,447
 - block hit ratio: 0.250698536
 - query tokens: 474,554
 - hit tokens: 119,152
 - token hit ratio: 0.251082069
 The TP=2 full `coder_100` run had no preemptions. Its vLLM `computed:` sum was
 119,152, matching the trace-side estimate exactly. The TP=1 run had 8
 preemptions across repeated starts for requests 70, 71, 72, 77, and 94. In that
 case, the raw `computed:` sum is not a simple prefix-hit token count, because
 each re-admission after a preemption logs another `computed:` value:
 | run | starts | unique requests | preemptions | all-start computed | first-start computed | last-start computed | max/request computed | estimated hit tokens |
 |---|---:|---:|---:|---:|---:|---:|---:|---:|
 | `tp1_coder100_uncapped` | 108 | 100 | 8 | 180896 | 108560 | 141744 | 141984 | 119152 |
 | `tp2_coder100_uncapped` | 100 | 100 | 0 | 119152 | 119152 | 119152 | 119152 | 119152 |
 Use `tools/analyze_vllm_prefix_log.py` to reproduce this parsing.
 ## Reliability Boundary
 These numbers are useful for mechanism validation and for seeding simulator
 calibration. They are not final serving throughput claims because:
 - Some bring-up runs capped decode length to 4 or 8 tokens.
 - The largest real-output baseline so far is `coder_100`, not `coder_2000` or
  the full coder trace.
 - Synthetic token IDs preserve block identity and length but not original text
  distribution.
 - Prefix reuse in `request_metrics.csv` is a trace-side estimate. For real
  scheduler hit/miss behavior, use vLLM `stdout.log` `computed:` fields and
  account for preemption/re-admission.
 - This run uses H20 and `Qwen3-30B-A3B`, while the earlier Frontier smoke used
  dummy A800/Qwen3-32B plumbing. They should be compared as calibration inputs,
  not as one-to-one simulator accuracy evidence yet.
 ## Next
 - Move to `coder_2000` once runtime and queueing cost are acceptable.
 - Add the vLLM log parser output into the run aggregation summary.
 - Compare vLLM real-backend TTFT/TPOT/E2E against Frontier outputs only after
  selecting a matched model/hardware/profile policy.
 See `docs/rs4_frontier_h20_tp1_alignment.md` for the first Frontier H20 TP1
 alignment run against real vLLM TP1.
--- a/docs/sources.md
+++ b/docs/sources.md
@@ -0,0 +1,45 @@
 # Sources
 Checked on 2026-06-24.
 ## Local Repositories
 | Source | Local path | Commit / HEAD | Notes |
 |---|---|---|---|
 | Qwen Bailian usage traces | `/home/gahow/phd/qwen-bailian-usagetraces-anon` | `5f7439c51ec248a0c585f7d90a41a6f57773b912` | Primary RS0 input is `qwen_coder_blksz_16.jsonl`. |
 | Frontier | `/tmp/toc-llm-sim-research/Frontier` | `d9cfeb6d8791fbf2f295dd9744c56a666171776e` | Primary RS1 simulator candidate. |
 | Vidur | `/tmp/toc-llm-sim-research/vidur` | `8383d2935bc62723a212090baa9f98ada206fc14` | Baseline simulator candidate for arrival and length replay. |
 | AIConfigurator | `/tmp/toc-llm-sim-research/aiconfigurator` | `e46ece7510e727fafefb8212e5846172145a30ea` | Configuration search reference, not per-request faithful replay. |
 All four local repositories were present when RS0 was generated. No external
 repository was cloned for RS0.
 ## Frontier Findings
 - Frontier trace replay reads CSV columns `arrived_at`, `num_prefill_tokens`,
  and `num_decode_tokens`.
 - It also parses optional `session_id` and `block_hash_ids`; `block_hash_ids`
  can be `|` separated, matching `examples/fixtures/prefix_cache_shared_session_trace.csv`.
 - Frontier's trace replay generator can clip prefill tokens when total tokens
  exceed `trace_request_generator_config_max_tokens`. ReplayServe fixtures hard
  fail before Frontier sees the trace, so the RS1 smoke cannot silently clip.
 - Frontier has a built-in `Qwen/Qwen3-32B` model config.
 - Frontier has A800 network profiles:
  `data/profiling/network/a800_dgx/` and
  `data/profiling/network/a800_pairwise_nvlink/`.
 - Current public A800 compute profiles in this checkout include Llama2-7B and
  Qwen3 MoE / Qwen3-Next reduced variants, but no dense `Qwen/Qwen3-32B`
  compute profile. RS1 Qwen3-32B A800 latency and throughput results are only
  plumbing smoke until matching compute profiles or calibration data are added.
 ## Qwen Trace Findings
 - The released JSONL rows contain `chat_id`, `parent_chat_id`, `timestamp`,
  `input_length`, `output_length`, `type`, `turn`, and `hash_ids`.
 - The trace README documents `hash_ids` as salted SipHash blocks with 16 tokens
  per block.
 - The released input lengths and hashes are already after the model-specific
  chat template has been applied. ReplayServe does not apply chat templates.
 - The final input block can be padded. ReplayServe records per-block token
  counts in the sidecar so partial final blocks can be accounted for by true
  token count.
--- a/patches/frontier-vllm-0.11.1-profiling-compat.patch
+++ b/patches/frontier-vllm-0.11.1-profiling-compat.patch
@@ -0,0 +1,179 @@
 diff --git a/frontier/profiling/common/layers/rotary_embedding.py b/frontier/profiling/common/layers/rotary_embedding.py
 index 3f6d999..00be87b 100644
 --- a/frontier/profiling/common/layers/rotary_embedding.py
 +++ b/frontier/profiling/common/layers/rotary_embedding.py
@@ -576,15 +576,19 @@ def get_rope(
     if not _should_prefer_torch_rope_fallback():
         vllm_get_rope = _load_vllm_get_rope()
         if vllm_get_rope is not None:
 -            return vllm_get_rope(
 -                head_size=head_size,
 -                rotary_dim=rotary_dim,
 -                max_position=max_position,
 -                base=base,
 -                is_neox_style=is_neox_style,
 -                rope_scaling=rope_scaling,
 -                dtype=rope_dtype,
 -            )
 +            try:
 +                return vllm_get_rope(
 +                    head_size=head_size,
 +                    rotary_dim=rotary_dim,
 +                    max_position=max_position,
 +                    base=base,
 +                    is_neox_style=is_neox_style,
 +                    rope_scaling=rope_scaling,
 +                    dtype=rope_dtype,
 +                )
 +            except TypeError as exc:
 +                if "unexpected keyword argument" not in str(exc):
 +                    raise
     if cache_key in _LOCAL_ROPE_DICT:
         return _LOCAL_ROPE_DICT[cache_key]
 diff --git a/frontier/profiling/moe/moe_impl.py b/frontier/profiling/moe/moe_impl.py
 index f732980..79aed30 100644
 --- a/frontier/profiling/moe/moe_impl.py
 +++ b/frontier/profiling/moe/moe_impl.py
@@ -27,9 +27,16 @@ from frontier.profiling.common.utils import raise_if_fp8_requested
 try:
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_topk,
 -        get_config_dtype_str,
         try_get_optimal_moe_config,
     )
 +    try:
 +        from vllm.model_executor.layers.fused_moe.fused_moe import (
 +            get_config_dtype_str,
 +        )
 +    except ImportError:
 +        from vllm.model_executor.layers.fused_moe.fused_moe import (
 +            _get_config_dtype_str as get_config_dtype_str,
 +        )
     from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
         moe_align_block_size,
     )
@@ -128,14 +135,20 @@ class MoEGatingNetwork(nn.Module):
             )
         if self.use_vllm_fused_topk and HAS_VLLM_REPLICATED_LINEAR:
 -            # Align gating linear kernel family with vLLM runtime contract.
 -            # disable_tp=True avoids requiring TP group initialization in profiling jobs.
 -            self.gate = ReplicatedLinear(
 -                hidden_dim,
 -                num_experts,
 -                bias=False,
 -                disable_tp=True,
 -            )
 +            try:
 +                # Align gating linear kernel family with vLLM runtime contract.
 +                # vLLM 0.11.x still touches TP state even with disable_tp=True in
 +                # standalone profiling, so fall back to torch Linear if needed.
 +                self.gate = ReplicatedLinear(
 +                    hidden_dim,
 +                    num_experts,
 +                    bias=False,
 +                    disable_tp=True,
 +                )
 +            except AssertionError as exc:
 +                if "tensor model parallel group is not initialized" not in str(exc):
 +                    raise
 +                self.gate = nn.Linear(hidden_dim, num_experts, bias=False)
         else:
             # Fall back to native torch linear only when vLLM kernel alignment is disabled.
             self.gate = nn.Linear(hidden_dim, num_experts, bias=False)
@@ -187,13 +200,14 @@ class MoEGatingNetwork(nn.Module):
                         indices_type=None,
                     )
                 else:
 -                    routing_weights, selected_experts, _ = fused_topk(
 +                    fused_topk_outputs = fused_topk(
                         hidden_states=hidden_states,
                         gating_output=logits,
                         topk=self.router_topk,
                         renormalize=getattr(self, "renormalize", True),
                         indices_type=None,
                     )
 +                    routing_weights, selected_experts = fused_topk_outputs[:2]
             else:
                 if routing_runtime_path != "standard_fused_topk":
                     raise ValueError(
 diff --git a/frontier/profiling/moe/moe_vllm_kernel.py b/frontier/profiling/moe/moe_vllm_kernel.py
 index 7228731..726c748 100644
 --- a/frontier/profiling/moe/moe_vllm_kernel.py
 +++ b/frontier/profiling/moe/moe_vllm_kernel.py
@@ -36,8 +36,15 @@ try:
         invoke_fused_moe_kernel,
         moe_align_block_size,
         try_get_optimal_moe_config,
 -        get_config_dtype_str,
     )
 +    try:
 +        from vllm.model_executor.layers.fused_moe.fused_moe import (
 +            get_config_dtype_str,
 +        )
 +    except ImportError:
 +        from vllm.model_executor.layers.fused_moe.fused_moe import (
 +            _get_config_dtype_str as get_config_dtype_str,
 +        )
     VLLM_API_VERSION = "0.10.x"
     VLLM_AVAILABLE = True
@@ -195,6 +202,7 @@ def _invoke_kernel(
     B: torch.Tensor,
     C: torch.Tensor,
     topk_weights: torch.Tensor,
 +    topk_ids: torch.Tensor,
     sorted_token_ids: torch.Tensor,
     expert_ids: torch.Tensor,
     num_tokens_post_padded: torch.Tensor,
@@ -249,6 +257,7 @@ def _invoke_kernel(
         B_scale=B_scale,
         B_zp=None,
         topk_weights=topk_weights,
 +        topk_ids=topk_ids,
         sorted_token_ids=sorted_token_ids,
         expert_ids=expert_ids,
         num_tokens_post_padded=num_tokens_post_padded,
@@ -260,7 +269,9 @@ def _invoke_kernel(
         use_int8_w8a8=False,
         use_int8_w8a16=False,
         use_int4_w4a16=False,
 +        use_int4_w4a8=False,
         per_channel_quant=per_channel_quant,
 +        use_valu=False,
         block_shape=block_shape,
         B_bias=None,
     )
@@ -273,6 +284,7 @@ def _run_fused_moe_iteration(
     intermediate_cache1: torch.Tensor,
     intermediate_cache2: torch.Tensor,
     topk_weights: torch.Tensor,
 +    topk_ids: torch.Tensor,
     sorted_token_ids: torch.Tensor,
     expert_ids: torch.Tensor,
     num_tokens_post_padded: torch.Tensor,
@@ -292,6 +304,7 @@ def _run_fused_moe_iteration(
         B=w1.contiguous(),
         C=intermediate_cache1.contiguous(),
         topk_weights=topk_weights.contiguous(),
 +        topk_ids=topk_ids.contiguous(),
         sorted_token_ids=sorted_token_ids.contiguous(),
         expert_ids=expert_ids.contiguous(),
         num_tokens_post_padded=num_tokens_post_padded.contiguous(),
@@ -321,6 +334,7 @@ def _run_fused_moe_iteration(
         B=w2.contiguous(),
         C=intermediate_cache2.contiguous(),
         topk_weights=topk_weights.contiguous(),
 +        topk_ids=topk_ids.contiguous(),
         sorted_token_ids=sorted_token_ids.contiguous(),
         expert_ids=expert_ids.contiguous(),
         num_tokens_post_padded=num_tokens_post_padded.contiguous(),
@@ -548,6 +562,7 @@ def profile_fused_moe_kernel(
             intermediate_cache1=intermediate_cache1,
             intermediate_cache2=intermediate_cache2,
             topk_weights=topk_weights,
 +            topk_ids=topk_ids,
             sorted_token_ids=sorted_token_ids,
             expert_ids=expert_ids,
             num_tokens_post_padded=num_tokens_post_padded,
--- a/patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch
+++ b/patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch
@@ -0,0 +1,438 @@
 diff --git a/frontier/entities/request.py b/frontier/entities/request.py
 index a173caf..eee588b 100644
 --- a/frontier/entities/request.py
 +++ b/frontier/entities/request.py
@@ -301,7 +301,7 @@ class Request(BaseEntity):
     @property
     @check_scheduled
     def e2e_time_normalized(self) -> float:
 -        return self.e2e_time / self.num_decode_tokens
 +        return self.e2e_time / self.user_facing_num_decode_tokens
     @property
     @check_scheduled
@@ -315,7 +315,7 @@ class Request(BaseEntity):
     @property
     @check_scheduled
     def execution_time_normalized(self) -> float:
 -        return self.execution_time / self.num_decode_tokens
 +        return self.execution_time / self.user_facing_num_decode_tokens
     @property
     @check_scheduled
@@ -329,7 +329,7 @@ class Request(BaseEntity):
     @property
     @check_scheduled
     def model_execution_time_normalized(self) -> float:
 -        return self.model_execution_time / self.num_decode_tokens
 +        return self.model_execution_time / self.user_facing_num_decode_tokens
     @property
     def arrived_at(self) -> float:
@@ -886,10 +886,13 @@ class Request(BaseEntity):
         Average time per output token excluding the first token.
         Returns 0 if there's only one or no decode tokens.
         """
 -        if self._num_decode_tokens <= 1 or self._first_decode_token_completed_at == 0:
 +        if (
 +            self._user_facing_num_decode_tokens <= 1
 +            or self._first_decode_token_completed_at == 0
 +        ):
             return 0
         total_decode_time = self._completed_at - self._first_decode_token_completed_at
 -        return total_decode_time / (self._num_decode_tokens - 1)
 +        return total_decode_time / (self._user_facing_num_decode_tokens - 1)
     def on_kv_cache_transfer_start(self, transfer_start_time: float) -> None:
         """Record the earliest request-level KV transfer start timestamp."""
 diff --git a/frontier/metrics/metrics_store.py b/frontier/metrics/metrics_store.py
 index 422e2c2..eaa6308 100644
 --- a/frontier/metrics/metrics_store.py
 +++ b/frontier/metrics/metrics_store.py
@@ -2374,8 +2374,8 @@ class MetricsStore:
                 "cluster_type": cluster_type.name,
                 "arrived_at": float(request.arrived_at),
                 "arrived_at_ms": float(request.arrived_at) * 1000.0,
 -                "num_prefill_tokens": int(request.num_prefill_tokens),
 -                "num_decode_tokens": int(request.num_decode_tokens),
 +                "num_prefill_tokens": int(request.user_facing_num_prefill_tokens),
 +                "num_decode_tokens": int(request.user_facing_num_decode_tokens),
             }
         )
@@ -2558,8 +2558,8 @@ class MetricsStore:
                 * 1000.0,
                 "completed_at": float(request.completed_at),
                 "completed_at_ms": float(request.completed_at) * 1000.0,
 -                "num_prefill_tokens": int(request.num_prefill_tokens),
 -                "num_decode_tokens": int(request.num_decode_tokens),
 +                "num_prefill_tokens": int(request.user_facing_num_prefill_tokens),
 +                "num_decode_tokens": int(request.user_facing_num_decode_tokens),
                 "request_e2e_time_s": float(request.e2e_time),
                 "request_e2e_time_ms": float(request.e2e_time) * 1000.0,
                 "request_waiting_time_total_s": float(request_waiting_time_total),
@@ -2675,7 +2675,7 @@ class MetricsStore:
             RequestMetricsTimeDistributions.REQUEST_EXECUTION_PLUS_PREEMPTION_TIME_NORMALIZED
         ].put(
             request.id,
 -            request.execution_time / request.num_decode_tokens,
 +            request.execution_time / request.user_facing_num_decode_tokens,
         )
         if request.is_prefill_complete:
@@ -2688,23 +2688,23 @@ class MetricsStore:
             # Guard against division by zero (defensive programming)
             # Normal requests should always have num_prefill_tokens >= 1, but
             # this protects against edge cases or synthetic test requests
 -            if request.num_prefill_tokens > 0:
 +            if request.user_facing_num_prefill_tokens > 0:
                 self._request_metrics_time_distributions[
                     RequestMetricsTimeDistributions.PREFILL_EXECUTION_PLUS_PREEMPTION_PER_TOKEN
                 ].put(
                     request.id,
                     (request.prefill_completed_at - request.scheduled_at)
 -                    / request.num_prefill_tokens,
 +                    / request.user_facing_num_prefill_tokens,
                 )
             #
             # Guard against division by zero for decode tokens
 -            if request.num_decode_tokens > 0:
 +            if request.user_facing_num_decode_tokens > 0:
                 self._request_metrics_time_distributions[
                     RequestMetricsTimeDistributions.DECODE_E2E_TIME_PER_TOKEN
                 ].put(
                     request.id,
                     (request.completed_at - request.prefill_completed_at)
 -                    / request.num_decode_tokens,
 +                    / request.user_facing_num_decode_tokens,
                 )
         self._request_metrics_histogram[
@@ -2958,7 +2958,7 @@ class MetricsStore:
                     RequestMetricsTimeDistributions.TTFT_DECODE_FIRST_TOKEN
                 ].put(request.id, max(0, ttft_decode_first))  # Ensure non-negative
 -        if request.num_decode_tokens > 1:
 +        if request.user_facing_num_decode_tokens > 1:
             if request.first_decode_token_completed_at <= 0:
                 raise ValueError(
                     f"Missing first token completion timestamp for request_id={request.id}"
@@ -2983,7 +2983,7 @@ class MetricsStore:
         # TPOT (Time Per Output Token) metrics
         if (
 -            request.num_decode_tokens > 1
 +            request.user_facing_num_decode_tokens > 1
             and request.first_decode_token_completed_at > 0
         ):
             self._request_metrics_time_distributions[
@@ -2997,7 +2997,7 @@ class MetricsStore:
             )
             # M2N transfer time per token (excluding first token)
             tpot_transfer = request.total_m2n_transfer_time / (
 -                request.num_decode_tokens - 1
 +                request.user_facing_num_decode_tokens - 1
             )
             self._request_metrics_time_distributions[
                 RequestMetricsTimeDistributions.TPOT_TRANSFER
 diff --git a/frontier/scheduler/replica_scheduler/base_replica_scheduler.py b/frontier/scheduler/replica_scheduler/base_replica_scheduler.py
 index c0c4085..2d362bb 100644
 --- a/frontier/scheduler/replica_scheduler/base_replica_scheduler.py
 +++ b/frontier/scheduler/replica_scheduler/base_replica_scheduler.py
@@ -599,6 +599,9 @@ class BaseReplicaScheduler(ABC):
             "running_requests": self._debug_request_collection_state(
                 getattr(self, "_running_requests", None)
             ),
 +            "active_batch_request_counts": dict(
 +                getattr(self, "_active_batch_request_counts", {})
 +            ),
             "allocation_map": self._debug_allocation_map_state(
                 self._allocation_map
             ),
 diff --git a/frontier/scheduler/replica_scheduler/vllm_v1_engine_replica_scheduler.py b/frontier/scheduler/replica_scheduler/vllm_v1_engine_replica_scheduler.py
 index ac2e062..f61e088 100644
 --- a/frontier/scheduler/replica_scheduler/vllm_v1_engine_replica_scheduler.py
 +++ b/frontier/scheduler/replica_scheduler/vllm_v1_engine_replica_scheduler.py
@@ -2569,10 +2569,9 @@ class VLLMv1EngineReplicaScheduler(BaseReplicaScheduler):
             else -1
         )
 -        # Record preemption statistics in the request entity
 -        # This must be done BEFORE resetting num_processed_tokens
 +        # Record preemption statistics in the request entity.
 +        # This must be done before mutating the request's token accounting.
         victim.record_preemption(self._cluster_type, num_computed_tokens_before)
 -        victim.advance_runtime_epoch()
         # Remove from running requests
         if victim in self._running_requests:
@@ -2582,9 +2581,33 @@ class VLLMv1EngineReplicaScheduler(BaseReplicaScheduler):
         if victim.id in self._allocation_map:
             self._free_request_resources(victim)
 -        # Mark as preempted and reset computed tokens
 +        # Mark as preempted and reset computed tokens. Decode-phase preemption
 +        # must replay the already-produced output tokens as part of the next
 +        # prefill, matching vLLM's recompute path. Otherwise the request stays
 +        # prefill-complete with zero processed tokens and waiting admission can
 +        # compute a zero-token next step.
 +        if victim.is_prefill_complete:
 +            total_tokens = int(victim.total_tokens)
 +            replay_prefill_tokens = int(victim.num_processed_tokens)
 +            victim.advance_runtime_epoch()
 +            victim._num_prefill_tokens = replay_prefill_tokens
 +            victim._num_decode_tokens = max(total_tokens - replay_prefill_tokens, 0)
 +            victim._num_processed_tokens = 0
 +            victim._num_prefill_tokens_cached = 0
 +            victim._scheduled = False
 +            victim._completed = False
 +            victim._is_prefill_complete = False
 +            victim._current_decode_token_index = 1
 +            victim._completed_layer_count = 0
 +            victim._af_roundtrip_inflight = False
 +            victim._num_restarts += 1
 +        else:
 +            victim.advance_runtime_epoch()
 +            victim._num_processed_tokens = 0
 +            victim._num_prefill_tokens_cached = 0
 +            victim._scheduled = False
 +            victim._is_prefill_complete = False
         victim._preempted = True
 -        victim._num_processed_tokens = 0  # Reset computed tokens as in vLLM v1
         self._scheduled_num_computed_tokens_by_request.pop(victim.id, None)
         # Record re-entry to waiting queue for waiting time tracking
@@ -3139,6 +3162,19 @@ class VLLMv1EngineReplicaScheduler(BaseReplicaScheduler):
             if num_new_tokens <= 0:
                 waiting_queue.popleft()
 +                if not request.completed:
 +                    skipped_waiting_requests.append(request)
 +                    logger.warning(
 +                        "[WAITING-ZERO-TOKEN-SKIP] Preserving unfinished req=%s "
 +                        "with num_new_tokens=%s, preempted=%s, "
 +                        "is_prefill_complete=%s, processed=%s/%s",
 +                        request.id,
 +                        num_new_tokens,
 +                        getattr(request, "preempted", False),
 +                        getattr(request, "is_prefill_complete", False),
 +                        getattr(request, "num_processed_tokens", None),
 +                        getattr(request, "total_tokens", None),
 +                    )
                 continue
             # Try to allocate (no preemption for waiting requests in Phase 2)
@@ -4360,12 +4396,14 @@ class VLLMv1EngineReplicaScheduler(BaseReplicaScheduler):
         )
         waiting_len = len(self._waiting_requests)
         running_len = len(self._running_requests)
 +        active_batch_request_len = len(self._get_active_batch_request_counts())
         logger.info(
             f"[RS-IDLE-CHECK][replica={self._replica_id}][dp={self._dp_id}] "
             f"num_pending_requests={self.num_pending_requests}, waiting_requests={waiting_len}, "
             f"running_requests={running_len}, allocated_blocks={len(self._allocation_map)}, "
 -            f"num_running_batches={self._num_running_batches}, stages_empty={stages_empty}, af_immediate_len={af_len}"
 +            f"num_running_batches={self._num_running_batches}, stages_empty={stages_empty}, "
 +            f"af_immediate_len={af_len}, active_batch_requests={active_batch_request_len}"
         )
         # If AF immediate queue has pending batches, the replica is not idle
         if af_len > 0:
@@ -4374,6 +4412,7 @@ class VLLMv1EngineReplicaScheduler(BaseReplicaScheduler):
             self.num_pending_requests == 0
             and waiting_len == 0
             and running_len == 0
 +            and active_batch_request_len == 0
             and len(self._allocation_map) == 0
             and self._num_running_batches == 0
             and stages_empty
 diff --git a/frontier/scheduler/replica_stage_scheduler/replica_stage_schduler.py b/frontier/scheduler/replica_stage_scheduler/replica_stage_schduler.py
 index 2344fe3..22c29d4 100644
 --- a/frontier/scheduler/replica_stage_scheduler/replica_stage_schduler.py
 +++ b/frontier/scheduler/replica_stage_scheduler/replica_stage_schduler.py
@@ -48,7 +48,7 @@ class ReplicaStageScheduler:
         return self._is_last_stage
     def is_empty(self) -> bool:
 -        return len(self._batch_queue) == 0
 +        return len(self._batch_queue) == 0 and not self._is_busy
     def get_debug_state(self) -> dict:
         """Return scheduler state for fail-fast sequential-end diagnostics."""
 diff --git a/frontier/simulator.py b/frontier/simulator.py
 index b1e14fd..083dcba 100644
 --- a/frontier/simulator.py
 +++ b/frontier/simulator.py
@@ -543,6 +543,146 @@ class Simulator:
             + json.dumps(payload, indent=2, sort_keys=True, default=str)
         )
 +    def _build_sequential_incomplete_request_report(
 +        self,
 +        *,
 +        completed_requests: int,
 +        total_requests: int,
 +    ) -> str:
 +        """Build a structured report for drained sequential runs with open requests."""
 +        if not hasattr(self._global_scheduler, "_cluster_schedulers"):
 +            raise RuntimeError(
 +                "Global scheduler missing _cluster_schedulers for sequential diagnostics"
 +            )
 +
 +        cluster_states = []
 +        for cluster_type, cluster_scheduler in sorted(
 +            self._global_scheduler._cluster_schedulers.items(),
 +            key=lambda item: item[0].name,
 +        ):
 +            if not hasattr(cluster_scheduler, "get_debug_state"):
 +                raise RuntimeError(
 +                    f"Cluster scheduler {cluster_type.name} missing get_debug_state()"
 +                )
 +            cluster_states.append(
 +                {
 +                    "cluster_key": cluster_type.name,
 +                    "state": cluster_scheduler.get_debug_state(),
 +                }
 +            )
 +
 +        completed_ids = set(getattr(self._metric_store, "_completed_request_ids", set()))
 +        generated_requests = list(getattr(self, "_all_requests", []))
 +        missing_requests = [
 +            request for request in generated_requests if request.id not in completed_ids
 +        ]
 +        missing_request_summaries = [
 +            self._build_request_debug_snapshot(request) for request in missing_requests
 +        ]
 +        payload = {
 +            "message": (
 +                "Sequential simulation drained before all requests completed"
 +            ),
 +            "simulation_time": self._time,
 +            "terminate": self._terminate,
 +            "event_queue_length": len(self._event_queue),
 +            "global_scheduler_is_empty": self._global_scheduler.is_empty,
 +            "completed_requests": completed_requests,
 +            "total_requests": total_requests,
 +            "missing_request_count": total_requests - completed_requests,
 +            "completed_request_ids": sorted(completed_ids),
 +            "missing_request_ids": [request.id for request in missing_requests],
 +            "missing_requests": missing_request_summaries,
 +            "clusters": cluster_states,
 +        }
 +        return (
 +            "Sequential simulation drained before all requests completed:\n"
 +            + json.dumps(payload, indent=2, sort_keys=True, default=str)
 +        )
 +
 +    def _build_request_debug_snapshot(self, request) -> dict:
 +        """Return request state without calling checked metric properties."""
 +        return {
 +            "id": request.id,
 +            "session_id": getattr(request, "session_id", None),
 +            "arrived_at": getattr(request, "arrived_at", None),
 +            "num_prefill_tokens": getattr(request, "num_prefill_tokens", None),
 +            "num_decode_tokens": getattr(request, "num_decode_tokens", None),
 +            "total_tokens": getattr(request, "total_tokens", None),
 +            "num_processed_tokens": getattr(request, "num_processed_tokens", None),
 +            "num_processed_prefill_tokens": getattr(
 +                request, "num_processed_prefill_tokens", None
 +            ),
 +            "num_processed_decode_tokens": getattr(
 +                request, "num_processed_decode_tokens", None
 +            ),
 +            "remaining_decode_tokens": getattr(request, "remaining_decode_tokens", None),
 +            "num_prefill_tokens_cached": getattr(
 +                request, "num_prefill_tokens_cached", None
 +            ),
 +            "is_prefill_complete": getattr(request, "is_prefill_complete", None),
 +            "scheduled": getattr(request, "scheduled", None),
 +            "preempted": getattr(request, "preempted", None),
 +            "completed": getattr(request, "completed", None),
 +            "completed_at": getattr(request, "_completed_at", None),
 +            "prefill_completed_at": getattr(request, "_prefill_completed_at", None),
 +            "latest_stage_scheduled_at": getattr(
 +                request, "_latest_stage_scheduled_at", None
 +            ),
 +            "latest_stage_completed_at": getattr(
 +                request, "_latest_stage_completed_at", None
 +            ),
 +            "latest_iteration_scheduled_at": getattr(
 +                request, "_latest_iteration_scheduled_at", None
 +            ),
 +            "latest_iteration_completed_at": getattr(
 +                request, "_latest_iteration_completed_at", None
 +            ),
 +            "current_decode_token_index": getattr(
 +                request, "_current_decode_token_index", None
 +            ),
 +            "completed_layer_count": getattr(request, "_completed_layer_count", None),
 +            "runtime_epoch": getattr(request, "runtime_epoch", None),
 +            "execution_epoch": getattr(request, "execution_epoch", None),
 +            "num_restarts": getattr(request, "num_restarts", None),
 +            "cluster_arrival_times": {
 +                cluster_type.name: list(times)
 +                for cluster_type, times in getattr(
 +                    request, "_cluster_arrival_times", {}
 +                ).items()
 +            },
 +            "cluster_scheduled_at": {
 +                cluster_type.name: list(times)
 +                for cluster_type, times in getattr(
 +                    request, "_scheduled_at", {}
 +                ).items()
 +            },
 +            "cluster_scheduling_delay": {
 +                cluster_type.name: list(times)
 +                for cluster_type, times in getattr(
 +                    request, "_scheduling_delay", {}
 +                ).items()
 +            },
 +            "cluster_execution_time": {
 +                cluster_type.name: list(times)
 +                for cluster_type, times in getattr(
 +                    request, "_execution_time", {}
 +                ).items()
 +            },
 +            "preemption_count": {
 +                cluster_type.name: count
 +                for cluster_type, count in getattr(
 +                    request, "_preemption_count", {}
 +                ).items()
 +            },
 +            "tokens_at_preemption": {
 +                cluster_type.name: list(tokens)
 +                for cluster_type, tokens in getattr(
 +                    request, "_tokens_at_preemption", {}
 +                ).items()
 +            },
 +        }
 +
     def _try_promote_terminal_pdaf_scheduler_work(self) -> bool:
         """Promote terminal PD-AF DECODE_FFN groups when no event can fill them.
@@ -708,6 +848,25 @@ class Simulator:
                 self._trace_store.close()
             raise RuntimeError(report)
 +        total_requests = self._metric_store.get_total_requests()
 +        completed_requests = self._metric_store.get_completed_requests()
 +        if (
 +            total_requests > 0
 +            and completed_requests < total_requests
 +            and not self._terminate
 +        ):
 +            report = self._build_sequential_incomplete_request_report(
 +                completed_requests=completed_requests,
 +                total_requests=total_requests,
 +            )
 +            logger.error(report)
 +            if self._sequential_event_loggers:
 +                for logger_instance in self._sequential_event_loggers.values():
 +                    logger_instance.write_summary()
 +            if self._trace_store:
 +                self._trace_store.close()
 +            raise RuntimeError(report)
 +
         if self._sequential_event_loggers:
             for logger_instance in self._sequential_event_loggers.values():
                 logger_instance.write_summary()
--- a/scripts/run_frontier_blocker_probe.sh
+++ b/scripts/run_frontier_blocker_probe.sh
@@ -0,0 +1,179 @@
 #!/usr/bin/env bash
 # Run one Frontier blocker-probe simulation against a fixture directory.
 #
 # Usage: run_frontier_blocker_probe.sh <run-name> <fixture-dir>
 # Optional environment overrides read in this section: REPLAYSERVE_ROOT,
 # FRONTIER_ROOT, PYTHON_BIN, PYTHON_DEPS_DIR, RUN_ROOT, PREFIX_CACHING,
 # CHUNKED_PREFILL, LONG_PREFILL_TOKEN_THRESHOLD, BATCH_SIZE_CAP,
 # MAX_TOKENS_IN_BATCH.
 set -euo pipefail
 # Exactly two positional arguments are required; exit status 2 marks a usage error.
 if [ "$#" -ne 2 ]; then
  echo "usage: $0 <run-name> <fixture-dir>" >&2
  echo "example: $0 n195_default runs/rs1/blocker_request_194/fixtures/coder_195" >&2
  exit 2
 fi
 RUN_NAME="$1"
 FIXTURE_DIR="$2"
 # Default the repository root to the parent of this script's directory.
 REPLAYSERVE_ROOT="${REPLAYSERVE_ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
 FRONTIER_ROOT="${FRONTIER_ROOT:-/tmp/toc-llm-sim-research/Frontier}"
 # Interpreter choice: an explicit PYTHON_BIN wins; otherwise use the repo-local
 # virtualenv interpreter when it is executable, and fall back to python3.
 if [ -z "${PYTHON_BIN:-}" ] && [ -x "$REPLAYSERVE_ROOT/.venv/bin/python" ]; then
  PYTHON_BIN="$REPLAYSERVE_ROOT/.venv/bin/python"
 else
  PYTHON_BIN="${PYTHON_BIN:-python3}"
 fi
 PYTHON_DEPS_DIR="${PYTHON_DEPS_DIR:-$REPLAYSERVE_ROOT/.deps/python}"
 RUN_ROOT="${RUN_ROOT:-$REPLAYSERVE_ROOT/runs/rs1/blocker_request_194/probes}"
 RUN_DIR="${RUN_ROOT}/${RUN_NAME}"
 # Absolute fixture paths are used as given; relative ones are resolved against
 # REPLAYSERVE_ROOT rather than the caller's working directory.
 case "$FIXTURE_DIR" in
  /*) ;;
  *) FIXTURE_DIR="${REPLAYSERVE_ROOT}/${FIXTURE_DIR}" ;;
 esac
 TRACE_FILE="${FIXTURE_DIR}/frontier.csv"
 SIDECAR_FILE="${FIXTURE_DIR}/sidecar.jsonl"
 METRICS_ROOT="${RUN_DIR}/frontier_metrics"
 RUN_ID="rs1_blocker_${RUN_NAME}"
 # Scheduler knobs; each keeps its default unless overridden from the environment.
 PREFIX_CACHING="${PREFIX_CACHING:-true}"
 CHUNKED_PREFILL="${CHUNKED_PREFILL:-true}"
 LONG_PREFILL_TOKEN_THRESHOLD="${LONG_PREFILL_TOKEN_THRESHOLD:-64}"
 BATCH_SIZE_CAP="${BATCH_SIZE_CAP:-128}"
 MAX_TOKENS_IN_BATCH="${MAX_TOKENS_IN_BATCH:-32768}"
 # require_bool NAME VALUE
 # Succeeds when VALUE is the literal string "true" or "false"; otherwise prints
 # an error naming NAME to stderr and exits the script with status 2.
 require_bool() {
  local name="$1" value="$2"
  case "$value" in
    true | false)
      return 0
      ;;
  esac
  echo "ERROR: $name must be true or false; got $value" >&2
  exit 2
 }
 # Only the literal strings "true" and "false" are accepted for the two toggles.
 require_bool "PREFIX_CACHING" "$PREFIX_CACHING"
 require_bool "CHUNKED_PREFILL" "$CHUNKED_PREFILL"
 # Fail fast (exit 2) when the Frontier checkout or either fixture file is missing.
 if [ ! -d "$FRONTIER_ROOT" ]; then
  echo "ERROR: Frontier root does not exist: $FRONTIER_ROOT" >&2
  exit 2
 fi
 if [ ! -f "$TRACE_FILE" ]; then
  echo "ERROR: fixture trace does not exist: $TRACE_FILE" >&2
  exit 2
 fi
 if [ ! -f "$SIDECAR_FILE" ]; then
  echo "ERROR: fixture sidecar does not exist: $SIDECAR_FILE" >&2
  exit 2
 fi
 mkdir -p "$RUN_DIR" "$METRICS_ROOT"
 # Put the dependency directory (when it exists) and the Frontier checkout ahead
 # of any PYTHONPATH inherited from the caller.
 if [ -d "$PYTHON_DEPS_DIR" ]; then
  export PYTHONPATH="$PYTHON_DEPS_DIR:$FRONTIER_ROOT${PYTHONPATH:+:$PYTHONPATH}"
 else
  export PYTHONPATH="$FRONTIER_ROOT${PYTHONPATH:+:$PYTHONPATH}"
 fi
 # NOTE(review): presumably these two variables keep the simulator from
 # contacting Weights & Biases; confirm against the Frontier sources.
 export WANDB_DISABLED=true
 export VIDUR_DISABLE_WANDB=1
 export FRONTIER_LOG_LEVEL="${FRONTIER_LOG_LEVEL:-info}"
 # Standard CPython switch: do not write .pyc files during the run.
 export PYTHONDONTWRITEBYTECODE=1
 # Simulator invocation, kept as an array so every argument survives quoting.
 # Batch cap, max batch tokens, and the long-prefill threshold come from the
 # environment-overridable variables; the remaining values are fixed here.
 CMD=(
  "$PYTHON_BIN" -m frontier.main
  --simulation_mode online
  --sys_arch co-location
  --cc_backend_config_type analytical
  --cluster_config_num_replicas 1
  --cluster_scheduler_config_type sticky_round_robin
  --replica_config_model_name Qwen/Qwen3-32B
  --replica_config_device a800
  --replica_config_network_device a800_dgx
  --replica_config_attn_tensor_parallel_size 2
  --replica_config_num_pipeline_stages 1
  --replica_config_attn_data_parallel_size 1
  --replica_scheduler_config_type vllm_v1
  --decode_cuda_graph_mode full_decode_only
  --vllm_v1_scheduler_config_batch_size_cap "$BATCH_SIZE_CAP"
  --vllm_v1_scheduler_config_max_tokens_in_batch "$MAX_TOKENS_IN_BATCH"
  --vllm_v1_scheduler_config_long_prefill_token_threshold "$LONG_PREFILL_TOKEN_THRESHOLD"
  --vllm_v1_scheduler_config_block_size 16
  --vllm_v1_scheduler_config_num_blocks_mode memory_planner
  --vllm_v1_scheduler_config_gpu_memory_utilization 0.9
  --vllm_v1_scheduler_config_non_kv_cache_overhead_bytes 0
  --request_generator_config_type trace_replay
  --trace_request_generator_config_trace_file "$TRACE_FILE"
  --trace_request_generator_config_max_tokens 32768
  --random_forrest_execution_time_predictor_config_enable_dummy_mode
  --random_forrest_execution_time_predictor_config_dummy_execution_time_ms 1.0
  --metrics_config_output_dir "$METRICS_ROOT"
  --metrics_config_run_id "$RUN_ID"
  --metrics_config_write_metrics
  --metrics_config_store_request_metrics
  --metrics_config_store_batch_metrics
  --metrics_config_store_token_completion_metrics
  --metrics_config_store_utilization_metrics
  --no-metrics_config_store_plots
  --no-metrics_config_enable_chrome_trace
  --no-metrics_config_write_json_trace
  --no-metrics_config_store_frontier_stage_batch_ledger
 )
 # Each toggle is passed explicitly in its positive or --no- form, so the
 # simulator's own default is never relied on.
 if [ "$PREFIX_CACHING" = "true" ]; then
  CMD+=(--vllm_v1_scheduler_config_enable_prefix_caching)
 else
  CMD+=(--no-vllm_v1_scheduler_config_enable_prefix_caching)
 fi
 if [ "$CHUNKED_PREFILL" = "true" ]; then
  CMD+=(--vllm_v1_scheduler_config_enable_chunked_prefill)
 else
  CMD+=(--no-vllm_v1_scheduler_config_enable_chunked_prefill)
 fi
 # command.txt: the working directory, exported environment, and the full
 # command line, each value shell-quoted with printf %q.
 {
  printf 'cd %q\n' "$FRONTIER_ROOT"
  printf 'export PYTHONPATH=%q\n' "$PYTHONPATH"
  printf 'export WANDB_DISABLED=%q\n' "$WANDB_DISABLED"
  printf 'export VIDUR_DISABLE_WANDB=%q\n' "$VIDUR_DISABLE_WANDB"
  printf 'export FRONTIER_LOG_LEVEL=%q\n' "$FRONTIER_LOG_LEVEL"
  printf 'export PYTHONDONTWRITEBYTECODE=%q\n' "$PYTHONDONTWRITEBYTECODE"
  printf 'command='
  printf '%q ' "${CMD[@]}"
  printf '\n'
 } > "$RUN_DIR/command.txt"
 # env.txt: key=value provenance for the run (paths, knob values, and the
 # Frontier checkout's HEAD commit as reported by git).
 {
  printf 'run_name=%s\n' "$RUN_NAME"
  printf 'replayserve_root=%s\n' "$REPLAYSERVE_ROOT"
  printf 'frontier_root=%s\n' "$FRONTIER_ROOT"
  printf 'python_deps_dir=%s\n' "$PYTHON_DEPS_DIR"
  printf 'fixture_dir=%s\n' "$FIXTURE_DIR"
  printf 'trace_file=%s\n' "$TRACE_FILE"
  printf 'sidecar_file=%s\n' "$SIDECAR_FILE"
  printf 'run_dir=%s\n' "$RUN_DIR"
  printf 'metrics_root=%s\n' "$METRICS_ROOT"
  printf 'run_id=%s\n' "$RUN_ID"
  printf 'prefix_caching=%s\n' "$PREFIX_CACHING"
  printf 'chunked_prefill=%s\n' "$CHUNKED_PREFILL"
  printf 'long_prefill_token_threshold=%s\n' "$LONG_PREFILL_TOKEN_THRESHOLD"
  printf 'batch_size_cap=%s\n' "$BATCH_SIZE_CAP"
  printf 'max_tokens_in_batch=%s\n' "$MAX_TOKENS_IN_BATCH"
  printf 'frontier_head=%s\n' "$(git -C "$FRONTIER_ROOT" rev-parse HEAD)"
 } > "$RUN_DIR/env.txt"
 # Wall-clock bookkeeping in whole seconds since the epoch.
 START_EPOCH="$(date +%s)"
 printf '%s\n' "$START_EPOCH" > "$RUN_DIR/start_epoch.txt"
 # Suspend errexit so a non-zero simulator exit is captured in EXIT_CODE instead
 # of aborting the script; the subshell keeps the cd from leaking out.
 set +e
 (
  cd "$FRONTIER_ROOT"
  "${CMD[@]}"
 ) >"$RUN_DIR/stdout.log" 2>"$RUN_DIR/stderr.log"
 EXIT_CODE=$?
 set -e
 END_EPOCH="$(date +%s)"
 printf '%s\n' "$END_EPOCH" > "$RUN_DIR/end_epoch.txt"
 printf '%s\n' "$EXIT_CODE" > "$RUN_DIR/exit_code.txt"
 printf '%s\n' "$((END_EPOCH - START_EPOCH))" > "$RUN_DIR/runtime_seconds.txt"
 # Postprocessing runs only after a successful simulation and is best-effort:
 # "|| true" discards its failure so the script still exits with EXIT_CODE.
 if [ "$EXIT_CODE" -eq 0 ]; then
  "$PYTHON_BIN" "$REPLAYSERVE_ROOT/tools/postprocess_frontier_smoke.py" \
    --run-dir "$RUN_DIR" \
    --fixture-dir "$FIXTURE_DIR" || true
 fi
 exit "$EXIT_CODE"
--- a/scripts/run_frontier_smoke.sh
+++ b/scripts/run_frontier_smoke.sh
@@ -0,0 +1,141 @@
 #!/usr/bin/env bash
 # Run the Frontier smoke simulation for one named fixture under
 # traces/fixtures/<fixture-name>.
 #
 # Usage: run_frontier_smoke.sh <fixture-name>
 # Optional environment overrides read in this section: REPLAYSERVE_ROOT,
 # FRONTIER_ROOT, PYTHON_BIN, PYTHON_DEPS_DIR, RUN_ROOT, FRONTIER_LOG_LEVEL.
 set -euo pipefail
 # Exactly one positional argument is required; exit status 2 marks a usage error.
 if [ "$#" -ne 1 ]; then
  echo "usage: $0 <fixture-name>" >&2
  echo "example: $0 coder_100" >&2
  exit 2
 fi
 FIXTURE_NAME="$1"
 # Default the repository root to the parent of this script's directory.
 REPLAYSERVE_ROOT="${REPLAYSERVE_ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
 FRONTIER_ROOT="${FRONTIER_ROOT:-/tmp/toc-llm-sim-research/Frontier}"
 # Interpreter choice: an explicit PYTHON_BIN wins; otherwise use the repo-local
 # virtualenv interpreter when it is executable, and fall back to python3.
 if [ -z "${PYTHON_BIN:-}" ] && [ -x "$REPLAYSERVE_ROOT/.venv/bin/python" ]; then
  PYTHON_BIN="$REPLAYSERVE_ROOT/.venv/bin/python"
 else
  PYTHON_BIN="${PYTHON_BIN:-python3}"
 fi
 PYTHON_DEPS_DIR="${PYTHON_DEPS_DIR:-$REPLAYSERVE_ROOT/.deps/python}"
 RUN_ROOT="${RUN_ROOT:-$REPLAYSERVE_ROOT/runs/rs1}"
 RUN_DIR="${RUN_ROOT}/${FIXTURE_NAME}"
 FIXTURE_DIR="${REPLAYSERVE_ROOT}/traces/fixtures/${FIXTURE_NAME}"
 TRACE_FILE="${FIXTURE_DIR}/frontier.csv"
 SIDECAR_FILE="${FIXTURE_DIR}/sidecar.jsonl"
 METRICS_ROOT="${RUN_DIR}/frontier_metrics"
 RUN_ID="rs1_${FIXTURE_NAME}"
 # Fail fast (exit 2) when the Frontier checkout or either fixture file is missing.
 if [ ! -d "$FRONTIER_ROOT" ]; then
  echo "ERROR: Frontier root does not exist: $FRONTIER_ROOT" >&2
  exit 2
 fi
 if [ ! -f "$TRACE_FILE" ]; then
  echo "ERROR: fixture trace does not exist: $TRACE_FILE" >&2
  exit 2
 fi
 if [ ! -f "$SIDECAR_FILE" ]; then
  echo "ERROR: fixture sidecar does not exist: $SIDECAR_FILE" >&2
  exit 2
 fi
 mkdir -p "$RUN_DIR" "$METRICS_ROOT"
 # Put the dependency directory (when it exists) and the Frontier checkout ahead
 # of any PYTHONPATH inherited from the caller.
 if [ -d "$PYTHON_DEPS_DIR" ]; then
  export PYTHONPATH="$PYTHON_DEPS_DIR:$FRONTIER_ROOT${PYTHONPATH:+:$PYTHONPATH}"
 else
  export PYTHONPATH="$FRONTIER_ROOT${PYTHONPATH:+:$PYTHONPATH}"
 fi
 # NOTE(review): presumably these two variables keep the simulator from
 # contacting Weights & Biases; confirm against the Frontier sources.
 export WANDB_DISABLED=true
 export VIDUR_DISABLE_WANDB=1
 export FRONTIER_LOG_LEVEL="${FRONTIER_LOG_LEVEL:-info}"
 # Standard CPython switch: do not write .pyc files during the run.
 export PYTHONDONTWRITEBYTECODE=1
 # Simulator invocation, kept as an array so every argument survives quoting.
 # All scheduler knobs are fixed here; prefix caching and chunked prefill are
 # always enabled, and the dummy execution-time flags are always passed.
 CMD=(
  "$PYTHON_BIN" -m frontier.main
  --simulation_mode online
  --sys_arch co-location
  --cc_backend_config_type analytical
  --cluster_config_num_replicas 1
  --cluster_scheduler_config_type sticky_round_robin
  --replica_config_model_name Qwen/Qwen3-32B
  --replica_config_device a800
  --replica_config_network_device a800_dgx
  --replica_config_attn_tensor_parallel_size 2
  --replica_config_num_pipeline_stages 1
  --replica_config_attn_data_parallel_size 1
  --replica_scheduler_config_type vllm_v1
  --decode_cuda_graph_mode full_decode_only
  --vllm_v1_scheduler_config_batch_size_cap 128
  --vllm_v1_scheduler_config_max_tokens_in_batch 32768
  --vllm_v1_scheduler_config_long_prefill_token_threshold 64
  --vllm_v1_scheduler_config_block_size 16
  --vllm_v1_scheduler_config_num_blocks_mode memory_planner
  --vllm_v1_scheduler_config_gpu_memory_utilization 0.9
  --vllm_v1_scheduler_config_non_kv_cache_overhead_bytes 0
  --vllm_v1_scheduler_config_enable_prefix_caching
  --vllm_v1_scheduler_config_enable_chunked_prefill
  --request_generator_config_type trace_replay
  --trace_request_generator_config_trace_file "$TRACE_FILE"
  --trace_request_generator_config_max_tokens 32768
  --random_forrest_execution_time_predictor_config_enable_dummy_mode
  --random_forrest_execution_time_predictor_config_dummy_execution_time_ms 1.0
  --metrics_config_output_dir "$METRICS_ROOT"
  --metrics_config_run_id "$RUN_ID"
  --metrics_config_write_metrics
  --metrics_config_store_request_metrics
  --metrics_config_store_batch_metrics
  --metrics_config_store_token_completion_metrics
  --metrics_config_store_utilization_metrics
  --no-metrics_config_store_plots
  --no-metrics_config_enable_chrome_trace
  --no-metrics_config_write_json_trace
  --no-metrics_config_store_frontier_stage_batch_ledger
 )
 # command.txt: the working directory, exported environment, and the full
 # command line, each value shell-quoted with printf %q.
 {
  printf 'cd %q\n' "$FRONTIER_ROOT"
  printf 'export PYTHONPATH=%q\n' "$PYTHONPATH"
  printf 'export WANDB_DISABLED=%q\n' "$WANDB_DISABLED"
  printf 'export VIDUR_DISABLE_WANDB=%q\n' "$VIDUR_DISABLE_WANDB"
  printf 'export FRONTIER_LOG_LEVEL=%q\n' "$FRONTIER_LOG_LEVEL"
  printf 'export PYTHONDONTWRITEBYTECODE=%q\n' "$PYTHONDONTWRITEBYTECODE"
  printf 'command='
  printf '%q ' "${CMD[@]}"
  printf '\n'
 } > "$RUN_DIR/command.txt"
 # env.txt: key=value provenance for the run (paths, run id, and the Frontier
 # checkout's HEAD commit as reported by git).
 {
  printf 'fixture_name=%s\n' "$FIXTURE_NAME"
  printf 'replayserve_root=%s\n' "$REPLAYSERVE_ROOT"
  printf 'frontier_root=%s\n' "$FRONTIER_ROOT"
  printf 'python_deps_dir=%s\n' "$PYTHON_DEPS_DIR"
  printf 'trace_file=%s\n' "$TRACE_FILE"
  printf 'sidecar_file=%s\n' "$SIDECAR_FILE"
  printf 'run_dir=%s\n' "$RUN_DIR"
  printf 'metrics_root=%s\n' "$METRICS_ROOT"
  printf 'run_id=%s\n' "$RUN_ID"
  printf 'frontier_head=%s\n' "$(git -C "$FRONTIER_ROOT" rev-parse HEAD)"
 } > "$RUN_DIR/env.txt"
 # Wall-clock bookkeeping in whole seconds since the epoch.
 START_EPOCH="$(date +%s)"
 printf '%s\n' "$START_EPOCH" > "$RUN_DIR/start_epoch.txt"
 # Suspend errexit so a non-zero simulator exit is captured in EXIT_CODE instead
 # of aborting the script; the subshell keeps the cd from leaking out.
 set +e
 (
  cd "$FRONTIER_ROOT"
  "${CMD[@]}"
 ) >"$RUN_DIR/stdout.log" 2>"$RUN_DIR/stderr.log"
 EXIT_CODE=$?
 set -e
 END_EPOCH="$(date +%s)"
 printf '%s\n' "$END_EPOCH" > "$RUN_DIR/end_epoch.txt"
 printf '%s\n' "$EXIT_CODE" > "$RUN_DIR/exit_code.txt"
 printf '%s\n' "$((END_EPOCH - START_EPOCH))" > "$RUN_DIR/runtime_seconds.txt"
 # Postprocessing runs only after a successful simulation. errexit is active
 # again here, so a postprocessor failure ends the script with that status
 # before the final exit line is reached.
 if [ "$EXIT_CODE" -eq 0 ]; then
  "$PYTHON_BIN" "$REPLAYSERVE_ROOT/tools/postprocess_frontier_smoke.py" \
    --run-dir "$RUN_DIR" \
    --fixture-dir "$FIXTURE_DIR"
 fi
 exit "$EXIT_CODE"
--- a/tools/aggregate_runs.py
+++ b/tools/aggregate_runs.py
@@ -0,0 +1,255 @@
 #!/usr/bin/env python3
 """Aggregate ReplayServe Frontier run directories into CSV and Markdown."""
 from __future__ import annotations
 import argparse
 import csv
 import json
 from pathlib import Path
 from typing import Any
 # Directory one level above the directory that contains this file.
 REPLAYSERVE_ROOT = Path(__file__).resolve().parents[1]
 # Column order of the summary CSV. write_csv() emits exactly these keys, so a
 # key returned by summarize_run() that is not listed here never reaches the CSV.
 FIELDNAMES = [
    # Run identity and outcome.
    "suite_id",
    "sim",
    "fixture",
    "config_id",
    "status",
    "exit_code",
    "runtime_seconds",
    # Frontier checkout provenance.
    "frontier_mode",
    "frontier_head",
    "frontier_dirty",
    # Knobs copied from the run manifest.
    "attn_tp",
    "attn_dp",
    "moe_tp",
    "moe_ep",
    "batch_size_cap",
    "max_tokens_in_batch",
    "block_size",
    "enable_prefix_caching",
    "enable_chunked_prefill",
    "long_prefill_token_threshold",
    # Prefix-cache, completion, and preemption results from postprocessing.
    "frontier_block_hit_ratio",
    "replayserve_token_hit_ratio",
    "cache_metrics_available",
    "cache_metrics_unavailable_reason",
    "cache_metric_rows_complete",
    "cache_metric_rows_total",
    "cache_metric_rows_missing",
    "completion_is_complete",
    "missing_latency_request_ids",
    "preemption_events",
    "preempted_requests",
    # Latency statistics and throughput from the system metrics file.
    "ttft_mean_ms",
    "ttft_p50_ms",
    "ttft_p95_ms",
    "tpot_mean_ms",
    "tpot_p50_ms",
    "tpot_p95_ms",
    "e2e_mean_ms",
    "e2e_p50_ms",
    "e2e_p95_ms",
    "requests_per_second",
    "tokens_per_second",
    "decode_tokens_per_second",
    "completed_requests",
    "total_requests",
    "run_dir",
 ]
 def parse_args() -> argparse.Namespace:
    """Parse the command line: one suite directory plus two optional output paths."""
    cli = argparse.ArgumentParser(description="Aggregate ReplayServe run outputs.")
    cli.add_argument("suite_dir", type=Path, help="Run suite directory.")
    optional_outputs = (
        ("--output-csv", "Output CSV path. Defaults to <suite_dir>/summary.csv."),
        ("--output-md", "Output Markdown path. Defaults to <suite_dir>/summary.md."),
    )
    for flag, help_text in optional_outputs:
        cli.add_argument(flag, type=Path, help=help_text)
    return cli.parse_args()
 def load_json(path: Path) -> dict[str, Any]:
    """Read a JSON object from ``path``.

    Returns ``{}`` when the path does not exist or when the top-level JSON
    value is not an object. Malformed JSON still raises from ``json.load``.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as stream:
            parsed = json.load(stream)
        if isinstance(parsed, dict):
            return parsed
    return {}
 def read_int(path: Path) -> int | None:
    """Return the integer stored in ``path``, or ``None`` if the file is missing or unparsable."""
    try:
        text = path.read_text(encoding="utf-8")
        number = int(text.strip())
    except (FileNotFoundError, ValueError):
        return None
    return number
 def nested(data: dict[str, Any], *keys: str) -> Any:
    """Follow ``keys`` through nested dicts.

    Returns ``None`` as soon as a non-dict is met before all keys are used, or
    when a key is absent; with no keys, ``data`` itself is returned.
    """
    cursor: Any = data
    for depth in range(len(keys)):
        if isinstance(cursor, dict):
            cursor = cursor.get(keys[depth])
        else:
            return None
    return cursor
 def fmt(value: Any) -> str:
    """Render a cell value as text.

    ``None`` becomes an empty string, booleans become ``true``/``false``,
    floats use the ``.8g`` format, and everything else goes through ``str``.
    """
    if value is None:
        rendered = ""
    elif isinstance(value, bool):
        # Checked before other numeric handling so True/False never print as 1/0.
        rendered = ("false", "true")[value]
    elif isinstance(value, float):
        rendered = format(value, ".8g")
    else:
        rendered = str(value)
    return rendered
 def summarize_run(run_dir: Path) -> dict[str, Any]:
    """Collapse one run directory into a flat summary row.

    Reads ``run_manifest.json``, ``run_status.json`` and
    ``postprocess_summary.json`` from ``run_dir``, plus the system-metrics file
    named by the postprocess summary's ``system_metrics`` entry. Missing files
    and wrongly typed sections are treated as empty. The exit code and runtime
    fall back to ``exit_code.txt`` / ``runtime_seconds.txt`` when the status
    file does not carry them.
    """

    def section(container: dict[str, Any], key: str) -> dict[str, Any]:
        # A sub-dict, or {} when the key is absent or holds a non-dict.
        candidate = container.get(key)
        return candidate if isinstance(candidate, dict) else {}

    def list_field(container: dict[str, Any], key: str) -> list[Any]:
        # A list value, or [] when the key is absent, falsy, or not a list.
        candidate = container.get(key) or []
        return candidate if isinstance(candidate, list) else []

    manifest = load_json(run_dir / "run_manifest.json")
    run_status = load_json(run_dir / "run_status.json")
    post = load_json(run_dir / "postprocess_summary.json")
    metrics_location = post.get("system_metrics")
    system_metrics = load_json(Path(metrics_location)) if metrics_location else {}

    knobs = section(manifest, "knobs")
    frontier = section(manifest, "frontier")
    prefix = section(post, "prefix_cache_postprocess")
    frontier_block = section(prefix, "frontier_block_level")
    token_weighted = section(prefix, "replayserve_token_weighted")
    missing_rows = list_field(prefix, "rows_with_missing_cache_metrics")
    preemption = section(post, "preemption_statistics")
    completion = section(post, "completion")
    simulation = section(system_metrics, "simulation_metadata")
    throughput = section(system_metrics, "throughput_metrics")

    exit_code = run_status.get("exit_code")
    if exit_code is None:
        exit_code = read_int(run_dir / "exit_code.txt")
    runtime = run_status.get("runtime_seconds")
    if runtime is None:
        runtime = read_int(run_dir / "runtime_seconds.txt")

    status = run_status.get("status")
    if not status:
        status = "pass" if exit_code == 0 else "fail"
    # A run whose completion section reports is_complete false is always
    # labelled incomplete, whatever status was derived above.
    if completion and not completion.get("is_complete", True):
        status = "incomplete"
    missing_latency_ids = list_field(completion, "missing_latency_request_ids")

    row: dict[str, Any] = {
        column: manifest.get(column)
        for column in ("suite_id", "sim", "fixture", "config_id")
    }
    row["status"] = status
    row["exit_code"] = exit_code
    row["runtime_seconds"] = runtime
    row["frontier_mode"] = frontier.get("mode")
    row["frontier_head"] = frontier.get("head")
    # Dirty means the recorded short status has any non-whitespace content.
    row["frontier_dirty"] = bool((frontier.get("status_short") or "").strip())

    knob_columns = (
        ("attn_tp", "attn_tensor_parallel_size"),
        ("attn_dp", "attn_data_parallel_size"),
        ("moe_tp", "moe_tensor_parallel_size"),
        ("moe_ep", "moe_expert_parallel_size"),
        ("batch_size_cap", "batch_size_cap"),
        ("max_tokens_in_batch", "max_tokens_in_batch"),
        ("block_size", "block_size"),
        ("enable_prefix_caching", "enable_prefix_caching"),
        ("enable_chunked_prefill", "enable_chunked_prefill"),
        ("long_prefill_token_threshold", "long_prefill_token_threshold"),
    )
    for column, knob_name in knob_columns:
        row[column] = knobs.get(knob_name)

    row.update(
        {
            "frontier_block_hit_ratio": frontier_block.get("hit_ratio"),
            "replayserve_token_hit_ratio": token_weighted.get("hit_ratio"),
            "cache_metrics_available": prefix.get("available"),
            "cache_metrics_unavailable_reason": prefix.get("reason"),
            "cache_metric_rows_complete": prefix.get("completed_request_rows"),
            "cache_metric_rows_total": prefix.get("total_request_metric_rows"),
            "cache_metric_rows_missing": len(missing_rows),
            "completion_is_complete": completion.get("is_complete"),
            "missing_latency_request_ids": ",".join(
                str(value) for value in missing_latency_ids
            ),
            "preemption_events": preemption.get("total_preemption_events"),
            "preempted_requests": preemption.get("total_preempted_requests"),
        }
    )

    latency_columns = (
        ("ttft_mean_ms", "ttft_statistics", "mean"),
        ("ttft_p50_ms", "ttft_statistics", "p50"),
        ("ttft_p95_ms", "ttft_statistics", "p95"),
        ("tpot_mean_ms", "tpot_statistics", "mean"),
        ("tpot_p50_ms", "tpot_statistics", "p50"),
        ("tpot_p95_ms", "tpot_statistics", "p95"),
        ("e2e_mean_ms", "request_e2e_time_statistics", "mean"),
        ("e2e_p50_ms", "request_e2e_time_statistics", "p50"),
        ("e2e_p95_ms", "request_e2e_time_statistics", "p95"),
    )
    for column, stats_key, statistic in latency_columns:
        row[column] = nested(system_metrics, stats_key, statistic)

    for column in (
        "requests_per_second",
        "tokens_per_second",
        "decode_tokens_per_second",
    ):
        row[column] = throughput.get(column)
    for column in ("completed_requests", "total_requests"):
        row[column] = simulation.get(column)
    row["run_dir"] = str(run_dir)
    return row
 def write_csv(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as CSV with the ``FIELDNAMES`` columns, every cell rendered by ``fmt``."""
    with path.open("w", encoding="utf-8", newline="") as handle:
        table = csv.DictWriter(handle, fieldnames=FIELDNAMES)
        table.writeheader()
        table.writerows(
            {column: fmt(row.get(column)) for column in FIELDNAMES} for row in rows
        )
 def write_markdown(path: Path, rows: list[dict[str, Any]], suite_dir: Path) -> None:
    """Write a Markdown summary for the suite.

    The file holds a title, the suite directory and run count, one table row
    per run limited to a fixed subset of columns, and a closing caveat.
    """
    columns = (
        "config_id",
        "fixture",
        "status",
        "runtime_seconds",
        "enable_prefix_caching",
        "enable_chunked_prefill",
        "frontier_block_hit_ratio",
        "replayserve_token_hit_ratio",
        "cache_metric_rows_missing",
        "completion_is_complete",
        "preemption_events",
        "ttft_mean_ms",
        "tpot_mean_ms",
        "e2e_mean_ms",
        "tokens_per_second",
    )
    header_line = "| " + " | ".join(columns) + " |\n"
    divider_line = "|" + "|".join("---" for _ in columns) + "|\n"
    caveat = (
        "Latency and throughput values are Frontier smoke outputs from the "
        "configured predictor/profile mode. RS3 tiny smoke uses dummy execution "
        "time, so these are harness plumbing checks, not performance claims.\n"
    )
    with path.open("w", encoding="utf-8") as out:
        out.write(f"# Sweep Summary: {suite_dir.name}\n\n")
        out.write(f"- Suite dir: `{suite_dir}`\n")
        out.write(f"- Runs: `{len(rows)}`\n\n")
        out.write(header_line)
        out.write(divider_line)
        out.writelines(
            "| " + " | ".join(fmt(row.get(name)) for name in columns) + " |\n"
            for row in rows
        )
        out.write("\n")
        out.write(caveat)
 def main() -> int:
    """Summarize every run under the suite directory and write the CSV and Markdown reports.

    A run directory is any directory, at any depth below the suite directory,
    that contains ``run_manifest.json``. Returns 0.
    """
    options = parse_args()
    suite_root = options.suite_dir.resolve()
    manifests = suite_root.glob("**/run_manifest.json")
    ordered_run_dirs = sorted(manifest.parent for manifest in manifests)
    rows = [summarize_run(run_dir) for run_dir in ordered_run_dirs]
    csv_target = (
        options.output_csv if options.output_csv else suite_root / "summary.csv"
    )
    md_target = options.output_md if options.output_md else suite_root / "summary.md"
    write_csv(csv_target, rows)
    write_markdown(md_target, rows, suite_root)
    for written in (csv_target, md_target):
        print(f"wrote {written}")
    return 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/tools/analyze_trace_window.py
+++ b/tools/analyze_trace_window.py
@@ -0,0 +1,188 @@
 #!/usr/bin/env python3
 """Analyze Qwen/ReplayServe sidecar rows around a request id."""
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
 from typing import Any
 def parse_args() -> argparse.Namespace:
    """Parse the command line for the trace-window analysis.
    ``--fixture-dir``, ``--request-id`` and ``--output-dir`` are mandatory;
    ``--window`` defaults to 10 and ``--top-k`` to 15.
    """
    parser = argparse.ArgumentParser(description="Analyze sidecar prefix overlap.")
    # Declared in a table so the flag order (and hence --help order) is explicit.
    flag_table = (
        ("--fixture-dir", {"required": True, "type": Path}),
        ("--request-id", {"required": True, "type": int}),
        ("--window", {"type": int, "default": 10}),
        ("--top-k", {"type": int, "default": 15}),
        ("--output-dir", {"required": True, "type": Path}),
    )
    for flag, settings in flag_table:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
 def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a UTF-8 JSON-lines file and return its objects in file order.
    Whitespace-only lines are ignored. Raises ``ValueError`` naming the
    1-based line number when a line decodes to something other than an object.
    """
    records: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as stream:
        line_number = 0
        for raw_line in stream:
            line_number += 1
            text = raw_line.strip()
            if text:
                parsed = json.loads(text)
                if isinstance(parsed, dict):
                    records.append(parsed)
                else:
                    raise ValueError(f"{path}: line {line_number}: expected object")
    return records
 def common_prefix_len(left: list[int], right: list[int]) -> int:
    """Return how many leading positions of ``left`` and ``right`` hold equal items."""
    for position, (left_item, right_item) in enumerate(zip(left, right)):
        if left_item != right_item:
            # First mismatch: everything before this position matched.
            return position
    # No mismatch within the overlap, so the shorter list is entirely shared.
    return min(len(left), len(right))
 def summarize_row(row: dict[str, Any], block_size: int = 16) -> dict[str, Any]:
    """Condense one sidecar row into a flat summary dict.
    Numeric fields are coerced with ``int``/``float``. Only the first 12 hash
    ids are kept; ``partial_final_block`` is true when ``input_length`` is not
    a multiple of ``block_size``.
    """
    prompt_tokens = int(row["input_length"])
    decode_tokens = int(row["output_length"])
    hashes = list(map(int, row["hash_ids"]))
    per_block_tokens = list(map(int, row["block_token_counts"]))
    summary: dict[str, Any] = {}
    for field in ("request_id", "chat_id", "parent_chat_id", "turn"):
        summary[field] = int(row[field])
    summary["type"] = row["type"]
    summary["timestamp"] = float(row["timestamp"])
    summary["input_length"] = prompt_tokens
    summary["output_length"] = decode_tokens
    summary["total_tokens"] = prompt_tokens + decode_tokens
    summary["hash_count"] = len(hashes)
    summary["first_hash_ids"] = hashes[:12]
    summary["last_hash_id"] = hashes[-1] if hashes else None
    summary["partial_final_block"] = bool(prompt_tokens % block_size)
    summary["final_block_token_count"] = per_block_tokens[-1] if per_block_tokens else 0
    return summary
 def main() -> int:
    """Analyze how one sidecar request's prompt prefix overlaps earlier requests.
    Loads ``sidecar.jsonl`` from ``--fixture-dir``, ranks every row with a
    smaller ``request_id`` by the number of leading ``hash_ids`` it shares with
    the target, and writes ``request_<id>_analysis.json`` plus
    ``request_<id>_analysis.md`` into ``--output-dir``. Both paths are printed.
    Returns:
        0 on success.
    Raises:
        SystemExit: if ``--request-id`` does not occur in the sidecar file.
    """
    args = parse_args()
    sidecar_path = args.fixture_dir / "sidecar.jsonl"
    rows = load_jsonl(sidecar_path)
    by_id = {int(row["request_id"]): row for row in rows}
    if args.request_id not in by_id:
        raise SystemExit(f"request_id {args.request_id} not found in {sidecar_path}")
    target = by_id[args.request_id]
    target_hashes = [int(value) for value in target["hash_ids"]]
    target_counts = [int(value) for value in target["block_token_counts"]]
    target_input_length = int(target["input_length"])
    target_summary = summarize_row(target)
    overlaps: list[dict[str, Any]] = []
    for row in rows:
        # Only requests with a smaller id count as "prior" to the target.
        if int(row["request_id"]) >= args.request_id:
            continue
        lcp_blocks = common_prefix_len(target_hashes, [int(value) for value in row["hash_ids"]])
        if lcp_blocks <= 0:
            continue
        # Token counts come from the target's blocks so a partial final block
        # contributes its real size rather than a full block.
        lcp_tokens = sum(target_counts[:lcp_blocks])
        overlaps.append(
            {
                **summarize_row(row),
                "common_prefix_blocks_with_target": lcp_blocks,
                "common_prefix_tokens_with_target": lcp_tokens,
                "target_prefix_fraction_blocks": (
                    lcp_blocks / len(target_hashes) if target_hashes else 0.0
                ),
                "target_prefix_fraction_tokens": (
                    lcp_tokens / target_input_length if target_input_length > 0 else 0.0
                ),
            }
        )
    # Longest shared prefix first; ties broken by the most recent request id.
    overlaps.sort(
        key=lambda item: (
            item["common_prefix_blocks_with_target"],
            item["request_id"],
        ),
        reverse=True,
    )
    # Centre the window on the target's position in the file. Using the
    # request id itself as a list index is only right when ids are exactly
    # 0..n-1 in file order; this form gives the same result in that case.
    target_index = next(index for index, row in enumerate(rows) if row is target)
    start = max(0, target_index - args.window)
    end = min(len(rows), target_index + args.window + 1)
    local_window = [summarize_row(row) for row in rows[start:end]]
    parent_chat_id = int(target["parent_chat_id"])
    # The parent id is matched against both chat_id and request_id; every hit
    # is reported as a candidate rather than picking one interpretation.
    parent_rows = [
        summarize_row(row)
        for row in rows
        if int(row["chat_id"]) == parent_chat_id or int(row["request_id"]) == parent_chat_id
    ]
    result = {
        "fixture_dir": str(args.fixture_dir),
        "sidecar": str(sidecar_path),
        "request_id": args.request_id,
        "target": target_summary,
        "local_window": local_window,
        "top_prior_prefix_overlaps": overlaps[: args.top_k],
        "prior_overlap_count": len(overlaps),
        "parent_candidates": parent_rows,
        "interpretation": {
            "prefix_overlap_semantics": (
                "Frontier prefix cache matches consecutive block_hash_ids from "
                "the start of the prompt. common_prefix_tokens_with_target uses "
                "the target sidecar block_token_counts, preserving partial final "
                "block token counts."
            ),
            "partial_final_block_related": bool(target_input_length % 16 != 0),
        },
    }
    args.output_dir.mkdir(parents=True, exist_ok=True)
    json_path = args.output_dir / f"request_{args.request_id}_analysis.json"
    md_path = args.output_dir / f"request_{args.request_id}_analysis.md"
    with json_path.open("w", encoding="utf-8") as handle:
        json.dump(result, handle, indent=2, sort_keys=True)
        handle.write("\n")
    with md_path.open("w", encoding="utf-8") as handle:
        handle.write(f"# Request {args.request_id} Trace Analysis\n\n")
        handle.write(f"- Fixture: `{args.fixture_dir}`\n")
        handle.write(f"- Timestamp: `{target_summary['timestamp']}`\n")
        handle.write(f"- Chat: `{target_summary['chat_id']}` parent `{target_summary['parent_chat_id']}` turn `{target_summary['turn']}`\n")
        handle.write(f"- Input/output/total tokens: `{target_summary['input_length']}` / `{target_summary['output_length']}` / `{target_summary['total_tokens']}`\n")
        handle.write(f"- Hash blocks: `{target_summary['hash_count']}`\n")
        handle.write(f"- Partial final block: `{target_summary['partial_final_block']}` final count `{target_summary['final_block_token_count']}`\n")
        handle.write("\n## Top Prior Prefix Overlaps\n\n")
        if not overlaps:
            handle.write("No prior request shares a first block with the target.\n")
        else:
            handle.write("| prior request | timestamp | input | output | lcp blocks | lcp tokens | partial final |\n")
            handle.write("|---:|---:|---:|---:|---:|---:|---|\n")
            for item in overlaps[: args.top_k]:
                handle.write(
                    f"| {item['request_id']} | {item['timestamp']} | "
                    f"{item['input_length']} | {item['output_length']} | "
                    f"{item['common_prefix_blocks_with_target']} | "
                    f"{item['common_prefix_tokens_with_target']} | "
                    f"{item['partial_final_block']} |\n"
                )
        handle.write("\n## Local Window\n\n")
        handle.write("| request | timestamp | input | output | blocks | partial final | first hashes |\n")
        handle.write("|---:|---:|---:|---:|---:|---|---|\n")
        for item in local_window:
            handle.write(
                f"| {item['request_id']} | {item['timestamp']} | "
                f"{item['input_length']} | {item['output_length']} | "
                f"{item['hash_count']} | {item['partial_final_block']} | "
                f"`{item['first_hash_ids']}` |\n"
            )
    print(json_path)
    print(md_path)
    return 0
 # Script entry point: the integer returned by main() becomes the exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/tools/analyze_vllm_prefix_log.py
+++ b/tools/analyze_vllm_prefix_log.py
@@ -0,0 +1,98 @@
 #!/usr/bin/env python3
 """Summarize vLLM scheduler prefix-cache `computed:` log lines."""
 from __future__ import annotations
 import argparse
 import json
 import re
 from pathlib import Path
 from typing import Any
 # Matches a scheduler "started running" line; groups are request id,
 # prompt token count and already-computed token count, in that order.
 START_RE = re.compile(r"Request (\d+) started running, prompt: (\d+), computed: (\d+)")
 # Matches a preemption line; the single group is the request id.
 PREEMPT_RE = re.compile(r"Request (\d+) preempted")
 def parse_args() -> argparse.Namespace:
    """Parse the command line: a positional log path and optional ``--summary-json``."""
    help_text = (
        "Parse vLLM scheduler logs and report observed computed-token "
        "prefix-cache behavior. "
        "Repeated starts indicate preemption or re-admission, "
        "so all-start sums are not equivalent to per-request prefix hits."
    )
    parser = argparse.ArgumentParser(description=help_text)
    for name in ("stdout_log", "--summary-json"):
        parser.add_argument(name, type=Path)
    return parser.parse_args()
 def load_estimated_hit_tokens(path: Path | None) -> int | None:
    """Read ``estimated_prefix_reuse.hit_tokens`` from a summary JSON file.
    Args:
        path: summary file to read, or ``None`` when no summary was supplied.
    Returns:
        The hit-token count as an ``int``, or ``None`` when ``path`` is
        ``None`` or the summary carries no usable ``hit_tokens`` value.
    """
    if path is None:
        return None
    summary = json.loads(path.read_text(encoding="utf-8"))
    reuse = summary.get("estimated_prefix_reuse")
    # The key may be absent or explicitly null; both mean "no estimate"
    # rather than an error, so avoid calling .get() on a non-dict.
    if not isinstance(reuse, dict):
        return None
    hit_tokens = reuse.get("hit_tokens")
    return int(hit_tokens) if hit_tokens is not None else None
 def main() -> int:
    """Summarize ``computed:`` values from a vLLM scheduler log as JSON on stdout.
    Start lines are grouped per request id; computed-token totals are reported
    over all starts and over the first, last and largest start of each request.
    When ``--summary-json`` yields an estimate, each total is compared to it.
    Returns 0.
    """
    options = parse_args()
    log_text = options.stdout_log.read_text(encoding="utf-8", errors="replace")
    starts_by_request: dict[int, list[dict[str, int]]] = {}
    for found in START_RE.finditer(log_text):
        request_id, prompt_tokens, computed_tokens = (int(group) for group in found.groups())
        entry = {"prompt_tokens": prompt_tokens, "computed_tokens": computed_tokens}
        starts_by_request.setdefault(request_id, []).append(entry)
    preempted_request_ids = [int(found.group(1)) for found in PREEMPT_RE.finditer(log_text)]
    # Requests that started more than once, keyed by the id rendered as a string.
    repeated = {
        str(request_id): starts_by_request[request_id]
        for request_id in sorted(starts_by_request)
        if len(starts_by_request[request_id]) > 1
    }
    # One list of computed-token values per request, in log order.
    computed_per_request = [
        [start["computed_tokens"] for start in starts]
        for starts in starts_by_request.values()
    ]
    computed_totals = {
        "all_starts": sum(sum(values) for values in computed_per_request),
        "first_start_per_request": sum(values[0] for values in computed_per_request),
        "last_start_per_request": sum(values[-1] for values in computed_per_request),
        "max_per_request": sum(max(values) for values in computed_per_request),
    }
    estimated_hit_tokens = load_estimated_hit_tokens(options.summary_json)
    report: dict[str, Any] = {
        "stdout_log": str(options.stdout_log),
        "starts_total": sum(len(values) for values in computed_per_request),
        "unique_requests": len(starts_by_request),
        "preemptions": len(preempted_request_ids),
        "preempted_request_ids": preempted_request_ids,
        "repeated_request_ids": sorted(int(request_id) for request_id in repeated),
        "computed_tokens": computed_totals,
        "repeated_starts": repeated,
    }
    if estimated_hit_tokens is not None:
        report["estimated_prefix_hit_tokens"] = estimated_hit_tokens
        report["matches_estimate"] = {
            label: total == estimated_hit_tokens
            for label, total in computed_totals.items()
        }
    print(json.dumps(report, indent=2, sort_keys=True))
    return 0
 # Script entry point: the integer returned by main() becomes the exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/tools/build_frontier_vllm_alignment_report.py
+++ b/tools/build_frontier_vllm_alignment_report.py
@@ -0,0 +1,532 @@
 #!/usr/bin/env python3
 """Build Frontier-vs-vLLM alignment tables and plots for the current H20 runs."""
 from __future__ import annotations
 import csv
 import json
 import subprocess
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 import matplotlib
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 # Two levels above this file; with the script living in tools/, that is the
 # repository root. Relative run paths are resolved against it.
 ROOT = Path(__file__).resolve().parents[1]
 # Directory that receives the generated CSV, JSON and PNG files.
 OUT_DIR = ROOT / "docs" / "assets" / "frontier_vllm_alignment"
 # Absolute directory of the dash1 vLLM runs; used to build the vllm_summary
 # paths of the remote (vllm_remote=True) entries.
 DASH1_VLLM_ROOT = Path("/home/admin/cpfs/wjh/replayserve/runs/vllm_gpu_smoke_20260625_dash1")
@dataclass(frozen=True)
 class RunSpec:
    """Immutable description of one Frontier run paired with one vLLM run."""
    # Identifier copied into the "run_id" output column.
    run_id: str
    # Human-readable name; used as the x-axis tick label in the plots.
    label: str
    # Tensor parallel size; selects the bar colour and the x position in the
    # TP scaling plot.
    tp: int
    # Copied into the "request_count" column; the TP scaling plot keeps only 200.
    request_count: int
    # Scale as text; the TP scaling plot groups rows by this value.
    scale_label: str
    # Numeric counterpart of scale_label, copied into the "scale_value" column.
    scale_value: float
    # Fixture name, copied into the "fixture" column.
    fixture: str
    # Path of the Frontier postprocess summary JSON, resolved against ROOT.
    frontier_summary: str
    # Path of the vLLM summary JSON; resolved against ROOT unless vllm_remote.
    vllm_summary: str
    # vLLM preemption count supplied by hand; copied through without being
    # recomputed from any file.
    vllm_preemptions: int
    # Copied into the "kv_blocks" column.
    kv_blocks: int
    # Free-form remark, copied into the "notes" column.
    notes: str = ""
    # When true, the vLLM summary is read from a local mirror under ROOT if one
    # exists, otherwise fetched with `ssh dash1 cat`.
    vllm_remote: bool = False
 # Hard-coded catalogue of the Frontier/vLLM run pairs covered by the report.
 # Output rows and plot bars follow this order. TP1 entries read both summaries
 # relative to ROOT; TP2/TP4 entries point at DASH1_VLLM_ROOT and set
 # vllm_remote=True.
 RUNS: list[RunSpec] = [
    RunSpec(
        run_id="tp1_n100_scale1",
        label="TP1 N100 raw",
        tp=1,
        request_count=100,
        scale_label="raw",
        scale_value=1.0,
        fixture="coder_100",
        frontier_summary=(
            "runs/rs6_frontier_h20_tp1_profile_full32k_20260624/"
            "frontier_h20_tp1_profile_full32k/coder_100/"
            "vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260624/tp1_coder100_uncapped/summary.json",
        vllm_preemptions=8,
        kv_blocks=15281,
        notes="Frontier incomplete before lifecycle fix; included as TP1 100-request baseline.",
    ),
    RunSpec(
        run_id="tp1_n500_scale1",
        label="TP1 N500 raw",
        tp=1,
        request_count=500,
        scale_label="raw",
        scale_value=1.0,
        fixture="coder_500",
        frontier_summary=(
            "runs/rs8_frontier_h20_tp1_profile_full32k_coder500_20260625/"
            "frontier_h20_tp1_profile_full32k/coder_500/"
            "vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder500_uncapped/summary.json",
        vllm_preemptions=63,
        kv_blocks=15281,
        notes="Frontier incomplete; useful as high-pressure stress signal.",
    ),
    RunSpec(
        run_id="tp1_n200_scale0667",
        label="TP1 N200 scale 0.667",
        tp=1,
        request_count=200,
        scale_label="0.667",
        scale_value=2 / 3,
        fixture="coder_200_ts0667",
        frontier_summary=(
            "runs/rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667/"
            "frontier_h20_tp1_profile_full32k/coder_200_ts0667/"
            "vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder200_ts0667_uncapped/summary.json",
        vllm_preemptions=26,
        kv_blocks=15281,
        notes="Dense-arrival run; Frontier incomplete before lifecycle fix.",
    ),
    RunSpec(
        run_id="tp1_n200_scale2",
        label="TP1 N200 scale 2",
        tp=1,
        request_count=200,
        scale_label="2",
        scale_value=2.0,
        fixture="coder_200_ts2",
        frontier_summary=(
            "runs/rs10_preemption_replay_fix_ts2/frontier_h20_tp1_profile_full32k/"
            "coder_200_ts2/vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts2_uncapped/summary.json",
        vllm_preemptions=43,
        kv_blocks=15281,
        notes="After Frontier decode-preemption lifecycle fix.",
    ),
    RunSpec(
        run_id="tp1_n200_scale3",
        label="TP1 N200 scale 3",
        tp=1,
        request_count=200,
        scale_label="3",
        scale_value=3.0,
        fixture="coder_200_ts3",
        frontier_summary=(
            "runs/rs10_preemption_replay_fix_ts3/frontier_h20_tp1_profile_full32k/"
            "coder_200_ts3/vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts3_uncapped/summary.json",
        vllm_preemptions=16,
        kv_blocks=15281,
        notes="After Frontier decode-preemption lifecycle fix.",
    ),
    RunSpec(
        run_id="tp2_n200_scale2",
        label="TP2 N200 scale 2",
        tp=2,
        request_count=200,
        scale_label="2",
        scale_value=2.0,
        fixture="coder_200_ts2",
        frontier_summary=(
            "runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
            "frontier_h20_tp2_tp4_profile_full32k/coder_200_ts2/"
            "tp2_vllm_kv_69055_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary=str(DASH1_VLLM_ROOT / "tp2_coder_200_ts2_uncapped" / "summary.json"),
        vllm_preemptions=0,
        kv_blocks=69055,
        notes="Uses true-mixed TP2/TP4 attention profile.",
        vllm_remote=True,
    ),
    RunSpec(
        run_id="tp2_n200_scale3",
        label="TP2 N200 scale 3",
        tp=2,
        request_count=200,
        scale_label="3",
        scale_value=3.0,
        fixture="coder_200_ts3",
        frontier_summary=(
            "runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
            "frontier_h20_tp2_tp4_profile_full32k/coder_200_ts3/"
            "tp2_vllm_kv_69055_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary=str(DASH1_VLLM_ROOT / "tp2_coder_200_ts3_uncapped" / "summary.json"),
        vllm_preemptions=0,
        kv_blocks=69055,
        notes="Uses true-mixed TP2/TP4 attention profile.",
        vllm_remote=True,
    ),
    RunSpec(
        run_id="tp4_n200_scale2",
        label="TP4 N200 scale 2",
        tp=4,
        request_count=200,
        scale_label="2",
        scale_value=2.0,
        fixture="coder_200_ts2",
        frontier_summary=(
            "runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
            "frontier_h20_tp2_tp4_profile_full32k/coder_200_ts2/"
            "tp4_vllm_kv_177077_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary=str(DASH1_VLLM_ROOT / "tp4_coder_200_ts2_uncapped" / "summary.json"),
        vllm_preemptions=0,
        kv_blocks=177077,
        notes="Uses true-mixed TP2/TP4 attention profile.",
        vllm_remote=True,
    ),
    RunSpec(
        run_id="tp4_n200_scale3",
        label="TP4 N200 scale 3",
        tp=4,
        request_count=200,
        scale_label="3",
        scale_value=3.0,
        fixture="coder_200_ts3",
        frontier_summary=(
            "runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
            "frontier_h20_tp2_tp4_profile_full32k/coder_200_ts3/"
            "tp4_vllm_kv_177077_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary=str(DASH1_VLLM_ROOT / "tp4_coder_200_ts3_uncapped" / "summary.json"),
        vllm_preemptions=0,
        kv_blocks=177077,
        notes="Uses true-mixed TP2/TP4 attention profile.",
        vllm_remote=True,
    ),
 ]
 # Column order of the alignment CSV. Every name is looked up in the row dict
 # when writing; the "*_ratio" columns hold Frontier divided by vLLM.
 FIELDNAMES = [
    "run_id",
    "label",
    "tp",
    "request_count",
    "scale_label",
    "scale_value",
    "fixture",
    "kv_blocks",
    "frontier_completed",
    "frontier_total",
    "frontier_complete",
    "vllm_completed",
    "vllm_total",
    "frontier_preemptions",
    "vllm_preemptions",
    "frontier_prefix_hit",
    "vllm_prefix_hit",
    "prefix_hit_delta",
    "frontier_rps",
    "vllm_rps",
    "rps_ratio",
    "frontier_total_tps",
    "vllm_total_tps",
    "total_tps_ratio",
    "frontier_decode_tps",
    "vllm_decode_tps",
    "decode_tps_ratio",
    "frontier_ttft_p50_s",
    "vllm_ttft_p50_s",
    "ttft_p50_ratio",
    "frontier_ttft_p95_s",
    "vllm_ttft_p95_s",
    "ttft_p95_ratio",
    "frontier_tpot_p50_s",
    "vllm_tpot_p50_s",
    "tpot_p50_ratio",
    "frontier_tpot_p95_s",
    "vllm_tpot_p95_s",
    "tpot_p95_ratio",
    "frontier_e2e_p50_s",
    "vllm_e2e_p50_s",
    "e2e_p50_ratio",
    "frontier_e2e_p95_s",
    "vllm_e2e_p95_s",
    "e2e_p95_ratio",
    "notes",
 ]
 def load_json(path: Path) -> dict[str, Any]:
    """Parse ``path`` as UTF-8 JSON; raise ``ValueError`` unless the top level is an object."""
    with path.open("r", encoding="utf-8") as stream:
        payload = json.load(stream)
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: expected JSON object")
 def load_vllm_summary(spec: RunSpec) -> dict[str, Any]:
    """Load the vLLM summary JSON object for ``spec``.
    Non-remote specs are read relative to ``ROOT``. Remote specs prefer a
    local mirror under ``ROOT/runs/vllm_gpu_smoke_20260625_dash1`` and fall
    back to reading the file over ``ssh dash1``.
    """
    summary_path = Path(spec.vllm_summary)
    if spec.vllm_remote:
        # Mirror layout: <run directory name>/<file name> under the dash1 folder.
        mirror = (
            ROOT
            / "runs"
            / "vllm_gpu_smoke_20260625_dash1"
            / summary_path.parent.name
            / summary_path.name
        )
        if mirror.exists():
            return load_json(mirror)
        remote_text = subprocess.check_output(
            ["ssh", "dash1", f"cat {spec.vllm_summary}"], text=True
        )
        payload = json.loads(remote_text)
        if isinstance(payload, dict):
            return payload
        raise ValueError(f"{spec.vllm_summary}: expected JSON object")
    return load_json(ROOT / summary_path)
 def load_frontier_summary(spec: RunSpec) -> tuple[dict[str, Any], dict[str, Any]]:
    """Return ``(postprocess summary, system metrics)`` for the Frontier run of ``spec``.
    The system metrics file is named by the summary's ``system_metrics`` entry;
    a relative value is resolved against ``ROOT``.
    """
    post = load_json(ROOT / spec.frontier_summary)
    metrics_path = Path(post["system_metrics"])
    resolved = metrics_path if metrics_path.is_absolute() else ROOT / metrics_path
    return post, load_json(resolved)
 def ratio(numerator: float | int | None, denominator: float | int | None) -> float | None:
    """Return ``numerator / denominator`` as a float, or ``None`` when undefined.
    The result is undefined when either operand is ``None`` or the denominator is zero.
    """
    if numerator is None:
        return None
    if denominator is None or denominator == 0:
        return None
    return float(numerator) / float(denominator)
 def nested(data: dict[str, Any], *keys: str) -> Any:
    """Follow ``keys`` through nested dicts; return ``None`` as soon as the path breaks.
    With no keys, ``data`` itself is returned.
    """
    current: Any = data
    pending = list(keys)
    while pending:
        if not isinstance(current, dict):
            # A non-dict appeared before the path was exhausted.
            return None
        current = current.get(pending.pop(0))
    return current
 def summarize(spec: RunSpec) -> dict[str, Any]:
    """Build one Frontier-vs-vLLM comparison row for ``spec``.
    Loads the Frontier postprocess summary, the Frontier system metrics and
    the vLLM summary, copies the paired metrics into a flat dict, and adds a
    ``*_ratio`` entry (Frontier divided by vLLM) for each throughput and
    latency pair. Missing Frontier values are carried as ``None`` so that the
    row can still be written by ``write_csv``/``write_json``.
    Raises:
        KeyError: if the vLLM summary lacks ``prompt_tokens_per_second`` or
            ``generated_tokens_per_second``.
    """
    post, system = load_frontier_summary(spec)
    vllm = load_vllm_summary(spec)
    completion = post.get("completion", {})
    preemption = post.get("preemption_statistics", {})
    prefix = post.get("prefix_cache_postprocess", {})
    token_weighted = prefix.get("replayserve_token_weighted", {})
    throughput = system.get("throughput_metrics", {})
    def frontier_seconds(section: str, percentile: str) -> float | None:
        # Frontier statistics are divided by 1000 to sit next to the vLLM
        # "*_s" fields (presumably milliseconds -> seconds; confirm against
        # the Frontier metrics schema). A missing statistic stays None
        # instead of raising TypeError on the division.
        value = nested(system, section, percentile)
        return None if value is None else value / 1000
    frontier_total_tps = throughput.get("tokens_per_second")
    vllm_total_tps = vllm["prompt_tokens_per_second"] + vllm["generated_tokens_per_second"]
    frontier_prefix_hit = token_weighted.get("hit_ratio")
    vllm_prefix_hit = nested(vllm, "estimated_prefix_reuse", "token_hit_ratio")
    row: dict[str, Any] = {
        "run_id": spec.run_id,
        "label": spec.label,
        "tp": spec.tp,
        "request_count": spec.request_count,
        "scale_label": spec.scale_label,
        "scale_value": spec.scale_value,
        "fixture": spec.fixture,
        "kv_blocks": spec.kv_blocks,
        "frontier_completed": completion.get("completed_requests"),
        "frontier_total": completion.get("total_requests"),
        "frontier_complete": completion.get("is_complete"),
        # NOTE(review): completed and total both read "rows", so the vLLM side
        # always appears fully complete - confirm this is intended.
        "vllm_completed": vllm.get("rows"),
        "vllm_total": vllm.get("rows"),
        "frontier_preemptions": preemption.get("total_preemption_events"),
        "vllm_preemptions": spec.vllm_preemptions,
        "frontier_prefix_hit": frontier_prefix_hit,
        "vllm_prefix_hit": vllm_prefix_hit,
        "prefix_hit_delta": (
            float(frontier_prefix_hit) - float(vllm_prefix_hit)
            if frontier_prefix_hit is not None and vllm_prefix_hit is not None
            else None
        ),
        "frontier_rps": throughput.get("requests_per_second"),
        "vllm_rps": vllm.get("requests_per_second"),
        "frontier_total_tps": frontier_total_tps,
        "vllm_total_tps": vllm_total_tps,
        "frontier_decode_tps": throughput.get("decode_tokens_per_second"),
        "vllm_decode_tps": vllm.get("generated_tokens_per_second"),
        "frontier_ttft_p50_s": frontier_seconds("ttft_statistics", "p50"),
        "vllm_ttft_p50_s": nested(vllm, "ttft_s", "p50"),
        "frontier_ttft_p95_s": frontier_seconds("ttft_statistics", "p95"),
        "vllm_ttft_p95_s": nested(vllm, "ttft_s", "p95"),
        "frontier_tpot_p50_s": frontier_seconds("tpot_statistics", "p50"),
        "vllm_tpot_p50_s": nested(vllm, "tpot_s", "p50"),
        "frontier_tpot_p95_s": frontier_seconds("tpot_statistics", "p95"),
        "vllm_tpot_p95_s": nested(vllm, "tpot_s", "p95"),
        "frontier_e2e_p50_s": frontier_seconds("request_e2e_time_statistics", "p50"),
        "vllm_e2e_p50_s": nested(vllm, "e2e_s", "p50"),
        "frontier_e2e_p95_s": frontier_seconds("request_e2e_time_statistics", "p95"),
        "vllm_e2e_p95_s": nested(vllm, "e2e_s", "p95"),
        "notes": spec.notes,
    }
    # Ratio columns drop the "_s" unit suffix: ttft_p50_s -> ttft_p50_ratio.
    for name in [
        "rps",
        "total_tps",
        "decode_tps",
        "ttft_p50_s",
        "ttft_p95_s",
        "tpot_p50_s",
        "tpot_p95_s",
        "e2e_p50_s",
        "e2e_p95_s",
    ]:
        row[f"{name.removesuffix('_s')}_ratio"] = ratio(
            row.get(f"frontier_{name}"), row.get(f"vllm_{name}")
        )
    return row
 def fmt(value: Any) -> str:
    """Render one cell: ``None`` -> "", bools -> "true"/"false", floats to 10 significant digits."""
    # bool is tested before anything else so True/False never print as "True"/"False".
    if isinstance(value, bool):
        return ("false", "true")[value]
    if value is None:
        return ""
    return format(value, ".10g") if isinstance(value, float) else str(value)
 def write_csv(rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``frontier_vllm_alignment.csv`` in ``OUT_DIR``.
    Columns follow ``FIELDNAMES``; every cell is rendered with ``fmt``.
    """
    target = OUT_DIR / "frontier_vllm_alignment.csv"
    with target.open("w", encoding="utf-8", newline="") as stream:
        table = csv.DictWriter(stream, fieldnames=FIELDNAMES)
        table.writeheader()
        table.writerows(
            {column: fmt(record.get(column)) for column in FIELDNAMES}
            for record in rows
        )
 def write_json(rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``frontier_vllm_alignment.json`` in ``OUT_DIR`` (indented, sorted keys)."""
    target = OUT_DIR / "frontier_vllm_alignment.json"
    with target.open("w", encoding="utf-8") as stream:
        json.dump(rows, stream, sort_keys=True, indent=2)
        # Finish with a newline so the file ends cleanly.
        stream.write("\n")
 def setup_axis(ax: plt.Axes, title: str, ylabel: str) -> None:
    """Apply the shared look: title, y label, faint horizontal grid, no top/right spines."""
    ax.set_title(title, fontsize=12, pad=10)
    ax.set_ylabel(ylabel)
    ax.grid(axis="y", alpha=0.25)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
 def annotate_bars(ax: plt.Axes, bars: Any, fmt_text: str = "{:.2f}") -> None:
    """Label each bar with its height, just above its top edge.
    Bars whose height is NaN are left unlabelled; labels for heights above
    2.5 are rotated by 90 degrees.
    """
    for patch in bars:
        value = patch.get_height()
        # NaN is the only value that does not compare equal to itself.
        if value == value:
            center_x = patch.get_x() + patch.get_width() / 2
            ax.annotate(
                fmt_text.format(value),
                xy=(center_x, value),
                xytext=(0, 3),
                textcoords="offset points",
                ha="center",
                va="bottom",
                fontsize=7,
                rotation=0 if value <= 2.5 else 90,
            )
 def savefig(name: str) -> None:
    """Tighten the layout, save the current figure as ``OUT_DIR / name`` at 180 dpi, then close it."""
    destination = OUT_DIR / name
    plt.tight_layout()
    plt.savefig(destination, dpi=180)
    plt.close()
 def plot_throughput_ratio(rows: list[dict[str, Any]]) -> None:
    """Bar chart of Frontier/vLLM total-token throughput, saved as ``throughput_ratio.png``.
    Bars are coloured by tensor parallel size; runs where Frontier did not
    complete are hatched and drawn more transparent.
    """
    tp_palette = {1: "#4C78A8", 2: "#F58518", 4: "#54A24B"}
    positions = range(len(rows))
    tick_labels = [row["label"] for row in rows]
    _, ax = plt.subplots(figsize=(12, 4.8))
    heights = [row["total_tps_ratio"] for row in rows]
    bar_colors = [tp_palette[row["tp"]] for row in rows]
    bars = ax.bar(positions, heights, color=bar_colors, alpha=0.9)
    for patch, row in zip(bars, rows, strict=True):
        if row["frontier_complete"]:
            continue
        patch.set_hatch("//")
        patch.set_alpha(0.65)
    # Dashed reference line at parity between the two systems.
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(list(positions))
    ax.set_xticklabels(tick_labels, rotation=35, ha="right")
    setup_axis(ax, "Frontier Throughput Relative to vLLM", "Frontier / vLLM total tok/s")
    annotate_bars(ax, bars)
    savefig("throughput_ratio.png")
 def plot_latency_ratios(rows: list[dict[str, Any]]) -> None:
    """Grouped bar chart of TTFT p95, TPOT p50 and E2E p95 ratios, saved as ``latency_ratios.png``."""
    tick_labels = [row["label"] for row in rows]
    centers = list(range(len(rows)))
    width = 0.26
    _, ax = plt.subplots(figsize=(13, 5.2))
    # One series per metric, placed left of, on, and right of each group centre.
    left_positions = [center - width for center in centers]
    right_positions = [center + width for center in centers]
    series = (
        (left_positions, "ttft_p95_ratio", "TTFT p95"),
        (centers, "tpot_p50_ratio", "TPOT p50"),
        (right_positions, "e2e_p95_ratio", "E2E p95"),
    )
    containers = [
        ax.bar(positions, [row[column] for row in rows], width, label=legend_name)
        for positions, column, legend_name in series
    ]
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(centers)
    ax.set_xticklabels(tick_labels, rotation=35, ha="right")
    ax.legend(frameon=False, ncols=3, loc="upper left")
    setup_axis(ax, "Latency Ratios", "Frontier / vLLM")
    for container in containers:
        annotate_bars(ax, container)
    savefig("latency_ratios.png")
 def plot_tp_scaling(rows: list[dict[str, Any]]) -> None:
    """Plot total tok/s against tensor parallel size, saved as ``tp_scaling_total_tps.png``.
    Only 200-request rows with scale label "2" or "3" are used; each scale
    gets its own panel with one Frontier line and one vLLM line.
    """
    by_scale: dict[str, dict[int, dict[str, Any]]] = {}
    for row in rows:
        if row["request_count"] == 200 and row["scale_label"] in {"2", "3"}:
            by_scale.setdefault(row["scale_label"], {})[row["tp"]] = row
    _, axes = plt.subplots(1, 2, figsize=(11, 4.2), sharey=False)
    line_specs = (("frontier_total_tps", "Frontier"), ("vllm_total_tps", "vLLM"))
    for panel, scale in zip(axes, ["2", "3"], strict=True):
        per_tp = by_scale[scale]
        tp_sizes = sorted(per_tp)
        for column, legend_name in line_specs:
            panel.plot(
                tp_sizes,
                [per_tp[tp][column] for tp in tp_sizes],
                marker="o",
                label=legend_name,
            )
        panel.set_xticks(tp_sizes)
        panel.set_xlabel("Tensor parallel size")
        setup_axis(panel, f"N=200, timestamp scale {scale}", "total tok/s")
        panel.legend(frameon=False)
    savefig("tp_scaling_total_tps.png")
 def plot_completion_prefix(rows: list[dict[str, Any]]) -> None:
    """Overlay Frontier completion bars with prefix-hit lines, saved as ``completion_prefix.png``.
    The left axis shows completed/total for Frontier; a twin right axis shows
    the Frontier and vLLM prefix token hit ratios.
    """
    tick_labels = [row["label"] for row in rows]
    positions = list(range(len(rows)))
    _, completion_axis = plt.subplots(figsize=(12, 4.8))
    completed_fraction = [row["frontier_completed"] / row["frontier_total"] for row in rows]
    bars = completion_axis.bar(
        positions,
        completed_fraction,
        color="#72B7B2",
        alpha=0.8,
        label="Frontier completion",
    )
    completion_axis.set_ylim(0, 1.08)
    completion_axis.set_xticks(positions)
    completion_axis.set_xticklabels(tick_labels, rotation=35, ha="right")
    setup_axis(completion_axis, "Completion and Prefix Reuse", "Frontier completed / total")
    hit_axis = completion_axis.twinx()
    line_specs = (
        ("frontier_prefix_hit", {"color": "#E45756", "marker": "o", "label": "Frontier prefix hit"}),
        (
            "vllm_prefix_hit",
            {"color": "#4C78A8", "marker": "x", "linestyle": "--", "label": "vLLM trace-side prefix hit"},
        ),
    )
    for column, line_style in line_specs:
        hit_axis.plot(positions, [row[column] for row in rows], **line_style)
    hit_axis.set_ylabel("prefix token hit ratio")
    hit_axis.set_ylim(0, 0.45)
    # One combined legend on the left axis covering the bars and both lines.
    line_handles, line_names = hit_axis.get_legend_handles_labels()
    completion_axis.legend(
        [bars, *line_handles],
        ["Frontier completion", *line_names],
        frameon=False,
        loc="upper left",
        ncols=2,
    )
    savefig("completion_prefix.png")
 def main() -> None:
    """Regenerate the alignment CSV, JSON and all four plots under ``OUT_DIR``."""
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    rows = list(map(summarize, RUNS))
    # Tables first, then figures, in the same fixed order on every run.
    emitters = (
        write_csv,
        write_json,
        plot_throughput_ratio,
        plot_latency_ratios,
        plot_tp_scaling,
        plot_completion_prefix,
    )
    for emit in emitters:
        emit(rows)
    print(f"Wrote {len(rows)} rows to {OUT_DIR}")
 # Script entry point; main() returns None, so no exit status is forwarded.
 if __name__ == "__main__":
    main()
--- a/tools/postprocess_frontier_smoke.py
+++ b/tools/postprocess_frontier_smoke.py
@@ -0,0 +1,454 @@
 #!/usr/bin/env python3
 """Summarize a Frontier RS1 smoke run."""
 from __future__ import annotations
 import argparse
 import csv
 import json
 import math
 import re
 import sys
 from pathlib import Path
 from typing import Any
 # request_metrics.csv columns that compute_token_weighted_cache needs; when
 # any of them is absent the cache summary is reported as unavailable.
 CACHE_COLUMNS = {
    "request_cached_prefill_tokens",
    "request_prefix_cache_query_blocks",
    "request_prefix_cache_hit_blocks",
 }
 def parse_args() -> argparse.Namespace:
    """Parse the two required path options: ``--run-dir`` and ``--fixture-dir``."""
    cli = argparse.ArgumentParser(description="Postprocess Frontier smoke output.")
    for flag in ("--run-dir", "--fixture-dir"):
        cli.add_argument(flag, required=True, type=Path)
    return cli.parse_args()
 def load_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file and return it, requiring a top-level object.

    Raises:
        ValueError: if the decoded value is not a dict.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: JSON value must be an object")
 def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a UTF-8 JSONL file into a list of objects, skipping blank lines.

    Raises:
        ValueError: if a non-blank line decodes to something other than a
            dict; the message carries the 1-based line number.
    """
    records: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as stream:
        line_number = 0
        for raw in stream:
            line_number += 1
            text = raw.strip()
            if text:
                record = json.loads(text)
                if isinstance(record, dict):
                    records.append(record)
                else:
                    raise ValueError(f"{path}: line {line_number}: expected object")
    return records
 def load_csv(path: Path) -> tuple[list[str], list[dict[str, str]]]:
    """Read a UTF-8 CSV file and return ``(header names, row dicts)``.

    The header list is empty when the file has no header line.
    """
    with path.open("r", encoding="utf-8", newline="") as stream:
        table = csv.DictReader(stream)
        # Reading fieldnames first consumes the header before the rows.
        header = list(table.fieldnames or [])
        body = list(table)
    return header, body
 def find_metrics_dir(run_dir: Path) -> Path:
    """Return the directory holding the run's single ``system_metrics.json``.

    Searches recursively below ``run_dir / "frontier_metrics"``.

    Raises:
        ValueError: if zero or more than one ``system_metrics.json`` is found.
    """
    matches = sorted(run_dir.glob("frontier_metrics/**/system_metrics.json"))
    if len(matches) == 1:
        return matches[0].parent
    raise ValueError(
        f"{run_dir}: expected exactly one system_metrics.json under "
        f"frontier_metrics, found {len(matches)}"
    )
 def read_text_if_exists(path: Path) -> str:
    """Return the file's text, or ``""`` when the path does not exist.

    Decodes as UTF-8 and substitutes the replacement character for invalid
    bytes instead of raising.
    """
    return path.read_text(encoding="utf-8", errors="replace") if path.exists() else ""
 def parse_memory_state(log_text: str) -> dict[str, Any]:
    """Extract block and batch limits from the last ``[MEMORY_STATE]`` log line.

    Returns ``{"available": False}`` when no such line is present. Otherwise
    the result holds the integer ``total_blocks``, ``max_blocks_per_sequence``
    and ``max_batch_size``, the raw ``max_request_slots`` text, and a
    ``source`` note.
    """
    pattern = re.compile(
        r"\[MEMORY_STATE\]\s+total_blocks=(?P<total_blocks>\d+),\s+"
        r"max_blocks_per_sequence=(?P<max_blocks_per_sequence>\d+),\s+"
        r"max_request_slots=(?P<max_request_slots>[^,]+),\s+"
        r"max_batch_size=(?P<max_batch_size>\d+)"
    )
    # Walk every match so that only the final one is kept.
    latest = None
    for latest in pattern.finditer(log_text):
        pass
    if latest is None:
        return {"available": False}
    return {
        "available": True,
        "total_blocks": int(latest.group("total_blocks")),
        "max_blocks_per_sequence": int(latest.group("max_blocks_per_sequence")),
        # Kept as text: the pattern allows any non-comma value here.
        "max_request_slots": latest.group("max_request_slots"),
        "max_batch_size": int(latest.group("max_batch_size")),
        "source": "last [MEMORY_STATE] log line",
    }
 def extract_scheduler_config(config: dict[str, Any]) -> dict[str, Any]:
    """Return ``config["cluster_config"]["replica_scheduler_config"]``.

    Yields an empty dict when either level is missing or is not a dict.
    """
    cluster = config.get("cluster_config")
    scheduler = (
        cluster.get("replica_scheduler_config") if isinstance(cluster, dict) else None
    )
    if isinstance(scheduler, dict):
        return scheduler
    return {}
 def extract_replica_config(config: dict[str, Any]) -> dict[str, Any]:
    """Return ``config["cluster_config"]["replica_config"]``.

    Yields an empty dict when either level is missing or is not a dict.
    """
    cluster_section = config.get("cluster_config")
    if isinstance(cluster_section, dict):
        replica_section = cluster_section.get("replica_config")
        if isinstance(replica_section, dict):
            return replica_section
    return {}
 def compute_token_weighted_cache(
    request_metrics_path: Path,
    sidecar_path: Path,
 ) -> dict[str, Any]:
    """Aggregate prefix-cache hits per block and per token.

    Joins each request_metrics.csv row to its sidecar record by request id.
    Rows with an empty cache column are skipped and their ids reported. For
    the rest, Frontier's query/hit block counts are summed, and hit tokens
    are the sum of the first ``hit_blocks`` sidecar ``block_token_counts``.

    Returns:
        ``{"available": False, "reason": ...}`` when a cache column is missing
        or no row had complete metrics; otherwise the aggregated totals.

    Raises:
        ValueError: for a request id absent from the sidecar, a query-block
            count that differs from the sidecar block count, or more hit
            blocks than query blocks.
    """
    header, metric_rows = load_csv(request_metrics_path)
    absent_columns = sorted(CACHE_COLUMNS - set(header))
    if absent_columns:
        return {
            "available": False,
            "reason": f"request_metrics.csv missing cache columns: {absent_columns}",
        }
    sidecar_lookup = {}
    for entry in load_jsonl(sidecar_path):
        sidecar_lookup[int(entry["request_id"])] = entry
    query_block_sum = 0
    hit_block_sum = 0
    query_token_sum = 0
    hit_token_sum = 0
    frontier_cached_token_sum = 0
    usable_rows = 0
    skipped_request_ids: list[int] = []
    for metric_row in metric_rows:
        # Ids are parsed through float, so "12.0" is accepted as 12.
        request_id = int(float(metric_row["Request Id"]))
        if request_id not in sidecar_lookup:
            raise ValueError(f"request_metrics.csv contains unknown request id {request_id}")
        sidecar = sidecar_lookup[request_id]
        raw_query = metric_row["request_prefix_cache_query_blocks"]
        raw_hit = metric_row["request_prefix_cache_hit_blocks"]
        raw_cached = metric_row["request_cached_prefill_tokens"]
        if "" in (raw_query, raw_hit, raw_cached):
            skipped_request_ids.append(request_id)
            continue
        query_blocks = int(float(raw_query))
        hit_blocks = int(float(raw_hit))
        cached_prefill_tokens = int(float(raw_cached))
        per_block_tokens = [int(count) for count in sidecar["block_token_counts"]]
        input_length = int(sidecar["input_length"])
        sidecar_block_count = len(per_block_tokens)
        if query_blocks != sidecar_block_count:
            raise ValueError(
                f"request {request_id}: query_blocks={query_blocks} does not match "
                f"sidecar blocks={sidecar_block_count}"
            )
        if hit_blocks > query_blocks:
            raise ValueError(
                f"request {request_id}: hit_blocks={hit_blocks} > query_blocks={query_blocks}"
            )
        query_block_sum += query_blocks
        hit_block_sum += hit_blocks
        query_token_sum += input_length
        # Weight the leading hit blocks by their sidecar token counts.
        hit_token_sum += sum(per_block_tokens[:hit_blocks])
        frontier_cached_token_sum += cached_prefill_tokens
        usable_rows += 1
    if usable_rows == 0:
        return {
            "available": False,
            "reason": "no request rows had complete prefix-cache metrics",
            "rows_with_missing_cache_metrics": skipped_request_ids,
        }
    block_hit_ratio = hit_block_sum / query_block_sum if query_block_sum else 0.0
    token_hit_ratio = hit_token_sum / query_token_sum if query_token_sum else 0.0
    return {
        "available": True,
        "completed_request_rows": usable_rows,
        "total_request_metric_rows": len(metric_rows),
        "rows_with_missing_cache_metrics": skipped_request_ids,
        "frontier_block_level": {
            "total_query_blocks": query_block_sum,
            "total_hit_blocks": hit_block_sum,
            "hit_ratio": block_hit_ratio,
            "total_cached_prefill_tokens_frontier_whole_block": frontier_cached_token_sum,
        },
        "replayserve_token_weighted": {
            "total_query_tokens": query_token_sum,
            "total_hit_tokens": hit_token_sum,
            "hit_ratio": token_hit_ratio,
        },
        "semantics": (
            "Frontier reports whole-block hits; ReplayServe weights the first "
            "hit_blocks sidecar block_token_counts, so partial final blocks count "
            "by their true token length when they are hit."
        ),
    }
 def compute_completion_summary(
    system_metrics: dict[str, Any],
    request_metrics_path: Path,
 ) -> dict[str, Any]:
    """Report whether every request completed and has an end-to-end latency.

    Request ids with an empty ``request_e2e_time`` are collected only when
    the CSV has both ``Request Id`` and ``request_e2e_time`` columns. Counts
    come from ``simulation_metadata``; ``total_requests`` falls back to the
    CSV row count and ``completed_requests`` to 0 when missing or falsy.
    """
    header, metric_rows = load_csv(request_metrics_path)
    if "Request Id" in header and "request_e2e_time" in header:
        ids_without_latency = [
            int(float(entry["Request Id"]))
            for entry in metric_rows
            if entry.get("request_e2e_time", "") == ""
        ]
    else:
        ids_without_latency = []
    metadata = system_metrics.get("simulation_metadata", {})
    row_total = len(metric_rows)
    total_requests = int(metadata.get("total_requests") or row_total)
    completed_requests = int(metadata.get("completed_requests") or 0)
    # Complete means: a non-empty run, all finished, none lacking a latency.
    incomplete = (
        total_requests <= 0
        or completed_requests != total_requests
        or bool(ids_without_latency)
    )
    return {
        "is_complete": not incomplete,
        "total_requests": total_requests,
        "completed_requests": completed_requests,
        "request_metric_rows": row_total,
        "missing_latency_request_ids": ids_without_latency,
    }
 def get_nested(data: dict[str, Any], *keys: str) -> Any:
    """Follow ``keys`` through nested dicts and return what they lead to.

    Returns ``None`` if a key is missing or a non-dict is met before all
    keys are consumed. With no keys, ``data`` itself is returned.
    """
    cursor: Any = data
    for key in keys:
        if isinstance(cursor, dict):
            cursor = cursor.get(key)
        else:
            return None
    return cursor
 def estimate_memory_planner_blocks(
    *,
    config: dict[str, Any],
    scheduler_config: dict[str, Any],
    model_weight_memory: dict[str, Any] | None,
 ) -> dict[str, Any]:
    """Estimate the KV-cache block count from the run configuration.

    Used as a fallback when no ``[MEMORY_STATE]`` log line was found.

    Args:
        config: Frontier run config; ``cluster_config.replica_config`` must
            hold ``model_config``, ``device_config`` and
            ``attn_tensor_parallel_size``.
        scheduler_config: replica scheduler config supplying ``block_size``
            and optionally ``gpu_memory_utilization`` and
            ``non_kv_cache_overhead_bytes``.
        model_weight_memory: dict with ``total_memory_bytes``, or ``None``.

    Returns:
        ``{"available": False, "reason": ...}`` when an input is missing or a
        divisor (``block_size``, ``num_layers``,
        ``attn_tensor_parallel_size``, ``num_q_heads``) is not positive.
        Otherwise ``{"available": True, "total_blocks": ..., ...}`` with the
        intermediate quantities.

    Raises:
        KeyError: if a required key (for example ``total_memory_gb`` or
            ``num_kv_heads``) is absent from its config section.
    """
    replica_config = extract_replica_config(config)
    model_config = replica_config.get("model_config")
    device_config = replica_config.get("device_config")
    if (
        not isinstance(model_config, dict)
        or not isinstance(device_config, dict)
        or not isinstance(model_weight_memory, dict)
    ):
        return {"available": False, "reason": "missing model/device/weight config"}
    # `or 0` also covers an explicit null block_size in the config.
    block_size = int(scheduler_config.get("block_size") or 0)
    if block_size <= 0:
        return {"available": False, "reason": "missing positive block_size"}
    # Both values are divisors below; reject non-positive ones up front so a
    # bad fallback estimate cannot abort the whole summary.
    num_layers = int(model_config["num_layers"])
    attn_tp = int(replica_config["attn_tensor_parallel_size"])
    if num_layers <= 0 or attn_tp <= 0:
        return {
            "available": False,
            "reason": "missing positive num_layers/attn_tensor_parallel_size",
        }
    total_memory_gb = float(device_config["total_memory_gb"])
    gpu_memory_utilization = scheduler_config.get("gpu_memory_utilization")
    if gpu_memory_utilization is None:
        # No explicit utilization: use 1 - memory margin (margin defaults to 0.1).
        gpu_memory_utilization = 1.0 - float(replica_config.get("memory_margin_fraction", 0.1))
    gpu_memory_utilization = float(gpu_memory_utilization)
    parameter_memory_bytes = int(model_weight_memory["total_memory_bytes"])
    overhead_bytes = int(scheduler_config.get("non_kv_cache_overhead_bytes") or 0)
    requested_memory_bytes = int(total_memory_gb * 1024**3 * gpu_memory_utilization)
    available_kv_cache_memory_bytes = (
        requested_memory_bytes - parameter_memory_bytes - overhead_bytes
    )
    head_dim = model_config.get("head_dim")
    if head_dim is None:
        # embedding_dim and num_q_heads are only needed to derive head_dim.
        num_q_heads = int(model_config["num_q_heads"])
        if num_q_heads <= 0:
            return {"available": False, "reason": "missing positive num_q_heads"}
        head_dim = int(model_config["embedding_dim"]) // num_q_heads
    head_dim = int(head_dim)
    num_kv_heads = int(model_config["num_kv_heads"])
    kv_heads_per_tensor_parallel_worker = math.ceil(num_kv_heads / attn_tp)
    # NOTE(review): the 2 * 2 factor presumably means K and V tensors at two
    # bytes per element -- confirm against Frontier's MemoryPlanner.
    page_size_bytes_per_layer_per_block = (
        2 * 2 * block_size * kv_heads_per_tensor_parallel_worker * head_dim
    )
    if available_kv_cache_memory_bytes <= 0 or page_size_bytes_per_layer_per_block <= 0:
        derived_num_blocks = 0
    else:
        derived_num_blocks = int(
            available_kv_cache_memory_bytes
            // page_size_bytes_per_layer_per_block
            // num_layers
        )
    return {
        "available": True,
        "source": "ReplayServe fallback using Frontier MemoryPlanner.get_num_blocks formula",
        "total_blocks": derived_num_blocks,
        "requested_memory_bytes": requested_memory_bytes,
        "parameter_memory_per_device_bytes": parameter_memory_bytes,
        "non_kv_cache_overhead_bytes": overhead_bytes,
        "available_kv_cache_memory_bytes": available_kv_cache_memory_bytes,
        "block_size": block_size,
        "num_layers": num_layers,
        "head_dim": head_dim,
        "num_kv_heads": num_kv_heads,
        "attn_tensor_parallel_size": attn_tp,
        "kv_heads_per_tensor_parallel_worker": kv_heads_per_tensor_parallel_worker,
        "page_size_bytes_per_layer_per_block": page_size_bytes_per_layer_per_block,
        "gpu_memory_utilization": gpu_memory_utilization,
        "total_memory_gb": total_memory_gb,
    }
 def main() -> int:
    """Summarize one Frontier smoke run as JSON and Markdown.

    Reads ``system_metrics.json``, ``request_metrics.csv`` and
    ``config.json`` from the run's metrics directory, plus ``sidecar.jsonl``
    from the fixture directory and any ``stdout.log`` / ``stderr.log``.
    Writes ``postprocess_summary.json`` and ``postprocess_summary.md`` into
    the run directory.

    Returns:
        0 on success; 1 after printing the error to stderr if anything raised.
    """
    args = parse_args()
    try:
        run_dir = args.run_dir
        fixture_dir = args.fixture_dir
        metrics_dir = find_metrics_dir(run_dir)
        system_metrics_path = metrics_dir / "system_metrics.json"
        request_metrics_path = metrics_dir / "request_metrics.csv"
        config_path = metrics_dir / "config.json"
        sidecar_path = fixture_dir / "sidecar.jsonl"
        system_metrics = load_json(system_metrics_path)
        config = load_json(config_path)
        scheduler_config = extract_scheduler_config(config)
        stdout_text = read_text_if_exists(run_dir / "stdout.log")
        stderr_text = read_text_if_exists(run_dir / "stderr.log")
        log_text = "\n".join((stdout_text, stderr_text))
        memory_state = parse_memory_state(log_text)
        completion_summary = compute_completion_summary(
            system_metrics, request_metrics_path
        )
        cache_summary = compute_token_weighted_cache(request_metrics_path, sidecar_path)
        model_weight_memory = get_nested(system_metrics, "model_weight_memory", "MONOLITHIC")
        if not memory_state.get("available"):
            # No [MEMORY_STATE] line in the logs: estimate from the config.
            memory_state = estimate_memory_planner_blocks(
                config=config,
                scheduler_config=scheduler_config,
                model_weight_memory=model_weight_memory,
            )
        preemption_statistics = system_metrics.get("preemption_statistics", {})
        pressure_pattern = re.compile(
            r"preempt|insufficient|cannot allocate|allocation pressure|oom",
            flags=re.IGNORECASE,
        )
        allocation_pressure_lines = [
            log_line
            for log_line in log_text.splitlines()
            if pressure_pattern.search(log_line)
        ]
        pressure_line_count = len(allocation_pressure_lines)
        planner = {
            "mode": scheduler_config.get("num_blocks_mode"),
            "gpu_memory_utilization": scheduler_config.get("gpu_memory_utilization"),
            "non_kv_cache_overhead_bytes": scheduler_config.get(
                "non_kv_cache_overhead_bytes"
            ),
            "derived": memory_state,
            "model_weight_memory_monolithic": model_weight_memory,
            "assumption": (
                "RS1 uses Frontier memory_planner with analytical parameter "
                "memory and non_kv_cache_overhead_bytes=0 for plumbing smoke."
            ),
        }
        summary = {
            "run_dir": str(run_dir),
            "fixture_dir": str(fixture_dir),
            "metrics_dir": str(metrics_dir),
            "system_metrics": str(system_metrics_path),
            "request_metrics": str(request_metrics_path),
            "config": str(config_path),
            "frontier_prefix_cache_statistics": system_metrics.get(
                "prefix_cache_statistics"
            ),
            "completion": completion_summary,
            "prefix_cache_postprocess": cache_summary,
            "memory_planner": planner,
            "preemption_statistics": preemption_statistics,
            "allocation_pressure_log_line_count": pressure_line_count,
            # Only the first 20 matching lines go into the excerpt.
            "allocation_pressure_log_excerpt": allocation_pressure_lines[:20],
        }
        json_target = run_dir / "postprocess_summary.json"
        markdown_target = run_dir / "postprocess_summary.md"
        with json_target.open("w", encoding="utf-8") as json_out:
            json.dump(summary, json_out, indent=2, sort_keys=True)
            json_out.write("\n")
        with markdown_target.open("w", encoding="utf-8") as report:
            report.write(f"# RS1 Frontier Smoke: {fixture_dir.name}\n\n")
            report.write(f"- Metrics dir: `{metrics_dir}`\n")
            report.write(f"- Frontier system metrics: `{system_metrics_path}`\n")
            report.write(f"- Frontier request metrics: `{request_metrics_path}`\n")
            report.write(
                "- Completion: "
                f"`{completion_summary['completed_requests']}/"
                f"{completion_summary['total_requests']}`\n"
            )
            missing_latency_rows = completion_summary.get("missing_latency_request_ids") or []
            if missing_latency_rows:
                report.write(
                    "- Missing latency request rows: "
                    f"`{missing_latency_rows}`\n"
                )
            if not cache_summary.get("available"):
                report.write(
                    f"- Prefix cache postprocess unavailable: {cache_summary.get('reason')}\n"
                )
            else:
                frontier_ratio = cache_summary["frontier_block_level"]["hit_ratio"]
                token_ratio = cache_summary["replayserve_token_weighted"]["hit_ratio"]
                report.write(f"- Frontier block-level prefix hit ratio: {frontier_ratio:.8f}\n")
                report.write(f"- ReplayServe token-weighted prefix hit ratio: {token_ratio:.8f}\n")
                missing_cache_rows = cache_summary.get("rows_with_missing_cache_metrics") or []
                if missing_cache_rows:
                    report.write(
                        "- Prefix-cache metric rows skipped: "
                        f"`{missing_cache_rows}`\n"
                    )
            derived = planner.get("derived", {})
            report.write(f"- Memory planner mode: `{planner.get('mode')}`\n")
            report.write(f"- GPU memory utilization: `{planner.get('gpu_memory_utilization')}`\n")
            report.write(
                f"- Non-KV overhead bytes assumption: `{planner.get('non_kv_cache_overhead_bytes')}`\n"
            )
            if derived.get("available"):
                report.write(f"- Derived KV blocks: `{derived.get('total_blocks')}`\n")
                report.write(f"- Max batch size: `{derived.get('max_batch_size', 'n/a')}`\n")
            else:
                report.write("- Derived KV blocks: not found in logs\n")
            preemptions = preemption_statistics.get("total_preemption_events")
            report.write(f"- Total preemption events: `{preemptions}`\n")
            report.write(
                f"- Allocation/preemption/OOM log lines: `{pressure_line_count}`\n"
            )
    except Exception as exc:
        print(f"postprocess_frontier_smoke.py: error: {exc}", file=sys.stderr)
        return 1
    return 0
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/tools/qwen_to_frontier.py
+++ b/tools/qwen_to_frontier.py
@@ -0,0 +1,391 @@
 #!/usr/bin/env python3
 """Convert Qwen JSONL traces to Frontier trace-replay CSV fixtures."""
 from __future__ import annotations
 import argparse
 import csv
 import json
 import math
 import os
 import sys
 from pathlib import Path
 from typing import Any
 # Column names, in order, of the Frontier trace CSV; main() passes this list
 # to csv.DictWriter as the header and records it in the manifest.
 CSV_FIELDS = [
    "arrived_at",
    "num_prefill_tokens",
    "num_decode_tokens",
    "session_id",
    "block_hash_ids",
 ]
 # Keys of each sidecar JSONL record, recorded in the manifest as
 # "sidecar_fields". Records are dumped with sort_keys=True, so this list
 # names the keys and does not fix their on-disk order.
 SIDECAR_FIELDS = [
    "request_id",
    "chat_id",
    "parent_chat_id",
    "turn",
    "type",
    "timestamp",
    "input_length",
    "output_length",
    "hash_ids",
    "block_token_counts",
 ]
 def positive_int(value: str) -> int:
    """argparse ``type=`` callable that accepts integers greater than zero.

    Raises:
        argparse.ArgumentTypeError: if the parsed integer is zero or negative.
        ValueError: if ``value`` is not an integer literal.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")
 def positive_float(value: str) -> float:
    """argparse ``type=`` callable that accepts finite floats greater than zero.

    ``nan`` and ``inf`` are rejected explicitly: ``nan <= 0`` is False, so a
    plain sign check would let them through and they would then be multiplied
    into every timestamp.

    Raises:
        argparse.ArgumentTypeError: if the value is non-finite, zero or
            negative.
        ValueError: if ``value`` is not a float literal.
    """
    parsed = float(value)
    if not math.isfinite(parsed):
        raise argparse.ArgumentTypeError("must be a finite positive number")
    if parsed <= 0:
        raise argparse.ArgumentTypeError("must be positive")
    return parsed
 def parse_args() -> argparse.Namespace:
    """Define and parse the converter's command line.

    ``--input``, ``--frontier-csv`` and ``--sidecar-jsonl`` are required. The
    source slice, manifest, fixture name and row limit are optional.
    ``--max-tokens`` defaults to 32768, ``--block-size`` to 16 and
    ``--timestamp-scale`` to 1.0.
    """
    cli = argparse.ArgumentParser(
        description="Convert Qwen JSONL to Frontier CSV plus ReplayServe sidecar."
    )
    # Required input and outputs.
    cli.add_argument("--input", required=True, type=Path, help="Qwen JSONL path.")
    cli.add_argument(
        "--frontier-csv",
        required=True,
        type=Path,
        help="Output Frontier CSV path.",
    )
    cli.add_argument(
        "--sidecar-jsonl",
        required=True,
        type=Path,
        help="Output ReplayServe sidecar JSONL path.",
    )
    # Optional fixture artifacts.
    cli.add_argument(
        "--source-jsonl",
        type=Path,
        help="Optional path for the original source JSONL slice.",
    )
    cli.add_argument(
        "--manifest-json",
        type=Path,
        help="Optional path for fixture manifest JSON.",
    )
    cli.add_argument(
        "--fixture-name",
        help="Optional fixture name stored in the manifest.",
    )
    # Conversion limits and scaling.
    cli.add_argument(
        "--limit",
        type=positive_int,
        help="Maximum number of rows to convert.",
    )
    cli.add_argument("--max-tokens", type=positive_int, default=32768)
    cli.add_argument("--block-size", type=positive_int, default=16)
    cli.add_argument(
        "--timestamp-scale",
        type=positive_float,
        default=1.0,
        help="Multiply each source timestamp before writing fixture files.",
    )
    cli.add_argument(
        "--fail-on-overflow",
        action="store_true",
        help="Hard fail if input_length + output_length exceeds --max-tokens.",
    )
    return cli.parse_args()
 def require_int(row: dict[str, Any], key: str, line_number: int) -> int:
    """Return ``row[key]``, insisting that it is an int and not a bool.

    Raises:
        ValueError: if the key is absent or the value is not a plain int;
            the message is prefixed with ``line_number``.
    """
    try:
        candidate = row[key]
    except KeyError as exc:
        raise ValueError(f"line {line_number}: missing field {key!r}") from exc
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(candidate, int) and not isinstance(candidate, bool):
        return candidate
    raise ValueError(f"line {line_number}: field {key!r} must be an int")
 def require_number(row: dict[str, Any], key: str, line_number: int) -> int | float:
    """Return ``row[key]``, insisting that it is an int or float and not a bool.

    Raises:
        ValueError: if the key is absent or the value is not numeric; the
            message is prefixed with ``line_number``.
    """
    try:
        candidate = row[key]
    except KeyError as exc:
        raise ValueError(f"line {line_number}: missing field {key!r}") from exc
    is_numeric = isinstance(candidate, (int, float)) and not isinstance(candidate, bool)
    if not is_numeric:
        raise ValueError(f"line {line_number}: field {key!r} must be numeric")
    return candidate
 def require_hash_ids(row: dict[str, Any], line_number: int) -> list[int]:
    """Return a copy of ``row["hash_ids"]`` after checking it is a list of ints.

    Raises:
        ValueError: if the field is absent, is not a list, or holds an
            element that is a bool or not an int. The message names the
            offending index and is prefixed with ``line_number``.
    """
    try:
        raw_ids = row["hash_ids"]
    except KeyError as exc:
        raise ValueError(f"line {line_number}: missing field 'hash_ids'") from exc
    if not isinstance(raw_ids, list):
        raise ValueError(f"line {line_number}: field 'hash_ids' must be a list")
    for position, element in enumerate(raw_ids):
        if isinstance(element, int) and not isinstance(element, bool):
            continue
        raise ValueError(
            f"line {line_number}: hash_ids[{position}] must be an int"
        )
    # Hand back a fresh list so that callers never alias the input row.
    return list(raw_ids)
 def block_token_counts(input_length: int, hash_count: int, block_size: int) -> list[int]:
    """Return the token count of each of ``hash_count`` prompt blocks.

    All blocks but the last hold ``block_size`` tokens. The last holds
    ``input_length % block_size``, or ``block_size`` when that remainder is
    zero. An empty list is returned for ``hash_count == 0``.
    """
    if hash_count == 0:
        return []
    # A zero remainder means the final block is exactly full.
    tail = input_length % block_size or block_size
    counts = [block_size] * (hash_count - 1)
    counts.append(tail)
    return counts
 def convert_row(
    row: dict[str, Any],
    request_id: int,
    line_number: int,
    block_size: int,
    max_tokens: int,
    fail_on_overflow: bool,
    timestamp_scale: float,
 ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
    """Validate one Qwen trace record and map it to the fixture formats.

    Returns:
        ``(frontier_row, sidecar_row, stats)``: the Frontier CSV row, the
        sidecar record carrying ``request_id`` and per-block token counts,
        and per-row statistics for the manifest.

    Raises:
        ValueError: for missing or mistyped fields, non-positive lengths, a
            ``hash_ids`` length that differs from
            ``ceil(input_length / block_size)``, or, when
            ``fail_on_overflow`` is set, a total above ``max_tokens``.
    """
    session = require_int(row, "chat_id", line_number)
    parent_session = require_int(row, "parent_chat_id", line_number)
    arrival = float(require_number(row, "timestamp", line_number)) * timestamp_scale
    prompt_tokens = require_int(row, "input_length", line_number)
    decode_tokens = require_int(row, "output_length", line_number)
    turn = require_int(row, "turn", line_number)
    # "type" is optional and passed through unvalidated.
    kind = row.get("type")
    hashes = require_hash_ids(row, line_number)
    for field_name, amount in (
        ("input_length", prompt_tokens),
        ("output_length", decode_tokens),
    ):
        if amount <= 0:
            raise ValueError(f"line {line_number}: {field_name} must be positive")
    hash_total = len(hashes)
    needed_hashes = math.ceil(prompt_tokens / block_size)
    if hash_total != needed_hashes:
        raise ValueError(
            f"line {line_number}: len(hash_ids)={hash_total} does not match "
            f"ceil(input_length / block_size)={needed_hashes}"
        )
    combined_tokens = prompt_tokens + decode_tokens
    exceeds_budget = combined_tokens > max_tokens
    if exceeds_budget and fail_on_overflow:
        raise ValueError(
            f"line {line_number}: total_tokens={combined_tokens} exceeds "
            f"max_tokens={max_tokens}"
        )
    per_block = block_token_counts(prompt_tokens, hash_total, block_size)
    frontier_row = {
        "arrived_at": arrival,
        "num_prefill_tokens": prompt_tokens,
        "num_decode_tokens": decode_tokens,
        "session_id": session,
        "block_hash_ids": "|".join(map(str, hashes)),
    }
    sidecar_row = {
        "request_id": request_id,
        "chat_id": session,
        "parent_chat_id": parent_session,
        "turn": turn,
        "type": kind,
        "timestamp": arrival,
        "input_length": prompt_tokens,
        "output_length": decode_tokens,
        "hash_ids": hashes,
        "block_token_counts": per_block,
    }
    stats = {
        "total_tokens": combined_tokens,
        "input_length": prompt_tokens,
        "output_length": decode_tokens,
        "timestamp": arrival,
        "partial_final_block": prompt_tokens % block_size != 0,
        "overflow": exceeds_budget,
    }
    return frontier_row, sidecar_row, stats
 def tmp_path(path: Path) -> Path:
    """Return the hidden sibling ``.<name>.tmp`` used to stage writes to ``path``."""
    staged_name = "." + path.name + ".tmp"
    return path.with_name(staged_name)
 def ensure_parent(path: Path | None) -> None:
    """Create the parent directory of ``path``; do nothing when it is ``None``."""
    if path is None:
        return
    path.parent.mkdir(parents=True, exist_ok=True)
 def publish_tmp_files(paths: list[tuple[Path, Path]]) -> None:
    """Move each staged file onto its final path, replacing any existing file.

    ``paths`` holds ``(temporary, final)`` pairs, processed in list order.
    """
    for staged, destination in paths:
        staged.replace(destination)
 def cleanup_tmp_files(paths: list[tuple[Path, Path]]) -> None:
    """Delete the staged file of every ``(temporary, final)`` pair.

    Staged files that are already gone are ignored; final paths are left
    untouched.
    """
    for staged, _final in paths:
        staged.unlink(missing_ok=True)
 def main() -> int:
    """Convert a Qwen JSONL trace into Frontier fixture files.

    Streams ``--input`` and writes the Frontier CSV and the sidecar JSONL,
    plus the source slice and manifest when requested. Everything is staged
    in hidden ``.<name>.tmp`` siblings and only moved into place once all
    rows converted. On failure the staged files are removed.

    ``request_id`` is the 0-based index of the converted row, not the source
    line index. Blank input lines are skipped, so the id stays aligned with
    the CSV row order and the source slice. Error messages still quote the
    1-based source line number.

    Returns:
        0 on success; 1 after printing the error to stderr when conversion
        or publishing failed.
    """
    args = parse_args()
    for output_path in (
        args.frontier_csv,
        args.sidecar_jsonl,
        args.source_jsonl,
        args.manifest_json,
    ):
        ensure_parent(output_path)
    temporary_paths: list[tuple[Path, Path]] = [
        (tmp_path(args.frontier_csv), args.frontier_csv),
        (tmp_path(args.sidecar_jsonl), args.sidecar_jsonl),
    ]
    if args.source_jsonl is not None:
        temporary_paths.append((tmp_path(args.source_jsonl), args.source_jsonl))
    if args.manifest_json is not None:
        temporary_paths.append((tmp_path(args.manifest_json), args.manifest_json))
    row_count = 0
    overflow_count = 0
    max_total_tokens = 0
    max_input_length = 0
    max_output_length = 0
    first_timestamp: float | None = None
    last_timestamp: float | None = None
    timestamp_monotonic = True
    partial_final_block_rows = 0
    try:
        with (
            args.input.open("r", encoding="utf-8") as input_file,
            tmp_path(args.frontier_csv).open("w", encoding="utf-8", newline="") as csv_file,
            tmp_path(args.sidecar_jsonl).open("w", encoding="utf-8") as sidecar_file,
        ):
            csv_writer = csv.DictWriter(
                csv_file, fieldnames=CSV_FIELDS, lineterminator="\n"
            )
            csv_writer.writeheader()
            source_file = None
            if args.source_jsonl is not None:
                source_file = tmp_path(args.source_jsonl).open("w", encoding="utf-8")
            try:
                for line_number, raw_line in enumerate(input_file, start=1):
                    if args.limit is not None and row_count >= args.limit:
                        break
                    stripped = raw_line.strip()
                    if not stripped:
                        continue
                    # Report decode failures with the source line number; the
                    # decoder's own position is relative to this one line.
                    try:
                        row = json.loads(stripped)
                    except json.JSONDecodeError as exc:
                        raise ValueError(
                            f"line {line_number}: invalid JSON: {exc}"
                        ) from exc
                    if not isinstance(row, dict):
                        raise ValueError(
                            f"line {line_number}: expected a JSON object"
                        )
                    frontier_row, sidecar_row, stats = convert_row(
                        row=row,
                        # Dense index of converted rows: skipped blank lines
                        # must not leave gaps in the ids.
                        request_id=row_count,
                        line_number=line_number,
                        block_size=args.block_size,
                        max_tokens=args.max_tokens,
                        fail_on_overflow=args.fail_on_overflow,
                        timestamp_scale=args.timestamp_scale,
                    )
                    csv_writer.writerow(frontier_row)
                    sidecar_file.write(
                        json.dumps(sidecar_row, sort_keys=True, separators=(",", ":"))
                        + "\n"
                    )
                    if source_file is not None:
                        if args.timestamp_scale == 1.0:
                            # Unscaled: keep the original bytes of the line.
                            source_file.write(
                                raw_line if raw_line.endswith("\n") else raw_line + "\n"
                            )
                        else:
                            # Scaled: re-serialize with the scaled timestamp.
                            source_row = dict(row)
                            source_row["timestamp"] = stats["timestamp"]
                            source_file.write(
                                json.dumps(
                                    source_row, sort_keys=True, separators=(",", ":")
                                )
                                + "\n"
                            )
                    row_count += 1
                    overflow_count += int(stats["overflow"])
                    max_total_tokens = max(max_total_tokens, int(stats["total_tokens"]))
                    max_input_length = max(max_input_length, int(stats["input_length"]))
                    max_output_length = max(max_output_length, int(stats["output_length"]))
                    partial_final_block_rows += int(stats["partial_final_block"])
                    timestamp = float(stats["timestamp"])
                    if first_timestamp is None:
                        first_timestamp = timestamp
                    if last_timestamp is not None and timestamp < last_timestamp:
                        timestamp_monotonic = False
                    last_timestamp = timestamp
            finally:
                # Opened outside the `with`, so it is closed by hand.
                if source_file is not None:
                    source_file.close()
        if args.manifest_json is not None:
            manifest = {
                "fixture_name": args.fixture_name,
                "generated_by": "tools/qwen_to_frontier.py",
                "input_jsonl": str(args.input),
                "source_jsonl": str(args.source_jsonl) if args.source_jsonl else None,
                "frontier_csv": str(args.frontier_csv),
                "sidecar_jsonl": str(args.sidecar_jsonl),
                "csv_fields": CSV_FIELDS,
                "sidecar_fields": SIDECAR_FIELDS,
                "limit": args.limit,
                "row_count": row_count,
                "block_size": args.block_size,
                "max_tokens": args.max_tokens,
                "fail_on_overflow": args.fail_on_overflow,
                "timestamp_scale": args.timestamp_scale,
                "overflow_count": overflow_count,
                "max_total_tokens": max_total_tokens,
                "max_input_length": max_input_length,
                "max_output_length": max_output_length,
                "first_timestamp": first_timestamp,
                "last_timestamp": last_timestamp,
                "timestamp_monotonic": timestamp_monotonic,
                "partial_final_block_rows": partial_final_block_rows,
                "adapter_semantics": {
                    "timestamp": "arrived_at",
                    "input_length": "num_prefill_tokens",
                    "output_length": "num_decode_tokens",
                    "chat_id": "session_id",
                    "hash_ids": "block_hash_ids joined by |",
                    "block_token_counts": (
                        "full blocks use block_size tokens; final partial block "
                        "uses input_length % block_size, or block_size when zero"
                    ),
                },
            }
            with tmp_path(args.manifest_json).open("w", encoding="utf-8") as manifest_file:
                json.dump(manifest, manifest_file, indent=2, sort_keys=True)
                manifest_file.write("\n")
        # All staged files are complete; move them into place.
        publish_tmp_files(temporary_paths)
    except Exception as exc:
        cleanup_tmp_files(temporary_paths)
        print(f"qwen_to_frontier.py: error: {exc}", file=sys.stderr)
        return 1
    if overflow_count and not args.fail_on_overflow:
        print(
            f"qwen_to_frontier.py: warning: {overflow_count} rows exceed "
            f"max_tokens={args.max_tokens}; no clipping was applied",
            file=sys.stderr,
        )
    print(
        f"converted rows={row_count} max_total_tokens={max_total_tokens} "
        f"overflows={overflow_count}",
        file=sys.stderr,
    )
    return 0
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/tools/run_frontier_sweep.py
+++ b/tools/run_frontier_sweep.py
@@ -0,0 +1,534 @@
 #!/usr/bin/env python3
 """Run a small Frontier sweep from a ReplayServe JSON config."""
 from __future__ import annotations
 import argparse
 import json
 import os
 import shutil
 import subprocess
 import sys
 import time
 from pathlib import Path
 from typing import Any
 REPLAYSERVE_ROOT = Path(__file__).resolve().parents[1]
 def parse_args() -> argparse.Namespace:
    """Define the sweep runner's command-line options and parse ``sys.argv``.

    Returns:
        The parsed namespace with ``config``, ``suite_id``, ``run_root``,
        ``only_config``, ``only_fixture``, ``dry_run`` and ``force``.
    """
    cli = argparse.ArgumentParser(description="Run Frontier configs from JSON.")
    default_config = REPLAYSERVE_ROOT / "configs" / "rs3_tiny_sweep.json"
    cli.add_argument("--config", type=Path, default=default_config, help="Sweep JSON config.")
    cli.add_argument("--suite-id", help="Override suite_id from the config.")
    cli.add_argument(
        "--run-root",
        type=Path,
        help="Override run root. Defaults to runs/<suite_id>.",
    )
    # Repeatable selectors: each occurrence appends one value to the list.
    for selector_flag, noun in (
        ("--only-config", "a config id"),
        ("--only-fixture", "a fixture"),
    ):
        cli.add_argument(
            selector_flag,
            action="append",
            default=[],
            help=f"Run only {noun}. Can be repeated.",
        )
    # Plain boolean switches.
    for switch_flag, help_text in (
        ("--dry-run", "Write manifests and commands, but do not execute Frontier."),
        ("--force", "Replace existing run dirs selected by this invocation."),
    ):
        cli.add_argument(switch_flag, action="store_true", help=help_text)
    return cli.parse_args()
 def load_json(path: Path) -> dict[str, Any]:
    """Read ``path`` as UTF-8 JSON and return its top-level object.

    Raises:
        ValueError: if the top-level JSON value is not an object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: top-level JSON must be an object")
 def git_head(path: Path) -> str | None:
    """Return the commit hash of ``HEAD`` for the git checkout at ``path``.

    Args:
        path: Directory passed to ``git -C``.

    Returns:
        The stripped output of ``git rev-parse HEAD``, or ``None`` when the
        hash cannot be determined (git exits non-zero, or the ``git``
        executable cannot be launched at all).
    """
    try:
        result = subprocess.run(
            ["git", "-C", str(path), "rev-parse", "HEAD"],
            check=True,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable git binary; provenance is
        # best-effort and must not abort the caller.
        return None
    return result.stdout.strip()
 def git_status(path: Path) -> str | None:
    """Return ``git status --short`` output for the checkout at ``path``.

    Args:
        path: Directory passed to ``git -C``.

    Returns:
        The unmodified stdout of ``git status --short`` (empty string for a
        clean tree), or ``None`` when git exits non-zero or the ``git``
        executable cannot be launched at all.
    """
    try:
        result = subprocess.run(
            ["git", "-C", str(path), "status", "--short"],
            check=True,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable git binary; provenance is
        # best-effort and must not abort the caller.
        return None
    return result.stdout
 def shell_join(argv: list[str]) -> str:
    """Render ``argv`` as one space-separated string with each part shell-quoted."""
    from shlex import quote

    quoted_parts = [quote(token) for token in argv]
    return " ".join(quoted_parts)
 def merge_config(defaults: dict[str, Any], item: dict[str, Any]) -> dict[str, Any]:
    """Overlay one config entry's ``overrides`` on the sweep defaults.

    A missing or ``None`` ``overrides`` value is treated as an empty mapping.
    When the merged knobs contain ``max_num_seqs`` and the entry's overrides do
    not set ``batch_size_cap``, ``batch_size_cap`` is set to ``max_num_seqs``.

    Returns:
        A new dict; ``defaults`` is not modified.

    Raises:
        ValueError: if ``overrides`` is present but not a dict.
    """
    raw_overrides = item.get("overrides", {})
    overrides = {} if raw_overrides is None else raw_overrides
    if not isinstance(overrides, dict):
        raise ValueError(f"config {item.get('id')}: overrides must be an object")
    merged = {**defaults, **overrides}
    if "batch_size_cap" not in overrides and "max_num_seqs" in merged:
        merged["batch_size_cap"] = merged["max_num_seqs"]
    return merged
 def build_frontier_command(
    *,
    python_bin: str,
    trace_file: Path,
    metrics_root: Path,
    run_id: str,
    knobs: dict[str, Any],
 ) -> list[str]:
    """Assemble the argv list that launches ``frontier.main`` for one run.

    Args:
        python_bin: Interpreter placed first in the command.
        trace_file: Trace CSV passed as the trace-replay input.
        metrics_root: Directory passed as the metrics output dir.
        run_id: Value passed as the metrics run id.
        knobs: Merged configuration values. Most are required (looked up with
            ``knobs[...]``); optional ones are read with ``knobs.get``.

    Returns:
        The command as a list of strings.

    Raises:
        KeyError: if a required knob is absent.
    """
    predictor_prefix = "--random_forrest_execution_time_predictor_config_"

    def required(pairs: tuple[tuple[str, str], ...]) -> list[str]:
        # Expand (flag, knob name) pairs into [flag, value, ...]; a missing knob
        # raises KeyError in the same order as the pairs are listed.
        expanded: list[str] = []
        for flag, knob_name in pairs:
            expanded += [flag, str(knobs[knob_name])]
        return expanded

    cmd: list[str] = [python_bin, "-m", "frontier.main"]
    cmd += required(
        (
            ("--simulation_mode", "simulation_mode"),
            ("--sys_arch", "sys_arch"),
        )
    )
    cmd += ["--cc_backend_config_type", "analytical"]
    cmd += required(
        (
            ("--cluster_config_num_replicas", "num_replicas"),
            ("--cluster_scheduler_config_type", "cluster_scheduler"),
            ("--replica_config_model_name", "model_name"),
            ("--replica_config_device", "device"),
            ("--replica_config_network_device", "network_device"),
            ("--replica_config_attn_tensor_parallel_size", "attn_tensor_parallel_size"),
            ("--replica_config_attn_data_parallel_size", "attn_data_parallel_size"),
            ("--replica_config_moe_tensor_parallel_size", "moe_tensor_parallel_size"),
            ("--replica_config_moe_expert_parallel_size", "moe_expert_parallel_size"),
            ("--replica_config_num_pipeline_stages", "num_pipeline_stages"),
            ("--replica_scheduler_config_type", "replica_scheduler"),
        )
    )
    cmd += [
        "--decode_cuda_graph_mode",
        str(knobs.get("decode_cuda_graph_mode", "full_decode_only")),
    ]
    cmd += required(
        (
            ("--vllm_v1_scheduler_config_batch_size_cap", "batch_size_cap"),
            ("--vllm_v1_scheduler_config_max_tokens_in_batch", "max_tokens_in_batch"),
            (
                "--vllm_v1_scheduler_config_long_prefill_token_threshold",
                "long_prefill_token_threshold",
            ),
            ("--vllm_v1_scheduler_config_block_size", "block_size"),
            ("--vllm_v1_scheduler_config_num_blocks_mode", "num_blocks_mode"),
            ("--vllm_v1_scheduler_config_gpu_memory_utilization", "gpu_memory_utilization"),
            (
                "--vllm_v1_scheduler_config_non_kv_cache_overhead_bytes",
                "non_kv_cache_overhead_bytes",
            ),
        )
    )
    cmd += [
        "--request_generator_config_type",
        "trace_replay",
        "--trace_request_generator_config_trace_file",
        str(trace_file),
    ]
    cmd += required((("--trace_request_generator_config_max_tokens", "trace_max_tokens"),))
    cmd += [
        "--metrics_config_output_dir",
        str(metrics_root),
        "--metrics_config_run_id",
        run_id,
    ]
    # Fixed metrics switches: tabular metrics on, plots/traces/ledger off.
    cmd += [
        "--metrics_config_write_metrics",
        "--metrics_config_store_request_metrics",
        "--metrics_config_store_batch_metrics",
        "--metrics_config_store_token_completion_metrics",
        "--metrics_config_store_utilization_metrics",
        "--no-metrics_config_store_plots",
        "--no-metrics_config_enable_chrome_trace",
        "--no-metrics_config_write_json_trace",
        "--no-metrics_config_store_frontier_stage_batch_ledger",
    ]

    dummy_mode = bool(knobs.get("enable_dummy_mode", True))
    if dummy_mode:
        dummy_args = [
            "--random_forrest_execution_time_predictor_config_enable_dummy_mode",
            "--random_forrest_execution_time_predictor_config_dummy_execution_time_ms",
            str(knobs["dummy_execution_time_ms"]),
        ]
        cmd += dummy_args
    else:
        cmd.append("--no-random_forrest_execution_time_predictor_config_enable_dummy_mode")
        # Profile input files are forwarded only when set to a truthy value.
        for file_knob in (
            "linear_op_input_file",
            "atten_input_file",
            "moe_input_file",
            "linear_op_kernel_only_input_file",
            "atten_kernel_only_input_file",
            "moe_kernel_only_input_file",
        ):
            file_value = knobs.get(file_knob)
            if file_value:
                cmd += [f"{predictor_prefix}{file_knob}", str(file_value)]
        # Prediction limits are forwarded whenever present, including zero.
        for limit_knob in (
            "prediction_max_prefill_chunk_size",
            "prediction_max_batch_size",
            "prediction_max_tokens_per_request",
        ):
            limit_value = knobs.get(limit_knob)
            if limit_value is not None:
                cmd += [f"{predictor_prefix}{limit_knob}", str(limit_value)]
        if bool(knobs.get("no_cache", False)):
            cmd.append("--random_forrest_execution_time_predictor_config_no_cache")
        if bool(knobs.get("skip_cpu_overhead_modeling", True)):
            cmd.append(
                "--random_forrest_execution_time_predictor_config_skip_cpu_overhead_modeling"
            )

    explicit_num_blocks = knobs.get("num_blocks")
    if explicit_num_blocks is not None:
        cmd += ["--vllm_v1_scheduler_config_num_blocks", str(knobs["num_blocks"])]
    if bool(knobs["enable_prefix_caching"]):
        cmd.append("--vllm_v1_scheduler_config_enable_prefix_caching")
    if bool(knobs["enable_chunked_prefill"]):
        cmd.append("--vllm_v1_scheduler_config_enable_chunked_prefill")
    return cmd
 def write_text(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, replacing any existing content."""
    with path.open("w", encoding="utf-8") as handle:
        handle.write(text)
 def run_one(
    *,
    suite_id: str,
    sim: str,
    frontier_info: dict[str, Any],
    frontier_root: Path,
    fixture: str,
    config_item: dict[str, Any],
    knobs: dict[str, Any],
    run_root: Path,
    python_bin: str,
    python_deps_dir: Path,
    dry_run: bool,
    force: bool,
 ) -> dict[str, Any]:
    """Prepare, optionally execute, and post-process one fixture/config run.

    The run directory is ``run_root/sim/fixture/<config id>``. The function
    always writes ``run_manifest.json``, ``command.txt`` and ``env.txt`` there.
    Unless ``dry_run`` is set it then runs the Frontier command with
    ``frontier_root`` as working directory, captures stdout/stderr to log
    files, and, when Frontier exits with 0, runs
    ``tools/postprocess_frontier_smoke.py`` on the run directory.

    Args:
        suite_id: Suite identifier; recorded and used as run id prefix.
        sim: Simulator name; used as a run directory component.
        frontier_info: Extra Frontier metadata copied into the manifest.
        frontier_root: Frontier checkout; used as cwd and added to PYTHONPATH.
        fixture: Fixture directory name under ``traces/fixtures``.
        config_item: Config entry; only ``id`` and ``description`` are read.
        knobs: Merged knob values passed to ``build_frontier_command``.
        run_root: Root directory under which the run directory is created.
        python_bin: Interpreter used for Frontier and for post-processing.
        python_deps_dir: Prepended to PYTHONPATH when it is a directory.
        dry_run: Write manifests and commands only; execute nothing.
        force: Delete an existing run directory instead of raising.

    Returns:
        A dict with ``status``, ``exit_code``, ``runtime_seconds`` and
        ``postprocess_exit_code``. ``status`` is one of ``dry_run``, ``pass``,
        ``fail``, ``postprocess_fail`` or ``incomplete``.

    Raises:
        FileNotFoundError: if the fixture's ``frontier.csv`` or
            ``sidecar.jsonl`` is missing.
        FileExistsError: if the run directory exists and ``force`` is false.
    """
    config_id = str(config_item["id"])
    fixture_dir = REPLAYSERVE_ROOT / "traces" / "fixtures" / fixture
    trace_file = fixture_dir / "frontier.csv"
    sidecar_file = fixture_dir / "sidecar.jsonl"
    if not trace_file.exists():
        raise FileNotFoundError(f"missing trace file: {trace_file}")
    if not sidecar_file.exists():
        raise FileNotFoundError(f"missing sidecar file: {sidecar_file}")
    run_dir = (run_root / sim / fixture / config_id).resolve()
    metrics_root = (run_dir / "frontier_metrics").resolve()
    # An existing run directory is only replaced when --force was given.
    if run_dir.exists():
        if not force:
            raise FileExistsError(f"run dir exists, use --force to replace: {run_dir}")
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True)
    metrics_root.mkdir(parents=True)
    run_id = f"{suite_id}_{fixture}_{config_id}"
    cmd = build_frontier_command(
        python_bin=python_bin,
        trace_file=trace_file,
        metrics_root=metrics_root,
        run_id=run_id,
        knobs=knobs,
    )
    # PYTHONPATH order: optional deps dir, Frontier checkout, then whatever the
    # caller's environment already had.
    existing_pythonpath = os.environ.get("PYTHONPATH")
    pythonpath_parts = []
    if python_deps_dir.is_dir():
        pythonpath_parts.append(str(python_deps_dir))
    pythonpath_parts.append(str(frontier_root))
    if existing_pythonpath:
        pythonpath_parts.append(existing_pythonpath)
    env = os.environ.copy()
    env.update(
        {
            "PYTHONPATH": ":".join(pythonpath_parts),
            "WANDB_DISABLED": "true",
            "VIDUR_DISABLE_WANDB": "1",
            # Keep a caller-provided log level; default to "info" otherwise.
            "FRONTIER_LOG_LEVEL": env.get("FRONTIER_LOG_LEVEL", "info"),
            "PYTHONDONTWRITEBYTECODE": "1",
        }
    )
    # Record the Frontier checkout state (either may be None if git fails).
    frontier_head = git_head(frontier_root)
    frontier_status = git_status(frontier_root)
    manifest = {
        "suite_id": suite_id,
        "sim": sim,
        "fixture": fixture,
        "config_id": config_id,
        "description": config_item.get("description", ""),
        "run_dir": str(run_dir),
        "metrics_root": str(metrics_root),
        "run_id": run_id,
        "frontier": {
            **frontier_info,
            "root": str(frontier_root),
            "head": frontier_head,
            "status_short": frontier_status,
        },
        "fixture_dir": str(fixture_dir),
        "trace_file": str(trace_file),
        "sidecar_file": str(sidecar_file),
        "knobs": knobs,
        "command": cmd,
    }
    with (run_dir / "run_manifest.json").open("w", encoding="utf-8") as handle:
        json.dump(manifest, handle, indent=2, sort_keys=True)
        handle.write("\n")
    # Human-readable record of the working directory, environment and command.
    write_text(
        run_dir / "command.txt",
        "\n".join(
            [
                f"cd {frontier_root}",
                f"export PYTHONPATH={env['PYTHONPATH']}",
                f"export WANDB_DISABLED={env['WANDB_DISABLED']}",
                f"export VIDUR_DISABLE_WANDB={env['VIDUR_DISABLE_WANDB']}",
                f"export FRONTIER_LOG_LEVEL={env['FRONTIER_LOG_LEVEL']}",
                f"export PYTHONDONTWRITEBYTECODE={env['PYTHONDONTWRITEBYTECODE']}",
                f"command={shell_join(cmd)}",
                "",
            ]
        ),
    )
    write_text(
        run_dir / "env.txt",
        "\n".join(
            [
                f"suite_id={suite_id}",
                f"sim={sim}",
                f"fixture={fixture}",
                f"config_id={config_id}",
                f"replayserve_root={REPLAYSERVE_ROOT}",
                f"frontier_root={frontier_root}",
                f"frontier_head={frontier_head}",
                f"python_deps_dir={python_deps_dir}",
                f"trace_file={trace_file}",
                f"sidecar_file={sidecar_file}",
                f"run_dir={run_dir}",
                f"metrics_root={metrics_root}",
                f"run_id={run_id}",
                "",
            ]
        ),
    )
    # Dry run: record a synthetic success and stop before executing anything.
    if dry_run:
        write_text(run_dir / "exit_code.txt", "0\n")
        status = {
            "status": "dry_run",
            "exit_code": 0,
            "runtime_seconds": 0,
            "postprocess_exit_code": None,
        }
        with (run_dir / "run_status.json").open("w", encoding="utf-8") as handle:
            json.dump(status, handle, indent=2, sort_keys=True)
            handle.write("\n")
        return status
    # Timing uses whole-second wall-clock epochs.
    start_epoch = int(time.time())
    write_text(run_dir / "start_epoch.txt", f"{start_epoch}\n")
    with (run_dir / "stdout.log").open("w", encoding="utf-8") as stdout, (
        run_dir / "stderr.log"
    ).open("w", encoding="utf-8") as stderr:
        # No check=True: a non-zero exit is recorded in the status, not raised.
        proc = subprocess.run(cmd, cwd=frontier_root, env=env, stdout=stdout, stderr=stderr)
    end_epoch = int(time.time())
    runtime_seconds = end_epoch - start_epoch
    write_text(run_dir / "end_epoch.txt", f"{end_epoch}\n")
    write_text(run_dir / "exit_code.txt", f"{proc.returncode}\n")
    write_text(run_dir / "runtime_seconds.txt", f"{runtime_seconds}\n")
    # Post-processing runs only after a successful Frontier exit; otherwise the
    # post-process exit code stays None.
    postprocess_exit_code: int | None = None
    if proc.returncode == 0:
        postprocess_cmd = [
            python_bin,
            str(REPLAYSERVE_ROOT / "tools" / "postprocess_frontier_smoke.py"),
            "--run-dir",
            str(run_dir),
            "--fixture-dir",
            str(fixture_dir),
        ]
        with (run_dir / "postprocess.stdout.log").open("w", encoding="utf-8") as stdout, (
            run_dir / "postprocess.stderr.log"
        ).open("w", encoding="utf-8") as stderr:
            post = subprocess.run(
                postprocess_cmd,
                cwd=REPLAYSERVE_ROOT,
                env={**env, "PYTHONPATH": env["PYTHONPATH"]},
                stdout=stdout,
                stderr=stderr,
            )
        postprocess_exit_code = post.returncode
    # Status: "fail" when Frontier itself failed, "postprocess_fail" when only
    # the post-processing step failed, "pass" otherwise.
    status_name = "pass" if proc.returncode == 0 and postprocess_exit_code in (0, None) else "fail"
    if proc.returncode == 0 and postprocess_exit_code not in (0, None):
        status_name = "postprocess_fail"
    # A passing run is downgraded when its summary reports an incomplete
    # replay, or when the summary file exists but cannot be loaded.
    if status_name == "pass":
        summary_path = run_dir / "postprocess_summary.json"
        if summary_path.exists():
            try:
                summary = load_json(summary_path)
                completion = summary.get("completion", {})
                if isinstance(completion, dict) and not completion.get("is_complete", True):
                    status_name = "incomplete"
            except Exception:
                status_name = "postprocess_fail"
    status = {
        "status": status_name,
        "exit_code": proc.returncode,
        "runtime_seconds": runtime_seconds,
        "postprocess_exit_code": postprocess_exit_code,
    }
    with (run_dir / "run_status.json").open("w", encoding="utf-8") as handle:
        json.dump(status, handle, indent=2, sort_keys=True)
        handle.write("\n")
    return status
 def main() -> int:
    """Run every selected fixture/config combination of a sweep config.

    Loads the sweep JSON, applies the ``--only-fixture`` / ``--only-config``
    filters, validates and merges all selected config entries, then calls
    ``run_one`` for each fixture/config pair and prints one status line per
    run.

    Returns:
        0 when every run ended with status ``pass`` or ``dry_run``, else 1.

    Raises:
        ValueError: for a malformed sweep config or an empty selection.
        FileNotFoundError: if the Frontier root directory does not exist.
    """
    args = parse_args()
    config_path = args.config.resolve()
    config = load_json(config_path)
    suite_id = args.suite_id or str(config.get("suite_id") or "rs3_sweep")
    run_root = args.run_root or (REPLAYSERVE_ROOT / "runs" / suite_id)
    sim = str(config.get("sim") or "frontier")
    frontier_info = config.get("frontier", {})
    if not isinstance(frontier_info, dict):
        raise ValueError("frontier must be an object")
    frontier_root = Path(str(frontier_info.get("root") or "/tmp/toc-llm-sim-research/Frontier"))
    if not frontier_root.is_dir():
        raise FileNotFoundError(f"Frontier root does not exist: {frontier_root}")
    fixtures = [str(value) for value in config.get("fixtures", [])]
    if args.only_fixture:
        selected = set(args.only_fixture)
        fixtures = [value for value in fixtures if value in selected]
    if not fixtures:
        raise ValueError("no fixtures selected")
    defaults = config.get("defaults", {})
    if not isinstance(defaults, dict):
        raise ValueError("defaults must be an object")
    config_items = config.get("configs", [])
    if not isinstance(config_items, list) or not config_items:
        raise ValueError("configs must be a non-empty list")
    if args.only_config:
        selected_configs = set(args.only_config)
        config_items = [
            item
            for item in config_items
            if isinstance(item, dict) and str(item.get("id")) in selected_configs
        ]
    if not config_items:
        raise ValueError("no configs selected")
    # Validate and merge every selected entry before launching anything, so a
    # malformed entry cannot abort the sweep after earlier runs have already
    # executed and written their run directories.
    planned: list[tuple[dict[str, Any], dict[str, Any]]] = []
    for item in config_items:
        if not isinstance(item, dict):
            raise ValueError("each configs entry must be an object")
        if "id" not in item:
            raise ValueError("each configs entry needs id")
        planned.append((item, merge_config(defaults, item)))
    # A repository-local virtualenv wins over PYTHON_BIN and the current
    # interpreter.
    if (REPLAYSERVE_ROOT / ".venv" / "bin" / "python").is_file():
        python_bin = str(REPLAYSERVE_ROOT / ".venv" / "bin" / "python")
    else:
        python_bin = os.environ.get("PYTHON_BIN", sys.executable or "python3")
    python_deps_dir = Path(
        os.environ.get("PYTHON_DEPS_DIR", str(REPLAYSERVE_ROOT / ".deps" / "python"))
    )
    results: list[dict[str, Any]] = []
    for fixture in fixtures:
        for item, knobs in planned:
            status = run_one(
                suite_id=suite_id,
                sim=sim,
                frontier_info=frontier_info,
                frontier_root=frontier_root,
                fixture=fixture,
                config_item=item,
                # Each run gets its own copy, as it did when knobs were merged
                # per run.
                knobs=dict(knobs),
                run_root=run_root,
                python_bin=python_bin,
                python_deps_dir=python_deps_dir,
                dry_run=args.dry_run,
                force=args.force,
            )
            results.append(
                {
                    "fixture": fixture,
                    "config_id": item["id"],
                    **status,
                }
            )
            print(
                f"{fixture}/{item['id']}: {status['status']} "
                f"exit={status['exit_code']} runtime={status['runtime_seconds']}s"
            )
    failures = [row for row in results if row["status"] not in {"pass", "dry_run"}]
    return 1 if failures else 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/tools/validate_fixtures.py
+++ b/tools/validate_fixtures.py
@@ -0,0 +1,240 @@
 #!/usr/bin/env python3
 """Validate ReplayServe fixture directories."""
 from __future__ import annotations
 import argparse
 import csv
 import json
 import math
 import sys
 from pathlib import Path
 from typing import Any
 def positive_int(value: str) -> int:
    """argparse ``type=`` converter that accepts only integers greater than zero.

    Raises:
        argparse.ArgumentTypeError: if the parsed integer is zero or negative.
        ValueError: if ``value`` is not an integer literal.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")
 def parse_args() -> argparse.Namespace:
    """Parse the validator CLI: one or more fixture directories plus limits.

    Returns:
        The parsed namespace with ``fixture_dirs``, ``max_tokens`` and
        ``block_size``.
    """
    cli = argparse.ArgumentParser(description="Validate ReplayServe fixtures.")
    cli.add_argument("fixture_dirs", nargs="+", type=Path)
    # Both limits must be positive integers; defaults are 32768 and 16.
    for flag, default_value in (("--max-tokens", 32768), ("--block-size", 16)):
        cli.add_argument(flag, type=positive_int, default=default_value)
    return cli.parse_args()
 def parse_block_hash_ids(value: str) -> list[int]:
    """Parse a ``|``-separated list of integers; empty segments are ignored.

    A blank or whitespace-only string yields an empty list.
    """
    segments = value.strip().split("|")
    return [int(segment) for segment in segments if segment]
 def expected_block_counts(input_length: int, block_size: int) -> list[int]:
    """Return the per-block token counts for a prompt of ``input_length`` tokens.

    Every block except the last holds ``block_size`` tokens; the last holds the
    remainder, or ``block_size`` when the length divides evenly. A length that
    needs zero blocks yields an empty list.
    """
    block_total = math.ceil(input_length / block_size)
    if block_total == 0:
        return []
    # A zero remainder means the final block is completely full.
    tail_tokens = input_length % block_size or block_size
    counts = [block_size] * (block_total - 1)
    counts.append(tail_tokens)
    return counts
 def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a UTF-8 JSON Lines file into a list of objects.

    Blank lines are skipped but still counted for the line numbers used in
    error messages.

    Raises:
        ValueError: if a line is not valid JSON or is not a JSON object.
    """
    records: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_number, text in enumerate(map(str.strip, handle), start=1):
            if text == "":
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}: line {line_number}: invalid JSON") from exc
            if isinstance(record, dict):
                records.append(record)
            else:
                raise ValueError(f"{path}: line {line_number}: JSON value must be object")
    return records
 def load_csv(path: Path) -> list[dict[str, str]]:
    """Read a Frontier trace CSV into a list of row dicts.

    Raises:
        ValueError: if the header lacks any of ``arrived_at``,
            ``num_prefill_tokens``, ``num_decode_tokens``, ``session_id`` or
            ``block_hash_ids``.
    """
    required_columns = frozenset(
        (
            "arrived_at",
            "num_prefill_tokens",
            "num_decode_tokens",
            "session_id",
            "block_hash_ids",
        )
    )
    with path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        header = reader.fieldnames or []
        absent = required_columns.difference(header)
        if absent:
            raise ValueError(f"{path}: missing CSV columns: {sorted(absent)}")
        return [row for row in reader]
 def require_paths(fixture_dir: Path) -> tuple[Path, Path, Path, Path]:
    """Return the four files every fixture directory must contain.

    Returns:
        ``(source.jsonl, frontier.csv, sidecar.jsonl, manifest.json)`` paths.

    Raises:
        ValueError: naming the first of those files that does not exist.
    """
    file_names = ("source.jsonl", "frontier.csv", "sidecar.jsonl", "manifest.json")
    source_path, csv_path, sidecar_path, manifest_path = (
        fixture_dir / file_name for file_name in file_names
    )
    for candidate in (source_path, csv_path, sidecar_path, manifest_path):
        if candidate.exists():
            continue
        raise ValueError(f"{fixture_dir}: missing {candidate.name}")
    return source_path, csv_path, sidecar_path, manifest_path
 def validate_fixture(fixture_dir: Path, block_size: int, max_tokens: int) -> str:
    """Cross-check one fixture's source, CSV, sidecar and manifest files.

    Checks, in order: matching row counts; manifest ``row_count``,
    ``block_size`` and ``max_tokens``; then per row the token budget, timestamp
    ordering, hash count, agreement of the CSV with the source row, and
    agreement of the sidecar with both; finally the manifest aggregates
    ``max_total_tokens``, ``partial_final_block_rows``, ``overflow_count`` and
    ``timestamp_monotonic``.

    Returns:
        A one-line summary with the row count, the largest prompt+decode total
        and the number of rows whose final block is partial.

    Raises:
        ValueError: on the first failed check.
    """
    source_path, csv_path, sidecar_path, manifest_path = require_paths(fixture_dir)
    source_rows = load_jsonl(source_path)
    csv_rows = load_csv(csv_path)
    sidecar_rows = load_jsonl(sidecar_path)
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

    row_count = len(csv_rows)
    source_count, sidecar_count = len(source_rows), len(sidecar_rows)
    if not (source_count == row_count and sidecar_count == row_count):
        raise ValueError(
            f"{fixture_dir}: row count mismatch source={source_count} "
            f"csv={row_count} sidecar={sidecar_count}"
        )
    if manifest.get("row_count") != row_count:
        raise ValueError(
            f"{fixture_dir}: manifest row_count={manifest.get('row_count')} "
            f"does not match csv rows={row_count}"
        )
    for setting, wanted in (("block_size", block_size), ("max_tokens", max_tokens)):
        recorded = manifest.get(setting)
        if recorded != wanted:
            raise ValueError(
                f"{fixture_dir}: manifest {setting}={recorded} "
                f"does not match expected {wanted}"
            )

    # Keys every sidecar row must carry; identical for all rows.
    sidecar_schema = {
        "request_id",
        "chat_id",
        "parent_chat_id",
        "turn",
        "type",
        "timestamp",
        "input_length",
        "output_length",
        "hash_ids",
        "block_token_counts",
    }
    last_timestamp: float | None = None
    max_total_tokens = 0
    partial_final_block_rows = 0
    for index, (source, csv_row, sidecar) in enumerate(
        zip(source_rows, csv_rows, sidecar_rows)
    ):
        prefix = f"{fixture_dir}: row {index}"

        # --- CSV row on its own -------------------------------------------
        input_length = int(csv_row["num_prefill_tokens"])
        output_length = int(csv_row["num_decode_tokens"])
        total_tokens = input_length + output_length
        if total_tokens > max_tokens:
            raise ValueError(
                f"{prefix}: total_tokens={total_tokens} exceeds max_tokens={max_tokens}"
            )
        if total_tokens > max_total_tokens:
            max_total_tokens = total_tokens
        timestamp = float(csv_row["arrived_at"])
        if last_timestamp is not None and timestamp < last_timestamp:
            raise ValueError(f"{prefix}: timestamp is not monotonic")
        last_timestamp = timestamp
        hash_ids = parse_block_hash_ids(csv_row["block_hash_ids"])
        expected_hash_count = math.ceil(input_length / block_size)
        if len(hash_ids) != expected_hash_count:
            raise ValueError(
                f"{prefix}: hash count {len(hash_ids)} != {expected_hash_count}"
            )
        counts = expected_block_counts(input_length, block_size)
        if sum(counts) != input_length:
            raise ValueError(f"{prefix}: expected block counts do not sum to input")
        if input_length % block_size != 0:
            partial_final_block_rows += 1

        # --- CSV row against the source row --------------------------------
        if int(csv_row["session_id"]) != int(source["chat_id"]):
            raise ValueError(f"{prefix}: session_id does not match source chat_id")
        if timestamp != float(source["timestamp"]):
            raise ValueError(f"{prefix}: arrived_at does not match source timestamp")
        if input_length != int(source["input_length"]):
            raise ValueError(f"{prefix}: num_prefill_tokens does not match source")
        if output_length != int(source["output_length"]):
            raise ValueError(f"{prefix}: num_decode_tokens does not match source")
        if hash_ids != source["hash_ids"]:
            raise ValueError(f"{prefix}: block_hash_ids do not match source hash_ids")

        # --- sidecar row against source and CSV ----------------------------
        absent_keys = sidecar_schema - set(sidecar)
        if absent_keys:
            raise ValueError(f"{prefix}: missing sidecar keys {sorted(absent_keys)}")
        if int(sidecar["request_id"]) != index:
            raise ValueError(f"{prefix}: sidecar request_id mismatch")
        for int_field in ("chat_id", "parent_chat_id", "turn"):
            if int(sidecar[int_field]) != int(source[int_field]):
                raise ValueError(f"{prefix}: sidecar {int_field} mismatch")
        if sidecar["type"] != source["type"]:
            raise ValueError(f"{prefix}: sidecar type mismatch")
        if float(sidecar["timestamp"]) != float(source["timestamp"]):
            raise ValueError(f"{prefix}: sidecar timestamp mismatch")
        if int(sidecar["input_length"]) != input_length:
            raise ValueError(f"{prefix}: sidecar input_length mismatch")
        if int(sidecar["output_length"]) != output_length:
            raise ValueError(f"{prefix}: sidecar output_length mismatch")
        if sidecar["hash_ids"] != hash_ids:
            raise ValueError(f"{prefix}: sidecar hash_ids mismatch")
        if sidecar["block_token_counts"] != counts:
            raise ValueError(f"{prefix}: sidecar block_token_counts mismatch")

    # --- manifest aggregates ------------------------------------------------
    for aggregate, observed in (
        ("max_total_tokens", max_total_tokens),
        ("partial_final_block_rows", partial_final_block_rows),
    ):
        if manifest.get(aggregate) != observed:
            raise ValueError(
                f"{fixture_dir}: manifest {aggregate}="
                f"{manifest.get(aggregate)} does not match {observed}"
            )
    if manifest.get("overflow_count") != 0:
        raise ValueError(f"{fixture_dir}: manifest overflow_count is not zero")
    if manifest.get("timestamp_monotonic") is not True:
        raise ValueError(f"{fixture_dir}: manifest timestamp_monotonic is not true")
    return (
        f"{fixture_dir.name}: rows={row_count} max_total_tokens={max_total_tokens} "
        f"partial_final_block_rows={partial_final_block_rows}"
    )
 def main() -> int:
    """Validate each fixture directory given on the command line.

    Prints one summary line per valid fixture. Stops at the first failure,
    reports it on stderr and returns 1; returns 0 when all fixtures pass.
    """
    args = parse_args()
    try:
        for fixture_dir in args.fixture_dirs:
            summary = validate_fixture(
                fixture_dir=fixture_dir,
                block_size=args.block_size,
                max_tokens=args.max_tokens,
            )
            print(summary)
    except Exception as exc:
        print(f"validate_fixtures.py: error: {exc}", file=sys.stderr)
        return 1
    else:
        return 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/tools/vllm_synthetic_replay.py
+++ b/tools/vllm_synthetic_replay.py
@@ -0,0 +1,405 @@
 #!/usr/bin/env python3
 """Replay a ReplayServe fixture on vLLM with synthetic prompt token blocks."""
 from __future__ import annotations
 import argparse
 import asyncio
 import csv
 import hashlib
 import json
 import os
 import random
 import statistics
 import sys
 import time
 from pathlib import Path
 from typing import Any
 def positive_int(value: str) -> int:
    """argparse ``type=`` converter: parse ``value`` as an int greater than zero.
    Raises:
        ValueError: if ``value`` is not an integer literal.
        argparse.ArgumentTypeError: if the parsed integer is zero or negative.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")
 def positive_float(value: str) -> float:
    """argparse ``type=`` converter: parse ``value`` as a finite float above zero.
    ``float()`` accepts "nan" and "inf", and ``nan <= 0`` is False, so a plain
    sign check would let both through; they are rejected explicitly here.
    Raises:
        ValueError: if ``value`` is not a float literal.
        argparse.ArgumentTypeError: if the parsed value is NaN, infinite, zero
            or negative.
    """
    import math
    parsed = float(value)
    if not math.isfinite(parsed):
        raise argparse.ArgumentTypeError("must be finite")
    if parsed <= 0:
        raise argparse.ArgumentTypeError("must be positive")
    return parsed
 def parse_args() -> argparse.Namespace:
    """Define the replay command-line interface and parse the process arguments.
    ``--fixture-dir``, ``--model`` and ``--output-dir`` are required; every
    other option is optional. The three boolean toggles use
    ``BooleanOptionalAction``, so each also has a ``--no-...`` form.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Run an online vLLM smoke/replay using synthetic prompt_token_ids "
            "derived from ReplayServe block hashes."
        )
    )
    # Required locations and model identifier.
    cli.add_argument("--fixture-dir", required=True, type=Path)
    cli.add_argument("--model", required=True, type=str)
    cli.add_argument("--output-dir", required=True, type=Path)
    cli.add_argument("--tensor-parallel-size", type=positive_int, default=1)
    cli.add_argument("--limit", type=positive_int)
    # Positive-integer options that carry a default.
    for flag, fallback in (
        ("--block-size", 16),
        ("--max-model-len", 32768),
        ("--max-num-seqs", 128),
        ("--max-num-batched-tokens", 32768),
    ):
        cli.add_argument(flag, type=positive_int, default=fallback)
    # Positive-float options.
    for flag, fallback in (
        ("--gpu-memory-utilization", 0.9),
        ("--time-scale", 1.0),
    ):
        cli.add_argument(flag, type=positive_float, default=fallback)
    cli.add_argument(
        "--max-output-tokens",
        type=positive_int,
        help="Cap each row's output_length for smoke tests.",
    )
    cli.add_argument("--seed", type=int, default=0)
    cli.add_argument("--dtype", default="auto")
    cli.add_argument("--enforce-eager", action="store_true")
    # On/off toggles, all enabled unless the --no-... form is given.
    for flag in (
        "--trust-remote-code",
        "--enable-prefix-caching",
        "--enable-chunked-prefill",
    ):
        cli.add_argument(flag, action=argparse.BooleanOptionalAction, default=True)
    return cli.parse_args()
 def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSON Lines file into a list of dicts, skipping blank lines.
    Args:
        path: file to read; decoded as UTF-8.
    Returns:
        One dict per non-blank line, in file order.
    Raises:
        json.JSONDecodeError: if a line is not valid JSON. The message is
            prefixed with the file path and 1-based line number.
        ValueError: if a line parses to something other than a JSON object.
    """
    rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type so existing handlers still
                # match, but say which file and line was malformed.
                raise json.JSONDecodeError(
                    f"{path}: line {line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc
            if not isinstance(row, dict):
                raise ValueError(f"{path}: line {line_number}: expected object")
            rows.append(row)
    return rows
 def percentile(values: list[float], pct: float) -> float | None:
    if not values:
        return None
    ordered = sorted(values)
    index = min(len(ordered) - 1, max(0, int((len(ordered) - 1) * pct)))
    return ordered[index]
 def block_seed(hash_id: int, seed: int) -> int:
    """Hash ``"<seed>:<hash_id>"`` with BLAKE2b into an unsigned 64-bit integer.
    The 8-byte digest is read as a big-endian integer, so the result depends
    only on the two arguments.
    """
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(f"{seed}:{hash_id}".encode("utf-8"))
    # The hex digest parsed base-16 equals the big-endian integer of the bytes.
    return int(hasher.hexdigest(), 16)
 def block_tokens(
    hash_id: int,
    *,
    seed: int,
    block_size: int,
    vocab_size: int,
    special_ids: set[int],
 ) -> list[int]:
    """Generate the deterministic synthetic token block for one trace block hash.
    Token ids are drawn by rejection sampling from ``[1000, vocab_size - 1000)``
    (widened to the single id 1000 when the vocabulary is too small for that
    range), skipping ``special_ids``. The RNG is seeded from
    ``block_seed(hash_id, seed)``, so equal arguments give equal blocks.
    Args:
        hash_id: trace block hash the tokens stand in for.
        seed: global seed mixed into the per-block RNG seed.
        block_size: number of token ids to return.
        vocab_size: size of the tokenizer vocabulary.
        special_ids: token ids that must never be emitted.
    Returns:
        ``block_size`` token ids (empty when ``block_size`` is not positive).
    Raises:
        ValueError: if the vocabulary has no id 1000, or if every id in the
            sampling range is special. Without this check the first case
            emits an out-of-vocabulary id and the second never terminates.
    """
    low = 1000
    high = max(low + 1, vocab_size - 1000)
    if vocab_size <= low:
        raise ValueError(
            f"vocab_size {vocab_size} is too small: synthetic token ids start at {low}"
        )
    # Cheap size comparison first; the full scan only runs when the special
    # set is at least as large as the sampling range.
    if len(special_ids) >= high - low and all(
        candidate in special_ids for candidate in range(low, high)
    ):
        raise ValueError(
            f"no usable token ids: every id in [{low}, {high}) is a special token"
        )
    rng = random.Random(block_seed(hash_id, seed))
    tokens: list[int] = []
    while len(tokens) < block_size:
        token_id = rng.randrange(low, high)
        if token_id not in special_ids:
            tokens.append(token_id)
    return tokens
 def make_prompt_token_ids(
    row: dict[str, Any],
    *,
    seed: int,
    block_size: int,
    vocab_size: int,
    special_ids: set[int],
 ) -> list[int]:
    """Build the synthetic prompt for one sidecar row.
    Each entry of ``row["hash_ids"]`` is expanded with ``block_tokens`` and cut
    to the matching entry of ``row["block_token_counts"]``; the pieces are
    concatenated in order.
    Raises:
        ValueError: if the two lists differ in length, or if the concatenated
            prompt length differs from ``row["input_length"]``.
    """
    block_hashes = list(map(int, row["hash_ids"]))
    block_lengths = list(map(int, row["block_token_counts"]))
    if len(block_hashes) != len(block_lengths):
        raise ValueError(f"request {row.get('request_id')}: hash/count length mismatch")
    prompt: list[int] = [
        token
        for block_hash, keep in zip(block_hashes, block_lengths)
        for token in block_tokens(
            block_hash,
            seed=seed,
            block_size=block_size,
            vocab_size=vocab_size,
            special_ids=special_ids,
        )[:keep]
    ]
    wanted = int(row["input_length"])
    if len(prompt) == wanted:
        return prompt
    raise ValueError(
        f"request {row.get('request_id')}: synthetic prompt length "
        f"{len(prompt)} != input_length {wanted}"
    )
 def estimate_prefix_reuse(rows: list[dict[str, Any]]) -> dict[int, dict[str, int | float]]:
    trie: dict[int, dict[Any, Any]] = {}
    estimates: dict[int, dict[str, int | float]] = {}
    for row in rows:
        request_id = int(row["request_id"])
        hash_ids = [int(value) for value in row["hash_ids"]]
        counts = [int(value) for value in row["block_token_counts"]]
        node = trie
        hit_blocks = 0
        for hash_id in hash_ids:
            if hash_id not in node:
                break
            hit_blocks += 1
            node = node[hash_id]
        node = trie
        for hash_id in hash_ids:
            node = node.setdefault(hash_id, {})
        query_tokens = int(row["input_length"])
        hit_tokens = sum(counts[:hit_blocks])
        estimates[request_id] = {
            "query_blocks": len(hash_ids),
            "hit_blocks": hit_blocks,
            "query_tokens": query_tokens,
            "hit_tokens": hit_tokens,
            "block_hit_ratio": hit_blocks / len(hash_ids) if hash_ids else 0.0,
            "token_hit_ratio": hit_tokens / query_tokens if query_tokens else 0.0,
        }
    return estimates
 async def run_replay(args: argparse.Namespace) -> dict[str, Any]:
    """Replay a fixture's sidecar rows against an in-process vLLM async engine.
    Steps:
      1. Load ``sidecar.jsonl`` from ``args.fixture_dir`` (first ``args.limit``
         rows when a limit is set).
      2. Build one synthetic prompt per row and a trace-only prefix-reuse
         estimate.
      3. Start the engine, then submit each row at
         ``(timestamp - first row's timestamp) * args.time_scale`` seconds
         after the replay clock starts, forcing exactly the requested number
         of output tokens (optionally capped by ``args.max_output_tokens``).
      4. Write ``request_metrics.csv`` and ``summary.json`` into
         ``args.output_dir``.
    The replay clock starts after the engine has been constructed and stops
    as soon as the last request finishes, so ``wall_time_s`` and the
    throughput figures cover neither engine start-up nor engine shutdown.
    Returns:
        The summary dict that is also written to ``summary.json``.
    Raises:
        RuntimeError: if transformers/vLLM cannot be imported.
        ValueError: if no rows are selected.
    """
    try:
        from transformers import AutoTokenizer
        from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
        from vllm.inputs import TokensPrompt
    except Exception as exc:  # pragma: no cover - exercised on GPU host.
        raise RuntimeError(f"failed to import vLLM runtime dependencies: {exc}") from exc
    sidecar_path = args.fixture_dir / "sidecar.jsonl"
    rows = load_jsonl(sidecar_path)
    if args.limit is not None:
        rows = rows[: args.limit]
    if not rows:
        raise ValueError("no rows selected")
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
    special_ids = {int(value) for value in tokenizer.all_special_ids}
    vocab_size = len(tokenizer)
    # Prompts are built up front so token generation never delays an arrival.
    synthetic_prompts = {
        int(row["request_id"]): make_prompt_token_ids(
            row,
            seed=args.seed,
            block_size=args.block_size,
            vocab_size=vocab_size,
            special_ids=special_ids,
        )
        for row in rows
    }
    prefix_reuse = estimate_prefix_reuse(rows)
    engine_args = AsyncEngineArgs(
        model=args.model,
        tokenizer=args.model,
        trust_remote_code=args.trust_remote_code,
        tensor_parallel_size=args.tensor_parallel_size,
        dtype=args.dtype,
        max_model_len=args.max_model_len,
        block_size=args.block_size,
        enable_prefix_caching=args.enable_prefix_caching,
        enable_chunked_prefill=args.enable_chunked_prefill,
        max_num_seqs=args.max_num_seqs,
        max_num_batched_tokens=args.max_num_batched_tokens,
        gpu_memory_utilization=args.gpu_memory_utilization,
        enforce_eager=args.enforce_eager,
        disable_log_stats=True,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    output_rows: list[dict[str, Any]] = []
    # Arrival offsets are measured from the first selected row's timestamp.
    first_timestamp = float(rows[0]["timestamp"])
    replay_start = time.perf_counter()
    async def run_one(row: dict[str, Any]) -> None:
        """Wait for the row's scheduled arrival, run it, and record its metrics."""
        request_id = int(row["request_id"])
        scheduled_arrival_s = (float(row["timestamp"]) - first_timestamp) * args.time_scale
        await asyncio.sleep(max(0.0, replay_start + scheduled_arrival_s - time.perf_counter()))
        prompt_token_ids = synthetic_prompts[request_id]
        requested_output_tokens = int(row["output_length"])
        effective_output_tokens = requested_output_tokens
        if args.max_output_tokens is not None:
            effective_output_tokens = min(effective_output_tokens, args.max_output_tokens)
        # min_tokens == max_tokens with ignore_eos pins the decode length.
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=effective_output_tokens,
            min_tokens=effective_output_tokens,
            ignore_eos=True,
            detokenize=False,
            seed=args.seed + request_id,
        )
        arrival_wall = time.perf_counter()
        first_token_wall: float | None = None
        last_output_tokens = 0
        final_output: Any = None
        generator = engine.generate(
            TokensPrompt(prompt_token_ids=prompt_token_ids),
            sampling_params,
            request_id=str(request_id),
        )
        async for output in generator:
            final_output = output
            if output.outputs:
                token_count = len(output.outputs[0].token_ids)
                if token_count > 0 and first_token_wall is None:
                    first_token_wall = time.perf_counter()
                last_output_tokens = token_count
        done_wall = time.perf_counter()
        finish_reason = ""
        if final_output is not None and final_output.outputs:
            finish_reason = str(final_output.outputs[0].finish_reason)
        ttft_s = None if first_token_wall is None else first_token_wall - arrival_wall
        e2e_s = done_wall - arrival_wall
        # TPOT is only defined once at least two tokens have been produced.
        tpot_s = None
        if first_token_wall is not None and last_output_tokens > 1:
            tpot_s = (done_wall - first_token_wall) / (last_output_tokens - 1)
        reuse = prefix_reuse[request_id]
        output_rows.append(
            {
                "request_id": request_id,
                "scheduled_arrival_s": scheduled_arrival_s,
                "arrival_delay_s": arrival_wall - replay_start - scheduled_arrival_s,
                "input_length": int(row["input_length"]),
                "requested_output_length": requested_output_tokens,
                "effective_output_length": effective_output_tokens,
                "generated_output_tokens": last_output_tokens,
                "ttft_s": ttft_s,
                "tpot_s": tpot_s,
                "e2e_s": e2e_s,
                "finish_reason": finish_reason,
                "prefix_query_blocks_est": reuse["query_blocks"],
                "prefix_hit_blocks_est": reuse["hit_blocks"],
                "prefix_query_tokens_est": reuse["query_tokens"],
                "prefix_hit_tokens_est": reuse["hit_tokens"],
                "prefix_block_hit_ratio_est": reuse["block_hit_ratio"],
                "prefix_token_hit_ratio_est": reuse["token_hit_ratio"],
            }
        )
    try:
        await asyncio.gather(*(run_one(row) for row in rows))
        # Stop the clock before teardown so engine shutdown time does not
        # inflate wall_time_s or deflate the throughput figures.
        replay_end = time.perf_counter()
    finally:
        engine.shutdown()
    output_rows.sort(key=lambda item: int(item["request_id"]))
    args.output_dir.mkdir(parents=True, exist_ok=True)
    request_metrics_path = args.output_dir / "request_metrics.csv"
    fieldnames = list(output_rows[0].keys())
    with request_metrics_path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(output_rows)
    ttft_values = [float(row["ttft_s"]) for row in output_rows if row["ttft_s"] is not None]
    tpot_values = [float(row["tpot_s"]) for row in output_rows if row["tpot_s"] is not None]
    e2e_values = [float(row["e2e_s"]) for row in output_rows]
    generated_tokens = sum(int(row["generated_output_tokens"]) for row in output_rows)
    prompt_tokens = sum(int(row["input_length"]) for row in output_rows)
    wall_s = replay_end - replay_start
    summary = {
        "status": "pass",
        "fixture_dir": str(args.fixture_dir),
        "model": args.model,
        "tensor_parallel_size": args.tensor_parallel_size,
        "cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES", ""),
        "rows": len(output_rows),
        "block_size": args.block_size,
        "max_model_len": args.max_model_len,
        "max_num_seqs": args.max_num_seqs,
        "max_num_batched_tokens": args.max_num_batched_tokens,
        "gpu_memory_utilization": args.gpu_memory_utilization,
        "enable_prefix_caching": args.enable_prefix_caching,
        "enable_chunked_prefill": args.enable_chunked_prefill,
        "time_scale": args.time_scale,
        "max_output_tokens": args.max_output_tokens,
        "synthetic_replay": {
            "semantics": (
                "Each trace block hash is deterministically mapped to a stable "
                "block of prompt token ids; equal hashes reuse equal token blocks. "
                "This preserves arrival, length, and block-prefix sharing patterns, "
                "but it is not original text/token recovery."
            ),
            "seed": args.seed,
            "vocab_size": vocab_size,
            "special_token_ids_excluded": sorted(special_ids),
        },
        "wall_time_s": wall_s,
        "requests_per_second": len(output_rows) / wall_s if wall_s else 0.0,
        "prompt_tokens_per_second": prompt_tokens / wall_s if wall_s else 0.0,
        "generated_tokens_per_second": generated_tokens / wall_s if wall_s else 0.0,
        "total_prompt_tokens": prompt_tokens,
        "total_generated_tokens": generated_tokens,
        "ttft_s": {
            "mean": statistics.fmean(ttft_values) if ttft_values else None,
            "p50": percentile(ttft_values, 0.50),
            "p95": percentile(ttft_values, 0.95),
        },
        "tpot_s": {
            "mean": statistics.fmean(tpot_values) if tpot_values else None,
            "p50": percentile(tpot_values, 0.50),
            "p95": percentile(tpot_values, 0.95),
        },
        "e2e_s": {
            "mean": statistics.fmean(e2e_values) if e2e_values else None,
            "p50": percentile(e2e_values, 0.50),
            "p95": percentile(e2e_values, 0.95),
        },
        "estimated_prefix_reuse": {
            "query_blocks": sum(int(row["prefix_query_blocks_est"]) for row in output_rows),
            "hit_blocks": sum(int(row["prefix_hit_blocks_est"]) for row in output_rows),
            "query_tokens": sum(int(row["prefix_query_tokens_est"]) for row in output_rows),
            "hit_tokens": sum(int(row["prefix_hit_tokens_est"]) for row in output_rows),
        },
        "request_metrics_csv": str(request_metrics_path),
    }
    # Aggregate ratios come from the summed counts, not from averaging rows.
    reuse = summary["estimated_prefix_reuse"]
    summary["estimated_prefix_reuse"]["block_hit_ratio"] = (
        reuse["hit_blocks"] / reuse["query_blocks"] if reuse["query_blocks"] else 0.0
    )
    summary["estimated_prefix_reuse"]["token_hit_ratio"] = (
        reuse["hit_tokens"] / reuse["query_tokens"] if reuse["query_tokens"] else 0.0
    )
    with (args.output_dir / "summary.json").open("w", encoding="utf-8") as handle:
        json.dump(summary, handle, indent=2, sort_keys=True)
        handle.write("\n")
    return summary
 def main() -> int:
    """Run the replay from the command line.
    On success the summary is printed to stdout as sorted, indented JSON. On
    any exception a minimal failure summary is written to
    ``<output_dir>/summary.json`` and the error is reported on stderr.
    Returns:
        0 on success, 1 on failure.
    """
    args = parse_args()
    try:
        summary = asyncio.run(run_replay(args))
    except Exception as exc:
        failure = {"status": "fail", "error": str(exc)}
        # The output directory may not exist yet if the replay failed early.
        args.output_dir.mkdir(parents=True, exist_ok=True)
        (args.output_dir / "summary.json").write_text(
            json.dumps(failure, indent=2) + "\n", encoding="utf-8"
        )
        print(f"vllm_synthetic_replay.py: error: {exc}", file=sys.stderr)
        return 1
    else:
        print(json.dumps(summary, indent=2, sort_keys=True))
        return 0
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/traces/fixtures/coder_100/frontier.csv
+++ b/traces/fixtures/coder_100/frontier.csv
--- a/traces/fixtures/coder_100/manifest.json
+++ b/traces/fixtures/coder_100/manifest.json
@@ -0,0 +1,48 @@
 {
  "adapter_semantics": {
    "block_token_counts": "full blocks use block_size tokens; final partial block uses input_length % block_size, or block_size when zero",
    "chat_id": "session_id",
    "hash_ids": "block_hash_ids joined by |",
    "input_length": "num_prefill_tokens",
    "output_length": "num_decode_tokens",
    "timestamp": "arrived_at"
  },
  "block_size": 16,
  "csv_fields": [
    "arrived_at",
    "num_prefill_tokens",
    "num_decode_tokens",
    "session_id",
    "block_hash_ids"
  ],
  "fail_on_overflow": true,
  "first_timestamp": 0.0,
  "fixture_name": "coder_100",
  "frontier_csv": "traces/fixtures/coder_100/frontier.csv",
  "generated_by": "tools/qwen_to_frontier.py",
  "input_jsonl": "/home/gahow/phd/qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl",
  "last_timestamp": 22.623,
  "limit": 100,
  "max_input_length": 17972,
  "max_output_length": 7279,
  "max_tokens": 32768,
  "max_total_tokens": 18985,
  "overflow_count": 0,
  "partial_final_block_rows": 91,
  "row_count": 100,
  "sidecar_fields": [
    "request_id",
    "chat_id",
    "parent_chat_id",
    "turn",
    "type",
    "timestamp",
    "input_length",
    "output_length",
    "hash_ids",
    "block_token_counts"
  ],
  "sidecar_jsonl": "traces/fixtures/coder_100/sidecar.jsonl",
  "source_jsonl": "traces/fixtures/coder_100/source.jsonl",
  "timestamp_monotonic": true
 }
--- a/traces/fixtures/coder_100/sidecar.jsonl
+++ b/traces/fixtures/coder_100/sidecar.jsonl
--- a/traces/fixtures/coder_100/source.jsonl
+++ b/traces/fixtures/coder_100/source.jsonl
--- a/traces/fixtures/coder_2000/frontier.csv
+++ b/traces/fixtures/coder_2000/frontier.csv
--- a/traces/fixtures/coder_2000/manifest.json
+++ b/traces/fixtures/coder_2000/manifest.json
@@ -0,0 +1,48 @@
 {
  "adapter_semantics": {
    "block_token_counts": "full blocks use block_size tokens; final partial block uses input_length % block_size, or block_size when zero",
    "chat_id": "session_id",
    "hash_ids": "block_hash_ids joined by |",
    "input_length": "num_prefill_tokens",
    "output_length": "num_decode_tokens",
    "timestamp": "arrived_at"
  },
  "block_size": 16,
  "csv_fields": [
    "arrived_at",
    "num_prefill_tokens",
    "num_decode_tokens",
    "session_id",
    "block_hash_ids"
  ],
  "fail_on_overflow": true,
  "first_timestamp": 0.0,
  "fixture_name": "coder_2000",
  "frontier_csv": "traces/fixtures/coder_2000/frontier.csv",
  "generated_by": "tools/qwen_to_frontier.py",
  "input_jsonl": "/home/gahow/phd/qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl",
  "last_timestamp": 451.684,
  "limit": 2000,
  "max_input_length": 17972,
  "max_output_length": 10768,
  "max_tokens": 32768,
  "max_total_tokens": 21318,
  "overflow_count": 0,
  "partial_final_block_rows": 1879,
  "row_count": 2000,
  "sidecar_fields": [
    "request_id",
    "chat_id",
    "parent_chat_id",
    "turn",
    "type",
    "timestamp",
    "input_length",
    "output_length",
    "hash_ids",
    "block_token_counts"
  ],
  "sidecar_jsonl": "traces/fixtures/coder_2000/sidecar.jsonl",
  "source_jsonl": "traces/fixtures/coder_2000/source.jsonl",
  "timestamp_monotonic": true
 }
--- a/traces/fixtures/coder_2000/sidecar.jsonl
+++ b/traces/fixtures/coder_2000/sidecar.jsonl
--- a/traces/fixtures/coder_2000/source.jsonl
+++ b/traces/fixtures/coder_2000/source.jsonl
--- a/traces/fixtures/coder_200_ts0667/frontier.csv
+++ b/traces/fixtures/coder_200_ts0667/frontier.csv
--- a/traces/fixtures/coder_200_ts0667/manifest.json
+++ b/traces/fixtures/coder_200_ts0667/manifest.json
@@ -0,0 +1,49 @@
 {
  "adapter_semantics": {
    "block_token_counts": "full blocks use block_size tokens; final partial block uses input_length % block_size, or block_size when zero",
    "chat_id": "session_id",
    "hash_ids": "block_hash_ids joined by |",
    "input_length": "num_prefill_tokens",
    "output_length": "num_decode_tokens",
    "timestamp": "arrived_at"
  },
  "block_size": 16,
  "csv_fields": [
    "arrived_at",
    "num_prefill_tokens",
    "num_decode_tokens",
    "session_id",
    "block_hash_ids"
  ],
  "fail_on_overflow": true,
  "first_timestamp": 0.0,
  "fixture_name": "coder_200_ts0667",
  "frontier_csv": "/home/gahow/phd/replayserve/traces/fixtures/coder_200_ts0667/frontier.csv",
  "generated_by": "tools/qwen_to_frontier.py",
  "input_jsonl": "/home/gahow/phd/qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl",
  "last_timestamp": 30.711333333333332,
  "limit": 200,
  "max_input_length": 17972,
  "max_output_length": 7279,
  "max_tokens": 32768,
  "max_total_tokens": 18985,
  "overflow_count": 0,
  "partial_final_block_rows": 182,
  "row_count": 200,
  "sidecar_fields": [
    "request_id",
    "chat_id",
    "parent_chat_id",
    "turn",
    "type",
    "timestamp",
    "input_length",
    "output_length",
    "hash_ids",
    "block_token_counts"
  ],
  "sidecar_jsonl": "/home/gahow/phd/replayserve/traces/fixtures/coder_200_ts0667/sidecar.jsonl",
  "source_jsonl": "/home/gahow/phd/replayserve/traces/fixtures/coder_200_ts0667/source.jsonl",
  "timestamp_monotonic": true,
  "timestamp_scale": 0.6666666666666666
 }
--- a/traces/fixtures/coder_200_ts0667/sidecar.jsonl
+++ b/traces/fixtures/coder_200_ts0667/sidecar.jsonl
--- a/traces/fixtures/coder_200_ts0667/source.jsonl
+++ b/traces/fixtures/coder_200_ts0667/source.jsonl
--- a/traces/fixtures/coder_200_ts2/frontier.csv
+++ b/traces/fixtures/coder_200_ts2/frontier.csv
--- a/traces/fixtures/coder_200_ts2/manifest.json
+++ b/traces/fixtures/coder_200_ts2/manifest.json
@@ -0,0 +1,49 @@
 {
  "adapter_semantics": {
    "block_token_counts": "full blocks use block_size tokens; final partial block uses input_length % block_size, or block_size when zero",
    "chat_id": "session_id",
    "hash_ids": "block_hash_ids joined by |",
    "input_length": "num_prefill_tokens",
    "output_length": "num_decode_tokens",
    "timestamp": "arrived_at"
  },
  "block_size": 16,
  "csv_fields": [
    "arrived_at",
    "num_prefill_tokens",
    "num_decode_tokens",
    "session_id",
    "block_hash_ids"
  ],
  "fail_on_overflow": true,
  "first_timestamp": 0.0,
  "fixture_name": "coder_200_ts2",
  "frontier_csv": "/home/gahow/phd/replayserve/traces/fixtures/coder_200_ts2/frontier.csv",
  "generated_by": "tools/qwen_to_frontier.py",
  "input_jsonl": "/home/gahow/phd/qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl",
  "last_timestamp": 92.134,
  "limit": 200,
  "max_input_length": 17972,
  "max_output_length": 7279,
  "max_tokens": 32768,
  "max_total_tokens": 18985,
  "overflow_count": 0,
  "partial_final_block_rows": 182,
  "row_count": 200,
  "sidecar_fields": [
    "request_id",
    "chat_id",
    "parent_chat_id",
    "turn",
    "type",
    "timestamp",
    "input_length",
    "output_length",
    "hash_ids",
    "block_token_counts"
  ],
  "sidecar_jsonl": "/home/gahow/phd/replayserve/traces/fixtures/coder_200_ts2/sidecar.jsonl",
  "source_jsonl": "/home/gahow/phd/replayserve/traces/fixtures/coder_200_ts2/source.jsonl",
  "timestamp_monotonic": true,
  "timestamp_scale": 2.0
 }
--- a/traces/fixtures/coder_200_ts2/sidecar.jsonl
+++ b/traces/fixtures/coder_200_ts2/sidecar.jsonl
--- a/traces/fixtures/coder_200_ts2/source.jsonl
+++ b/traces/fixtures/coder_200_ts2/source.jsonl
--- a/traces/fixtures/coder_200_ts3/frontier.csv
+++ b/traces/fixtures/coder_200_ts3/frontier.csv
--- a/traces/fixtures/coder_200_ts3/manifest.json
+++ b/traces/fixtures/coder_200_ts3/manifest.json
@@ -0,0 +1,49 @@
 {
  "adapter_semantics": {
    "block_token_counts": "full blocks use block_size tokens; final partial block uses input_length % block_size, or block_size when zero",
    "chat_id": "session_id",
    "hash_ids": "block_hash_ids joined by |",
    "input_length": "num_prefill_tokens",
    "output_length": "num_decode_tokens",
    "timestamp": "arrived_at"
  },
  "block_size": 16,
  "csv_fields": [
    "arrived_at",
    "num_prefill_tokens",
    "num_decode_tokens",
    "session_id",
    "block_hash_ids"
  ],
  "fail_on_overflow": true,
  "first_timestamp": 0.0,
  "fixture_name": "coder_200_ts3",
  "frontier_csv": "/home/gahow/phd/replayserve/traces/fixtures/coder_200_ts3/frontier.csv",
  "generated_by": "tools/qwen_to_frontier.py",
  "input_jsonl": "/home/gahow/phd/qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl",
  "last_timestamp": 138.201,
  "limit": 200,
  "max_input_length": 17972,
  "max_output_length": 7279,
  "max_tokens": 32768,
  "max_total_tokens": 18985,
  "overflow_count": 0,
  "partial_final_block_rows": 182,
  "row_count": 200,
  "sidecar_fields": [
    "request_id",
    "chat_id",
    "parent_chat_id",
    "turn",
    "type",
    "timestamp",
    "input_length",
    "output_length",
    "hash_ids",
    "block_token_counts"
  ],
  "sidecar_jsonl": "/home/gahow/phd/replayserve/traces/fixtures/coder_200_ts3/sidecar.jsonl",
  "source_jsonl": "/home/gahow/phd/replayserve/traces/fixtures/coder_200_ts3/source.jsonl",
  "timestamp_monotonic": true,
  "timestamp_scale": 3.0
 }
--- a/traces/fixtures/coder_200_ts3/sidecar.jsonl
+++ b/traces/fixtures/coder_200_ts3/sidecar.jsonl
--- a/traces/fixtures/coder_200_ts3/source.jsonl
+++ b/traces/fixtures/coder_200_ts3/source.jsonl
--- a/traces/fixtures/coder_500/frontier.csv
+++ b/traces/fixtures/coder_500/frontier.csv
--- a/traces/fixtures/coder_500/manifest.json
+++ b/traces/fixtures/coder_500/manifest.json
@@ -0,0 +1,48 @@
 {
  "adapter_semantics": {
    "block_token_counts": "full blocks use block_size tokens; final partial block uses input_length % block_size, or block_size when zero",
    "chat_id": "session_id",
    "hash_ids": "block_hash_ids joined by |",
    "input_length": "num_prefill_tokens",
    "output_length": "num_decode_tokens",
    "timestamp": "arrived_at"
  },
  "block_size": 16,
  "csv_fields": [
    "arrived_at",
    "num_prefill_tokens",
    "num_decode_tokens",
    "session_id",
    "block_hash_ids"
  ],
  "fail_on_overflow": true,
  "first_timestamp": 0.0,
  "fixture_name": "coder_500",
  "frontier_csv": "/home/gahow/phd/replayserve/traces/fixtures/coder_500/frontier.csv",
  "generated_by": "tools/qwen_to_frontier.py",
  "input_jsonl": "/home/gahow/phd/qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl",
  "last_timestamp": 109.466,
  "limit": 500,
  "max_input_length": 17972,
  "max_output_length": 8533,
  "max_tokens": 32768,
  "max_total_tokens": 21318,
  "overflow_count": 0,
  "partial_final_block_rows": 466,
  "row_count": 500,
  "sidecar_fields": [
    "request_id",
    "chat_id",
    "parent_chat_id",
    "turn",
    "type",
    "timestamp",
    "input_length",
    "output_length",
    "hash_ids",
    "block_token_counts"
  ],
  "sidecar_jsonl": "/home/gahow/phd/replayserve/traces/fixtures/coder_500/sidecar.jsonl",
  "source_jsonl": "/home/gahow/phd/replayserve/traces/fixtures/coder_500/source.jsonl",
  "timestamp_monotonic": true
 }
--- a/traces/fixtures/coder_500/sidecar.jsonl
+++ b/traces/fixtures/coder_500/sidecar.jsonl
--- a/traces/fixtures/coder_500/source.jsonl
+++ b/traces/fixtures/coder_500/source.jsonl