Add ReplayServe Frontier vLLM alignment report

2026-06-25 17:10:30 +08:00
commit a99bd00782
63 changed files with 17033 additions and 0 deletions
--- a/configs/rs8_frontier_h20_tp1_profile_full32k_coder500.json
+++ b/configs/rs8_frontier_h20_tp1_profile_full32k_coder500.json
@@ -0,0 +1,59 @@
+{
+  "suite_id": "rs8_frontier_h20_tp1_profile_full32k_coder500",
+  "sim": "frontier_h20_tp1_profile_full32k",
+  "frontier": {
+    "root": "/tmp/replayserve-frontier-rs1b",
+    "mode": "patched_scratch",
+    "patches": [
+      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
+      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
+    ],
+    "profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
+  },
+  "fixtures": [
+    "coder_500"
+  ],
+  "defaults": {
+    "simulation_mode": "online",
+    "sys_arch": "co-location",
+    "cluster_scheduler": "sticky_round_robin",
+    "replica_scheduler": "vllm_v1",
+    "model_name": "qwen3-a3b-30b-moe",
+    "device": "h20",
+    "network_device": "h20_dgx",
+    "attn_tensor_parallel_size": 1,
+    "attn_data_parallel_size": 1,
+    "moe_tensor_parallel_size": 1,
+    "moe_expert_parallel_size": 1,
+    "num_pipeline_stages": 1,
+    "num_replicas": 1,
+    "batch_size_cap": 64,
+    "max_tokens_in_batch": 32768,
+    "block_size": 16,
+    "enable_prefix_caching": true,
+    "enable_chunked_prefill": true,
+    "long_prefill_token_threshold": 32768,
+    "trace_max_tokens": 32768,
+    "num_blocks_mode": "explicit",
+    "num_blocks": 15281,
+    "gpu_memory_utilization": 0.85,
+    "non_kv_cache_overhead_bytes": 0,
+    "decode_cuda_graph_mode": "none",
+    "enable_dummy_mode": false,
+    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
+    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
+    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
+    "prediction_max_prefill_chunk_size": 18000,
+    "prediction_max_batch_size": 128,
+    "prediction_max_tokens_per_request": 32768,
+    "skip_cpu_overhead_modeling": true,
+    "no_cache": true,
+    "dummy_execution_time_ms": 1.0
+  },
+  "configs": [
+    {
+      "id": "vllm_kv_15281_profile_full32k",
+      "description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; 500-request replay stress test."
+    }
+  ]
+}