Add ReplayServe Frontier vLLM alignment report
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
{
|
||||
"suite_id": "rs10_frontier_h20_tp1_profile_full32k_coder200_ts2_ts3",
|
||||
"sim": "frontier_h20_tp1_profile_full32k",
|
||||
"frontier": {
|
||||
"root": "/tmp/replayserve-frontier-rs1b",
|
||||
"mode": "patched_scratch",
|
||||
"patches": [
|
||||
"patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
|
||||
"patches/frontier-vllm-0.11.1-profiling-compat.patch"
|
||||
],
|
||||
"profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
|
||||
},
|
||||
"fixtures": [
|
||||
"coder_200_ts2",
|
||||
"coder_200_ts3"
|
||||
],
|
||||
"defaults": {
|
||||
"simulation_mode": "online",
|
||||
"sys_arch": "co-location",
|
||||
"cluster_scheduler": "sticky_round_robin",
|
||||
"replica_scheduler": "vllm_v1",
|
||||
"model_name": "qwen3-a3b-30b-moe",
|
||||
"device": "h20",
|
||||
"network_device": "h20_dgx",
|
||||
"attn_tensor_parallel_size": 1,
|
||||
"attn_data_parallel_size": 1,
|
||||
"moe_tensor_parallel_size": 1,
|
||||
"moe_expert_parallel_size": 1,
|
||||
"num_pipeline_stages": 1,
|
||||
"num_replicas": 1,
|
||||
"batch_size_cap": 64,
|
||||
"max_tokens_in_batch": 32768,
|
||||
"block_size": 16,
|
||||
"enable_prefix_caching": true,
|
||||
"enable_chunked_prefill": true,
|
||||
"long_prefill_token_threshold": 32768,
|
||||
"trace_max_tokens": 32768,
|
||||
"num_blocks_mode": "explicit",
|
||||
"num_blocks": 15281,
|
||||
"gpu_memory_utilization": 0.85,
|
||||
"non_kv_cache_overhead_bytes": 0,
|
||||
"decode_cuda_graph_mode": "none",
|
||||
"enable_dummy_mode": false,
|
||||
"linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
|
||||
"atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
|
||||
"moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
|
||||
"prediction_max_prefill_chunk_size": 18000,
|
||||
"prediction_max_batch_size": 128,
|
||||
"prediction_max_tokens_per_request": 32768,
|
||||
"skip_cpu_overhead_modeling": true,
|
||||
"no_cache": true,
|
||||
"dummy_execution_time_ms": 1.0
|
||||
},
|
||||
"configs": [
|
||||
{
|
||||
"id": "vllm_kv_15281_profile_full32k",
|
||||
"description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; 200-request replay with timestamps scaled by 2 and 3."
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
{
|
||||
"suite_id": "rs11_frontier_h20_tp2_profile_full32k_coder200_ts2_ts3",
|
||||
"sim": "frontier_h20_tp2_profile_full32k",
|
||||
"frontier": {
|
||||
"root": "/tmp/replayserve-frontier-rs1b",
|
||||
"mode": "patched_scratch",
|
||||
"patches": [
|
||||
"patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
|
||||
"patches/frontier-vllm-0.11.1-profiling-compat.patch"
|
||||
],
|
||||
"profile_source": "dash1:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp2_tp4_qwen3_30ba3b_full32k_20260625",
|
||||
"profile_note": "Timing rows include H20 TP2 and TP4 for attention/MoE and TP1/TP2/TP4 for linear ops; TP2 runs use explicit TP2 vLLM KV capacity."
|
||||
},
|
||||
"fixtures": [
|
||||
"coder_200_ts2",
|
||||
"coder_200_ts3"
|
||||
],
|
||||
"defaults": {
|
||||
"simulation_mode": "online",
|
||||
"sys_arch": "co-location",
|
||||
"cluster_scheduler": "sticky_round_robin",
|
||||
"replica_scheduler": "vllm_v1",
|
||||
"model_name": "qwen3-a3b-30b-moe",
|
||||
"device": "h20",
|
||||
"network_device": "h20_dgx",
|
||||
"attn_tensor_parallel_size": 2,
|
||||
"attn_data_parallel_size": 1,
|
||||
"moe_tensor_parallel_size": 2,
|
||||
"moe_expert_parallel_size": 1,
|
||||
"num_pipeline_stages": 1,
|
||||
"num_replicas": 1,
|
||||
"batch_size_cap": 64,
|
||||
"max_tokens_in_batch": 32768,
|
||||
"block_size": 16,
|
||||
"enable_prefix_caching": true,
|
||||
"enable_chunked_prefill": true,
|
||||
"long_prefill_token_threshold": 32768,
|
||||
"trace_max_tokens": 32768,
|
||||
"num_blocks_mode": "explicit",
|
||||
"num_blocks": 69055,
|
||||
"gpu_memory_utilization": 0.85,
|
||||
"non_kv_cache_overhead_bytes": 0,
|
||||
"decode_cuda_graph_mode": "none",
|
||||
"enable_dummy_mode": false,
|
||||
"linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_tp2_tp4_full32k.csv",
|
||||
"atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_tp2_tp4_combined.csv",
|
||||
"moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_tp2_tp4_full32k.csv",
|
||||
"prediction_max_prefill_chunk_size": 18000,
|
||||
"prediction_max_batch_size": 128,
|
||||
"prediction_max_tokens_per_request": 32768,
|
||||
"skip_cpu_overhead_modeling": true,
|
||||
"no_cache": true,
|
||||
"dummy_execution_time_ms": 1.0
|
||||
},
|
||||
"configs": [
|
||||
{
|
||||
"id": "vllm_kv_69055_profile_full32k",
|
||||
"description": "H20 TP2 Qwen3-30B-A3B with explicit vLLM TP2 KV blocks and H20 TP2 CUDA_EVENT profile timing."
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
{
|
||||
"suite_id": "rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3",
|
||||
"sim": "frontier_h20_tp2_tp4_profile_full32k",
|
||||
"frontier": {
|
||||
"root": "/tmp/replayserve-frontier-rs1b",
|
||||
"mode": "patched_scratch",
|
||||
"patches": [
|
||||
"patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
|
||||
"patches/frontier-vllm-0.11.1-profiling-compat.patch"
|
||||
],
|
||||
"profile_source": "dash1:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp2_tp4_qwen3_30ba3b_full32k_20260625_true_mixed"
|
||||
},
|
||||
"fixtures": [
|
||||
"coder_200_ts2",
|
||||
"coder_200_ts3"
|
||||
],
|
||||
"defaults": {
|
||||
"simulation_mode": "online",
|
||||
"sys_arch": "co-location",
|
||||
"cluster_scheduler": "sticky_round_robin",
|
||||
"replica_scheduler": "vllm_v1",
|
||||
"model_name": "qwen3-a3b-30b-moe",
|
||||
"device": "h20",
|
||||
"network_device": "h20_dgx",
|
||||
"attn_tensor_parallel_size": 1,
|
||||
"attn_data_parallel_size": 1,
|
||||
"moe_tensor_parallel_size": 1,
|
||||
"moe_expert_parallel_size": 1,
|
||||
"num_pipeline_stages": 1,
|
||||
"num_replicas": 1,
|
||||
"batch_size_cap": 64,
|
||||
"max_tokens_in_batch": 32768,
|
||||
"block_size": 16,
|
||||
"enable_prefix_caching": true,
|
||||
"enable_chunked_prefill": true,
|
||||
"long_prefill_token_threshold": 32768,
|
||||
"trace_max_tokens": 32768,
|
||||
"num_blocks_mode": "explicit",
|
||||
"num_blocks": 0,
|
||||
"gpu_memory_utilization": 0.85,
|
||||
"non_kv_cache_overhead_bytes": 0,
|
||||
"decode_cuda_graph_mode": "none",
|
||||
"enable_dummy_mode": false,
|
||||
"linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_tp2_tp4_full32k.csv",
|
||||
"atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_tp2_tp4_combined.csv",
|
||||
"moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_tp2_tp4_full32k.csv",
|
||||
"prediction_max_prefill_chunk_size": 18000,
|
||||
"prediction_max_batch_size": 128,
|
||||
"prediction_max_tokens_per_request": 32768,
|
||||
"skip_cpu_overhead_modeling": true,
|
||||
"no_cache": false,
|
||||
"dummy_execution_time_ms": 1.0
|
||||
},
|
||||
"configs": [
|
||||
{
|
||||
"id": "tp2_vllm_kv_69055_profile_full32k",
|
||||
"description": "H20 TP2 Qwen3-30B-A3B with explicit vLLM TP2 KV blocks and H20 TP2 CUDA_EVENT profile timing.",
|
||||
"overrides": {
|
||||
"attn_tensor_parallel_size": 2,
|
||||
"moe_tensor_parallel_size": 2,
|
||||
"num_blocks": 69055
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "tp4_vllm_kv_177077_profile_full32k",
|
||||
"description": "H20 TP4 Qwen3-30B-A3B with explicit vLLM TP4 KV blocks and H20 TP4 CUDA_EVENT profile timing.",
|
||||
"overrides": {
|
||||
"attn_tensor_parallel_size": 4,
|
||||
"moe_tensor_parallel_size": 4,
|
||||
"num_blocks": 177077
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
51
configs/rs3_tiny_sweep.json
Normal file
51
configs/rs3_tiny_sweep.json
Normal file
@@ -0,0 +1,51 @@
|
||||
{
|
||||
"suite_id": "rs3_tiny_smoke",
|
||||
"sim": "frontier_patched",
|
||||
"frontier": {
|
||||
"root": "/tmp/replayserve-frontier-rs1b",
|
||||
"mode": "patched_scratch",
|
||||
"patch": "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch"
|
||||
},
|
||||
"fixtures": [
|
||||
"coder_100"
|
||||
],
|
||||
"defaults": {
|
||||
"simulation_mode": "online",
|
||||
"sys_arch": "co-location",
|
||||
"cluster_scheduler": "sticky_round_robin",
|
||||
"replica_scheduler": "vllm_v1",
|
||||
"model_name": "Qwen/Qwen3-32B",
|
||||
"device": "a800",
|
||||
"network_device": "a800_dgx",
|
||||
"attn_tensor_parallel_size": 2,
|
||||
"attn_data_parallel_size": 1,
|
||||
"moe_tensor_parallel_size": 1,
|
||||
"moe_expert_parallel_size": 1,
|
||||
"num_pipeline_stages": 1,
|
||||
"num_replicas": 1,
|
||||
"batch_size_cap": 128,
|
||||
"max_tokens_in_batch": 32768,
|
||||
"block_size": 16,
|
||||
"enable_prefix_caching": true,
|
||||
"enable_chunked_prefill": true,
|
||||
"long_prefill_token_threshold": 64,
|
||||
"trace_max_tokens": 32768,
|
||||
"num_blocks_mode": "memory_planner",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"non_kv_cache_overhead_bytes": 0,
|
||||
"dummy_execution_time_ms": 1.0
|
||||
},
|
||||
"configs": [
|
||||
{
|
||||
"id": "fixed_prefix_on",
|
||||
"description": "RS1 fixed config on patched Frontier scratch."
|
||||
},
|
||||
{
|
||||
"id": "prefix_cache_off",
|
||||
"description": "Diagnosis/control config with prefix cache disabled; all other fixed scheduler knobs unchanged.",
|
||||
"overrides": {
|
||||
"enable_prefix_caching": false
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
53
configs/rs4_frontier_h20_tp1.json
Normal file
53
configs/rs4_frontier_h20_tp1.json
Normal file
@@ -0,0 +1,53 @@
|
||||
{
|
||||
"suite_id": "rs4_frontier_h20_tp1",
|
||||
"sim": "frontier_h20_tp1",
|
||||
"frontier": {
|
||||
"root": "/tmp/replayserve-frontier-rs1b",
|
||||
"mode": "patched_scratch",
|
||||
"patch": "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch"
|
||||
},
|
||||
"fixtures": [
|
||||
"coder_100"
|
||||
],
|
||||
"defaults": {
|
||||
"simulation_mode": "online",
|
||||
"sys_arch": "co-location",
|
||||
"cluster_scheduler": "sticky_round_robin",
|
||||
"replica_scheduler": "vllm_v1",
|
||||
"model_name": "qwen3-a3b-30b-moe",
|
||||
"device": "h20",
|
||||
"network_device": "h20_dgx",
|
||||
"attn_tensor_parallel_size": 1,
|
||||
"attn_data_parallel_size": 1,
|
||||
"moe_tensor_parallel_size": 1,
|
||||
"moe_expert_parallel_size": 1,
|
||||
"num_pipeline_stages": 1,
|
||||
"num_replicas": 1,
|
||||
"batch_size_cap": 64,
|
||||
"max_tokens_in_batch": 32768,
|
||||
"block_size": 16,
|
||||
"enable_prefix_caching": true,
|
||||
"enable_chunked_prefill": true,
|
||||
"long_prefill_token_threshold": 32768,
|
||||
"trace_max_tokens": 32768,
|
||||
"num_blocks_mode": "memory_planner",
|
||||
"num_blocks": null,
|
||||
"gpu_memory_utilization": 0.85,
|
||||
"non_kv_cache_overhead_bytes": 0,
|
||||
"dummy_execution_time_ms": 1.0
|
||||
},
|
||||
"configs": [
|
||||
{
|
||||
"id": "planner_kv",
|
||||
"description": "H20 TP1 Qwen3-30B-A3B with Frontier memory planner KV capacity."
|
||||
},
|
||||
{
|
||||
"id": "vllm_kv_15281",
|
||||
"description": "H20 TP1 Qwen3-30B-A3B with explicit KV blocks matching real vLLM TP1 on dash2.",
|
||||
"overrides": {
|
||||
"num_blocks_mode": "explicit",
|
||||
"num_blocks": 15281
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
59
configs/rs5_frontier_h20_tp1_profile.json
Normal file
59
configs/rs5_frontier_h20_tp1_profile.json
Normal file
@@ -0,0 +1,59 @@
|
||||
{
|
||||
"suite_id": "rs5_frontier_h20_tp1_profile",
|
||||
"sim": "frontier_h20_tp1_profile",
|
||||
"frontier": {
|
||||
"root": "/tmp/replayserve-frontier-rs1b",
|
||||
"mode": "patched_scratch",
|
||||
"patches": [
|
||||
"patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
|
||||
"patches/frontier-vllm-0.11.1-profiling-compat.patch"
|
||||
],
|
||||
"profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_20260624"
|
||||
},
|
||||
"fixtures": [
|
||||
"coder_100"
|
||||
],
|
||||
"defaults": {
|
||||
"simulation_mode": "online",
|
||||
"sys_arch": "co-location",
|
||||
"cluster_scheduler": "sticky_round_robin",
|
||||
"replica_scheduler": "vllm_v1",
|
||||
"model_name": "qwen3-a3b-30b-moe",
|
||||
"device": "h20",
|
||||
"network_device": "h20_dgx",
|
||||
"attn_tensor_parallel_size": 1,
|
||||
"attn_data_parallel_size": 1,
|
||||
"moe_tensor_parallel_size": 1,
|
||||
"moe_expert_parallel_size": 1,
|
||||
"num_pipeline_stages": 1,
|
||||
"num_replicas": 1,
|
||||
"batch_size_cap": 64,
|
||||
"max_tokens_in_batch": 32768,
|
||||
"block_size": 16,
|
||||
"enable_prefix_caching": true,
|
||||
"enable_chunked_prefill": true,
|
||||
"long_prefill_token_threshold": 32768,
|
||||
"trace_max_tokens": 32768,
|
||||
"num_blocks_mode": "explicit",
|
||||
"num_blocks": 15281,
|
||||
"gpu_memory_utilization": 0.85,
|
||||
"non_kv_cache_overhead_bytes": 0,
|
||||
"decode_cuda_graph_mode": "none",
|
||||
"enable_dummy_mode": false,
|
||||
"linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op.csv",
|
||||
"atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
|
||||
"moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_vllm_fused.csv",
|
||||
"prediction_max_prefill_chunk_size": 18000,
|
||||
"prediction_max_batch_size": 128,
|
||||
"prediction_max_tokens_per_request": 32768,
|
||||
"skip_cpu_overhead_modeling": true,
|
||||
"no_cache": true,
|
||||
"dummy_execution_time_ms": 1.0
|
||||
},
|
||||
"configs": [
|
||||
{
|
||||
"id": "vllm_kv_15281_profile",
|
||||
"description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing."
|
||||
}
|
||||
]
|
||||
}
|
||||
59
configs/rs6_frontier_h20_tp1_profile_full32k.json
Normal file
59
configs/rs6_frontier_h20_tp1_profile_full32k.json
Normal file
@@ -0,0 +1,59 @@
|
||||
{
|
||||
"suite_id": "rs6_frontier_h20_tp1_profile_full32k",
|
||||
"sim": "frontier_h20_tp1_profile_full32k",
|
||||
"frontier": {
|
||||
"root": "/tmp/replayserve-frontier-rs1b",
|
||||
"mode": "patched_scratch",
|
||||
"patches": [
|
||||
"patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
|
||||
"patches/frontier-vllm-0.11.1-profiling-compat.patch"
|
||||
],
|
||||
"profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
|
||||
},
|
||||
"fixtures": [
|
||||
"coder_100"
|
||||
],
|
||||
"defaults": {
|
||||
"simulation_mode": "online",
|
||||
"sys_arch": "co-location",
|
||||
"cluster_scheduler": "sticky_round_robin",
|
||||
"replica_scheduler": "vllm_v1",
|
||||
"model_name": "qwen3-a3b-30b-moe",
|
||||
"device": "h20",
|
||||
"network_device": "h20_dgx",
|
||||
"attn_tensor_parallel_size": 1,
|
||||
"attn_data_parallel_size": 1,
|
||||
"moe_tensor_parallel_size": 1,
|
||||
"moe_expert_parallel_size": 1,
|
||||
"num_pipeline_stages": 1,
|
||||
"num_replicas": 1,
|
||||
"batch_size_cap": 64,
|
||||
"max_tokens_in_batch": 32768,
|
||||
"block_size": 16,
|
||||
"enable_prefix_caching": true,
|
||||
"enable_chunked_prefill": true,
|
||||
"long_prefill_token_threshold": 32768,
|
||||
"trace_max_tokens": 32768,
|
||||
"num_blocks_mode": "explicit",
|
||||
"num_blocks": 15281,
|
||||
"gpu_memory_utilization": 0.85,
|
||||
"non_kv_cache_overhead_bytes": 0,
|
||||
"decode_cuda_graph_mode": "none",
|
||||
"enable_dummy_mode": false,
|
||||
"linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
|
||||
"atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
|
||||
"moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
|
||||
"prediction_max_prefill_chunk_size": 18000,
|
||||
"prediction_max_batch_size": 128,
|
||||
"prediction_max_tokens_per_request": 32768,
|
||||
"skip_cpu_overhead_modeling": true,
|
||||
"no_cache": true,
|
||||
"dummy_execution_time_ms": 1.0
|
||||
},
|
||||
"configs": [
|
||||
{
|
||||
"id": "vllm_kv_15281_profile_full32k",
|
||||
"description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; linear and MoE coverage extended to 32768 tokens."
|
||||
}
|
||||
]
|
||||
}
|
||||
59
configs/rs8_frontier_h20_tp1_profile_full32k_coder500.json
Normal file
59
configs/rs8_frontier_h20_tp1_profile_full32k_coder500.json
Normal file
@@ -0,0 +1,59 @@
|
||||
{
|
||||
"suite_id": "rs8_frontier_h20_tp1_profile_full32k_coder500",
|
||||
"sim": "frontier_h20_tp1_profile_full32k",
|
||||
"frontier": {
|
||||
"root": "/tmp/replayserve-frontier-rs1b",
|
||||
"mode": "patched_scratch",
|
||||
"patches": [
|
||||
"patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
|
||||
"patches/frontier-vllm-0.11.1-profiling-compat.patch"
|
||||
],
|
||||
"profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
|
||||
},
|
||||
"fixtures": [
|
||||
"coder_500"
|
||||
],
|
||||
"defaults": {
|
||||
"simulation_mode": "online",
|
||||
"sys_arch": "co-location",
|
||||
"cluster_scheduler": "sticky_round_robin",
|
||||
"replica_scheduler": "vllm_v1",
|
||||
"model_name": "qwen3-a3b-30b-moe",
|
||||
"device": "h20",
|
||||
"network_device": "h20_dgx",
|
||||
"attn_tensor_parallel_size": 1,
|
||||
"attn_data_parallel_size": 1,
|
||||
"moe_tensor_parallel_size": 1,
|
||||
"moe_expert_parallel_size": 1,
|
||||
"num_pipeline_stages": 1,
|
||||
"num_replicas": 1,
|
||||
"batch_size_cap": 64,
|
||||
"max_tokens_in_batch": 32768,
|
||||
"block_size": 16,
|
||||
"enable_prefix_caching": true,
|
||||
"enable_chunked_prefill": true,
|
||||
"long_prefill_token_threshold": 32768,
|
||||
"trace_max_tokens": 32768,
|
||||
"num_blocks_mode": "explicit",
|
||||
"num_blocks": 15281,
|
||||
"gpu_memory_utilization": 0.85,
|
||||
"non_kv_cache_overhead_bytes": 0,
|
||||
"decode_cuda_graph_mode": "none",
|
||||
"enable_dummy_mode": false,
|
||||
"linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
|
||||
"atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
|
||||
"moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
|
||||
"prediction_max_prefill_chunk_size": 18000,
|
||||
"prediction_max_batch_size": 128,
|
||||
"prediction_max_tokens_per_request": 32768,
|
||||
"skip_cpu_overhead_modeling": true,
|
||||
"no_cache": true,
|
||||
"dummy_execution_time_ms": 1.0
|
||||
},
|
||||
"configs": [
|
||||
{
|
||||
"id": "vllm_kv_15281_profile_full32k",
|
||||
"description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; 500-request replay stress."
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
{
|
||||
"suite_id": "rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667",
|
||||
"sim": "frontier_h20_tp1_profile_full32k",
|
||||
"frontier": {
|
||||
"root": "/tmp/replayserve-frontier-rs1b",
|
||||
"mode": "patched_scratch",
|
||||
"patches": [
|
||||
"patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
|
||||
"patches/frontier-vllm-0.11.1-profiling-compat.patch"
|
||||
],
|
||||
"profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
|
||||
},
|
||||
"fixtures": [
|
||||
"coder_200_ts0667"
|
||||
],
|
||||
"defaults": {
|
||||
"simulation_mode": "online",
|
||||
"sys_arch": "co-location",
|
||||
"cluster_scheduler": "sticky_round_robin",
|
||||
"replica_scheduler": "vllm_v1",
|
||||
"model_name": "qwen3-a3b-30b-moe",
|
||||
"device": "h20",
|
||||
"network_device": "h20_dgx",
|
||||
"attn_tensor_parallel_size": 1,
|
||||
"attn_data_parallel_size": 1,
|
||||
"moe_tensor_parallel_size": 1,
|
||||
"moe_expert_parallel_size": 1,
|
||||
"num_pipeline_stages": 1,
|
||||
"num_replicas": 1,
|
||||
"batch_size_cap": 64,
|
||||
"max_tokens_in_batch": 32768,
|
||||
"block_size": 16,
|
||||
"enable_prefix_caching": true,
|
||||
"enable_chunked_prefill": true,
|
||||
"long_prefill_token_threshold": 32768,
|
||||
"trace_max_tokens": 32768,
|
||||
"num_blocks_mode": "explicit",
|
||||
"num_blocks": 15281,
|
||||
"gpu_memory_utilization": 0.85,
|
||||
"non_kv_cache_overhead_bytes": 0,
|
||||
"decode_cuda_graph_mode": "none",
|
||||
"enable_dummy_mode": false,
|
||||
"linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
|
||||
"atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
|
||||
"moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
|
||||
"prediction_max_prefill_chunk_size": 18000,
|
||||
"prediction_max_batch_size": 128,
|
||||
"prediction_max_tokens_per_request": 32768,
|
||||
"skip_cpu_overhead_modeling": true,
|
||||
"no_cache": true,
|
||||
"dummy_execution_time_ms": 1.0
|
||||
},
|
||||
"configs": [
|
||||
{
|
||||
"id": "vllm_kv_15281_profile_full32k",
|
||||
"description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; 200-request replay with timestamps scaled by 2/3."
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user