{
  "suite_id": "rs5_frontier_h20_tp1_profile",
  "sim": "frontier_h20_tp1_profile",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patches": [
      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
    ],
    "profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_20260624"
  },
  "fixtures": [
    "coder_100"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 1,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "explicit",
    "num_blocks": 15281,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "decode_cuda_graph_mode": "none",
    "enable_dummy_mode": false,
    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op.csv",
    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_vllm_fused.csv",
    "prediction_max_prefill_chunk_size": 18000,
    "prediction_max_batch_size": 128,
    "prediction_max_tokens_per_request": 32768,
    "skip_cpu_overhead_modeling": true,
    "no_cache": true,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "vllm_kv_15281_profile",
      "description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing."
    }
  ]
}