{
  "suite_id": "rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667",
  "sim": "frontier_h20_tp1_profile_full32k",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patches": [
      "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch",
      "patches/frontier-vllm-0.11.1-profiling-compat.patch"
    ],
    "profile_source": "dash2:/home/admin/cpfs/wjh/replayserve_frontier_profiles/h20_tp1_qwen3_30ba3b_full32k_20260624"
  },
  "fixtures": [
    "coder_200_ts0667"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "qwen3-a3b-30b-moe",
    "device": "h20",
    "network_device": "h20_dgx",
    "attn_tensor_parallel_size": 1,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 64,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 32768,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "explicit",
    "num_blocks": 15281,
    "gpu_memory_utilization": 0.85,
    "non_kv_cache_overhead_bytes": 0,
    "decode_cuda_graph_mode": "none",
    "enable_dummy_mode": false,
    "linear_op_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/linear_op_full32k.csv",
    "atten_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/attention_combined.csv",
    "moe_input_file": "./data/profiling/compute/h20/qwen3-a3b-30b-moe/moe_full32k.csv",
    "prediction_max_prefill_chunk_size": 18000,
    "prediction_max_batch_size": 128,
    "prediction_max_tokens_per_request": 32768,
    "skip_cpu_overhead_modeling": true,
    "no_cache": true,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "vllm_kv_15281_profile_full32k",
      "description": "H20 TP1 Qwen3-30B-A3B with explicit vLLM TP1 KV blocks and H20 CUDA_EVENT profile timing; 200-request replay with timestamps scaled by 2/3."
    }
  ]
}