---
# Qwen2.5-Coder-7B (dense, GQA) on a single H800 SXM (80GB).
# Architecture from HuggingFace config.json — roofline auto-derived.
#
# NOTE(review): the source file was collapsed onto a single line; the block
# structure below is reconstructed. `meta_store` and `router` are assumed to
# be top-level siblings of `cluster` — confirm against the loader's schema.
#
# Float exponents carry an explicit sign ("e+NN") so YAML 1.1 resolvers
# (e.g. PyYAML's default loader) type them as floats; an unsigned exponent
# like `9.89e14` fails PyYAML's float regex and loads as a *string*.

model:
  name: qwen2.5-coder-7b
  num_layers: 28
  hidden_size: 3584
  num_attention_heads: 28
  num_kv_heads: 4  # GQA: 28 query heads, 4 KV heads
  head_dim: 128
  intermediate_size: 18944  # SwiGLU FFN
  dtype_bytes: 2  # BF16
  block_size_tokens: 16  # matches qwen_coder_blksz_16 trace

hardware:
  gpu_flops: 9.89e+14  # H800 bf16 dense
  gpu_mem_bw: 3.35e+12  # 3.35 TB/s HBM3
  hbm_bytes: 60.0e+9  # leave headroom for weights/activations
  dram_bytes: 512.0e+9
  pcie_bw: 64.0e+9  # PCIe Gen5 x16
  pcie_latency_us: 5.0
  rdma_bw: 25.0e+9  # ~200 Gbps NIC
  rdma_latency_us: 8.0
  max_batch_slots: 256
  prefill_chunk_tokens: 2048

cluster:
  num_instances: 16

meta_store:
  ttl_seconds: 60.0

router:
  mode: ttl_aware
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0

sim:
  trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl
  max_requests: null
  output_dir: runs/qwen7b
  sample_interval_s: 1.0
  seed: 42