---
# Qwen2.5-Coder-32B (dense, GQA) on H800 SXM (80GB).
# Architecture from HuggingFace config.json — roofline auto-derived.
#
# NOTE: scientific-notation values below carry an explicit exponent sign
# (e+NN). YAML 1.1 loaders such as PyYAML only resolve a float when the
# exponent is signed; `9.89e14` would otherwise load as a string.

# Model architecture.
model:
  name: qwen2.5-coder-32b
  num_layers: 64
  hidden_size: 5120
  num_attention_heads: 40
  num_kv_heads: 8  # GQA: 40 query heads over 8 KV heads
  head_dim: 128  # hidden_size / num_attention_heads = 5120 / 40
  intermediate_size: 27648  # SwiGLU FFN
  dtype_bytes: 2  # BF16
  block_size_tokens: 16

# Per-instance hardware parameters.
# NOTE(review): units are presumably FLOP/s, bytes/s, bytes and
# microseconds (per the key suffixes) — confirm against the consumer.
hardware:
  gpu_flops: 9.89e+14
  gpu_mem_bw: 3.35e+12
  hbm_bytes: 20.0e+9  # smaller budget: 32B weights are large
  dram_bytes: 512.0e+9
  pcie_bw: 64.0e+9
  pcie_latency_us: 5.0
  rdma_bw: 25.0e+9
  rdma_latency_us: 8.0
  max_batch_slots: 128
  prefill_chunk_tokens: 1024

cluster:
  num_instances: 16

meta_store:
  ttl_seconds: 60.0

router:
  mode: ttl_aware
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0

# Simulation run settings.
sim:
  # Trace file name carries the same block size as model.block_size_tokens.
  trace_path: traces/qwen_coder_blksz_16.jsonl
  max_requests: null  # presumably "no cap" — verify against the loader
  output_dir: runs/qwen32b
  sample_interval_s: 1.0
  seed: 42