# GLM-5 (zai-org/GLM-5) on 8 x B200 SXM (192 GB each).
# Architecture from the HuggingFace config.json; all roofline coefficients
# are derived automatically.
# Reference arithmetic for the derived quantities is sketched in the
# comments at the end of this file.

model:
  name: glm-5

  # Core architecture (from HF config.json)
  num_layers: 78
  hidden_size: 6144
  num_attention_heads: 64
  num_kv_heads: 64          # nominal only; MLA overrides KV-cache sizing
  head_dim: 64
  intermediate_size: 12288  # shared-expert FFN width
  dtype_bytes: 2            # BF16
  block_size_tokens: 512    # matches bailian-traces blksz_512

  # MoE: 256 routed experts + 1 shared, 8 active per token
  moe:
    num_experts: 256
    num_active_experts: 8
    num_shared_experts: 1
    expert_intermediate_size: 2048  # moe_intermediate_size

  # MLA (Multi-head Latent Attention): compressed KV cache
  mla:
    kv_lora_rank: 512
    q_lora_rank: 2048
    qk_nope_head_dim: 192
    qk_rope_head_dim: 64
    v_head_dim: 256

  # DSA (DeepSeek Sparse Attention): sub-quadratic past dense_window
  attention:
    type: dsa
    dense_window: 4096
    sparse_stride: 8
    first_dense_layers: 3

hardware:
  # Aggregate of 8 x B200 in one tensor-parallel group.
  gpu_flops: 1.80e16    # 8 * 2.25 PFLOPS BF16 dense
  gpu_mem_bw: 6.40e13   # 8 * 8 TB/s HBM3e

  # KV-cache budget after FP8 weights + activations:
  # GLM-5 in FP8 takes ~744 GB of the 1,536 GB total.
  hbm_bytes: 500.0e9
  dram_bytes: 1.5e12    # ~1.5 TB usable CPU DRAM / v6d per node
  pcie_bw: 128.0e9      # PCIe Gen6 x16
  pcie_latency_us: 4.0
  rdma_bw: 50.0e9       # ConnectX-7, 400 Gbps
  rdma_latency_us: 6.0
  max_batch_slots: 256
  prefill_chunk_tokens: 4096

cluster:
  num_instances: 64
  meta_store:
    ttl_seconds: 300.0
  router:
    mode: min_pd
    precise_probe_latency_us: 50.0
    precise_probe_topk: 4
    load_alpha: 1.0

sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_8xb200_blk512
  sample_interval_s: 1.0
  seed: 42
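
# ---------------------------------------------------------------------------
# Derived quantities (comments only, not parsed by the simulator). A sketch
# of how the coefficients above combine; the formulas are assumptions about
# the model, not simulator internals.
#
# Active FFN parameters per layer per token, assuming gated (SwiGLU-style)
# FFNs with three weight matrices each:
#   routed: 8 * 3 * 6144 * 2048  = 301,989,888 (~302M)
#   shared: 1 * 3 * 6144 * 12288 = 226,492,416 (~226M)
#   total ~528M params/layer -> ~2 * 528M = ~1.06 GFLOP per layer per token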
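#
# MLA KV-cache bytes per token, assuming the usual compressed layout of one
# (kv_lora_rank + qk_rope_head_dim) latent vector per layer, kept at
# dtype_bytes = 2 (BF16):
#   (512 + 64) * 2 * 78 layers = 89,856 B/token (~87.8 KiB)
#   KV capacity: 500e9 / 89,856 = ~5.56M tokens = ~10,868 blocks of 512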
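#
# DSA attended tokens at context length L, assuming sparse_stride means
# "attend to every 8th token past the dense window" (the first 3 layers
# stay fully dense per first_dense_layers):
#   attended(L) = min(L, 4096) + max(0, L - 4096) / 8
#   e.g. L = 131,072: 4096 + 126,976 / 8 = 19,968 tokens (~6.6x fewer)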
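#
# HBM budget check: 8 * 192 GB = 1,536 GB total; ~744 GB of FP8 weights
# leave ~792 GB, of which 500 GB is reserved above for KV cache and the
# remainder covers activations and fragmentation.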
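#
# Cost of migrating one 512-token KV block, using the per-token KV size
# derived above (512 * 89,856 B = ~46.0 MB):
#   PCIe Gen6: 46.0e6 / 128e9 + 4 us  = ~0.36 ms
#   RDMA 400G: 46.0e6 / 50e9  + 6 us  = ~0.93 ms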