Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces.
# GLM-5 (zai-org/GLM-5) on 8 x B200 SXM (192GB each).
# Architecture from HuggingFace config.json — all roofline coefficients
# are derived automatically.

model:
  name: glm-5
  # Core architecture (from HF config.json)
  num_layers: 78
  hidden_size: 6144
  num_attention_heads: 64
  num_kv_heads: 64            # nominal; MLA overrides KV cache sizing
  head_dim: 64
  intermediate_size: 12288    # shared expert FFN width
  dtype_bytes: 2              # BF16
  block_size_tokens: 512      # matches bailian-traces blksz_512

  # MoE: 256 routed + 1 shared, 8 active per token
  moe:
    num_experts: 256
    num_active_experts: 8
    num_shared_experts: 1
    expert_intermediate_size: 2048  # moe_intermediate_size

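  # Back-of-envelope active FFN compute per token, assuming SwiGLU-style
  # gate/up/down projections (2 * 3 * hidden * width FLOPs each); the
  # simulator's roofline model may count this differently:
  #   routed: 8 * 2*3*6144*2048  ~= 0.60 GFLOPs/layer
  #   shared: 1 * 2*3*6144*12288 ~= 0.45 GFLOPs/layer
  #   total ~= 1.06 GFLOPs/layer * 78 layers ~= 82 GFLOPs per token (FFN only)
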
  # MLA (Multi-head Latent Attention): compressed KV cache
  mla:
    kv_lora_rank: 512
    q_lora_rank: 2048
    qk_nope_head_dim: 192
    qk_rope_head_dim: 64
    v_head_dim: 256

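  # Rough per-token KV footprint under MLA, assuming the cache holds the
  # compressed latent (kv_lora_rank) plus the decoupled RoPE key
  # (qk_rope_head_dim) per layer; exact sizing is derived by the simulator:
  #   (512 + 64) elems * 2 bytes * 78 layers ~= 90 KB per token
  #   ~= 46 MB per 512-token cache block
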
  # DSA (DeepSeek Sparse Attention): sub-quadratic past dense_window
  attention:
    type: dsa
    dense_window: 4096
    sparse_stride: 8
    first_dense_layers: 3

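  # One plausible reading of these knobs (the simulator's exact formula may
  # differ): layers after the first 3 attend densely to the most recent 4096
  # tokens and to every 8th token beyond that, so the effective attended
  # length for context L is roughly min(L, 4096) + max(0, L - 4096) / 8,
  # e.g. ~11.8K of 64K tokens.
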
hardware:
  # Aggregate of 8 x B200 in one tensor-parallel group.
  gpu_flops: 1.80e16        # 8 * 2.25 PFLOPS BF16 dense
  gpu_mem_bw: 6.40e13       # 8 * 8 TB/s HBM3e
  # KV budget after FP8 weights + activations. GLM-5 FP8 ~744GB of 1536GB.
  hbm_bytes: 500.0e9
  dram_bytes: 1.5e12        # ~1.5 TB usable CPU DRAM / v6d per node
  pcie_bw: 128.0e9          # PCIe Gen6 x16
  pcie_latency_us: 4.0
  rdma_bw: 50.0e9           # ConnectX-7 400 Gbps
  rdma_latency_us: 6.0
  max_batch_slots: 256
  prefill_chunk_tokens: 4096

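  # Capacity / transfer sanity checks using the ~90 KB/token MLA estimate
  # above (back-of-envelope assumptions, not simulator output):
  #   L0 (HBM):  500 GB / ~90 KB ~= 5.6M tokens  (~10.9K blocks of 512)
  #   L1 (DRAM): 1.5 TB / ~90 KB ~= 16.7M tokens
  #   one ~46 MB block: ~0.92 ms over 50 GB/s RDMA, ~0.36 ms over 128 GB/s PCIe
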
cluster:
  num_instances: 64
  meta_store:
    ttl_seconds: 300.0
  router:
    mode: min_pd
    precise_probe_latency_us: 50.0
    precise_probe_topk: 4
    load_alpha: 1.0

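  # mode accepts any of the 11 routing policies from the project description
  # (random, round_robin, least_loaded, least_tokens, ttl_aware, precise,
  # min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity); the
  # precise_probe_* knobs presumably take effect only under the precise policy.
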
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_8xb200_blk512
  sample_interval_s: 1.0
  seed: 42