feat: new router and benchmark setup

2026-04-16 14:23:53 +08:00
parent c86d931d8f
commit 996511f300
35 changed files with 1480 additions and 76 deletions
--- a/configs/glm5-fp8-8xh20-141g.yaml
+++ b/configs/glm5-fp8-8xh20-141g.yaml
@@ -0,0 +1,39 @@
+# GLM-5-FP8 (ZhipuAI/GLM-5-FP8) on 8 x H20-141G (N3E).
+# Architecture auto-loaded from the upstream ModelScope config.json.
+#
+# 8 x 141 GB = 1128 GB total HBM. With ~744 GB of FP8 weights resident,
+# ~384 GB remains; keep the KV budget conservatively below that to leave room
+# for scales, BF16 holdouts, allocator slack, and runtime activations.
+
+model:
+  name: 'glm-5-fp8'
+  config_json: '../models/GLM-5-FP8/config.json'  # architecture is loaded from this file
+  compute_dtype: 'fp8'
+  dtype_bytes: 1  # FP8 occupies one byte per element
+  block_size_tokens: 512  # same block size as the trace name's "blksz_512" -- TODO confirm they must match
+
+hardware:  # signed exponents (e+9) parse as floats under YAML 1.1 and 1.2 alike
+  type: 8xh20-141g
+  hbm_bytes: 300.0e+9  # 300 GB; presumably the KV budget, not the 1128 GB total HBM -- confirm
+  dram_bytes: 1.5e+12  # 1.5 TB
+  max_batch_slots: 256
+
+cluster:
+  num_instances: 64
+  router:
+    mode: 'min_pd'
+    prefix_k: 8
+    load_alpha: 1.0
+    precise_probe_topk: 4
+    precise_probe_latency_us: 50.0  # microseconds, per the _us suffix
+  meta_store:
+    ttl_seconds: 300.0  # i.e. five minutes
+
+sim:
+  seed: 42
+  trace_path: 'bailian-traces/glm_coder_blksz_512_040915-040917.jsonl'
+  max_requests: null  # explicit null; presumably means "no cap" -- confirm against the loader
+  input_length_min: 0
+  input_length_max: 32768
+  sample_interval_s: 1.0  # seconds, per the _s suffix
+  output_dir: 'runs/glm5_fp8_8xh20_141g'