feat: new router and benchmark setup

This commit is contained in:
2026-04-16 14:23:53 +08:00
parent c86d931d8f
commit 996511f300
35 changed files with 1480 additions and 76 deletions

View File

@@ -0,0 +1,35 @@
# GLM-5-FP8 on 8 x H20-141G for the 0-32k bucket.
# Chosen to keep the best policy's mean TTFT below 5s.
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 56
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_ablation_0_32768_n56
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,38 @@
# GLM-5-FP8 (ZhipuAI/GLM-5-FP8) on 8 x H20-141G (N3E).
# Tuned for the 0-32768 input-length slice of
# bailian-traces/glm_coder_blksz_512_040915-040917.jsonl.
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  # Tuned on this filtered GLM coder workload: stronger queue penalty than
  # the default 1.0 keeps cache_affinity's locality gains while reducing TTFT.
  load_alpha: 1.5
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_ca_tuned
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,35 @@
# GLM-5-FP8 on 8 x H20-141G, 0-32768 slice.
# Analysis config: medium L1 (~10% of the default DRAM KV budget).
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  # 10% of the 1.5e+12 default used by the baseline config.
  dram_bytes: 1.5e+11
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: min_pd
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_l1_medium
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,35 @@
# GLM-5-FP8 on 8 x H20-141G, 0-32768 slice.
# Analysis config: effectively disable L1/remote KV by shrinking DRAM to ~1 block.
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  # Deliberately ~zero: one byte of DRAM can hold no KV block, so the
  # L1/remote tier never admits anything.
  dram_bytes: 1.0
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: min_pd
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_l1_none
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,35 @@
# GLM-5-FP8 on 8 x H20-141G, 0-32768 slice.
# Analysis config: small L1 (~1% of the default DRAM KV budget).
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  # 1% of the 1.5e+12 default used by the baseline config.
  dram_bytes: 1.5e+10
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: min_pd
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_l1_small
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,39 @@
# GLM-5-FP8 (ZhipuAI/GLM-5-FP8) on 8 x H20-141G (N3E).
# Architecture auto-loaded from the upstream ModelScope config.json.
#
# 8 x 141 GB = 1128 GB total HBM. With ~744 GB FP8 weights resident,
# keep the KV budget conservative to leave room for scales, BF16 holdouts,
# allocator slack, and runtime activations.
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: min_pd
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,36 @@
# GLM-5-NVFP4 on 8 x B200 for the 32k-85k bucket.
# Chosen to keep the best policy's mean TTFT below 5s.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb200
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1150.0e9" as a string.
  hbm_bytes: 1150.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 5
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_8xb200_ablation_32769_87040_n5
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 32769
  input_length_max: 87040

View File

@@ -0,0 +1,36 @@
# GLM-5-NVFP4 (nvidia/GLM-5-NVFP4) on 8 x B200 (192GB each).
# Architecture auto-loaded from HuggingFace config.json.
#
# FP4 weights: ~744B params * 0.5 bytes = ~372 GB across 8 GPUs.
# Total HBM: 8 * 192 GB = 1536 GB. Keep the KV budget below the raw
# remainder to leave room for runtime activations and allocator slack.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb200
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1150.0e9" as a string.
  hbm_bytes: 1150.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 8
meta_store:
  ttl_seconds: 300.0
router:
  mode: prefix_affinity
  prefix_k: 8
  load_alpha: 1.0
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_8xb200
  sample_interval_s: 1.0
  seed: 42

View File

@@ -0,0 +1,37 @@
# GLM-5-NVFP4 on 8 x B300 for the 128k++ bucket.
# A single instance already keeps mean TTFT below 5s, and routing is
# effectively irrelevant at N=1 because every request lands on the same node.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 1
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  precise_probe_latency_us: 50.0
  # Only one instance exists, so probing more than one makes no sense.
  precise_probe_topk: 1
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_8xb300_ablation_131073_plus_n1
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 131073
  # uint32 max as an effectively unbounded upper edge for the "plus" bucket.
  input_length_max: 4294967295

View File

@@ -0,0 +1,36 @@
# GLM-5-NVFP4 on 8 x B300 for the 85k-128k bucket.
# Chosen to keep the best policy's mean TTFT below 5s.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 2
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity_strong_only
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_8xb300_ablation_87041_131072_n2
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 87041
  input_length_max: 131072

View File

@@ -7,7 +7,8 @@
model:
config_json: ../models/GLM-5-NVFP4/config.json
name: glm-5-nvfp4
compute_dtype: fp4 # FP4 weights → selects FP4 tensor core FLOPS
compute_dtype: fp8 # FP8 tensor-core execution
weight_dtype: fp4 # NVFP4 weights still set the HBM budget
dtype_bytes: 1 # FP8 KV cache
block_size_tokens: 512

View File

@@ -0,0 +1,32 @@
# GLM-5-NVFP4 on 8 x B200 for the 32k-85k bucket.
# NVFP4 weights, FP8 compute. Chosen to keep the best policy below 5 s TTFT.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb200
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1150.0e9" as a string.
  hbm_bytes: 1150.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 9
meta_store:
  ttl_seconds: 300.0
router:
  # NOTE(review): sibling min_pd configs also set precise_probe_latency_us,
  # precise_probe_topk, load_alpha and prefix_k; this one relies on the
  # simulator defaults — confirm those match the siblings' values.
  mode: min_pd
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb200_ablation_32769_87040_n9
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 32769
  input_length_max: 87040

View File

@@ -0,0 +1,32 @@
# GLM-5-NVFP4 on 8 x B200 with FP8 tensor-core compute.
# Weights remain stored in NVFP4, so HBM budget follows FP4 storage.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb200
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1150.0e9" as a string.
  hbm_bytes: 1150.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 8
meta_store:
  ttl_seconds: 300.0
router:
  mode: prefix_affinity
  prefix_k: 8
  load_alpha: 1.0
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb200
  sample_interval_s: 1.0
  seed: 42

View File

@@ -0,0 +1,33 @@
# GLM-5-NVFP4 on 8 x B300 for the 128k++ bucket.
# NVFP4 weights, FP8 compute. Routing is effectively irrelevant at one instance.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 1
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  # Only one instance exists, so probing more than one makes no sense.
  precise_probe_topk: 1
  # NOTE(review): sibling cache_affinity configs also set
  # precise_probe_latency_us, load_alpha and prefix_k; this one relies on
  # the simulator defaults — confirm those match the siblings' values.
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb300_ablation_131073_plus_n1
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 131073
  # uint32 max as an effectively unbounded upper edge for the "plus" bucket.
  input_length_max: 4294967295

View File

@@ -0,0 +1,32 @@
# GLM-5-NVFP4 on 8 x B300 for the 85k-128k bucket.
# NVFP4 weights, FP8 compute. Chosen to keep the best policy below 5 s TTFT.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 4
meta_store:
  ttl_seconds: 300.0
router:
  # NOTE(review): sibling cache_affinity configs also set
  # precise_probe_latency_us, precise_probe_topk, load_alpha and prefix_k;
  # this one relies on the simulator defaults — confirm they match.
  mode: cache_affinity
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb300_ablation_87041_131072_n4
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 87041
  input_length_max: 131072

View File

@@ -0,0 +1,32 @@
# GLM-5-NVFP4 on 8 x B300 with FP8 tensor-core compute.
# Weights remain stored in NVFP4, so HBM budget follows FP4 storage.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 8
meta_store:
  ttl_seconds: 300.0
router:
  mode: prefix_affinity
  prefix_k: 8
  load_alpha: 1.0
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb300
  sample_interval_s: 1.0
  seed: 42