feat: new router and benchmark setup

This commit is contained in:
2026-04-16 14:23:53 +08:00
parent c86d931d8f
commit 996511f300
35 changed files with 1480 additions and 76 deletions

View File

@@ -0,0 +1,35 @@
# GLM-5-FP8 on 8 x H20-141G for the 0-32k bucket.
# Chosen to keep the best policy's mean TTFT below 5s.
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 56
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_ablation_0_32768_n56
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,38 @@
# GLM-5-FP8 (ZhipuAI/GLM-5-FP8) on 8 x H20-141G (N3E).
# Tuned for the 0-32768 input-length slice of
# bailian-traces/glm_coder_blksz_512_040915-040917.jsonl.
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  # Tuned on this filtered GLM coder workload: stronger queue penalty than
  # the default 1.0 keeps cache_affinity's locality gains while reducing TTFT.
  load_alpha: 1.5
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_ca_tuned
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,35 @@
# GLM-5-FP8 on 8 x H20-141G, 0-32768 slice.
# Analysis config: medium L1 (~10% of the default DRAM KV budget).
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  # 10% of the 1.5e+12 default used by the baseline config.
  dram_bytes: 1.5e+11
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: min_pd
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_l1_medium
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,35 @@
# GLM-5-FP8 on 8 x H20-141G, 0-32768 slice.
# Analysis config: effectively disable L1/remote KV by shrinking DRAM to ~1 block.
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  # Deliberately ~zero: one byte of DRAM can hold no KV block, so the
  # L1/remote tier never admits anything.
  dram_bytes: 1.0
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: min_pd
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_l1_none
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,35 @@
# GLM-5-FP8 on 8 x H20-141G, 0-32768 slice.
# Analysis config: small L1 (~1% of the default DRAM KV budget).
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  # 1% of the 1.5e+12 default used by the baseline config.
  dram_bytes: 1.5e+10
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: min_pd
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g_l1_small
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,39 @@
# GLM-5-FP8 (ZhipuAI/GLM-5-FP8) on 8 x H20-141G (N3E).
# Architecture auto-loaded from the upstream ModelScope config.json.
#
# 8 x 141 GB = 1128 GB total HBM. With ~744 GB FP8 weights resident,
# keep the KV budget conservative to leave room for scales, BF16 holdouts,
# allocator slack, and runtime activations.
model:
  config_json: ../models/GLM-5-FP8/config.json
  name: glm-5-fp8
  compute_dtype: fp8
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xh20-141g
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "300.0e9" as a string.
  hbm_bytes: 300.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 64
meta_store:
  ttl_seconds: 300.0
router:
  mode: min_pd
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_fp8_8xh20_141g
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 0
  input_length_max: 32768

View File

@@ -0,0 +1,36 @@
# GLM-5-NVFP4 on 8 x B200 for the 32k-85k bucket.
# Chosen to keep the best policy's mean TTFT below 5s.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb200
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1150.0e9" as a string.
  hbm_bytes: 1150.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 5
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_8xb200_ablation_32769_87040_n5
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 32769
  input_length_max: 87040

View File

@@ -0,0 +1,36 @@
# GLM-5-NVFP4 (nvidia/GLM-5-NVFP4) on 8 x B200 (192GB each).
# Architecture auto-loaded from HuggingFace config.json.
#
# FP4 weights: ~744B params * 0.5 bytes = ~372 GB across 8 GPUs.
# Total HBM: 8 * 192 GB = 1536 GB. Keep the KV budget below the raw
# remainder to leave room for runtime activations and allocator slack.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb200
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1150.0e9" as a string.
  hbm_bytes: 1150.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 8
meta_store:
  ttl_seconds: 300.0
router:
  mode: prefix_affinity
  prefix_k: 8
  load_alpha: 1.0
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_8xb200
  sample_interval_s: 1.0
  seed: 42

View File

@@ -0,0 +1,37 @@
# GLM-5-NVFP4 on 8 x B300 for the 128k++ bucket.
# A single instance already keeps mean TTFT below 5s, and routing is
# effectively irrelevant at N=1 because every request lands on the same node.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 1
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  precise_probe_latency_us: 50.0
  # Only one instance exists, so probing more than one makes no sense.
  precise_probe_topk: 1
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_8xb300_ablation_131073_plus_n1
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 131073
  # uint32 max as an effectively unbounded upper edge for the "plus" bucket.
  input_length_max: 4294967295

View File

@@ -0,0 +1,36 @@
# GLM-5-NVFP4 on 8 x B300 for the 85k-128k bucket.
# Chosen to keep the best policy's mean TTFT below 5s.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 2
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity_strong_only
  precise_probe_latency_us: 50.0
  precise_probe_topk: 4
  load_alpha: 1.0
  prefix_k: 8
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_8xb300_ablation_87041_131072_n2
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 87041
  input_length_max: 131072

View File

@@ -7,7 +7,8 @@
model:
config_json: ../models/GLM-5-NVFP4/config.json
name: glm-5-nvfp4
compute_dtype: fp4 # FP4 weights → selects FP4 tensor core FLOPS
compute_dtype: fp8 # FP8 tensor-core execution
weight_dtype: fp4 # NVFP4 weights still set the HBM budget
dtype_bytes: 1 # FP8 KV cache
block_size_tokens: 512

View File

@@ -0,0 +1,32 @@
# GLM-5-NVFP4 on 8 x B200 for the 32k-85k bucket.
# NVFP4 weights, FP8 compute. Chosen to keep the best policy below 5 s TTFT.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb200
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1150.0e9" as a string.
  hbm_bytes: 1150.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 9
meta_store:
  ttl_seconds: 300.0
router:
  # NOTE(review): sibling min_pd configs also set precise_probe_latency_us,
  # precise_probe_topk, load_alpha and prefix_k; this one relies on the
  # simulator defaults — confirm those match the siblings' values.
  mode: min_pd
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb200_ablation_32769_87040_n9
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 32769
  input_length_max: 87040

View File

@@ -0,0 +1,32 @@
# GLM-5-NVFP4 on 8 x B200 with FP8 tensor-core compute.
# Weights remain stored in NVFP4, so HBM budget follows FP4 storage.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb200
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1150.0e9" as a string.
  hbm_bytes: 1150.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 8
meta_store:
  ttl_seconds: 300.0
router:
  mode: prefix_affinity
  prefix_k: 8
  load_alpha: 1.0
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb200
  sample_interval_s: 1.0
  seed: 42

View File

@@ -0,0 +1,33 @@
# GLM-5-NVFP4 on 8 x B300 for the 128k++ bucket.
# NVFP4 weights, FP8 compute. Routing is effectively irrelevant at one instance.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 1
meta_store:
  ttl_seconds: 300.0
router:
  mode: cache_affinity
  # Only one instance exists, so probing more than one makes no sense.
  precise_probe_topk: 1
  # NOTE(review): sibling cache_affinity configs also set
  # precise_probe_latency_us, load_alpha and prefix_k; this one relies on
  # the simulator defaults — confirm those match the siblings' values.
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb300_ablation_131073_plus_n1
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 131073
  # uint32 max as an effectively unbounded upper edge for the "plus" bucket.
  input_length_max: 4294967295

View File

@@ -0,0 +1,32 @@
# GLM-5-NVFP4 on 8 x B300 for the 85k-128k bucket.
# NVFP4 weights, FP8 compute. Chosen to keep the best policy below 5 s TTFT.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 4
meta_store:
  ttl_seconds: 300.0
router:
  # NOTE(review): sibling cache_affinity configs also set
  # precise_probe_latency_us, precise_probe_topk, load_alpha and prefix_k;
  # this one relies on the simulator defaults — confirm they match.
  mode: cache_affinity
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb300_ablation_87041_131072_n4
  sample_interval_s: 1.0
  seed: 42
  input_length_min: 87041
  input_length_max: 131072

View File

@@ -0,0 +1,32 @@
# GLM-5-NVFP4 on 8 x B300 with FP8 tensor-core compute.
# Weights remain stored in NVFP4, so HBM budget follows FP4 storage.
model:
  config_json: ../models/GLM-5-NVFP4/config.json
  name: glm-5-nvfp4
  compute_dtype: fp8
  weight_dtype: fp4
  dtype_bytes: 1
  block_size_tokens: 512
hardware:
  type: 8xb300
  # Signed exponents ("e+9") keep these portable: YAML 1.1 resolvers
  # (e.g. PyYAML) parse an unsigned exponent like "1900.0e9" as a string.
  hbm_bytes: 1900.0e+9
  dram_bytes: 1.5e+12
  max_batch_slots: 256
cluster:
  num_instances: 8
meta_store:
  ttl_seconds: 300.0
router:
  mode: prefix_affinity
  prefix_k: 8
  load_alpha: 1.0
sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_nvfp4_fp8compute_8xb300
  sample_interval_s: 1.0
  seed: 42