diff --git a/configs/glm5-nvfp4-8xb300.yaml b/configs/glm5-nvfp4-8xb300.yaml
new file mode 100644
index 0000000..94b646a
--- /dev/null
+++ b/configs/glm5-nvfp4-8xb300.yaml
@@ -0,0 +1,31 @@
+# GLM-5-NVFP4 (nvidia/GLM-5-NVFP4) on 8 x B300 (Blackwell Ultra, 288GB each).
+# Architecture auto-loaded from HuggingFace config.json.
+#
+# FP4 weights: ~744B params * 0.5 bytes = ~372 GB across 8 GPUs.
+# Total HBM: 8 * 288 GB = 2304 GB. KV budget: ~1900 GB after weights.
+
+model:
+  config_json: ../models/GLM-5-NVFP4/config.json
+  name: glm-5-nvfp4
+  dtype_bytes: 1  # FP8 KV cache
+  block_size_tokens: 512
+
+hardware:
+  type: 8xb300
+  hbm_bytes: 1900.0e9  # KV budget after FP4 weights (~372 GB)
+
+cluster:
+  num_instances: 32
+  meta_store:
+    ttl_seconds: 120.0
+  router:
+    mode: prefix_affinity
+    prefix_k: 8
+    load_alpha: 1.0
+
+sim:
+  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
+  max_requests: null
+  output_dir: runs/glm5_nvfp4_8xb300
+  sample_interval_s: 1.0
+  seed: 42
diff --git a/models/GLM-5-NVFP4/config.json b/models/GLM-5-NVFP4/config.json
new file mode 100644
index 0000000..a34ad26
--- /dev/null
+++ b/models/GLM-5-NVFP4/config.json
@@ -0,0 +1,59 @@
+{
+  "architectures": [
+    "GlmMoeDsaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": [
+    154820,
+    154827,
+    154829
+  ],
+  "ep_size": 1,
+  "first_k_dense_replace": 3,
+  "hidden_act": "silu",
+  "head_dim": 64,
+  "hidden_size": 6144,
+  "index_head_dim": 128,
+  "index_n_heads": 32,
+  "index_topk": 2048,
+  "indexer_rope_interleave": true,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "kv_lora_rank": 512,
+  "max_position_embeddings": 202752,
+  "moe_intermediate_size": 2048,
+  "moe_layer_freq": 1,
+  "model_type": "glm_moe_dsa",
+  "n_group": 1,
+  "n_routed_experts": 256,
+  "n_shared_experts": 1,
+  "norm_topk_prob": true,
+  "num_attention_heads": 64,
+  "num_experts_per_tok": 8,
+  "num_hidden_layers": 78,
+  "num_key_value_heads": 64,
+  "num_nextn_predict_layers": 1,
+  "pad_token_id": 154820,
+  "pretraining_tp": 1,
+  "q_lora_rank": 2048,
+  "qk_head_dim": 256,
+  "qk_nope_head_dim": 192,
+  "qk_rope_head_dim": 64,
+  "rms_norm_eps": 1e-05,
+  "rope_interleave": true,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "routed_scaling_factor": 2.5,
+  "scoring_func": "sigmoid",
+  "tie_word_embeddings": false,
+  "topk_group": 1,
+  "topk_method": "noaux_tc",
+  "transformers_version": "5.0.2.dev0",
+  "use_cache": true,
+  "v_head_dim": 256,
+  "vocab_size": 154880
+}
diff --git a/src/hardware_presets.rs b/src/hardware_presets.rs
index b99fd6e..f779c6d 100644
--- a/src/hardware_presets.rs
+++ b/src/hardware_presets.rs
@@ -20,6 +20,7 @@ pub const AVAILABLE: &[&str] = &[
     "a100-80gb",
     "a100-40gb",
     "b200",
+    "b300",
     "2xh100",
     "4xh100",
     "8xh100",
@@ -32,6 +33,9 @@ pub const AVAILABLE: &[&str] = &[
     "2xb200",
     "4xb200",
     "8xb200",
+    "2xb300",
+    "4xb300",
+    "8xb300",
 ];
 
 /// Resolve a hardware preset by name.
@@ -48,6 +52,7 @@ pub fn resolve(name: &str) -> Option<HardwareConfig> {
         "a10080gb" | "a100" => Some(make_config(count, &A100_80GB)),
         "a10040gb" => Some(make_config(count, &A100_40GB)),
         "b200" => Some(make_config(count, &B200)),
+        "b300" => Some(make_config(count, &B300)),
         _ => None,
     }
 }
@@ -121,6 +126,13 @@ const B200: GpuBase = GpuBase {
     pcie_gen: 6,
 };
 
+const B300: GpuBase = GpuBase {
+    flops: 2.25e15,  // 2250 TFLOPS BF16 dense (same as B200)
+    mem_bw: 12.0e12, // 12 TB/s HBM3e 12-Hi (50% more than B200's 8-Hi)
+    hbm: 288.0e9,    // 288 GB HBM3e 12-Hi
+    pcie_gen: 6,
+};
+
 /// Build a [`HardwareConfig`] from a base GPU spec × TP count.
 ///
 /// Compute, HBM bandwidth, and HBM capacity scale linearly with `n`.
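As a sanity check on the numbers in the YAML header, here is a minimal standalone sketch (not part of the patch) of the KV-budget arithmetic. Only the figures come from the diff (~744B params at 0.5 bytes/param for FP4, 8 GPUs × 288 GB); the function and constant names are illustrative and do not exist in the simulator.

```rust
/// Illustrative only: mirrors the comment block in glm5-nvfp4-8xb300.yaml.
/// Assumes HBM capacity scales linearly with GPU count, as the presets in
/// hardware_presets.rs do.
const B300_HBM_PER_GPU: f64 = 288.0e9; // bytes, same value as the B300 preset's `hbm`

/// KV-cache budget = total node HBM minus quantized weight bytes.
fn kv_budget_bytes(num_params: f64, bytes_per_param: f64, num_gpus: f64) -> f64 {
    let weight_bytes = num_params * bytes_per_param; // ~744e9 * 0.5 = ~372 GB
    let total_hbm = num_gpus * B300_HBM_PER_GPU;     // 8 * 288 GB = 2304 GB
    total_hbm - weight_bytes                         // ~1932 GB
}

fn main() {
    let budget = kv_budget_bytes(744.0e9, 0.5, 8.0);
    println!("raw KV budget: {:.0} GB", budget / 1e9); // prints ~1932 GB
    // The config's hbm_bytes is rounded down to 1900.0e9, presumably to leave
    // headroom for activations and runtime overhead.
    assert!(budget >= 1900.0e9);
}
```

Nothing here depends on the simulator's actual types; it only reproduces the back-of-envelope calculation behind `hbm_bytes: 1900.0e9` in the config.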