Files
kvcache-simulator/src/hardware_presets.rs

290 lines
8.9 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Built-in hardware presets for common GPU configurations.
//!
//! Presets provide baseline specs for single GPUs and tensor-parallel (TP)
//! groups. All values can be overridden in the YAML config by specifying
//! explicit fields alongside `type`:
//!
//! ```yaml
//! hardware:
//!   type: 8xb200
//!   hbm_bytes: 500.0e9  # override total HBM with actual KV budget
//! ```
use crate::config::HardwareConfig;
/// Canonical spellings of every advertised preset, for help and error text.
///
/// Single-GPU names come first, followed by the 2x/4x/8x tensor-parallel
/// groups, one GPU family per row. The order is part of the contract because
/// callers may print the list verbatim.
#[rustfmt::skip]
pub const AVAILABLE: &[&str] = &[
    // Single GPUs
    "h100", "h800", "h20", "h20-141g", "a100-80gb", "a100-40gb", "b200", "b300",
    // Tensor-parallel groups
    "2xh100",     "4xh100",     "8xh100",
    "2xh800",     "4xh800",     "8xh800",
    "2xh20",      "4xh20",      "8xh20",
    "2xh20-141g", "4xh20-141g", "8xh20-141g",
    "2xb200",     "4xb200",     "8xb200",
    "2xb300",     "4xb300",     "8xb300",
];
/// Resolve a hardware preset by name.
///
/// Case-insensitive; hyphens, underscores, and spaces are stripped before
/// matching. Accepts `NxGPU` patterns (e.g. `8xb200`).
pub fn resolve(name: &str) -> Option<HardwareConfig> {
let key = normalize(name);
let (count, gpu) = parse_count_gpu(&key);
match gpu.as_str() {
"h100" => Some(make_config(count, &H100)),
"h800" => Some(make_config(count, &H800)),
"h20" => Some(make_config(count, &H20)),
"h20141g" | "h20141gb" => Some(make_config(count, &H20_141G)),
"a10080gb" | "a100" => Some(make_config(count, &A100_80GB)),
"a10040gb" => Some(make_config(count, &A100_40GB)),
"b200" => Some(make_config(count, &B200)),
"b300" => Some(make_config(count, &B300)),
_ => None,
}
}
// ---------------------------------------------------------------------------
// Internals
// ---------------------------------------------------------------------------
/// Canonicalize a preset name for lookup: drop every `-`, `_`, and space and
/// ASCII-lowercase what remains, so `"A100_80GB"` and `"a100-80gb"` produce
/// the same key.
fn normalize(s: &str) -> String {
    s.chars()
        .filter(|&ch| !matches!(ch, '-' | '_' | ' '))
        .map(|ch| ch.to_ascii_lowercase())
        .collect()
}
/// Split an optional `N x` GPU-count prefix off an already-normalized key.
///
/// `"8xh100"` → `(8, "h100")`, `"h100"` → `(1, "h100")`.
///
/// The text before the first `x` is treated as a count only when it parses as
/// a positive `u32`. Otherwise the whole input is returned as the GPU name with
/// a count of 1, so names that merely contain an `x` (e.g. `"rtx4090"`) pass
/// through untouched.
///
/// A count of zero is deliberately rejected: `"0xh100"` is returned as the
/// (unknown) GPU name `"0xh100"` rather than as zero H100s, which would
/// otherwise yield a preset with `tp_degree == 0` and all-zero compute and
/// memory figures.
fn parse_count_gpu(s: &str) -> (u32, String) {
    if let Some(pos) = s.find('x') {
        if let Ok(n) = s[..pos].parse::<u32>() {
            if n > 0 {
                return (n, s[pos + 1..].to_string());
            }
        }
    }
    (1, s.to_string())
}
// -- Per-GPU base specs (single die, BF16 dense) -----------------------------
/// Baseline specification of one GPU, before scaling to a TP group.
///
/// `make_config` multiplies the throughput and capacity fields by the GPU
/// count and uses `pcie_gen` to choose the PCIe, RDMA, and intra-node link
/// defaults.
struct GpuBase {
    /// BF16 dense FLOPS.
    flops: f64,
    /// FP8 dense FLOPS (0 = not supported).
    fp8_flops: f64,
    /// FP4 dense FLOPS (0 = not supported).
    fp4_flops: f64,
    /// HBM bandwidth (B/s).
    mem_bw: f64,
    /// Total HBM (bytes).
    hbm: f64,
    /// PCIe generation (4/5/6).
    pcie_gen: u32,
}
/// H100 per-GPU baseline; `make_config` scales it by the GPU count.
const H100: GpuBase = GpuBase {
    flops: 9.89e14,      // 989 TFLOPS BF16 dense
    fp8_flops: 1.979e15, // 1979 TFLOPS FP8 dense
    fp4_flops: 0.0,      // not supported
    mem_bw: 3.35e12,     // 3.35 TB/s HBM3
    hbm: 80.0e9,         // 80 GB
    pcie_gen: 5,         // selects the Gen5 link defaults in `make_config`
};
/// H800 per-GPU baseline; every field here is identical to `H100`.
///
/// NOTE(review): `make_config` derives intra-node TP bandwidth from `pcie_gen`
/// alone, so H800 groups get the same per-GPU interconnect figure as H100 —
/// confirm that is intended for this part.
const H800: GpuBase = GpuBase {
    flops: 9.89e14,      // same die as H100
    fp8_flops: 1.979e15, // same FP8 throughput as H100
    fp4_flops: 0.0,      // not supported
    mem_bw: 3.35e12,     // 3.35 TB/s HBM3
    hbm: 80.0e9,         // 80 GB
    pcie_gen: 5,
};
/// H20 (96 GB) per-GPU baseline: much lower compute than `H100`, but higher
/// HBM bandwidth and capacity.
const H20: GpuBase = GpuBase {
    flops: 1.48e14,     // 148 TFLOPS BF16 (China-export Hopper)
    fp8_flops: 2.96e14, // 296 TFLOPS FP8
    fp4_flops: 0.0,     // not supported
    mem_bw: 4.0e12,     // 4.0 TB/s HBM3
    hbm: 96.0e9,        // 96 GB
    pcie_gen: 5,
};
/// H20 variant with 141 GB of HBM: compute figures are copied from `H20`,
/// while memory capacity and bandwidth are higher.
const H20_141G: GpuBase = GpuBase {
    flops: 1.48e14,     // modeled as the same H20 compute envelope
    fp8_flops: 2.96e14, // modeled as the same H20 FP8 throughput
    fp4_flops: 0.0,     // not supported
    mem_bw: 4.8e12,     // 4.8 TB/s (vs. 4.0 TB/s on the 96 GB H20)
    hbm: 141.0e9,       // 141 GB
    pcie_gen: 5,
};
/// A100 80 GB per-GPU baseline; also what a bare `"a100"` resolves to.
const A100_80GB: GpuBase = GpuBase {
    flops: 3.12e14, // 312 TFLOPS BF16
    fp8_flops: 0.0, // A100 has no FP8 tensor cores
    fp4_flops: 0.0, // not supported
    mem_bw: 2.0e12, // 2.0 TB/s HBM2e
    hbm: 80.0e9,    // 80 GB
    pcie_gen: 4,    // falls into the Gen4 defaults in `make_config`
};
/// A100 40 GB per-GPU baseline: same compute as `A100_80GB`, with half the
/// HBM capacity and lower HBM bandwidth.
const A100_40GB: GpuBase = GpuBase {
    flops: 3.12e14,   // 312 TFLOPS BF16
    fp8_flops: 0.0,   // not supported
    fp4_flops: 0.0,   // not supported
    mem_bw: 1.555e12, // 1.555 TB/s HBM2e
    hbm: 40.0e9,      // 40 GB
    pcie_gen: 4,
};
// DGX B200 (8 GPU) specs: BF16 18 PFLOPS, FP8 36 PFLOPS, FP4 72 PFLOPS (dense)
/// B200 per-GPU baseline: the 8-GPU system figures above divided by eight.
const B200: GpuBase = GpuBase {
    flops: 2.25e15,    // 2250 TFLOPS BF16 dense
    fp8_flops: 4.5e15, // 4500 TFLOPS FP8 dense
    fp4_flops: 9.0e15, // 9000 TFLOPS FP4 dense
    mem_bw: 8.0e12,    // 8.0 TB/s HBM3e
    hbm: 192.0e9,      // 192 GB
    pcie_gen: 6,       // selects the Gen6 link defaults in `make_config`
};
// DGX/HGX B300 (8 GPU) specs: BF16 18 PFLOPS, FP8 36 PFLOPS, FP4 108 PFLOPS (dense)
/// B300 (Blackwell Ultra) per-GPU baseline.
///
/// Relative to `B200`, only dense FP4 throughput (1.5x) and HBM capacity
/// (288 GB) increase. BF16 and FP8 dense throughput and HBM bandwidth are
/// unchanged, so they are kept equal to the B200 values rather than being
/// extrapolated from the FP4 figure.
const B300: GpuBase = GpuBase {
    flops: 2.25e15,     // 2250 TFLOPS BF16 dense (same as B200)
    fp8_flops: 4.5e15,  // 4500 TFLOPS FP8 dense (same as B200; only FP4 was boosted)
    fp4_flops: 13.5e15, // 13500 TFLOPS FP4 dense (Blackwell Ultra enhanced)
    mem_bw: 8.0e12,     // 8 TB/s HBM3e (12-Hi stacks add capacity, not bandwidth)
    hbm: 288.0e9,       // 288 GB HBM3e 12-Hi
    pcie_gen: 6,
};
/// Expand a per-GPU [`GpuBase`] into the [`HardwareConfig`] for an `n`-GPU
/// tensor-parallel group.
///
/// * Compute (BF16/FP8/FP4), HBM bandwidth, HBM capacity, PCIe bandwidth, and
///   intra-node TP bandwidth are the per-GPU figure multiplied by `n`.
/// * PCIe and RDMA per-link defaults are chosen from `base.pcie_gen`.
/// * RDMA bandwidth is doubled (two NICs) only for groups of 8 or more GPUs;
///   smaller groups get a single NIC.
/// * Host DRAM size and bandwidth, and the prefill chunk size, are coarse
///   defaults keyed on `n`.
fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {
    let gpus = f64::from(n);
    let generation = base.pcie_gen;
    let full_node = n >= 8;

    // Per-GPU x16 link bandwidth and latency for the PCIe generation.
    let (pcie_link_bw, pcie_latency_us) = if generation == 6 {
        (128.0e9, 4.0)
    } else if generation == 5 {
        (64.0e9, 5.0)
    } else {
        // Anything else is modeled as Gen4.
        (32.0e9, 5.0)
    };

    // One NIC's bandwidth and latency: 400 Gbps on Gen6 hosts, 200 Gbps otherwise.
    let (nic_bw, rdma_latency_us) = if generation == 6 {
        (50.0e9, 6.0)
    } else {
        (25.0e9, 8.0)
    };
    let nic_count = if full_node { 2.0 } else { 1.0 };

    // Host DRAM: rough defaults by deployment size.
    let dram_bytes = if n == 1 {
        512.0e9
    } else if (2..=4).contains(&n) {
        1.0e12
    } else {
        1.5e12
    };

    // The intra-node GPU link is keyed on `>= 6`, unlike the exact matches above.
    let (tp_link_bw, intra_node_tp_latency_us) = if generation >= 6 {
        (1.8e12, 1.0)
    } else {
        (9.0e11, 2.0)
    };

    HardwareConfig {
        gpu_flops: base.flops * gpus,
        gpu_fp8_flops: base.fp8_flops * gpus,
        gpu_fp4_flops: base.fp4_flops * gpus,
        gpu_mem_bw: base.mem_bw * gpus,
        hbm_bytes: base.hbm * gpus,
        dram_bytes,
        host_dram_bw: if full_node { 9.0e11 } else { 5.0e11 },
        pcie_bw: pcie_link_bw * gpus,
        pcie_latency_us,
        rdma_bw: nic_bw * nic_count,
        rdma_latency_us,
        intra_node_tp_bw: tp_link_bw * gpus,
        intra_node_tp_latency_us,
        tp_degree: n,
        max_batch_slots: 256,
        prefill_chunk_tokens: if n >= 4 { 4096 } else { 2048 },
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare GPU name resolves to the unscaled per-GPU figures.
    #[test]
    fn resolve_single_gpu() {
        let hw = resolve("h100").unwrap();
        assert!((hw.gpu_flops - 9.89e14).abs() < 1e10);
        assert!((hw.hbm_bytes - 80e9).abs() < 1e6);
        assert_eq!(hw.prefill_chunk_tokens, 2048);
    }

    /// An `NxGPU` name scales compute, HBM, and PCIe bandwidth by `N`.
    #[test]
    fn resolve_tp_group() {
        let hw = resolve("8xb200").unwrap();
        assert!((hw.gpu_flops - 2.25e15 * 8.0).abs() < 1e11);
        assert!((hw.hbm_bytes - 192e9 * 8.0).abs() < 1e6);
        assert!((hw.pcie_bw - 128e9 * 8.0).abs() < 1e6);
        assert_eq!(hw.prefill_chunk_tokens, 4096);
    }

    /// Case and `-`/`_` separators do not affect lookup.
    #[test]
    fn resolve_case_and_separator_insensitive() {
        assert!(resolve("H100").is_some());
        assert!(resolve("8xB200").is_some());
        assert!(resolve("8x-B200").is_some());
        assert!(resolve("8xH20-141G").is_some());
        assert!(resolve("a100-80gb").is_some());
        assert!(resolve("A100_80GB").is_some());
        assert!(resolve("a100_80gb").is_some());
    }

    /// Unrecognized or empty names yield `None` rather than a default.
    #[test]
    fn resolve_unknown_returns_none() {
        assert!(resolve("v100").is_none());
        assert!(resolve("tpu-v5").is_none());
        assert!(resolve("").is_none());
    }

    /// Guards against `AVAILABLE` and `resolve` drifting apart: every name
    /// advertised in help/error messages must actually resolve.
    #[test]
    fn every_available_preset_resolves() {
        for name in AVAILABLE {
            assert!(
                resolve(name).is_some(),
                "preset `{}` is listed in AVAILABLE but does not resolve",
                name
            );
        }
    }

    /// The two A100 variants differ in HBM capacity and bandwidth.
    #[test]
    fn a100_variants() {
        let a80 = resolve("a100-80gb").unwrap();
        let a40 = resolve("a100-40gb").unwrap();
        assert!((a80.hbm_bytes - 80e9).abs() < 1e6);
        assert!((a40.hbm_bytes - 40e9).abs() < 1e6);
        assert!(a80.gpu_mem_bw > a40.gpu_mem_bw);
    }

    /// Compute, HBM bandwidth, and HBM capacity scale linearly with GPU count.
    #[test]
    fn scaling_is_linear() {
        let s1 = resolve("h100").unwrap();
        let s4 = resolve("4xh100").unwrap();
        let s8 = resolve("8xh100").unwrap();
        assert!((s4.gpu_flops - s1.gpu_flops * 4.0).abs() < 1.0);
        assert!((s8.gpu_flops - s1.gpu_flops * 8.0).abs() < 1.0);
        assert!((s4.gpu_mem_bw - s1.gpu_mem_bw * 4.0).abs() < 1.0);
        assert!((s8.hbm_bytes - s1.hbm_bytes * 8.0).abs() < 1.0);
    }

    /// The 141 GB H20 keeps the H20 compute figures but has more, faster HBM.
    #[test]
    fn h20_141g_variant_has_larger_hbm() {
        let h20 = resolve("8xh20").unwrap();
        let h20_141g = resolve("8xh20-141g").unwrap();
        assert!((h20_141g.gpu_flops - h20.gpu_flops).abs() < 1.0);
        assert!(h20_141g.hbm_bytes > h20.hbm_bytes);
        assert!(h20_141g.gpu_mem_bw > h20.gpu_mem_bw);
    }
}