Files
kvcache-simulator/src/hardware_presets.rs
Gahow Wang ec73a95e05 KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies
in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache
hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention,
architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide
meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded,
least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score,
estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing,
built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation
tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 01:16:02 +08:00

226 lines
6.5 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Built-in hardware presets for common GPU configurations.
//!
//! Presets provide baseline specs for single GPUs and tensor-parallel (TP)
//! groups. All values can be overridden in the YAML config by specifying
//! explicit fields alongside `type`:
//!
//! ```yaml
//! hardware:
//!   type: 8xb200
//!   hbm_bytes: 500.0e9   # override total HBM with the actual KV budget
//! ```
use crate::config::HardwareConfig;
/// All recognized preset names (for help/error messages).
///
/// Lists the common spellings shown to users: bare single-GPU names plus
/// the `NxGPU` tensor-parallel variants. NOTE(review): `resolve` also
/// accepts the bare alias `"a100"` (mapped to the 80 GB variant) and
/// `NxGPU` forms for the A100s; those are intentionally(?) omitted here —
/// confirm whether help output should advertise them.
pub const AVAILABLE: &[&str] = &[
    // Single GPUs
    "h100",
    "h800",
    "h20",
    "a100-80gb",
    "a100-40gb",
    "b200",
    // Tensor-parallel groups (count × GPU)
    "2xh100",
    "4xh100",
    "8xh100",
    "2xh800",
    "4xh800",
    "8xh800",
    "2xh20",
    "4xh20",
    "8xh20",
    "2xb200",
    "4xb200",
    "8xb200",
];
/// Resolve a hardware preset by name.
///
/// Case-insensitive; hyphens, underscores, and spaces are stripped before
/// matching. Accepts `NxGPU` patterns (e.g. `8xb200`). Returns `None` for
/// unrecognized names.
pub fn resolve(name: &str) -> Option<HardwareConfig> {
    let key = normalize(name);
    let (count, gpu) = parse_count_gpu(&key);
    // Map the normalized GPU token to its base spec; unknown tokens bail out.
    let base = match gpu.as_str() {
        "h100" => &H100,
        "h800" => &H800,
        "h20" => &H20,
        // Bare "a100" defaults to the 80 GB variant.
        "a10080gb" | "a100" => &A100_80GB,
        "a10040gb" => &A100_40GB,
        "b200" => &B200,
        _ => return None,
    };
    Some(make_config(count, base))
}
// ---------------------------------------------------------------------------
// Internals
// ---------------------------------------------------------------------------
/// Canonicalize a preset name: lower-case ASCII and drop the separator
/// characters (`-`, `_`, space) so `"A100_80GB"`, `"a100-80gb"`, and
/// `"a100 80gb"` all produce the same lookup key.
fn normalize(s: &str) -> String {
    s.chars()
        .filter(|c| !matches!(c, '-' | '_' | ' '))
        .map(|c| c.to_ascii_lowercase())
        .collect()
}
/// Parse `"8xh100"` → `(8, "h100")`, `"h100"` → `(1, "h100")`.
///
/// The multiplier prefix must be a *positive* integer: a zero count
/// (e.g. `"0xh100"`) would otherwise yield a preset with zero compute
/// and zero memory, so such strings fall through and are kept as a
/// plain (unknown) GPU name, which `resolve` then rejects.
fn parse_count_gpu(s: &str) -> (u32, String) {
    if let Some(pos) = s.find('x') {
        // Only a parseable, non-zero prefix counts as a TP multiplier;
        // anything else (empty, non-numeric, overflow, zero) is treated
        // as part of the name itself.
        if let Ok(n) = s[..pos].parse::<u32>() {
            if n >= 1 {
                return (n, s[pos + 1..].to_string());
            }
        }
    }
    (1, s.to_string())
}
// -- Per-GPU base specs (single die, BF16 dense) -----------------------------
/// Base hardware specification for a single GPU die (BF16 dense figures).
/// Scaled by the tensor-parallel count in `make_config`.
struct GpuBase {
    flops: f64,    // BF16 dense compute throughput (FLOP/s, not TFLOPS)
    mem_bw: f64,   // HBM bandwidth (B/s)
    hbm: f64,      // Total HBM capacity (bytes)
    pcie_gen: u32, // PCIe generation (4/5/6); selects link + NIC speeds
}
/// NVIDIA H100 (80 GB HBM3).
/// NOTE(review): 989 TFLOPS BF16 is the *with-sparsity* datasheet figure;
/// dense BF16 is ~495 TFLOPS — confirm which one the roofline model expects.
const H100: GpuBase = GpuBase {
    flops: 9.89e14,  // 989 TFLOPS BF16
    mem_bw: 3.35e12, // 3.35 TB/s HBM3
    hbm: 80.0e9,     // 80 GB
    pcie_gen: 5,
};
/// NVIDIA H800 (China-market H100: same die/compute, reduced interconnect).
const H800: GpuBase = GpuBase {
    flops: 9.89e14,  // same die as H100
    mem_bw: 3.35e12, // 3.35 TB/s HBM3
    hbm: 80.0e9,     // 80 GB
    pcie_gen: 5,
};
/// NVIDIA H20 (China-export Hopper: low compute, high memory bandwidth).
const H20: GpuBase = GpuBase {
    flops: 1.48e14,  // 148 TFLOPS BF16 (China-export Hopper)
    mem_bw: 4.0e12,  // 4.0 TB/s HBM3
    hbm: 96.0e9,     // 96 GB
    pcie_gen: 5,
};
/// NVIDIA A100 80 GB (HBM2e variant).
const A100_80GB: GpuBase = GpuBase {
    flops: 3.12e14,   // 312 TFLOPS BF16
    mem_bw: 2.0e12,   // 2.0 TB/s HBM2e
    hbm: 80.0e9,      // 80 GB
    pcie_gen: 4,
};
/// NVIDIA A100 40 GB (original HBM2e variant; slower memory).
const A100_40GB: GpuBase = GpuBase {
    flops: 3.12e14,   // 312 TFLOPS BF16
    mem_bw: 1.555e12, // 1.555 TB/s HBM2e
    hbm: 40.0e9,      // 40 GB
    pcie_gen: 4,
};
/// NVIDIA B200 (Blackwell, 192 GB HBM3e).
const B200: GpuBase = GpuBase {
    flops: 2.25e15,  // 2250 TFLOPS BF16
    mem_bw: 8.0e12,  // 8.0 TB/s HBM3e
    hbm: 192.0e9,    // 192 GB
    pcie_gen: 6,
};
/// Build a [`HardwareConfig`] from a base GPU spec × TP count.
///
/// Compute, HBM bandwidth, and HBM capacity scale linearly with `n`.
/// PCIe bandwidth scales linearly (one link per GPU). RDMA bandwidth
/// assumes one NIC for ≤4 GPUs and two NICs for ≥8. Server DRAM is a
/// reasonable default based on typical deployment sizes.
fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {
    let scale = f64::from(n);

    // Per-link PCIe throughput (B/s) and latency (µs), keyed on generation.
    // Unknown generations deliberately fall back to the Gen4 figures.
    let (pcie_link_bw, pcie_latency) = if base.pcie_gen == 6 {
        (128.0e9, 4.0) // Gen6 x16
    } else if base.pcie_gen == 5 {
        (64.0e9, 5.0) // Gen5 x16
    } else {
        (32.0e9, 5.0) // Gen4 x16
    };

    // NIC speed tracks the PCIe generation; 8-GPU servers get a second NIC.
    let (nic_bw, nic_latency) = if base.pcie_gen == 6 {
        (50.0e9, 6.0) // 400 Gbps NIC
    } else {
        (25.0e9, 8.0) // 200 Gbps NIC
    };
    let nic_count = if n >= 8 { 2.0 } else { 1.0 };

    // Host DRAM defaults sized to typical servers for each TP width.
    let dram_bytes = match n {
        1 => 512.0e9,
        2..=4 => 1.0e12,
        _ => 1.5e12,
    };

    HardwareConfig {
        gpu_flops: base.flops * scale,
        gpu_mem_bw: base.mem_bw * scale,
        hbm_bytes: base.hbm * scale,
        dram_bytes,
        pcie_bw: pcie_link_bw * scale,
        pcie_latency_us: pcie_latency,
        rdma_bw: nic_bw * nic_count,
        rdma_latency_us: nic_latency,
        max_batch_slots: 256,
        prefill_chunk_tokens: if n >= 4 { 4096 } else { 2048 },
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `a` and `b` differ by less than `tol` in absolute value.
    fn close(a: f64, b: f64, tol: f64) -> bool {
        (a - b).abs() < tol
    }

    #[test]
    fn resolve_single_gpu() {
        let hw = resolve("h100").unwrap();
        assert!(close(hw.gpu_flops, 9.89e14, 1e10));
        assert!(close(hw.hbm_bytes, 80e9, 1e6));
        assert_eq!(hw.prefill_chunk_tokens, 2048);
    }

    #[test]
    fn resolve_tp_group() {
        let hw = resolve("8xb200").unwrap();
        assert!(close(hw.gpu_flops, 2.25e15 * 8.0, 1e11));
        assert!(close(hw.hbm_bytes, 192e9 * 8.0, 1e6));
        assert!(close(hw.pcie_bw, 128e9 * 8.0, 1e6));
        assert_eq!(hw.prefill_chunk_tokens, 4096);
    }

    #[test]
    fn resolve_case_and_separator_insensitive() {
        // Every spelling below must normalize to a recognized preset.
        for name in ["H100", "8xB200", "8x-B200", "a100-80gb", "A100_80GB", "a100_80gb"] {
            assert!(resolve(name).is_some());
        }
    }

    #[test]
    fn resolve_unknown_returns_none() {
        for name in ["v100", "tpu-v5", ""] {
            assert!(resolve(name).is_none());
        }
    }

    #[test]
    fn a100_variants() {
        let a80 = resolve("a100-80gb").unwrap();
        let a40 = resolve("a100-40gb").unwrap();
        assert!(close(a80.hbm_bytes, 80e9, 1e6));
        assert!(close(a40.hbm_bytes, 40e9, 1e6));
        // The 80 GB part also carries faster HBM2e.
        assert!(a80.gpu_mem_bw > a40.gpu_mem_bw);
    }

    #[test]
    fn scaling_is_linear() {
        let s1 = resolve("h100").unwrap();
        let s4 = resolve("4xh100").unwrap();
        let s8 = resolve("8xh100").unwrap();
        assert!(close(s4.gpu_flops, s1.gpu_flops * 4.0, 1.0));
        assert!(close(s8.gpu_flops, s1.gpu_flops * 8.0, 1.0));
        assert!(close(s4.gpu_mem_bw, s1.gpu_mem_bw * 4.0, 1.0));
        assert!(close(s8.hbm_bytes, s1.hbm_bytes * 8.0, 1.0));
    }
}