KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
225
src/hardware_presets.rs
Normal file
225
src/hardware_presets.rs
Normal file
@@ -0,0 +1,225 @@
|
||||
//! Built-in hardware presets for common GPU configurations.
//!
//! Presets provide baseline specs for single GPUs and tensor-parallel (TP)
//! groups. All values can be overridden in the YAML config by specifying
//! explicit fields alongside `type`:
//!
//! ```yaml
//! hardware:
//!   type: 8xb200
//!   hbm_bytes: 500.0e9   # override total HBM with actual KV budget
//! ```
|
||||
|
||||
use crate::config::HardwareConfig;
|
||||
|
||||
/// Canonical preset names, intended for help and error messages.
///
/// Single-GPU entries come first, followed by the 2x/4x/8x tensor-parallel
/// groups of the H100, H800, H20 and B200 families.
#[rustfmt::skip]
pub const AVAILABLE: &[&str] = &[
    // Single GPUs
    "h100", "h800", "h20", "a100-80gb", "a100-40gb", "b200",
    // H100 TP groups
    "2xh100", "4xh100", "8xh100",
    // H800 TP groups
    "2xh800", "4xh800", "8xh800",
    // H20 TP groups
    "2xh20", "4xh20", "8xh20",
    // B200 TP groups
    "2xb200", "4xb200", "8xb200",
];
|
||||
|
||||
/// Resolve a hardware preset by name.
|
||||
///
|
||||
/// Case-insensitive; hyphens, underscores, and spaces are stripped before
|
||||
/// matching. Accepts `NxGPU` patterns (e.g. `8xb200`).
|
||||
pub fn resolve(name: &str) -> Option<HardwareConfig> {
|
||||
let key = normalize(name);
|
||||
let (count, gpu) = parse_count_gpu(&key);
|
||||
match gpu.as_str() {
|
||||
"h100" => Some(make_config(count, &H100)),
|
||||
"h800" => Some(make_config(count, &H800)),
|
||||
"h20" => Some(make_config(count, &H20)),
|
||||
"a10080gb" | "a100" => Some(make_config(count, &A100_80GB)),
|
||||
"a10040gb" => Some(make_config(count, &A100_40GB)),
|
||||
"b200" => Some(make_config(count, &B200)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internals
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Canonicalize a preset name: drop `-`, `_` and space separators and
/// ASCII-lowercase whatever remains.
fn normalize(s: &str) -> String {
    s.chars()
        .filter(|ch| !matches!(ch, '-' | '_' | ' '))
        .map(|ch| ch.to_ascii_lowercase())
        .collect()
}
|
||||
|
||||
/// Split an optional `Nx` count prefix off a normalized preset name.
///
/// `"8xh100"` → `(8, "h100")`, `"h100"` → `(1, "h100")`.
///
/// The prefix is honored only when the text before the first `x` parses as
/// a non-zero `u32`. Any other input — no `x`, a non-numeric prefix, or a
/// zero count such as `"0xh100"` — is returned whole with a count of 1.
/// Rejecting zero here means such a name is reported as unrecognized by the
/// preset lookup instead of producing a configuration with no compute,
/// memory, or bandwidth.
fn parse_count_gpu(s: &str) -> (u32, String) {
    if let Some((prefix, gpu)) = s.split_once('x') {
        if let Ok(n) = prefix.parse::<u32>() {
            // A group of zero GPUs is meaningless; treat it as "no prefix".
            if n > 0 {
                return (n, gpu.to_string());
            }
        }
    }
    (1, s.to_string())
}
|
||||
|
||||
// -- Per-GPU base specs (single die, BF16 dense) -----------------------------
|
||||
|
||||
/// Baseline figures for one GPU; `make_config` scales them by the TP count.
struct GpuBase {
    /// Peak BF16 dense throughput in FLOP/s (e.g. `9.89e14` for 989 TFLOPS).
    flops: f64,
    /// HBM bandwidth in bytes per second.
    mem_bw: f64,
    /// Total HBM capacity in bytes.
    hbm: f64,
    /// PCIe generation (4/5/6).
    pcie_gen: u32,
}
|
||||
|
||||
/// H100: 989 TFLOPS BF16, 3.35 TB/s HBM3, 80 GB, PCIe Gen5.
const H100: GpuBase = GpuBase {
    flops: 989.0e12,
    mem_bw: 3.35e12,
    hbm: 80.0e9,
    pcie_gen: 5,
};
|
||||
|
||||
/// H800: same die as H100, so the same figures — 989 TFLOPS BF16,
/// 3.35 TB/s HBM3, 80 GB, PCIe Gen5.
const H800: GpuBase = GpuBase {
    flops: 989.0e12,
    mem_bw: 3.35e12,
    hbm: 80.0e9,
    pcie_gen: 5,
};
|
||||
|
||||
/// H20 (China-export Hopper): 148 TFLOPS BF16, 4.0 TB/s HBM3, 96 GB,
/// PCIe Gen5.
const H20: GpuBase = GpuBase {
    flops: 148.0e12,
    mem_bw: 4.0e12,
    hbm: 96.0e9,
    pcie_gen: 5,
};
|
||||
|
||||
/// A100 80 GB: 312 TFLOPS BF16, 2.0 TB/s HBM2e, 80 GB, PCIe Gen4.
const A100_80GB: GpuBase = GpuBase {
    flops: 312.0e12,
    mem_bw: 2.0e12,
    hbm: 80.0e9,
    pcie_gen: 4,
};
|
||||
|
||||
/// A100 40 GB: 312 TFLOPS BF16, 1.555 TB/s HBM2e, 40 GB, PCIe Gen4.
const A100_40GB: GpuBase = GpuBase {
    flops: 312.0e12,
    mem_bw: 1.555e12,
    hbm: 40.0e9,
    pcie_gen: 4,
};
|
||||
|
||||
/// B200: 2250 TFLOPS BF16, 8.0 TB/s HBM3e, 192 GB, PCIe Gen6.
const B200: GpuBase = GpuBase {
    flops: 2250.0e12,
    mem_bw: 8.0e12,
    hbm: 192.0e9,
    pcie_gen: 6,
};
|
||||
|
||||
/// Build a [`HardwareConfig`] from a base GPU spec × TP count.
|
||||
///
|
||||
/// Compute, HBM bandwidth, and HBM capacity scale linearly with `n`.
|
||||
/// PCIe bandwidth scales linearly (one link per GPU). RDMA bandwidth
|
||||
/// assumes one NIC for ≤4 GPUs and two NICs for ≥8. Server DRAM is a
|
||||
/// reasonable default based on typical deployment sizes.
|
||||
fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {
|
||||
let f = n as f64;
|
||||
|
||||
// PCIe per-GPU bandwidth and latency by generation
|
||||
let (pcie_per_gpu, pcie_lat) = match base.pcie_gen {
|
||||
6 => (128.0e9, 4.0), // Gen6 x16
|
||||
5 => (64.0e9, 5.0), // Gen5 x16
|
||||
_ => (32.0e9, 5.0), // Gen4 x16
|
||||
};
|
||||
|
||||
// RDMA: base NIC speed by PCIe gen, scaled for multi-NIC servers
|
||||
let (rdma_base, rdma_lat) = match base.pcie_gen {
|
||||
6 => (50.0e9, 6.0), // 400 Gbps NIC
|
||||
_ => (25.0e9, 8.0), // 200 Gbps NIC
|
||||
};
|
||||
let rdma_scale = if n >= 8 { 2.0 } else { 1.0 };
|
||||
|
||||
// Server DRAM: rough defaults by deployment size
|
||||
let dram = match n {
|
||||
1 => 512.0e9,
|
||||
2..=4 => 1.0e12,
|
||||
_ => 1.5e12,
|
||||
};
|
||||
|
||||
HardwareConfig {
|
||||
gpu_flops: base.flops * f,
|
||||
gpu_mem_bw: base.mem_bw * f,
|
||||
hbm_bytes: base.hbm * f,
|
||||
dram_bytes: dram,
|
||||
pcie_bw: pcie_per_gpu * f,
|
||||
pcie_latency_us: pcie_lat,
|
||||
rdma_bw: rdma_base * rdma_scale,
|
||||
rdma_latency_us: rdma_lat,
|
||||
max_batch_slots: 256,
|
||||
prefill_chunk_tokens: if n >= 4 { 4096 } else { 2048 },
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `actual` lies strictly within `tol` of `expected`.
    fn near(actual: f64, expected: f64, tol: f64) -> bool {
        (actual - expected).abs() < tol
    }

    /// A bare GPU name yields the unscaled base spec.
    #[test]
    fn resolve_single_gpu() {
        let cfg = resolve("h100").unwrap();
        assert!(near(cfg.gpu_flops, 9.89e14, 1e10));
        assert!(near(cfg.hbm_bytes, 80e9, 1e6));
        assert_eq!(cfg.prefill_chunk_tokens, 2048);
    }

    /// An `8x` prefix scales compute, HBM and PCIe by eight.
    #[test]
    fn resolve_tp_group() {
        let cfg = resolve("8xb200").unwrap();
        assert!(near(cfg.gpu_flops, 2.25e15 * 8.0, 1e11));
        assert!(near(cfg.hbm_bytes, 192e9 * 8.0, 1e6));
        assert!(near(cfg.pcie_bw, 128e9 * 8.0, 1e6));
        assert_eq!(cfg.prefill_chunk_tokens, 4096);
    }

    /// Upper case and `-` / `_` separators do not affect the lookup.
    #[test]
    fn resolve_case_and_separator_insensitive() {
        let spellings = [
            "H100",
            "8xB200",
            "8x-B200",
            "a100-80gb",
            "A100_80GB",
            "a100_80gb",
        ];
        for &spelling in spellings.iter() {
            assert!(resolve(spelling).is_some());
        }
    }

    /// Names outside the preset table, including the empty string, give `None`.
    #[test]
    fn resolve_unknown_returns_none() {
        for &unknown in ["v100", "tpu-v5", ""].iter() {
            assert!(resolve(unknown).is_none());
        }
    }

    /// The two A100 variants differ in capacity and bandwidth.
    #[test]
    fn a100_variants() {
        let big = resolve("a100-80gb").unwrap();
        let small = resolve("a100-40gb").unwrap();
        assert!(near(big.hbm_bytes, 80e9, 1e6));
        assert!(near(small.hbm_bytes, 40e9, 1e6));
        assert!(big.gpu_mem_bw > small.gpu_mem_bw);
    }

    /// Compute, bandwidth and capacity grow linearly with the GPU count.
    #[test]
    fn scaling_is_linear() {
        let one = resolve("h100").unwrap();
        let four = resolve("4xh100").unwrap();
        let eight = resolve("8xh100").unwrap();
        assert!(near(four.gpu_flops, one.gpu_flops * 4.0, 1.0));
        assert!(near(eight.gpu_flops, one.gpu_flops * 8.0, 1.0));
        assert!(near(four.gpu_mem_bw, one.gpu_mem_bw * 4.0, 1.0));
        assert!(near(eight.hbm_bytes, one.hbm_bytes * 8.0, 1.0));
    }
}
|
||||
Reference in New Issue
Block a user