kvcache-simulator/src/hardware_presets.rs

//! Built-in hardware presets for common GPU configurations.
//!
//! Presets provide baseline specs for single GPUs and tensor-parallel (TP)
//! groups.  All values can be overridden in the YAML config by specifying
//! explicit fields alongside `type`:
//!
//! ```yaml
//! hardware:
//!   type: 8xb200
//!   hbm_bytes: 500.0e9   # override total HBM with actual KV budget
//! ```

use crate::config::HardwareConfig;

/// Canonical preset names, intended for help and error messages.
///
/// This list is not exhaustive with respect to what [`resolve`] accepts:
/// `resolve` ignores case and `-`/`_`/space separators, knows a few aliases
/// (e.g. `a100`), and parses any `NxGPU` count prefix, so a name that is not
/// listed here (e.g. `2xa100-80gb`) can still resolve.
pub const AVAILABLE: &[&str] = &[
    // Single-GPU presets.
    "h100",
    "h800",
    "h20",
    "h20-141g",
    "a100-80gb",
    "a100-40gb",
    "b200",
    "b300",
    // Tensor-parallel groups of 2, 4 and 8 GPUs.
    "2xh100",
    "4xh100",
    "8xh100",
    "2xh800",
    "4xh800",
    "8xh800",
    "2xh20",
    "4xh20",
    "8xh20",
    "2xh20-141g",
    "4xh20-141g",
    "8xh20-141g",
    "2xb200",
    "4xb200",
    "8xb200",
    "2xb300",
    "4xb300",
    "8xb300",
];

/// Resolve a hardware preset by name.
///
/// Case-insensitive; hyphens, underscores, and spaces are stripped before
/// matching.  Accepts `NxGPU` patterns (e.g. `8xb200`).
pub fn resolve(name: &str) -> Option<HardwareConfig> {
    let key = normalize(name);
    let (count, gpu) = parse_count_gpu(&key);
    match gpu.as_str() {
        "h100" => Some(make_config(count, &H100)),
        "h800" => Some(make_config(count, &H800)),
        "h20" => Some(make_config(count, &H20)),
        "h20141g" | "h20141gb" => Some(make_config(count, &H20_141G)),
        "a10080gb" | "a100" => Some(make_config(count, &A100_80GB)),
        "a10040gb" => Some(make_config(count, &A100_40GB)),
        "b200" => Some(make_config(count, &B200)),
        "b300" => Some(make_config(count, &B300)),
        _ => None,
    }
}

// ---------------------------------------------------------------------------
// Internals
// ---------------------------------------------------------------------------

/// Canonicalize a preset name for matching: drop every `-`, `_` and space,
/// and lowercase ASCII letters (non-ASCII characters pass through unchanged).
fn normalize(s: &str) -> String {
    s.chars()
        .filter(|&ch| !matches!(ch, '-' | '_' | ' '))
        .map(|ch| ch.to_ascii_lowercase())
        .collect()
}

/// Split a normalized preset name into `(gpu_count, gpu_model)`.
///
/// `"8xh100"` → `(8, "h100")`, `"h100"` → `(1, "h100")`.
///
/// The text before the first `x` is taken as the GPU count only when it
/// parses as a `u32` that is at least 1.  In every other case — no `x`, a
/// non-numeric prefix, or a count of zero — the whole input is returned as
/// the model name with a count of 1.  A zero count is rejected this way
/// because it would otherwise scale every per-GPU figure to zero; the
/// unsplit name (e.g. `"0xh100"`) matches no preset, so the lookup fails
/// instead of yielding an all-zero configuration.
fn parse_count_gpu(s: &str) -> (u32, String) {
    s.split_once('x')
        .and_then(|(prefix, model)| {
            let count = prefix.parse::<u32>().ok().filter(|&n| n > 0)?;
            Some((count, model.to_string()))
        })
        .unwrap_or_else(|| (1, s.to_string()))
}

// -- Per-GPU base specs (single die, BF16 dense) -----------------------------

/// Baseline specification of one GPU.
///
/// `make_config` multiplies the throughput, bandwidth and capacity figures by
/// the GPU count and uses `pcie_gen` to choose link defaults.
struct GpuBase {
    /// BF16 dense throughput, in FLOPS.
    flops: f64,
    /// FP8 dense throughput, in FLOPS (0 = not supported).
    fp8_flops: f64,
    /// FP4 dense throughput, in FLOPS (0 = not supported).
    fp4_flops: f64,
    /// HBM bandwidth, in bytes per second.
    mem_bw: f64,
    /// Total HBM capacity, in bytes.
    hbm: f64,
    /// PCIe generation (4/5/6).  `make_config` also keys the RDMA NIC and
    /// intra-node TP link defaults off this value.
    pcie_gen: u32,
}

/// Per-GPU baseline for the `h100` preset.
const H100: GpuBase = GpuBase {
    flops: 9.89e14,      // 989 TFLOPS BF16 dense
    fp8_flops: 1.979e15, // 1979 TFLOPS FP8 dense
    fp4_flops: 0.0,      // not supported
    mem_bw: 3.35e12,     // 3.35 TB/s HBM3
    hbm: 80.0e9,         // 80 GB
    pcie_gen: 5,         // selects the Gen5 x16 / 200 Gbps NIC defaults in `make_config`
};

/// Per-GPU baseline for the `h800` preset.
///
/// Every field has the same value as the `H100` preset.
// NOTE(review): `make_config` derives the intra-node TP bandwidth from
// `pcie_gen`, so this preset also gets the same interconnect figures as H100
// — confirm that is the intended model for H800.
const H800: GpuBase = GpuBase {
    flops: 9.89e14, // 989 TFLOPS BF16 dense (same die as H100)
    fp8_flops: 1.979e15, // 1979 TFLOPS FP8 dense
    fp4_flops: 0.0, // not supported
    mem_bw: 3.35e12, // 3.35 TB/s HBM3
    hbm: 80.0e9,     // 80 GB
    pcie_gen: 5,
};

/// Per-GPU baseline for the `h20` preset (96 GB variant).
const H20: GpuBase = GpuBase {
    flops: 1.48e14,     // 148 TFLOPS BF16 (China-export Hopper)
    fp8_flops: 2.96e14, // 296 TFLOPS FP8
    fp4_flops: 0.0,     // not supported
    mem_bw: 4.0e12,     // 4.0 TB/s HBM3
    hbm: 96.0e9,        // 96 GB
    pcie_gen: 5,
};

/// Per-GPU baseline for the `h20-141g` preset.
///
/// Compute figures are copied from `H20`; only HBM capacity and bandwidth
/// differ.
const H20_141G: GpuBase = GpuBase {
    flops: 1.48e14,     // modeled as the same H20 compute envelope
    fp8_flops: 2.96e14, // modeled as the same H20 FP8 throughput
    fp4_flops: 0.0,     // not supported
    mem_bw: 4.8e12,     // 4.8 TB/s (141 GB HBM variant)
    hbm: 141.0e9,       // 141 GB
    pcie_gen: 5,
};

/// Per-GPU baseline for the `a100-80gb` preset (also used for a bare `a100`).
const A100_80GB: GpuBase = GpuBase {
    flops: 3.12e14, // 312 TFLOPS BF16
    fp8_flops: 0.0, // A100 has no FP8 tensor cores
    fp4_flops: 0.0, // not supported
    mem_bw: 2.0e12, // 2.0 TB/s HBM2e
    hbm: 80.0e9,    // 80 GB
    pcie_gen: 4,    // selects the Gen4 x16 / 200 Gbps NIC defaults in `make_config`
};

/// Per-GPU baseline for the `a100-40gb` preset.
///
/// Same compute as `A100_80GB`, with half the HBM capacity and lower HBM
/// bandwidth.
const A100_40GB: GpuBase = GpuBase {
    flops: 3.12e14, // 312 TFLOPS BF16
    fp8_flops: 0.0, // not supported
    fp4_flops: 0.0, // not supported
    mem_bw: 1.555e12, // 1.555 TB/s HBM2e
    hbm: 40.0e9,      // 40 GB
    pcie_gen: 4,
};

// DGX B200 (8 GPU) specs: BF16 18 PFLOPS, FP8 36 PFLOPS, FP4 72 PFLOPS (dense).
// The per-GPU compute values below are those system totals divided by 8.
/// Per-GPU baseline for the `b200` preset.
const B200: GpuBase = GpuBase {
    flops: 2.25e15,    // 2250 TFLOPS BF16 dense
    fp8_flops: 4.5e15, // 4500 TFLOPS FP8 dense
    fp4_flops: 9.0e15, // 9000 TFLOPS FP4 dense
    mem_bw: 8.0e12,    // 8.0 TB/s HBM3e
    hbm: 192.0e9,      // 192 GB
    pcie_gen: 6,       // selects the Gen6 x16 / 400 Gbps NIC defaults in `make_config`
};

// DGX B300 (8 GPU) specs: BF16 18 PFLOPS, FP8 ~54 PFLOPS, FP4 108 PFLOPS (dense).
// The per-GPU compute values below are those system totals divided by 8.
/// Per-GPU baseline for the `b300` preset.
const B300: GpuBase = GpuBase {
    flops: 2.25e15,     // 2250 TFLOPS BF16 dense (same value as the B200 preset)
    fp8_flops: 6.75e15, // 6750 TFLOPS FP8 dense (estimated as FP4 / 2)
    fp4_flops: 13.5e15, // 13500 TFLOPS FP4 dense (Blackwell Ultra enhanced)
    // NOTE(review): verify this bandwidth against the vendor datasheet; public
    // Blackwell Ultra material appears to quote 8 TB/s per GPU — confirm.
    mem_bw: 12.0e12,    // 12 TB/s HBM3e 12-Hi
    hbm: 288.0e9,       // 288 GB HBM3e 12-Hi
    pcie_gen: 6,
};

/// Scale a single-GPU [`GpuBase`] into the [`HardwareConfig`] of an `n`-GPU
/// tensor-parallel group.
///
/// * Compute throughput, HBM bandwidth, HBM capacity, PCIe bandwidth and
///   intra-node TP bandwidth are the per-GPU figure multiplied by `n`.
/// * PCIe, RDMA and TP-link characteristics are selected from
///   `base.pcie_gen`.
/// * RDMA bandwidth is one NIC's worth below 8 GPUs and two NICs' worth from
///   8 GPUs up.
/// * Server DRAM, host DRAM bandwidth and the prefill chunk size are step
///   defaults keyed on `n`; `max_batch_slots` is always 256.
fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {
    let gpus = f64::from(n);
    let eight_or_more = n >= 8;

    // PCIe x16 link per GPU: (bandwidth in B/s, latency in µs).
    // Anything that is not exactly Gen6 or Gen5 gets the Gen4 figures.
    let (pcie_link_bw, pcie_latency_us) = if base.pcie_gen == 6 {
        (128.0e9, 4.0)
    } else if base.pcie_gen == 5 {
        (64.0e9, 5.0)
    } else {
        (32.0e9, 5.0)
    };

    // RDMA NIC: 400 Gbps on Gen6 platforms, 200 Gbps otherwise; servers with
    // 8 or more GPUs are modeled with two NICs.
    let (nic_bw, rdma_latency_us) = if base.pcie_gen == 6 {
        (50.0e9, 6.0)
    } else {
        (25.0e9, 8.0)
    };
    let nic_count = if eight_or_more { 2.0 } else { 1.0 };

    // Intra-node TP link, per GPU: (bandwidth in B/s, latency in µs).
    let (tp_link_bw, intra_node_tp_latency_us) = if base.pcie_gen >= 6 {
        (1.8e12, 1.0)
    } else {
        (9.0e11, 2.0)
    };

    // Server DRAM: rough defaults by deployment size.
    let dram_bytes = if n == 1 {
        512.0e9
    } else if (2..=4).contains(&n) {
        1.0e12
    } else {
        1.5e12
    };

    let host_dram_bw = if eight_or_more { 9.0e11 } else { 5.0e11 };
    let prefill_chunk_tokens = if n >= 4 { 4096 } else { 2048 };

    HardwareConfig {
        gpu_flops: base.flops * gpus,
        gpu_fp8_flops: base.fp8_flops * gpus,
        gpu_fp4_flops: base.fp4_flops * gpus,
        gpu_mem_bw: base.mem_bw * gpus,
        hbm_bytes: base.hbm * gpus,
        dram_bytes,
        host_dram_bw,
        pcie_bw: pcie_link_bw * gpus,
        pcie_latency_us,
        rdma_bw: nic_bw * nic_count,
        rdma_latency_us,
        intra_node_tp_bw: tp_link_bw * gpus,
        intra_node_tp_latency_us,
        tp_degree: n,
        max_batch_slots: 256,
        prefill_chunk_tokens,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `actual` is strictly within `tol` of `expected`.
    #[track_caller]
    fn assert_close(actual: f64, expected: f64, tol: f64) {
        assert!((actual - expected).abs() < tol);
    }

    /// A bare GPU name yields the unscaled per-GPU figures.
    #[test]
    fn resolve_single_gpu() {
        let single = resolve("h100").unwrap();
        assert_close(single.gpu_flops, 9.89e14, 1e10);
        assert_close(single.hbm_bytes, 80e9, 1e6);
        assert_eq!(single.prefill_chunk_tokens, 2048);
    }

    /// An `NxGPU` name scales compute, HBM and PCIe by the GPU count.
    #[test]
    fn resolve_tp_group() {
        let group = resolve("8xb200").unwrap();
        assert_close(group.gpu_flops, 2.25e15 * 8.0, 1e11);
        assert_close(group.hbm_bytes, 192e9 * 8.0, 1e6);
        assert_close(group.pcie_bw, 128e9 * 8.0, 1e6);
        assert_eq!(group.prefill_chunk_tokens, 4096);
    }

    /// Case and `-`/`_` separators do not affect the lookup.
    #[test]
    fn resolve_case_and_separator_insensitive() {
        let spellings = [
            "H100",
            "8xB200",
            "8x-B200",
            "8xH20-141G",
            "a100-80gb",
            "A100_80GB",
            "a100_80gb",
        ];
        for name in spellings {
            assert!(resolve(name).is_some(), "preset {:?} should resolve", name);
        }
    }

    /// Names outside the preset table are rejected.
    #[test]
    fn resolve_unknown_returns_none() {
        for name in ["v100", "tpu-v5", ""] {
            assert!(resolve(name).is_none(), "preset {:?} should not resolve", name);
        }
    }

    /// The two A100 presets differ in HBM capacity and bandwidth.
    #[test]
    fn a100_variants() {
        let large = resolve("a100-80gb").unwrap();
        let small = resolve("a100-40gb").unwrap();
        assert_close(large.hbm_bytes, 80e9, 1e6);
        assert_close(small.hbm_bytes, 40e9, 1e6);
        assert!(large.gpu_mem_bw > small.gpu_mem_bw);
    }

    /// Compute, HBM bandwidth and HBM capacity grow linearly with GPU count.
    #[test]
    fn scaling_is_linear() {
        let x1 = resolve("h100").unwrap();
        let x4 = resolve("4xh100").unwrap();
        let x8 = resolve("8xh100").unwrap();
        assert_close(x4.gpu_flops, x1.gpu_flops * 4.0, 1.0);
        assert_close(x8.gpu_flops, x1.gpu_flops * 8.0, 1.0);
        assert_close(x4.gpu_mem_bw, x1.gpu_mem_bw * 4.0, 1.0);
        assert_close(x8.hbm_bytes, x1.hbm_bytes * 8.0, 1.0);
    }

    /// The 141 GB H20 keeps H20 compute but has more, faster HBM.
    #[test]
    fn h20_141g_variant_has_larger_hbm() {
        let standard = resolve("8xh20").unwrap();
        let big_hbm = resolve("8xh20-141g").unwrap();
        assert_close(big_hbm.gpu_flops, standard.gpu_flops, 1.0);
        assert!(big_hbm.hbm_bytes > standard.hbm_bytes);
        assert!(big_hbm.gpu_mem_bw > standard.gpu_mem_bw);
    }
}