fix: cache calculation

This commit is contained in:
2026-04-15 17:31:39 +08:00
parent 365ceac3be
commit ff316c6873
23 changed files with 500 additions and 336 deletions

View File

@@ -78,56 +78,56 @@ fn parse_count_gpu(s: &str) -> (u32, String) {
// -- Per-GPU base specs (single die, BF16 dense) -----------------------------
/// Base hardware specification for one GPU model.
///
/// Throughput fields are dense FLOPS; a value of `0.0` in `fp8_flops` or
/// `fp4_flops` marks that precision as unsupported on the part.
struct GpuBase {
    flops: f64,     // BF16 dense FLOPS
    fp8_flops: f64, // FP8 dense FLOPS (0 = not supported)
    fp4_flops: f64, // FP4 dense FLOPS (0 = not supported)
    mem_bw: f64,    // HBM bandwidth (B/s)
    hbm: f64,       // Total HBM (bytes)
    pcie_gen: u32,  // PCIe generation (4/5/6)
}
/// H100 per-GPU base specs. FP4 is marked unsupported (`0.0`).
const H100: GpuBase = GpuBase {
    flops: 9.89e14,      // 989 TFLOPS BF16 dense
    fp8_flops: 1.979e15, // 1979 TFLOPS FP8 dense
    fp4_flops: 0.0,      // not supported
    mem_bw: 3.35e12,     // 3.35 TB/s HBM3
    hbm: 80.0e9,         // 80 GB
    pcie_gen: 5,
};
/// H800 per-GPU base specs. Every value here matches the H100 entry.
const H800: GpuBase = GpuBase {
    flops: 9.89e14, // same die as H100
    fp8_flops: 1.979e15,
    fp4_flops: 0.0,
    mem_bw: 3.35e12, // 3.35 TB/s HBM3
    hbm: 80.0e9,     // 80 GB
    pcie_gen: 5,
};
/// H20 per-GPU base specs. FP4 is marked unsupported (`0.0`).
const H20: GpuBase = GpuBase {
    flops: 1.48e14,     // 148 TFLOPS BF16 (China-export Hopper)
    fp8_flops: 2.96e14, // 296 TFLOPS FP8
    fp4_flops: 0.0,     // not supported
    mem_bw: 4.0e12,     // 4.0 TB/s HBM3
    hbm: 96.0e9,        // 96 GB
    pcie_gen: 5,
};
/// A100 (80 GB) per-GPU base specs. FP8 and FP4 are marked unsupported (`0.0`).
const A100_80GB: GpuBase = GpuBase {
    flops: 3.12e14, // 312 TFLOPS BF16
    fp8_flops: 0.0, // A100 has no FP8 tensor cores
    fp4_flops: 0.0,
    mem_bw: 2.0e12, // 2.0 TB/s HBM2e
    hbm: 80.0e9,    // 80 GB
    pcie_gen: 4,
};
/// A100 (40 GB) per-GPU base specs. FP8 and FP4 are marked unsupported (`0.0`).
const A100_40GB: GpuBase = GpuBase {
    flops: 3.12e14, // 312 TFLOPS BF16
    fp8_flops: 0.0,
    fp4_flops: 0.0,
    mem_bw: 1.555e12, // 1.555 TB/s HBM2e
    hbm: 40.0e9,      // 40 GB
    pcie_gen: 4,
};
@@ -143,11 +143,11 @@ const B200: GpuBase = GpuBase {
// DGX B300 (8 GPU) specs: BF16 18 PFLOPS, FP8 ~54 PFLOPS, FP4 108 PFLOPS (dense)
/// B300 per-GPU base specs. The FP8 figure is an estimate (FP4 / 2), as noted
/// on the field.
const B300: GpuBase = GpuBase {
    flops: 2.25e15,     // 2250 TFLOPS BF16 dense (same GB202 die as B200)
    fp8_flops: 6.75e15, // 6750 TFLOPS FP8 dense (estimated from FP4/2)
    fp4_flops: 13.5e15, // 13500 TFLOPS FP4 dense (Blackwell Ultra enhanced)
    mem_bw: 12.0e12,    // 12 TB/s HBM3e 12-Hi
    hbm: 288.0e9,       // 288 GB HBM3e 12-Hi
    pcie_gen: 6,
};
@@ -162,15 +162,15 @@ fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {
// PCIe per-GPU bandwidth and latency by generation
let (pcie_per_gpu, pcie_lat) = match base.pcie_gen {
6 => (128.0e9, 4.0), // Gen6 x16
5 => (64.0e9, 5.0), // Gen5 x16
_ => (32.0e9, 5.0), // Gen4 x16
6 => (128.0e9, 4.0), // Gen6 x16
5 => (64.0e9, 5.0), // Gen5 x16
_ => (32.0e9, 5.0), // Gen4 x16
};
// RDMA: base NIC speed by PCIe gen, scaled for multi-NIC servers
let (rdma_base, rdma_lat) = match base.pcie_gen {
6 => (50.0e9, 6.0), // 400 Gbps NIC
_ => (25.0e9, 8.0), // 200 Gbps NIC
6 => (50.0e9, 6.0), // 400 Gbps NIC
_ => (25.0e9, 8.0), // 200 Gbps NIC
};
let rdma_scale = if n >= 8 { 2.0 } else { 1.0 };