fix: cache calculation
This commit is contained in:
@@ -78,56 +78,56 @@ fn parse_count_gpu(s: &str) -> (u32, String) {
|
||||
// -- Per-GPU base specs (single die, BF16 dense) -----------------------------
|
||||
|
||||
struct GpuBase {
|
||||
flops: f64, // BF16 dense FLOPS
|
||||
fp8_flops: f64, // FP8 dense FLOPS (0 = not supported)
|
||||
fp4_flops: f64, // FP4 dense FLOPS (0 = not supported)
|
||||
mem_bw: f64, // HBM bandwidth (B/s)
|
||||
hbm: f64, // Total HBM (bytes)
|
||||
pcie_gen: u32, // PCIe generation (4/5/6)
|
||||
flops: f64, // BF16 dense FLOPS
|
||||
fp8_flops: f64, // FP8 dense FLOPS (0 = not supported)
|
||||
fp4_flops: f64, // FP4 dense FLOPS (0 = not supported)
|
||||
mem_bw: f64, // HBM bandwidth (B/s)
|
||||
hbm: f64, // Total HBM (bytes)
|
||||
pcie_gen: u32, // PCIe generation (4/5/6)
|
||||
}
|
||||
|
||||
const H100: GpuBase = GpuBase {
|
||||
flops: 9.89e14, // 989 TFLOPS BF16 dense
|
||||
flops: 9.89e14, // 989 TFLOPS BF16 dense
|
||||
fp8_flops: 1.979e15, // 1979 TFLOPS FP8 dense
|
||||
fp4_flops: 0.0, // not supported
|
||||
mem_bw: 3.35e12, // 3.35 TB/s HBM3
|
||||
hbm: 80.0e9, // 80 GB
|
||||
fp4_flops: 0.0, // not supported
|
||||
mem_bw: 3.35e12, // 3.35 TB/s HBM3
|
||||
hbm: 80.0e9, // 80 GB
|
||||
pcie_gen: 5,
|
||||
};
|
||||
|
||||
const H800: GpuBase = GpuBase {
|
||||
flops: 9.89e14, // same die as H100
|
||||
flops: 9.89e14, // same die as H100
|
||||
fp8_flops: 1.979e15,
|
||||
fp4_flops: 0.0,
|
||||
mem_bw: 3.35e12, // 3.35 TB/s HBM3
|
||||
hbm: 80.0e9, // 80 GB
|
||||
mem_bw: 3.35e12, // 3.35 TB/s HBM3
|
||||
hbm: 80.0e9, // 80 GB
|
||||
pcie_gen: 5,
|
||||
};
|
||||
|
||||
const H20: GpuBase = GpuBase {
|
||||
flops: 1.48e14, // 148 TFLOPS BF16 (China-export Hopper)
|
||||
flops: 1.48e14, // 148 TFLOPS BF16 (China-export Hopper)
|
||||
fp8_flops: 2.96e14, // 296 TFLOPS FP8
|
||||
fp4_flops: 0.0, // not supported
|
||||
mem_bw: 4.0e12, // 4.0 TB/s HBM3
|
||||
hbm: 96.0e9, // 96 GB
|
||||
fp4_flops: 0.0, // not supported
|
||||
mem_bw: 4.0e12, // 4.0 TB/s HBM3
|
||||
hbm: 96.0e9, // 96 GB
|
||||
pcie_gen: 5,
|
||||
};
|
||||
|
||||
const A100_80GB: GpuBase = GpuBase {
|
||||
flops: 3.12e14, // 312 TFLOPS BF16
|
||||
fp8_flops: 0.0, // A100 has no FP8 tensor cores
|
||||
flops: 3.12e14, // 312 TFLOPS BF16
|
||||
fp8_flops: 0.0, // A100 has no FP8 tensor cores
|
||||
fp4_flops: 0.0,
|
||||
mem_bw: 2.0e12, // 2.0 TB/s HBM2e
|
||||
hbm: 80.0e9, // 80 GB
|
||||
mem_bw: 2.0e12, // 2.0 TB/s HBM2e
|
||||
hbm: 80.0e9, // 80 GB
|
||||
pcie_gen: 4,
|
||||
};
|
||||
|
||||
const A100_40GB: GpuBase = GpuBase {
|
||||
flops: 3.12e14, // 312 TFLOPS BF16
|
||||
flops: 3.12e14, // 312 TFLOPS BF16
|
||||
fp8_flops: 0.0,
|
||||
fp4_flops: 0.0,
|
||||
mem_bw: 1.555e12, // 1.555 TB/s HBM2e
|
||||
hbm: 40.0e9, // 40 GB
|
||||
mem_bw: 1.555e12, // 1.555 TB/s HBM2e
|
||||
hbm: 40.0e9, // 40 GB
|
||||
pcie_gen: 4,
|
||||
};
|
||||
|
||||
@@ -143,11 +143,11 @@ const B200: GpuBase = GpuBase {
|
||||
|
||||
// DGX B300 (8 GPU) specs: BF16 18 PFLOPS, FP8 ~54 PFLOPS, FP4 108 PFLOPS (dense)
|
||||
const B300: GpuBase = GpuBase {
|
||||
flops: 2.25e15, // 2250 TFLOPS BF16 dense (same GB202 die as B200)
|
||||
fp8_flops: 6.75e15, // 6750 TFLOPS FP8 dense (estimated from FP4/2)
|
||||
fp4_flops: 13.5e15, // 13500 TFLOPS FP4 dense (Blackwell Ultra enhanced)
|
||||
mem_bw: 12.0e12, // 12 TB/s HBM3e 12-Hi
|
||||
hbm: 288.0e9, // 288 GB HBM3e 12-Hi
|
||||
flops: 2.25e15, // 2250 TFLOPS BF16 dense (same GB202 die as B200)
|
||||
fp8_flops: 6.75e15, // 6750 TFLOPS FP8 dense (estimated from FP4/2)
|
||||
fp4_flops: 13.5e15, // 13500 TFLOPS FP4 dense (Blackwell Ultra enhanced)
|
||||
mem_bw: 12.0e12, // 12 TB/s HBM3e 12-Hi
|
||||
hbm: 288.0e9, // 288 GB HBM3e 12-Hi
|
||||
pcie_gen: 6,
|
||||
};
|
||||
|
||||
@@ -162,15 +162,15 @@ fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {
|
||||
|
||||
// PCIe per-GPU bandwidth and latency by generation
|
||||
let (pcie_per_gpu, pcie_lat) = match base.pcie_gen {
|
||||
6 => (128.0e9, 4.0), // Gen6 x16
|
||||
5 => (64.0e9, 5.0), // Gen5 x16
|
||||
_ => (32.0e9, 5.0), // Gen4 x16
|
||||
6 => (128.0e9, 4.0), // Gen6 x16
|
||||
5 => (64.0e9, 5.0), // Gen5 x16
|
||||
_ => (32.0e9, 5.0), // Gen4 x16
|
||||
};
|
||||
|
||||
// RDMA: base NIC speed by PCIe gen, scaled for multi-NIC servers
|
||||
let (rdma_base, rdma_lat) = match base.pcie_gen {
|
||||
6 => (50.0e9, 6.0), // 400 Gbps NIC
|
||||
_ => (25.0e9, 8.0), // 200 Gbps NIC
|
||||
6 => (50.0e9, 6.0), // 400 Gbps NIC
|
||||
_ => (25.0e9, 8.0), // 200 Gbps NIC
|
||||
};
|
||||
let rdma_scale = if n >= 8 { 2.0 } else { 1.0 };
|
||||
|
||||
|
||||
Reference in New Issue
Block a user