fix: cache calculation

2026-04-15 17:31:39 +08:00
parent 365ceac3be
commit ff316c6873
23 changed files with 500 additions and 336 deletions
--- a/src/hardware_presets.rs
+++ b/src/hardware_presets.rs
@@ -78,56 +78,56 @@ fn parse_count_gpu(s: &str) -> (u32, String) {
 // -- Per-GPU base specs (single die, BF16 dense) -----------------------------

 struct GpuBase {
-    flops: f64,      // BF16 dense FLOPS
-    fp8_flops: f64,  // FP8 dense FLOPS (0 = not supported)
-    fp4_flops: f64,  // FP4 dense FLOPS (0 = not supported)
-    mem_bw: f64,     // HBM bandwidth (B/s)
-    hbm: f64,        // Total HBM (bytes)
-    pcie_gen: u32,   // PCIe generation (4/5/6)
+    flops: f64,     // BF16 dense FLOPS
+    fp8_flops: f64, // FP8 dense FLOPS (0 = not supported)
+    fp4_flops: f64, // FP4 dense FLOPS (0 = not supported)
+    mem_bw: f64,    // HBM bandwidth (B/s)
+    hbm: f64,       // Total HBM (bytes)
+    pcie_gen: u32,  // PCIe generation (4/5/6)
 }

 const H100: GpuBase = GpuBase {
-    flops: 9.89e14,    // 989 TFLOPS BF16 dense
+    flops: 9.89e14,      // 989 TFLOPS BF16 dense
    fp8_flops: 1.979e15, // 1979 TFLOPS FP8 dense
-    fp4_flops: 0.0,    // not supported
-    mem_bw: 3.35e12,   // 3.35 TB/s HBM3
-    hbm: 80.0e9,       // 80 GB
+    fp4_flops: 0.0,      // not supported
+    mem_bw: 3.35e12,     // 3.35 TB/s HBM3
+    hbm: 80.0e9,         // 80 GB
    pcie_gen: 5,
 };

 const H800: GpuBase = GpuBase {
-    flops: 9.89e14,    // same die as H100
+    flops: 9.89e14, // same die as H100
    fp8_flops: 1.979e15,
    fp4_flops: 0.0,
-    mem_bw: 3.35e12,   // 3.35 TB/s HBM3
-    hbm: 80.0e9,       // 80 GB
+    mem_bw: 3.35e12, // 3.35 TB/s HBM3
+    hbm: 80.0e9,     // 80 GB
    pcie_gen: 5,
 };

 const H20: GpuBase = GpuBase {
-    flops: 1.48e14,    // 148 TFLOPS BF16 (China-export Hopper)
+    flops: 1.48e14,     // 148 TFLOPS BF16 (China-export Hopper)
    fp8_flops: 2.96e14, // 296 TFLOPS FP8
-    fp4_flops: 0.0,    // not supported
-    mem_bw: 4.0e12,    // 4.0 TB/s HBM3
-    hbm: 96.0e9,       // 96 GB
+    fp4_flops: 0.0,     // not supported
+    mem_bw: 4.0e12,     // 4.0 TB/s HBM3
+    hbm: 96.0e9,        // 96 GB
    pcie_gen: 5,
 };

 const A100_80GB: GpuBase = GpuBase {
-    flops: 3.12e14,    // 312 TFLOPS BF16
-    fp8_flops: 0.0,    // A100 has no FP8 tensor cores
+    flops: 3.12e14, // 312 TFLOPS BF16
+    fp8_flops: 0.0, // A100 has no FP8 tensor cores
    fp4_flops: 0.0,
-    mem_bw: 2.0e12,    // 2.0 TB/s HBM2e
-    hbm: 80.0e9,       // 80 GB
+    mem_bw: 2.0e12, // 2.0 TB/s HBM2e
+    hbm: 80.0e9,    // 80 GB
    pcie_gen: 4,
 };

 const A100_40GB: GpuBase = GpuBase {
-    flops: 3.12e14,    // 312 TFLOPS BF16
+    flops: 3.12e14, // 312 TFLOPS BF16
    fp8_flops: 0.0,
    fp4_flops: 0.0,
-    mem_bw: 1.555e12,  // 1.555 TB/s HBM2e
-    hbm: 40.0e9,       // 40 GB
+    mem_bw: 1.555e12, // 1.555 TB/s HBM2e
+    hbm: 40.0e9,      // 40 GB
    pcie_gen: 4,
 };

@@ -143,11 +143,11 @@ const B200: GpuBase = GpuBase {

 // DGX B300 (8 GPU) specs: BF16 18 PFLOPS, FP8 ~54 PFLOPS, FP4 108 PFLOPS (dense)
 const B300: GpuBase = GpuBase {
-    flops: 2.25e15,      // 2250 TFLOPS BF16 dense (same GB202 die as B200)
-    fp8_flops: 6.75e15,  // 6750 TFLOPS FP8 dense (estimated from FP4/2)
-    fp4_flops: 13.5e15,  // 13500 TFLOPS FP4 dense (Blackwell Ultra enhanced)
-    mem_bw: 12.0e12,     // 12 TB/s HBM3e 12-Hi
-    hbm: 288.0e9,        // 288 GB HBM3e 12-Hi
+    flops: 2.25e15,     // 2250 TFLOPS BF16 dense (DGX B300: 18 PFLOPS / 8 GPUs)
+    fp8_flops: 6.75e15, // 6750 TFLOPS FP8 dense (estimated from FP4/2)
+    fp4_flops: 13.5e15, // 13500 TFLOPS FP4 dense (Blackwell Ultra enhanced)
+    mem_bw: 12.0e12,    // 12 TB/s HBM3e 12-Hi
+    hbm: 288.0e9,       // 288 GB HBM3e 12-Hi
    pcie_gen: 6,
 };

@@ -162,15 +162,15 @@ fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {

    // PCIe per-GPU bandwidth and latency by generation
    let (pcie_per_gpu, pcie_lat) = match base.pcie_gen {
-        6 => (128.0e9, 4.0),  // Gen6 x16
-        5 => (64.0e9, 5.0),   // Gen5 x16
-        _ => (32.0e9, 5.0),   // Gen4 x16
+        6 => (128.0e9, 4.0), // Gen6 x16
+        5 => (64.0e9, 5.0),  // Gen5 x16
+        _ => (32.0e9, 5.0),  // Gen4 x16 (also the fallback for any unrecognized generation)
    };

    // RDMA: base NIC speed by PCIe gen, scaled for multi-NIC servers
    let (rdma_base, rdma_lat) = match base.pcie_gen {
-        6 => (50.0e9, 6.0),  // 400 Gbps NIC
-        _ => (25.0e9, 8.0),  // 200 Gbps NIC
+        6 => (50.0e9, 6.0), // 400 Gbps NIC
+        _ => (25.0e9, 8.0), // 200 Gbps NIC (every generation other than 6)
    };
    let rdma_scale = if n >= 8 { 2.0 } else { 1.0 };