KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV-cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
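For a rough sense of the magnitudes the PCIe/RDMA models below have to capture, here is an illustrative sketch (not part of this commit; the model shape, dtype width, and the 64 GB/s PCIe figure are assumptions chosen for the example, not simulator defaults):

// Illustrative only: KV-cache footprint of a cached prefix and the
// contention-free time to pull it over PCIe. Every constant below is an
// assumption for the example, not a value from this repository.
fn main() {
    let layers = 32u64;          // hypothetical decoder layer count
    let kv_heads = 8u64;         // hypothetical KV heads (GQA)
    let head_dim = 128u64;       // hypothetical head dimension
    let dtype_bytes = 2u64;      // fp16/bf16
    let prefix_tokens = 4096u64;

    // K and V per token: 2 * layers * kv_heads * head_dim * dtype_bytes.
    let bytes_per_token = 2 * layers * kv_heads * head_dim * dtype_bytes; // 128 KiB
    let total = bytes_per_token * prefix_tokens; // ~537 MB

    let pcie_bw = 64e9; // assumed effective host<->GPU bandwidth, bytes/s
    println!(
        "{} MB per prefix -> {:.1} ms over PCIe",
        total / 1_000_000,
        total as f64 / pcie_bw * 1e3
    );
}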
src/network.rs · new file · 84 lines
@@ -0,0 +1,84 @@
//! Network cost models for RDMA (cross-instance) and PCIe (host<->GPU).
//!
//! Each link is modeled as a token bucket via a `next_free` cursor: a fetch of
//! `bytes` starting at `now` waits until `next_free`, then advances the cursor
//! by `bytes / bw`. Latency is added on top of transfer time. This captures
//! contention without simulating individual packets.

use crate::config::HardwareConfig;

#[derive(Debug, Clone)]
pub struct LinkModel {
    pub bw_bytes_per_s: f64,
    pub latency_s: f64,
    next_free: f64,
}

impl LinkModel {
    pub fn new(bw_bytes_per_s: f64, latency_s: f64) -> Self {
        Self {
            bw_bytes_per_s,
            latency_s,
            next_free: 0.0,
        }
    }

    /// Reserve a transfer of `bytes` starting at `now`. Returns the absolute
    /// time at which the bytes have all arrived (advances internal cursor).
    pub fn reserve(&mut self, now: f64, bytes: u64) -> f64 {
        if bytes == 0 {
            return now + self.latency_s;
        }
        let xfer = bytes as f64 / self.bw_bytes_per_s;
        let start = self.next_free.max(now);
        self.next_free = start + xfer;
        self.next_free + self.latency_s
    }

    /// Pure cost (no contention): how long to push `bytes` over this link.
    pub fn cost(&self, bytes: u64) -> f64 {
        if bytes == 0 {
            self.latency_s
        } else {
            self.latency_s + bytes as f64 / self.bw_bytes_per_s
        }
    }
}

/// Per-instance bundle of links: PCIe (host<->GPU) and RDMA (host<->remote).
#[derive(Debug, Clone)]
pub struct InstanceLinks {
    pub pcie: LinkModel,
    pub rdma: LinkModel,
}

impl InstanceLinks {
    pub fn from_hw(hw: &HardwareConfig) -> Self {
        Self {
            pcie: LinkModel::new(hw.pcie_bw, hw.pcie_latency_us * 1e-6),
            rdma: LinkModel::new(hw.rdma_bw, hw.rdma_latency_us * 1e-6),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn link_cost_matches_formula() {
        let l = LinkModel::new(1.0e9, 1.0e-6);
        // 1 GB / (1 GB/s) = 1s, plus 1us latency
        let t = l.cost(1_000_000_000);
        assert!((t - (1.0 + 1e-6)).abs() < 1e-9);
    }

    #[test]
    fn reserve_serializes_concurrent_transfers() {
        let mut l = LinkModel::new(1.0e9, 0.0);
        let t1 = l.reserve(0.0, 500_000_000); // 0.5s
        let t2 = l.reserve(0.0, 500_000_000); // contended -> 1.0s
        assert!((t1 - 0.5).abs() < 1e-9);
        assert!((t2 - 1.0).abs() < 1e-9);
    }
}
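A minimal usage sketch, assuming `LinkModel` above is in scope (the bandwidth and latency figures are illustrative, not hardware presets from this commit): overlapping `reserve` calls on one link serialize behind each other, while a later fetch finds the link idle again.

// Sketch, not part of this commit: two prefill instances pulling cached
// prefixes over one shared RDMA link model.
fn main() {
    // 25 GB/s NIC with 5 us base latency -- illustrative numbers.
    let mut rdma = LinkModel::new(25e9, 5e-6);

    // Two requests want a 512 MB remote prefix at t = 0.
    let done_a = rdma.reserve(0.0, 512_000_000); // ~0.0205 s
    let done_b = rdma.reserve(0.0, 512_000_000); // queued behind A: ~0.0410 s

    // A later 128 MB fetch at t = 0.1 s sees an idle link again.
    let done_c = rdma.reserve(0.1, 128_000_000); // ~0.1051 s

    println!("A={done_a:.4}s B={done_b:.4}s C={done_c:.4}s");
}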