KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV cache-aware routing policies
in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache
hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention,
architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide
meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded,
least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score,
estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing,
built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation
tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit ec73a95e05
2026-04-14 01:16:02 +08:00
52 changed files with 6005 additions and 0 deletions

src/sim/engine.rs

@@ -0,0 +1,113 @@
//! Discrete-event engine.
//!
//! Single-threaded, with virtual time tracked as `f64` seconds. Events are
//! stored in a min-heap keyed by `(time, seq)`, so equal-time events fire in
//! insertion order.
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use super::events::Event;
#[derive(Debug)]
struct Slot {
    time: f64,
    seq: u64,
    event: Event,
}

impl Eq for Slot {}

impl PartialEq for Slot {
    fn eq(&self, other: &Self) -> bool {
        self.time == other.time && self.seq == other.seq
    }
}

impl Ord for Slot {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reverse the comparison so the max-heap `BinaryHeap` acts as a min-heap.
        // `unwrap_or(Equal)` only triggers for NaN times, which well-formed
        // schedules avoid; ties then fall through to FIFO order on `seq`.
        other
            .time
            .partial_cmp(&self.time)
            .unwrap_or(Ordering::Equal)
            .then_with(|| other.seq.cmp(&self.seq))
    }
}

impl PartialOrd for Slot {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
#[derive(Debug, Default)]
pub struct EventQueue {
    heap: BinaryHeap<Slot>,
    seq: u64,
    now: f64,
}

impl EventQueue {
    pub fn new() -> Self {
        Self::default()
    }

    /// Current virtual time in seconds.
    pub fn now(&self) -> f64 {
        self.now
    }

    /// Schedule `event` at `time`, clamped to the present so nothing fires in the past.
    pub fn schedule(&mut self, time: f64, event: Event) {
        let t = time.max(self.now);
        self.seq += 1;
        self.heap.push(Slot { time: t, seq: self.seq, event });
    }

    /// Pop the earliest event, advancing virtual time to its timestamp.
    pub fn pop(&mut self) -> Option<(f64, Event)> {
        let slot = self.heap.pop()?;
        if slot.time > self.now {
            self.now = slot.time;
        }
        Some((slot.time, slot.event))
    }

    pub fn len(&self) -> usize {
        self.heap.len()
    }

    pub fn is_empty(&self) -> bool {
        self.heap.is_empty()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::InstanceId;

    #[test]
    fn pops_in_time_order() {
        let mut q = EventQueue::new();
        q.schedule(2.0, Event::BatchTick { instance: 0 as InstanceId });
        q.schedule(1.0, Event::BatchTick { instance: 1 });
        q.schedule(1.5, Event::BatchTick { instance: 2 });
        let (t1, _) = q.pop().unwrap();
        let (t2, _) = q.pop().unwrap();
        let (t3, _) = q.pop().unwrap();
        assert!(t1 <= t2 && t2 <= t3);
        assert!((t1 - 1.0).abs() < 1e-12);
        assert!((t3 - 2.0).abs() < 1e-12);
    }

    #[test]
    fn equal_time_fifo() {
        let mut q = EventQueue::new();
        q.schedule(1.0, Event::BatchTick { instance: 7 });
        q.schedule(1.0, Event::BatchTick { instance: 8 });
        let (_, e1) = q.pop().unwrap();
        let (_, e2) = q.pop().unwrap();
        match (e1, e2) {
            (Event::BatchTick { instance: a }, Event::BatchTick { instance: b }) => {
                assert_eq!(a, 7);
                assert_eq!(b, 8);
            }
            _ => panic!("wrong events"),
        }
    }
}

src/sim/events.rs

@@ -0,0 +1,15 @@
//! Event types for the discrete-event engine.
use crate::types::{InstanceId, ReqId};

#[derive(Debug)]
pub enum Event {
    /// New trace request arrives at the cluster router.
    Arrival { req_id: ReqId },
    /// Per-instance scheduler tick (continuous batching).
    BatchTick { instance: InstanceId },
    /// Periodic time-series sample of all instances.
    Sample,
    /// Stop the simulation early (used internally).
    Stop,
}

src/sim/mod.rs

@@ -0,0 +1,5 @@
pub mod engine;
pub mod events;
pub use engine::EventQueue;
pub use events::Event;