KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 01:16:02 +08:00
commit ec73a95e05
52 changed files with 6005 additions and 0 deletions
--- a/src/cluster/cluster.rs
+++ b/src/cluster/cluster.rs
@@ -0,0 +1,167 @@
+//! Cluster: routes arrivals, performs the L0 / L1 / remote-RDMA fetch chain
+//! described in the design diagram, and bookkeeps the global meta store.
+
+use crate::cluster::meta_store::MetaStore;
+use crate::config::{Config, ModelConfig};
+use crate::instance::instance::AdmittedRequest;
+use crate::instance::Instance;
+use crate::router::{self, RouteDecision, Router};
+use crate::trace::RequestRecord;
+use crate::types::InstanceId;
+
+#[derive(Debug, Clone)]
+pub struct AdmissionStats { // per-request result of `Cluster::route_and_admit`
+    pub instance: InstanceId, // instance the router chose (`decision.chosen`)
+    pub l0_hit_blocks: u32, // leading blocks matched by the chosen instance's L0
+    pub l1_hit_blocks: u32, // blocks after the L0 hits matched by its L1
+    pub remote_hit_blocks: u32, // following blocks the meta store lists on another instance
+    pub miss_blocks: u32, // total blocks minus the three hit counts
+    pub rdma_bytes: u64, // remote_hit_blocks * kv_block_bytes
+    pub pcie_bytes: u64, // (l1_hit_blocks + remote_hit_blocks) * kv_block_bytes
+    pub fetch_time_s: f64, // ready_at - (now + probe_overhead_s), clamped at 0
+    pub probe_overhead_s: f64, // copied from `decision.probe_overhead_s`
+    pub ready_at: f64, // time returned by the last link reservation, else now + probe overhead
+    pub decision: RouteDecision, // the router's full decision
+}
+
+pub struct Cluster { // routing front-end plus all per-instance state
+    pub instances: Vec<Instance>, // indexed by `InstanceId as usize`
+    pub meta_store: MetaStore, // block hash -> owning instances, with TTL
+    pub router: Box<dyn Router>, // built by `router::build` in `Cluster::new`
+    pub block_size_tokens: u32, // from `model.block_size_tokens`; converts miss blocks to tokens
+    pub kv_block_bytes: u64, // from `model.kv_block_bytes()`; sizes link transfers
+}
+
+impl Cluster {
+    /// Builds `config.cluster.num_instances` instances that share `model` and
+    /// `config.hardware`, an empty meta store using the TTL from
+    /// `config.cluster.meta_store.ttl_seconds`, and the router returned by
+    /// `router::build` for `config.sim.seed`.
+    pub fn new(config: &Config, model: &ModelConfig) -> Self {
+        let instances: Vec<Instance> = (0..config.cluster.num_instances)
+            .map(|id| Instance::new(id as InstanceId, model, &config.hardware))
+            .collect();
+        Self {
+            instances,
+            meta_store: MetaStore::new(config.cluster.meta_store.ttl_seconds),
+            router: router::build(config, config.sim.seed),
+            block_size_tokens: model.block_size_tokens,
+            kv_block_bytes: model.kv_block_bytes(),
+        }
+    }
+
+    /// Routes `req`, walks the L0 -> L1 -> remote fetch chain on the chosen
+    /// instance and admits the request there, returning per-request stats
+    /// for metrics.
+    ///
+    /// The BatchTick is NOT scheduled here; the simulator driver does that
+    /// from the returned `ready_at`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the router picks an instance id outside `self.instances`.
+    pub fn route_and_admit(&mut self, req: &RequestRecord, now: f64) -> AdmissionStats {
+        let decision = self.router.route(req, &self.instances, &self.meta_store, now);
+        let inst_id = decision.chosen;
+        let target = inst_id as usize;
+        let probe_overhead_s = decision.probe_overhead_s;
+        let block_bytes = self.kv_block_bytes;
+
+        // Time spent probing pushes back the moment transfers may begin.
+        let effective_now = now + probe_overhead_s;
+        let total_blocks = req.hash_ids.len() as u32;
+        // Running completion time of the fetch chain.
+        let mut ready_at = effective_now;
+
+        // Stage 1: longest prefix already resident in L0 (touches matches).
+        let l0_hits = self.instances[target].cache.l0.longest_prefix(&req.hash_ids) as u32;
+
+        // Stage 2: continue the prefix in L1. Hits are charged to the PCIe
+        // link and promoted into L0.
+        let after_l0 = &req.hash_ids[l0_hits as usize..];
+        let l1_hits = self.instances[target].cache.l1.longest_prefix(after_l0) as u32;
+        let l1_bytes = u64::from(l1_hits) * block_bytes;
+        if l1_hits > 0 {
+            let inst = &mut self.instances[target];
+            ready_at = inst.links.pcie.reserve(ready_at, l1_bytes);
+            let mut dropped = Vec::new();
+            inst.cache.l0.insert_blocks(&after_l0[..l1_hits as usize], &mut dropped);
+        }
+
+        // Stage 3: remote (v6d) lookup for what is still missing. A block counts
+        // as remotely available iff the meta store lists an unexpired owner other
+        // than the chosen instance; the run stops at the first block that fails.
+        let after_l1 = &after_l0[l1_hits as usize..];
+        let remote_hit_blocks = after_l1
+            .iter()
+            .take_while(|&&hash| {
+                self.meta_store
+                    .instances_for(hash, now)
+                    .iter()
+                    .any(|&owner| owner != inst_id)
+            })
+            .count() as u32;
+        let remote_bytes = u64::from(remote_hit_blocks) * block_bytes;
+        if remote_hit_blocks > 0 {
+            // Peer host -> local DRAM over RDMA, then DRAM -> GPU over PCIe.
+            let inst = &mut self.instances[target];
+            ready_at = inst.links.rdma.reserve(ready_at, remote_bytes);
+            ready_at = inst.links.pcie.reserve(ready_at, remote_bytes);
+            // Pulled blocks land in local L1 (taking LRU space) and in L0.
+            let pulled = &after_l1[..remote_hit_blocks as usize];
+            let (mut l1_dropped, mut l0_dropped) = (Vec::new(), Vec::new());
+            inst.cache.l1.insert_blocks(pulled, &mut l1_dropped);
+            inst.cache.l0.insert_blocks(pulled, &mut l0_dropped);
+            // The chosen instance now owns these blocks too; publish that.
+            for &hash in pulled {
+                self.meta_store.insert(hash, inst_id, now);
+            }
+        }
+
+        // Stage 4: whatever lies past the cached prefix is prefilled from
+        // scratch.
+        let cached_blocks = l0_hits + l1_hits + remote_hit_blocks;
+        let miss_blocks = total_blocks - cached_blocks;
+        let miss_tokens = miss_blocks * self.block_size_tokens;
+
+        // Presence of the to-be-prefilled blocks is recorded right away in L0,
+        // L1 and the meta store. Only presence is tracked, no bytes move: the
+        // async writeback is assumed hidden behind request execution, and no
+        // meta-store inconsistency window is modelled beyond the TTL itself.
+        let inst = &mut self.instances[target];
+        let fresh = &req.hash_ids[cached_blocks as usize..];
+        let (mut l0_dropped, mut l1_dropped) = (Vec::new(), Vec::new());
+        inst.cache.l0.insert_blocks(fresh, &mut l0_dropped);
+        inst.cache.l1.insert_blocks(fresh, &mut l1_dropped);
+        for &hash in fresh {
+            self.meta_store.insert(hash, inst_id, now);
+        }
+
+        // Stage 5: reserve KV slots for the prefill residency. With PD
+        // disaggregation decode runs elsewhere, so only the input blocks
+        // occupy HBM on this instance.
+        inst.admit(AdmittedRequest {
+            req_id: req.req_id,
+            arrival: req.arrival,
+            ready_at,
+            prefill_tokens_remaining: miss_tokens,
+            reserved_blocks: total_blocks,
+        });
+
+        // Both L1 promotions and remote pulls cross PCIe; only the latter
+        // also use RDMA.
+        AdmissionStats {
+            instance: inst_id,
+            l0_hit_blocks: l0_hits,
+            l1_hit_blocks: l1_hits,
+            remote_hit_blocks,
+            miss_blocks,
+            rdma_bytes: remote_bytes,
+            pcie_bytes: l1_bytes + remote_bytes,
+            fetch_time_s: (ready_at - effective_now).max(0.0),
+            probe_overhead_s,
+            ready_at,
+            decision,
+        }
+    }
+}
--- a/src/cluster/meta_store.rs
+++ b/src/cluster/meta_store.rs
@@ -0,0 +1,161 @@
+//! Global redis-like KV-cache index.
+//!
+//! Maps `block_hash -> SmallVec<(instance_id, expires_at)>`. TTL expiry is lazy:
+//! reads skip expired entries but never remove them. The TTL-aware router uses
+//! `score_prefix` to score each instance's predicted longest prefix without probing instances.
+
+use ahash::AHashMap;
+use smallvec::SmallVec;
+
+use crate::types::InstanceId;
+
+#[derive(Debug, Clone, Copy)]
+struct Entry { // one instance's claim on a block
+    instance: InstanceId, // instance recorded as holding the block
+    expires_at: f64, // `now + ttl_seconds` at the last insert/refresh; readers accept it while >= now
+}
+
+#[derive(Debug, Default)]
+pub struct MetaStore { // TTL-stamped index from block hash to holding instances
+    ttl_seconds: f64, // lifetime added to `now` on every insert/refresh
+    map: AHashMap<u64, SmallVec<[Entry; 4]>>, // block hash -> entries; `insert` keeps one per instance
+}
+
+impl MetaStore {
+    /// Creates an empty store whose entries stay valid for `ttl_seconds`
+    /// after each insert or refresh. The map is pre-sized for 2^16 hashes.
+    pub fn new(ttl_seconds: f64) -> Self {
+        Self {
+            ttl_seconds,
+            map: AHashMap::with_capacity(1 << 16),
+        }
+    }
+
+    /// Entry lifetime in seconds, exactly as given to [`MetaStore::new`].
+    pub fn ttl(&self) -> f64 {
+        self.ttl_seconds
+    }
+
+    /// Whether `e` is still inside its TTL at time `now`. The bound is
+    /// inclusive: an entry is live at the very instant it expires.
+    fn is_live(e: &Entry, now: f64) -> bool {
+        e.expires_at >= now
+    }
+
+    /// Record that `instance` now holds `block_hash`.
+    ///
+    /// If the pair is already present its expiry is moved to
+    /// `now + ttl_seconds`; otherwise a new entry is appended. Entries of
+    /// other instances, expired or not, are left untouched.
+    pub fn insert(&mut self, block_hash: u64, instance: InstanceId, now: f64) {
+        let expires_at = now + self.ttl_seconds;
+        let bucket = self.map.entry(block_hash).or_default();
+        // One entry per (block, instance): refresh in place when it exists.
+        match bucket.iter_mut().find(|e| e.instance == instance) {
+            Some(existing) => existing.expires_at = expires_at,
+            None => bucket.push(Entry { instance, expires_at }),
+        }
+    }
+
+    /// Score each candidate instance by the longest leading prefix of
+    /// `hash_ids` for which the meta store believes that instance still holds
+    /// every block. Returns scores indexed by instance id.
+    ///
+    /// Only entries live at `now` count, and entries whose instance id is
+    /// `>= num_instances` are ignored. The walk stops at the first block that
+    /// no still-eligible instance holds; an instance that drops out earlier
+    /// keeps the depth it reached. An empty `hash_ids` yields all zeros.
+    ///
+    /// For example, if instance 0 holds blocks `10, 11, 12` and instance 1
+    /// holds `10, 11`, scoring `[10, 11, 12, 13]` gives `3` for instance 0,
+    /// `2` for instance 1 and `0` for every other instance.
+    pub fn score_prefix(&self, hash_ids: &[u64], now: f64, num_instances: usize) -> Vec<u32> {
+        if hash_ids.is_empty() {
+            return vec![0; num_instances];
+        }
+        let mut scores = vec![0u32; num_instances];
+        // `alive[i]` stays true while instance `i` has held every block so
+        // far. Starting all-true lets the first block be an ordinary step
+        // instead of a separately coded seeding pass.
+        let mut alive = vec![true; num_instances];
+        // Holders of the current block. Allocated once and reset per block;
+        // a fresh vector per depth would cost one allocation per hash.
+        let mut present = vec![false; num_instances];
+
+        for (depth, h) in hash_ids.iter().enumerate() {
+            // A hash the store has never seen ends the prefix for everyone.
+            let bucket = match self.map.get(h) {
+                Some(b) => b,
+                None => break,
+            };
+            present.fill(false);
+            let mut any_present = false;
+            for e in bucket {
+                let i = e.instance as usize;
+                if Self::is_live(e, now) && i < num_instances && alive[i] {
+                    present[i] = true;
+                    any_present = true;
+                }
+            }
+            if !any_present {
+                break;
+            }
+            // Survivors are credited with this depth; everyone else drops
+            // out for good and keeps the score earned so far.
+            let reached = (depth + 1) as u32;
+            for ((score, live), &here) in scores.iter_mut().zip(&mut alive).zip(&present) {
+                if here {
+                    *score = reached;
+                } else {
+                    *live = false;
+                }
+            }
+        }
+        scores
+    }
+
+    /// Lookup which (alive) instances claim to hold a given block.
+    ///
+    /// An entry is alive while its `expires_at` is at or after `now`.
+    /// Instance ids come back in the order their entries were first
+    /// inserted for `hash`; the result is empty for an unknown hash.
+    pub fn instances_for(&self, hash: u64, now: f64) -> SmallVec<[InstanceId; 4]> {
+        self.map
+            .get(&hash)
+            .into_iter()
+            .flatten()
+            .filter(|e| Self::is_live(e, now))
+            .map(|e| e.instance)
+            .collect()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Instance 0 holds blocks 10, 11 and 12; instance 1 holds only 10 and
+    /// 11; instances 2 and 3 hold nothing.
+    #[test]
+    fn score_prefix_basic() {
+        let mut store = MetaStore::new(60.0);
+        for (hash, instance) in [(10, 0), (11, 0), (12, 0), (10, 1), (11, 1)] {
+            store.insert(hash, instance, 0.0);
+        }
+        // Block 13 was never inserted, so the walk ends after block 12.
+        let scores = store.score_prefix(&[10, 11, 12, 13], 1.0, 4);
+        assert_eq!(scores[0], 3);
+        assert_eq!(scores[1], 2);
+        assert_eq!(scores[2], 0);
+    }
+
+    /// An entry inserted at t = 0 with a 1 s TTL is visible at t = 0.5 and
+    /// no longer counted at t = 5.
+    #[test]
+    fn ttl_expiry() {
+        let mut store = MetaStore::new(1.0);
+        store.insert(10, 0, 0.0);
+        assert_eq!(store.score_prefix(&[10], 0.5, 2)[0], 1);
+        assert_eq!(store.score_prefix(&[10], 5.0, 2)[0], 0);
+    }
+}
--- a/src/cluster/mod.rs
+++ b/src/cluster/mod.rs
@@ -0,0 +1,6 @@
+pub mod meta_store;
+#[allow(clippy::module_inception)]
+pub mod cluster;
+
+pub use cluster::Cluster;
+pub use meta_store::MetaStore;