KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV cache-aware routing policies
in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache
hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention,
architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide
meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded,
least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score,
estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing,
built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation
tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 01:16:02 +08:00
commit ec73a95e05
52 changed files with 6005 additions and 0 deletions

279
src/oracle.rs Normal file
View File

@@ -0,0 +1,279 @@
//! Offline oracle analyzers for upper-bound KV-cache hit rates.
//!
//! Two analyses, both treating the cluster as a single aggregated cache so
//! the result is independent of routing — i.e. they answer the question
//! "what is the best the cluster could possibly do?":
//!
//! 1. **Unlimited capacity**: longest-prefix-match against an unbounded
//! cache. The only misses are blocks that the prefix walk encounters for
//! the first time. Sets the absolute ceiling.
//!
//! 2. **Belady (offline optimal eviction) at finite capacity**: classic
//! OPT replacement — evict the cached block whose *next* access is
//! furthest in the future. Run alongside an LRU baseline at the same
//! capacity so the gap tells you how much room LRU is leaving.
//!
//! Hit accounting uses prefix-match semantics matching the rest of the
//! simulator: a block at position k in a request counts as a hit only if
//! all positions 0..k are also in the cache.
use ahash::{AHashMap, AHashSet};
use serde::Serialize;
use std::collections::BinaryHeap;
use crate::instance::kv_cache::LruBlocks;
use crate::trace::RequestRecord;
/// Aggregated result of [`analyze`]: whole-trace counters plus one
/// [`TierResult`] per oracle analysis.
#[derive(Debug, Clone, Serialize)]
pub struct OracleResult {
/// Number of requests in the analyzed trace.
pub num_requests: u64,
/// Total block references across all requests (hits + misses of each tier sum to this).
pub total_blocks: u64,
/// Number of distinct block hashes seen anywhere in the trace.
pub unique_blocks: u64,
/// Ceiling: unbounded cache, only first-occurrence misses.
pub unlimited: TierResult,
/// Offline-optimal (Belady/OPT) eviction at the given finite capacity.
pub belady_finite: TierResult,
/// LRU baseline at the same finite capacity.
pub lru_finite: TierResult,
}
/// Hit/miss tally for a single oracle analysis ("tier").
#[derive(Debug, Clone, Serialize, Default)]
pub struct TierResult {
/// Analysis name as passed to `from_counts`: "unlimited", "belady", or "lru".
pub label: String,
/// Cache capacity in blocks (`u64::MAX` for the unlimited analysis).
pub capacity_blocks: u64,
/// Block references counted as hits under prefix-match semantics.
pub hits: u64,
/// Remaining block references: total minus hits.
pub misses: u64,
/// hits / total block references; defined as 0.0 when the trace has no blocks.
pub hit_rate: f64,
}
impl TierResult {
fn from_counts(label: &str, capacity_blocks: u64, hits: u64, total: u64) -> Self {
let misses = total.saturating_sub(hits);
TierResult {
label: label.to_string(),
capacity_blocks,
hits,
misses,
hit_rate: if total == 0 { 0.0 } else { hits as f64 / total as f64 },
}
}
}
pub fn analyze(records: &[RequestRecord], capacity_blocks: u64) -> OracleResult {
// total / unique counters
let total_blocks: u64 = records.iter().map(|r| r.hash_ids.len() as u64).sum();
let mut unique = AHashSet::new();
for r in records {
for &h in &r.hash_ids {
unique.insert(h);
}
}
// 1. Unlimited cache
let unlimited_hits = run_unlimited(records);
let unlimited = TierResult::from_counts(
"unlimited",
u64::MAX,
unlimited_hits,
total_blocks,
);
// 2. Precompute next-use index for Belady
let next_use = build_next_use(records);
// 3. Belady at the given capacity
let belady_hits = run_belady(records, &next_use, capacity_blocks as usize);
let belady = TierResult::from_counts("belady", capacity_blocks, belady_hits, total_blocks);
// 4. LRU baseline at the same capacity
let lru_hits = run_lru(records, capacity_blocks as usize);
let lru = TierResult::from_counts("lru", capacity_blocks, lru_hits, total_blocks);
OracleResult {
num_requests: records.len() as u64,
total_blocks,
unique_blocks: unique.len() as u64,
unlimited,
belady_finite: belady,
lru_finite: lru,
}
}
/// Longest-prefix hit count against an unbounded cache: every block ever
/// seen stays cached forever, so only first occurrences can miss.
fn run_unlimited(records: &[RequestRecord]) -> u64 {
    let mut seen: AHashSet<u64> = AHashSet::with_capacity(1 << 18);
    let mut hits: u64 = 0;
    for rec in records {
        // Prefix-match semantics: count hashes from the front of the
        // request until the first one not yet seen.
        hits += rec
            .hash_ids
            .iter()
            .copied()
            .take_while(|h| seen.contains(h))
            .count() as u64;
        // Then every block of the request becomes resident.
        seen.extend(rec.hash_ids.iter().copied());
    }
    hits
}
/// Replay the trace through one aggregated LRU cache of `capacity` blocks
/// and return the prefix-match hit count.
fn run_lru(records: &[RequestRecord], capacity: usize) -> u64 {
    if capacity == 0 {
        return 0;
    }
    let mut cache = LruBlocks::new(capacity);
    // Scratch buffer required by `insert_blocks`; evictions themselves are
    // not inspected here, the buffer is just reused across requests.
    let mut evicted = Vec::new();
    let mut hits: u64 = 0;
    for rec in records {
        hits += cache.longest_prefix(&rec.hash_ids) as u64;
        evicted.clear();
        cache.insert_blocks(&rec.hash_ids, &mut evicted);
    }
    hits
}
/// For each (request_idx, position_in_hash_ids) compute the next request
/// index whose `hash_ids` contains the same block (`u32::MAX` if none).
///
/// Built with a single reverse sweep: when visiting request `i`, the map
/// holds, for every hash, the earliest request index strictly after `i`
/// that references it.
fn build_next_use(records: &[RequestRecord]) -> Vec<Vec<u32>> {
    let mut next_use: Vec<Vec<u32>> = vec![Vec::new(); records.len()];
    let mut first_later_use: AHashMap<u64, u32> = AHashMap::with_capacity(1 << 18);
    for (i, rec) in records.iter().enumerate().rev() {
        // Read the map before updating it, so duplicates within a request
        // still point to the next *request*, not to request `i` itself.
        next_use[i] = rec
            .hash_ids
            .iter()
            .map(|h| first_later_use.get(h).copied().unwrap_or(u32::MAX))
            .collect();
        for &h in &rec.hash_ids {
            first_later_use.insert(h, i as u32);
        }
    }
    next_use
}
/// Belady (offline OPT) eviction over the trace.
///
/// Implementation: lazy-deletion max-heap keyed by next-use index. Each
/// cache entry has a version; the heap may contain stale entries from
/// previous insertions, which we skip on pop.
///
/// NOTE(review): when the cache is full, the incoming block is always
/// inserted after evicting the furthest-next-use resident — even when the
/// incoming block's own next use is further away (or `u32::MAX`, i.e.
/// never reused). Classic OPT would bypass insertion in that case, so this
/// figure can slightly undercount the true offline optimum at finite
/// capacity; confirm whether mandatory insertion is intended (e.g. to
/// mirror the simulator's cache semantics).
fn run_belady(records: &[RequestRecord], next_use: &[Vec<u32>], capacity: usize) -> u64 {
// A zero-capacity cache can never hold anything, so nothing can hit.
if capacity == 0 {
return 0;
}
// block_hash -> (current_version, current_next_use)
let mut in_cache: AHashMap<u64, (u64, u32)> = AHashMap::with_capacity(capacity);
// (next_use, version, block_hash) — BinaryHeap is max-heap, which is what
// we want for "evict the entry whose next access is furthest". Tuple
// ordering compares next_use first.
let mut heap: BinaryHeap<(u32, u64, u64)> = BinaryHeap::with_capacity(capacity);
let mut version: u64 = 0;
let mut hits: u64 = 0;
for (i, r) in records.iter().enumerate() {
// 1. Longest-prefix hit accounting against current cache: stop at the
//    first missing block, matching the simulator's prefix semantics.
for &h in &r.hash_ids {
if in_cache.contains_key(&h) {
hits += 1;
} else {
break;
}
}
// 2. Insert / update each block in the request with its new next-use.
for (j, &h) in r.hash_ids.iter().enumerate() {
let nu = next_use[i][j];
if let Some(slot) = in_cache.get_mut(&h) {
// Already resident: bump the version so the old heap entry goes
// stale, then push a fresh entry carrying the new next-use.
version += 1;
slot.0 = version;
slot.1 = nu;
heap.push((nu, version, h));
continue;
}
// Need to make room?
if in_cache.len() == capacity {
// Evict max next_use entry, skipping stale heap entries.
loop {
let (nu_top, ver_top, h_top) = match heap.pop() {
Some(x) => x,
None => break,
};
// A popped entry is live only if both its version and next-use
// still match the map's current slot for that hash.
if let Some(&(cur_ver, cur_nu)) = in_cache.get(&h_top) {
if cur_ver == ver_top && cur_nu == nu_top {
in_cache.remove(&h_top);
break;
}
}
// stale; loop
}
}
version += 1;
in_cache.insert(h, (version, nu));
heap.push((nu, version, h));
}
}
hits
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a synthetic trace record; input length is derived from the
    /// block count at 16 tokens per block.
    fn record(id: u64, arrival: f64, hash_ids: Vec<u64>) -> RequestRecord {
        RequestRecord {
            req_id: id,
            chat_id: id as i64,
            arrival,
            input_len: hash_ids.len() as u32 * 16,
            output_len: 16,
            hash_ids,
        }
    }

    #[test]
    fn unlimited_first_occurrence_misses() {
        let trace = [
            record(0, 0.0, vec![1, 2, 3]),
            record(1, 1.0, vec![1, 2, 3, 4]),
            record(2, 2.0, vec![1, 2, 3, 4, 5]),
        ];
        let out = analyze(&trace, 100);
        // total = 3 + 4 + 5
        assert_eq!(out.total_blocks, 12);
        // unique = {1, 2, 3, 4, 5}
        assert_eq!(out.unique_blocks, 5);
        // req0 all miss; req1 hits prefix [1,2,3] then misses 4; req2 hits [1,2,3,4].
        assert_eq!(out.unlimited.hits, 7);
        assert!((out.unlimited.hit_rate - 7.0 / 12.0).abs() < 1e-9);
    }

    #[test]
    fn belady_beats_lru_when_lru_thrashes() {
        // Capacity 2 with the access pattern A B A C A B A C, chosen so LRU
        // thrashes while Belady can keep the useful block resident.
        let pattern = [1u64, 2, 1, 3, 1, 2, 1, 3];
        let trace: Vec<RequestRecord> = pattern
            .iter()
            .enumerate()
            .map(|(i, &h)| record(i as u64, i as f64, vec![h]))
            .collect();
        let out = analyze(&trace, 2);
        assert!(
            out.belady_finite.hits >= out.lru_finite.hits,
            "belady should be at least as good as lru: belady={} lru={}",
            out.belady_finite.hits,
            out.lru_finite.hits
        );
    }

    #[test]
    fn unlimited_is_upper_bound() {
        let trace = [
            record(0, 0.0, vec![10, 20, 30]),
            record(1, 1.0, vec![10, 20, 30, 40, 50]),
            record(2, 2.0, vec![60]),
            record(3, 3.0, vec![10, 20, 30, 40, 50, 60]),
        ];
        let out = analyze(&trace, 3);
        assert!(out.unlimited.hit_rate >= out.belady_finite.hit_rate);
        assert!(out.belady_finite.hit_rate >= out.lru_finite.hit_rate - 1e-9);
    }
}