KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV cache-aware routing policies
in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache
hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention,
architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide
meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded,
least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score,
estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing,
built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation
tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 01:16:02 +08:00
commit ec73a95e05
52 changed files with 6005 additions and 0 deletions

279
src/oracle.rs Normal file
View File

@@ -0,0 +1,279 @@
//! Offline oracle analyzers for upper-bound KV-cache hit rates.
//!
//! Two analyses, both treating the cluster as a single aggregated cache so
//! the result is independent of routing — i.e. they answer the question
//! "what is the best the cluster could possibly do?":
//!
//! 1. **Unlimited capacity**: longest-prefix-match against an unbounded
//! cache. The only misses are blocks that the prefix walk encounters for
//! the first time. Sets the absolute ceiling.
//!
//! 2. **Belady (offline optimal eviction) at finite capacity**: classic
//! OPT replacement — evict the cached block whose *next* access is
//! furthest in the future. Run alongside an LRU baseline at the same
//! capacity so the gap tells you how much room LRU is leaving.
//!
//! Hit accounting uses prefix-match semantics matching the rest of the
//! simulator: a block at position k in a request counts as a hit only if
//! all positions 0..k are also in the cache.
use ahash::{AHashMap, AHashSet};
use serde::Serialize;
use std::collections::BinaryHeap;
use crate::instance::kv_cache::LruBlocks;
use crate::trace::RequestRecord;
/// Aggregated result of [`analyze`]: whole-trace counters plus one
/// [`TierResult`] per oracle analysis.
#[derive(Debug, Clone, Serialize)]
pub struct OracleResult {
/// Number of requests in the analyzed trace.
pub num_requests: u64,
/// Total block references across all requests (hits + misses of each tier sum to this).
pub total_blocks: u64,
/// Number of distinct block hashes seen anywhere in the trace.
pub unique_blocks: u64,
/// Ceiling: unbounded cache, only first-occurrence misses.
pub unlimited: TierResult,
/// Offline-optimal (Belady/OPT) eviction at the given finite capacity.
pub belady_finite: TierResult,
/// LRU baseline at the same finite capacity.
pub lru_finite: TierResult,
}
/// Hit/miss tally for a single oracle analysis ("tier").
#[derive(Debug, Clone, Serialize, Default)]
pub struct TierResult {
/// Analysis name as passed to `from_counts`: "unlimited", "belady", or "lru".
pub label: String,
/// Cache capacity in blocks (`u64::MAX` for the unlimited analysis).
pub capacity_blocks: u64,
/// Block references counted as hits under prefix-match semantics.
pub hits: u64,
/// Remaining block references: total minus hits.
pub misses: u64,
/// hits / total block references; defined as 0.0 when the trace has no blocks.
pub hit_rate: f64,
}
impl TierResult {
fn from_counts(label: &str, capacity_blocks: u64, hits: u64, total: u64) -> Self {
let misses = total.saturating_sub(hits);
TierResult {
label: label.to_string(),
capacity_blocks,
hits,
misses,
hit_rate: if total == 0 { 0.0 } else { hits as f64 / total as f64 },
}
}
}
pub fn analyze(records: &[RequestRecord], capacity_blocks: u64) -> OracleResult {
// total / unique counters
let total_blocks: u64 = records.iter().map(|r| r.hash_ids.len() as u64).sum();
let mut unique = AHashSet::new();
for r in records {
for &h in &r.hash_ids {
unique.insert(h);
}
}
// 1. Unlimited cache
let unlimited_hits = run_unlimited(records);
let unlimited = TierResult::from_counts(
"unlimited",
u64::MAX,
unlimited_hits,
total_blocks,
);
// 2. Precompute next-use index for Belady
let next_use = build_next_use(records);
// 3. Belady at the given capacity
let belady_hits = run_belady(records, &next_use, capacity_blocks as usize);
let belady = TierResult::from_counts("belady", capacity_blocks, belady_hits, total_blocks);
// 4. LRU baseline at the same capacity
let lru_hits = run_lru(records, capacity_blocks as usize);
let lru = TierResult::from_counts("lru", capacity_blocks, lru_hits, total_blocks);
OracleResult {
num_requests: records.len() as u64,
total_blocks,
unique_blocks: unique.len() as u64,
unlimited,
belady_finite: belady,
lru_finite: lru,
}
}
/// Longest-prefix hit count against an unbounded cache: every block ever
/// seen stays cached forever, so only first occurrences can miss.
fn run_unlimited(records: &[RequestRecord]) -> u64 {
    let mut seen: AHashSet<u64> = AHashSet::with_capacity(1 << 18);
    let mut hits: u64 = 0;
    for rec in records {
        // Prefix-match semantics: count hashes from the front of the
        // request until the first one not yet seen.
        hits += rec
            .hash_ids
            .iter()
            .copied()
            .take_while(|h| seen.contains(h))
            .count() as u64;
        // Then every block of the request becomes resident.
        seen.extend(rec.hash_ids.iter().copied());
    }
    hits
}
/// Replay the trace through one aggregated LRU cache of `capacity` blocks
/// and return the prefix-match hit count.
fn run_lru(records: &[RequestRecord], capacity: usize) -> u64 {
    if capacity == 0 {
        return 0;
    }
    let mut cache = LruBlocks::new(capacity);
    // Scratch buffer required by `insert_blocks`; evictions themselves are
    // not inspected here, the buffer is just reused across requests.
    let mut evicted = Vec::new();
    let mut hits: u64 = 0;
    for rec in records {
        hits += cache.longest_prefix(&rec.hash_ids) as u64;
        evicted.clear();
        cache.insert_blocks(&rec.hash_ids, &mut evicted);
    }
    hits
}
/// For each (request_idx, position_in_hash_ids) compute the next request
/// index whose `hash_ids` contains the same block (`u32::MAX` if none).
///
/// Built with a single reverse sweep: when visiting request `i`, the map
/// holds, for every hash, the earliest request index strictly after `i`
/// that references it.
fn build_next_use(records: &[RequestRecord]) -> Vec<Vec<u32>> {
    let mut next_use: Vec<Vec<u32>> = vec![Vec::new(); records.len()];
    let mut first_later_use: AHashMap<u64, u32> = AHashMap::with_capacity(1 << 18);
    for (i, rec) in records.iter().enumerate().rev() {
        // Read the map before updating it, so duplicates within a request
        // still point to the next *request*, not to request `i` itself.
        next_use[i] = rec
            .hash_ids
            .iter()
            .map(|h| first_later_use.get(h).copied().unwrap_or(u32::MAX))
            .collect();
        for &h in &rec.hash_ids {
            first_later_use.insert(h, i as u32);
        }
    }
    next_use
}
/// Belady (offline OPT) eviction over the trace.
///
/// Implementation: lazy-deletion max-heap keyed by next-use index. Each
/// cache entry has a version; the heap may contain stale entries from
/// previous insertions, which we skip on pop.
///
/// NOTE(review): when the cache is full, the incoming block is always
/// inserted after evicting the furthest-next-use resident — even when the
/// incoming block's own next use is further away (or `u32::MAX`, i.e.
/// never reused). Classic OPT would bypass insertion in that case, so this
/// figure can slightly undercount the true offline optimum at finite
/// capacity; confirm whether mandatory insertion is intended (e.g. to
/// mirror the simulator's cache semantics).
fn run_belady(records: &[RequestRecord], next_use: &[Vec<u32>], capacity: usize) -> u64 {
// A zero-capacity cache can never hold anything, so nothing can hit.
if capacity == 0 {
return 0;
}
// block_hash -> (current_version, current_next_use)
let mut in_cache: AHashMap<u64, (u64, u32)> = AHashMap::with_capacity(capacity);
// (next_use, version, block_hash) — BinaryHeap is max-heap, which is what
// we want for "evict the entry whose next access is furthest". Tuple
// ordering compares next_use first.
let mut heap: BinaryHeap<(u32, u64, u64)> = BinaryHeap::with_capacity(capacity);
let mut version: u64 = 0;
let mut hits: u64 = 0;
for (i, r) in records.iter().enumerate() {
// 1. Longest-prefix hit accounting against current cache: stop at the
//    first missing block, matching the simulator's prefix semantics.
for &h in &r.hash_ids {
if in_cache.contains_key(&h) {
hits += 1;
} else {
break;
}
}
// 2. Insert / update each block in the request with its new next-use.
for (j, &h) in r.hash_ids.iter().enumerate() {
let nu = next_use[i][j];
if let Some(slot) = in_cache.get_mut(&h) {
// Already resident: bump the version so the old heap entry goes
// stale, then push a fresh entry carrying the new next-use.
version += 1;
slot.0 = version;
slot.1 = nu;
heap.push((nu, version, h));
continue;
}
// Need to make room?
if in_cache.len() == capacity {
// Evict max next_use entry, skipping stale heap entries.
loop {
let (nu_top, ver_top, h_top) = match heap.pop() {
Some(x) => x,
None => break,
};
// A popped entry is live only if both its version and next-use
// still match the map's current slot for that hash.
if let Some(&(cur_ver, cur_nu)) = in_cache.get(&h_top) {
if cur_ver == ver_top && cur_nu == nu_top {
in_cache.remove(&h_top);
break;
}
}
// stale; loop
}
}
version += 1;
in_cache.insert(h, (version, nu));
heap.push((nu, version, h));
}
}
hits
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a synthetic trace record; input length is derived from the
    /// block count at 16 tokens per block.
    fn record(id: u64, arrival: f64, hash_ids: Vec<u64>) -> RequestRecord {
        RequestRecord {
            req_id: id,
            chat_id: id as i64,
            arrival,
            input_len: hash_ids.len() as u32 * 16,
            output_len: 16,
            hash_ids,
        }
    }

    #[test]
    fn unlimited_first_occurrence_misses() {
        let trace = [
            record(0, 0.0, vec![1, 2, 3]),
            record(1, 1.0, vec![1, 2, 3, 4]),
            record(2, 2.0, vec![1, 2, 3, 4, 5]),
        ];
        let out = analyze(&trace, 100);
        // total = 3 + 4 + 5
        assert_eq!(out.total_blocks, 12);
        // unique = {1, 2, 3, 4, 5}
        assert_eq!(out.unique_blocks, 5);
        // req0 all miss; req1 hits prefix [1,2,3] then misses 4; req2 hits [1,2,3,4].
        assert_eq!(out.unlimited.hits, 7);
        assert!((out.unlimited.hit_rate - 7.0 / 12.0).abs() < 1e-9);
    }

    #[test]
    fn belady_beats_lru_when_lru_thrashes() {
        // Capacity 2 with the access pattern A B A C A B A C, chosen so LRU
        // thrashes while Belady can keep the useful block resident.
        let pattern = [1u64, 2, 1, 3, 1, 2, 1, 3];
        let trace: Vec<RequestRecord> = pattern
            .iter()
            .enumerate()
            .map(|(i, &h)| record(i as u64, i as f64, vec![h]))
            .collect();
        let out = analyze(&trace, 2);
        assert!(
            out.belady_finite.hits >= out.lru_finite.hits,
            "belady should be at least as good as lru: belady={} lru={}",
            out.belady_finite.hits,
            out.lru_finite.hits
        );
    }

    #[test]
    fn unlimited_is_upper_bound() {
        let trace = [
            record(0, 0.0, vec![10, 20, 30]),
            record(1, 1.0, vec![10, 20, 30, 40, 50]),
            record(2, 2.0, vec![60]),
            record(3, 3.0, vec![10, 20, 30, 40, 50, 60]),
        ];
        let out = analyze(&trace, 3);
        assert!(out.unlimited.hit_rate >= out.belady_finite.hit_rate);
        assert!(out.belady_finite.hit_rate >= out.lru_finite.hit_rate - 1e-9);
    }
}