KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
279
src/oracle.rs
Normal file
279
src/oracle.rs
Normal file
@@ -0,0 +1,279 @@
|
||||
//! Offline oracle analyzers for upper-bound KV-cache hit rates.
//!
//! Two analyses, both treating the cluster as a single aggregated cache so
//! the result is independent of routing — i.e. they answer the question
//! "what is the best the cluster could possibly do?":
//!
//! 1. **Unlimited capacity**: longest-prefix-match against an unbounded
//!    cache. A request stops hitting at the first block the prefix walk
//!    encounters for the first time; that block and everything after it in
//!    the request count as misses. Sets the absolute ceiling.
//!
//! 2. **Belady (offline optimal eviction) at finite capacity**: classic
//!    OPT replacement — evict the cached block whose *next* access is
//!    furthest in the future. Run alongside an LRU baseline at the same
//!    capacity so the gap tells you how much room LRU is leaving.
//!
//! Hit accounting uses prefix-match semantics matching the rest of the
//! simulator: a block at position k in a request counts as a hit only if
//! all positions 0..k are also in the cache.
|
||||
|
||||
use ahash::{AHashMap, AHashSet};
|
||||
use serde::Serialize;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
use crate::instance::kv_cache::LruBlocks;
|
||||
use crate::trace::RequestRecord;
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct OracleResult {
|
||||
pub num_requests: u64,
|
||||
pub total_blocks: u64,
|
||||
pub unique_blocks: u64,
|
||||
pub unlimited: TierResult,
|
||||
pub belady_finite: TierResult,
|
||||
pub lru_finite: TierResult,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Default)]
|
||||
pub struct TierResult {
|
||||
pub label: String,
|
||||
pub capacity_blocks: u64,
|
||||
pub hits: u64,
|
||||
pub misses: u64,
|
||||
pub hit_rate: f64,
|
||||
}
|
||||
|
||||
impl TierResult {
|
||||
fn from_counts(label: &str, capacity_blocks: u64, hits: u64, total: u64) -> Self {
|
||||
let misses = total.saturating_sub(hits);
|
||||
TierResult {
|
||||
label: label.to_string(),
|
||||
capacity_blocks,
|
||||
hits,
|
||||
misses,
|
||||
hit_rate: if total == 0 { 0.0 } else { hits as f64 / total as f64 },
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn analyze(records: &[RequestRecord], capacity_blocks: u64) -> OracleResult {
|
||||
// total / unique counters
|
||||
let total_blocks: u64 = records.iter().map(|r| r.hash_ids.len() as u64).sum();
|
||||
let mut unique = AHashSet::new();
|
||||
for r in records {
|
||||
for &h in &r.hash_ids {
|
||||
unique.insert(h);
|
||||
}
|
||||
}
|
||||
|
||||
// 1. Unlimited cache
|
||||
let unlimited_hits = run_unlimited(records);
|
||||
let unlimited = TierResult::from_counts(
|
||||
"unlimited",
|
||||
u64::MAX,
|
||||
unlimited_hits,
|
||||
total_blocks,
|
||||
);
|
||||
|
||||
// 2. Precompute next-use index for Belady
|
||||
let next_use = build_next_use(records);
|
||||
|
||||
// 3. Belady at the given capacity
|
||||
let belady_hits = run_belady(records, &next_use, capacity_blocks as usize);
|
||||
let belady = TierResult::from_counts("belady", capacity_blocks, belady_hits, total_blocks);
|
||||
|
||||
// 4. LRU baseline at the same capacity
|
||||
let lru_hits = run_lru(records, capacity_blocks as usize);
|
||||
let lru = TierResult::from_counts("lru", capacity_blocks, lru_hits, total_blocks);
|
||||
|
||||
OracleResult {
|
||||
num_requests: records.len() as u64,
|
||||
total_blocks,
|
||||
unique_blocks: unique.len() as u64,
|
||||
unlimited,
|
||||
belady_finite: belady,
|
||||
lru_finite: lru,
|
||||
}
|
||||
}
|
||||
|
||||
fn run_unlimited(records: &[RequestRecord]) -> u64 {
|
||||
let mut seen: AHashSet<u64> = AHashSet::with_capacity(1 << 18);
|
||||
let mut hits: u64 = 0;
|
||||
for r in records {
|
||||
// Longest prefix match against `seen`
|
||||
for &h in &r.hash_ids {
|
||||
if seen.contains(&h) {
|
||||
hits += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
for &h in &r.hash_ids {
|
||||
seen.insert(h);
|
||||
}
|
||||
}
|
||||
hits
|
||||
}
|
||||
|
||||
fn run_lru(records: &[RequestRecord], capacity: usize) -> u64 {
|
||||
if capacity == 0 {
|
||||
return 0;
|
||||
}
|
||||
let mut cache = LruBlocks::new(capacity);
|
||||
let mut hits: u64 = 0;
|
||||
let mut evicted = Vec::new();
|
||||
for r in records {
|
||||
hits += cache.longest_prefix(&r.hash_ids) as u64;
|
||||
evicted.clear();
|
||||
cache.insert_blocks(&r.hash_ids, &mut evicted);
|
||||
}
|
||||
hits
|
||||
}
|
||||
|
||||
/// For each (request_idx, position_in_hash_ids) compute the next request
|
||||
/// index whose `hash_ids` contains the same block (`u32::MAX` if none).
|
||||
fn build_next_use(records: &[RequestRecord]) -> Vec<Vec<u32>> {
|
||||
let n = records.len();
|
||||
let mut next_use: Vec<Vec<u32>> = Vec::with_capacity(n);
|
||||
for r in records {
|
||||
next_use.push(vec![u32::MAX; r.hash_ids.len()]);
|
||||
}
|
||||
let mut last_seen: AHashMap<u64, u32> = AHashMap::with_capacity(1 << 18);
|
||||
for i in (0..n).rev() {
|
||||
let r = &records[i];
|
||||
for (j, &h) in r.hash_ids.iter().enumerate() {
|
||||
next_use[i][j] = *last_seen.get(&h).unwrap_or(&u32::MAX);
|
||||
}
|
||||
for &h in &r.hash_ids {
|
||||
last_seen.insert(h, i as u32);
|
||||
}
|
||||
}
|
||||
next_use
|
||||
}
|
||||
|
||||
/// Belady (offline OPT) eviction over the trace.
|
||||
///
|
||||
/// Implementation: lazy-deletion max-heap keyed by next-use index. Each
|
||||
/// cache entry has a version; the heap may contain stale entries from
|
||||
/// previous insertions, which we skip on pop.
|
||||
fn run_belady(records: &[RequestRecord], next_use: &[Vec<u32>], capacity: usize) -> u64 {
|
||||
if capacity == 0 {
|
||||
return 0;
|
||||
}
|
||||
// block_hash -> (current_version, current_next_use)
|
||||
let mut in_cache: AHashMap<u64, (u64, u32)> = AHashMap::with_capacity(capacity);
|
||||
// (next_use, version, block_hash) — BinaryHeap is max-heap, which is what
|
||||
// we want for "evict the entry whose next access is furthest".
|
||||
let mut heap: BinaryHeap<(u32, u64, u64)> = BinaryHeap::with_capacity(capacity);
|
||||
let mut version: u64 = 0;
|
||||
let mut hits: u64 = 0;
|
||||
|
||||
for (i, r) in records.iter().enumerate() {
|
||||
// 1. Longest-prefix hit accounting against current cache.
|
||||
for &h in &r.hash_ids {
|
||||
if in_cache.contains_key(&h) {
|
||||
hits += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Insert / update each block in the request with its new next-use.
|
||||
for (j, &h) in r.hash_ids.iter().enumerate() {
|
||||
let nu = next_use[i][j];
|
||||
if let Some(slot) = in_cache.get_mut(&h) {
|
||||
version += 1;
|
||||
slot.0 = version;
|
||||
slot.1 = nu;
|
||||
heap.push((nu, version, h));
|
||||
continue;
|
||||
}
|
||||
// Need to make room?
|
||||
if in_cache.len() == capacity {
|
||||
// Evict max next_use entry, skipping stale heap entries.
|
||||
loop {
|
||||
let (nu_top, ver_top, h_top) = match heap.pop() {
|
||||
Some(x) => x,
|
||||
None => break,
|
||||
};
|
||||
if let Some(&(cur_ver, cur_nu)) = in_cache.get(&h_top) {
|
||||
if cur_ver == ver_top && cur_nu == nu_top {
|
||||
in_cache.remove(&h_top);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// stale; loop
|
||||
}
|
||||
}
|
||||
version += 1;
|
||||
in_cache.insert(h, (version, nu));
|
||||
heap.push((nu, version, h));
|
||||
}
|
||||
}
|
||||
|
||||
hits
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a request record from an id, an arrival time and its block
    /// hashes; the remaining fields are filled with fixed or derived values.
    fn req(id: u64, t: f64, hashes: Vec<u64>) -> RequestRecord {
        let input_len = (hashes.len() as u32) * 16;
        RequestRecord {
            req_id: id,
            chat_id: id as i64,
            arrival: t,
            input_len,
            output_len: 16,
            hash_ids: hashes,
        }
    }

    #[test]
    fn unlimited_first_occurrence_misses() {
        let recs = vec![
            req(0, 0.0, vec![1, 2, 3]),
            req(1, 1.0, vec![1, 2, 3, 4]),
            req(2, 2.0, vec![1, 2, 3, 4, 5]),
        ];
        let out = analyze(&recs, 100);

        // Every block access counts: 3 + 4 + 5.
        assert_eq!(out.total_blocks, 12);
        // Distinct hashes are {1, 2, 3, 4, 5}.
        assert_eq!(out.unique_blocks, 5);
        // Request 0 misses everything, request 1 hits [1, 2, 3] before
        // missing 4, request 2 hits [1, 2, 3, 4] before missing 5: 0 + 3 + 4.
        assert_eq!(out.unlimited.hits, 7);
        assert!((out.unlimited.hit_rate - 7.0 / 12.0).abs() < 1e-9);
    }

    #[test]
    fn belady_beats_lru_when_lru_thrashes() {
        // Capacity 2, single-block requests following A B A C A B A C.
        // Only the weak inequality (Belady >= LRU) is asserted.
        let pattern = [1u64, 2, 1, 3, 1, 2, 1, 3];
        let recs: Vec<RequestRecord> = pattern
            .iter()
            .enumerate()
            .map(|(i, &h)| req(i as u64, i as f64, vec![h]))
            .collect();
        let out = analyze(&recs, 2);
        assert!(
            out.belady_finite.hits >= out.lru_finite.hits,
            "belady should be at least as good as lru: belady={} lru={}",
            out.belady_finite.hits,
            out.lru_finite.hits
        );
    }

    #[test]
    fn unlimited_is_upper_bound() {
        let recs = vec![
            req(0, 0.0, vec![10, 20, 30]),
            req(1, 1.0, vec![10, 20, 30, 40, 50]),
            req(2, 2.0, vec![60]),
            req(3, 3.0, vec![10, 20, 30, 40, 50, 60]),
        ];
        let out = analyze(&recs, 3);
        // Expected ordering on this trace: unlimited >= Belady >= LRU.
        assert!(out.unlimited.hit_rate >= out.belady_finite.hit_rate);
        assert!(out.belady_finite.hit_rate >= out.lru_finite.hit_rate - 1e-9);
    }
}
|
||||
Reference in New Issue
Block a user