Files
kvcache-simulator/src/router/precise_aware.rs
Gahow Wang ec73a95e05 KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies
in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache
hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention,
architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide
meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded,
least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score,
estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing,
built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation
tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 01:16:02 +08:00

121 lines
4.4 KiB
Rust

//! KV-aware routing via meta-store candidate selection + precise probing.
//!
//! The global meta store is used as a *candidate pre-filter*: we score
//! every instance's predicted prefix from the store, take the top-K by
//! (predicted_prefix DESC, load ASC), and then exact-probe those K
//! candidates' actual L0+L1 caches to get the true longest prefix. This
//! catches two cases where the meta store is wrong:
//!
//! - the store is stale (block evicted from L0/L1 but TTL not yet up),
//! - the store undercounts because some blocks' TTL expired individually.
//!
//! Because the candidate set is sourced from the meta store rather than
//! from a load ranking, this router is a strict superset of `ttl_aware`:
//! any instance the meta store would pick is a candidate here, and the
//! exact probe can only move the decision toward a truthfully-better
//! instance. Each probe adds `probe_latency_s` to the request's
//! effective arrival time.
//!
//! If the meta store returns zero-prefix for every instance (e.g. cold
//! start, or a request whose blocks have never been seen), we fall back
//! to the top-K least-loaded instances so we still place the request.
use crate::cluster::meta_store::MetaStore;
use crate::instance::Instance;
use crate::router::{CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
/// Router that pre-filters candidates via the global meta store and then
/// exact-probes each candidate's real L0+L1 caches (see module docs).
pub struct PreciseRouter {
    /// Number of meta-store candidates to exact-probe per request
    /// (clamped to `[1, instances.len()]` at routing time).
    pub topk: u32,
    /// Latency charged per probe; a request pays `k * probe_latency_s`
    /// of probe overhead on its effective arrival time.
    pub probe_latency_s: f64,
    /// Weight of queue length relative to KV blocks used in the
    /// composite load score (`kv_blocks_used + alpha * queue_len`).
    pub alpha: f64,
}
impl PreciseRouter {
pub fn new(topk: u32, probe_latency_s: f64, alpha: f64) -> Self {
Self { topk, probe_latency_s, alpha }
}
fn load_of(&self, inst: &Instance) -> f64 {
inst.kv_blocks_used as f64 + self.alpha * inst.queue_len() as f64
}
}
impl Router for PreciseRouter {
    fn name(&self) -> &'static str {
        "precise"
    }

    /// Route `req` by exact-probing the top-K meta-store candidates.
    ///
    /// Ranking: predicted prefix from the meta store DESC, then load ASC.
    /// If the store predicts zero prefix everywhere (cold start), fall back
    /// to a pure least-loaded ranking. The final pick maximizes the
    /// exactly-probed L0+L1 prefix, breaking ties toward the lighter load.
    ///
    /// NOTE(review): panics if `instances` is empty (`ranked[..k]` with
    /// `k >= 1`) — same as the original; callers presumably guarantee a
    /// non-empty cluster. Verify at the call site.
    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        meta: &MetaStore,
        now: f64,
    ) -> RouteDecision {
        let n = instances.len();
        // Clamp K into [1, n] so we always probe at least one instance.
        let k = (self.topk as usize).min(n).max(1);

        // 1. Meta-store candidate set: rank all instances by
        //    (predicted_prefix DESC, load ASC) and take the top-K.
        let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
        let any_meta_hit = meta_scores.iter().any(|&p| p > 0);
        let mut ranked: Vec<usize> = (0..n).collect();
        if any_meta_hit {
            ranked.sort_by(|&a, &b| {
                // prefix desc, then load asc
                meta_scores[b]
                    .cmp(&meta_scores[a])
                    .then_with(|| {
                        self.load_of(&instances[a])
                            .partial_cmp(&self.load_of(&instances[b]))
                            .unwrap_or(std::cmp::Ordering::Equal)
                    })
            });
        } else {
            // Cold start fallback: pure load order.
            ranked.sort_by(|&a, &b| {
                self.load_of(&instances[a])
                    .partial_cmp(&self.load_of(&instances[b]))
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
        }
        let probed = &ranked[..k];

        // 2. Exact-probe each candidate and pick
        //    argmax(exact_prefix, tiebreak: -load).
        let mut candidates = Vec::with_capacity(k);
        // Seed `best` with the top-ranked candidate's *instance id*. The
        // previous code seeded with the raw index (`probed[0] as u32`),
        // which is wrong whenever ids differ from slice positions; it was
        // only masked because the sentinel key below always loses to the
        // first real candidate.
        let mut best = instances[probed[0]].id;
        // Canonical key form: (exact_prefix, -load). Lexicographic `>` then
        // reads "longer prefix, or equal prefix with lighter load". The old
        // code stored `load` un-negated and re-negated it on every compare,
        // an error-prone double negation; keep one representation instead.
        let mut best_key: (i64, f64) = (i64::MIN, f64::NEG_INFINITY);
        for &i in probed {
            let inst = &instances[i];
            // Probe L0 first, then L1 only for the suffix L0 missed.
            let l0 = inst.cache.l0.longest_prefix_peek(&req.hash_ids);
            let l1 = inst.cache.l1.longest_prefix_peek(&req.hash_ids[l0..]);
            let predicted = (l0 + l1) as u32;
            let load = self.load_of(inst);
            candidates.push(CandidateInfo {
                instance: inst.id,
                predicted_prefix: predicted,
                load_blocks: inst.kv_blocks_used,
                queue_len: inst.queue_len(),
            });
            let key = (predicted as i64, -load);
            if key > best_key {
                best_key = key;
                best = inst.id;
            }
        }

        RouteDecision {
            req_id: req.req_id,
            mode: "precise",
            chosen: best,
            // Every probe costs `probe_latency_s` of added arrival latency.
            probe_overhead_s: k as f64 * self.probe_latency_s,
            candidates,
            reason: "exact-probe top-K meta-store candidates",
        }
    }
}