KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
59
src/router/ttl_aware.rs
Normal file
59
src/router/ttl_aware.rs
Normal file
@@ -0,0 +1,59 @@
use crate::cluster::meta_store::MetaStore;
use crate::instance::Instance;
use crate::router::{CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
/// Routing policy that sends a request to the instance with the highest
/// predicted prefix score from the cluster-wide meta-store, breaking ties
/// toward the least-loaded instance (load = kv_blocks_used + alpha * queue_len).
///
/// NOTE(review): the "TTL" aspect presumably lives inside
/// `MetaStore::score_prefix`, which receives the current time — confirm there.
pub struct TtlAwareRouter {
    // Weight applied to an instance's queue length when combining it with
    // KV-block usage into a single load score for tie-breaking.
    pub alpha: f64,
}
|
||||
|
||||
impl TtlAwareRouter {
|
||||
pub fn new(alpha: f64) -> Self {
|
||||
Self { alpha }
|
||||
}
|
||||
}
|
||||
|
||||
impl Router for TtlAwareRouter {
|
||||
fn name(&self) -> &'static str {
|
||||
"ttl_aware"
|
||||
}
|
||||
|
||||
fn route(
|
||||
&mut self,
|
||||
req: &RequestRecord,
|
||||
instances: &[Instance],
|
||||
meta: &MetaStore,
|
||||
now: f64,
|
||||
) -> RouteDecision {
|
||||
let n = instances.len();
|
||||
let scores = meta.score_prefix(&req.hash_ids, now, n);
|
||||
let mut best = 0u32;
|
||||
let mut best_key = (i64::MIN, f64::INFINITY); // maximize prefix, then minimize load
|
||||
let mut candidates = Vec::with_capacity(n);
|
||||
for inst in instances {
|
||||
let p = scores[inst.id as usize];
|
||||
let load = inst.kv_blocks_used as f64
|
||||
+ self.alpha * inst.queue_len() as f64;
|
||||
candidates.push(CandidateInfo {
|
||||
instance: inst.id,
|
||||
predicted_prefix: p,
|
||||
load_blocks: inst.kv_blocks_used,
|
||||
queue_len: inst.queue_len(),
|
||||
});
|
||||
let key = (p as i64, -load);
|
||||
// we want max prefix, min load -> compare (p, -load) lexicographically max
|
||||
if key > (best_key.0, -best_key.1) {
|
||||
best_key = (p as i64, load);
|
||||
best = inst.id;
|
||||
}
|
||||
}
|
||||
RouteDecision {
|
||||
req_id: req.req_id,
|
||||
mode: "ttl_aware",
|
||||
chosen: best,
|
||||
probe_overhead_s: 0.0,
|
||||
candidates,
|
||||
reason: "max meta_store prefix, tie -> least loaded",
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user