KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies in
prefill-disaggregated LLM serving clusters. Models a two-tier KV cache
hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention,
architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide
meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded, least_tokens,
ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft,
prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware
presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy
comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
src/router/cache_load.rs (new file, 89 lines)
@@ -0,0 +1,89 @@
//! Load-filtered cache-aware routing.
//!
//! **Step 1** — filter: sort all instances by `queue_len` ascending and take the
//! least-loaded quarter (≥ 2 instances).
//!
//! **Step 2** — select: among that pool, pick the instance with the highest
//! meta-store prefix score. Tiebreak on lowest `queue_len`.
//!
//! This cleanly separates concerns: step 1 guarantees the request won't land
//! on a saturated instance, while step 2 maximises cache reuse within the
//! load-safe pool. The 1/4 fraction keeps the pool large enough that good
//! cache candidates are rarely excluded.
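//! For example, `n = 16` instances yield a pool of 4, while `n = 4` falls
//! back to the minimum pool of 2.
//!
//! # Example
//!
//! A sketch of a call site; `req`, `instances`, `meta`, and `now` are assumed
//! to come from the surrounding simulator loop, so this is not a runnable
//! doctest:
//!
//! ```ignore
//! use crate::router::cache_load::CacheLoadRouter;
//! use crate::router::Router;
//!
//! let mut router = CacheLoadRouter::new();
//! let decision = router.route(&req, &instances, &meta, now);
//! assert_eq!(decision.mode, "cache_load");
//! ```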

use crate::cluster::meta_store::MetaStore;
use crate::instance::Instance;
use crate::router::{CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;

pub struct CacheLoadRouter;

impl CacheLoadRouter {
    pub fn new() -> Self {
        Self
    }
}

impl Default for CacheLoadRouter {
    fn default() -> Self {
        Self::new()
    }
}

impl Router for CacheLoadRouter {
    fn name(&self) -> &'static str {
        "cache_load"
    }

    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        meta: &MetaStore,
        now: f64,
    ) -> RouteDecision {
        let n = instances.len();
        let scores = meta.score_prefix(&req.hash_ids, now, n);
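        // `scores[i]` is the meta-store's prefix-match score for instance `i`,
        // one entry per instance; higher means more predicted KV-cache reuse.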

        // Step 1: least-loaded 1/4 of instances (by queue_len).
        let pool_size = (n / 4).max(2).min(n);
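        // `.max(2)` keeps at least two candidates on small clusters; `.min(n)`
        // clamps back down when n < 2 (e.g. n = 1 gives a pool of 1). An empty
        // instance list would panic at `pool[0]` below, so callers are assumed
        // to pass at least one instance.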
        let mut indices: Vec<usize> = (0..n).collect();
        indices.sort_by_key(|&i| instances[i].queue_len());
        let pool = &indices[..pool_size];

        // Step 2: among the pool, pick highest prefix score.
        // Tiebreak: lowest queue_len.
        let mut best_idx = pool[0];
        let mut best_prefix = scores[pool[0]];
        let mut best_queue = instances[pool[0]].queue_len();

        for &i in &pool[1..] {
            let p = scores[i];
            let q = instances[i].queue_len();
            if p > best_prefix || (p == best_prefix && q < best_queue) {
                best_idx = i;
                best_prefix = p;
                best_queue = q;
            }
        }
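        // Note: `pool` is already sorted by queue_len ascending, so on equal
        // prefix scores the `q < best_queue` branch never fires and the
        // earliest (least-loaded) instance wins anyway; the explicit check
        // documents intent and stays correct if the sort order ever changes.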

        let mut candidates = Vec::with_capacity(pool_size);
        for &i in pool {
            candidates.push(CandidateInfo {
                instance: instances[i].id,
                predicted_prefix: scores[i],
                load_blocks: instances[i].kv_blocks_used,
                queue_len: instances[i].queue_len(),
            });
        }
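        // Every pool member is recorded, presumably so the ablation tooling
        // can compare the chosen instance against the alternatives it beat.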

        RouteDecision {
            req_id: req.req_id,
            mode: "cache_load",
            chosen: instances[best_idx].id,
            probe_overhead_s: 0.0,
            candidates,
            reason: "least-loaded 1/4, then best prefix",
        }
    }
}