KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 01:16:02 +08:00
commit ec73a95e05
52 changed files with 6005 additions and 0 deletions
--- a/src/router/prefix_affinity.rs
+++ b/src/router/prefix_affinity.rs
@@ -0,0 +1,196 @@
+//! Prefix-affinity routing with load-aware fallback.
+//!
+//! **Key insight**: in real LLM traces, 99%+ of requests share a common
+//! system-prompt prefix (dozens to hundreds of 16-token blocks).  If we
+//! *consistently* route requests with the same prefix to the same small set
+//! of instances, L0 (HBM) cache hit rates increase dramatically because the
+//! working set per instance is concentrated rather than scattered.
+//!
+//! Algorithm (rendezvous hashing + drain-time-aware selection):
+//!
+//! 1. **Fingerprint**: hash the first `K` blocks of the request to produce a
+//!    prefix fingerprint that captures the system prompt identity.
+//!
+//! 2. **Rendezvous ranking**: for each instance `i`, compute
+//!    `rendezvous(fingerprint, i)` — a deterministic pseudo-random score.
+//!    Sort instances by this score descending to get a stable, per-prefix
+//!    ordering.
+//!
+//! 3. **Select from top candidates**: among the top `fan_out` instances in
+//!    the rendezvous ranking, pick the one with the lowest estimated drain
+//!    time (architecture-aware, per-request sum).  This accounts for
+//!    heterogeneous request sizes in the queue.
+//!
+//! 4. **Overload fallback**: if all top candidates have queue length above a
+//!    threshold, expand to the full instance set and use estimated-TTFT
+//!    scoring (drain + fetch) for the best selection.
+//!
+//! The combination ensures:
+//! - **Cache locality**: same-prefix requests cluster on a few instances,
+//!   building strong L0 cache entries that benefit subsequent requests.
+//! - **Load balance**: within the affinity group, drain-time-aware selection
+//!   avoids hot-spotting from large-prompt requests.
+//! - **Zero overhead**: no per-instance probes needed; fingerprint +
+//!   rendezvous are pure arithmetic.
+
+use crate::cluster::meta_store::MetaStore;
+use crate::config::Config;
+use crate::instance::Instance;
+use crate::router::{CandidateInfo, RouteDecision, Router};
+use crate::trace::RequestRecord;
+
+pub struct PrefixAffinityRouter {
+    /// Number of leading block hashes used for the prefix fingerprint.
+    prefix_k: usize,
+    /// Number of top-affinity instances to consider before fallback.
+    fan_out: usize,
+    /// Queue-length threshold: if all top candidates exceed this, expand to
+    /// the full instance set.
+    overload_threshold: u32,
+    /// Bytes per KV block (for RDMA cost estimation in fallback path).
+    kv_block_bytes: f64,
+    /// RDMA bandwidth in bytes/s.
+    rdma_bw: f64,
+    /// RDMA per-transfer latency in seconds.
+    rdma_latency_s: f64,
+}
+
+impl PrefixAffinityRouter {
+    pub fn new(config: &Config) -> Self {
+        let n = config.cluster.num_instances as usize;
+        let cfg_fan = config.cluster.router.affinity_fan_out;
+        // fan_out: if configured, use it; otherwise auto = max(2, n/8).
+        let fan_out = if cfg_fan > 0 {
+            cfg_fan.min(n)
+        } else {
+            (n / 8).max(2).min(n)
+        };
+        Self {
+            prefix_k: config.cluster.router.prefix_k,
+            fan_out,
+            overload_threshold: 4,
+            kv_block_bytes: config.model.kv_block_bytes() as f64,
+            rdma_bw: config.hardware.rdma_bw,
+            rdma_latency_s: config.hardware.rdma_latency_us * 1e-6,
+        }
+    }
+
+    /// Compute a prefix fingerprint from the first K block hashes.
+    fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
+        let n = hash_ids.len().min(k);
+        let mut fp: u64 = 0xcbf29ce484222325; // FNV offset basis
+        for &h in &hash_ids[..n] {
+            fp ^= h;
+            fp = fp.wrapping_mul(0x100000001b3); // FNV prime
+        }
+        fp
+    }
+
+    /// Rendezvous hash: deterministic pseudo-random score for (fingerprint, instance_id).
+    /// Higher score = higher affinity.
+    fn rendezvous_score(fp: u64, instance_id: u32) -> u64 {
+        let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
+        // Splitmix64 finalizer
+        h = h.wrapping_add(0x9e3779b97f4a7c15);
+        h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
+        h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
+        h ^ (h >> 31)
+    }
+
+    /// Estimate RDMA fetch time for `remote_blocks` blocks.
+    fn fetch_time(&self, remote_blocks: u32) -> f64 {
+        if remote_blocks == 0 {
+            return 0.0;
+        }
+        let bytes = remote_blocks as f64 * self.kv_block_bytes;
+        bytes / self.rdma_bw + self.rdma_latency_s
+    }
+}
+
+impl Router for PrefixAffinityRouter {
+    fn name(&self) -> &'static str {
+        "prefix_affinity"
+    }
+
+    fn route(
+        &mut self,
+        req: &RequestRecord,
+        instances: &[Instance],
+        meta: &MetaStore,
+        now: f64,
+    ) -> RouteDecision {
+        let n = instances.len();
+        let fp = Self::fingerprint(&req.hash_ids, self.prefix_k);
+
+        // Build rendezvous-ranked list of (score, index).
+        let mut ranked: Vec<(u64, usize)> = (0..n)
+            .map(|i| (Self::rendezvous_score(fp, instances[i].id), i))
+            .collect();
+        ranked.sort_unstable_by(|a, b| b.0.cmp(&a.0)); // descending score
+
+        // Collect candidate info for logging (also needed for fallback).
+        let scores = meta.score_prefix(&req.hash_ids, now, n);
+        let candidates: Vec<CandidateInfo> = instances
+            .iter()
+            .map(|inst| CandidateInfo {
+                instance: inst.id,
+                predicted_prefix: scores[inst.id as usize],
+                load_blocks: inst.kv_blocks_used,
+                queue_len: inst.queue_len(),
+            })
+            .collect();
+
+        // Phase 1: among top fan_out instances, pick lowest drain time.
+        let top_k = self.fan_out.min(n);
+        let mut best_idx = ranked[0].1;
+        let mut best_drain = instances[best_idx].estimated_drain_time();
+        let mut best_ql = instances[best_idx].queue_len();
+        let mut all_overloaded = best_ql > self.overload_threshold;
+
+        for &(_, idx) in &ranked[1..top_k] {
+            let drain = instances[idx].estimated_drain_time();
+            let ql = instances[idx].queue_len();
+            if drain < best_drain || (drain == best_drain && ql < best_ql) {
+                best_idx = idx;
+                best_drain = drain;
+                best_ql = ql;
+            }
+            if ql <= self.overload_threshold {
+                all_overloaded = false;
+            }
+        }
+
+        // Phase 2: if all top candidates are overloaded, search globally
+        // using estimated-TTFT (drain + fetch) for optimal fallback.
+        let reason;
+        if all_overloaded {
+            reason = "affinity fallback: min(drain+fetch)";
+            let cluster_prefix = scores.iter().copied().max().unwrap_or(0);
+            let mut best_cost = f64::INFINITY;
+            for &(_, idx) in ranked.iter() {
+                let inst = &instances[idx];
+                let drain = inst.estimated_drain_time();
+                let local_prefix = scores[idx];
+                let remote_blocks = cluster_prefix.saturating_sub(local_prefix);
+                let cost = drain + self.fetch_time(remote_blocks);
+                let ql = inst.queue_len();
+                if cost < best_cost || (cost == best_cost && ql < best_ql) {
+                    best_cost = cost;
+                    best_idx = idx;
+                    best_ql = ql;
+                }
+            }
+        } else {
+            reason = "prefix affinity: top-K min drain";
+        }
+
+        RouteDecision {
+            req_id: req.req_id,
+            mode: "prefix_affinity",
+            chosen: instances[best_idx].id,
+            probe_overhead_s: 0.0,
+            candidates,
+            reason,
+        }
+    }
+}