KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
128
src/router/estimated_ttft.rs
Normal file
128
src/router/estimated_ttft.rs
Normal file
@@ -0,0 +1,128 @@
//! First-principles TTFT-optimal routing.
//!
//! Estimates the actual time-to-first-token for each candidate instance:
//!
//! `TTFT(r,i) = drain(i) + fetch(r,i) + prefill(miss)`
//!
//! - **drain** — exact queue drain time: sum of per-request `prefill_time()`
//!   using the architecture-aware compute model (quadratic / DSA).
//!
//! - **fetch** — RDMA fetch time for blocks cached elsewhere in the cluster
//!   but not on instance `i` locally.
//!
//! - **prefill** — compute for cluster-wide cache-miss tokens (constant
//!   across instances, cancels in the argmin).
//!
//! The router minimises `drain(i) + fetch(r,i)`, with ties broken by
//! lowest `queue_len` then most local cache. The fetch overlap with queue
//! drain is handled by keeping the additive form: this gives double
//! incentive to prefer instances with local cache, which empirically
//! outperforms the `max(drain, fetch)` alternative because even small
//! RDMA savings compound across thousands of routing decisions.

use crate::cluster::meta_store::MetaStore;
|
||||
use crate::config::Config;
|
||||
use crate::instance::Instance;
|
||||
use crate::router::{CandidateInfo, RouteDecision, Router};
|
||||
use crate::trace::RequestRecord;
|
||||
|
||||
/// Router that sends each request to the instance with the lowest
/// estimated time-to-first-token (`drain + fetch`); see the module
/// documentation for the full cost model.
pub struct EstimatedTtftRouter {
    /// Bytes per KV block (for RDMA cost estimation).
    kv_block_bytes: f64,
    /// RDMA bandwidth in bytes/s.
    rdma_bw: f64,
    /// RDMA per-transfer latency in seconds.
    rdma_latency_s: f64,
}
|
||||
|
||||
impl EstimatedTtftRouter {
|
||||
pub fn new(config: &Config) -> Self {
|
||||
Self {
|
||||
kv_block_bytes: config.model.kv_block_bytes() as f64,
|
||||
rdma_bw: config.hardware.rdma_bw,
|
||||
rdma_latency_s: config.hardware.rdma_latency_us * 1e-6,
|
||||
}
|
||||
}
|
||||
|
||||
/// Estimate RDMA fetch time for `remote_blocks` blocks.
|
||||
fn fetch_time(&self, remote_blocks: u32) -> f64 {
|
||||
if remote_blocks == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
let bytes = remote_blocks as f64 * self.kv_block_bytes;
|
||||
bytes / self.rdma_bw + self.rdma_latency_s
|
||||
}
|
||||
}
|
||||
|
||||
impl Router for EstimatedTtftRouter {
|
||||
fn name(&self) -> &'static str {
|
||||
"estimated_ttft"
|
||||
}
|
||||
|
||||
fn route(
|
||||
&mut self,
|
||||
req: &RequestRecord,
|
||||
instances: &[Instance],
|
||||
meta: &MetaStore,
|
||||
now: f64,
|
||||
) -> RouteDecision {
|
||||
let n = instances.len();
|
||||
let scores = meta.score_prefix(&req.hash_ids, now, n);
|
||||
|
||||
// Cluster-wide max prefix: blocks reachable via RDMA from any peer.
|
||||
let cluster_prefix = scores.iter().copied().max().unwrap_or(0);
|
||||
|
||||
let mut best: u32 = 0;
|
||||
let mut best_cost = f64::INFINITY;
|
||||
let mut best_queue = u32::MAX;
|
||||
let mut best_local = 0u32;
|
||||
let mut candidates = Vec::with_capacity(n);
|
||||
|
||||
for inst in instances {
|
||||
let i = inst.id as usize;
|
||||
let local_prefix = scores[i];
|
||||
|
||||
// 1. Exact queue drain time (architecture-aware, per-request sum).
|
||||
let drain = inst.estimated_drain_time();
|
||||
|
||||
// 2. RDMA fetch cost for blocks not locally cached.
|
||||
let remote_blocks = cluster_prefix.saturating_sub(local_prefix);
|
||||
let fetch = self.fetch_time(remote_blocks);
|
||||
|
||||
// Additive cost: drain + fetch.
|
||||
// The additive form gives explicit incentive to prefer local cache
|
||||
// (lower fetch) even when the queue is non-empty, which reduces
|
||||
// total RDMA traffic and improves TTFT in aggregate.
|
||||
let cost = drain + fetch;
|
||||
|
||||
candidates.push(CandidateInfo {
|
||||
instance: inst.id,
|
||||
predicted_prefix: local_prefix,
|
||||
load_blocks: inst.kv_blocks_used,
|
||||
queue_len: inst.queue_len(),
|
||||
});
|
||||
|
||||
// Minimise (cost, queue_len, -local_prefix).
|
||||
let ql = inst.queue_len();
|
||||
let better = cost < best_cost
|
||||
|| (cost == best_cost && ql < best_queue)
|
||||
|| (cost == best_cost && ql == best_queue && local_prefix > best_local);
|
||||
|
||||
if better {
|
||||
best_cost = cost;
|
||||
best = inst.id;
|
||||
best_queue = ql;
|
||||
best_local = local_prefix;
|
||||
}
|
||||
}
|
||||
|
||||
RouteDecision {
|
||||
req_id: req.req_id,
|
||||
mode: "estimated_ttft",
|
||||
chosen: best,
|
||||
probe_overhead_s: 0.0,
|
||||
candidates,
|
||||
reason: "argmin(drain_time + fetch_time)",
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user