//! Prefix-affinity routing with load-aware fallback.
//!
//! **Key insight**: in real LLM traces, 99%+ of requests share a common
//! system-prompt prefix (dozens to hundreds of 16-token blocks). If we
//! *consistently* route requests with the same prefix to the same small set
//! of instances, L0 (HBM) cache hit rates increase dramatically because the
//! working set per instance is concentrated rather than scattered.
//!
//! Algorithm (rendezvous hashing + drain-time-aware selection):
//!
//! 1. **Fingerprint**: hash the first `K` blocks of the request to produce a
//!    prefix fingerprint that captures the system-prompt identity.
//!
//! 2. **Rendezvous ranking**: for each instance `i`, compute
//!    `rendezvous(fingerprint, i)` — a deterministic pseudo-random score.
//!    Sort instances by this score descending to get a stable, per-prefix
//!    ordering.
//!
//! 3. **Select from top candidates**: among the top `fan_out` instances in
//!    the rendezvous ranking, pick the one with the lowest estimated drain
//!    time (architecture-aware, per-request sum). This accounts for
//!    heterogeneous request sizes in the queue.
//!
//! 4. **Overload fallback**: if all top candidates have queue length above a
//!    threshold, expand to the full instance set and pick by estimated-TTFT
//!    scoring (drain + fetch).
//!
//! The combination ensures:
//! - **Cache locality**: same-prefix requests cluster on a few instances,
//!   building strong L0 cache entries that benefit subsequent requests.
//! - **Load balance**: within the affinity group, drain-time-aware selection
//!   avoids hot-spotting from large-prompt requests.
//! - **Zero overhead**: no per-instance probes needed; fingerprint +
//!   rendezvous are pure arithmetic.

use crate::cluster::meta_store::MetaStore;
use crate::config::Config;
use crate::instance::Instance;
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;

pub struct PrefixAffinityRouter {
    /// Number of leading block hashes used for the prefix fingerprint.
    prefix_k: usize,
    /// Number of top-affinity instances to consider before fallback.
    fan_out: usize,
    /// Queue-length threshold: if all top candidates exceed this, expand to
    /// the full instance set.
    overload_threshold: u32,
}

impl PrefixAffinityRouter {
    pub fn new(config: &Config) -> Self {
        let n = config.cluster.total_instances() as usize;
        let cfg_fan = config.cluster.router.affinity_fan_out;
        // fan_out: use the configured value if set; otherwise default to
        // max(2, n/8). Either way, cap at the instance count.
        let fan_out = if cfg_fan > 0 {
            cfg_fan.min(n)
        } else {
            (n / 8).max(2).min(n)
        };
        Self {
            prefix_k: config.cluster.router.prefix_k,
            fan_out,
            overload_threshold: 4,
        }
    }

    /// Compute a prefix fingerprint from the first K block hashes (FNV-1a
    /// over the 64-bit block hashes).
    fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
        let n = hash_ids.len().min(k);
        let mut fp: u64 = 0xcbf29ce484222325; // FNV offset basis
        for &h in &hash_ids[..n] {
            fp ^= h;
            fp = fp.wrapping_mul(0x100000001b3); // FNV prime
        }
        fp
    }
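    /// Illustrative sketch of steps 1-2, unused by `route` (which inlines
    /// the same logic over `&[Instance]`): maps a request's block hashes to
    /// the stable, per-prefix ordering of instance ids that the router
    /// selects from.
    #[allow(dead_code)]
    fn affinity_ranking(hash_ids: &[u64], k: usize, instance_ids: &[u32]) -> Vec<u32> {
        let fp = Self::fingerprint(hash_ids, k);
        let mut ranked = instance_ids.to_vec();
        // Sort descending by rendezvous score: index 0 is the primary
        // affinity target for this prefix on every router that computes it.
        ranked.sort_unstable_by_key(|&id| std::cmp::Reverse(Self::rendezvous_score(fp, id)));
        ranked
    }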
    /// Rendezvous hash: deterministic pseudo-random score for
    /// (fingerprint, instance_id). Higher score = higher affinity.
    fn rendezvous_score(fp: u64, instance_id: u32) -> u64 {
        let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
        // Splitmix64 finalizer: mixes the combined value into a
        // well-distributed 64-bit score.
        h = h.wrapping_add(0x9e3779b97f4a7c15);
        h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
        h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
        h ^ (h >> 31)
    }
}

impl Router for PrefixAffinityRouter {
    fn name(&self) -> &'static str {
        "prefix_affinity"
    }

    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        _meta: &MetaStore,
        _now: f64,
    ) -> RouteDecision {
        let n = instances.len();
        let fp = Self::fingerprint(&req.hash_ids, self.prefix_k);

        // Build the rendezvous-ranked list of (score, index).
        let mut ranked: Vec<(u64, usize)> = (0..n)
            .map(|i| (Self::rendezvous_score(fp, instances[i].id), i))
            .collect();
        ranked.sort_unstable_by(|a, b| b.0.cmp(&a.0)); // descending score

        // Collect candidate info for logging (also needed for fallback).
        let scores = local_l0_scores(req, instances);
        let candidates: Vec<CandidateInfo> = instances
            .iter()
            .map(|inst| CandidateInfo {
                instance: inst.id,
                predicted_prefix: scores[inst.id as usize],
                load_blocks: inst.kv_blocks_used,
                queue_len: inst.queue_len(),
            })
            .collect();

        // Phase 1: among the top fan_out instances, pick the lowest
        // estimated drain time, breaking ties by queue length.
        let top_k = self.fan_out.min(n);
        let mut best_idx = ranked[0].1;
        let mut best_drain = instances[best_idx].estimated_drain_time();
        let mut best_ql = instances[best_idx].queue_len();
        let mut all_overloaded = best_ql > self.overload_threshold;
        for &(_, idx) in &ranked[1..top_k] {
            let drain = instances[idx].estimated_drain_time();
            let ql = instances[idx].queue_len();
            if drain < best_drain || (drain == best_drain && ql < best_ql) {
                best_idx = idx;
                best_drain = drain;
                best_ql = ql;
            }
            if ql <= self.overload_threshold {
                all_overloaded = false;
            }
        }

        // Phase 2: if all top candidates are overloaded, search globally
        // using estimated TTFT (drain + fetch) for the best fallback.
        let reason;
        if all_overloaded {
            reason = "affinity fallback: min(drain+fetch)";
            let mut best_cost = f64::INFINITY;
            for &(_, idx) in ranked.iter() {
                let inst = &instances[idx];
                let drain = inst.estimated_drain_time();
                // Tokens that miss the local cache and must be prefilled.
                let miss_tokens = (req.hash_ids.len() as u32)
                    .saturating_sub(scores[idx])
                    .saturating_mul(inst.block_size_tokens);
                let cost = drain + inst.compute.prefill_time(miss_tokens);
                let ql = inst.queue_len();
                if cost < best_cost || (cost == best_cost && ql < best_ql) {
                    best_cost = cost;
                    best_idx = idx;
                    best_ql = ql;
                }
            }
        } else {
            reason = "prefix affinity: top-K min drain";
        }

        crate::router::local_route_decision(
            req.req_id,
            "prefix_affinity",
            instances[best_idx].id,
            0.0,
            candidates,
            reason,
        )
    }
}
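// A minimal sketch of the invariants the module docs rely on. These tests
// exercise only the pure hashing helpers, not `route` itself (which would
// need `Instance` fixtures).
#[cfg(test)]
mod tests {
    use super::PrefixAffinityRouter as R;
    use std::cmp::Reverse;

    /// Step 1 invariant: the fingerprint depends only on the first K block
    /// hashes, so requests sharing a system prompt collide by construction.
    #[test]
    fn fingerprint_depends_only_on_prefix() {
        let a = [1u64, 2, 3, 4, 5];
        let b = [1u64, 2, 3, 9, 9];
        assert_eq!(R::fingerprint(&a, 3), R::fingerprint(&b, 3));
        assert_ne!(R::fingerprint(&a, 4), R::fingerprint(&b, 4));
    }

    /// Step 2 invariant (rendezvous property): removing one instance leaves
    /// the relative order of the survivors unchanged, so only prefixes
    /// pinned to the removed instance get remapped.
    #[test]
    fn rendezvous_ranking_is_stable_under_removal() {
        let fp = R::fingerprint(&[7u64, 8, 9], 3);
        let mut full: Vec<u32> = (0..16).collect();
        full.sort_unstable_by_key(|&id| Reverse(R::rendezvous_score(fp, id)));
        let removed = full[3]; // drop an arbitrary instance
        let survivors: Vec<u32> = full.into_iter().filter(|&id| id != removed).collect();
        let mut rehashed: Vec<u32> = (0..16u32).filter(|&id| id != removed).collect();
        rehashed.sort_unstable_by_key(|&id| Reverse(R::rendezvous_score(fp, id)));
        assert_eq!(survivors, rehashed);
    }
}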