//! Cache-affinity routing tuned for coding-agent workloads.
//!
//! Motivation — the coding trace has three dominant patterns:
//!
//! 1. **Short system-prompt-only requests** (≤10 blocks): novel per-chat but
//!    sharing a small set of system prompts across millions of invocations.
//! 2. **Long multi-turn chains**: parent→child prefixes share ~60+ blocks
//!    and grow by ~6 blocks per turn. Sticking the chain to one instance
//!    maximises L0 hits for every subsequent turn.
//! 3. **Completely novel one-shots**: no existing cache anywhere; should be
//!    placed to maximise *future* reuse, not just minimise current load.
//!
//! `cache_score` minimises `α·queue_len + β·miss_blocks`. With the shipping
//! defaults (α=1, β=0.1) a single extra queue position is worth ten extra
//! miss blocks, so short novel requests — the bulk of traffic — reduce to
//! pure least-loaded routing and scatter the same system prompt across
//! dozens of instances. Each scattered copy burns HBM that could have held
//! a different hot prefix, depressing the cluster-wide L0 hit-rate.
//!
//! `cache_affinity` fixes this with two changes:
//!
//! * **Strong cache weight** — cost is `α·queue_len − γ·l0_hit − δ·meta_only`,
//!   with γ large enough that `γ·l0_hit` dwarfs any realistic queue spread
//!   (γ = 1.0 here; minimising `β·miss_blocks` in cache_score rewards each
//!   hit block by only β = 0.1), so any real L0 hit beats load-balancing.
//!   The soft `δ·meta_only` bonus still rewards instances that hold the
//!   prefix in L1/DRAM even when L0 is empty. A worked example follows
//!   `new` below.
//!
//! * **Deterministic rendezvous tiebreak** — among instances that tie on
//!   `(cost, hit, queue)`, we rank by `rendezvous(fingerprint, instance_id)`,
//!   where `fingerprint` is an FNV-style hash of the first few block hashes.
//!   This turns cold routing from "first-found" (which piles onto instance 0
//!   until it fills, then spills sequentially) into a consistent hash that
//!   maps each distinct prefix to the *same* small set of homes. Repeat
//!   traffic for that prefix therefore concentrates on its home, building a
//!   strong L0 working set.
//!
//! Overload protection is implicit: there is no hard threshold, but the
//! load term grows by α with every queued request, so once the
//! rendezvous-chosen home sits more than `(γ·l0_hit + δ·meta_only)/α` queue
//! positions deeper than the best alternative, the load term dominates and
//! the router naturally spills to the next-best instance.

use crate::cluster::meta_store::MetaStore;
use crate::instance::Instance;
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;

pub struct CacheAffinityRouter {
    /// Router display / trace name.
    name: &'static str,
    /// Weight on queue length (per queued request).
    load_alpha: f64,
    /// Reward per L0-hit block (real, locally cached).
    l0_gamma: f64,
    /// Reward per block present via meta-store but not in L0 (L1 / remote).
    meta_delta: f64,
    /// Number of leading block hashes folded into the prefix fingerprint.
    fingerprint_k: usize,
    /// Whether to break ties by rendezvous hash (sticky consistent placement)
    /// or by first-found order (matches cache_score behaviour).
    use_rendezvous: bool,
}

impl CacheAffinityRouter {
    pub fn new(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity",
            load_alpha,
            l0_gamma: 1.0,
            meta_delta: 0.25,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: true,
        }
    }
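
    // Worked example of the weighting, using the defaults above and the
    // trace statistics from the module docs (illustrative, not measured):
    // a multi-turn chain sharing ~60 blocks is worth γ·60 = 60 cost units
    // on its home, so the home keeps winning until its queue runs 60
    // requests deeper than the best alternative. Under cache_score's
    // effective per-block reward of 0.1, the same prefix is worth only 6
    // queue positions, so modest load imbalance already scatters the chain.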
    /// Ablation: cache_score-style weights (γ=0.1, δ=0) but keep the
    /// rendezvous tiebreak. Isolates the contribution of deterministic
    /// sticky placement.
    pub fn weak_with_rendezvous(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity_weak_rend",
            load_alpha,
            l0_gamma: 0.1,
            meta_delta: 0.0,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: true,
        }
    }

    /// Ablation: strong cache weights (γ=1.0, δ=0.25) but first-found
    /// tiebreak instead of rendezvous. Isolates the contribution of
    /// reweighting alone.
    pub fn strong_no_rendezvous(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity_strong_only",
            load_alpha,
            l0_gamma: 1.0,
            meta_delta: 0.25,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: false,
        }
    }

    /// FNV-1a-style fold over the first `k` block hashes — identifies the
    /// prefix family (system-prompt + early agent context) that drives
    /// cache reuse.
    fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
        let take = hash_ids.len().min(k.max(1));
        let mut fp: u64 = 0xcbf29ce484222325; // FNV-1a 64-bit offset basis
        for &h in &hash_ids[..take] {
            fp ^= h;
            fp = fp.wrapping_mul(0x100000001b3); // FNV 64-bit prime
        }
        // Empty request: the loop is skipped and fp stays at the offset
        // basis, which is still a deterministic fingerprint.
        fp
    }

    /// Splitmix64-style rendezvous score for (fingerprint, instance_id).
    /// Uniform over u64; higher = preferred home.
    fn rendezvous(fp: u64, instance_id: u32) -> u64 {
        let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
        h = h.wrapping_add(0x9e3779b97f4a7c15);
        h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
        h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
        h ^ (h >> 31)
    }
}

impl Router for CacheAffinityRouter {
    fn name(&self) -> &'static str {
        self.name
    }

    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        meta: &MetaStore,
        now: f64,
    ) -> RouteDecision {
        let n = instances.len();
        let l0 = local_l0_scores(req, instances);
        // Meta-store predicted prefix — includes L1/remote-reachable blocks.
        let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
        let fp = Self::fingerprint(&req.hash_ids, self.fingerprint_k);

        let mut candidates = Vec::with_capacity(n);
        let mut best_idx: usize = 0;
        let mut best_cost = f64::INFINITY;
        let mut best_hit = 0u32;
        let mut best_queue = u32::MAX;
        let mut best_rend: u64 = 0;

        for (i, inst) in instances.iter().enumerate() {
            let hit = l0[i];
            // meta_only = extra blocks reachable by RDMA/L1 beyond the L0 hit.
            let meta_only = meta_scores[i].saturating_sub(hit);
            let q = inst.queue_len();
            // Cost to minimise — lower is better.
            //   load term:   α · queue_len
            //   cache term: −γ · l0_hit − δ · meta_only
            // Short novel prefixes yield hit = 0 on every instance, so cost
            // reduces to α·q and the rendezvous tiebreak picks the home.
            let cost = self.load_alpha * q as f64
                - self.l0_gamma * hit as f64
                - self.meta_delta * meta_only as f64;
            let rend = Self::rendezvous(fp, inst.id);

            candidates.push(CandidateInfo {
                instance: inst.id,
                predicted_prefix: hit,
                load_blocks: inst.kv_blocks_used,
                queue_len: q,
            });

            // Tiebreak chain (descending preference):
            //   1. lowest cost
            //   2. highest hit (break cost ties toward real L0 work)
            //   3. lowest queue
            //   4. highest rendezvous (deterministic sticky home), optional
            let better = if cost < best_cost {
                true
            } else if cost > best_cost {
                false
            } else if hit > best_hit {
                true
            } else if hit < best_hit {
                false
            } else if q < best_queue {
                true
            } else if q > best_queue {
                false
            } else if self.use_rendezvous {
                rend > best_rend
            } else {
                // First-found wins on full tie (matches cache_score behaviour).
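                // Keeping the earlier-indexed instance here reproduces the
                // cold-start pile-up on instance 0 described in the module
                // docs; the strong_no_rendezvous ablation measures that cost.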
                false
            };

            if better {
                best_cost = cost;
                best_hit = hit;
                best_queue = q;
                best_rend = rend;
                best_idx = i;
            }
        }

        crate::router::local_route_decision(
            req.req_id,
            self.name,
            instances[best_idx].id,
            0.0,
            candidates,
            if self.use_rendezvous {
                "argmin(α·q − γ·l0_hit − δ·meta_only) + rendezvous tiebreak"
            } else {
                "argmin(α·q − γ·l0_hit − δ·meta_only) + first-found tiebreak"
            },
        )
    }
}
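
// Illustrative unit tests for the two pure helpers above — a minimal sketch
// added for documentation; the block values are arbitrary, not taken from
// the trace, and only exercise properties the module docs already claim.
#[cfg(test)]
mod tests {
    use super::*;

    /// Same first-`k` blocks → same fingerprint family; a different first
    /// block → a different family; blocks beyond `k` are ignored.
    #[test]
    fn fingerprint_identifies_prefix_family() {
        let a = [1u64, 2, 3, 4];
        let b = [9u64, 2, 3, 4];
        let c = [1u64, 2, 3, 999]; // differs only beyond k
        let fp = |ids: &[u64]| CacheAffinityRouter::fingerprint(ids, 3);
        assert_eq!(fp(&a), fp(&a), "fingerprint must be deterministic");
        assert_ne!(fp(&a), fp(&b), "different first block, different family");
        assert_eq!(fp(&a), fp(&c), "blocks beyond k do not affect the family");
    }

    /// Rendezvous property: removing any instance other than the chosen
    /// home never changes the home, so placement stays sticky as the
    /// candidate set shrinks.
    #[test]
    fn rendezvous_home_is_stable_under_removals() {
        let fp = CacheAffinityRouter::fingerprint(&[7u64, 8, 9], 3);
        let ids: Vec<u32> = (0..8).collect();
        let home = |ids: &[u32]| {
            ids.iter()
                .copied()
                .max_by_key(|&id| CacheAffinityRouter::rendezvous(fp, id))
                .unwrap()
        };
        let h = home(&ids);
        for &gone in ids.iter().filter(|&&id| id != h) {
            let reduced: Vec<u32> =
                ids.iter().copied().filter(|&id| id != gone).collect();
            assert_eq!(home(&reduced), h, "home must survive unrelated removals");
        }
    }
}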