feat: update ttft modeling and add cache affinity

2026-04-15 19:08:10 +08:00
parent ff316c6873
commit a3f386c858
15 changed files with 1276 additions and 99 deletions


@@ -0,0 +1,217 @@
//! Cache-affinity routing tuned for coding-agent workloads.
//!
//! Motivation — the coding trace has three dominant patterns:
//!
//! 1. **Short system-prompt-only requests** (≤10 blocks): novel per-chat but
//! sharing a small set of system prompts across millions of invocations.
//! 2. **Long multi-turn chains**: parent→child prefixes share ~60+ blocks
//! and grow by ~6 blocks per turn. Sticking the chain to one instance
//! maximises L0 hits for every subsequent turn.
//! 3. **Completely novel one-shots**: no existing cache anywhere; should be
//! placed to maximise *future* reuse, not just minimise current load.
//!
//! `cache_score` minimises `α·queue_len + β·miss_blocks`. With the shipping
//! defaults (α=1, β=0.1) a single extra queue position is worth ten extra
//! miss blocks, so short novel requests — the bulk of traffic — reduce to
//! pure least-loaded routing and scatter the same system prompt across
//! dozens of instances. Each scattered copy burns HBM that could have held a
//! different hot prefix, depressing the cluster-wide L0 hit-rate.
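//!
//! Concretely, with α=1 and β=0.1: an instance holding a fully cached
//! 8-block system prompt costs `q`, while a cold instance with one fewer
//! queued request costs `(q - 1) + 0.1·8 = q - 0.2`. The cold instance
//! wins, and the shared prompt is re-cached there for no cluster-wide gain.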
//!
//! `cache_affinity` fixes this with two changes:
//!
//! * **Strong cache weight** — cost is `α·queue_len - γ·l0_hit - δ·meta_only`,
//!   with γ comparable to α per block (γ=1.0 by default), so any real L0
//!   hit (tens of blocks for a chain turn) beats load-balancing. A soft
//!   bonus (`δ·meta_only`) still rewards instances that have the prefix
//!   in L1/DRAM even when L0 is empty. A worked example closes this
//!   comment.
//!
//! * **Deterministic rendezvous tiebreak** — among instances that tie on
//! `(cost, hit, queue)`, we rank by `rendezvous(fingerprint, instance_id)`
//! where `fingerprint` is an FNV hash of the first few block hashes. This
//! turns cold routing from "first-found" (which piles on instance 0 until
//! it fills, then spills sequentially) into a consistent hash that maps
//! each distinct prefix to the *same* small set of homes. Repeat traffic
//! for that prefix therefore concentrates on its home, building a strong
//! L0 working set.
//!
//! Overload protection falls out of the cost itself: the rendezvous
//! tiebreak only applies on exact `(cost, hit, queue)` ties, so once the
//! rendezvous-chosen home's queue grows past its peers, the load term
//! dominates and the router naturally spills to the next-best instance.
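//!
//! Worked example, reusing the scenario above with γ=1.0 and δ=0.25: the
//! cached home now costs `q - 8` while a cold instance at queue `q'` costs
//! `q'`, so the home keeps the request until its queue runs more than
//! eight positions longer (at a gap of exactly eight the hit tiebreak
//! still favours it). A meta-only (L1/DRAM) copy is worth `0.25·8 = 2`
//! queue positions, a softer pull than a real L0 hit.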
use crate::cluster::meta_store::MetaStore;
use crate::instance::Instance;
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
pub struct CacheAffinityRouter {
/// Router display / trace name.
name: &'static str,
/// Weight on queue length (per queued request).
load_alpha: f64,
/// Reward per L0-hit block (real, locally cached).
l0_gamma: f64,
/// Reward per block present via meta-store but not in L0 (L1 / remote).
meta_delta: f64,
/// Number of leading block hashes folded into the prefix fingerprint.
fingerprint_k: usize,
/// Whether to break ties by rendezvous hash (sticky consistent placement)
/// or by first-found order (matches cache_score behaviour).
use_rendezvous: bool,
}
impl CacheAffinityRouter {
pub fn new(load_alpha: f64) -> Self {
Self {
name: "cache_affinity",
load_alpha,
l0_gamma: 1.0,
meta_delta: 0.25,
fingerprint_k: 4,
use_rendezvous: true,
}
}
/// Ablation: cache_score-style weights (γ=0.1, δ=0) but keep rendezvous
/// tiebreak. Isolates the contribution of deterministic sticky placement.
pub fn weak_with_rendezvous(load_alpha: f64) -> Self {
Self {
name: "cache_affinity_weak_rend",
load_alpha,
l0_gamma: 0.1,
meta_delta: 0.0,
fingerprint_k: 4,
use_rendezvous: true,
}
}
/// Ablation: strong cache weights (γ=1.0, δ=0.25) but first-found tiebreak
/// instead of rendezvous. Isolates the contribution of reweighting alone.
pub fn strong_no_rendezvous(load_alpha: f64) -> Self {
Self {
name: "cache_affinity_strong_only",
load_alpha,
l0_gamma: 1.0,
meta_delta: 0.25,
fingerprint_k: 4,
use_rendezvous: false,
}
}
    /// FNV-1a-style fold over the first `k` block hashes — identifies the
    /// prefix family (system-prompt + early agent context) that drives
    /// cache reuse.
    fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
        // Fold at least one hash when available; an empty request keeps
        // the FNV offset basis as its deterministic fingerprint.
        let take = hash_ids.len().min(k.max(1));
        let mut fp: u64 = 0xcbf29ce484222325;
        for &h in &hash_ids[..take] {
            fp ^= h;
            fp = fp.wrapping_mul(0x100000001b3);
        }
        fp
    }
/// Splitmix64-style rendezvous score for (fingerprint, instance_id).
/// Uniform over u64; higher = preferred home.
fn rendezvous(fp: u64, instance_id: u32) -> u64 {
let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
h = h.wrapping_add(0x9e3779b97f4a7c15);
h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
h ^ (h >> 31)
}
}
impl Router for CacheAffinityRouter {
fn name(&self) -> &'static str {
self.name
}
fn route(
&mut self,
req: &RequestRecord,
instances: &[Instance],
meta: &MetaStore,
now: f64,
) -> RouteDecision {
let n = instances.len();
let l0 = local_l0_scores(req, instances);
// Meta-store predicted prefix — includes L1/remote-reachable blocks.
let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
let fp = Self::fingerprint(&req.hash_ids, self.fingerprint_k);
let mut candidates = Vec::with_capacity(n);
let mut best_idx: usize = 0;
let mut best_cost = f64::INFINITY;
let mut best_hit = 0u32;
let mut best_queue = u32::MAX;
let mut best_rend: u64 = 0;
for (i, inst) in instances.iter().enumerate() {
let hit = l0[i];
// meta_only = extra blocks reachable by RDMA/L1 beyond L0 hit.
let meta_only = meta_scores[i].saturating_sub(hit);
let q = inst.queue_len();
            // Cost to minimise — lower is better:
            //   load term:  α · queue_len
            //   cache term: -γ · l0_hit - δ · meta_only
// Short novel prefixes yield hit=0 on every instance, so cost
// reduces to α·q and the rendezvous tiebreak picks the home.
let cost = self.load_alpha * q as f64
- self.l0_gamma * hit as f64
- self.meta_delta * meta_only as f64;
let rend = Self::rendezvous(fp, inst.id);
candidates.push(CandidateInfo {
instance: inst.id,
predicted_prefix: hit,
load_blocks: inst.kv_blocks_used,
queue_len: q,
});
// Tiebreak chain (descending preference):
// 1. lowest cost
// 2. highest hit (break cost ties toward real L0 work)
// 3. lowest queue
// 4. highest rendezvous (deterministic sticky home), optional
let better = if cost < best_cost {
true
} else if cost > best_cost {
false
} else if hit > best_hit {
true
} else if hit < best_hit {
false
} else if q < best_queue {
true
} else if q > best_queue {
false
} else if self.use_rendezvous {
rend > best_rend
} else {
// First-found wins on full tie (matches cache_score behaviour).
false
};
if better {
best_cost = cost;
best_hit = hit;
best_queue = q;
best_rend = rend;
best_idx = i;
}
}
RouteDecision {
req_id: req.req_id,
mode: "cache_affinity",
chosen: instances[best_idx].id,
probe_overhead_s: 0.0,
candidates,
reason: "argmin(α·q γ·l0_hit δ·meta_only) + rendezvous tiebreak",
}
}
}
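
// Illustrative unit tests (a sketch, not part of the routing API): they
// exercise only the pure hashing helpers above, so they need no cluster,
// meta-store, or trace fixtures.
#[cfg(test)]
mod tests {
    use super::*;

    // The shipped variants should carry their documented weight settings;
    // load_alpha = 1.0 mirrors the cache_score default from the module docs.
    #[test]
    fn variants_construct_with_documented_weights() {
        assert_eq!(CacheAffinityRouter::new(1.0).l0_gamma, 1.0);
        assert_eq!(CacheAffinityRouter::weak_with_rendezvous(1.0).l0_gamma, 0.1);
        assert!(!CacheAffinityRouter::strong_no_rendezvous(1.0).use_rendezvous);
    }

    // The same fingerprint must elect the same rendezvous winner no matter
    // the order in which candidate instances are scanned.
    #[test]
    fn rendezvous_winner_is_order_independent() {
        let fp = CacheAffinityRouter::fingerprint(&[1, 2, 3, 4, 5], 4);
        let winner = |ids: &[u32]| {
            ids.iter()
                .copied()
                .max_by_key(|&id| CacheAffinityRouter::rendezvous(fp, id))
                .unwrap()
        };
        assert_eq!(winner(&[0, 1, 2, 3]), winner(&[3, 2, 1, 0]));
    }

    // Only the first `fingerprint_k` hashes are folded, so two chats that
    // share a system prompt but diverge later map to the same home.
    #[test]
    fn fingerprint_ignores_blocks_beyond_k() {
        let a = CacheAffinityRouter::fingerprint(&[7, 8, 9, 10, 111], 4);
        let b = CacheAffinityRouter::fingerprint(&[7, 8, 9, 10, 222], 4);
        assert_eq!(a, b);
    }
}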