//! Cache-affinity routing tuned for coding-agent workloads.
//!
//! Motivation — the coding trace has three dominant patterns:
//!
//! 1. **Short system-prompt-only requests** (≤10 blocks): novel per-chat but
//!    sharing a small set of system prompts across millions of invocations.
//! 2. **Long multi-turn chains**: parent→child prefixes share ~60+ blocks
//!    and grow by ~6 blocks per turn. Sticking the chain to one instance
//!    maximises L0 hits for every subsequent turn.
//! 3. **Completely novel one-shots**: no existing cache anywhere; should be
//!    placed to maximise *future* reuse, not just minimise current load.
//!
//! `cache_score` minimises `α·queue_len + β·miss_blocks`. With the shipping
//! defaults (α=1, β=0.1) a single extra queue position is worth ten extra
//! miss blocks, so short novel requests — the bulk of traffic — reduce to
//! pure least-loaded routing and scatter the same system prompt across
//! dozens of instances. Each scattered copy burns HBM that could have held a
//! different hot prefix, depressing the cluster-wide L0 hit-rate.
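//!
//! Worked example (illustrative numbers): a 6-block system-prompt request
//! arrives while instance A holds all 6 blocks in L0 at `queue_len = 2` and
//! instance B is cold at `queue_len = 1`. Under cache_score,
//! `cost_A = 1·2 + 0.1·0 = 2.0` versus `cost_B = 1·1 + 0.1·6 = 1.6`, so the
//! cold instance wins and the prompt is re-cached there.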
//!
//! `cache_affinity` fixes this with two changes (a worked comparison follows
//! the list):
//!
//! * **Strong cache weight** — cost is `α·queue_len − γ·l0_hit`, with
//!   γ ≫ α·input_blocks, so any real L0 hit beats load-balancing. A soft
//!   bonus (`δ·meta_only_hit`) still rewards instances that have the prefix
//!   in L1/DRAM even when L0 is empty.
//!
//! * **Deterministic rendezvous tiebreak** — among instances that tie on
//!   `(cost, hit, queue)`, we rank by `rendezvous(fingerprint, instance_id)`
//!   where `fingerprint` is an FNV hash of the first few block hashes. This
//!   turns cold routing from "first-found" (which piles on instance 0 until
//!   it fills, then spills sequentially) into a consistent hash that maps
//!   each distinct prefix to the *same* small set of homes. Repeat traffic
//!   for that prefix therefore concentrates on its home, building a strong
//!   L0 working set.
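//!
//! Revisiting the same example under the strong weights (taking α = 1,
//! γ = 1.0, δ = 0.25): `cost_A = 1·2 − 1.0·6 = −4.0` versus
//! `cost_B = 1·1 − 1.0·0 = 1.0`, so the warm instance now wins comfortably;
//! two cold instances that tie at `cost = α·q` instead fall through to the
//! rendezvous tiebreak.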
//!
//! Overload protection is implicit: rendezvous only breaks exact cost ties,
//! so once the chosen home's `queue_len` exceeds what its cache bonus can
//! cover, the load term dominates and the router naturally spills to the
//! next-best instance.
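//!
//! For example (illustrative numbers): a cold prefix whose rendezvous home
//! sits at `queue_len = 5` while another instance sits at `queue_len = 4`
//! costs `5α` versus `4α`, so the home loses outright and the request spills.
//!
//! A minimal construction sketch (illustration only; the harness that feeds
//! requests through the `Router` trait is assumed, not shown):
//!
//! ```ignore
//! // α = 1.0 per queue position; fingerprint over the first 4 block hashes.
//! let main = CacheAffinityRouter::new(1.0, 4);
//! // Ablations isolating each of the two changes described above.
//! let weak = CacheAffinityRouter::weak_with_rendezvous(1.0, 4);
//! let strong = CacheAffinityRouter::strong_no_rendezvous(1.0, 4);
//! ```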

use crate::cluster::meta_store::MetaStore;
use crate::instance::Instance;
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
pub struct CacheAffinityRouter {
    /// Router display / trace name.
    name: &'static str,
    /// Weight on queue length (per queued request).
    load_alpha: f64,
    /// Reward per L0-hit block (real, locally cached).
    l0_gamma: f64,
    /// Reward per block present via meta-store but not in L0 (L1 / remote).
    meta_delta: f64,
    /// Number of leading block hashes folded into the prefix fingerprint.
    fingerprint_k: usize,
    /// Whether to break ties by rendezvous hash (sticky consistent placement)
    /// or by first-found order (matches cache_score behaviour).
    use_rendezvous: bool,
}
impl CacheAffinityRouter {
    /// Shipping configuration: strong cache weights (γ=1.0, δ=0.25) with
    /// rendezvous tiebreak.
    pub fn new(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity",
            load_alpha,
            l0_gamma: 1.0,
            meta_delta: 0.25,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: true,
        }
    }
    /// Ablation: cache_score-style weights (γ=0.1, δ=0) but keep rendezvous
    /// tiebreak. Isolates the contribution of deterministic sticky placement.
    pub fn weak_with_rendezvous(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity_weak_rend",
            load_alpha,
            l0_gamma: 0.1,
            meta_delta: 0.0,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: true,
        }
    }
    /// Ablation: strong cache weights (γ=1.0, δ=0.25) but first-found tiebreak
    /// instead of rendezvous. Isolates the contribution of reweighting alone.
    pub fn strong_no_rendezvous(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity_strong_only",
            load_alpha,
            l0_gamma: 1.0,
            meta_delta: 0.25,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: false,
        }
    }
    /// FNV-1a over the first `k` block hashes — identifies the prefix family
    /// (system-prompt + early agent context) that drives cache reuse.
    /// An empty request hashes nothing and deterministically returns the
    /// FNV-1a offset basis.
    fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
        let take = hash_ids.len().min(k.max(1));
        let mut fp: u64 = 0xcbf29ce484222325;
        for &h in &hash_ids[..take] {
            fp ^= h;
            fp = fp.wrapping_mul(0x100000001b3);
        }
        fp
    }
    /// Splitmix64-style rendezvous score for (fingerprint, instance_id).
    /// Uniform over u64; higher = preferred home.
    fn rendezvous(fp: u64, instance_id: u32) -> u64 {
        let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
        h = h.wrapping_add(0x9e3779b97f4a7c15);
        h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
        h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
        h ^ (h >> 31)
    }
}
impl Router for CacheAffinityRouter {
    fn name(&self) -> &'static str {
        self.name
    }
    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        meta: &MetaStore,
        now: f64,
    ) -> RouteDecision {
        let n = instances.len();
        let l0 = local_l0_scores(req, instances);
        // Meta-store predicted prefix — includes L1/remote-reachable blocks.
        let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
        let fp = Self::fingerprint(&req.hash_ids, self.fingerprint_k);
        let mut candidates = Vec::with_capacity(n);
        let mut best_idx: usize = 0;
        let mut best_cost = f64::INFINITY;
        let mut best_hit = 0u32;
        let mut best_queue = u32::MAX;
        let mut best_rend: u64 = 0;
        for (i, inst) in instances.iter().enumerate() {
            let hit = l0[i];
            // meta_only = extra blocks reachable by RDMA/L1 beyond the L0 hit.
            let meta_only = meta_scores[i].saturating_sub(hit);
            let q = inst.queue_len();

            // Cost to minimise — lower is better.
            //   load term:   α · queue_len
            //   cache term: −γ · l0_hit − δ · meta_only
            // Short novel prefixes yield hit = 0 on every instance, so cost
            // reduces to α·q and the rendezvous tiebreak picks the home.
            let cost = self.load_alpha * q as f64
                - self.l0_gamma * hit as f64
                - self.meta_delta * meta_only as f64;
            let rend = Self::rendezvous(fp, inst.id);
            candidates.push(CandidateInfo {
                instance: inst.id,
                predicted_prefix: hit,
                load_blocks: inst.kv_blocks_used,
                queue_len: q,
            });
            // Tiebreak chain (descending preference):
            //   1. lowest cost
            //   2. highest hit (break cost ties toward real L0 work)
            //   3. lowest queue
            //   4. highest rendezvous (deterministic sticky home), optional
            let better = if cost < best_cost {
                true
            } else if cost > best_cost {
                false
            } else if hit > best_hit {
                true
            } else if hit < best_hit {
                false
            } else if q < best_queue {
                true
            } else if q > best_queue {
                false
            } else if self.use_rendezvous {
                rend > best_rend
            } else {
                // First-found wins on full tie (matches cache_score behaviour).
                false
            };
            if better {
                best_cost = cost;
                best_hit = hit;
                best_queue = q;
                best_rend = rend;
                best_idx = i;
            }
        }
        crate::router::local_route_decision(
            req.req_id,
            // Report the variant's own name so ablation traces stay
            // distinguishable (the literal "cache_affinity" mislabelled them).
            self.name,
            instances[best_idx].id,
            0.0,
            candidates,
            "argmin(α·q − γ·l0_hit − δ·meta_only) + rendezvous tiebreak",
        )
    }
}
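
// A minimal test sketch (not in the original file) exercising the two pure
// helpers above. It assumes nothing beyond this module: no Instance,
// MetaStore, or trace fixtures are constructed.
#[cfg(test)]
mod tests {
    use super::CacheAffinityRouter;

    /// The fingerprint must depend only on the first `k` block hashes, so
    /// every turn of a chain sharing that prefix maps to the same home.
    #[test]
    fn fingerprint_ignores_blocks_beyond_k() {
        let a = CacheAffinityRouter::fingerprint(&[1, 2, 3, 4], 3);
        let b = CacheAffinityRouter::fingerprint(&[1, 2, 3, 99], 3);
        assert_eq!(a, b, "suffix beyond k must not change the fingerprint");
        let c = CacheAffinityRouter::fingerprint(&[9, 2, 3, 4], 3);
        assert_ne!(a, c, "a different leading block should change it");
    }

    /// Rendezvous hashing's consistency property: removing a losing
    /// instance never moves a prefix's home.
    #[test]
    fn rendezvous_home_survives_removal_of_losers() {
        let fp = CacheAffinityRouter::fingerprint(&[7, 8, 9], 3);
        let ids: Vec<u32> = (0..8).collect();
        let home = *ids
            .iter()
            .max_by_key(|&&id| CacheAffinityRouter::rendezvous(fp, id))
            .unwrap();
        let survivors: Vec<u32> =
            ids.into_iter().filter(|&id| id != (home + 1) % 8).collect();
        let home2 = *survivors
            .iter()
            .max_by_key(|&&id| CacheAffinityRouter::rendezvous(fp, id))
            .unwrap();
        assert_eq!(home, home2);
    }

    /// Executable restatement of the module docs' worked comparison: under
    /// the strong weights a full 6-block L0 hit outweighs one extra queue
    /// position.
    #[test]
    fn strong_weights_prefer_warm_instance() {
        let (alpha, gamma, delta) = (1.0_f64, 1.0_f64, 0.25_f64);
        let cost = |q: u32, hit: u32, meta_only: u32| {
            alpha * q as f64 - gamma * hit as f64 - delta * meta_only as f64
        };
        // Warm instance: queue 2, full 6-block hit → −4.0; cold: queue 1 → 1.0.
        assert!(cost(2, 6, 0) < cost(1, 0, 0));
    }
}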