// File: kvcache-simulator/src/router/prefix_affinity.rs
//! Prefix-affinity routing with load-aware fallback.
//!
//! **Key insight**: in real LLM traces, 99%+ of requests share a common
//! system-prompt prefix (dozens to hundreds of 16-token blocks). If we
//! *consistently* route requests with the same prefix to the same small set
//! of instances, L0 (HBM) cache hit rates increase dramatically because the
//! working set per instance is concentrated rather than scattered.
//!
//! Algorithm (rendezvous hashing + drain-time-aware selection):
//!
//! 1. **Fingerprint**: hash the first `K` blocks of the request to produce a
//! prefix fingerprint that captures the system prompt identity.
//!
//! 2. **Rendezvous ranking**: for each instance `i`, compute
//! `rendezvous(fingerprint, i)` — a deterministic pseudo-random score.
//! Sort instances by this score descending to get a stable, per-prefix
//! ordering.
//!
//! 3. **Select from top candidates**: among the top `fan_out` instances in
//! the rendezvous ranking, pick the one with the lowest estimated drain
//! time (architecture-aware, per-request sum). This accounts for
//! heterogeneous request sizes in the queue.
//!
//! 4. **Overload fallback**: if all top candidates have queue length above a
//! threshold, expand to the full instance set and use estimated-TTFT
//! scoring (drain + fetch) for the best selection.
//!
//! The combination ensures:
//! - **Cache locality**: same-prefix requests cluster on a few instances,
//! building strong L0 cache entries that benefit subsequent requests.
//! - **Load balance**: within the affinity group, drain-time-aware selection
//! avoids hot-spotting from large-prompt requests.
//! - **Zero overhead**: no per-instance probes needed; fingerprint +
//! rendezvous are pure arithmetic.
use crate::cluster::meta_store::MetaStore;
use crate::config::Config;
use crate::instance::Instance;
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
/// Router that concentrates same-prefix requests on a small, stable set of
/// instances (rendezvous hashing on a prefix fingerprint) while balancing
/// load within that set via drain-time-aware selection.
pub struct PrefixAffinityRouter {
    /// Number of leading block hashes folded into the prefix fingerprint.
    prefix_k: usize,
    /// Size of the top-affinity candidate set considered before fallback.
    fan_out: usize,
    /// Queue-length threshold: if every top-`fan_out` candidate's queue
    /// length exceeds this, routing expands to the full instance set.
    overload_threshold: u32,
}
impl PrefixAffinityRouter {
    /// Build a router from cluster configuration.
    ///
    /// `fan_out` comes from `affinity_fan_out` when configured (capped at
    /// the instance count); otherwise it defaults to `max(2, n / 8)`, also
    /// capped at `n`.
    pub fn new(config: &Config) -> Self {
        let n = config.cluster.total_instances() as usize;
        let configured = config.cluster.router.affinity_fan_out;
        let fan_out = match configured {
            0 => (n / 8).max(2).min(n),
            f => f.min(n),
        };
        Self {
            prefix_k: config.cluster.router.prefix_k,
            fan_out,
            overload_threshold: 4,
        }
    }

    /// Fold the first `k` block hashes (or fewer, if the request is shorter)
    /// into a single FNV-1a fingerprint identifying the shared prefix.
    fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
        let take = k.min(hash_ids.len());
        hash_ids[..take]
            .iter()
            // FNV-1a step: xor in the datum, then multiply by the FNV prime.
            .fold(0xcbf29ce484222325u64, |acc, &h| {
                (acc ^ h).wrapping_mul(0x100000001b3)
            })
    }

    /// Deterministic rendezvous score for a (fingerprint, instance) pair;
    /// larger means stronger affinity. Mixes the pair with the splitmix64
    /// finalizer so scores behave like independent uniform draws.
    fn rendezvous_score(fp: u64, instance_id: u32) -> u64 {
        let seed = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
        let mut x = seed.wrapping_add(0x9e3779b97f4a7c15);
        x = (x ^ (x >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
        x = (x ^ (x >> 27)).wrapping_mul(0x94d049bb133111eb);
        x ^ (x >> 31)
    }
}
impl Router for PrefixAffinityRouter {
    fn name(&self) -> &'static str {
        "prefix_affinity"
    }

    /// Route one request:
    /// 1. fingerprint its leading blocks,
    /// 2. rendezvous-rank all instances for that fingerprint,
    /// 3. pick the lowest-drain-time instance among the top `fan_out`,
    /// 4. if all of those are overloaded, fall back to a global
    ///    min(drain + prefill-of-missed-tokens) search.
    ///
    /// Precondition: `instances` is non-empty (`ranked[0]` panics otherwise).
    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        _meta: &MetaStore,
        _now: f64,
    ) -> RouteDecision {
        let n = instances.len();
        let fp = Self::fingerprint(&req.hash_ids, self.prefix_k);

        // Rendezvous ranking: a stable, per-fingerprint ordering of instance
        // positions, sorted by score descending.
        let mut ranked: Vec<(u64, usize)> = (0..n)
            .map(|i| (Self::rendezvous_score(fp, instances[i].id), i))
            .collect();
        ranked.sort_unstable_by(|a, b| b.0.cmp(&a.0));

        // Predicted L0 prefix-hit block counts, parallel to `instances` by
        // slice position (the fallback phase below indexes it positionally).
        let scores = local_l0_scores(req, instances);

        // Candidate info for logging / fallback. Bug fix: previously this
        // indexed `scores` by `inst.id`, while the fallback phase indexes it
        // by slice position — the two only agree when instance ids happen to
        // equal positions. Use positional indexing consistently.
        let candidates: Vec<CandidateInfo> = instances
            .iter()
            .enumerate()
            .map(|(i, inst)| CandidateInfo {
                instance: inst.id,
                predicted_prefix: scores[i],
                load_blocks: inst.kv_blocks_used,
                queue_len: inst.queue_len(),
            })
            .collect();

        // Phase 1: among the top `fan_out` affinity instances, pick the one
        // with the lowest estimated drain time; queue length breaks ties.
        let top_k = self.fan_out.min(n);
        let mut best_idx = ranked[0].1;
        let mut best_drain = instances[best_idx].estimated_drain_time();
        let mut best_ql = instances[best_idx].queue_len();
        let mut all_overloaded = best_ql > self.overload_threshold;
        for &(_, idx) in &ranked[1..top_k] {
            let drain = instances[idx].estimated_drain_time();
            let ql = instances[idx].queue_len();
            if drain < best_drain || (drain == best_drain && ql < best_ql) {
                best_idx = idx;
                best_drain = drain;
                best_ql = ql;
            }
            if ql <= self.overload_threshold {
                all_overloaded = false;
            }
        }

        // Phase 2: every top candidate is overloaded — search the whole
        // cluster by estimated TTFT: drain time plus prefill time for the
        // tokens the instance's cache would miss.
        let reason;
        if all_overloaded {
            reason = "affinity fallback: min(drain+fetch)";
            let mut best_cost = f64::INFINITY;
            for &(_, idx) in ranked.iter() {
                let inst = &instances[idx];
                let drain = inst.estimated_drain_time();
                let miss_tokens = (req.hash_ids.len() as u32)
                    .saturating_sub(scores[idx])
                    .saturating_mul(inst.block_size_tokens);
                let cost = drain + inst.compute.prefill_time(miss_tokens);
                let ql = inst.queue_len();
                // First iteration always wins (cost < INFINITY), so the
                // phase-1 `best_ql` carried in never decides by itself.
                if cost < best_cost || (cost == best_cost && ql < best_ql) {
                    best_cost = cost;
                    best_idx = idx;
                    best_ql = ql;
                }
            }
        } else {
            reason = "prefix affinity: top-K min drain";
        }

        crate::router::local_route_decision(
            req.req_id,
            "prefix_affinity",
            instances[best_idx].id,
            0.0,
            candidates,
            reason,
        )
    }
}