// kvcache-simulator/src/router/cache_affinity.rs

//! Cache-affinity routing tuned for coding-agent workloads.
//!
//! Motivation — the coding trace has three dominant patterns:
//!
//! 1. **Short system-prompt-only requests** (≤10 blocks): novel per-chat but
//! sharing a small set of system prompts across millions of invocations.
//! 2. **Long multi-turn chains**: parent→child prefixes share ~60+ blocks
//! and grow by ~6 blocks per turn. Sticking the chain to one instance
//! maximises L0 hits for every subsequent turn.
//! 3. **Completely novel one-shots**: no existing cache anywhere; should be
//! placed to maximise *future* reuse, not just minimise current load.
//!
//! `cache_score` minimises `α·queue_len + β·miss_blocks`. With the shipping
//! defaults (α=1, β=0.1) a single extra queue position is worth ten extra
//! miss blocks, so short novel requests — the bulk of traffic — reduce to
//! pure least-loaded routing and scatter the same system prompt across
//! dozens of instances. Each scattered copy burns HBM that could have held a
//! different hot prefix, depressing the cluster-wide L0 hit-rate.
//!
//! `cache_affinity` fixes this with two changes:
//!
//! * **Strong cache weight** — cost is `α·queue_len − γ·l0_hit`, with
//! γ·input_blocks ≫ α, so any real L0 hit beats load-balancing. A soft
//! bonus (`δ·meta_only_hit`) still rewards instances that have the prefix
//! in L1/DRAM even when L0 is empty.
//!
//! * **Deterministic rendezvous tiebreak** — among instances that tie on
//! `(cost, hit, queue)`, we rank by `rendezvous(fingerprint, instance_id)`
//! where `fingerprint` is an FNV hash of the first few block hashes. This
//! turns cold routing from "first-found" (which piles on instance 0 until
//! it fills, then spills sequentially) into a consistent hash that maps
//! each distinct prefix to the *same* small set of homes. Repeat traffic
//! for that prefix therefore concentrates on its home, building a strong
//! L0 working set.
//!
//! Overload protection: if the rendezvous-chosen home already has
//! `queue_len > overload_threshold`, the load term dominates and the router
//! naturally spills to the next-best instance.
use crate::cluster::meta_store::MetaStore;
use crate::instance::Instance;
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
pub struct CacheAffinityRouter {
    /// Display name reported via `Router::name` and used in trace output.
    name: &'static str,
    /// α — cost contributed by each request already queued on an instance.
    load_alpha: f64,
    /// γ — reward per block genuinely resident in the instance's local L0.
    l0_gamma: f64,
    /// δ — reward per block reachable only through the meta-store (L1 / remote).
    meta_delta: f64,
    /// How many leading block hashes are folded into the prefix fingerprint.
    fingerprint_k: usize,
    /// `true`: break exact ties with the sticky rendezvous hash;
    /// `false`: first-found order (mirrors the baseline `cache_score` router).
    use_rendezvous: bool,
}
impl CacheAffinityRouter {
    /// Primary configuration: strong cache weights (γ=1.0, δ=0.25) with the
    /// deterministic rendezvous tiebreak enabled.
    pub fn new(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity",
            load_alpha,
            l0_gamma: 1.0,
            meta_delta: 0.25,
            // A fingerprint over zero blocks is useless; clamp to ≥ 1.
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: true,
        }
    }

    /// Ablation: cache_score-style weights (γ=0.1, δ=0) but keep rendezvous
    /// tiebreak. Isolates the contribution of deterministic sticky placement.
    pub fn weak_with_rendezvous(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity_weak_rend",
            load_alpha,
            l0_gamma: 0.1,
            meta_delta: 0.0,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: true,
        }
    }

    /// Ablation: strong cache weights (γ=1.0, δ=0.25) but first-found tiebreak
    /// instead of rendezvous. Isolates the contribution of reweighting alone.
    pub fn strong_no_rendezvous(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity_strong_only",
            load_alpha,
            l0_gamma: 1.0,
            meta_delta: 0.25,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: false,
        }
    }

    /// FNV-1a over the first `k` block hashes — identifies the prefix family
    /// (system-prompt + early agent context) that drives cache reuse.
    ///
    /// An empty `hash_ids` slice folds nothing and yields the FNV offset
    /// basis, which is still a deterministic fingerprint.
    fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
        // Fold in at most k (clamped to ≥ 1) leading hashes; equivalent to the
        // previous two-step `n`/`take` computation but without the dead
        // `take == 0` no-op branch (`fp ^= 0` changed nothing).
        let take = hash_ids.len().min(k.max(1));
        let mut fp: u64 = 0xcbf29ce484222325; // FNV-1a 64-bit offset basis
        for &h in &hash_ids[..take] {
            fp ^= h;
            fp = fp.wrapping_mul(0x100000001b3); // FNV-1a 64-bit prime
        }
        fp
    }

    /// Splitmix64-style rendezvous score for (fingerprint, instance_id).
    /// Uniform over u64; higher = preferred home.
    fn rendezvous(fp: u64, instance_id: u32) -> u64 {
        let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
        h = h.wrapping_add(0x9e3779b97f4a7c15);
        h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
        h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
        h ^ (h >> 31)
    }
}
impl Router for CacheAffinityRouter {
    fn name(&self) -> &'static str {
        self.name
    }

    /// Pick the instance minimising `α·queue_len − γ·l0_hit − δ·meta_only`,
    /// breaking exact ties by rendezvous hash (when enabled) so each cold
    /// prefix lands on a deterministic "home" instance.
    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        meta: &MetaStore,
        now: f64,
    ) -> RouteDecision {
        let n = instances.len();
        // Real L0 hits — blocks already resident on each instance.
        let l0 = local_l0_scores(req, instances);
        // Meta-store predicted prefix — includes L1/remote-reachable blocks.
        let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
        let fp = Self::fingerprint(&req.hash_ids, self.fingerprint_k);
        let mut candidates = Vec::with_capacity(n);
        let mut best_idx: usize = 0;
        let mut best_cost = f64::INFINITY;
        let mut best_hit = 0u32;
        let mut best_queue = u32::MAX;
        let mut best_rend: u64 = 0;
        for (i, inst) in instances.iter().enumerate() {
            let hit = l0[i];
            // meta_only = extra blocks reachable by RDMA/L1 beyond L0 hit.
            let meta_only = meta_scores[i].saturating_sub(hit);
            let q = inst.queue_len();
            // Cost to minimise — lower is better.
            //   load term:   α · queue_len
            //   cache term: −γ · l0_hit − δ · meta_only
            // Short novel prefixes yield hit=0 on every instance, so cost
            // reduces to α·q and the rendezvous tiebreak picks the home.
            let cost = self.load_alpha * q as f64
                - self.l0_gamma * hit as f64
                - self.meta_delta * meta_only as f64;
            let rend = Self::rendezvous(fp, inst.id);
            candidates.push(CandidateInfo {
                instance: inst.id,
                predicted_prefix: hit,
                load_blocks: inst.kv_blocks_used,
                queue_len: q,
            });
            // Tiebreak chain (descending preference):
            //   1. lowest cost
            //   2. highest hit (break cost ties toward real L0 work)
            //   3. lowest queue
            //   4. highest rendezvous (deterministic sticky home), optional
            let better = if cost < best_cost {
                true
            } else if cost > best_cost {
                false
            } else if hit > best_hit {
                true
            } else if hit < best_hit {
                false
            } else if q < best_queue {
                true
            } else if q > best_queue {
                false
            } else if self.use_rendezvous {
                rend > best_rend
            } else {
                // First-found wins on full tie (matches cache_score behaviour).
                false
            };
            if better {
                best_cost = cost;
                best_hit = hit;
                best_queue = q;
                best_rend = rend;
                best_idx = i;
            }
        }
        crate::router::local_route_decision(
            req.req_id,
            // Use the configured name so the ablation variants
            // (weak_with_rendezvous / strong_no_rendezvous) are labelled
            // correctly in traces; was hard-coded to "cache_affinity".
            self.name,
            instances[best_idx].id,
            0.0,
            candidates,
            "argmin(α·q − γ·l0_hit − δ·meta_only) + rendezvous tiebreak",
        )
    }
}