//! Cache-affinity routing tuned for coding-agent workloads.
//!
//! Motivation — the coding trace has three dominant patterns:
//!
//! 1. **Short system-prompt-only requests** (≤10 blocks): novel per-chat but
//!    sharing a small set of system prompts across millions of invocations.
//! 2. **Long multi-turn chains**: parent→child prefixes share ~60+ blocks
//!    and grow by ~6 blocks per turn. Sticking the chain to one instance
//!    maximises L0 hits for every subsequent turn.
//! 3. **Completely novel one-shots**: no existing cache anywhere; should be
//!    placed to maximise *future* reuse, not just minimise current load.
//!
//! `cache_score` minimises `α·queue_len + β·miss_blocks`. With the shipping
//! defaults (α=1, β=0.1) a single extra queue position is worth ten extra
//! miss blocks, so short novel requests — the bulk of traffic — reduce to
//! pure least-loaded routing and scatter the same system prompt across
//! dozens of instances. Each scattered copy burns HBM that could have held a
//! different hot prefix, depressing the cluster-wide L0 hit-rate.
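//!
//! Worked example (illustrative numbers): a 6-block system-prompt request
//! arrives while instance A holds all 6 blocks in L0 at `queue_len = 2` and
//! instance B is cold at `queue_len = 1`. Under cache_score,
//! `cost_A = 1·2 + 0.1·0 = 2.0` versus `cost_B = 1·1 + 0.1·6 = 1.6`, so the
//! cold instance wins and the prompt is re-cached there.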
//!
//! `cache_affinity` fixes this with two changes (a worked comparison follows
//! the list):
//!
//! * **Strong cache weight** — cost is `α·queue_len − γ·l0_hit`, with
//!   γ ≫ α·input_blocks, so any real L0 hit beats load-balancing. A soft
//!   bonus (`δ·meta_only_hit`) still rewards instances that have the prefix
//!   in L1/DRAM even when L0 is empty.
//!
//! * **Deterministic rendezvous tiebreak** — among instances that tie on
//!   `(cost, hit, queue)`, we rank by `rendezvous(fingerprint, instance_id)`
//!   where `fingerprint` is an FNV hash of the first few block hashes. This
//!   turns cold routing from "first-found" (which piles on instance 0 until
//!   it fills, then spills sequentially) into a consistent hash that maps
//!   each distinct prefix to the *same* small set of homes. Repeat traffic
//!   for that prefix therefore concentrates on its home, building a strong
//!   L0 working set.
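//!
//! Revisiting the same example under the strong weights (taking α = 1,
//! γ = 1.0, δ = 0.25): `cost_A = 1·2 − 1.0·6 = −4.0` versus
//! `cost_B = 1·1 − 1.0·0 = 1.0`, so the warm instance now wins comfortably;
//! two cold instances that tie at `cost = α·q` instead fall through to the
//! rendezvous tiebreak.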
//!
//! Overload protection is implicit: rendezvous only breaks exact cost ties,
//! so once the chosen home's `queue_len` exceeds what its cache bonus can
//! cover, the load term dominates and the router naturally spills to the
//! next-best instance.
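//!
//! For example (illustrative numbers): a cold prefix whose rendezvous home
//! sits at `queue_len = 5` while another instance sits at `queue_len = 4`
//! costs `5α` versus `4α`, so the home loses outright and the request spills.
//!
//! A minimal construction sketch (illustration only; the harness that feeds
//! requests through the `Router` trait is assumed, not shown):
//!
//! ```ignore
//! // α = 1.0 per queue position; fingerprint over the first 4 block hashes.
//! let main = CacheAffinityRouter::new(1.0, 4);
//! // Ablations isolating each of the two changes described above.
//! let weak = CacheAffinityRouter::weak_with_rendezvous(1.0, 4);
//! let strong = CacheAffinityRouter::strong_no_rendezvous(1.0, 4);
//! ```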

use crate::cluster::meta_store::MetaStore;
use crate::instance::Instance;
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
pub struct CacheAffinityRouter {
    /// Router display / trace name.
    name: &'static str,
    /// Weight on queue length (per queued request).
    load_alpha: f64,
    /// Reward per L0-hit block (real, locally cached).
    l0_gamma: f64,
    /// Reward per block present via meta-store but not in L0 (L1 / remote).
    meta_delta: f64,
    /// Number of leading block hashes folded into the prefix fingerprint.
    fingerprint_k: usize,
    /// Whether to break ties by rendezvous hash (sticky consistent placement)
    /// or by first-found order (matches cache_score behaviour).
    use_rendezvous: bool,
}
impl CacheAffinityRouter {
    /// Shipping configuration: strong cache weights (γ=1.0, δ=0.25) with
    /// rendezvous tiebreak.
    pub fn new(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity",
            load_alpha,
            l0_gamma: 1.0,
            meta_delta: 0.25,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: true,
        }
    }
    /// Ablation: cache_score-style weights (γ=0.1, δ=0) but keep rendezvous
    /// tiebreak. Isolates the contribution of deterministic sticky placement.
    pub fn weak_with_rendezvous(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity_weak_rend",
            load_alpha,
            l0_gamma: 0.1,
            meta_delta: 0.0,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: true,
        }
    }
    /// Ablation: strong cache weights (γ=1.0, δ=0.25) but first-found tiebreak
    /// instead of rendezvous. Isolates the contribution of reweighting alone.
    pub fn strong_no_rendezvous(load_alpha: f64, fingerprint_k: usize) -> Self {
        Self {
            name: "cache_affinity_strong_only",
            load_alpha,
            l0_gamma: 1.0,
            meta_delta: 0.25,
            fingerprint_k: fingerprint_k.max(1),
            use_rendezvous: false,
        }
    }
    /// FNV-1a over the first `k` block hashes — identifies the prefix family
    /// (system-prompt + early agent context) that drives cache reuse.
    /// An empty request hashes nothing and deterministically returns the
    /// FNV-1a offset basis.
    fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
        let take = hash_ids.len().min(k.max(1));
        let mut fp: u64 = 0xcbf29ce484222325;
        for &h in &hash_ids[..take] {
            fp ^= h;
            fp = fp.wrapping_mul(0x100000001b3);
        }
        fp
    }
    /// Splitmix64-style rendezvous score for (fingerprint, instance_id).
    /// Uniform over u64; higher = preferred home.
    fn rendezvous(fp: u64, instance_id: u32) -> u64 {
        let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
        h = h.wrapping_add(0x9e3779b97f4a7c15);
        h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
        h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
        h ^ (h >> 31)
    }
}
impl Router for CacheAffinityRouter {
    fn name(&self) -> &'static str {
        self.name
    }
    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        meta: &MetaStore,
        now: f64,
    ) -> RouteDecision {
        let n = instances.len();
        let l0 = local_l0_scores(req, instances);
        // Meta-store predicted prefix — includes L1/remote-reachable blocks.
        let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
        let fp = Self::fingerprint(&req.hash_ids, self.fingerprint_k);
        let mut candidates = Vec::with_capacity(n);
        let mut best_idx: usize = 0;
        let mut best_cost = f64::INFINITY;
        let mut best_hit = 0u32;
        let mut best_queue = u32::MAX;
        let mut best_rend: u64 = 0;
        for (i, inst) in instances.iter().enumerate() {
            let hit = l0[i];
            // meta_only = extra blocks reachable by RDMA/L1 beyond the L0 hit.
            let meta_only = meta_scores[i].saturating_sub(hit);
            let q = inst.queue_len();

            // Cost to minimise — lower is better.
            //   load term:   α · queue_len
            //   cache term: −γ · l0_hit − δ · meta_only
            // Short novel prefixes yield hit = 0 on every instance, so cost
            // reduces to α·q and the rendezvous tiebreak picks the home.
            let cost = self.load_alpha * q as f64
                - self.l0_gamma * hit as f64
                - self.meta_delta * meta_only as f64;
            let rend = Self::rendezvous(fp, inst.id);
            candidates.push(CandidateInfo {
                instance: inst.id,
                predicted_prefix: hit,
                load_blocks: inst.kv_blocks_used,
                queue_len: q,
            });
            // Tiebreak chain (descending preference):
            //   1. lowest cost
            //   2. highest hit (break cost ties toward real L0 work)
            //   3. lowest queue
            //   4. highest rendezvous (deterministic sticky home), optional
            let better = if cost < best_cost {
                true
            } else if cost > best_cost {
                false
            } else if hit > best_hit {
                true
            } else if hit < best_hit {
                false
            } else if q < best_queue {
                true
            } else if q > best_queue {
                false
            } else if self.use_rendezvous {
                rend > best_rend
            } else {
                // First-found wins on full tie (matches cache_score behaviour).
                false
            };
            if better {
                best_cost = cost;
                best_hit = hit;
                best_queue = q;
                best_rend = rend;
                best_idx = i;
            }
        }
        crate::router::local_route_decision(
            req.req_id,
            // Report the variant's own name so ablation traces stay
            // distinguishable (the literal "cache_affinity" mislabelled them).
            self.name,
            instances[best_idx].id,
            0.0,
            candidates,
            "argmin(α·q − γ·l0_hit − δ·meta_only) + rendezvous tiebreak",
        )
    }
}
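
// A minimal test sketch (not in the original file) exercising the two pure
// helpers above. It assumes nothing beyond this module: no Instance,
// MetaStore, or trace fixtures are constructed.
#[cfg(test)]
mod tests {
    use super::CacheAffinityRouter;

    /// The fingerprint must depend only on the first `k` block hashes, so
    /// every turn of a chain sharing that prefix maps to the same home.
    #[test]
    fn fingerprint_ignores_blocks_beyond_k() {
        let a = CacheAffinityRouter::fingerprint(&[1, 2, 3, 4], 3);
        let b = CacheAffinityRouter::fingerprint(&[1, 2, 3, 99], 3);
        assert_eq!(a, b, "suffix beyond k must not change the fingerprint");
        let c = CacheAffinityRouter::fingerprint(&[9, 2, 3, 4], 3);
        assert_ne!(a, c, "a different leading block should change it");
    }

    /// Rendezvous hashing's consistency property: removing a losing
    /// instance never moves a prefix's home.
    #[test]
    fn rendezvous_home_survives_removal_of_losers() {
        let fp = CacheAffinityRouter::fingerprint(&[7, 8, 9], 3);
        let ids: Vec<u32> = (0..8).collect();
        let home = *ids
            .iter()
            .max_by_key(|&&id| CacheAffinityRouter::rendezvous(fp, id))
            .unwrap();
        let survivors: Vec<u32> =
            ids.into_iter().filter(|&id| id != (home + 1) % 8).collect();
        let home2 = *survivors
            .iter()
            .max_by_key(|&&id| CacheAffinityRouter::rendezvous(fp, id))
            .unwrap();
        assert_eq!(home, home2);
    }

    /// Executable restatement of the module docs' worked comparison: under
    /// the strong weights a full 6-block L0 hit outweighs one extra queue
    /// position.
    #[test]
    fn strong_weights_prefer_warm_instance() {
        let (alpha, gamma, delta) = (1.0_f64, 1.0_f64, 0.25_f64);
        let cost = |q: u32, hit: u32, meta_only: u32| {
            alpha * q as f64 - gamma * hit as f64 - delta * meta_only as f64
        };
        // Warm instance: queue 2, full 6-block hit → −4.0; cold: queue 1 → 1.0.
        assert!(cost(2, 6, 0) < cost(1, 0, 0));
    }
}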