feat: update ttft modeling and add cache affinity
This commit is contained in:
217
src/router/cache_affinity.rs
Normal file
217
src/router/cache_affinity.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
//! Cache-affinity routing tuned for coding-agent workloads.
|
||||
//!
|
||||
//! Motivation — the coding trace has three dominant patterns:
|
||||
//!
|
||||
//! 1. **Short system-prompt-only requests** (≤10 blocks): novel per-chat but
|
||||
//! sharing a small set of system prompts across millions of invocations.
|
||||
//! 2. **Long multi-turn chains**: parent→child prefixes share ~60+ blocks
|
||||
//! and grow by ~6 blocks per turn. Sticking the chain to one instance
|
||||
//! maximises L0 hits for every subsequent turn.
|
||||
//! 3. **Completely novel one-shots**: no existing cache anywhere; should be
|
||||
//! placed to maximise *future* reuse, not just minimise current load.
|
||||
//!
|
||||
//! `cache_score` minimises `α·queue_len + β·miss_blocks`. With the shipping
|
||||
//! defaults (α=1, β=0.1) a single extra queue position is worth ten extra
|
||||
//! miss blocks, so short novel requests — the bulk of traffic — reduce to
|
||||
//! pure least-loaded routing and scatter the same system prompt across
|
||||
//! dozens of instances. Each scattered copy burns HBM that could have held a
|
||||
//! different hot prefix, depressing the cluster-wide L0 hit-rate.
|
||||
//!
|
||||
//! `cache_affinity` fixes this with two changes:
|
||||
//!
|
||||
//! * **Strong cache weight** — cost is `α·queue_len − γ·l0_hit`, with
|
||||
//! γ ≫ α·input_blocks, so any real L0 hit beats load-balancing. A soft
|
||||
//! bonus (`δ·meta_only_hit`) still rewards instances that have the prefix
|
||||
//! in L1/DRAM even when L0 is empty.
|
||||
//!
|
||||
//! * **Deterministic rendezvous tiebreak** — among instances that tie on
|
||||
//! `(cost, hit, queue)`, we rank by `rendezvous(fingerprint, instance_id)`
|
||||
//! where `fingerprint` is an FNV hash of the first few block hashes. This
|
||||
//! turns cold routing from "first-found" (which piles on instance 0 until
|
||||
//! it fills, then spills sequentially) into a consistent hash that maps
|
||||
//! each distinct prefix to the *same* small set of homes. Repeat traffic
|
||||
//! for that prefix therefore concentrates on its home, building a strong
|
||||
//! L0 working set.
|
||||
//!
|
||||
//! Overload protection: if the rendezvous-chosen home already has
|
||||
//! `queue_len > overload_threshold`, the load term dominates and the router
|
||||
//! naturally spills to the next-best instance.
|
||||
|
||||
use crate::cluster::meta_store::MetaStore;
|
||||
use crate::instance::Instance;
|
||||
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
|
||||
use crate::trace::RequestRecord;
|
||||
|
||||
/// Cache-affinity router: minimises `α·queue_len − γ·l0_hit − δ·meta_only`
/// per instance, breaking exact ties with a deterministic rendezvous hash of
/// the request's prefix fingerprint (see module docs for the motivation).
pub struct CacheAffinityRouter {
    /// Router display / trace name.
    name: &'static str,
    /// Weight on queue length (per queued request) — the α load term.
    load_alpha: f64,
    /// Reward per L0-hit block (real, locally cached) — the γ cache term.
    l0_gamma: f64,
    /// Reward per block present via meta-store but not in L0 (L1 / remote) —
    /// the δ soft bonus.
    meta_delta: f64,
    /// Number of leading block hashes folded into the prefix fingerprint.
    fingerprint_k: usize,
    /// Whether to break ties by rendezvous hash (sticky consistent placement)
    /// or by first-found order (matches cache_score behaviour).
    use_rendezvous: bool,
}
|
||||
|
||||
impl CacheAffinityRouter {
|
||||
pub fn new(load_alpha: f64) -> Self {
|
||||
Self {
|
||||
name: "cache_affinity",
|
||||
load_alpha,
|
||||
l0_gamma: 1.0,
|
||||
meta_delta: 0.25,
|
||||
fingerprint_k: 4,
|
||||
use_rendezvous: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Ablation: cache_score-style weights (γ=0.1, δ=0) but keep rendezvous
|
||||
/// tiebreak. Isolates the contribution of deterministic sticky placement.
|
||||
pub fn weak_with_rendezvous(load_alpha: f64) -> Self {
|
||||
Self {
|
||||
name: "cache_affinity_weak_rend",
|
||||
load_alpha,
|
||||
l0_gamma: 0.1,
|
||||
meta_delta: 0.0,
|
||||
fingerprint_k: 4,
|
||||
use_rendezvous: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Ablation: strong cache weights (γ=1.0, δ=0.25) but first-found tiebreak
|
||||
/// instead of rendezvous. Isolates the contribution of reweighting alone.
|
||||
pub fn strong_no_rendezvous(load_alpha: f64) -> Self {
|
||||
Self {
|
||||
name: "cache_affinity_strong_only",
|
||||
load_alpha,
|
||||
l0_gamma: 1.0,
|
||||
meta_delta: 0.25,
|
||||
fingerprint_k: 4,
|
||||
use_rendezvous: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// FNV-1a over the first `k` block hashes — identifies the prefix family
|
||||
/// (system-prompt + early agent context) that drives cache reuse.
|
||||
fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
|
||||
let n = hash_ids.len().min(k).max(1);
|
||||
let take = hash_ids.len().min(n);
|
||||
let mut fp: u64 = 0xcbf29ce484222325;
|
||||
for &h in &hash_ids[..take] {
|
||||
fp ^= h;
|
||||
fp = fp.wrapping_mul(0x100000001b3);
|
||||
}
|
||||
if take == 0 {
|
||||
// Empty request: still want a deterministic fingerprint (0).
|
||||
fp ^= 0;
|
||||
}
|
||||
fp
|
||||
}
|
||||
|
||||
/// Splitmix64-style rendezvous score for (fingerprint, instance_id).
|
||||
/// Uniform over u64; higher = preferred home.
|
||||
fn rendezvous(fp: u64, instance_id: u32) -> u64 {
|
||||
let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
|
||||
h = h.wrapping_add(0x9e3779b97f4a7c15);
|
||||
h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
|
||||
h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
|
||||
h ^ (h >> 31)
|
||||
}
|
||||
}
|
||||
|
||||
impl Router for CacheAffinityRouter {
|
||||
fn name(&self) -> &'static str {
|
||||
self.name
|
||||
}
|
||||
|
||||
fn route(
|
||||
&mut self,
|
||||
req: &RequestRecord,
|
||||
instances: &[Instance],
|
||||
meta: &MetaStore,
|
||||
now: f64,
|
||||
) -> RouteDecision {
|
||||
let n = instances.len();
|
||||
let l0 = local_l0_scores(req, instances);
|
||||
// Meta-store predicted prefix — includes L1/remote-reachable blocks.
|
||||
let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
|
||||
let fp = Self::fingerprint(&req.hash_ids, self.fingerprint_k);
|
||||
|
||||
let mut candidates = Vec::with_capacity(n);
|
||||
let mut best_idx: usize = 0;
|
||||
let mut best_cost = f64::INFINITY;
|
||||
let mut best_hit = 0u32;
|
||||
let mut best_queue = u32::MAX;
|
||||
let mut best_rend: u64 = 0;
|
||||
|
||||
for (i, inst) in instances.iter().enumerate() {
|
||||
let hit = l0[i];
|
||||
// meta_only = extra blocks reachable by RDMA/L1 beyond L0 hit.
|
||||
let meta_only = meta_scores[i].saturating_sub(hit);
|
||||
let q = inst.queue_len();
|
||||
|
||||
// Cost to minimise — lower is better.
|
||||
// load term: α · queue_len
|
||||
// cache term: − γ · l0_hit − δ · meta_only
|
||||
// Short novel prefixes yield hit=0 on every instance, so cost
|
||||
// reduces to α·q and the rendezvous tiebreak picks the home.
|
||||
let cost = self.load_alpha * q as f64
|
||||
- self.l0_gamma * hit as f64
|
||||
- self.meta_delta * meta_only as f64;
|
||||
let rend = Self::rendezvous(fp, inst.id);
|
||||
|
||||
candidates.push(CandidateInfo {
|
||||
instance: inst.id,
|
||||
predicted_prefix: hit,
|
||||
load_blocks: inst.kv_blocks_used,
|
||||
queue_len: q,
|
||||
});
|
||||
|
||||
// Tiebreak chain (descending preference):
|
||||
// 1. lowest cost
|
||||
// 2. highest hit (break cost ties toward real L0 work)
|
||||
// 3. lowest queue
|
||||
// 4. highest rendezvous (deterministic sticky home), optional
|
||||
let better = if cost < best_cost {
|
||||
true
|
||||
} else if cost > best_cost {
|
||||
false
|
||||
} else if hit > best_hit {
|
||||
true
|
||||
} else if hit < best_hit {
|
||||
false
|
||||
} else if q < best_queue {
|
||||
true
|
||||
} else if q > best_queue {
|
||||
false
|
||||
} else if self.use_rendezvous {
|
||||
rend > best_rend
|
||||
} else {
|
||||
// First-found wins on full tie (matches cache_score behaviour).
|
||||
false
|
||||
};
|
||||
|
||||
if better {
|
||||
best_cost = cost;
|
||||
best_hit = hit;
|
||||
best_queue = q;
|
||||
best_rend = rend;
|
||||
best_idx = i;
|
||||
}
|
||||
}
|
||||
|
||||
RouteDecision {
|
||||
req_id: req.req_id,
|
||||
mode: "cache_affinity",
|
||||
chosen: instances[best_idx].id,
|
||||
probe_overhead_s: 0.0,
|
||||
candidates,
|
||||
reason: "argmin(α·q − γ·l0_hit − δ·meta_only) + rendezvous tiebreak",
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user