feat: update ttft modeling and add cache affinity

2026-04-15 19:08:10 +08:00
parent ff316c6873
commit a3f386c858
15 changed files with 1276 additions and 99 deletions


@@ -0,0 +1,217 @@
//! Cache-affinity routing tuned for coding-agent workloads.
//!
//! Motivation — the coding trace has three dominant patterns:
//!
//! 1. **Short system-prompt-only requests** (≤10 blocks): novel per-chat but
//! sharing a small set of system prompts across millions of invocations.
//! 2. **Long multi-turn chains**: parent→child prefixes share ~60+ blocks
//! and grow by ~6 blocks per turn. Sticking the chain to one instance
//! maximises L0 hits for every subsequent turn.
//! 3. **Completely novel one-shots**: no existing cache anywhere; should be
//! placed to maximise *future* reuse, not just minimise current load.
//!
//! `cache_score` minimises `α·queue_len + β·miss_blocks`. With the shipping
//! defaults (α=1, β=0.1) a single extra queue position is worth ten extra
//! miss blocks, so short novel requests — the bulk of traffic — reduce to
//! pure least-loaded routing and scatter the same system prompt across
//! dozens of instances. Each scattered copy burns HBM that could have held a
//! different hot prefix, depressing the cluster-wide L0 hit-rate.
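//!
//! Concretely, with α=1 and β=0.1: an instance holding a fully cached
//! 8-block system prompt costs `q`, while a cold instance with one fewer
//! queued request costs `(q - 1) + 0.1·8 = q - 0.2`. The cold instance
//! wins, and the shared prompt is re-cached there for no cluster-wide gain.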
//!
//! `cache_affinity` fixes this with two changes:
//!
//! * **Strong cache weight** — cost is `α·queue_len - γ·l0_hit - δ·meta_only`,
//!   with γ comparable to α per block (γ=1.0 by default), so any real L0
//!   hit (tens of blocks for a chain turn) beats load-balancing. A soft
//!   bonus (`δ·meta_only`) still rewards instances that have the prefix
//!   in L1/DRAM even when L0 is empty. A worked example closes this
//!   comment.
//!
//! * **Deterministic rendezvous tiebreak** — among instances that tie on
//! `(cost, hit, queue)`, we rank by `rendezvous(fingerprint, instance_id)`
//! where `fingerprint` is an FNV hash of the first few block hashes. This
//! turns cold routing from "first-found" (which piles on instance 0 until
//! it fills, then spills sequentially) into a consistent hash that maps
//! each distinct prefix to the *same* small set of homes. Repeat traffic
//! for that prefix therefore concentrates on its home, building a strong
//! L0 working set.
//!
//! Overload protection falls out of the cost itself: the rendezvous
//! tiebreak only applies on exact `(cost, hit, queue)` ties, so once the
//! rendezvous-chosen home's queue grows past its peers, the load term
//! dominates and the router naturally spills to the next-best instance.
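//!
//! Worked example, reusing the scenario above with γ=1.0 and δ=0.25: the
//! cached home now costs `q - 8` while a cold instance at queue `q'` costs
//! `q'`, so the home keeps the request until its queue runs more than
//! eight positions longer (at a gap of exactly eight the hit tiebreak
//! still favours it). A meta-only (L1/DRAM) copy is worth `0.25·8 = 2`
//! queue positions, a softer pull than a real L0 hit.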
use crate::cluster::meta_store::MetaStore;
use crate::instance::Instance;
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
pub struct CacheAffinityRouter {
/// Router display / trace name.
name: &'static str,
/// Weight on queue length (per queued request).
load_alpha: f64,
/// Reward per L0-hit block (real, locally cached).
l0_gamma: f64,
/// Reward per block present via meta-store but not in L0 (L1 / remote).
meta_delta: f64,
/// Number of leading block hashes folded into the prefix fingerprint.
fingerprint_k: usize,
/// Whether to break ties by rendezvous hash (sticky consistent placement)
/// or by first-found order (matches cache_score behaviour).
use_rendezvous: bool,
}
impl CacheAffinityRouter {
pub fn new(load_alpha: f64) -> Self {
Self {
name: "cache_affinity",
load_alpha,
l0_gamma: 1.0,
meta_delta: 0.25,
fingerprint_k: 4,
use_rendezvous: true,
}
}
/// Ablation: cache_score-style weights (γ=0.1, δ=0) but keep rendezvous
/// tiebreak. Isolates the contribution of deterministic sticky placement.
pub fn weak_with_rendezvous(load_alpha: f64) -> Self {
Self {
name: "cache_affinity_weak_rend",
load_alpha,
l0_gamma: 0.1,
meta_delta: 0.0,
fingerprint_k: 4,
use_rendezvous: true,
}
}
/// Ablation: strong cache weights (γ=1.0, δ=0.25) but first-found tiebreak
/// instead of rendezvous. Isolates the contribution of reweighting alone.
pub fn strong_no_rendezvous(load_alpha: f64) -> Self {
Self {
name: "cache_affinity_strong_only",
load_alpha,
l0_gamma: 1.0,
meta_delta: 0.25,
fingerprint_k: 4,
use_rendezvous: false,
}
}
    /// FNV-1a-style fold over the first `k` block hashes — identifies the
    /// prefix family (system-prompt + early agent context) that drives
    /// cache reuse.
    fn fingerprint(hash_ids: &[u64], k: usize) -> u64 {
        // Fold at least one hash when available; an empty request keeps
        // the FNV offset basis as its deterministic fingerprint.
        let take = hash_ids.len().min(k.max(1));
        let mut fp: u64 = 0xcbf29ce484222325;
        for &h in &hash_ids[..take] {
            fp ^= h;
            fp = fp.wrapping_mul(0x100000001b3);
        }
        fp
    }
/// Splitmix64-style rendezvous score for (fingerprint, instance_id).
/// Uniform over u64; higher = preferred home.
fn rendezvous(fp: u64, instance_id: u32) -> u64 {
let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15);
h = h.wrapping_add(0x9e3779b97f4a7c15);
h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
h ^ (h >> 31)
}
}
impl Router for CacheAffinityRouter {
fn name(&self) -> &'static str {
self.name
}
fn route(
&mut self,
req: &RequestRecord,
instances: &[Instance],
meta: &MetaStore,
now: f64,
) -> RouteDecision {
let n = instances.len();
let l0 = local_l0_scores(req, instances);
// Meta-store predicted prefix — includes L1/remote-reachable blocks.
let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
let fp = Self::fingerprint(&req.hash_ids, self.fingerprint_k);
let mut candidates = Vec::with_capacity(n);
let mut best_idx: usize = 0;
let mut best_cost = f64::INFINITY;
let mut best_hit = 0u32;
let mut best_queue = u32::MAX;
let mut best_rend: u64 = 0;
for (i, inst) in instances.iter().enumerate() {
let hit = l0[i];
// meta_only = extra blocks reachable by RDMA/L1 beyond L0 hit.
let meta_only = meta_scores[i].saturating_sub(hit);
let q = inst.queue_len();
            // Cost to minimise — lower is better:
            //   load term:  α · queue_len
            //   cache term: -γ · l0_hit - δ · meta_only
// Short novel prefixes yield hit=0 on every instance, so cost
// reduces to α·q and the rendezvous tiebreak picks the home.
let cost = self.load_alpha * q as f64
- self.l0_gamma * hit as f64
- self.meta_delta * meta_only as f64;
let rend = Self::rendezvous(fp, inst.id);
candidates.push(CandidateInfo {
instance: inst.id,
predicted_prefix: hit,
load_blocks: inst.kv_blocks_used,
queue_len: q,
});
// Tiebreak chain (descending preference):
// 1. lowest cost
// 2. highest hit (break cost ties toward real L0 work)
// 3. lowest queue
// 4. highest rendezvous (deterministic sticky home), optional
let better = if cost < best_cost {
true
} else if cost > best_cost {
false
} else if hit > best_hit {
true
} else if hit < best_hit {
false
} else if q < best_queue {
true
} else if q > best_queue {
false
} else if self.use_rendezvous {
rend > best_rend
} else {
// First-found wins on full tie (matches cache_score behaviour).
false
};
if better {
best_cost = cost;
best_hit = hit;
best_queue = q;
best_rend = rend;
best_idx = i;
}
}
RouteDecision {
req_id: req.req_id,
mode: "cache_affinity",
chosen: instances[best_idx].id,
probe_overhead_s: 0.0,
candidates,
reason: "argmin(α·q γ·l0_hit δ·meta_only) + rendezvous tiebreak",
}
}
}
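
// Illustrative unit tests (a sketch, not part of the routing API): they
// exercise only the pure hashing helpers above, so they need no cluster,
// meta-store, or trace fixtures.
#[cfg(test)]
mod tests {
    use super::*;

    // The shipped variants should carry their documented weight settings;
    // load_alpha = 1.0 mirrors the cache_score default from the module docs.
    #[test]
    fn variants_construct_with_documented_weights() {
        assert_eq!(CacheAffinityRouter::new(1.0).l0_gamma, 1.0);
        assert_eq!(CacheAffinityRouter::weak_with_rendezvous(1.0).l0_gamma, 0.1);
        assert!(!CacheAffinityRouter::strong_no_rendezvous(1.0).use_rendezvous);
    }

    // The same fingerprint must elect the same rendezvous winner no matter
    // the order in which candidate instances are scanned.
    #[test]
    fn rendezvous_winner_is_order_independent() {
        let fp = CacheAffinityRouter::fingerprint(&[1, 2, 3, 4, 5], 4);
        let winner = |ids: &[u32]| {
            ids.iter()
                .copied()
                .max_by_key(|&id| CacheAffinityRouter::rendezvous(fp, id))
                .unwrap()
        };
        assert_eq!(winner(&[0, 1, 2, 3]), winner(&[3, 2, 1, 0]));
    }

    // Only the first `fingerprint_k` hashes are folded, so two chats that
    // share a system prompt but diverge later map to the same home.
    #[test]
    fn fingerprint_ignores_blocks_beyond_k() {
        let a = CacheAffinityRouter::fingerprint(&[7, 8, 9, 10, 111], 4);
        let b = CacheAffinityRouter::fingerprint(&[7, 8, 9, 10, 222], 4);
        assert_eq!(a, b);
    }
}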