feat: update ttft modeling and add cache affinity
This commit is contained in:
@@ -1,30 +1,30 @@
|
||||
//! First-principles TTFT-estimate routing using local L0 hits only.
|
||||
//! First-principles TTFT-estimate routing with calibrated compute and
|
||||
//! tier-aware KV prepare costs.
|
||||
//!
|
||||
//! Estimates the actual time-to-first-token for each candidate instance:
|
||||
//!
|
||||
//! `TTFT(r,i) = drain(i) + prefill(local_l0_miss_i)`
|
||||
//!
|
||||
//! - **drain** — exact queue drain time: sum of per-request `prefill_time()`
|
||||
//! using the architecture-aware compute model (quadratic / DSA).
|
||||
//!
|
||||
//! - **prefill** — compute for tokens whose blocks are absent from the
|
||||
//! instance's current L0 cache.
|
||||
//!
|
||||
//! L1 / remote reuse can still reduce execution-time misses later in the
|
||||
//! cluster fetch chain, but they are not counted as `kvcache hit` when
|
||||
//! comparing routing candidates.
|
||||
//! `TTFT(r,i) = drain(i) + scheduler + kv_prepare(r,i) + prefill(miss_i) + first_token_tail`
|
||||
|
||||
use crate::cluster::meta_store::MetaStore;
|
||||
use crate::config::Config;
|
||||
use crate::instance::Instance;
|
||||
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
|
||||
use crate::router::{CandidateInfo, RouteDecision, Router};
|
||||
use crate::trace::RequestRecord;
|
||||
use crate::ttft::{classify_prefix_tiers, TtftModel};
|
||||
|
||||
pub struct EstimatedTtftRouter;
|
||||
pub struct EstimatedTtftRouter {
|
||||
ttft_model: TtftModel,
|
||||
}
|
||||
|
||||
impl EstimatedTtftRouter {
|
||||
pub fn new(_config: &Config) -> Self {
|
||||
Self
|
||||
pub fn new(config: &Config) -> Self {
|
||||
Self {
|
||||
ttft_model: TtftModel::new(
|
||||
&config.hardware,
|
||||
&config.calibration,
|
||||
config.model.kv_block_bytes(),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,50 +37,51 @@ impl Router for EstimatedTtftRouter {
|
||||
&mut self,
|
||||
req: &RequestRecord,
|
||||
instances: &[Instance],
|
||||
_meta: &MetaStore,
|
||||
_now: f64,
|
||||
meta: &MetaStore,
|
||||
now: f64,
|
||||
) -> RouteDecision {
|
||||
let scheduler = self.ttft_model.scheduler_overhead_s(instances.len(), 3);
|
||||
let n = instances.len();
|
||||
let scores = local_l0_scores(req, instances);
|
||||
let input_blocks = req.hash_ids.len() as u32;
|
||||
|
||||
let mut best: u32 = 0;
|
||||
let mut best_cost = f64::INFINITY;
|
||||
let mut best_queue = u32::MAX;
|
||||
let mut best_local = 0u32;
|
||||
let mut best_reuse = 0u32;
|
||||
let mut candidates = Vec::with_capacity(n);
|
||||
|
||||
for inst in instances {
|
||||
let i = inst.id as usize;
|
||||
let local_prefix = scores[i];
|
||||
let residency = classify_prefix_tiers(&req.hash_ids, inst, meta, now);
|
||||
|
||||
// 1. Exact queue drain time (architecture-aware, per-request sum).
|
||||
let drain = inst.estimated_drain_time();
|
||||
|
||||
// 2. Tier-aware KV prepare cost plus prefill compute for the miss blocks.
|
||||
let miss_tokens = input_blocks
|
||||
.saturating_sub(local_prefix)
|
||||
.saturating_mul(inst.block_size_tokens);
|
||||
let cost = drain + inst.compute.prefill_time(miss_tokens);
|
||||
let miss_tokens = residency.miss_blocks.saturating_mul(inst.block_size_tokens);
|
||||
let kv_prepare = self.ttft_model.kv_prepare_time_s(residency);
|
||||
let first_token_tail = self.ttft_model.first_token_tail_s();
|
||||
let cost =
|
||||
drain + scheduler + kv_prepare + inst.compute.prefill_time(miss_tokens) + first_token_tail;
|
||||
|
||||
candidates.push(CandidateInfo {
|
||||
instance: inst.id,
|
||||
predicted_prefix: local_prefix,
|
||||
predicted_prefix: residency.l0_hit_blocks
|
||||
+ residency.l1_hit_blocks
|
||||
+ residency.remote_hit_blocks,
|
||||
load_blocks: inst.kv_blocks_used,
|
||||
queue_len: inst.queue_len(),
|
||||
});
|
||||
|
||||
// Minimise (cost, queue_len, -reusable blocks), where reusable = L0 + L1 + remote hits.
|
||||
let ql = inst.queue_len();
|
||||
let reusable = residency.l0_hit_blocks + residency.l1_hit_blocks + residency.remote_hit_blocks;
|
||||
let better = cost < best_cost
|
||||
|| (cost == best_cost && ql < best_queue)
|
||||
|| (cost == best_cost && ql == best_queue && local_prefix > best_local);
|
||||
|| (cost == best_cost && ql == best_queue && reusable > best_reuse);
|
||||
|
||||
if better {
|
||||
best_cost = cost;
|
||||
best = inst.id;
|
||||
best_queue = ql;
|
||||
best_local = local_prefix;
|
||||
best_reuse = reusable;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,7 +91,7 @@ impl Router for EstimatedTtftRouter {
|
||||
chosen: best,
|
||||
probe_overhead_s: 0.0,
|
||||
candidates,
|
||||
reason: "argmin(drain_time + local-L0-miss prefill_time)",
|
||||
reason: "argmin(drain + scheduler + kv_prepare + prefill + first_token_tail)",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user