Files
kvcache-simulator/src/router/estimated_ttft.rs

102 lines
3.2 KiB
Rust

//! First-principles TTFT-estimate routing with calibrated compute and
//! tier-aware KV prepare costs.
//!
//! Estimates the actual time-to-first-token for each candidate instance:
//!
//! `TTFT(r,i) = drain(i) + scheduler + kv_prepare(r,i) + prefill(miss_i) + first_token_tail`
use crate::cluster::meta_store::MetaStore;
use crate::config::Config;
use crate::instance::Instance;
use crate::router::{CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
use crate::ttft::{classify_prefix_tiers, TtftModel};
/// Router that scores every candidate instance by its estimated
/// time-to-first-token (TTFT) and routes the request to the cheapest one.
pub struct EstimatedTtftRouter {
    /// Cost model queried for the scheduler, KV-prepare and first-token-tail
    /// terms of the TTFT estimate.
    ttft_model: TtftModel,
}
impl EstimatedTtftRouter {
    /// Builds a router whose TTFT model is derived from `config`.
    ///
    /// The model is constructed from the hardware section, the calibration
    /// section, and the KV block size in bytes reported by
    /// `config.model.kv_block_bytes()`.
    pub fn new(config: &Config) -> Self {
        let hardware = &config.hardware;
        let calibration = &config.calibration;
        let kv_block_bytes = config.model.kv_block_bytes();

        let ttft_model = TtftModel::new(hardware, calibration, kv_block_bytes);
        Self { ttft_model }
    }
}
impl Router for EstimatedTtftRouter {
    /// Name under which this router reports its routing decisions.
    fn name(&self) -> &'static str {
        "estimated_ttft"
    }

    /// Routes `req` to the instance with the lowest estimated TTFT.
    ///
    /// For every instance the estimate is
    /// `drain + scheduler + kv_prepare + prefill(miss_tokens) + first_token_tail`.
    /// Exact cost ties are broken by the shorter queue, then by the larger
    /// number of reusable prefix blocks (L0 + L1 + remote hits). When all three
    /// keys are equal the earlier instance in `instances` wins.
    ///
    /// A candidate whose cost is NaN never compares as better and is therefore
    /// never selected. If `instances` is empty, the decision carries the
    /// default instance id `0` and an empty candidate list.
    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        meta: &MetaStore,
        now: f64,
    ) -> RouteDecision {
        let n = instances.len();

        // Terms that do not depend on the candidate instance are evaluated once.
        // NOTE(review): the literal `3` is forwarded unchanged; its meaning is
        // defined by `TtftModel::scheduler_overhead_s` — confirm and name it.
        let scheduler = self.ttft_model.scheduler_overhead_s(n, 3);
        let first_token_tail = self.ttft_model.first_token_tail_s();

        let mut best: u32 = 0;
        let mut best_cost = f64::INFINITY;
        let mut best_queue = u32::MAX;
        let mut best_reuse = 0u32;
        let mut candidates = Vec::with_capacity(n);

        for inst in instances {
            let residency = classify_prefix_tiers(&req.hash_ids, inst, meta, now);

            // Read everything needed from `residency` before handing it to the
            // model by value.
            let reusable =
                residency.l0_hit_blocks + residency.l1_hit_blocks + residency.remote_hit_blocks;
            let miss_tokens = residency.miss_blocks.saturating_mul(inst.block_size_tokens);

            // Queue drain time as reported by the instance itself.
            let drain = inst.estimated_drain_time();
            let kv_prepare = self.ttft_model.kv_prepare_time_s(residency);
            let cost = drain
                + scheduler
                + kv_prepare
                + inst.compute.prefill_time(miss_tokens)
                + first_token_tail;

            let ql = inst.queue_len();
            candidates.push(CandidateInfo {
                instance: inst.id,
                predicted_prefix: reusable,
                load_blocks: inst.kv_blocks_used,
                queue_len: ql,
            });

            // Minimise (cost, queue_len, -reusable_blocks). The exact float
            // equality is deliberate: only bit-for-bit cost ties fall through
            // to the secondary keys.
            let better = cost < best_cost
                || (cost == best_cost
                    && (ql < best_queue || (ql == best_queue && reusable > best_reuse)));
            if better {
                best_cost = cost;
                best = inst.id;
                best_queue = ql;
                best_reuse = reusable;
            }
        }

        crate::router::local_route_decision(
            req.req_id,
            self.name(),
            best,
            0.0,
            candidates,
            "argmin(drain + scheduler + kv_prepare + prefill + first_token_tail)",
        )
    }
}