//! First-principles TTFT-estimate routing with calibrated compute and //! tier-aware KV prepare costs. //! //! Estimates the actual time-to-first-token for each candidate instance: //! //! `TTFT(r,i) = drain(i) + scheduler + kv_prepare(r,i) + prefill(miss_i) + first_token_tail` use crate::cluster::meta_store::MetaStore; use crate::config::Config; use crate::instance::Instance; use crate::router::{CandidateInfo, RouteDecision, Router}; use crate::trace::RequestRecord; use crate::ttft::{classify_prefix_tiers, TtftModel}; pub struct EstimatedTtftRouter { ttft_model: TtftModel, } impl EstimatedTtftRouter { pub fn new(config: &Config) -> Self { Self { ttft_model: TtftModel::new( &config.hardware, &config.calibration, config.model.kv_block_bytes(), ), } } } impl Router for EstimatedTtftRouter { fn name(&self) -> &'static str { "estimated_ttft" } fn route( &mut self, req: &RequestRecord, instances: &[Instance], meta: &MetaStore, now: f64, ) -> RouteDecision { let scheduler = self.ttft_model.scheduler_overhead_s(instances.len(), 3); let n = instances.len(); let mut best: u32 = 0; let mut best_cost = f64::INFINITY; let mut best_queue = u32::MAX; let mut best_reuse = 0u32; let mut candidates = Vec::with_capacity(n); for inst in instances { let residency = classify_prefix_tiers(&req.hash_ids, inst, meta, now); // 1. Exact queue drain time (architecture-aware, per-request sum). let drain = inst.estimated_drain_time(); let miss_tokens = residency.miss_blocks.saturating_mul(inst.block_size_tokens); let kv_prepare = self.ttft_model.kv_prepare_time_s(residency); let first_token_tail = self.ttft_model.first_token_tail_s(); let cost = drain + scheduler + kv_prepare + inst.compute.prefill_time(miss_tokens) + first_token_tail; candidates.push(CandidateInfo { instance: inst.id, predicted_prefix: residency.l0_hit_blocks + residency.l1_hit_blocks + residency.remote_hit_blocks, load_blocks: inst.kv_blocks_used, queue_len: inst.queue_len(), }); // Minimise (cost, queue_len, -local_prefix). let ql = inst.queue_len(); let reusable = residency.l0_hit_blocks + residency.l1_hit_blocks + residency.remote_hit_blocks; let better = cost < best_cost || (cost == best_cost && ql < best_queue) || (cost == best_cost && ql == best_queue && reusable > best_reuse); if better { best_cost = cost; best = inst.id; best_queue = ql; best_reuse = reusable; } } crate::router::local_route_decision( req.req_id, "estimated_ttft", best, 0.0, candidates, "argmin(drain + scheduler + kv_prepare + prefill + first_token_tail)", ) } }