// kvcache-simulator/src/router/estimated_ttft.rs

//! First-principles TTFT-estimate routing with calibrated compute and
//! tier-aware KV prepare costs.
//!
//! Estimates the actual time-to-first-token for each candidate instance:
//!
//! `TTFT(r,i) = drain(i) + scheduler + kv_prepare(r,i) + prefill(miss_i) + first_token_tail`

use crate::cluster::meta_store::MetaStore;
use crate::config::Config;
use crate::instance::Instance;
use crate::router::{CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
use crate::ttft::{classify_prefix_tiers, TtftModel};

/// Router that scores every candidate instance with an estimated
/// time-to-first-token and routes the request to the cheapest one.
pub struct EstimatedTtftRouter {
    /// Cost model queried for the scheduler overhead, KV prepare time and
    /// first-token tail terms of the estimate.
    ttft_model: TtftModel,
}

impl EstimatedTtftRouter {
    /// Creates a router whose TTFT model is built from the hardware and
    /// calibration sections of `config`, plus the KV block size in bytes
    /// reported by `config.model`.
    pub fn new(config: &Config) -> Self {
        let kv_block_bytes = config.model.kv_block_bytes();
        let ttft_model = TtftModel::new(&config.hardware, &config.calibration, kv_block_bytes);
        Self { ttft_model }
    }
}

impl Router for EstimatedTtftRouter {
    /// Identifier under which this routing policy is reported.
    fn name(&self) -> &'static str {
        "estimated_ttft"
    }

    /// Chooses the instance with the smallest estimated TTFT for `req`.
    ///
    /// For every instance the estimate is
    /// `drain + scheduler + kv_prepare + prefill(miss_tokens) + first_token_tail`,
    /// where the prefix residency comes from [`classify_prefix_tiers`] and
    /// `miss_tokens` is the number of missing blocks times the instance's
    /// block size in tokens (saturating on overflow).
    ///
    /// Exact cost ties are broken first by the shorter queue and then by the
    /// larger number of reusable prefix blocks (L0 + L1 + remote hits).
    ///
    /// Every instance is also recorded as a `CandidateInfo` and handed to
    /// `local_route_decision` together with the chosen instance id.
    ///
    /// If `instances` is empty, or no candidate ever beats the initial
    /// sentinel (for example when every cost is NaN), the decision names
    /// instance id `0`.
    fn route(
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
        meta: &MetaStore,
        now: f64,
    ) -> RouteDecision {
        let instance_count = instances.len();
        // NOTE(review): the literal `3` is forwarded unchanged; its meaning is
        // defined by `TtftModel::scheduler_overhead_s` — confirm there.
        let scheduler_s = self.ttft_model.scheduler_overhead_s(instance_count, 3);

        // Running winner, seeded with sentinels that any finite-cost
        // candidate will beat.
        let mut chosen: u32 = 0;
        let mut chosen_cost = f64::INFINITY;
        let mut chosen_queue = u32::MAX;
        let mut chosen_reuse = 0u32;
        let mut candidates = Vec::with_capacity(instance_count);

        for inst in instances {
            let residency = classify_prefix_tiers(&req.hash_ids, inst, meta, now);

            // Blocks that do not need to be recomputed, wherever they live.
            let reusable_blocks =
                residency.l0_hit_blocks + residency.l1_hit_blocks + residency.remote_hit_blocks;
            let miss_tokens = residency.miss_blocks.saturating_mul(inst.block_size_tokens);

            // Individual terms of the TTFT estimate for this instance.
            let drain_s = inst.estimated_drain_time();
            let kv_prepare_s = self.ttft_model.kv_prepare_time_s(residency);
            let tail_s = self.ttft_model.first_token_tail_s();
            let prefill_s = inst.compute.prefill_time(miss_tokens);
            let cost = drain_s + scheduler_s + kv_prepare_s + prefill_s + tail_s;

            let queue_len = inst.queue_len();
            candidates.push(CandidateInfo {
                instance: inst.id,
                predicted_prefix: reusable_blocks,
                load_blocks: inst.kv_blocks_used,
                queue_len,
            });

            // Lexicographic minimum over (cost, queue_len, -reusable_blocks).
            let wins_tie_break = queue_len < chosen_queue
                || (queue_len == chosen_queue && reusable_blocks > chosen_reuse);
            let improves = cost < chosen_cost || (cost == chosen_cost && wins_tie_break);

            if improves {
                chosen = inst.id;
                chosen_cost = cost;
                chosen_queue = queue_len;
                chosen_reuse = reusable_blocks;
            }
        }

        // NOTE(review): the `0.0` argument is passed through as in the
        // original; its role is defined by `local_route_decision`.
        crate::router::local_route_decision(
            req.req_id,
            "estimated_ttft",
            chosen,
            0.0,
            candidates,
            "argmin(drain + scheduler + kv_prepare + prefill + first_token_tail)",
        )
    }
}