feat: update ttft modeling and add cache affinity

This commit is contained in:
2026-04-15 19:08:10 +08:00
parent ff316c6873
commit a3f386c858
15 changed files with 1276 additions and 99 deletions

View File

@@ -1,30 +1,30 @@
//! First-principles TTFT-estimate routing using local L0 hits only.
//! First-principles TTFT-estimate routing with calibrated compute and
//! tier-aware KV prepare costs.
//!
//! Estimates the actual time-to-first-token for each candidate instance:
//!
//! `TTFT(r,i) = drain(i) + prefill(local_l0_miss_i)`
//!
//! - **drain** — exact queue drain time: sum of per-request `prefill_time()`
//! using the architecture-aware compute model (quadratic / DSA).
//!
//! - **prefill** — compute for tokens whose blocks are absent from the
//! instance's current L0 cache.
//!
//! L1 / remote reuse can still reduce execution-time misses later in the
//! cluster fetch chain, but it is not counted as a `kvcache hit` when
//! comparing routing candidates.
//! `TTFT(r,i) = drain(i) + scheduler + kv_prepare(r,i) + prefill(miss_i) + first_token_tail`
use crate::cluster::meta_store::MetaStore;
use crate::config::Config;
use crate::instance::Instance;
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
use crate::router::{CandidateInfo, RouteDecision, Router};
use crate::trace::RequestRecord;
use crate::ttft::{classify_prefix_tiers, TtftModel};
pub struct EstimatedTtftRouter;
pub struct EstimatedTtftRouter {
ttft_model: TtftModel,
}
impl EstimatedTtftRouter {
pub fn new(_config: &Config) -> Self {
Self
pub fn new(config: &Config) -> Self {
Self {
ttft_model: TtftModel::new(
&config.hardware,
&config.calibration,
config.model.kv_block_bytes(),
),
}
}
}
@@ -37,50 +37,51 @@ impl Router for EstimatedTtftRouter {
&mut self,
req: &RequestRecord,
instances: &[Instance],
_meta: &MetaStore,
_now: f64,
meta: &MetaStore,
now: f64,
) -> RouteDecision {
let scheduler = self.ttft_model.scheduler_overhead_s(instances.len(), 3);
let n = instances.len();
let scores = local_l0_scores(req, instances);
let input_blocks = req.hash_ids.len() as u32;
let mut best: u32 = 0;
let mut best_cost = f64::INFINITY;
let mut best_queue = u32::MAX;
let mut best_local = 0u32;
let mut best_reuse = 0u32;
let mut candidates = Vec::with_capacity(n);
for inst in instances {
let i = inst.id as usize;
let local_prefix = scores[i];
let residency = classify_prefix_tiers(&req.hash_ids, inst, meta, now);
// 1. Exact queue drain time (architecture-aware, per-request sum).
let drain = inst.estimated_drain_time();
// 2. Prefill compute for blocks absent from local L0.
let miss_tokens = input_blocks
.saturating_sub(local_prefix)
.saturating_mul(inst.block_size_tokens);
let cost = drain + inst.compute.prefill_time(miss_tokens);
let miss_tokens = residency.miss_blocks.saturating_mul(inst.block_size_tokens);
let kv_prepare = self.ttft_model.kv_prepare_time_s(residency);
let first_token_tail = self.ttft_model.first_token_tail_s();
let cost =
drain + scheduler + kv_prepare + inst.compute.prefill_time(miss_tokens) + first_token_tail;
candidates.push(CandidateInfo {
instance: inst.id,
predicted_prefix: local_prefix,
predicted_prefix: residency.l0_hit_blocks
+ residency.l1_hit_blocks
+ residency.remote_hit_blocks,
load_blocks: inst.kv_blocks_used,
queue_len: inst.queue_len(),
});
// Minimise (cost, queue_len, -local_prefix).
let ql = inst.queue_len();
let reusable = residency.l0_hit_blocks + residency.l1_hit_blocks + residency.remote_hit_blocks;
let better = cost < best_cost
|| (cost == best_cost && ql < best_queue)
|| (cost == best_cost && ql == best_queue && local_prefix > best_local);
|| (cost == best_cost && ql == best_queue && reusable > best_reuse);
if better {
best_cost = cost;
best = inst.id;
best_queue = ql;
best_local = local_prefix;
best_reuse = reusable;
}
}
@@ -90,7 +91,7 @@ impl Router for EstimatedTtftRouter {
chosen: best,
probe_overhead_s: 0.0,
candidates,
reason: "argmin(drain_time + local-L0-miss prefill_time)",
reason: "argmin(drain + scheduler + kv_prepare + prefill + first_token_tail)",
}
}
}