fix: cache calculation

2026-04-15 17:31:39 +08:00
parent 365ceac3be
commit ff316c6873
23 changed files with 500 additions and 336 deletions
--- a/src/router/estimated_ttft.rs
+++ b/src/router/estimated_ttft.rs
@@ -1,56 +1,30 @@
-//! First-principles TTFT-optimal routing.
+//! First-principles TTFT-estimate routing using local L0 hits only.
 //!
 //! Estimates the actual time-to-first-token for each candidate instance:
 //!
-//! `TTFT(r,i) = drain(i) + fetch(r,i) + prefill(miss)`
+//! `TTFT(r,i) = drain(i) + prefill(local_l0_miss_i)`
 //!
 //! - **drain** — exact queue drain time: sum of per-request `prefill_time()`
 //!   using the architecture-aware compute model (quadratic / DSA).
 //!
-//! - **fetch** — RDMA fetch time for blocks cached elsewhere in the cluster
-//!   but not on instance `i` locally.
+//! - **prefill** — compute for tokens whose blocks are absent from the
+//!   instance's current L0 cache.
 //!
-//! - **prefill** — compute for cluster-wide cache-miss tokens (constant
-//!   across instances, cancels in the argmin).
-//!
-//! The router minimises `drain(i) + fetch(r,i)`, with ties broken by
-//! lowest `queue_len` then most local cache.  The fetch overlap with queue
-//! drain is handled by keeping the additive form: this gives double
-//! incentive to prefer instances with local cache, which empirically
-//! outperforms the `max(drain, fetch)` alternative because even small
-//! RDMA savings compound across thousands of routing decisions.
+//! L1 and remote-cache reuse can still reduce execution-time misses later in
+//! the cluster fetch chain, but such reuse is not counted as a `kvcache hit`
+//! when comparing routing candidates.

 use crate::cluster::meta_store::MetaStore;
 use crate::config::Config;
 use crate::instance::Instance;
-use crate::router::{CandidateInfo, RouteDecision, Router};
+use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
 use crate::trace::RequestRecord;

-pub struct EstimatedTtftRouter {
-    /// Bytes per KV block (for RDMA cost estimation).
-    kv_block_bytes: f64,
-    /// RDMA bandwidth in bytes/s.
-    rdma_bw: f64,
-    /// RDMA per-transfer latency in seconds.
-    rdma_latency_s: f64,
-}
+pub struct EstimatedTtftRouter;

 impl EstimatedTtftRouter {
-    pub fn new(config: &Config) -> Self {
-        Self {
-            kv_block_bytes: config.model.kv_block_bytes() as f64,
-            rdma_bw: config.hardware.rdma_bw,
-            rdma_latency_s: config.hardware.rdma_latency_us * 1e-6,
-        }
-    }
-
-    /// Estimate RDMA fetch time for `remote_blocks` blocks.
-    fn fetch_time(&self, remote_blocks: u32) -> f64 {
-        if remote_blocks == 0 {
-            return 0.0;
-        }
-        let bytes = remote_blocks as f64 * self.kv_block_bytes;
-        bytes / self.rdma_bw + self.rdma_latency_s
+    pub fn new(_config: &Config) -> Self {
+        Self
    }
 }

@@ -63,14 +37,12 @@ impl Router for EstimatedTtftRouter {
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
-        meta: &MetaStore,
-        now: f64,
+        _meta: &MetaStore,
+        _now: f64,
    ) -> RouteDecision {
        let n = instances.len();
-        let scores = meta.score_prefix(&req.hash_ids, now, n);
-
-        // Cluster-wide max prefix: blocks reachable via RDMA from any peer.
-        let cluster_prefix = scores.iter().copied().max().unwrap_or(0);
+        let scores = local_l0_scores(req, instances);
+        let input_blocks = req.hash_ids.len() as u32;

        let mut best: u32 = 0;
        let mut best_cost = f64::INFINITY;
@@ -85,15 +57,11 @@ impl Router for EstimatedTtftRouter {
            // 1. Exact queue drain time (architecture-aware, per-request sum).
            let drain = inst.estimated_drain_time();

-            // 2. RDMA fetch cost for blocks not locally cached.
-            let remote_blocks = cluster_prefix.saturating_sub(local_prefix);
-            let fetch = self.fetch_time(remote_blocks);
-
-            // Additive cost: drain + fetch.
-            // The additive form gives explicit incentive to prefer local cache
-            // (lower fetch) even when the queue is non-empty, which reduces
-            // total RDMA traffic and improves TTFT in aggregate.
-            let cost = drain + fetch;
+            // 2. Prefill compute for blocks absent from local L0.
+            let miss_tokens = input_blocks
+                .saturating_sub(local_prefix)
+                .saturating_mul(inst.block_size_tokens);
+            let cost = drain + inst.compute.prefill_time(miss_tokens);

            candidates.push(CandidateInfo {
                instance: inst.id,
@@ -122,7 +90,7 @@ impl Router for EstimatedTtftRouter {
            chosen: best,
            probe_overhead_s: 0.0,
            candidates,
-            reason: "argmin(drain_time + fetch_time)",
+            reason: "argmin(drain_time + local-L0-miss prefill_time)",
        }
    }
 }