fix: cache calculation

2026-04-15 17:31:39 +08:00
parent 365ceac3be
commit ff316c6873
23 changed files with 500 additions and 336 deletions
--- a/src/router/prefix_affinity.rs
+++ b/src/router/prefix_affinity.rs
@@ -36,7 +36,7 @@
 use crate::cluster::meta_store::MetaStore;
 use crate::config::Config;
 use crate::instance::Instance;
-use crate::router::{CandidateInfo, RouteDecision, Router};
+use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
 use crate::trace::RequestRecord;

 pub struct PrefixAffinityRouter {
@@ -47,12 +47,6 @@ pub struct PrefixAffinityRouter {
    /// Queue-length threshold: if all top candidates exceed this, expand to
    /// the full instance set.
    overload_threshold: u32,
-    /// Bytes per KV block (for RDMA cost estimation in fallback path).
-    kv_block_bytes: f64,
-    /// RDMA bandwidth in bytes/s.
-    rdma_bw: f64,
-    /// RDMA per-transfer latency in seconds.
-    rdma_latency_s: f64,
 }

 impl PrefixAffinityRouter {
@@ -69,9 +63,6 @@ impl PrefixAffinityRouter {
            prefix_k: config.cluster.router.prefix_k,
            fan_out,
            overload_threshold: 4,
-            kv_block_bytes: config.model.kv_block_bytes() as f64,
-            rdma_bw: config.hardware.rdma_bw,
-            rdma_latency_s: config.hardware.rdma_latency_us * 1e-6,
        }
    }

@@ -96,15 +87,6 @@ impl PrefixAffinityRouter {
        h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
        h ^ (h >> 31)
    }
-
-    /// Estimate RDMA fetch time for `remote_blocks` blocks.
-    fn fetch_time(&self, remote_blocks: u32) -> f64 {
-        if remote_blocks == 0 {
-            return 0.0;
-        }
-        let bytes = remote_blocks as f64 * self.kv_block_bytes;
-        bytes / self.rdma_bw + self.rdma_latency_s
-    }
 }

 impl Router for PrefixAffinityRouter {
@@ -116,8 +98,8 @@ impl Router for PrefixAffinityRouter {
        &mut self,
        req: &RequestRecord,
        instances: &[Instance],
-        meta: &MetaStore,
-        now: f64,
+        _meta: &MetaStore,
+        _now: f64,
    ) -> RouteDecision {
        let n = instances.len();
        let fp = Self::fingerprint(&req.hash_ids, self.prefix_k);
@@ -129,7 +111,7 @@ impl Router for PrefixAffinityRouter {
        ranked.sort_unstable_by(|a, b| b.0.cmp(&a.0)); // descending score

        // Collect candidate info for logging (also needed for fallback).
-        let scores = meta.score_prefix(&req.hash_ids, now, n);
+        let scores = local_l0_scores(req, instances);
        let candidates: Vec<CandidateInfo> = instances
            .iter()
            .map(|inst| CandidateInfo {
@@ -165,14 +147,14 @@ impl Router for PrefixAffinityRouter {
        let reason;
        if all_overloaded {
            reason = "affinity fallback: min(drain+fetch)";
-            let cluster_prefix = scores.iter().copied().max().unwrap_or(0);
            let mut best_cost = f64::INFINITY;
            for &(_, idx) in ranked.iter() {
                let inst = &instances[idx];
                let drain = inst.estimated_drain_time();
-                let local_prefix = scores[idx];
-                let remote_blocks = cluster_prefix.saturating_sub(local_prefix);
-                let cost = drain + self.fetch_time(remote_blocks);
+                let miss_tokens = (req.hash_ids.len() as u32)
+                    .saturating_sub(scores[idx])
+                    .saturating_mul(inst.block_size_tokens);
+                let cost = drain + inst.compute.prefill_time(miss_tokens);
                let ql = inst.queue_len();
                if cost < best_cost || (cost == best_cost && ql < best_ql) {
                    best_cost = cost;