fix: cache calculation
This commit is contained in:
@@ -1,56 +1,30 @@
|
||||
//! First-principles TTFT-optimal routing.
|
||||
//! First-principles TTFT-estimate routing using local L0 hits only.
|
||||
//!
|
||||
//! Estimates the actual time-to-first-token for each candidate instance:
|
||||
//!
|
||||
//! `TTFT(r,i) = drain(i) + fetch(r,i) + prefill(miss)`
|
||||
//! `TTFT(r,i) = drain(i) + prefill(local_l0_miss_i)`
|
||||
//!
|
||||
//! - **drain** — exact queue drain time: sum of per-request `prefill_time()`
|
||||
//! using the architecture-aware compute model (quadratic / DSA).
|
||||
//!
|
||||
//! - **fetch** — RDMA fetch time for blocks cached elsewhere in the cluster
|
||||
//! but not on instance `i` locally.
|
||||
//! - **prefill** — compute for tokens whose blocks are absent from the
|
||||
//! instance's current L0 cache.
|
||||
//!
|
||||
//! - **prefill** — compute for cluster-wide cache-miss tokens (constant
|
||||
//! across instances, cancels in the argmin).
|
||||
//!
|
||||
//! The router minimises `drain(i) + fetch(r,i)`, with ties broken by
|
||||
//! lowest `queue_len` then most local cache. The fetch overlap with queue
|
||||
//! drain is handled by keeping the additive form: this gives double
|
||||
//! incentive to prefer instances with local cache, which empirically
|
||||
//! outperforms the `max(drain, fetch)` alternative because even small
|
||||
//! RDMA savings compound across thousands of routing decisions.
|
||||
//! L1 / remote reuse can still reduce execution-time misses later in the
|
||||
//! cluster fetch chain, but they are not counted as `kvcache hit` when
|
||||
//! comparing routing candidates.
|
||||
|
||||
use crate::cluster::meta_store::MetaStore;
|
||||
use crate::config::Config;
|
||||
use crate::instance::Instance;
|
||||
use crate::router::{CandidateInfo, RouteDecision, Router};
|
||||
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
|
||||
use crate::trace::RequestRecord;
|
||||
|
||||
pub struct EstimatedTtftRouter {
|
||||
/// Bytes per KV block (for RDMA cost estimation).
|
||||
kv_block_bytes: f64,
|
||||
/// RDMA bandwidth in bytes/s.
|
||||
rdma_bw: f64,
|
||||
/// RDMA per-transfer latency in seconds.
|
||||
rdma_latency_s: f64,
|
||||
}
|
||||
pub struct EstimatedTtftRouter;
|
||||
|
||||
impl EstimatedTtftRouter {
|
||||
pub fn new(config: &Config) -> Self {
|
||||
Self {
|
||||
kv_block_bytes: config.model.kv_block_bytes() as f64,
|
||||
rdma_bw: config.hardware.rdma_bw,
|
||||
rdma_latency_s: config.hardware.rdma_latency_us * 1e-6,
|
||||
}
|
||||
}
|
||||
|
||||
/// Estimate RDMA fetch time for `remote_blocks` blocks.
|
||||
fn fetch_time(&self, remote_blocks: u32) -> f64 {
|
||||
if remote_blocks == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
let bytes = remote_blocks as f64 * self.kv_block_bytes;
|
||||
bytes / self.rdma_bw + self.rdma_latency_s
|
||||
pub fn new(_config: &Config) -> Self {
|
||||
Self
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,14 +37,12 @@ impl Router for EstimatedTtftRouter {
|
||||
&mut self,
|
||||
req: &RequestRecord,
|
||||
instances: &[Instance],
|
||||
meta: &MetaStore,
|
||||
now: f64,
|
||||
_meta: &MetaStore,
|
||||
_now: f64,
|
||||
) -> RouteDecision {
|
||||
let n = instances.len();
|
||||
let scores = meta.score_prefix(&req.hash_ids, now, n);
|
||||
|
||||
// Cluster-wide max prefix: blocks reachable via RDMA from any peer.
|
||||
let cluster_prefix = scores.iter().copied().max().unwrap_or(0);
|
||||
let scores = local_l0_scores(req, instances);
|
||||
let input_blocks = req.hash_ids.len() as u32;
|
||||
|
||||
let mut best: u32 = 0;
|
||||
let mut best_cost = f64::INFINITY;
|
||||
@@ -85,15 +57,11 @@ impl Router for EstimatedTtftRouter {
|
||||
// 1. Exact queue drain time (architecture-aware, per-request sum).
|
||||
let drain = inst.estimated_drain_time();
|
||||
|
||||
// 2. RDMA fetch cost for blocks not locally cached.
|
||||
let remote_blocks = cluster_prefix.saturating_sub(local_prefix);
|
||||
let fetch = self.fetch_time(remote_blocks);
|
||||
|
||||
// Additive cost: drain + fetch.
|
||||
// The additive form gives explicit incentive to prefer local cache
|
||||
// (lower fetch) even when the queue is non-empty, which reduces
|
||||
// total RDMA traffic and improves TTFT in aggregate.
|
||||
let cost = drain + fetch;
|
||||
// 2. Prefill compute for blocks absent from local L0.
|
||||
let miss_tokens = input_blocks
|
||||
.saturating_sub(local_prefix)
|
||||
.saturating_mul(inst.block_size_tokens);
|
||||
let cost = drain + inst.compute.prefill_time(miss_tokens);
|
||||
|
||||
candidates.push(CandidateInfo {
|
||||
instance: inst.id,
|
||||
@@ -122,7 +90,7 @@ impl Router for EstimatedTtftRouter {
|
||||
chosen: best,
|
||||
probe_overhead_s: 0.0,
|
||||
candidates,
|
||||
reason: "argmin(drain_time + fetch_time)",
|
||||
reason: "argmin(drain_time + local-L0-miss prefill_time)",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user