fix: cache calculation
This commit is contained in:
@@ -36,7 +36,7 @@
|
||||
use crate::cluster::meta_store::MetaStore;
|
||||
use crate::config::Config;
|
||||
use crate::instance::Instance;
|
||||
use crate::router::{CandidateInfo, RouteDecision, Router};
|
||||
use crate::router::{local_l0_scores, CandidateInfo, RouteDecision, Router};
|
||||
use crate::trace::RequestRecord;
|
||||
|
||||
pub struct PrefixAffinityRouter {
|
||||
@@ -47,12 +47,6 @@ pub struct PrefixAffinityRouter {
|
||||
/// Queue-length threshold: if all top candidates exceed this, expand to
|
||||
/// the full instance set.
|
||||
overload_threshold: u32,
|
||||
/// Bytes per KV block (for RDMA cost estimation in fallback path).
|
||||
kv_block_bytes: f64,
|
||||
/// RDMA bandwidth in bytes/s.
|
||||
rdma_bw: f64,
|
||||
/// RDMA per-transfer latency in seconds.
|
||||
rdma_latency_s: f64,
|
||||
}
|
||||
|
||||
impl PrefixAffinityRouter {
|
||||
@@ -69,9 +63,6 @@ impl PrefixAffinityRouter {
|
||||
prefix_k: config.cluster.router.prefix_k,
|
||||
fan_out,
|
||||
overload_threshold: 4,
|
||||
kv_block_bytes: config.model.kv_block_bytes() as f64,
|
||||
rdma_bw: config.hardware.rdma_bw,
|
||||
rdma_latency_s: config.hardware.rdma_latency_us * 1e-6,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,15 +87,6 @@ impl PrefixAffinityRouter {
|
||||
h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb);
|
||||
h ^ (h >> 31)
|
||||
}
|
||||
|
||||
/// Estimate RDMA fetch time for `remote_blocks` blocks.
|
||||
fn fetch_time(&self, remote_blocks: u32) -> f64 {
|
||||
if remote_blocks == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
let bytes = remote_blocks as f64 * self.kv_block_bytes;
|
||||
bytes / self.rdma_bw + self.rdma_latency_s
|
||||
}
|
||||
}
|
||||
|
||||
impl Router for PrefixAffinityRouter {
|
||||
@@ -116,8 +98,8 @@ impl Router for PrefixAffinityRouter {
|
||||
&mut self,
|
||||
req: &RequestRecord,
|
||||
instances: &[Instance],
|
||||
meta: &MetaStore,
|
||||
now: f64,
|
||||
_meta: &MetaStore,
|
||||
_now: f64,
|
||||
) -> RouteDecision {
|
||||
let n = instances.len();
|
||||
let fp = Self::fingerprint(&req.hash_ids, self.prefix_k);
|
||||
@@ -129,7 +111,7 @@ impl Router for PrefixAffinityRouter {
|
||||
ranked.sort_unstable_by(|a, b| b.0.cmp(&a.0)); // descending score
|
||||
|
||||
// Collect candidate info for logging (also needed for fallback).
|
||||
let scores = meta.score_prefix(&req.hash_ids, now, n);
|
||||
let scores = local_l0_scores(req, instances);
|
||||
let candidates: Vec<CandidateInfo> = instances
|
||||
.iter()
|
||||
.map(|inst| CandidateInfo {
|
||||
@@ -165,14 +147,14 @@ impl Router for PrefixAffinityRouter {
|
||||
let reason;
|
||||
if all_overloaded {
|
||||
reason = "affinity fallback: min(drain+fetch)";
|
||||
let cluster_prefix = scores.iter().copied().max().unwrap_or(0);
|
||||
let mut best_cost = f64::INFINITY;
|
||||
for &(_, idx) in ranked.iter() {
|
||||
let inst = &instances[idx];
|
||||
let drain = inst.estimated_drain_time();
|
||||
let local_prefix = scores[idx];
|
||||
let remote_blocks = cluster_prefix.saturating_sub(local_prefix);
|
||||
let cost = drain + self.fetch_time(remote_blocks);
|
||||
let miss_tokens = (req.hash_ids.len() as u32)
|
||||
.saturating_sub(scores[idx])
|
||||
.saturating_mul(inst.block_size_tokens);
|
||||
let cost = drain + inst.compute.prefill_time(miss_tokens);
|
||||
let ql = inst.queue_len();
|
||||
if cost < best_cost || (cost == best_cost && ql < best_ql) {
|
||||
best_cost = cost;
|
||||
|
||||
Reference in New Issue
Block a user