feat: update ttft modeling and add cache affinity

This commit is contained in:
2026-04-15 19:08:10 +08:00
parent ff316c6873
commit a3f386c858
15 changed files with 1276 additions and 99 deletions

177
src/ttft.rs Normal file
View File

@@ -0,0 +1,177 @@
use crate::cluster::meta_store::MetaStore;
use crate::config::{CalibrationConfig, HardwareConfig};
use crate::instance::Instance;
/// How many leading KV blocks of a request's prefix are resident in each
/// cache tier, as produced by [`classify_prefix_tiers`].
///
/// The four counts partition the request's block hashes, so they always sum
/// to the total number of blocks classified.
#[derive(Debug, Clone, Copy, Default)]
pub struct PrefixResidency {
    // Leading blocks found in the instance's L0 cache (probed first).
    pub l0_hit_blocks: u32,
    // Blocks found in the L1 cache, immediately after the L0-covered prefix.
    pub l1_hit_blocks: u32,
    // Contiguous run of remaining blocks owned by some *other* instance.
    pub remote_hit_blocks: u32,
    // Whatever is left: blocks not found in any probed tier.
    pub miss_blocks: u32,
}
/// Analytic cost model for the KV-preparation portion of time-to-first-token.
///
/// Built by [`TtftModel::new`] from a `HardwareConfig` and a
/// `CalibrationConfig`. Bandwidths are pre-derated by the calibration
/// utilization factors, and every `*_s` field is stored in seconds
/// (microsecond config values are converted once at construction).
#[derive(Debug, Clone)]
pub struct TtftModel {
    // Size of one KV-cache block in bytes; scales all transfer costs.
    kv_block_bytes: u64,
    // Effective bandwidths (raw bw x calibrated utilization). Used as
    // bytes-per-second divisors, clamped to >= 1.0 before dividing.
    host_dram_bw: f64,
    pcie_bw: f64,
    pcie_latency_s: f64,
    rdma_bw: f64,
    rdma_latency_s: f64,
    // Scheduling-path overheads: fixed base, per-candidate scoring, and
    // per-tier cache probing, plus batch packing.
    scheduler_base_s: f64,
    scheduler_per_candidate_s: f64,
    cache_probe_per_tier_s: f64,
    batch_pack_s: f64,
    // Per-prepare fixed costs: DRAM access latency, remote metadata lookup,
    // and the fixed KV layout transform.
    dram_access_s: f64,
    remote_metadata_s: f64,
    layout_transform_s: f64,
    // Tail after the last KV byte: final sync + first-token readiness.
    first_token_tail_s: f64,
}
impl TtftModel {
    /// Builds a model from hardware specs and calibration constants.
    ///
    /// Bandwidths are derated by the calibration utilization factors, and all
    /// microsecond-valued overheads are converted to seconds up front so the
    /// cost methods never re-scale.
    pub fn new(hw: &HardwareConfig, calib: &CalibrationConfig, kv_block_bytes: u64) -> Self {
        // Microseconds -> seconds.
        let us = |v: f64| v * 1e-6;
        Self {
            kv_block_bytes,
            host_dram_bw: hw.host_dram_bw * calib.dram_bw_util,
            pcie_bw: hw.pcie_bw * calib.pcie_bw_util,
            pcie_latency_s: us(hw.pcie_latency_us),
            rdma_bw: hw.rdma_bw * calib.rdma_bw_util,
            rdma_latency_s: us(hw.rdma_latency_us),
            scheduler_base_s: us(calib.scheduler_base_overhead_us),
            scheduler_per_candidate_s: us(calib.scheduler_per_candidate_us),
            cache_probe_per_tier_s: us(calib.cache_probe_us_per_tier),
            batch_pack_s: us(calib.batch_pack_overhead_us),
            dram_access_s: us(calib.dram_access_latency_us),
            remote_metadata_s: us(calib.remote_metadata_us),
            layout_transform_s: us(calib.layout_transform_fixed_us),
            first_token_tail_s: us(calib.final_sync_us + calib.first_token_ready_us),
        }
    }

    /// Scheduling-path cost for one request: fixed base, per-candidate
    /// scoring, per-tier cache probes, and batch packing.
    pub fn scheduler_overhead_s(&self, num_candidates: usize, num_tiers: usize) -> f64 {
        let candidate_cost = self.scheduler_per_candidate_s * num_candidates as f64;
        let probe_cost = self.cache_probe_per_tier_s * num_tiers as f64;
        self.scheduler_base_s + candidate_cost + probe_cost + self.batch_pack_s
    }

    /// Fixed tail latency (final sync + first-token readiness).
    pub fn first_token_tail_s(&self) -> f64 {
        self.first_token_tail_s
    }

    /// Total bytes occupied by `blocks` KV blocks.
    pub fn block_bytes(&self, blocks: u32) -> u64 {
        u64::from(blocks) * self.kv_block_bytes
    }

    /// Time to stage `blocks` KV blocks resident in host DRAM (L1): DRAM
    /// access latency, DRAM read, PCIe copy, and the fixed layout transform.
    /// Zero blocks cost nothing.
    pub fn local_l1_prepare_time_s(&self, blocks: u32) -> f64 {
        if blocks == 0 {
            return 0.0;
        }
        let bytes = self.block_bytes(blocks);
        let dram_read = bytes as f64 / self.host_dram_bw.max(1.0);
        self.dram_access_s + dram_read + self.pcie_cost_s(bytes) + self.layout_transform_s
    }

    /// Time to pull `blocks` KV blocks from a remote instance: metadata
    /// lookup, RDMA transfer, PCIe copy, and the fixed layout transform.
    /// Zero blocks cost nothing.
    pub fn remote_prepare_time_s(&self, blocks: u32) -> f64 {
        if blocks == 0 {
            return 0.0;
        }
        let bytes = self.block_bytes(blocks);
        self.remote_metadata_s
            + self.rdma_cost_s(bytes)
            + self.pcie_cost_s(bytes)
            + self.layout_transform_s
    }

    /// PCIe transfer cost: fixed latency plus (for non-zero payloads) the
    /// bandwidth-limited transfer time.
    pub fn pcie_cost_s(&self, bytes: u64) -> f64 {
        let transfer = match bytes {
            0 => 0.0,
            b => b as f64 / self.pcie_bw.max(1.0),
        };
        self.pcie_latency_s + transfer
    }

    /// RDMA transfer cost: fixed latency plus (for non-zero payloads) the
    /// bandwidth-limited transfer time.
    pub fn rdma_cost_s(&self, bytes: u64) -> f64 {
        let transfer = match bytes {
            0 => 0.0,
            b => b as f64 / self.rdma_bw.max(1.0),
        };
        self.rdma_latency_s + transfer
    }

    /// Total KV staging time for a request given its tier residency.
    /// Only L1 and remote hits are charged here; L0 hits and miss blocks
    /// contribute no transfer cost in this function.
    pub fn kv_prepare_time_s(&self, residency: PrefixResidency) -> f64 {
        let l1 = self.local_l1_prepare_time_s(residency.l1_hit_blocks);
        let remote = self.remote_prepare_time_s(residency.remote_hit_blocks);
        l1 + remote
    }
}
/// Walks the request's block hashes front-to-back and reports how far each
/// cache tier covers the prefix on instance `inst`.
///
/// Tier order: L0 first, then L1 on the remaining suffix, then remote owners
/// (any instance other than `inst` known to `meta` at time `now`). The remote
/// scan stops at the first block with no remote owner; everything after that
/// counts as a miss.
pub fn classify_prefix_tiers(
    req_hashes: &[u64],
    inst: &Instance,
    meta: &MetaStore,
    now: f64,
) -> PrefixResidency {
    let total = req_hashes.len() as u32;
    let l0 = inst.cache.l0.longest_prefix_peek(req_hashes) as u32;
    let after_l0 = &req_hashes[l0 as usize..];
    let l1 = inst.cache.l1.longest_prefix_peek(after_l0) as u32;
    let after_l1 = &after_l0[l1 as usize..];
    // Contiguous run of leftover blocks held by some other instance.
    let remote = after_l1
        .iter()
        .take_while(|&&h| meta.instances_for(h, now).iter().any(|o| *o != inst.id))
        .count() as u32;
    PrefixResidency {
        l0_hit_blocks: l0,
        l1_hit_blocks: l1,
        remote_hit_blocks: remote,
        miss_blocks: total - l0 - l1 - remote,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{CalibrationConfig, HardwareConfig};

    /// A remote prepare must cost strictly more than the raw RDMA + PCIe
    /// transfer of the same payload, because it also pays the remote-metadata
    /// lookup and the fixed layout transform.
    #[test]
    fn remote_prepare_includes_fixed_overheads() {
        let hardware = HardwareConfig {
            gpu_flops: 1.0e14,
            gpu_fp8_flops: 0.0,
            gpu_fp4_flops: 0.0,
            gpu_mem_bw: 1.0e12,
            hbm_bytes: 1.0e9,
            dram_bytes: 4.0e9,
            host_dram_bw: 5.0e11,
            pcie_bw: 32.0e9,
            pcie_latency_us: 1.0,
            rdma_bw: 12.0e9,
            rdma_latency_us: 5.0,
            intra_node_tp_bw: 9.0e11,
            intra_node_tp_latency_us: 2.0,
            tp_degree: 1,
            max_batch_slots: 32,
            prefill_chunk_tokens: 1024,
        };
        // Non-zero fixed overheads so the inequality below is meaningful.
        let calib = CalibrationConfig {
            remote_metadata_us: 11.0,
            layout_transform_fixed_us: 7.0,
            ..CalibrationConfig::default()
        };
        let model = TtftModel::new(&hardware, &calib, 4096);
        // One block = 4096 bytes moved over RDMA and then across PCIe.
        let wire_only = model.rdma_cost_s(4096) + model.pcie_cost_s(4096);
        assert!(model.remote_prepare_time_s(1) > wire_only);
    }
}