use crate::cluster::meta_store::MetaStore;
use crate::config::{CalibrationConfig, HardwareConfig};
use crate::instance::Instance;

/// Per-tier block counts for a request's hash sequence, as produced by
/// [`classify_prefix_tiers`].
#[derive(Debug, Clone, Copy, Default)]
pub struct PrefixResidency {
    /// Leading blocks matched by the instance's `l0` cache.
    pub l0_hit_blocks: u32,
    /// Blocks matched by the instance's `l1` cache, counted after the `l0` prefix.
    pub l1_hit_blocks: u32,
    /// Blocks after the local prefix that the meta store reports on another instance.
    pub remote_hit_blocks: u32,
    /// Remaining blocks not accounted for by any of the counters above.
    pub miss_blocks: u32,
}

/// Collection of pre-scaled cost terms used to estimate time-to-first-token
/// components.
///
/// Every `*_s` field is derived in [`TtftModel::new`] from a microsecond-named
/// configuration value multiplied by `1e-6`; every `*_bw` field is a hardware
/// bandwidth multiplied by its calibration utilisation factor.
#[derive(Debug, Clone)]
pub struct TtftModel {
    kv_block_bytes: u64,
    host_dram_bw: f64,
    pcie_bw: f64,
    pcie_latency_s: f64,
    rdma_bw: f64,
    rdma_latency_s: f64,
    scheduler_base_s: f64,
    scheduler_per_candidate_s: f64,
    cache_probe_per_tier_s: f64,
    batch_pack_s: f64,
    dram_access_s: f64,
    remote_metadata_s: f64,
    layout_transform_s: f64,
    first_token_tail_s: f64,
}

impl TtftModel {
    /// Builds a model from hardware and calibration settings.
    ///
    /// `kv_block_bytes` is stored as-is and used by [`Self::block_bytes`] to
    /// turn block counts into byte counts.
    pub fn new(hw: &HardwareConfig, calib: &CalibrationConfig, kv_block_bytes: u64) -> Self {
        // Scales a microsecond-named config value by 1e-6.
        fn us_to_s(us: f64) -> f64 {
            us * 1e-6
        }

        Self {
            kv_block_bytes,

            // Bandwidths: hardware figure times calibration utilisation.
            host_dram_bw: hw.host_dram_bw * calib.dram_bw_util,
            pcie_bw: hw.pcie_bw * calib.pcie_bw_util,
            rdma_bw: hw.rdma_bw * calib.rdma_bw_util,

            // Link latencies.
            pcie_latency_s: us_to_s(hw.pcie_latency_us),
            rdma_latency_s: us_to_s(hw.rdma_latency_us),

            // Scheduler-side overheads.
            scheduler_base_s: us_to_s(calib.scheduler_base_overhead_us),
            scheduler_per_candidate_s: us_to_s(calib.scheduler_per_candidate_us),
            cache_probe_per_tier_s: us_to_s(calib.cache_probe_us_per_tier),
            batch_pack_s: us_to_s(calib.batch_pack_overhead_us),

            // Fixed per-fetch overheads.
            dram_access_s: us_to_s(calib.dram_access_latency_us),
            remote_metadata_s: us_to_s(calib.remote_metadata_us),
            layout_transform_s: us_to_s(calib.layout_transform_fixed_us),

            // The two tail terms are summed first, then scaled.
            first_token_tail_s: us_to_s(calib.final_sync_us + calib.first_token_ready_us),
        }
    }

    /// Returns the scheduler overhead: the base term, plus a per-candidate term
    /// times `num_candidates`, plus a per-tier probe term times `num_tiers`,
    /// plus the batch-pack term.
    pub fn scheduler_overhead_s(&self, num_candidates: usize, num_tiers: usize) -> f64 {
        let candidate_cost = self.scheduler_per_candidate_s * num_candidates as f64;
        let probe_cost = self.cache_probe_per_tier_s * num_tiers as f64;
        // Summation order is kept fixed so the floating-point result is stable.
        self.scheduler_base_s + candidate_cost + probe_cost + self.batch_pack_s
    }

    /// Returns the stored first-token tail term.
    pub fn first_token_tail_s(&self) -> f64 {
        self.first_token_tail_s
    }

    /// Returns the byte size of `blocks` KV blocks.
    pub fn block_bytes(&self, blocks: u32) -> u64 {
        self.kv_block_bytes * u64::from(blocks)
    }

    /// Returns the preparation time charged for `blocks` L1-hit blocks.
    ///
    /// Zero blocks cost exactly `0.0`. Otherwise the result sums the DRAM
    /// access term, the byte count divided by host DRAM bandwidth, the PCIe
    /// cost for the same byte count, and the layout-transform term.
    pub fn local_l1_prepare_time_s(&self, blocks: u32) -> f64 {
        match blocks {
            0 => 0.0,
            count => {
                let bytes = self.block_bytes(count);
                // Bandwidth is floored at 1.0 before dividing.
                let dram_read_s = bytes as f64 / self.host_dram_bw.max(1.0);
                self.dram_access_s
                    + dram_read_s
                    + self.pcie_cost_s(bytes)
                    + self.layout_transform_s
            }
        }
    }

    /// Returns the preparation time charged for `blocks` remote-hit blocks.
    ///
    /// Zero blocks cost exactly `0.0`. Otherwise the result sums the remote
    /// metadata term, the RDMA cost, the PCIe cost, and the layout-transform
    /// term, all for the same byte count.
    pub fn remote_prepare_time_s(&self, blocks: u32) -> f64 {
        match blocks {
            0 => 0.0,
            count => {
                let bytes = self.block_bytes(count);
                self.remote_metadata_s
                    + self.rdma_cost_s(bytes)
                    + self.pcie_cost_s(bytes)
                    + self.layout_transform_s
            }
        }
    }

    /// Returns the PCIe latency plus `bytes` divided by the PCIe bandwidth
    /// (floored at 1.0); a zero-byte transfer returns the latency alone.
    pub fn pcie_cost_s(&self, bytes: u64) -> f64 {
        match bytes {
            0 => self.pcie_latency_s,
            payload => self.pcie_latency_s + payload as f64 / self.pcie_bw.max(1.0),
        }
    }

    /// Returns the RDMA latency plus `bytes` divided by the RDMA bandwidth
    /// (floored at 1.0); a zero-byte transfer returns the latency alone.
    pub fn rdma_cost_s(&self, bytes: u64) -> f64 {
        match bytes {
            0 => self.rdma_latency_s,
            payload => self.rdma_latency_s + payload as f64 / self.rdma_bw.max(1.0),
        }
    }

    /// Returns the combined preparation time for the L1-hit and remote-hit
    /// blocks of `residency`. The `l0_hit_blocks` and `miss_blocks` counters
    /// do not contribute to this figure.
    pub fn kv_prepare_time_s(&self, residency: PrefixResidency) -> f64 {
        let PrefixResidency {
            l1_hit_blocks,
            remote_hit_blocks,
            ..
        } = residency;

        let local_s = self.local_l1_prepare_time_s(l1_hit_blocks);
        let remote_s = self.remote_prepare_time_s(remote_hit_blocks);
        local_s + remote_s
    }
}

/// Splits `req_hashes` into consecutive runs by where each block is found.
///
/// The leading run is whatever `inst.cache.l0.longest_prefix_peek` matches; the
/// next run is what `inst.cache.l1.longest_prefix_peek` matches on the
/// remainder. After that, blocks are counted as remote for as long as
/// `meta.instances_for(hash, now)` lists at least one owner other than
/// `inst.id`; counting stops at the first block for which that is not true.
/// Everything left over is reported as a miss.
pub fn classify_prefix_tiers(
    req_hashes: &[u64],
    inst: &Instance,
    meta: &MetaStore,
    now: f64,
) -> PrefixResidency {
    let total_blocks = req_hashes.len() as u32;

    let l0_hit_blocks = inst.cache.l0.longest_prefix_peek(req_hashes) as u32;
    let after_l0 = &req_hashes[l0_hit_blocks as usize..];

    let l1_hit_blocks = inst.cache.l1.longest_prefix_peek(after_l0) as u32;
    let after_l1 = &after_l0[l1_hit_blocks as usize..];

    // Contiguous run of blocks that some *other* instance owns.
    let remote_hit_blocks = after_l1
        .iter()
        .take_while(|&&hash| {
            let owners = meta.instances_for(hash, now);
            owners.iter().any(|owner| *owner != inst.id)
        })
        .count() as u32;

    PrefixResidency {
        l0_hit_blocks,
        l1_hit_blocks,
        remote_hit_blocks,
        miss_blocks: total_blocks - l0_hit_blocks - l1_hit_blocks - remote_hit_blocks,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{CalibrationConfig, HardwareConfig};

    /// A single remote block must cost strictly more than its RDMA + PCIe
    /// transport alone, because metadata and layout-transform terms are added.
    #[test]
    fn remote_prepare_includes_fixed_overheads() {
        let hw = HardwareConfig {
            gpu_flops: 1.0e14,
            gpu_fp8_flops: 0.0,
            gpu_fp4_flops: 0.0,
            gpu_mem_bw: 1.0e12,
            hbm_bytes: 1.0e9,
            dram_bytes: 4.0e9,
            host_dram_bw: 5.0e11,
            pcie_bw: 32.0e9,
            pcie_latency_us: 1.0,
            rdma_bw: 12.0e9,
            rdma_latency_us: 5.0,
            intra_node_tp_bw: 9.0e11,
            intra_node_tp_latency_us: 2.0,
            tp_degree: 1,
            max_batch_slots: 32,
            prefill_chunk_tokens: 1024,
        };
        let calib = CalibrationConfig {
            remote_metadata_us: 11.0,
            layout_transform_fixed_us: 7.0,
            ..CalibrationConfig::default()
        };
        let model = TtftModel::new(&hw, &calib, 4096);

        let transport_only = model.rdma_cost_s(4096) + model.pcie_cost_s(4096);
        let total = model.remote_prepare_time_s(1);

        assert!(total > transport_only);
    }
}