feat: update ttft modeling and add cache affinity

This commit is contained in:
2026-04-15 19:08:10 +08:00
parent ff316c6873
commit a3f386c858
15 changed files with 1276 additions and 99 deletions

177
src/ttft.rs Normal file
View File

@@ -0,0 +1,177 @@
use crate::cluster::meta_store::MetaStore;
use crate::config::{CalibrationConfig, HardwareConfig};
use crate::instance::Instance;
/// How many leading KV blocks of a request's prefix are resident in each
/// cache tier, as produced by [`classify_prefix_tiers`].
///
/// The four counts partition the request's block hashes, so they always sum
/// to the total number of blocks classified.
#[derive(Debug, Clone, Copy, Default)]
pub struct PrefixResidency {
    // Leading blocks found in the instance's L0 cache (probed first).
    pub l0_hit_blocks: u32,
    // Blocks found in the L1 cache, immediately after the L0-covered prefix.
    pub l1_hit_blocks: u32,
    // Contiguous run of remaining blocks owned by some *other* instance.
    pub remote_hit_blocks: u32,
    // Whatever is left: blocks not found in any probed tier.
    pub miss_blocks: u32,
}
/// Analytic cost model for the KV-preparation portion of time-to-first-token.
///
/// Built by [`TtftModel::new`] from a `HardwareConfig` and a
/// `CalibrationConfig`. Bandwidths are pre-derated by the calibration
/// utilization factors, and every `*_s` field is stored in seconds
/// (microsecond config values are converted once at construction).
#[derive(Debug, Clone)]
pub struct TtftModel {
    // Size of one KV-cache block in bytes; scales all transfer costs.
    kv_block_bytes: u64,
    // Effective bandwidths (raw bw x calibrated utilization). Used as
    // bytes-per-second divisors, clamped to >= 1.0 before dividing.
    host_dram_bw: f64,
    pcie_bw: f64,
    pcie_latency_s: f64,
    rdma_bw: f64,
    rdma_latency_s: f64,
    // Scheduling-path overheads: fixed base, per-candidate scoring, and
    // per-tier cache probing, plus batch packing.
    scheduler_base_s: f64,
    scheduler_per_candidate_s: f64,
    cache_probe_per_tier_s: f64,
    batch_pack_s: f64,
    // Per-prepare fixed costs: DRAM access latency, remote metadata lookup,
    // and the fixed KV layout transform.
    dram_access_s: f64,
    remote_metadata_s: f64,
    layout_transform_s: f64,
    // Tail after the last KV byte: final sync + first-token readiness.
    first_token_tail_s: f64,
}
impl TtftModel {
    /// Builds a model from hardware specs and calibration constants.
    ///
    /// Bandwidths are derated by the calibration utilization factors, and all
    /// microsecond-valued overheads are converted to seconds up front so the
    /// cost methods never re-scale.
    pub fn new(hw: &HardwareConfig, calib: &CalibrationConfig, kv_block_bytes: u64) -> Self {
        // Microseconds -> seconds.
        let us = |v: f64| v * 1e-6;
        Self {
            kv_block_bytes,
            host_dram_bw: hw.host_dram_bw * calib.dram_bw_util,
            pcie_bw: hw.pcie_bw * calib.pcie_bw_util,
            pcie_latency_s: us(hw.pcie_latency_us),
            rdma_bw: hw.rdma_bw * calib.rdma_bw_util,
            rdma_latency_s: us(hw.rdma_latency_us),
            scheduler_base_s: us(calib.scheduler_base_overhead_us),
            scheduler_per_candidate_s: us(calib.scheduler_per_candidate_us),
            cache_probe_per_tier_s: us(calib.cache_probe_us_per_tier),
            batch_pack_s: us(calib.batch_pack_overhead_us),
            dram_access_s: us(calib.dram_access_latency_us),
            remote_metadata_s: us(calib.remote_metadata_us),
            layout_transform_s: us(calib.layout_transform_fixed_us),
            first_token_tail_s: us(calib.final_sync_us + calib.first_token_ready_us),
        }
    }

    /// Scheduling-path cost for one request: fixed base, per-candidate
    /// scoring, per-tier cache probes, and batch packing.
    pub fn scheduler_overhead_s(&self, num_candidates: usize, num_tiers: usize) -> f64 {
        let candidate_cost = self.scheduler_per_candidate_s * num_candidates as f64;
        let probe_cost = self.cache_probe_per_tier_s * num_tiers as f64;
        self.scheduler_base_s + candidate_cost + probe_cost + self.batch_pack_s
    }

    /// Fixed tail latency (final sync + first-token readiness).
    pub fn first_token_tail_s(&self) -> f64 {
        self.first_token_tail_s
    }

    /// Total bytes occupied by `blocks` KV blocks.
    pub fn block_bytes(&self, blocks: u32) -> u64 {
        u64::from(blocks) * self.kv_block_bytes
    }

    /// Time to stage `blocks` KV blocks resident in host DRAM (L1): DRAM
    /// access latency, DRAM read, PCIe copy, and the fixed layout transform.
    /// Zero blocks cost nothing.
    pub fn local_l1_prepare_time_s(&self, blocks: u32) -> f64 {
        if blocks == 0 {
            return 0.0;
        }
        let bytes = self.block_bytes(blocks);
        let dram_read = bytes as f64 / self.host_dram_bw.max(1.0);
        self.dram_access_s + dram_read + self.pcie_cost_s(bytes) + self.layout_transform_s
    }

    /// Time to pull `blocks` KV blocks from a remote instance: metadata
    /// lookup, RDMA transfer, PCIe copy, and the fixed layout transform.
    /// Zero blocks cost nothing.
    pub fn remote_prepare_time_s(&self, blocks: u32) -> f64 {
        if blocks == 0 {
            return 0.0;
        }
        let bytes = self.block_bytes(blocks);
        self.remote_metadata_s
            + self.rdma_cost_s(bytes)
            + self.pcie_cost_s(bytes)
            + self.layout_transform_s
    }

    /// PCIe transfer cost: fixed latency plus (for non-zero payloads) the
    /// bandwidth-limited transfer time.
    pub fn pcie_cost_s(&self, bytes: u64) -> f64 {
        let transfer = match bytes {
            0 => 0.0,
            b => b as f64 / self.pcie_bw.max(1.0),
        };
        self.pcie_latency_s + transfer
    }

    /// RDMA transfer cost: fixed latency plus (for non-zero payloads) the
    /// bandwidth-limited transfer time.
    pub fn rdma_cost_s(&self, bytes: u64) -> f64 {
        let transfer = match bytes {
            0 => 0.0,
            b => b as f64 / self.rdma_bw.max(1.0),
        };
        self.rdma_latency_s + transfer
    }

    /// Total KV staging time for a request given its tier residency.
    /// Only L1 and remote hits are charged here; L0 hits and miss blocks
    /// contribute no transfer cost in this function.
    pub fn kv_prepare_time_s(&self, residency: PrefixResidency) -> f64 {
        let l1 = self.local_l1_prepare_time_s(residency.l1_hit_blocks);
        let remote = self.remote_prepare_time_s(residency.remote_hit_blocks);
        l1 + remote
    }
}
/// Walks the request's block hashes front-to-back and reports how far each
/// cache tier covers the prefix on instance `inst`.
///
/// Tier order: L0 first, then L1 on the remaining suffix, then remote owners
/// (any instance other than `inst` known to `meta` at time `now`). The remote
/// scan stops at the first block with no remote owner; everything after that
/// counts as a miss.
pub fn classify_prefix_tiers(
    req_hashes: &[u64],
    inst: &Instance,
    meta: &MetaStore,
    now: f64,
) -> PrefixResidency {
    let total = req_hashes.len() as u32;
    let l0 = inst.cache.l0.longest_prefix_peek(req_hashes) as u32;
    let after_l0 = &req_hashes[l0 as usize..];
    let l1 = inst.cache.l1.longest_prefix_peek(after_l0) as u32;
    let after_l1 = &after_l0[l1 as usize..];
    // Contiguous run of leftover blocks held by some other instance.
    let remote = after_l1
        .iter()
        .take_while(|&&h| meta.instances_for(h, now).iter().any(|o| *o != inst.id))
        .count() as u32;
    PrefixResidency {
        l0_hit_blocks: l0,
        l1_hit_blocks: l1,
        remote_hit_blocks: remote,
        miss_blocks: total - l0 - l1 - remote,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{CalibrationConfig, HardwareConfig};

    /// A remote prepare must cost strictly more than the raw RDMA + PCIe
    /// transfer of the same payload, because it also pays the remote-metadata
    /// lookup and the fixed layout transform.
    #[test]
    fn remote_prepare_includes_fixed_overheads() {
        let hardware = HardwareConfig {
            gpu_flops: 1.0e14,
            gpu_fp8_flops: 0.0,
            gpu_fp4_flops: 0.0,
            gpu_mem_bw: 1.0e12,
            hbm_bytes: 1.0e9,
            dram_bytes: 4.0e9,
            host_dram_bw: 5.0e11,
            pcie_bw: 32.0e9,
            pcie_latency_us: 1.0,
            rdma_bw: 12.0e9,
            rdma_latency_us: 5.0,
            intra_node_tp_bw: 9.0e11,
            intra_node_tp_latency_us: 2.0,
            tp_degree: 1,
            max_batch_slots: 32,
            prefill_chunk_tokens: 1024,
        };
        // Non-zero fixed overheads so the inequality below is meaningful.
        let calib = CalibrationConfig {
            remote_metadata_us: 11.0,
            layout_transform_fixed_us: 7.0,
            ..CalibrationConfig::default()
        };
        let model = TtftModel::new(&hardware, &calib, 4096);
        // One block = 4096 bytes moved over RDMA and then across PCIe.
        let wire_only = model.rdma_cost_s(4096) + model.pcie_cost_s(4096);
        assert!(model.remote_prepare_time_s(1) > wire_only);
    }
}