feat: update ttft modeling and add cache affinity

2026-04-15 19:08:10 +08:00
parent ff316c6873
commit a3f386c858
15 changed files with 1276 additions and 99 deletions
--- a/src/instance/compute.rs
+++ b/src/instance/compute.rs
@@ -22,7 +22,7 @@
 //! `effective_ctx(N)` equals `N` for dense attention (→ O(N²) total) but
 //! is sub-linear for DSA / sliding-window.

-use crate::config::{AttentionConfig, HardwareConfig, ModelConfig};
+use crate::config::{AttentionConfig, CalibrationConfig, HardwareConfig, ModelConfig};

 /// Resolved attention pattern used at runtime.
 #[derive(Debug, Clone)]
@@ -55,24 +55,46 @@ pub struct ComputeModel {
    pub attn_pattern: AttentionPattern,
    /// Weight bytes read from HBM per layer (for memory-bound check).
    pub weight_bytes_per_layer: f64,
+    /// Approximate bytes moved by each TP collective, per token per layer.
+    pub tp_bytes_per_token: f64,
+    /// Number of TP collectives per layer on the critical path.
+    pub tp_collective_count_per_layer: f64,
    /// Peak GPU FLOPs (aggregate across TP group).
    pub gpu_flops: f64,
    /// Peak GPU memory bandwidth (aggregate across TP group).
    pub gpu_mem_bw: f64,
+    /// Peak node-local TP bandwidth.
+    pub intra_node_tp_bw: f64,
+    /// Fixed latency per TP collective.
+    pub intra_node_tp_latency_s: f64,
+    /// Effective utilization for GEMM-like linear kernels.
+    pub matmul_util: f64,
+    /// Effective utilization for attention kernels.
+    pub attention_util: f64,
+    /// Effective utilization for HBM streaming.
+    pub hbm_bw_util: f64,
+    /// Effective utilization for TP bandwidth.
+    pub tp_bw_util: f64,
+    /// Fraction of compute time (linear + attention) that can hide TP communication.
+    pub tp_overlap_ratio: f64,
+    /// Fixed per-layer non-FLOP overhead.
+    pub misc_layer_overhead_s: f64,
+    /// Fixed launch overhead per prefill chunk.
+    pub chunk_launch_overhead_s: f64,
 }

 impl ComputeModel {
-    pub fn new(model: &ModelConfig, hw: &HardwareConfig) -> Self {
+    pub fn new(model: &ModelConfig, hw: &HardwareConfig, calib: &CalibrationConfig) -> Self {
        if model.is_arch_mode() {
-            Self::from_arch(model, hw)
+            Self::from_arch(model, hw, calib)
        } else {
-            Self::from_manual(model, hw)
+            Self::from_manual(model, hw, calib)
        }
    }

    // ----- Architecture-derived construction --------------------------------

-    fn from_arch(model: &ModelConfig, hw: &HardwareConfig) -> Self {
+    fn from_arch(model: &ModelConfig, hw: &HardwareConfig, calib: &CalibrationConfig) -> Self {
        let h = model.hidden_size.unwrap() as f64;
        let n_heads = model.num_attention_heads.unwrap_or(model.num_kv_heads) as f64;
        let n_kv = model.num_kv_heads as f64;
@@ -115,6 +137,11 @@ impl ComputeModel {
        };

        let linear_flops = attn_linear + mlp;
+        let tp_bytes_per_token = if hw.tp_degree > 1 {
+            h * model.dtype_bytes as f64
+        } else {
+            0.0
+        };

        // --- Attention quadratic coefficient ---
        // attn_flops_per_layer(N) = attn_coeff * N * effective_ctx(N)
@@ -183,14 +210,29 @@ impl ComputeModel {
            attn_coeff,
            attn_pattern,
            weight_bytes_per_layer: weight_bytes,
+            tp_bytes_per_token,
+            tp_collective_count_per_layer: if hw.tp_degree > 1 {
+                calib.tp_collective_count_per_layer
+            } else {
+                0.0
+            },
            gpu_flops: hw.gpu_flops,
            gpu_mem_bw: hw.gpu_mem_bw,
+            intra_node_tp_bw: hw.intra_node_tp_bw,
+            intra_node_tp_latency_s: hw.intra_node_tp_latency_us * 1e-6,
+            matmul_util: calib.matmul_util,
+            attention_util: calib.attention_util,
+            hbm_bw_util: calib.hbm_bw_util,
+            tp_bw_util: calib.tp_bw_util,
+            tp_overlap_ratio: calib.tp_overlap_ratio,
+            misc_layer_overhead_s: calib.misc_layer_overhead_us * 1e-6,
+            chunk_launch_overhead_s: calib.chunk_launch_overhead_us * 1e-6,
        }
    }

    // ----- Legacy manual construction ---------------------------------------

-    fn from_manual(model: &ModelConfig, hw: &HardwareConfig) -> Self {
+    fn from_manual(model: &ModelConfig, hw: &HardwareConfig, calib: &CalibrationConfig) -> Self {
        Self {
            num_layers: model.num_layers as f64,
            first_dense_layers: model.num_layers as f64,
@@ -198,8 +240,19 @@ impl ComputeModel {
            attn_coeff: model.attn_quadratic_coeff.unwrap_or(0.0),
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 0.0,
+            tp_bytes_per_token: 0.0,
+            tp_collective_count_per_layer: 0.0,
            gpu_flops: hw.gpu_flops,
            gpu_mem_bw: hw.gpu_mem_bw,
+            intra_node_tp_bw: hw.intra_node_tp_bw,
+            intra_node_tp_latency_s: hw.intra_node_tp_latency_us * 1e-6,
+            matmul_util: calib.matmul_util,
+            attention_util: calib.attention_util,
+            hbm_bw_util: calib.hbm_bw_util,
+            tp_bw_util: calib.tp_bw_util,
+            tp_overlap_ratio: calib.tp_overlap_ratio,
+            misc_layer_overhead_s: calib.misc_layer_overhead_us * 1e-6,
+            chunk_launch_overhead_s: calib.chunk_launch_overhead_us * 1e-6,
        }
    }

@@ -232,23 +285,38 @@ impl ComputeModel {
            return 0.0;
        }
        let n = n as f64;
-        let linear = n * self.linear_flops_per_token;
+        let linear_flops = n * self.linear_flops_per_token;

        // Compute FLOPs across all layers (dense + sparse may differ).
        let dense_layers = self.first_dense_layers;
        let sparse_layers = self.num_layers - dense_layers;

-        let dense_flops =
-            dense_layers * (linear + self.attn_coeff * n * self.effective_ctx(n, true));
-        let sparse_flops =
-            sparse_layers * (linear + self.attn_coeff * n * self.effective_ctx(n, false));
-        let total_flops = dense_flops + sparse_flops;
+        let linear_total_flops = self.num_layers * linear_flops;
+        let dense_attn_flops = dense_layers * (self.attn_coeff * n * self.effective_ctx(n, true));
+        let sparse_attn_flops =
+            sparse_layers * (self.attn_coeff * n * self.effective_ctx(n, false));
+        let attn_total_flops = dense_attn_flops + sparse_attn_flops;

-        let compute_time = total_flops / self.gpu_flops;
+        let linear_time = linear_total_flops / (self.gpu_flops * self.matmul_util.max(1e-6));
+        let attn_time = attn_total_flops / (self.gpu_flops * self.attention_util.max(1e-6));
+        let compute_time = linear_time + attn_time + self.num_layers * self.misc_layer_overhead_s;
        // Weight stream: all layers' active weights read once from HBM.
-        let mem_time = self.weight_bytes_per_layer * self.num_layers / self.gpu_mem_bw;
+        let mem_time =
+            self.weight_bytes_per_layer * self.num_layers / (self.gpu_mem_bw * self.hbm_bw_util.max(1e-6));
+        let tp_comm_time = if self.tp_collective_count_per_layer > 0.0
+            && self.tp_bytes_per_token > 0.0
+            && self.intra_node_tp_bw > 0.0
+        {
+            self.num_layers
+                * (self.tp_collective_count_per_layer * self.intra_node_tp_latency_s
+                    + self.tp_collective_count_per_layer * self.tp_bytes_per_token * n
+                        / (self.intra_node_tp_bw * self.tp_bw_util.max(1e-6)))
+        } else {
+            0.0
+        };
+        let tp_tail = (tp_comm_time - self.tp_overlap_ratio * (linear_time + attn_time)).max(0.0);

-        compute_time.max(mem_time)
+        self.chunk_launch_overhead_s + compute_time.max(mem_time) + tp_tail
    }

    /// Print human-readable derived coefficients (for `validate` output).
@@ -277,6 +345,7 @@ impl ComputeModel {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::config::CalibrationConfig;

    fn cm_legacy() -> ComputeModel {
        ComputeModel {
@@ -286,8 +355,19 @@ mod tests {
            attn_coeff: 1024.0,
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 0.0,
+            tp_bytes_per_token: 0.0,
+            tp_collective_count_per_layer: 0.0,
            gpu_flops: 9.89e14,
            gpu_mem_bw: 3.35e12,
+            intra_node_tp_bw: 9.0e11,
+            intra_node_tp_latency_s: 2.0e-6,
+            matmul_util: 1.0,
+            attention_util: 1.0,
+            hbm_bw_util: 1.0,
+            tp_bw_util: 1.0,
+            tp_overlap_ratio: 1.0,
+            misc_layer_overhead_s: 0.0,
+            chunk_launch_overhead_s: 0.0,
        }
    }

@@ -327,8 +407,19 @@ mod tests {
            attn_coeff: 139264.0,
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 0.0,
+            tp_bytes_per_token: 0.0,
+            tp_collective_count_per_layer: 0.0,
            gpu_flops: 1.8e16,
            gpu_mem_bw: 6.4e13,
+            intra_node_tp_bw: 9.0e11,
+            intra_node_tp_latency_s: 2.0e-6,
+            matmul_util: 1.0,
+            attention_util: 1.0,
+            hbm_bw_util: 1.0,
+            tp_bw_util: 1.0,
+            tp_overlap_ratio: 1.0,
+            misc_layer_overhead_s: 0.0,
+            chunk_launch_overhead_s: 0.0,
        };
        let dsa = ComputeModel {
            attn_pattern: AttentionPattern::Dsa {
@@ -360,8 +451,19 @@ mod tests {
            attn_coeff: 1.0,
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 1.0e12, // 1 TB per layer
+            tp_bytes_per_token: 0.0,
+            tp_collective_count_per_layer: 0.0,
            gpu_flops: 1.0e15,
            gpu_mem_bw: 1.0e12,
+            intra_node_tp_bw: 9.0e11,
+            intra_node_tp_latency_s: 2.0e-6,
+            matmul_util: 1.0,
+            attention_util: 1.0,
+            hbm_bw_util: 1.0,
+            tp_bw_util: 1.0,
+            tp_overlap_ratio: 1.0,
+            misc_layer_overhead_s: 0.0,
+            chunk_launch_overhead_s: 0.0,
        };
        let t1 = m.prefill_time(1);
        let t8 = m.prefill_time(8);
@@ -393,18 +495,122 @@ mod tests {
            gpu_mem_bw: 1e12,
            hbm_bytes: 1e9,
            dram_bytes: 4e9,
+            host_dram_bw: 5.0e11,
            pcie_bw: 32e9,
            pcie_latency_us: 1.0,
            rdma_bw: 12e9,
            rdma_latency_us: 5.0,
+            intra_node_tp_bw: 9.0e11,
+            intra_node_tp_latency_us: 2.0,
+            tp_degree: 1,
            max_batch_slots: 32,
            prefill_chunk_tokens: 1024,
        };
-        let cm = ComputeModel::new(&model, &hw);
+        let cm = ComputeModel::new(&model, &hw, &CalibrationConfig::default());
        assert!(cm.linear_flops_per_token > 0.0);
        assert!(cm.attn_coeff > 0.0);
        assert!(cm.weight_bytes_per_layer > 0.0);
        let t = cm.prefill_time(1024);
        assert!(t > 0.0);
    }
+
+    #[test]
+    fn lower_utilization_increases_prefill_time() {
+        let model = ModelConfig {
+            name: "test".into(),
+            num_layers: 8,
+            num_kv_heads: 4,
+            head_dim: 128,
+            dtype_bytes: 2,
+            block_size_tokens: 16,
+            hidden_size: Some(1024),
+            num_attention_heads: Some(8),
+            intermediate_size: Some(4096),
+            ..Default::default()
+        };
+        let hw = HardwareConfig {
+            gpu_flops: 1e14,
+            gpu_fp8_flops: 0.0,
+            gpu_fp4_flops: 0.0,
+            gpu_mem_bw: 1e12,
+            hbm_bytes: 1e9,
+            dram_bytes: 4e9,
+            host_dram_bw: 5.0e11,
+            pcie_bw: 32e9,
+            pcie_latency_us: 1.0,
+            rdma_bw: 12e9,
+            rdma_latency_us: 5.0,
+            intra_node_tp_bw: 9.0e11,
+            intra_node_tp_latency_us: 2.0,
+            tp_degree: 1,
+            max_batch_slots: 32,
+            prefill_chunk_tokens: 1024,
+        };
+        let fast = ComputeModel::new(&model, &hw, &CalibrationConfig::default());
+        let slow = ComputeModel::new(
+            &model,
+            &hw,
+            &CalibrationConfig {
+                matmul_util: 0.2,
+                attention_util: 0.15,
+                ..CalibrationConfig::default()
+            },
+        );
+
+        assert!(slow.prefill_time(4096) > fast.prefill_time(4096));
+    }
+
+    #[test]
+    fn tp_communication_adds_tail_when_overlap_is_limited() {
+        let model = ModelConfig {
+            name: "test".into(),
+            num_layers: 8,
+            num_kv_heads: 4,
+            head_dim: 128,
+            dtype_bytes: 2,
+            block_size_tokens: 16,
+            hidden_size: Some(2048),
+            num_attention_heads: Some(16),
+            intermediate_size: Some(8192),
+            ..Default::default()
+        };
+        let hw = HardwareConfig {
+            gpu_flops: 1e14,
+            gpu_fp8_flops: 0.0,
+            gpu_fp4_flops: 0.0,
+            gpu_mem_bw: 1e12,
+            hbm_bytes: 1e9,
+            dram_bytes: 4e9,
+            host_dram_bw: 5.0e11,
+            pcie_bw: 32e9,
+            pcie_latency_us: 1.0,
+            rdma_bw: 12e9,
+            rdma_latency_us: 5.0,
+            intra_node_tp_bw: 1.0e10,
+            intra_node_tp_latency_us: 20.0,
+            tp_degree: 8,
+            max_batch_slots: 32,
+            prefill_chunk_tokens: 1024,
+        };
+        let no_tp = ComputeModel::new(
+            &model,
+            &hw,
+            &CalibrationConfig {
+                tp_overlap_ratio: 1.0,
+                tp_bw_util: 1.0,
+                ..CalibrationConfig::default()
+            },
+        );
+        let tp_tail = ComputeModel::new(
+            &model,
+            &hw,
+            &CalibrationConfig {
+                tp_overlap_ratio: 0.0,
+                tp_bw_util: 0.2,
+                ..CalibrationConfig::default()
+            },
+        );
+
+        assert!(tp_tail.prefill_time(2048) > no_tp.prefill_time(2048));
+    }
 }
--- a/src/instance/instance.rs
+++ b/src/instance/instance.rs
@@ -19,7 +19,7 @@

 use std::collections::VecDeque;

-use crate::config::{HardwareConfig, ModelConfig};
+use crate::config::{CalibrationConfig, HardwareConfig, ModelConfig};
 use crate::instance::compute::ComputeModel;
 use crate::instance::kv_cache::TwoTierCache;
 use crate::network::InstanceLinks;
@@ -37,6 +37,8 @@ pub struct AdmittedRequest {
    /// KV blocks reserved on this instance's HBM for the lifetime of this
    /// request's prefill (= number of input blocks).
    pub reserved_blocks: u32,
+    /// Tail latency between prefill completion and first-token visibility.
+    pub completion_tail_s: f64,
 }

 #[derive(Debug)]
@@ -68,15 +70,20 @@ pub struct Instance {
 }

 impl Instance {
-    pub fn new(id: InstanceId, model: &ModelConfig, hw: &HardwareConfig) -> Self {
+    pub fn new(
+        id: InstanceId,
+        model: &ModelConfig,
+        hw: &HardwareConfig,
+        calib: &CalibrationConfig,
+    ) -> Self {
        let block_bytes = model.kv_block_bytes() as f64;
        let hbm_blocks = (hw.hbm_bytes / block_bytes).max(1.0) as u32;
        let dram_blocks = (hw.dram_bytes / block_bytes).max(1.0) as u32;
        Self {
            id,
            cache: TwoTierCache::new(hbm_blocks as usize, dram_blocks as usize),
-            links: InstanceLinks::from_hw(hw),
-            compute: ComputeModel::new(model, hw),
+            links: InstanceLinks::from_hw(hw, calib),
+            compute: ComputeModel::new(model, hw, calib),
            block_size_tokens: model.block_size_tokens,
            hbm_block_budget: hbm_blocks,
            dram_block_budget: dram_blocks,
@@ -137,16 +144,17 @@ impl Instance {
            if self.kv_blocks_used + front.reserved_blocks > self.hbm_block_budget {
                break;
            }
-            let r = self.pending.pop_front().unwrap();
-            self.kv_blocks_used += r.reserved_blocks;
-            if r.prefill_tokens_remaining == 0 {
-                // Full cache hit: nothing to compute. TTFT == fetch time.
-                let ttft = now - r.arrival;
-                self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks);
-                completed.push((r.req_id, ttft, now));
-            } else {
-                self.prefilling.push_back(r);
-            }
+                let r = self.pending.pop_front().unwrap();
+                self.kv_blocks_used += r.reserved_blocks;
+                if r.prefill_tokens_remaining == 0 {
+                    // Full cache hit: nothing to compute. TTFT == fetch time + completion tail.
+                    let t_done = now + r.completion_tail_s;
+                    let ttft = t_done - r.arrival;
+                    self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks);
+                    completed.push((r.req_id, ttft, t_done));
+                } else {
+                    self.prefilling.push_back(r);
+                }
        }

        // 2. Run one chunked-prefill step on the head of `prefilling`.
@@ -171,9 +179,10 @@ impl Instance {
        head.prefill_tokens_remaining -= chunk_tokens;
        if head.prefill_tokens_remaining == 0 {
            let done = self.prefilling.pop_front().unwrap();
-            let ttft = t_end - done.arrival;
+            let t_done = t_end + done.completion_tail_s;
+            let ttft = t_done - done.arrival;
            self.kv_blocks_used = self.kv_blocks_used.saturating_sub(done.reserved_blocks);
-            completed.push((done.req_id, ttft, t_end));
+            completed.push((done.req_id, ttft, t_done));
        }

        StepResult {