feat: update ttft modeling and add cache affinity
This commit is contained in:
@@ -22,7 +22,7 @@
|
||||
//! `effective_ctx(N)` equals `N` for dense attention (→ O(N²) total) but
|
||||
//! is sub-linear for DSA / sliding-window.
|
||||
|
||||
use crate::config::{AttentionConfig, HardwareConfig, ModelConfig};
|
||||
use crate::config::{AttentionConfig, CalibrationConfig, HardwareConfig, ModelConfig};
|
||||
|
||||
/// Resolved attention pattern used at runtime.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -55,24 +55,46 @@ pub struct ComputeModel {
|
||||
pub attn_pattern: AttentionPattern,
|
||||
/// Weight bytes read from HBM per layer (for memory-bound check).
|
||||
pub weight_bytes_per_layer: f64,
|
||||
/// Approximate bytes moved by each TP collective, per token per layer.
|
||||
pub tp_bytes_per_token: f64,
|
||||
/// Number of TP collectives per layer on the critical path.
|
||||
pub tp_collective_count_per_layer: f64,
|
||||
/// Peak GPU FLOPs (aggregate across TP group).
|
||||
pub gpu_flops: f64,
|
||||
/// Peak GPU memory bandwidth (aggregate across TP group).
|
||||
pub gpu_mem_bw: f64,
|
||||
/// Peak node-local TP bandwidth.
|
||||
pub intra_node_tp_bw: f64,
|
||||
/// Fixed latency per TP collective.
|
||||
pub intra_node_tp_latency_s: f64,
|
||||
/// Effective utilization for GEMM-like linear kernels.
|
||||
pub matmul_util: f64,
|
||||
/// Effective utilization for attention kernels.
|
||||
pub attention_util: f64,
|
||||
/// Effective utilization for HBM streaming.
|
||||
pub hbm_bw_util: f64,
|
||||
/// Effective utilization for TP bandwidth.
|
||||
pub tp_bw_util: f64,
|
||||
/// Fraction of TP communication that can overlap with compute.
|
||||
pub tp_overlap_ratio: f64,
|
||||
/// Fixed per-layer non-FLOP overhead.
|
||||
pub misc_layer_overhead_s: f64,
|
||||
/// Fixed launch overhead per prefill chunk.
|
||||
pub chunk_launch_overhead_s: f64,
|
||||
}
|
||||
|
||||
impl ComputeModel {
|
||||
pub fn new(model: &ModelConfig, hw: &HardwareConfig) -> Self {
|
||||
pub fn new(model: &ModelConfig, hw: &HardwareConfig, calib: &CalibrationConfig) -> Self {
|
||||
if model.is_arch_mode() {
|
||||
Self::from_arch(model, hw)
|
||||
Self::from_arch(model, hw, calib)
|
||||
} else {
|
||||
Self::from_manual(model, hw)
|
||||
Self::from_manual(model, hw, calib)
|
||||
}
|
||||
}
|
||||
|
||||
// ----- Architecture-derived construction --------------------------------
|
||||
|
||||
fn from_arch(model: &ModelConfig, hw: &HardwareConfig) -> Self {
|
||||
fn from_arch(model: &ModelConfig, hw: &HardwareConfig, calib: &CalibrationConfig) -> Self {
|
||||
let h = model.hidden_size.unwrap() as f64;
|
||||
let n_heads = model.num_attention_heads.unwrap_or(model.num_kv_heads) as f64;
|
||||
let n_kv = model.num_kv_heads as f64;
|
||||
@@ -115,6 +137,11 @@ impl ComputeModel {
|
||||
};
|
||||
|
||||
let linear_flops = attn_linear + mlp;
|
||||
let tp_bytes_per_token = if hw.tp_degree > 1 {
|
||||
h * model.dtype_bytes as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
// --- Attention quadratic coefficient ---
|
||||
// attn_flops_per_layer(N) = attn_coeff * N * effective_ctx(N)
|
||||
@@ -183,14 +210,29 @@ impl ComputeModel {
|
||||
attn_coeff,
|
||||
attn_pattern,
|
||||
weight_bytes_per_layer: weight_bytes,
|
||||
tp_bytes_per_token,
|
||||
tp_collective_count_per_layer: if hw.tp_degree > 1 {
|
||||
calib.tp_collective_count_per_layer
|
||||
} else {
|
||||
0.0
|
||||
},
|
||||
gpu_flops: hw.gpu_flops,
|
||||
gpu_mem_bw: hw.gpu_mem_bw,
|
||||
intra_node_tp_bw: hw.intra_node_tp_bw,
|
||||
intra_node_tp_latency_s: hw.intra_node_tp_latency_us * 1e-6,
|
||||
matmul_util: calib.matmul_util,
|
||||
attention_util: calib.attention_util,
|
||||
hbm_bw_util: calib.hbm_bw_util,
|
||||
tp_bw_util: calib.tp_bw_util,
|
||||
tp_overlap_ratio: calib.tp_overlap_ratio,
|
||||
misc_layer_overhead_s: calib.misc_layer_overhead_us * 1e-6,
|
||||
chunk_launch_overhead_s: calib.chunk_launch_overhead_us * 1e-6,
|
||||
}
|
||||
}
|
||||
|
||||
// ----- Legacy manual construction ---------------------------------------
|
||||
|
||||
fn from_manual(model: &ModelConfig, hw: &HardwareConfig) -> Self {
|
||||
fn from_manual(model: &ModelConfig, hw: &HardwareConfig, calib: &CalibrationConfig) -> Self {
|
||||
Self {
|
||||
num_layers: model.num_layers as f64,
|
||||
first_dense_layers: model.num_layers as f64,
|
||||
@@ -198,8 +240,19 @@ impl ComputeModel {
|
||||
attn_coeff: model.attn_quadratic_coeff.unwrap_or(0.0),
|
||||
attn_pattern: AttentionPattern::Dense,
|
||||
weight_bytes_per_layer: 0.0,
|
||||
tp_bytes_per_token: 0.0,
|
||||
tp_collective_count_per_layer: 0.0,
|
||||
gpu_flops: hw.gpu_flops,
|
||||
gpu_mem_bw: hw.gpu_mem_bw,
|
||||
intra_node_tp_bw: hw.intra_node_tp_bw,
|
||||
intra_node_tp_latency_s: hw.intra_node_tp_latency_us * 1e-6,
|
||||
matmul_util: calib.matmul_util,
|
||||
attention_util: calib.attention_util,
|
||||
hbm_bw_util: calib.hbm_bw_util,
|
||||
tp_bw_util: calib.tp_bw_util,
|
||||
tp_overlap_ratio: calib.tp_overlap_ratio,
|
||||
misc_layer_overhead_s: calib.misc_layer_overhead_us * 1e-6,
|
||||
chunk_launch_overhead_s: calib.chunk_launch_overhead_us * 1e-6,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -232,23 +285,38 @@ impl ComputeModel {
|
||||
return 0.0;
|
||||
}
|
||||
let n = n as f64;
|
||||
let linear = n * self.linear_flops_per_token;
|
||||
let linear_flops = n * self.linear_flops_per_token;
|
||||
|
||||
// Compute FLOPs across all layers (dense + sparse may differ).
|
||||
let dense_layers = self.first_dense_layers;
|
||||
let sparse_layers = self.num_layers - dense_layers;
|
||||
|
||||
let dense_flops =
|
||||
dense_layers * (linear + self.attn_coeff * n * self.effective_ctx(n, true));
|
||||
let sparse_flops =
|
||||
sparse_layers * (linear + self.attn_coeff * n * self.effective_ctx(n, false));
|
||||
let total_flops = dense_flops + sparse_flops;
|
||||
let linear_total_flops = self.num_layers * linear_flops;
|
||||
let dense_attn_flops = dense_layers * (self.attn_coeff * n * self.effective_ctx(n, true));
|
||||
let sparse_attn_flops =
|
||||
sparse_layers * (self.attn_coeff * n * self.effective_ctx(n, false));
|
||||
let attn_total_flops = dense_attn_flops + sparse_attn_flops;
|
||||
|
||||
let compute_time = total_flops / self.gpu_flops;
|
||||
let linear_time = linear_total_flops / (self.gpu_flops * self.matmul_util.max(1e-6));
|
||||
let attn_time = attn_total_flops / (self.gpu_flops * self.attention_util.max(1e-6));
|
||||
let compute_time = linear_time + attn_time + self.num_layers * self.misc_layer_overhead_s;
|
||||
// Weight stream: all layers' active weights read once from HBM.
|
||||
let mem_time = self.weight_bytes_per_layer * self.num_layers / self.gpu_mem_bw;
|
||||
let mem_time =
|
||||
self.weight_bytes_per_layer * self.num_layers / (self.gpu_mem_bw * self.hbm_bw_util.max(1e-6));
|
||||
let tp_comm_time = if self.tp_collective_count_per_layer > 0.0
|
||||
&& self.tp_bytes_per_token > 0.0
|
||||
&& self.intra_node_tp_bw > 0.0
|
||||
{
|
||||
self.num_layers
|
||||
* (self.tp_collective_count_per_layer * self.intra_node_tp_latency_s
|
||||
+ self.tp_collective_count_per_layer * self.tp_bytes_per_token * n
|
||||
/ (self.intra_node_tp_bw * self.tp_bw_util.max(1e-6)))
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let tp_tail = (tp_comm_time - self.tp_overlap_ratio * (linear_time + attn_time)).max(0.0);
|
||||
|
||||
compute_time.max(mem_time)
|
||||
self.chunk_launch_overhead_s + compute_time.max(mem_time) + tp_tail
|
||||
}
|
||||
|
||||
/// Print human-readable derived coefficients (for `validate` output).
|
||||
@@ -277,6 +345,7 @@ impl ComputeModel {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::config::CalibrationConfig;
|
||||
|
||||
fn cm_legacy() -> ComputeModel {
|
||||
ComputeModel {
|
||||
@@ -286,8 +355,19 @@ mod tests {
|
||||
attn_coeff: 1024.0,
|
||||
attn_pattern: AttentionPattern::Dense,
|
||||
weight_bytes_per_layer: 0.0,
|
||||
tp_bytes_per_token: 0.0,
|
||||
tp_collective_count_per_layer: 0.0,
|
||||
gpu_flops: 9.89e14,
|
||||
gpu_mem_bw: 3.35e12,
|
||||
intra_node_tp_bw: 9.0e11,
|
||||
intra_node_tp_latency_s: 2.0e-6,
|
||||
matmul_util: 1.0,
|
||||
attention_util: 1.0,
|
||||
hbm_bw_util: 1.0,
|
||||
tp_bw_util: 1.0,
|
||||
tp_overlap_ratio: 1.0,
|
||||
misc_layer_overhead_s: 0.0,
|
||||
chunk_launch_overhead_s: 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -327,8 +407,19 @@ mod tests {
|
||||
attn_coeff: 139264.0,
|
||||
attn_pattern: AttentionPattern::Dense,
|
||||
weight_bytes_per_layer: 0.0,
|
||||
tp_bytes_per_token: 0.0,
|
||||
tp_collective_count_per_layer: 0.0,
|
||||
gpu_flops: 1.8e16,
|
||||
gpu_mem_bw: 6.4e13,
|
||||
intra_node_tp_bw: 9.0e11,
|
||||
intra_node_tp_latency_s: 2.0e-6,
|
||||
matmul_util: 1.0,
|
||||
attention_util: 1.0,
|
||||
hbm_bw_util: 1.0,
|
||||
tp_bw_util: 1.0,
|
||||
tp_overlap_ratio: 1.0,
|
||||
misc_layer_overhead_s: 0.0,
|
||||
chunk_launch_overhead_s: 0.0,
|
||||
};
|
||||
let dsa = ComputeModel {
|
||||
attn_pattern: AttentionPattern::Dsa {
|
||||
@@ -360,8 +451,19 @@ mod tests {
|
||||
attn_coeff: 1.0,
|
||||
attn_pattern: AttentionPattern::Dense,
|
||||
weight_bytes_per_layer: 1.0e12, // 1 TB per layer
|
||||
tp_bytes_per_token: 0.0,
|
||||
tp_collective_count_per_layer: 0.0,
|
||||
gpu_flops: 1.0e15,
|
||||
gpu_mem_bw: 1.0e12,
|
||||
intra_node_tp_bw: 9.0e11,
|
||||
intra_node_tp_latency_s: 2.0e-6,
|
||||
matmul_util: 1.0,
|
||||
attention_util: 1.0,
|
||||
hbm_bw_util: 1.0,
|
||||
tp_bw_util: 1.0,
|
||||
tp_overlap_ratio: 1.0,
|
||||
misc_layer_overhead_s: 0.0,
|
||||
chunk_launch_overhead_s: 0.0,
|
||||
};
|
||||
let t1 = m.prefill_time(1);
|
||||
let t8 = m.prefill_time(8);
|
||||
@@ -393,18 +495,122 @@ mod tests {
|
||||
gpu_mem_bw: 1e12,
|
||||
hbm_bytes: 1e9,
|
||||
dram_bytes: 4e9,
|
||||
host_dram_bw: 5.0e11,
|
||||
pcie_bw: 32e9,
|
||||
pcie_latency_us: 1.0,
|
||||
rdma_bw: 12e9,
|
||||
rdma_latency_us: 5.0,
|
||||
intra_node_tp_bw: 9.0e11,
|
||||
intra_node_tp_latency_us: 2.0,
|
||||
tp_degree: 1,
|
||||
max_batch_slots: 32,
|
||||
prefill_chunk_tokens: 1024,
|
||||
};
|
||||
let cm = ComputeModel::new(&model, &hw);
|
||||
let cm = ComputeModel::new(&model, &hw, &CalibrationConfig::default());
|
||||
assert!(cm.linear_flops_per_token > 0.0);
|
||||
assert!(cm.attn_coeff > 0.0);
|
||||
assert!(cm.weight_bytes_per_layer > 0.0);
|
||||
let t = cm.prefill_time(1024);
|
||||
assert!(t > 0.0);
|
||||
}
|
||||
|
||||
/// Checks that a calibration with `matmul_util: 0.2` and `attention_util: 0.15`
/// yields a strictly larger `prefill_time(4096)` than
/// `CalibrationConfig::default()` for the same model and hardware.
///
/// The hardware here uses `tp_degree: 1`; both compute models share `model`
/// and `hw`, so only the calibration differs between them.
#[test]
|
||||
fn lower_utilization_increases_prefill_time() {
|
||||
let model = ModelConfig {
|
||||
name: "test".into(),
|
||||
num_layers: 8,
|
||||
num_kv_heads: 4,
|
||||
head_dim: 128,
|
||||
dtype_bytes: 2,
|
||||
block_size_tokens: 16,
|
||||
hidden_size: Some(1024),
|
||||
num_attention_heads: Some(8),
|
||||
intermediate_size: Some(4096),
|
||||
..Default::default()
|
||||
};
|
||||
let hw = HardwareConfig {
|
||||
gpu_flops: 1e14,
|
||||
gpu_fp8_flops: 0.0,
|
||||
gpu_fp4_flops: 0.0,
|
||||
gpu_mem_bw: 1e12,
|
||||
hbm_bytes: 1e9,
|
||||
dram_bytes: 4e9,
|
||||
host_dram_bw: 5.0e11,
|
||||
pcie_bw: 32e9,
|
||||
pcie_latency_us: 1.0,
|
||||
rdma_bw: 12e9,
|
||||
rdma_latency_us: 5.0,
|
||||
intra_node_tp_bw: 9.0e11,
|
||||
intra_node_tp_latency_us: 2.0,
|
||||
tp_degree: 1,
|
||||
max_batch_slots: 32,
|
||||
prefill_chunk_tokens: 1024,
|
||||
};
|
||||
// Baseline: default calibration.
let fast = ComputeModel::new(&model, &hw, &CalibrationConfig::default());
|
||||
// Same model/hardware, but with explicit matmul and attention utilization;
// every other calibration field keeps its default.
// NOTE(review): the assertion below presumably relies on the default
// utilizations being higher than 0.2 / 0.15 — confirm against
// `CalibrationConfig::default()`.
let slow = ComputeModel::new(
|
||||
&model,
|
||||
&hw,
|
||||
&CalibrationConfig {
|
||||
matmul_util: 0.2,
|
||||
attention_util: 0.15,
|
||||
..CalibrationConfig::default()
|
||||
},
|
||||
);
|
||||
|
||||
assert!(slow.prefill_time(4096) > fast.prefill_time(4096));
|
||||
}
|
||||
|
||||
/// Checks that, on the same `tp_degree: 8` hardware, a calibration with
/// `tp_overlap_ratio: 0.0` and `tp_bw_util: 0.2` gives a strictly larger
/// `prefill_time(2048)` than one with `tp_overlap_ratio: 1.0` and
/// `tp_bw_util: 1.0`.
#[test]
|
||||
fn tp_communication_adds_tail_when_overlap_is_limited() {
|
||||
let model = ModelConfig {
|
||||
name: "test".into(),
|
||||
num_layers: 8,
|
||||
num_kv_heads: 4,
|
||||
head_dim: 128,
|
||||
dtype_bytes: 2,
|
||||
block_size_tokens: 16,
|
||||
hidden_size: Some(2048),
|
||||
num_attention_heads: Some(16),
|
||||
intermediate_size: Some(8192),
|
||||
..Default::default()
|
||||
};
|
||||
let hw = HardwareConfig {
|
||||
gpu_flops: 1e14,
|
||||
gpu_fp8_flops: 0.0,
|
||||
gpu_fp4_flops: 0.0,
|
||||
gpu_mem_bw: 1e12,
|
||||
hbm_bytes: 1e9,
|
||||
dram_bytes: 4e9,
|
||||
host_dram_bw: 5.0e11,
|
||||
pcie_bw: 32e9,
|
||||
pcie_latency_us: 1.0,
|
||||
rdma_bw: 12e9,
|
||||
rdma_latency_us: 5.0,
|
||||
intra_node_tp_bw: 1.0e10,
|
||||
intra_node_tp_latency_us: 20.0,
|
||||
tp_degree: 8,
|
||||
max_batch_slots: 32,
|
||||
prefill_chunk_tokens: 1024,
|
||||
};
|
||||
// Despite the name, TP is still configured for this model (it shares `hw`
// with `tp_degree: 8`); only the overlap ratio and TP bandwidth utilization
// differ from `tp_tail` below.
let no_tp = ComputeModel::new(
|
||||
&model,
|
||||
&hw,
|
||||
&CalibrationConfig {
|
||||
tp_overlap_ratio: 1.0,
|
||||
tp_bw_util: 1.0,
|
||||
..CalibrationConfig::default()
|
||||
},
|
||||
);
|
||||
// Zero overlap and reduced TP bandwidth utilization; all other calibration
// fields keep their defaults.
let tp_tail = ComputeModel::new(
|
||||
&model,
|
||||
&hw,
|
||||
&CalibrationConfig {
|
||||
tp_overlap_ratio: 0.0,
|
||||
tp_bw_util: 0.2,
|
||||
..CalibrationConfig::default()
|
||||
},
|
||||
);
|
||||
|
||||
assert!(tp_tail.prefill_time(2048) > no_tp.prefill_time(2048));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use crate::config::{HardwareConfig, ModelConfig};
|
||||
use crate::config::{CalibrationConfig, HardwareConfig, ModelConfig};
|
||||
use crate::instance::compute::ComputeModel;
|
||||
use crate::instance::kv_cache::TwoTierCache;
|
||||
use crate::network::InstanceLinks;
|
||||
@@ -37,6 +37,8 @@ pub struct AdmittedRequest {
|
||||
/// KV blocks reserved on this instance's HBM for the lifetime of this
|
||||
/// request's prefill (= number of input blocks).
|
||||
pub reserved_blocks: u32,
|
||||
/// Tail latency between prefill completion and first-token visibility.
|
||||
pub completion_tail_s: f64,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -68,15 +70,20 @@ pub struct Instance {
|
||||
}
|
||||
|
||||
impl Instance {
|
||||
pub fn new(id: InstanceId, model: &ModelConfig, hw: &HardwareConfig) -> Self {
|
||||
pub fn new(
|
||||
id: InstanceId,
|
||||
model: &ModelConfig,
|
||||
hw: &HardwareConfig,
|
||||
calib: &CalibrationConfig,
|
||||
) -> Self {
|
||||
let block_bytes = model.kv_block_bytes() as f64;
|
||||
let hbm_blocks = (hw.hbm_bytes / block_bytes).max(1.0) as u32;
|
||||
let dram_blocks = (hw.dram_bytes / block_bytes).max(1.0) as u32;
|
||||
Self {
|
||||
id,
|
||||
cache: TwoTierCache::new(hbm_blocks as usize, dram_blocks as usize),
|
||||
links: InstanceLinks::from_hw(hw),
|
||||
compute: ComputeModel::new(model, hw),
|
||||
links: InstanceLinks::from_hw(hw, calib),
|
||||
compute: ComputeModel::new(model, hw, calib),
|
||||
block_size_tokens: model.block_size_tokens,
|
||||
hbm_block_budget: hbm_blocks,
|
||||
dram_block_budget: dram_blocks,
|
||||
@@ -137,16 +144,17 @@ impl Instance {
|
||||
if self.kv_blocks_used + front.reserved_blocks > self.hbm_block_budget {
|
||||
break;
|
||||
}
|
||||
let r = self.pending.pop_front().unwrap();
|
||||
self.kv_blocks_used += r.reserved_blocks;
|
||||
if r.prefill_tokens_remaining == 0 {
|
||||
// Full cache hit: nothing to compute. TTFT == fetch time.
|
||||
let ttft = now - r.arrival;
|
||||
self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks);
|
||||
completed.push((r.req_id, ttft, now));
|
||||
} else {
|
||||
self.prefilling.push_back(r);
|
||||
}
|
||||
let r = self.pending.pop_front().unwrap();
|
||||
self.kv_blocks_used += r.reserved_blocks;
|
||||
if r.prefill_tokens_remaining == 0 {
|
||||
// Full cache hit: nothing to compute. TTFT == fetch time + completion tail.
|
||||
let t_done = now + r.completion_tail_s;
|
||||
let ttft = t_done - r.arrival;
|
||||
self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks);
|
||||
completed.push((r.req_id, ttft, t_done));
|
||||
} else {
|
||||
self.prefilling.push_back(r);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Run one chunked-prefill step on the head of `prefilling`.
|
||||
@@ -171,9 +179,10 @@ impl Instance {
|
||||
head.prefill_tokens_remaining -= chunk_tokens;
|
||||
if head.prefill_tokens_remaining == 0 {
|
||||
let done = self.prefilling.pop_front().unwrap();
|
||||
let ttft = t_end - done.arrival;
|
||||
let t_done = t_end + done.completion_tail_s;
|
||||
let ttft = t_done - done.arrival;
|
||||
self.kv_blocks_used = self.kv_blocks_used.saturating_sub(done.reserved_blocks);
|
||||
completed.push((done.req_id, ttft, t_end));
|
||||
completed.push((done.req_id, ttft, t_done));
|
||||
}
|
||||
|
||||
StepResult {
|
||||
|
||||
Reference in New Issue
Block a user