//! Time-to-first-token (TTFT) cost model: classifies a request's KV-cache
//! prefix by residency tier and prices the staging work for each tier.
use crate::cluster::meta_store::MetaStore;
|
|
use crate::config::{CalibrationConfig, HardwareConfig};
|
|
use crate::instance::Instance;
|
|
|
|
/// Per-request breakdown of how many prefix KV blocks are resident in each
/// cache tier, as computed by `classify_prefix_tiers`. All counts are in
/// KV blocks, and the four fields partition the request's prefix.
#[derive(Debug, Clone, Copy, Default)]
pub struct PrefixResidency {
    /// Leading run of blocks hit in the instance's L0 cache.
    pub l0_hit_blocks: u32,
    /// Following run of blocks hit in the instance's L1 cache.
    pub l1_hit_blocks: u32,
    /// Following run of blocks advertised by at least one *other* instance
    /// in the meta store.
    pub remote_hit_blocks: u32,
    /// Remaining blocks not resident in any tier.
    pub miss_blocks: u32,
}
/// Analytic TTFT cost model derived from hardware specs and calibration.
///
/// Built by [`TtftModel::new`]: bandwidth fields are pre-derated by their
/// calibration utilization factors (bytes/s), and every `*_s` field holds
/// seconds (converted from the configs' microsecond values).
#[derive(Debug, Clone)]
pub struct TtftModel {
    /// Size of one KV-cache block, in bytes.
    kv_block_bytes: u64,
    /// Effective host DRAM bandwidth, bytes/s (spec bw x `dram_bw_util`).
    host_dram_bw: f64,
    /// Effective PCIe bandwidth, bytes/s (spec bw x `pcie_bw_util`).
    pcie_bw: f64,
    /// Fixed PCIe latency, seconds.
    pcie_latency_s: f64,
    /// Effective RDMA bandwidth, bytes/s (spec bw x `rdma_bw_util`).
    rdma_bw: f64,
    /// Fixed RDMA latency, seconds.
    rdma_latency_s: f64,
    /// Base scheduler overhead, seconds.
    scheduler_base_s: f64,
    /// Scheduler cost per candidate considered, seconds.
    scheduler_per_candidate_s: f64,
    /// Cost of probing one cache tier, seconds.
    cache_probe_per_tier_s: f64,
    /// Batch packing overhead, seconds.
    batch_pack_s: f64,
    /// Host DRAM access latency, seconds.
    dram_access_s: f64,
    /// Remote-metadata lookup overhead, seconds.
    remote_metadata_s: f64,
    /// Fixed KV layout-transform cost, seconds.
    layout_transform_s: f64,
    /// Tail after prefill: final sync + first-token readiness, seconds.
    first_token_tail_s: f64,
}
impl TtftModel {
|
|
pub fn new(hw: &HardwareConfig, calib: &CalibrationConfig, kv_block_bytes: u64) -> Self {
|
|
Self {
|
|
kv_block_bytes,
|
|
host_dram_bw: hw.host_dram_bw * calib.dram_bw_util,
|
|
pcie_bw: hw.pcie_bw * calib.pcie_bw_util,
|
|
pcie_latency_s: hw.pcie_latency_us * 1e-6,
|
|
rdma_bw: hw.rdma_bw * calib.rdma_bw_util,
|
|
rdma_latency_s: hw.rdma_latency_us * 1e-6,
|
|
scheduler_base_s: calib.scheduler_base_overhead_us * 1e-6,
|
|
scheduler_per_candidate_s: calib.scheduler_per_candidate_us * 1e-6,
|
|
cache_probe_per_tier_s: calib.cache_probe_us_per_tier * 1e-6,
|
|
batch_pack_s: calib.batch_pack_overhead_us * 1e-6,
|
|
dram_access_s: calib.dram_access_latency_us * 1e-6,
|
|
remote_metadata_s: calib.remote_metadata_us * 1e-6,
|
|
layout_transform_s: calib.layout_transform_fixed_us * 1e-6,
|
|
first_token_tail_s: (calib.final_sync_us + calib.first_token_ready_us) * 1e-6,
|
|
}
|
|
}
|
|
|
|
pub fn scheduler_overhead_s(&self, num_candidates: usize, num_tiers: usize) -> f64 {
|
|
self.scheduler_base_s
|
|
+ self.scheduler_per_candidate_s * num_candidates as f64
|
|
+ self.cache_probe_per_tier_s * num_tiers as f64
|
|
+ self.batch_pack_s
|
|
}
|
|
|
|
pub fn first_token_tail_s(&self) -> f64 {
|
|
self.first_token_tail_s
|
|
}
|
|
|
|
pub fn block_bytes(&self, blocks: u32) -> u64 {
|
|
self.kv_block_bytes * blocks as u64
|
|
}
|
|
|
|
pub fn local_l1_prepare_time_s(&self, blocks: u32) -> f64 {
|
|
if blocks == 0 {
|
|
return 0.0;
|
|
}
|
|
let bytes = self.block_bytes(blocks);
|
|
self.dram_access_s
|
|
+ bytes as f64 / self.host_dram_bw.max(1.0)
|
|
+ self.pcie_cost_s(bytes)
|
|
+ self.layout_transform_s
|
|
}
|
|
|
|
pub fn remote_prepare_time_s(&self, blocks: u32) -> f64 {
|
|
if blocks == 0 {
|
|
return 0.0;
|
|
}
|
|
let bytes = self.block_bytes(blocks);
|
|
self.remote_metadata_s
|
|
+ self.rdma_cost_s(bytes)
|
|
+ self.pcie_cost_s(bytes)
|
|
+ self.layout_transform_s
|
|
}
|
|
|
|
pub fn pcie_cost_s(&self, bytes: u64) -> f64 {
|
|
if bytes == 0 {
|
|
self.pcie_latency_s
|
|
} else {
|
|
self.pcie_latency_s + bytes as f64 / self.pcie_bw.max(1.0)
|
|
}
|
|
}
|
|
|
|
pub fn rdma_cost_s(&self, bytes: u64) -> f64 {
|
|
if bytes == 0 {
|
|
self.rdma_latency_s
|
|
} else {
|
|
self.rdma_latency_s + bytes as f64 / self.rdma_bw.max(1.0)
|
|
}
|
|
}
|
|
|
|
pub fn kv_prepare_time_s(&self, residency: PrefixResidency) -> f64 {
|
|
self.local_l1_prepare_time_s(residency.l1_hit_blocks)
|
|
+ self.remote_prepare_time_s(residency.remote_hit_blocks)
|
|
}
|
|
}
|
|
|
|
pub fn classify_prefix_tiers(
|
|
req_hashes: &[u64],
|
|
inst: &Instance,
|
|
meta: &MetaStore,
|
|
now: f64,
|
|
) -> PrefixResidency {
|
|
let total_blocks = req_hashes.len() as u32;
|
|
let l0_hit_blocks = inst.cache.l0.longest_prefix_peek(req_hashes) as u32;
|
|
let suffix_after_l0 = &req_hashes[l0_hit_blocks as usize..];
|
|
|
|
let l1_hit_blocks = inst.cache.l1.longest_prefix_peek(suffix_after_l0) as u32;
|
|
let suffix_after_l1 = &suffix_after_l0[l1_hit_blocks as usize..];
|
|
|
|
let mut remote_hit_blocks = 0;
|
|
for &h in suffix_after_l1 {
|
|
let owners = meta.instances_for(h, now);
|
|
if owners.iter().any(|o| *o != inst.id) {
|
|
remote_hit_blocks += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
PrefixResidency {
|
|
l0_hit_blocks,
|
|
l1_hit_blocks,
|
|
remote_hit_blocks,
|
|
miss_blocks: total_blocks - l0_hit_blocks - l1_hit_blocks - remote_hit_blocks,
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{CalibrationConfig, HardwareConfig};

    /// A remote prepare must cost strictly more than the raw RDMA + PCIe
    /// transport alone, because it also pays metadata-lookup and
    /// layout-transform overheads.
    #[test]
    fn remote_prepare_includes_fixed_overheads() {
        let hw = HardwareConfig {
            gpu_flops: 1.0e14,
            gpu_fp8_flops: 0.0,
            gpu_fp4_flops: 0.0,
            gpu_mem_bw: 1.0e12,
            hbm_bytes: 1.0e9,
            dram_bytes: 4.0e9,
            host_dram_bw: 5.0e11,
            pcie_bw: 32.0e9,
            pcie_latency_us: 1.0,
            rdma_bw: 12.0e9,
            rdma_latency_us: 5.0,
            intra_node_tp_bw: 9.0e11,
            intra_node_tp_latency_us: 2.0,
            tp_degree: 1,
            max_batch_slots: 32,
            prefill_chunk_tokens: 1024,
        };
        let calib = CalibrationConfig {
            remote_metadata_us: 11.0,
            layout_transform_fixed_us: 7.0,
            ..CalibrationConfig::default()
        };
        let model = TtftModel::new(&hw, &calib, 4096);

        // One block = 4096 bytes; compare against transport-only cost.
        let transport = model.rdma_cost_s(4096) + model.pcie_cost_s(4096);
        assert!(model.remote_prepare_time_s(1) > transport);
    }
}