feat: new router and benchmark setup
This commit is contained in:
@@ -301,8 +301,8 @@ impl ComputeModel {
|
||||
let attn_time = attn_total_flops / (self.gpu_flops * self.attention_util.max(1e-6));
|
||||
let compute_time = linear_time + attn_time + self.num_layers * self.misc_layer_overhead_s;
|
||||
// Weight stream: all layers' active weights read once from HBM.
|
||||
let mem_time =
|
||||
self.weight_bytes_per_layer * self.num_layers / (self.gpu_mem_bw * self.hbm_bw_util.max(1e-6));
|
||||
let mem_time = self.weight_bytes_per_layer * self.num_layers
|
||||
/ (self.gpu_mem_bw * self.hbm_bw_util.max(1e-6));
|
||||
let tp_comm_time = if self.tp_collective_count_per_layer > 0.0
|
||||
&& self.tp_bytes_per_token > 0.0
|
||||
&& self.intra_node_tp_bw > 0.0
|
||||
|
||||
@@ -144,17 +144,17 @@ impl Instance {
|
||||
if self.kv_blocks_used + front.reserved_blocks > self.hbm_block_budget {
|
||||
break;
|
||||
}
|
||||
let r = self.pending.pop_front().unwrap();
|
||||
self.kv_blocks_used += r.reserved_blocks;
|
||||
if r.prefill_tokens_remaining == 0 {
|
||||
// Full cache hit: nothing to compute. TTFT == fetch time.
|
||||
let t_done = now + r.completion_tail_s;
|
||||
let ttft = t_done - r.arrival;
|
||||
self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks);
|
||||
completed.push((r.req_id, ttft, t_done));
|
||||
} else {
|
||||
self.prefilling.push_back(r);
|
||||
}
|
||||
let r = self.pending.pop_front().unwrap();
|
||||
self.kv_blocks_used += r.reserved_blocks;
|
||||
if r.prefill_tokens_remaining == 0 {
|
||||
// Full cache hit: nothing to compute. TTFT == fetch time.
|
||||
let t_done = now + r.completion_tail_s;
|
||||
let ttft = t_done - r.arrival;
|
||||
self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks);
|
||||
completed.push((r.req_id, ttft, t_done));
|
||||
} else {
|
||||
self.prefilling.push_back(r);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Run one chunked-prefill step on the head of `prefilling`.
|
||||
|
||||
Reference in New Issue
Block a user