feat: new router and benchmark setup

This commit is contained in:
2026-04-16 14:23:53 +08:00
parent c86d931d8f
commit 996511f300
35 changed files with 1480 additions and 76 deletions

View File

@@ -301,8 +301,8 @@ impl ComputeModel {
let attn_time = attn_total_flops / (self.gpu_flops * self.attention_util.max(1e-6));
let compute_time = linear_time + attn_time + self.num_layers * self.misc_layer_overhead_s;
// Weight stream: all layers' active weights read once from HBM.
let mem_time =
self.weight_bytes_per_layer * self.num_layers / (self.gpu_mem_bw * self.hbm_bw_util.max(1e-6));
let mem_time = self.weight_bytes_per_layer * self.num_layers
/ (self.gpu_mem_bw * self.hbm_bw_util.max(1e-6));
let tp_comm_time = if self.tp_collective_count_per_layer > 0.0
&& self.tp_bytes_per_token > 0.0
&& self.intra_node_tp_bw > 0.0

View File

@@ -144,17 +144,17 @@ impl Instance {
if self.kv_blocks_used + front.reserved_blocks > self.hbm_block_budget {
break;
}
let r = self.pending.pop_front().unwrap();
self.kv_blocks_used += r.reserved_blocks;
if r.prefill_tokens_remaining == 0 {
// Full cache hit: nothing to compute. TTFT == fetch time.
let t_done = now + r.completion_tail_s;
let ttft = t_done - r.arrival;
self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks);
completed.push((r.req_id, ttft, t_done));
} else {
self.prefilling.push_back(r);
}
let r = self.pending.pop_front().unwrap();
self.kv_blocks_used += r.reserved_blocks;
if r.prefill_tokens_remaining == 0 {
// Full cache hit: nothing to compute. TTFT == fetch time.
let t_done = now + r.completion_tail_s;
let ttft = t_done - r.arrival;
self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks);
completed.push((r.req_id, ttft, t_done));
} else {
self.prefilling.push_back(r);
}
}
// 2. Run one chunked-prefill step on the head of `prefilling`.