KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
7
src/metrics/mod.rs
Normal file
7
src/metrics/mod.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
pub mod per_request;
|
||||
pub mod routing_log;
|
||||
pub mod summary;
|
||||
pub mod timeseries;
|
||||
|
||||
pub use per_request::PerRequestRow;
|
||||
pub use summary::Summary;
|
||||
42
src/metrics/per_request.rs
Normal file
42
src/metrics/per_request.rs
Normal file
@@ -0,0 +1,42 @@
|
||||
use anyhow::Result;
|
||||
use serde::Serialize;
|
||||
use std::path::Path;
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct PerRequestRow {
|
||||
pub req_id: u64,
|
||||
pub arrival: f64,
|
||||
pub ttft: f64,
|
||||
pub e2e: f64,
|
||||
pub instance: u32,
|
||||
pub total_blocks: u32,
|
||||
pub l0_hit_blocks: u32,
|
||||
pub l1_hit_blocks: u32,
|
||||
pub remote_hit_blocks: u32,
|
||||
pub miss_blocks: u32,
|
||||
pub rdma_bytes: u64,
|
||||
pub pcie_bytes: u64,
|
||||
pub probe_overhead_s: f64,
|
||||
}
|
||||
|
||||
pub struct PerRequestWriter {
|
||||
inner: csv::Writer<std::fs::File>,
|
||||
}
|
||||
|
||||
impl PerRequestWriter {
|
||||
pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
let f = std::fs::File::create(path)?;
|
||||
let inner = csv::Writer::from_writer(f);
|
||||
Ok(Self { inner })
|
||||
}
|
||||
|
||||
pub fn write(&mut self, row: &PerRequestRow) -> Result<()> {
|
||||
self.inner.serialize(row)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> Result<()> {
|
||||
self.inner.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
29
src/metrics/routing_log.rs
Normal file
29
src/metrics/routing_log.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use anyhow::Result;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::router::RouteDecision;
|
||||
|
||||
pub struct RoutingLogWriter {
|
||||
inner: BufWriter<File>,
|
||||
}
|
||||
|
||||
impl RoutingLogWriter {
|
||||
pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
let f = File::create(path)?;
|
||||
Ok(Self { inner: BufWriter::new(f) })
|
||||
}
|
||||
|
||||
pub fn write(&mut self, decision: &RouteDecision) -> Result<()> {
|
||||
let line = serde_json::to_string(decision)?;
|
||||
self.inner.write_all(line.as_bytes())?;
|
||||
self.inner.write_all(b"\n")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> Result<()> {
|
||||
self.inner.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
80
src/metrics/summary.rs
Normal file
80
src/metrics/summary.rs
Normal file
@@ -0,0 +1,80 @@
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::metrics::per_request::PerRequestRow;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Default)]
|
||||
pub struct Summary {
|
||||
pub router: String,
|
||||
pub num_requests: u64,
|
||||
pub sim_duration_s: f64,
|
||||
pub throughput_req_per_s: f64,
|
||||
pub ttft_mean: f64,
|
||||
pub ttft_p50: f64,
|
||||
pub ttft_p95: f64,
|
||||
pub ttft_p99: f64,
|
||||
pub e2e_mean: f64,
|
||||
pub e2e_p50: f64,
|
||||
pub e2e_p95: f64,
|
||||
pub e2e_p99: f64,
|
||||
pub total_blocks: u64,
|
||||
pub hit_rate_l0: f64,
|
||||
pub hit_rate_l1: f64,
|
||||
pub hit_rate_remote: f64,
|
||||
pub miss_rate: f64,
|
||||
pub total_rdma_bytes: u64,
|
||||
pub total_pcie_bytes: u64,
|
||||
}
|
||||
|
||||
impl Summary {
|
||||
pub fn from_rows(router: &str, rows: &[PerRequestRow], sim_duration_s: f64) -> Self {
|
||||
if rows.is_empty() {
|
||||
return Summary {
|
||||
router: router.to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
}
|
||||
let mut ttfts: Vec<f64> = rows.iter().map(|r| r.ttft).collect();
|
||||
let mut e2es: Vec<f64> = rows.iter().map(|r| r.e2e).collect();
|
||||
ttfts.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||||
e2es.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||||
let pct = |v: &[f64], q: f64| -> f64 {
|
||||
let n = v.len();
|
||||
let idx = ((n as f64 - 1.0) * q).round() as usize;
|
||||
v[idx.min(n - 1)]
|
||||
};
|
||||
let mean = |v: &[f64]| -> f64 {
|
||||
if v.is_empty() {
|
||||
0.0
|
||||
} else {
|
||||
v.iter().sum::<f64>() / v.len() as f64
|
||||
}
|
||||
};
|
||||
let total_blocks: u64 = rows.iter().map(|r| r.total_blocks as u64).sum();
|
||||
let l0: u64 = rows.iter().map(|r| r.l0_hit_blocks as u64).sum();
|
||||
let l1: u64 = rows.iter().map(|r| r.l1_hit_blocks as u64).sum();
|
||||
let remote: u64 = rows.iter().map(|r| r.remote_hit_blocks as u64).sum();
|
||||
let miss: u64 = rows.iter().map(|r| r.miss_blocks as u64).sum();
|
||||
let denom = total_blocks.max(1) as f64;
|
||||
Summary {
|
||||
router: router.to_string(),
|
||||
num_requests: rows.len() as u64,
|
||||
sim_duration_s,
|
||||
throughput_req_per_s: rows.len() as f64 / sim_duration_s.max(1e-9),
|
||||
ttft_mean: mean(&ttfts),
|
||||
ttft_p50: pct(&ttfts, 0.50),
|
||||
ttft_p95: pct(&ttfts, 0.95),
|
||||
ttft_p99: pct(&ttfts, 0.99),
|
||||
e2e_mean: mean(&e2es),
|
||||
e2e_p50: pct(&e2es, 0.50),
|
||||
e2e_p95: pct(&e2es, 0.95),
|
||||
e2e_p99: pct(&e2es, 0.99),
|
||||
total_blocks,
|
||||
hit_rate_l0: l0 as f64 / denom,
|
||||
hit_rate_l1: l1 as f64 / denom,
|
||||
hit_rate_remote: remote as f64 / denom,
|
||||
miss_rate: miss as f64 / denom,
|
||||
total_rdma_bytes: rows.iter().map(|r| r.rdma_bytes).sum(),
|
||||
total_pcie_bytes: rows.iter().map(|r| r.pcie_bytes).sum(),
|
||||
}
|
||||
}
|
||||
}
|
||||
34
src/metrics/timeseries.rs
Normal file
34
src/metrics/timeseries.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
use anyhow::Result;
|
||||
use serde::Serialize;
|
||||
use std::path::Path;
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct TimeseriesRow {
|
||||
pub t: f64,
|
||||
pub instance: u32,
|
||||
pub queue_len: u32,
|
||||
pub kv_blocks_used: u32,
|
||||
pub kv_blocks_total: u32,
|
||||
pub busy: u8,
|
||||
}
|
||||
|
||||
pub struct TimeseriesWriter {
|
||||
inner: csv::Writer<std::fs::File>,
|
||||
}
|
||||
|
||||
impl TimeseriesWriter {
|
||||
pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
let f = std::fs::File::create(path)?;
|
||||
Ok(Self { inner: csv::Writer::from_writer(f) })
|
||||
}
|
||||
|
||||
pub fn write(&mut self, row: &TimeseriesRow) -> Result<()> {
|
||||
self.inner.serialize(row)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> Result<()> {
|
||||
self.inner.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user