KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV cache-aware routing policies
in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache
hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention,
architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide
meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded,
least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score,
estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing,
built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation
tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 01:16:02 +08:00
commit ec73a95e05
52 changed files with 6005 additions and 0 deletions

7
src/metrics/mod.rs Normal file
View File

@@ -0,0 +1,7 @@
//! Metrics output for the simulator, split into one submodule per artifact:
//! per-request rows, a routing-decision log, a run-level summary, and
//! time-series samples.
pub mod per_request;
pub mod routing_log;
pub mod summary;
pub mod timeseries;
// Re-exported so callers can name these two types directly from `metrics`.
pub use per_request::PerRequestRow;
pub use summary::Summary;

View File

@@ -0,0 +1,42 @@
use anyhow::Result;
use serde::Serialize;
use std::path::Path;
/// Metrics recorded for a single request.
///
/// Rows are serialized by `PerRequestWriter::write` and aggregated by
/// `Summary::from_rows`.
// NOTE(review): the derived `Serialize` impl presumably emits fields in
// declaration order, so reordering fields would reorder output columns —
// confirm against the csv crate before rearranging.
#[derive(Debug, Clone, Serialize)]
pub struct PerRequestRow {
/// Request identifier.
pub req_id: u64,
/// Arrival time of the request (presumably simulated seconds — TODO confirm).
pub arrival: f64,
/// Latency sample aggregated into `Summary::ttft_*`
/// (presumably time-to-first-token — TODO confirm unit).
pub ttft: f64,
/// Latency sample aggregated into `Summary::e2e_*`
/// (presumably end-to-end latency — TODO confirm unit).
pub e2e: f64,
/// Instance identifier (presumably the instance that served the request).
pub instance: u32,
/// Total block count for this request; the denominator of the summary's
/// hit/miss rates.
pub total_blocks: u32,
/// Blocks counted toward `Summary::hit_rate_l0`.
pub l0_hit_blocks: u32,
/// Blocks counted toward `Summary::hit_rate_l1`.
pub l1_hit_blocks: u32,
/// Blocks counted toward `Summary::hit_rate_remote`.
pub remote_hit_blocks: u32,
/// Blocks counted toward `Summary::miss_rate`.
pub miss_blocks: u32,
/// Bytes summed into `Summary::total_rdma_bytes`.
pub rdma_bytes: u64,
/// Bytes summed into `Summary::total_pcie_bytes`.
pub pcie_bytes: u64,
/// Probe overhead (presumably seconds, per the `_s` suffix — TODO confirm).
pub probe_overhead_s: f64,
}
/// File-backed CSV sink for [`PerRequestRow`] records.
pub struct PerRequestWriter {
    inner: csv::Writer<std::fs::File>,
}

impl PerRequestWriter {
    /// Creates the file at `path` and wraps it in a CSV writer.
    ///
    /// # Errors
    /// Propagates the I/O error if the file cannot be created.
    pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = std::fs::File::create(path)?;
        Ok(Self {
            inner: csv::Writer::from_writer(file),
        })
    }

    /// Serializes `row` as the next record.
    ///
    /// # Errors
    /// Propagates any error reported by the CSV writer.
    pub fn write(&mut self, row: &PerRequestRow) -> Result<()> {
        Ok(self.inner.serialize(row)?)
    }

    /// Consumes the writer and flushes it.
    ///
    /// # Errors
    /// Propagates the I/O error if the flush fails.
    pub fn finish(mut self) -> Result<()> {
        Ok(self.inner.flush()?)
    }
}

View File

@@ -0,0 +1,29 @@
use anyhow::Result;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;
use crate::router::RouteDecision;
/// Buffered, file-backed log of routing decisions: one JSON document per line.
pub struct RoutingLogWriter {
    inner: BufWriter<File>,
}

impl RoutingLogWriter {
    /// Creates the log file at `path` behind a buffered writer.
    ///
    /// # Errors
    /// Propagates the I/O error if the file cannot be created.
    pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
        let inner = BufWriter::new(File::create(path)?);
        Ok(Self { inner })
    }

    /// Appends `decision` as a JSON line terminated by `\n`.
    ///
    /// The decision is fully serialized to a string before anything is
    /// written, so a serialization failure writes nothing.
    ///
    /// # Errors
    /// Propagates serialization and I/O errors.
    pub fn write(&mut self, decision: &RouteDecision) -> Result<()> {
        let json = serde_json::to_string(decision)?;
        for chunk in [json.as_bytes(), &b"\n"[..]] {
            self.inner.write_all(chunk)?;
        }
        Ok(())
    }

    /// Consumes the writer and flushes buffered bytes.
    ///
    /// # Errors
    /// Propagates the I/O error if the flush fails.
    pub fn finish(mut self) -> Result<()> {
        Ok(self.inner.flush()?)
    }
}

80
src/metrics/summary.rs Normal file
View File

@@ -0,0 +1,80 @@
use serde::Serialize;
use crate::metrics::per_request::PerRequestRow;
/// Run-level aggregate of per-request metrics, built by `Summary::from_rows`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct Summary {
/// Router label passed to `from_rows`.
pub router: String,
/// Number of rows aggregated.
pub num_requests: u64,
/// Duration passed to `from_rows` (presumably seconds, per the `_s` suffix).
pub sim_duration_s: f64,
/// `num_requests / sim_duration_s`, with the duration clamped to at least 1e-9.
pub throughput_req_per_s: f64,
/// Arithmetic mean of the rows' `ttft` values.
pub ttft_mean: f64,
/// Percentiles of `ttft`, picked from the sorted samples at index
/// `round((n - 1) * q)` for q = 0.50, 0.95, 0.99.
pub ttft_p50: f64,
pub ttft_p95: f64,
pub ttft_p99: f64,
/// Arithmetic mean of the rows' `e2e` values.
pub e2e_mean: f64,
/// Percentiles of `e2e`, selected the same way as the `ttft` percentiles.
pub e2e_p50: f64,
pub e2e_p95: f64,
pub e2e_p99: f64,
/// Sum of `total_blocks` over all rows.
pub total_blocks: u64,
/// Each rate below is the summed per-row block count for that category
/// divided by `max(total_blocks, 1)`.
pub hit_rate_l0: f64,
pub hit_rate_l1: f64,
pub hit_rate_remote: f64,
pub miss_rate: f64,
/// Sum of `rdma_bytes` over all rows.
pub total_rdma_bytes: u64,
/// Sum of `pcie_bytes` over all rows.
pub total_pcie_bytes: u64,
}
impl Summary {
    /// Aggregates per-request rows into a run-level summary labelled `router`.
    ///
    /// * Latency percentiles are picked from the ascending-sorted samples at
    ///   index `round((n - 1) * q)`.
    /// * Hit/miss rates are summed block counts divided by the summed
    ///   `total_blocks`, with the denominator clamped to at least 1.
    /// * Throughput is `rows.len() / sim_duration_s`, with the duration
    ///   clamped to at least `1e-9`.
    ///
    /// With no rows, every aggregate stays at its default (zero) while
    /// `router` and `sim_duration_s` are still recorded.
    ///
    /// NaN latency samples do not panic: samples are ordered with
    /// [`f64::total_cmp`], which places positive NaN after every number.
    /// The affected mean (and possibly the upper percentiles) is then NaN.
    pub fn from_rows(router: &str, rows: &[PerRequestRow], sim_duration_s: f64) -> Self {
        // Copies one latency column out of the rows and sorts it ascending.
        fn sorted_column(
            rows: &[PerRequestRow],
            field: impl Fn(&PerRequestRow) -> f64,
        ) -> Vec<f64> {
            let mut values: Vec<f64> = rows.iter().map(field).collect();
            // A total order over f64: unlike `partial_cmp(..).unwrap()`, a NaN
            // sample cannot abort the whole summary.
            values.sort_by(f64::total_cmp);
            values
        }

        // Nearest-rank percentile; `sorted` must be non-empty.
        fn percentile(sorted: &[f64], q: f64) -> f64 {
            let last = sorted.len() - 1;
            let idx = (last as f64 * q).round() as usize;
            sorted[idx.min(last)]
        }

        // Arithmetic mean; `values` must be non-empty.
        fn mean(values: &[f64]) -> f64 {
            values.iter().sum::<f64>() / values.len() as f64
        }

        // Sums one `u32` block counter across all rows, widened to `u64`.
        fn sum_blocks(rows: &[PerRequestRow], field: impl Fn(&PerRequestRow) -> u32) -> u64 {
            rows.iter().map(|r| u64::from(field(r))).sum()
        }

        if rows.is_empty() {
            return Summary {
                router: router.to_string(),
                // Keep the caller's duration even when nothing was served.
                sim_duration_s,
                ..Default::default()
            };
        }

        let ttfts = sorted_column(rows, |r| r.ttft);
        let e2es = sorted_column(rows, |r| r.e2e);

        let total_blocks = sum_blocks(rows, |r| r.total_blocks);
        let l0 = sum_blocks(rows, |r| r.l0_hit_blocks);
        let l1 = sum_blocks(rows, |r| r.l1_hit_blocks);
        let remote = sum_blocks(rows, |r| r.remote_hit_blocks);
        let miss = sum_blocks(rows, |r| r.miss_blocks);
        // Avoid 0/0 when every request had zero blocks.
        let denom = total_blocks.max(1) as f64;

        Summary {
            router: router.to_string(),
            num_requests: rows.len() as u64,
            sim_duration_s,
            throughput_req_per_s: rows.len() as f64 / sim_duration_s.max(1e-9),
            ttft_mean: mean(&ttfts),
            ttft_p50: percentile(&ttfts, 0.50),
            ttft_p95: percentile(&ttfts, 0.95),
            ttft_p99: percentile(&ttfts, 0.99),
            e2e_mean: mean(&e2es),
            e2e_p50: percentile(&e2es, 0.50),
            e2e_p95: percentile(&e2es, 0.95),
            e2e_p99: percentile(&e2es, 0.99),
            total_blocks,
            hit_rate_l0: l0 as f64 / denom,
            hit_rate_l1: l1 as f64 / denom,
            hit_rate_remote: remote as f64 / denom,
            miss_rate: miss as f64 / denom,
            total_rdma_bytes: rows.iter().map(|r| r.rdma_bytes).sum(),
            total_pcie_bytes: rows.iter().map(|r| r.pcie_bytes).sum(),
        }
    }
}

34
src/metrics/timeseries.rs Normal file
View File

@@ -0,0 +1,34 @@
use anyhow::Result;
use serde::Serialize;
use std::path::Path;
/// One time-series sample for a single instance, serialized by
/// `TimeseriesWriter::write`.
// NOTE(review): the derived `Serialize` impl presumably emits fields in
// declaration order, so reordering fields would reorder output columns —
// confirm against the csv crate before rearranging.
#[derive(Debug, Clone, Serialize)]
pub struct TimeseriesRow {
/// Sample timestamp (presumably simulated seconds — TODO confirm).
pub t: f64,
/// Instance identifier.
pub instance: u32,
/// Queue length at the sample time (presumably pending requests — TODO confirm).
pub queue_len: u32,
/// KV blocks in use (which cache tier is counted is not visible here — TODO confirm).
pub kv_blocks_used: u32,
/// KV block capacity paired with `kv_blocks_used`.
pub kv_blocks_total: u32,
/// Busy indicator stored as an integer (presumably 0 = idle, 1 = busy — TODO confirm).
pub busy: u8,
}
/// File-backed CSV sink for [`TimeseriesRow`] samples.
pub struct TimeseriesWriter {
    inner: csv::Writer<std::fs::File>,
}

impl TimeseriesWriter {
    /// Creates the file at `path` and wraps it in a CSV writer.
    ///
    /// # Errors
    /// Propagates the I/O error if the file cannot be created.
    pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = std::fs::File::create(path)?;
        let inner = csv::Writer::from_writer(file);
        Ok(Self { inner })
    }

    /// Serializes `row` as the next record.
    ///
    /// # Errors
    /// Propagates any error reported by the CSV writer.
    pub fn write(&mut self, row: &TimeseriesRow) -> Result<()> {
        Ok(self.inner.serialize(row)?)
    }

    /// Consumes the writer and flushes it.
    ///
    /// # Errors
    /// Propagates the I/O error if the flush fails.
    pub fn finish(mut self) -> Result<()> {
        Ok(self.inner.flush()?)
    }
}