KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV cache-aware routing policies
in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache
hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention,
architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide
meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded,
least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score,
estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing,
built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation
tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 01:16:02 +08:00
commit ec73a95e05
52 changed files with 6005 additions and 0 deletions

102
src/trace.rs Normal file
View File

@@ -0,0 +1,102 @@
//! Streaming JSONL reader for the qwen-bailian trace format.
//!
//! Schema (per upstream README):
//! chat_id: i64
//! parent_chat_id: i64 (-1 = root)
//! timestamp: f64 (seconds since trace start)
//! input_length: i64
//! output_length: i64
//! type: string (text/search/image/file)
//! turn: i64
//! hash_ids: [i64] (16-token blocks, salted SipHash)
use anyhow::{Context, Result};
use serde::Deserialize;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
/// One trace line as deserialized from JSON, before conversion to
/// [`RequestRecord`].
///
/// Every field carries `#[serde(default)]`, so a key that is missing from the
/// input line falls back to the field type's `Default` value (zero or an
/// empty vector) rather than causing a deserialization error. Schema keys
/// that are not declared here (`parent_chat_id`, `type`, `turn`) are not
/// captured.
#[derive(Debug, Clone, Deserialize)]
struct RawRecord {
/// Conversation identifier (`chat_id` in the trace schema).
#[serde(default)]
chat_id: i64,
/// Request timestamp; the module-level schema describes it as seconds since
/// trace start.
#[serde(default)]
timestamp: f64,
/// Input length as stored in the trace (signed in the raw format).
#[serde(default)]
input_length: i64,
/// Output length as stored in the trace (signed in the raw format).
#[serde(default)]
output_length: i64,
/// Block hash ids as stored in the trace (signed in the raw format).
#[serde(default)]
hash_ids: Vec<i64>,
}
/// A single request yielded by [`TraceReader`], normalized from one trace line.
#[derive(Debug, Clone)]
pub struct RequestRecord {
/// Sequential id assigned by the reader, starting at 0 for the first
/// successfully parsed record.
pub req_id: u64,
/// `chat_id` copied verbatim from the trace line.
pub chat_id: i64,
/// Arrival time, copied from the trace `timestamp` field (seconds since
/// trace start according to the schema in the module docs).
pub arrival: f64,
/// Input length taken from `input_length`; negative values are clamped to 0.
pub input_len: u32,
/// Output length taken from `output_length`; negative values are clamped to 0.
pub output_len: u32,
/// Block hash ids taken from `hash_ids`, converted from `i64` to `u64`.
pub hash_ids: Vec<u64>,
}
/// Streaming reader that turns a JSONL trace file into [`RequestRecord`]s,
/// one line at a time.
pub struct TraceReader {
/// Buffered handle on the trace file.
inner: BufReader<File>,
/// Id to assign to the next successfully parsed record; also the number of
/// records yielded so far.
next_id: u64,
/// Scratch buffer reused for every line read from `inner`.
line_buf: String,
/// Optional cap on the number of records to yield; `None` means no cap.
max_requests: Option<u64>,
}
impl TraceReader {
    /// Opens the trace file at `path` for streaming.
    ///
    /// `max_requests` optionally limits how many records the iterator will
    /// produce; pass `None` to read until the end of the file.
    ///
    /// # Errors
    ///
    /// Returns an error, annotated with the offending path, if the file
    /// cannot be opened.
    pub fn open<P: AsRef<Path>>(path: P, max_requests: Option<u64>) -> Result<Self> {
        // Capacity of the buffered file reader (1 MiB).
        const READ_BUF_BYTES: usize = 1 << 20;
        // Initial capacity of the reusable per-line buffer.
        const LINE_BUF_BYTES: usize = 4096;

        let trace_path = path.as_ref();
        let file = File::open(trace_path)
            .with_context(|| format!("opening trace {}", trace_path.display()))?;

        let reader = Self {
            inner: BufReader::with_capacity(READ_BUF_BYTES, file),
            next_id: 0,
            line_buf: String::with_capacity(LINE_BUF_BYTES),
            max_requests,
        };
        Ok(reader)
    }
}
impl Iterator for TraceReader {
    type Item = Result<RequestRecord>;

    /// Yields the next request from the trace.
    ///
    /// Blank (whitespace-only) lines are skipped. Returns `None` at end of
    /// file or once `max_requests` records have been produced. A line that
    /// fails to read or parse is reported as `Some(Err(..))` without
    /// consuming a request id; a subsequent call attempts to read the next
    /// line.
    fn next(&mut self) -> Option<Self::Item> {
        // The cap counts successfully yielded records, which is exactly
        // what `next_id` tracks.
        if matches!(self.max_requests, Some(cap) if self.next_id >= cap) {
            return None;
        }

        loop {
            self.line_buf.clear();
            match self.inner.read_line(&mut self.line_buf) {
                // Zero bytes read means end of file.
                Ok(0) => return None,
                Ok(_) => {}
                Err(e) => return Some(Err(e.into())),
            }

            let line = self.line_buf.trim();
            if line.is_empty() {
                continue;
            }

            let raw: RawRecord = match serde_json::from_str(line) {
                Ok(record) => record,
                Err(e) => return Some(Err(anyhow::anyhow!("trace parse: {e}"))),
            };

            // Ids are only handed out to lines that parsed successfully.
            let req_id = self.next_id;
            self.next_id += 1;

            return Some(Ok(RequestRecord {
                req_id,
                chat_id: raw.chat_id,
                arrival: raw.timestamp,
                input_len: saturating_len(raw.input_length),
                output_len: saturating_len(raw.output_length),
                // i64 -> u64 is a same-width conversion, so each hash id
                // keeps its full bit pattern.
                hash_ids: raw.hash_ids.into_iter().map(|h| h as u64).collect(),
            }));
        }
    }
}

/// Converts a signed trace length to `u32`, saturating at both ends.
///
/// Negative values become 0 and values above `u32::MAX` become `u32::MAX`.
/// A bare `as u32` cast would instead wrap oversized values around to a
/// small, plausible-looking length.
fn saturating_len(len: i64) -> u32 {
    // After clamping the value is within `u32` range, so the cast is lossless.
    len.clamp(0, i64::from(u32::MAX)) as u32
}