KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
102
src/trace.rs
Normal file
102
src/trace.rs
Normal file
@@ -0,0 +1,102 @@
|
||||
//! Streaming JSONL reader for the qwen-bailian trace format.
|
||||
//!
|
||||
//! Schema (per upstream README):
|
||||
//! chat_id: i64
|
||||
//! parent_chat_id: i64 (-1 = root)
|
||||
//! timestamp: f64 (seconds since trace start)
|
||||
//! input_length: i64
|
||||
//! output_length: i64
|
||||
//! type: string (text/search/image/file)
|
||||
//! turn: i64
|
||||
//! hash_ids: [i64] (16-token blocks, salted SipHash)
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde::Deserialize;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::path::Path;
|
||||
|
||||
/// One line of the qwen-bailian JSONL trace exactly as it appears on disk.
///
/// Only the fields the simulator consumes are declared; the remaining
/// schema fields (`parent_chat_id`, `type`, `turn`) are ignored by serde's
/// default unknown-field handling. Every field carries `#[serde(default)]`
/// so partially-populated records still deserialize instead of erroring.
#[derive(Debug, Clone, Deserialize)]
struct RawRecord {
    #[serde(default)]
    chat_id: i64,
    // Seconds since trace start (per the schema in the module docs).
    #[serde(default)]
    timestamp: f64,
    // i64 in the trace; clamped and narrowed to u32 during conversion.
    #[serde(default)]
    input_length: i64,
    #[serde(default)]
    output_length: i64,
    // 16-token block hashes; may legitimately be empty.
    #[serde(default)]
    hash_ids: Vec<i64>,
}
|
||||
|
||||
/// A normalized trace record handed to the rest of the simulator.
///
/// Produced from [`RawRecord`] by the [`TraceReader`] iterator: lengths are
/// clamped to non-negative and narrowed to `u32`, and hash ids are
/// reinterpreted bit-for-bit as `u64`.
#[derive(Debug, Clone)]
pub struct RequestRecord {
    /// Sequential id assigned by the reader (0-based, in file order).
    pub req_id: u64,
    /// Conversation id copied verbatim from the trace.
    pub chat_id: i64,
    /// Arrival time in seconds since trace start.
    pub arrival: f64,
    /// Prompt length in tokens (negative trace values clamped to 0).
    pub input_len: u32,
    /// Completion length in tokens (negative trace values clamped to 0).
    pub output_len: u32,
    /// Prefix block hashes (16-token blocks), cast from the trace's i64.
    pub hash_ids: Vec<u64>,
}
|
||||
|
||||
/// Streaming iterator over a JSONL trace file.
///
/// Reads one line at a time through a 1 MiB buffered reader and reuses a
/// single line buffer, so arbitrarily large traces are processed in
/// constant memory.
pub struct TraceReader {
    inner: BufReader<File>,
    /// Next `req_id` to assign; doubles as a count of records emitted.
    next_id: u64,
    /// Reusable scratch for `read_line` (avoids a per-line allocation).
    line_buf: String,
    /// If `Some(n)`, the iterator stops after emitting `n` records.
    max_requests: Option<u64>,
}
|
||||
|
||||
impl TraceReader {
|
||||
pub fn open<P: AsRef<Path>>(path: P, max_requests: Option<u64>) -> Result<Self> {
|
||||
let path = path.as_ref();
|
||||
let f = File::open(path)
|
||||
.with_context(|| format!("opening trace {}", path.display()))?;
|
||||
Ok(Self {
|
||||
inner: BufReader::with_capacity(1 << 20, f),
|
||||
next_id: 0,
|
||||
line_buf: String::with_capacity(4096),
|
||||
max_requests,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for TraceReader {
|
||||
type Item = Result<RequestRecord>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if let Some(cap) = self.max_requests {
|
||||
if self.next_id >= cap {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
loop {
|
||||
self.line_buf.clear();
|
||||
match self.inner.read_line(&mut self.line_buf) {
|
||||
Ok(0) => return None,
|
||||
Ok(_) => {
|
||||
let trimmed = self.line_buf.trim();
|
||||
if trimmed.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let parsed: Result<RawRecord, _> = serde_json::from_str(trimmed);
|
||||
let raw = match parsed {
|
||||
Ok(r) => r,
|
||||
Err(e) => return Some(Err(anyhow::anyhow!("trace parse: {e}"))),
|
||||
};
|
||||
let id = self.next_id;
|
||||
self.next_id += 1;
|
||||
return Some(Ok(RequestRecord {
|
||||
req_id: id,
|
||||
chat_id: raw.chat_id,
|
||||
arrival: raw.timestamp,
|
||||
input_len: raw.input_length.max(0) as u32,
|
||||
output_len: raw.output_length.max(0) as u32,
|
||||
hash_ids: raw.hash_ids.into_iter().map(|h| h as u64).collect(),
|
||||
}));
|
||||
}
|
||||
Err(e) => return Some(Err(e.into())),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user