KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 01:16:02 +08:00
commit ec73a95e05
52 changed files with 6005 additions and 0 deletions
--- a/src/trace.rs
+++ b/src/trace.rs
@@ -0,0 +1,102 @@
+//! Streaming JSONL reader for the qwen-bailian trace format.
+//!
+//! Schema (per upstream README):
+//!   chat_id: i64
+//!   parent_chat_id: i64           (-1 = root)
+//!   timestamp: f64                (seconds since trace start)
+//!   input_length: i64
+//!   output_length: i64
+//!   type: string                  (text/search/image/file)
+//!   turn: i64
+//!   hash_ids: [i64]               (16-token blocks, salted SipHash)
+
+use anyhow::{Context, Result};
+use serde::Deserialize;
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+use std::path::Path;
+
+/// One JSONL row exactly as it appears in the trace file.
+///
+/// The container-level `#[serde(default)]` fills any key missing from a row
+/// with the value from `RawRecord::default()` (zero for the numeric fields,
+/// an empty vector for `hash_ids`) instead of failing the parse. Keys present
+/// in a row but not declared here are not captured by this struct.
+#[derive(Debug, Clone, Default, Deserialize)]
+#[serde(default)]
+struct RawRecord {
+    /// Trace `chat_id` field.
+    chat_id: i64,
+    /// Trace `timestamp` field.
+    timestamp: f64,
+    /// Trace `input_length` field.
+    input_length: i64,
+    /// Trace `output_length` field.
+    output_length: i64,
+    /// Trace `hash_ids` field.
+    hash_ids: Vec<i64>,
+}
+
+/// A single request produced by [`TraceReader`], converted from one trace row.
+#[derive(Debug, Clone)]
+pub struct RequestRecord {
+    /// Sequential id assigned by the reader, starting at 0 and advancing by
+    /// one for every successfully parsed row.
+    pub req_id: u64,
+    /// The row's `chat_id`, copied through unchanged.
+    pub chat_id: i64,
+    /// The row's `timestamp`, copied through unchanged.
+    pub arrival: f64,
+    /// The row's `input_length`; negative trace values are stored as 0.
+    pub input_len: u32,
+    /// The row's `output_length`; negative trace values are stored as 0.
+    pub output_len: u32,
+    /// The row's `hash_ids`, each converted from `i64` to `u64` with an `as` cast.
+    pub hash_ids: Vec<u64>,
+}
+
+/// Line-by-line reader over a JSONL trace file.
+///
+/// Implements `Iterator<Item = Result<RequestRecord>>`.
+pub struct TraceReader {
+    /// Buffered handle on the trace file.
+    inner: BufReader<File>,
+    /// Id that will be given to the next record; equals the number of records
+    /// produced so far.
+    next_id: u64,
+    /// Scratch buffer for the current line, cleared and reused on every read.
+    line_buf: String,
+    /// Optional cap: iteration ends once `next_id` reaches this value.
+    max_requests: Option<u64>,
+}
+
+impl TraceReader {
+    /// Opens the trace file at `path` for streaming.
+    ///
+    /// `max_requests` optionally limits how many records the iterator will
+    /// produce; `None` means no limit.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error, annotated with the path, when the file cannot be opened.
+    pub fn open<P: AsRef<Path>>(path: P, max_requests: Option<u64>) -> Result<Self> {
+        // Capacity of the file read buffer (1 MiB).
+        const READ_BUF_BYTES: usize = 1 << 20;
+        // Initial capacity of the reusable line buffer.
+        const LINE_BUF_BYTES: usize = 4096;
+
+        let trace_path = path.as_ref();
+        let file = File::open(trace_path)
+            .with_context(|| format!("opening trace {}", trace_path.display()))?;
+
+        let reader = Self {
+            inner: BufReader::with_capacity(READ_BUF_BYTES, file),
+            next_id: 0,
+            line_buf: String::with_capacity(LINE_BUF_BYTES),
+            max_requests,
+        };
+        Ok(reader)
+    }
+}
+
+impl Iterator for TraceReader {
+    type Item = Result<RequestRecord>;
+
+    /// Yields the next request parsed from the trace file.
+    ///
+    /// Returns `None` at end of file, or once `max_requests` records have been
+    /// produced. Whitespace-only lines are skipped. I/O failures and malformed
+    /// JSON are yielded as `Some(Err(..))`; a malformed line does not consume a
+    /// `req_id`, and the iterator is not fused, so a later call carries on with
+    /// the following line.
+    fn next(&mut self) -> Option<Self::Item> {
+        // Saturating i64 -> u32 conversion: negatives become 0 and values above
+        // `u32::MAX` become `u32::MAX`. A bare `as u32` would wrap modulo 2^32
+        // and turn a corrupt, huge length into an arbitrary small one.
+        fn saturating_len(raw: i64) -> u32 {
+            // Lossless cast: the value is already within u32's range.
+            raw.clamp(0, i64::from(u32::MAX)) as u32
+        }
+
+        if let Some(cap) = self.max_requests {
+            if self.next_id >= cap {
+                return None;
+            }
+        }
+        loop {
+            self.line_buf.clear();
+            match self.inner.read_line(&mut self.line_buf) {
+                // Zero bytes read means end of file.
+                Ok(0) => return None,
+                Ok(_) => {}
+                Err(e) => return Some(Err(e.into())),
+            }
+
+            let trimmed = self.line_buf.trim();
+            if trimmed.is_empty() {
+                continue;
+            }
+
+            let raw: RawRecord = match serde_json::from_str(trimmed) {
+                Ok(r) => r,
+                Err(e) => return Some(Err(anyhow::anyhow!("trace parse: {e}"))),
+            };
+
+            // Ids are handed out only after a successful parse, so they stay
+            // dense and `max_requests` counts good records only.
+            let id = self.next_id;
+            self.next_id += 1;
+
+            return Some(Ok(RequestRecord {
+                req_id: id,
+                chat_id: raw.chat_id,
+                arrival: raw.timestamp,
+                input_len: saturating_len(raw.input_length),
+                output_len: saturating_len(raw.output_length),
+                // Hash ids are opaque; the cast keeps the bit pattern, so a
+                // negative i64 maps to a large u64.
+                hash_ids: raw.hash_ids.into_iter().map(|h| h as u64).collect(),
+            }));
+        }
+    }
+}