KVCache simulator for LLM serving cluster routing research

Discrete-event simulator for evaluating KV cache-aware routing policies
in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache
hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention,
architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide
meta-store for prefix-aware routing decisions.

Includes 11 routing policies (random, round_robin, least_loaded,
least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score,
estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing,
built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation
tooling for systematic policy comparison across real Alibaba serving traces.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 01:16:02 +08:00
commit ec73a95e05
52 changed files with 6005 additions and 0 deletions

271
src/main.rs Normal file
View File

@@ -0,0 +1,271 @@
use anyhow::{Context, Result};
use clap::{Args, Parser, Subcommand};
use std::path::PathBuf;
use kvcache_simulator::config::{Config, RouterMode};
use kvcache_simulator::{driver, oracle, trace::TraceReader};
// Top-level command-line definition for the `kvcache-sim` binary. clap's
// `Parser` derive generates the argument parser from this struct, and `main`
// obtains an instance through `Cli::parse()`.
// NOTE: plain `//` comments are used here on purpose. clap's derive macros
// turn `///` doc comments into help text, so adding one would change the
// program's `--help` output.
#[derive(Debug, Parser)]
#[command(name = "kvcache-sim", about = "Cluster-level KV cache simulator")]
struct Cli {
// The subcommand chosen on the command line; `main` matches on it to pick
// the handler to run.
#[command(subcommand)]
cmd: Cmd,
}
/// Optional CLI overrides applied on top of the YAML config so the same
/// config can be reused across sweeps without editing the file.
// Maintainer notes (plain `//` so they do not leak into clap's help output):
// - The `///` line above each field doubles as that flag's `--help` text.
// - Every field is an `Option`; `ConfigOverrides::apply` writes a value into
//   the config only when the matching flag was supplied, so `None` means
//   "keep whatever the YAML file said".
// - This struct is embedded into every subcommand via `#[command(flatten)]`.
#[derive(Debug, Args, Clone, Default)]
struct ConfigOverrides {
/// Override `cluster.num_instances`.
#[arg(long)]
num_instances: Option<u32>,
/// Override `sim.max_requests` (cap on processed trace records).
#[arg(long)]
max_requests: Option<u64>,
/// Override `sim.trace_path`.
// Stored into the config as a string via `to_string_lossy` in `apply`.
#[arg(long)]
trace: Option<PathBuf>,
/// Override `sim.output_dir`.
// Stored into the config as a string via `to_string_lossy` in `apply`.
#[arg(long)]
output_dir: Option<PathBuf>,
/// Override `sim.seed`.
#[arg(long)]
seed: Option<u64>,
/// Override `cluster.router.precise_probe_topk`.
#[arg(long)]
precise_topk: Option<u32>,
/// Override `cluster.meta_store.ttl_seconds`.
#[arg(long)]
ttl_seconds: Option<f64>,
}
impl ConfigOverrides {
fn apply(&self, cfg: &mut Config) {
if let Some(n) = self.num_instances {
cfg.cluster.num_instances = n;
}
if let Some(m) = self.max_requests {
cfg.sim.max_requests = Some(m);
}
if let Some(t) = &self.trace {
cfg.sim.trace_path = t.to_string_lossy().into_owned();
}
if let Some(o) = &self.output_dir {
cfg.sim.output_dir = o.to_string_lossy().into_owned();
}
if let Some(s) = self.seed {
cfg.sim.seed = s;
}
if let Some(k) = self.precise_topk {
cfg.cluster.router.precise_probe_topk = k;
}
if let Some(ttl) = self.ttl_seconds {
cfg.cluster.meta_store.ttl_seconds = ttl;
}
}
}
// Subcommands of the binary. `main` matches on this enum and forwards each
// variant's fields to the matching `cmd_*` function.
// NOTE: the `///` comments below are consumed by clap as `--help` text and are
// therefore left untouched; maintainer notes are written as plain `//`.
#[derive(Debug, Subcommand)]
enum Cmd {
/// Run a single simulation with the router specified in the config.
Run {
// Path to the YAML config file (`-c` / `--config`); loaded by `load`.
#[arg(short, long)]
config: PathBuf,
// Shared override flags, flattened into this subcommand's own flags.
#[command(flatten)]
overrides: ConfigOverrides,
},
/// Run the same trace under multiple routers and compare summaries.
Ablate {
// Path to the YAML config file (`-c` / `--config`); loaded by `load`.
#[arg(short, long)]
config: PathBuf,
/// Comma-separated router modes
// `cmd_ablate` splits this on ',', trims each entry, drops empty entries
// and parses the rest with `RouterMode::parse`.
#[arg(
short,
long,
default_value = "random,least_loaded,least_tokens,ttl_aware,min_pd,cache_load,cache_score,estimated_ttft,prefix_affinity"
)]
routers: String,
// Shared override flags, flattened into this subcommand's own flags.
#[command(flatten)]
overrides: ConfigOverrides,
},
/// Parse the config and trace head; do not run a simulation.
Validate {
// Path to the YAML config file (`-c` / `--config`); loaded by `load`.
#[arg(short, long)]
config: PathBuf,
// Shared override flags, flattened into this subcommand's own flags.
#[command(flatten)]
overrides: ConfigOverrides,
},
/// Offline oracle analysis: theoretical hit-rate ceilings (unlimited
/// cache and offline-optimal Belady eviction at finite capacity), plus
/// LRU at the same capacity for comparison.
Oracle {
// Path to the YAML config file (`-c` / `--config`); loaded by `load`.
#[arg(short, long)]
config: PathBuf,
// Shared override flags, flattened into this subcommand's own flags.
#[command(flatten)]
overrides: ConfigOverrides,
/// Cache capacity (in 16-token blocks) used for the Belady and LRU
/// analyses. Defaults to `num_instances * per_instance_HBM_blocks`
/// (the cluster-aggregate capacity).
#[arg(long)]
capacity_blocks: Option<u64>,
/// Use the per-instance HBM block budget instead of the
/// cluster-aggregate. Mutually exclusive with --capacity-blocks.
// The exclusivity is not declared to clap; `cmd_oracle` checks it at
// runtime and returns an error when both are given.
#[arg(long, default_value_t = false)]
per_instance: bool,
/// Optional output JSON path. Defaults to `<output_dir>/oracle.json`.
#[arg(long)]
out: Option<PathBuf>,
},
}
fn main() -> Result<()> {
let cli = Cli::parse();
match cli.cmd {
Cmd::Run { config, overrides } => cmd_run(&config, &overrides),
Cmd::Ablate {
config,
routers,
overrides,
} => cmd_ablate(&config, &routers, &overrides),
Cmd::Validate { config, overrides } => cmd_validate(&config, &overrides),
Cmd::Oracle {
config,
overrides,
capacity_blocks,
per_instance,
out,
} => cmd_oracle(&config, &overrides, capacity_blocks, per_instance, out.as_deref()),
}
}
/// Reads the YAML config at `config` and layers the CLI `overrides` on top.
///
/// Returns the merged configuration, or the error produced while reading or
/// parsing the file.
fn load(config: &PathBuf, overrides: &ConfigOverrides) -> Result<Config> {
    let merged = {
        let mut parsed = Config::from_yaml_path(config)?;
        overrides.apply(&mut parsed);
        parsed
    };
    Ok(merged)
}
/// Handler for the `run` subcommand: loads the config (with overrides), runs
/// one simulation through `driver::run`, and prints its summary to stdout as
/// pretty-printed JSON.
fn cmd_run(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
    let cfg = load(path, overrides)?;
    let outcome = driver::run(&cfg, None)?;
    let rendered = serde_json::to_string_pretty(&outcome.summary)?;
    println!("{}", rendered);
    Ok(())
}
fn cmd_ablate(path: &PathBuf, routers: &str, overrides: &ConfigOverrides) -> Result<()> {
let base = load(path, overrides)?;
let modes: Vec<RouterMode> = routers
.split(',')
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.map(RouterMode::parse)
.collect::<Result<Vec<_>>>()
.with_context(|| format!("parsing --routers='{routers}'"))?;
let mut all = Vec::new();
for mode in modes {
let mut cfg = base.clone();
cfg.cluster.router.mode = mode;
let sub = mode.as_str().to_string();
eprintln!("[ablate] running router={}", sub);
let out = driver::run(&cfg, Some(&sub))?;
all.push(out.summary);
}
let agg_path = std::path::Path::new(&base.sim.output_dir).join("ablation.json");
std::fs::create_dir_all(&base.sim.output_dir)?;
std::fs::write(&agg_path, serde_json::to_string_pretty(&all)?)?;
println!("{}", serde_json::to_string_pretty(&all)?);
eprintln!("[ablate] wrote {}", agg_path.display());
Ok(())
}
/// Handler for the `validate` subcommand: loads the config (with overrides),
/// prints derived model and hardware figures plus sample prefill times to
/// stderr, then echoes the head of the trace. No simulation is run.
///
/// Returns an error if the config cannot be loaded, the trace cannot be
/// opened, or one of the sampled trace records yields an error.
fn cmd_validate(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
    use kvcache_simulator::instance::compute::ComputeModel;

    let cfg = load(path, overrides)?;
    eprintln!("config OK: {}", cfg.model.name);

    let mode_label = if cfg.model.is_arch_mode() {
        "architecture-derived"
    } else {
        "legacy manual"
    };
    eprintln!("mode = {}", mode_label);

    let cm = ComputeModel::new(&cfg.model, &cfg.hardware);
    eprintln!("compute: {}", cm.describe());

    let mla_note = if cfg.model.mla.is_some() {
        ", MLA compressed"
    } else {
        ""
    };
    eprintln!(
        "kv_block_bytes = {} ({:.2} MB{})",
        cfg.model.kv_block_bytes(),
        cfg.model.kv_block_bytes() as f64 / 1e6,
        mla_note,
    );

    // Number of whole KV blocks that fit into a memory tier of the given size.
    let block_bytes = cfg.model.kv_block_bytes() as f64;
    let blocks_in = |tier_bytes: f64| (tier_bytes / block_bytes) as u64;
    let hbm_blocks = blocks_in(cfg.hardware.hbm_bytes);
    let dram_blocks = blocks_in(cfg.hardware.dram_bytes);
    eprintln!("per-instance HBM blocks = {hbm_blocks}, DRAM blocks = {dram_blocks}");
    eprintln!("num_instances = {}", cfg.cluster.num_instances);

    // Sample prefill times at a few prompt lengths.
    eprintln!("prefill_time samples:");
    let sample_lengths = [256, 1024, 4096, 16384, 65536, 131072];
    for n in sample_lengths {
        let t = cm.prefill_time(n);
        eprintln!(" N={n:>7} -> {t:.4} s");
    }

    // Only the head of the trace is read: the reader is opened with a cap of 5.
    let reader = TraceReader::open(&cfg.sim.trace_path, Some(5))?;
    for entry in reader {
        let rec = entry?;
        eprintln!(
            " req {} chat={} t={:.3}s in={} out={} blocks={}",
            rec.req_id,
            rec.chat_id,
            rec.arrival,
            rec.input_len,
            rec.output_len,
            rec.hash_ids.len()
        );
    }
    Ok(())
}
fn cmd_oracle(
path: &PathBuf,
overrides: &ConfigOverrides,
capacity_blocks: Option<u64>,
per_instance: bool,
out_path: Option<&std::path::Path>,
) -> Result<()> {
let cfg = load(path, overrides)?;
let block_bytes = cfg.model.kv_block_bytes() as f64;
let per_instance_blocks = (cfg.hardware.hbm_bytes / block_bytes).max(1.0) as u64;
let aggregate_blocks = per_instance_blocks * cfg.cluster.num_instances as u64;
let capacity = match (capacity_blocks, per_instance) {
(Some(_), true) => {
return Err(anyhow::anyhow!(
"--capacity-blocks and --per-instance are mutually exclusive"
))
}
(Some(c), false) => c,
(None, true) => per_instance_blocks,
(None, false) => aggregate_blocks,
};
eprintln!(
"[oracle] loading trace {} (max_requests={:?})",
cfg.sim.trace_path, cfg.sim.max_requests
);
let reader = TraceReader::open(&cfg.sim.trace_path, cfg.sim.max_requests)?;
let records: Vec<_> = reader.collect::<Result<Vec<_>, _>>()?;
eprintln!(
"[oracle] loaded {} requests; analyzing with capacity = {} blocks \
({} per-instance × {} instances{})",
records.len(),
capacity,
per_instance_blocks,
cfg.cluster.num_instances,
if per_instance { ", per-instance mode" } else { "" }
);
let result = oracle::analyze(&records, capacity);
let json = serde_json::to_string_pretty(&result)?;
println!("{}", json);
let target = match out_path {
Some(p) => p.to_path_buf(),
None => std::path::Path::new(&cfg.sim.output_dir).join("oracle.json"),
};
if let Some(parent) = target.parent() {
std::fs::create_dir_all(parent)?;
}
std::fs::write(&target, &json)?;
eprintln!("[oracle] wrote {}", target.display());
Ok(())
}