KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
271
src/main.rs
Normal file
271
src/main.rs
Normal file
@@ -0,0 +1,271 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{Args, Parser, Subcommand};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kvcache_simulator::config::{Config, RouterMode};
|
||||
use kvcache_simulator::{driver, oracle, trace::TraceReader};
|
||||
|
||||
#[derive(Debug, Parser)]
|
||||
#[command(name = "kvcache-sim", about = "Cluster-level KV cache simulator")]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
cmd: Cmd,
|
||||
}
|
||||
|
||||
/// Optional CLI overrides applied on top of the YAML config so the same
|
||||
/// config can be reused across sweeps without editing the file.
|
||||
#[derive(Debug, Args, Clone, Default)]
|
||||
struct ConfigOverrides {
|
||||
/// Override `cluster.num_instances`.
|
||||
#[arg(long)]
|
||||
num_instances: Option<u32>,
|
||||
/// Override `sim.max_requests` (cap on processed trace records).
|
||||
#[arg(long)]
|
||||
max_requests: Option<u64>,
|
||||
/// Override `sim.trace_path`.
|
||||
#[arg(long)]
|
||||
trace: Option<PathBuf>,
|
||||
/// Override `sim.output_dir`.
|
||||
#[arg(long)]
|
||||
output_dir: Option<PathBuf>,
|
||||
/// Override `sim.seed`.
|
||||
#[arg(long)]
|
||||
seed: Option<u64>,
|
||||
/// Override `cluster.router.precise_probe_topk`.
|
||||
#[arg(long)]
|
||||
precise_topk: Option<u32>,
|
||||
/// Override `cluster.meta_store.ttl_seconds`.
|
||||
#[arg(long)]
|
||||
ttl_seconds: Option<f64>,
|
||||
}
|
||||
|
||||
impl ConfigOverrides {
|
||||
fn apply(&self, cfg: &mut Config) {
|
||||
if let Some(n) = self.num_instances {
|
||||
cfg.cluster.num_instances = n;
|
||||
}
|
||||
if let Some(m) = self.max_requests {
|
||||
cfg.sim.max_requests = Some(m);
|
||||
}
|
||||
if let Some(t) = &self.trace {
|
||||
cfg.sim.trace_path = t.to_string_lossy().into_owned();
|
||||
}
|
||||
if let Some(o) = &self.output_dir {
|
||||
cfg.sim.output_dir = o.to_string_lossy().into_owned();
|
||||
}
|
||||
if let Some(s) = self.seed {
|
||||
cfg.sim.seed = s;
|
||||
}
|
||||
if let Some(k) = self.precise_topk {
|
||||
cfg.cluster.router.precise_probe_topk = k;
|
||||
}
|
||||
if let Some(ttl) = self.ttl_seconds {
|
||||
cfg.cluster.meta_store.ttl_seconds = ttl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Subcommand)]
|
||||
enum Cmd {
|
||||
/// Run a single simulation with the router specified in the config.
|
||||
Run {
|
||||
#[arg(short, long)]
|
||||
config: PathBuf,
|
||||
#[command(flatten)]
|
||||
overrides: ConfigOverrides,
|
||||
},
|
||||
/// Run the same trace under multiple routers and compare summaries.
|
||||
Ablate {
|
||||
#[arg(short, long)]
|
||||
config: PathBuf,
|
||||
/// Comma-separated router modes
|
||||
#[arg(
|
||||
short,
|
||||
long,
|
||||
default_value = "random,least_loaded,least_tokens,ttl_aware,min_pd,cache_load,cache_score,estimated_ttft,prefix_affinity"
|
||||
)]
|
||||
routers: String,
|
||||
#[command(flatten)]
|
||||
overrides: ConfigOverrides,
|
||||
},
|
||||
/// Parse the config and trace head; do not run a simulation.
|
||||
Validate {
|
||||
#[arg(short, long)]
|
||||
config: PathBuf,
|
||||
#[command(flatten)]
|
||||
overrides: ConfigOverrides,
|
||||
},
|
||||
/// Offline oracle analysis: theoretical hit-rate ceilings (unlimited
|
||||
/// cache and offline-optimal Belady eviction at finite capacity), plus
|
||||
/// LRU at the same capacity for comparison.
|
||||
Oracle {
|
||||
#[arg(short, long)]
|
||||
config: PathBuf,
|
||||
#[command(flatten)]
|
||||
overrides: ConfigOverrides,
|
||||
/// Cache capacity (in 16-token blocks) used for the Belady and LRU
|
||||
/// analyses. Defaults to `num_instances * per_instance_HBM_blocks`
|
||||
/// (the cluster-aggregate capacity).
|
||||
#[arg(long)]
|
||||
capacity_blocks: Option<u64>,
|
||||
/// Use the per-instance HBM block budget instead of the
|
||||
/// cluster-aggregate. Mutually exclusive with --capacity-blocks.
|
||||
#[arg(long, default_value_t = false)]
|
||||
per_instance: bool,
|
||||
/// Optional output JSON path. Defaults to `<output_dir>/oracle.json`.
|
||||
#[arg(long)]
|
||||
out: Option<PathBuf>,
|
||||
},
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
match cli.cmd {
|
||||
Cmd::Run { config, overrides } => cmd_run(&config, &overrides),
|
||||
Cmd::Ablate {
|
||||
config,
|
||||
routers,
|
||||
overrides,
|
||||
} => cmd_ablate(&config, &routers, &overrides),
|
||||
Cmd::Validate { config, overrides } => cmd_validate(&config, &overrides),
|
||||
Cmd::Oracle {
|
||||
config,
|
||||
overrides,
|
||||
capacity_blocks,
|
||||
per_instance,
|
||||
out,
|
||||
} => cmd_oracle(&config, &overrides, capacity_blocks, per_instance, out.as_deref()),
|
||||
}
|
||||
}
|
||||
|
||||
fn load(config: &PathBuf, overrides: &ConfigOverrides) -> Result<Config> {
|
||||
let mut cfg = Config::from_yaml_path(config)?;
|
||||
overrides.apply(&mut cfg);
|
||||
Ok(cfg)
|
||||
}
|
||||
|
||||
fn cmd_run(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
|
||||
let cfg = load(path, overrides)?;
|
||||
let out = driver::run(&cfg, None)?;
|
||||
println!("{}", serde_json::to_string_pretty(&out.summary)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cmd_ablate(path: &PathBuf, routers: &str, overrides: &ConfigOverrides) -> Result<()> {
|
||||
let base = load(path, overrides)?;
|
||||
let modes: Vec<RouterMode> = routers
|
||||
.split(',')
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(RouterMode::parse)
|
||||
.collect::<Result<Vec<_>>>()
|
||||
.with_context(|| format!("parsing --routers='{routers}'"))?;
|
||||
let mut all = Vec::new();
|
||||
for mode in modes {
|
||||
let mut cfg = base.clone();
|
||||
cfg.cluster.router.mode = mode;
|
||||
let sub = mode.as_str().to_string();
|
||||
eprintln!("[ablate] running router={}", sub);
|
||||
let out = driver::run(&cfg, Some(&sub))?;
|
||||
all.push(out.summary);
|
||||
}
|
||||
let agg_path = std::path::Path::new(&base.sim.output_dir).join("ablation.json");
|
||||
std::fs::create_dir_all(&base.sim.output_dir)?;
|
||||
std::fs::write(&agg_path, serde_json::to_string_pretty(&all)?)?;
|
||||
println!("{}", serde_json::to_string_pretty(&all)?);
|
||||
eprintln!("[ablate] wrote {}", agg_path.display());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cmd_validate(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
|
||||
use kvcache_simulator::instance::compute::ComputeModel;
|
||||
let cfg = load(path, overrides)?;
|
||||
eprintln!("config OK: {}", cfg.model.name);
|
||||
eprintln!("mode = {}", if cfg.model.is_arch_mode() { "architecture-derived" } else { "legacy manual" });
|
||||
let cm = ComputeModel::new(&cfg.model, &cfg.hardware);
|
||||
eprintln!("compute: {}", cm.describe());
|
||||
eprintln!("kv_block_bytes = {} ({:.2} MB{})",
|
||||
cfg.model.kv_block_bytes(),
|
||||
cfg.model.kv_block_bytes() as f64 / 1e6,
|
||||
if cfg.model.mla.is_some() { ", MLA compressed" } else { "" },
|
||||
);
|
||||
let block_bytes = cfg.model.kv_block_bytes() as f64;
|
||||
let hbm_blocks = (cfg.hardware.hbm_bytes / block_bytes) as u64;
|
||||
let dram_blocks = (cfg.hardware.dram_bytes / block_bytes) as u64;
|
||||
eprintln!("per-instance HBM blocks = {hbm_blocks}, DRAM blocks = {dram_blocks}");
|
||||
eprintln!("num_instances = {}", cfg.cluster.num_instances);
|
||||
// Sample prefill times at a few prompt lengths.
|
||||
eprintln!("prefill_time samples:");
|
||||
for &n in &[256, 1024, 4096, 16384, 65536, 131072] {
|
||||
let t = cm.prefill_time(n);
|
||||
eprintln!(" N={n:>7} -> {t:.4} s");
|
||||
}
|
||||
let reader = TraceReader::open(&cfg.sim.trace_path, Some(5))?;
|
||||
for rec in reader {
|
||||
let rec = rec?;
|
||||
eprintln!(
|
||||
" req {} chat={} t={:.3}s in={} out={} blocks={}",
|
||||
rec.req_id,
|
||||
rec.chat_id,
|
||||
rec.arrival,
|
||||
rec.input_len,
|
||||
rec.output_len,
|
||||
rec.hash_ids.len()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cmd_oracle(
|
||||
path: &PathBuf,
|
||||
overrides: &ConfigOverrides,
|
||||
capacity_blocks: Option<u64>,
|
||||
per_instance: bool,
|
||||
out_path: Option<&std::path::Path>,
|
||||
) -> Result<()> {
|
||||
let cfg = load(path, overrides)?;
|
||||
let block_bytes = cfg.model.kv_block_bytes() as f64;
|
||||
let per_instance_blocks = (cfg.hardware.hbm_bytes / block_bytes).max(1.0) as u64;
|
||||
let aggregate_blocks = per_instance_blocks * cfg.cluster.num_instances as u64;
|
||||
let capacity = match (capacity_blocks, per_instance) {
|
||||
(Some(_), true) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"--capacity-blocks and --per-instance are mutually exclusive"
|
||||
))
|
||||
}
|
||||
(Some(c), false) => c,
|
||||
(None, true) => per_instance_blocks,
|
||||
(None, false) => aggregate_blocks,
|
||||
};
|
||||
|
||||
eprintln!(
|
||||
"[oracle] loading trace {} (max_requests={:?})",
|
||||
cfg.sim.trace_path, cfg.sim.max_requests
|
||||
);
|
||||
let reader = TraceReader::open(&cfg.sim.trace_path, cfg.sim.max_requests)?;
|
||||
let records: Vec<_> = reader.collect::<Result<Vec<_>, _>>()?;
|
||||
eprintln!(
|
||||
"[oracle] loaded {} requests; analyzing with capacity = {} blocks \
|
||||
({} per-instance × {} instances{})",
|
||||
records.len(),
|
||||
capacity,
|
||||
per_instance_blocks,
|
||||
cfg.cluster.num_instances,
|
||||
if per_instance { ", per-instance mode" } else { "" }
|
||||
);
|
||||
|
||||
let result = oracle::analyze(&records, capacity);
|
||||
let json = serde_json::to_string_pretty(&result)?;
|
||||
println!("{}", json);
|
||||
|
||||
let target = match out_path {
|
||||
Some(p) => p.to_path_buf(),
|
||||
None => std::path::Path::new(&cfg.sim.output_dir).join("oracle.json"),
|
||||
};
|
||||
if let Some(parent) = target.parent() {
|
||||
std::fs::create_dir_all(parent)?;
|
||||
}
|
||||
std::fs::write(&target, &json)?;
|
||||
eprintln!("[oracle] wrote {}", target.display());
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user