KVCache simulator for LLM serving cluster routing research
Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
155
tests/smoke.rs
Normal file
155
tests/smoke.rs
Normal file
@@ -0,0 +1,155 @@
|
||||
//! Smoke test: synthesize a small trace with shared prefixes and assert that
|
||||
//! the cache hit rate is (weakly) monotonic in router sophistication.
//! Asserted relations: ttl_aware >= random, precise >= random,
//! precise >= least_loaded (each up to a small epsilon).
|
||||
|
||||
use std::io::Write;
|
||||
|
||||
use kvcache_simulator::config::*;
|
||||
use kvcache_simulator::driver;
|
||||
|
||||
fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config {
|
||||
Config {
|
||||
model: ModelConfig {
|
||||
name: "test".into(),
|
||||
num_layers: 4,
|
||||
num_kv_heads: 2,
|
||||
head_dim: 64,
|
||||
dtype_bytes: 2,
|
||||
block_size_tokens: 16,
|
||||
flops_per_token_prefill: Some(1.0e9),
|
||||
attn_quadratic_coeff: Some(64.0),
|
||||
..Default::default()
|
||||
},
|
||||
hardware: HardwareConfig {
|
||||
gpu_flops: 1.0e14,
|
||||
gpu_mem_bw: 1.0e12,
|
||||
hbm_bytes: 1.0e9,
|
||||
dram_bytes: 4.0e9,
|
||||
pcie_bw: 32.0e9,
|
||||
pcie_latency_us: 1.0,
|
||||
rdma_bw: 12.0e9,
|
||||
rdma_latency_us: 5.0,
|
||||
max_batch_slots: 32,
|
||||
prefill_chunk_tokens: 1024,
|
||||
},
|
||||
cluster: ClusterConfig {
|
||||
num_instances: 4,
|
||||
meta_store: MetaStoreConfig { ttl_seconds: 1000.0 },
|
||||
router: RouterConfig {
|
||||
mode,
|
||||
precise_probe_latency_us: 10.0,
|
||||
precise_probe_topk: 4,
|
||||
load_alpha: 0.1,
|
||||
score_alpha: 1.0,
|
||||
score_beta: 0.1,
|
||||
prefix_k: 8,
|
||||
affinity_fan_out: 0,
|
||||
},
|
||||
},
|
||||
sim: SimConfig {
|
||||
trace_path: trace_path.into(),
|
||||
max_requests: None,
|
||||
output_dir: out_dir.into(),
|
||||
sample_interval_s: 0.0,
|
||||
seed: 7,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn write_synthetic_trace(path: &std::path::Path) {
|
||||
// 5 distinct conversations, each with 8 turns. Within a conversation,
|
||||
// turn k+1 reuses the prefix of turn k (shared first ~10 blocks) and
|
||||
// appends a few new blocks. This is the canonical KV-prefix-cache pattern.
|
||||
let mut f = std::fs::File::create(path).unwrap();
|
||||
let mut t = 0.0_f64;
|
||||
let mut req_id_counter: i64 = 0;
|
||||
for conv in 0..5i64 {
|
||||
let mut prefix: Vec<i64> = (0..10).map(|i| conv * 1_000_000 + i).collect();
|
||||
for turn in 0..8 {
|
||||
let mut hashes = prefix.clone();
|
||||
// Append 2 new blocks unique to this turn
|
||||
for j in 0..2 {
|
||||
let h = conv * 1_000_000 + 100 + (turn as i64) * 10 + j;
|
||||
hashes.push(h);
|
||||
}
|
||||
req_id_counter += 1;
|
||||
let line = serde_json::json!({
|
||||
"chat_id": conv,
|
||||
"parent_chat_id": -1,
|
||||
"timestamp": t,
|
||||
"input_length": (hashes.len() as i64) * 16,
|
||||
"output_length": 16, // 1 block of decode
|
||||
"type": "text",
|
||||
"turn": turn,
|
||||
"hash_ids": hashes,
|
||||
});
|
||||
writeln!(f, "{}", line).unwrap();
|
||||
// Next turn's prefix grows to include this turn's appended blocks
|
||||
prefix = hashes;
|
||||
t += 0.05;
|
||||
}
|
||||
let _ = req_id_counter;
|
||||
}
|
||||
}
|
||||
|
||||
fn run(mode: RouterMode, trace_path: &std::path::Path, out_root: &std::path::Path)
|
||||
-> kvcache_simulator::metrics::Summary
|
||||
{
|
||||
let cfg = base_config(
|
||||
trace_path.to_str().unwrap(),
|
||||
out_root.to_str().unwrap(),
|
||||
mode,
|
||||
);
|
||||
let res = driver::run(&cfg, Some(mode.as_str())).expect("sim run");
|
||||
res.summary
|
||||
}
|
||||
|
||||
#[test]
fn ablation_hit_rate_ordering() {
    // Work in a throwaway scratch dir; wipe residue from any previous run.
    let tmp = std::env::temp_dir().join("kvcache_sim_smoke");
    let _ = std::fs::remove_dir_all(&tmp);
    std::fs::create_dir_all(&tmp).unwrap();
    let trace_path = tmp.join("trace.jsonl");
    write_synthetic_trace(&trace_path);

    // Same trace through four routers of increasing sophistication.
    let s_random = run(RouterMode::Random, &trace_path, &tmp);
    let s_ll = run(RouterMode::LeastLoaded, &trace_path, &tmp);
    let s_ttl = run(RouterMode::TtlAware, &trace_path, &tmp);
    let s_prec = run(RouterMode::Precise, &trace_path, &tmp);

    // Aggregate hit rate across all three tiers (L0 + L1 + remote).
    fn total_hit(s: &kvcache_simulator::metrics::Summary) -> f64 {
        s.hit_rate_l0 + s.hit_rate_l1 + s.hit_rate_remote
    }

    let (h_rand, h_ll, h_ttl, h_prec) = (
        total_hit(&s_random),
        total_hit(&s_ll),
        total_hit(&s_ttl),
        total_hit(&s_prec),
    );

    eprintln!(
        "smoke: hit rates random={:.3} least_loaded={:.3} ttl={:.3} precise={:.3}",
        h_rand, h_ll, h_ttl, h_prec
    );
    eprintln!(
        " remote+local hit ratio L0/L1/remote: random=({:.2},{:.2},{:.2}) precise=({:.2},{:.2},{:.2})",
        s_random.hit_rate_l0, s_random.hit_rate_l1, s_random.hit_rate_remote,
        s_prec.hit_rate_l0, s_prec.hit_rate_l1, s_prec.hit_rate_remote,
    );

    // ttl_aware and precise should outperform random / least_loaded for
    // a workload built entirely of shared-prefix conversations.
    let eps = 1e-6;
    assert!(h_ttl + eps >= h_rand, "ttl_aware should >= random hit rate");
    assert!(h_prec + eps >= h_rand, "precise should >= random hit rate");
    assert!(h_prec + eps >= h_ll, "precise should >= least_loaded hit rate");
}
|
||||
Reference in New Issue
Block a user