//! Smoke test: synthesize a small trace with shared prefixes and assert that
//! the cache hit rate is monotonic in router sophistication:
//!   random <= least_loaded <= ttl_aware <= precise
//!
//! The ordering test asserts the pairwise comparisons `ttl_aware >= random`,
//! `precise >= random` and `precise >= least_loaded`, each within a small
//! epsilon.
|
|
|
|
use std::io::Write;
|
|
|
|
use kvcache_simulator::config::*;
|
|
use kvcache_simulator::driver;
|
|
use kvcache_simulator::replay::{self, PlacementEntry, ReplayEvictPolicy};
|
|
|
|
fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config {
|
|
Config {
|
|
model: ModelConfig {
|
|
name: "test".into(),
|
|
num_layers: 4,
|
|
num_kv_heads: 2,
|
|
head_dim: 64,
|
|
dtype_bytes: 2,
|
|
block_size_tokens: 16,
|
|
flops_per_token_prefill: Some(1.0e9),
|
|
attn_quadratic_coeff: Some(64.0),
|
|
..Default::default()
|
|
},
|
|
hardware: HardwareConfig {
|
|
gpu_flops: 1.0e14,
|
|
gpu_fp8_flops: 0.0,
|
|
gpu_fp4_flops: 0.0,
|
|
gpu_mem_bw: 1.0e12,
|
|
hbm_bytes: 1.0e9,
|
|
dram_bytes: 4.0e9,
|
|
host_dram_bw: 5.0e11,
|
|
pcie_bw: 32.0e9,
|
|
pcie_latency_us: 1.0,
|
|
rdma_bw: 12.0e9,
|
|
rdma_latency_us: 5.0,
|
|
intra_node_tp_bw: 9.0e11,
|
|
intra_node_tp_latency_us: 2.0,
|
|
tp_degree: 1,
|
|
max_batch_slots: 32,
|
|
prefill_chunk_tokens: 1024,
|
|
},
|
|
calibration: CalibrationConfig::default(),
|
|
cluster: ClusterConfig {
|
|
num_instances: Some(4),
|
|
buckets: Vec::new(),
|
|
global_router: Default::default(),
|
|
meta_store: MetaStoreConfig {
|
|
ttl_seconds: 1000.0,
|
|
},
|
|
router: RouterConfig {
|
|
mode,
|
|
precise_probe_latency_us: 10.0,
|
|
precise_probe_topk: 4,
|
|
load_alpha: 0.1,
|
|
score_alpha: 1.0,
|
|
score_beta: 0.1,
|
|
prefix_k: 8,
|
|
affinity_fan_out: 0,
|
|
},
|
|
},
|
|
sim: SimConfig {
|
|
trace_path: trace_path.into(),
|
|
max_requests: None,
|
|
output_dir: out_dir.into(),
|
|
sample_interval_s: 0.0,
|
|
seed: 7,
|
|
input_length_min: None,
|
|
input_length_max: None,
|
|
},
|
|
}
|
|
}
|
|
|
|
fn bucketed_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config {
|
|
let mut cfg = base_config(trace_path, out_dir, mode);
|
|
cfg.cluster.num_instances = None;
|
|
cfg.cluster.buckets = vec![
|
|
BucketConfig {
|
|
name: "short".into(),
|
|
input_length_min: 0,
|
|
input_length_max: 64,
|
|
num_instances: 2,
|
|
},
|
|
BucketConfig {
|
|
name: "long".into(),
|
|
input_length_min: 65,
|
|
input_length_max: 128,
|
|
num_instances: 1,
|
|
},
|
|
];
|
|
cfg
|
|
}
|
|
|
|
fn write_synthetic_trace(path: &std::path::Path) {
|
|
// 5 distinct conversations, each with 8 turns. Within a conversation,
|
|
// turn k+1 reuses the prefix of turn k (shared first ~10 blocks) and
|
|
// appends a few new blocks. This is the canonical KV-prefix-cache pattern.
|
|
let mut f = std::fs::File::create(path).unwrap();
|
|
let mut t = 0.0_f64;
|
|
let mut req_id_counter: i64 = 0;
|
|
for conv in 0..5i64 {
|
|
let mut prefix: Vec<i64> = (0..10).map(|i| conv * 1_000_000 + i).collect();
|
|
for turn in 0..8 {
|
|
let mut hashes = prefix.clone();
|
|
// Append 2 new blocks unique to this turn
|
|
for j in 0..2 {
|
|
let h = conv * 1_000_000 + 100 + (turn as i64) * 10 + j;
|
|
hashes.push(h);
|
|
}
|
|
req_id_counter += 1;
|
|
let line = serde_json::json!({
|
|
"chat_id": conv,
|
|
"parent_chat_id": -1,
|
|
"timestamp": t,
|
|
"input_length": (hashes.len() as i64) * 16,
|
|
"output_length": 16, // 1 block of decode
|
|
"type": "text",
|
|
"turn": turn,
|
|
"hash_ids": hashes,
|
|
});
|
|
writeln!(f, "{}", line).unwrap();
|
|
// Next turn's prefix grows to include this turn's appended blocks
|
|
prefix = hashes;
|
|
t += 0.05;
|
|
}
|
|
let _ = req_id_counter;
|
|
}
|
|
}
|
|
|
|
fn run(
|
|
mode: RouterMode,
|
|
trace_path: &std::path::Path,
|
|
out_root: &std::path::Path,
|
|
) -> kvcache_simulator::metrics::Summary {
|
|
let cfg = base_config(
|
|
trace_path.to_str().unwrap(),
|
|
out_root.to_str().unwrap(),
|
|
mode,
|
|
);
|
|
let res = driver::run(&cfg, Some(mode.as_str())).expect("sim run");
|
|
res.summary
|
|
}
|
|
|
|
/// Runs the synthetic trace under all four router modes and checks that the
/// prefix-aware routers do not lose to the simpler ones on total hit rate.
#[test]
fn ablation_hit_rate_ordering() {
    // Start from an empty scratch directory; a missing directory is fine.
    let work_dir = std::env::temp_dir().join("kvcache_sim_smoke");
    let _ = std::fs::remove_dir_all(&work_dir);
    std::fs::create_dir_all(&work_dir).unwrap();
    let trace_file = work_dir.join("trace.jsonl");
    write_synthetic_trace(&trace_file);

    let simulate = |mode: RouterMode| run(mode, &trace_file, &work_dir);
    let random = simulate(RouterMode::Random);
    let least_loaded = simulate(RouterMode::LeastLoaded);
    let ttl_aware = simulate(RouterMode::TtlAware);
    let precise = simulate(RouterMode::Precise);

    // Total hit rate of a run: L0 + L1 + remote.
    let overall_hit = |summary: &kvcache_simulator::metrics::Summary| {
        summary.hit_rate_l0 + summary.hit_rate_l1 + summary.hit_rate_remote
    };
    let hit_random = overall_hit(&random);
    let hit_least_loaded = overall_hit(&least_loaded);
    let hit_ttl = overall_hit(&ttl_aware);
    let hit_precise = overall_hit(&precise);

    eprintln!(
        "smoke: hit rates random={:.3} least_loaded={:.3} ttl={:.3} precise={:.3}",
        hit_random, hit_least_loaded, hit_ttl, hit_precise
    );
    eprintln!(
        " remote+local hit ratio L0/L1/remote: \
         random=({:.2},{:.2},{:.2}) precise=({:.2},{:.2},{:.2})",
        random.hit_rate_l0,
        random.hit_rate_l1,
        random.hit_rate_remote,
        precise.hit_rate_l0,
        precise.hit_rate_l1,
        precise.hit_rate_remote,
    );

    // ttl_aware and precise should outperform random / least_loaded for
    // a workload built entirely of shared-prefix conversations. The epsilon
    // lets exact ties pass despite floating-point noise.
    let eps = 1e-6;
    assert!(
        hit_ttl + eps >= hit_random,
        "ttl_aware should >= random hit rate"
    );
    assert!(
        hit_precise + eps >= hit_random,
        "precise should >= random hit rate"
    );
    assert!(
        hit_precise + eps >= hit_least_loaded,
        "precise should >= least_loaded hit rate"
    );
}
|
|
|
|
/// Checks that a fixed-placement ablation with the random router and LRU
/// eviction reproduces the hit rate and the TTFT statistics (mean, p50, p95,
/// p99) of a regular `driver::run` on the same configuration.
#[test]
fn ablation_lru_preserves_ttft_fields() {
    // Start from an empty scratch directory; a missing directory is fine.
    let work_dir = std::env::temp_dir().join("kvcache_sim_replay");
    let _ = std::fs::remove_dir_all(&work_dir);
    std::fs::create_dir_all(&work_dir).unwrap();
    let trace_file = work_dir.join("trace.jsonl");
    write_synthetic_trace(&trace_file);

    let cfg = base_config(
        trace_file.to_str().unwrap(),
        work_dir.to_str().unwrap(),
        RouterMode::Random,
    );

    // Reference run first, then the ablation over the same configuration.
    let online = driver::run(&cfg, Some("online_lru")).expect("online lru run");
    let rows =
        driver::ablate_fixed_placement(&cfg, &[RouterMode::Random], &[ReplayEvictPolicy::Lru])
            .expect("ablate lru");

    // One router and one eviction policy must yield exactly one row.
    assert_eq!(rows.len(), 1);
    let row = &rows[0];
    let reference = &online.summary;

    let online_hit = reference.hit_rate_l0 + reference.hit_rate_l1 + reference.hit_rate_remote;
    let ablate_hit = row.hit_rate_l0 + row.hit_rate_l1 + row.hit_rate_remote;
    assert!(
        (ablate_hit - online_hit).abs() < 1e-9,
        "ablation lru should match online lru hit rate: online={online_hit} ablate={ablate_hit}"
    );

    assert!((row.ttft_mean - reference.ttft_mean).abs() < 1e-9);
    assert!((row.ttft_p50 - reference.ttft_p50).abs() < 1e-9);
    assert!((row.ttft_p95 - reference.ttft_p95).abs() < 1e-9);
    assert!((row.ttft_p99 - reference.ttft_p99).abs() < 1e-9);
}
|
|
|
|
/// Requesting the Belady eviction policy from the fixed-placement ablation
/// must fail with an error whose message contains "exact belady".
#[test]
fn ablate_rejects_belady_until_exact_algorithm_exists() {
    // Start from an empty scratch directory; a missing directory is fine.
    let work_dir = std::env::temp_dir().join("kvcache_sim_ablate_evict");
    let _ = std::fs::remove_dir_all(&work_dir);
    std::fs::create_dir_all(&work_dir).unwrap();
    let trace_file = work_dir.join("trace.jsonl");
    write_synthetic_trace(&trace_file);

    let cfg = base_config(
        trace_file.to_str().unwrap(),
        work_dir.to_str().unwrap(),
        RouterMode::Random,
    );

    let outcome =
        driver::ablate_fixed_placement(&cfg, &[RouterMode::Random], &[ReplayEvictPolicy::Belady]);
    let err = outcome.expect_err("belady should be rejected");
    let message = err.to_string();
    assert!(
        message.contains("exact belady"),
        "unexpected error: {err:#}"
    );
}
|
|
|
|
/// Runs the fixed-placement ablation over all four router modes with
/// parallelism 1 and 2 and checks that both produce the same rows in the same
/// order.
#[test]
fn ablation_parallel_matches_serial() {
    // Start from an empty scratch directory; a missing directory is fine.
    let work_dir = std::env::temp_dir().join("kvcache_sim_ablate_parallel");
    let _ = std::fs::remove_dir_all(&work_dir);
    std::fs::create_dir_all(&work_dir).unwrap();
    let trace_file = work_dir.join("trace.jsonl");
    write_synthetic_trace(&trace_file);

    let cfg = base_config(
        trace_file.to_str().unwrap(),
        work_dir.to_str().unwrap(),
        RouterMode::Random,
    );
    let routers = [
        RouterMode::Random,
        RouterMode::LeastLoaded,
        RouterMode::TtlAware,
        RouterMode::Precise,
    ];
    let evict_policies = [ReplayEvictPolicy::Lru];

    // Same inputs for both runs; only the parallelism argument differs.
    let ablate = |parallelism| {
        driver::ablate_fixed_placement_with_parallelism(
            &cfg,
            &routers,
            &evict_policies,
            parallelism,
        )
    };
    let serial = ablate(1).expect("serial ablate");
    let parallel = ablate(2).expect("parallel ablate");

    assert_eq!(parallel.len(), serial.len());
    for (par_row, ser_row) in parallel.iter().zip(serial.iter()) {
        // Identity columns must match exactly.
        assert_eq!(par_row.router, ser_row.router);
        assert_eq!(par_row.evict_policy, ser_row.evict_policy);
        assert_eq!(par_row.placement_source, ser_row.placement_source);

        // Latency statistics are compared with a 1e-9 tolerance.
        assert!((par_row.ttft_mean - ser_row.ttft_mean).abs() < 1e-9);
        assert!((par_row.ttft_p50 - ser_row.ttft_p50).abs() < 1e-9);
        assert!((par_row.ttft_p95 - ser_row.ttft_p95).abs() < 1e-9);
        assert!((par_row.ttft_p99 - ser_row.ttft_p99).abs() < 1e-9);

        // Rates are compared with a tighter 1e-12 tolerance.
        assert!((par_row.hit_rate_l0 - ser_row.hit_rate_l0).abs() < 1e-12);
        assert!((par_row.hit_rate_l1 - ser_row.hit_rate_l1).abs() < 1e-12);
        assert!((par_row.hit_rate_remote - ser_row.hit_rate_remote).abs() < 1e-12);
        assert!((par_row.miss_rate - ser_row.miss_rate).abs() < 1e-12);
    }
}
|
|
|
|
/// A configuration that sets `cluster.buckets` must be rejected by
/// `driver::run`, `driver::ablate_fixed_placement` and
/// `replay::replay_fixed_placement`, each with an error message that mentions
/// "cluster.buckets".
#[test]
fn bucketed_configs_are_rejected_by_legacy_runtime_paths() {
    // Start from an empty scratch directory; a missing directory is fine.
    let work_dir = std::env::temp_dir().join("kvcache_sim_bucketed_reject");
    let _ = std::fs::remove_dir_all(&work_dir);
    std::fs::create_dir_all(&work_dir).unwrap();
    let trace_file = work_dir.join("trace.jsonl");
    write_synthetic_trace(&trace_file);

    let cfg = bucketed_config(
        trace_file.to_str().unwrap(),
        work_dir.to_str().unwrap(),
        RouterMode::Random,
    );

    // Plain simulation run. A `match` extracts the error without requiring
    // the success value to implement `Debug`.
    let run_err = match driver::run(&cfg, Some("bucketed_guard")) {
        Ok(_) => panic!("bucketed run should fail"),
        Err(e) => e,
    };
    assert!(run_err.to_string().contains("cluster.buckets"));

    // Fixed-placement ablation.
    let ablate_err =
        driver::ablate_fixed_placement(&cfg, &[RouterMode::Random], &[ReplayEvictPolicy::Lru])
            .expect_err("bucketed ablation should fail");
    assert!(ablate_err.to_string().contains("cluster.buckets"));

    // Direct replay with empty inputs.
    let replay_err = replay::replay_fixed_placement(
        &cfg,
        &[],
        &Vec::<PlacementEntry>::new(),
        ReplayEvictPolicy::Lru,
    )
    .expect_err("bucketed replay should fail");
    assert!(replay_err.to_string().contains("cluster.buckets"));
}
|