//! Smoke test: synthesize a small trace with shared prefixes and assert that //! the cache hit rate is monotonic in router sophistication: //! random <= least_loaded <= ttl_aware <= precise use std::io::Write; use kvcache_simulator::config::*; use kvcache_simulator::driver; use kvcache_simulator::replay::ReplayEvictPolicy; fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config { Config { model: ModelConfig { name: "test".into(), num_layers: 4, num_kv_heads: 2, head_dim: 64, dtype_bytes: 2, block_size_tokens: 16, flops_per_token_prefill: Some(1.0e9), attn_quadratic_coeff: Some(64.0), ..Default::default() }, hardware: HardwareConfig { gpu_flops: 1.0e14, gpu_fp8_flops: 0.0, gpu_fp4_flops: 0.0, gpu_mem_bw: 1.0e12, hbm_bytes: 1.0e9, dram_bytes: 4.0e9, pcie_bw: 32.0e9, pcie_latency_us: 1.0, rdma_bw: 12.0e9, rdma_latency_us: 5.0, max_batch_slots: 32, prefill_chunk_tokens: 1024, }, cluster: ClusterConfig { num_instances: 4, meta_store: MetaStoreConfig { ttl_seconds: 1000.0, }, router: RouterConfig { mode, precise_probe_latency_us: 10.0, precise_probe_topk: 4, load_alpha: 0.1, score_alpha: 1.0, score_beta: 0.1, prefix_k: 8, affinity_fan_out: 0, }, }, sim: SimConfig { trace_path: trace_path.into(), max_requests: None, output_dir: out_dir.into(), sample_interval_s: 0.0, seed: 7, }, } } fn write_synthetic_trace(path: &std::path::Path) { // 5 distinct conversations, each with 8 turns. Within a conversation, // turn k+1 reuses the prefix of turn k (shared first ~10 blocks) and // appends a few new blocks. This is the canonical KV-prefix-cache pattern. let mut f = std::fs::File::create(path).unwrap(); let mut t = 0.0_f64; let mut req_id_counter: i64 = 0; for conv in 0..5i64 { let mut prefix: Vec = (0..10).map(|i| conv * 1_000_000 + i).collect(); for turn in 0..8 { let mut hashes = prefix.clone(); // Append 2 new blocks unique to this turn for j in 0..2 { let h = conv * 1_000_000 + 100 + (turn as i64) * 10 + j; hashes.push(h); } req_id_counter += 1; let line = serde_json::json!({ "chat_id": conv, "parent_chat_id": -1, "timestamp": t, "input_length": (hashes.len() as i64) * 16, "output_length": 16, // 1 block of decode "type": "text", "turn": turn, "hash_ids": hashes, }); writeln!(f, "{}", line).unwrap(); // Next turn's prefix grows to include this turn's appended blocks prefix = hashes; t += 0.05; } let _ = req_id_counter; } } fn run( mode: RouterMode, trace_path: &std::path::Path, out_root: &std::path::Path, ) -> kvcache_simulator::metrics::Summary { let cfg = base_config( trace_path.to_str().unwrap(), out_root.to_str().unwrap(), mode, ); let res = driver::run(&cfg, Some(mode.as_str())).expect("sim run"); res.summary } #[test] fn ablation_hit_rate_ordering() { let tmp = std::env::temp_dir().join("kvcache_sim_smoke"); let _ = std::fs::remove_dir_all(&tmp); std::fs::create_dir_all(&tmp).unwrap(); let trace_path = tmp.join("trace.jsonl"); write_synthetic_trace(&trace_path); let s_random = run(RouterMode::Random, &trace_path, &tmp); let s_ll = run(RouterMode::LeastLoaded, &trace_path, &tmp); let s_ttl = run(RouterMode::TtlAware, &trace_path, &tmp); let s_prec = run(RouterMode::Precise, &trace_path, &tmp); let total_hit = |s: &kvcache_simulator::metrics::Summary| s.hit_rate_l0 + s.hit_rate_l1 + s.hit_rate_remote; let h_rand = total_hit(&s_random); let h_ll = total_hit(&s_ll); let h_ttl = total_hit(&s_ttl); let h_prec = total_hit(&s_prec); eprintln!( "smoke: hit rates random={:.3} least_loaded={:.3} ttl={:.3} precise={:.3}", h_rand, h_ll, h_ttl, h_prec ); eprintln!( " remote+local hit ratio L0/L1/remote: \ random=({:.2},{:.2},{:.2}) precise=({:.2},{:.2},{:.2})", s_random.hit_rate_l0, s_random.hit_rate_l1, s_random.hit_rate_remote, s_prec.hit_rate_l0, s_prec.hit_rate_l1, s_prec.hit_rate_remote, ); // ttl_aware and precise should outperform random / least_loaded for // a workload built entirely of shared-prefix conversations. let eps = 1e-6; assert!(h_ttl + eps >= h_rand, "ttl_aware should >= random hit rate"); assert!(h_prec + eps >= h_rand, "precise should >= random hit rate"); assert!( h_prec + eps >= h_ll, "precise should >= least_loaded hit rate" ); } #[test] fn ablation_lru_preserves_ttft_fields() { let tmp = std::env::temp_dir().join("kvcache_sim_replay"); let _ = std::fs::remove_dir_all(&tmp); std::fs::create_dir_all(&tmp).unwrap(); let trace_path = tmp.join("trace.jsonl"); write_synthetic_trace(&trace_path); let cfg = base_config( trace_path.to_str().unwrap(), tmp.to_str().unwrap(), RouterMode::Random, ); let online = driver::run(&cfg, Some("online_lru")).expect("online lru run"); let out = driver::ablate_fixed_placement(&cfg, &[RouterMode::Random], &[ReplayEvictPolicy::Lru]) .expect("ablate lru"); assert_eq!(out.len(), 1); let row = &out[0]; let online_hit = online.summary.hit_rate_l0 + online.summary.hit_rate_l1 + online.summary.hit_rate_remote; let ablate_hit = row.hit_rate_l0 + row.hit_rate_l1 + row.hit_rate_remote; assert!( (ablate_hit - online_hit).abs() < 1e-9, "ablation lru should match online lru hit rate: online={online_hit} ablate={ablate_hit}" ); assert!((row.ttft_mean - online.summary.ttft_mean).abs() < 1e-9); assert!((row.ttft_p50 - online.summary.ttft_p50).abs() < 1e-9); assert!((row.ttft_p95 - online.summary.ttft_p95).abs() < 1e-9); assert!((row.ttft_p99 - online.summary.ttft_p99).abs() < 1e-9); } #[test] fn ablate_rejects_belady_until_exact_algorithm_exists() { let tmp = std::env::temp_dir().join("kvcache_sim_ablate_evict"); let _ = std::fs::remove_dir_all(&tmp); std::fs::create_dir_all(&tmp).unwrap(); let trace_path = tmp.join("trace.jsonl"); write_synthetic_trace(&trace_path); let cfg = base_config( trace_path.to_str().unwrap(), tmp.to_str().unwrap(), RouterMode::Random, ); let err = driver::ablate_fixed_placement(&cfg, &[RouterMode::Random], &[ReplayEvictPolicy::Belady]) .expect_err("belady should be rejected"); assert!( err.to_string().contains("exact belady"), "unexpected error: {err:#}" ); }