Files
kvcache-simulator/tests/smoke.rs
Gahow Wang c86d931d8f feat(ablate): input-length bucketing + auto-instance sizing
- Add sim.input_length_{min,max} (+ CLI overrides) that drop requests
  outside the bucket after trace load, enabling per-bucket ablation
  (e.g. 0-40k) without rewriting the trace file. Applied uniformly in
  both `run`/`ablate` driver path and `oracle` analysis.

- Add cache_score_strong router (alpha=1, beta=1) to isolate how much
  of cache_affinity's win is reproducible by just retuning beta in the
  existing cache_score framework (no rendezvous, no meta-store bonus).

- Add --auto-instances to ablate: sweeps --auto-candidates ascending
  with --auto-probe-router and picks the smallest cluster size whose
  TTFT mean <= --auto-target-ttft-mean. Per-candidate calibration
  results are persisted under runs/<output_dir>/auto_instances/ so the
  pick is auditable; the chosen N is then used for the whole ablation.
2026-04-15 19:42:28 +08:00

224 lines
7.5 KiB
Rust

//! Smoke test: synthesize a small trace with shared prefixes and assert that
//! smarter routers do not lose cache hit rate: ttl_aware and precise must each
//! match or beat random, and precise must also match or beat least_loaded.
use std::io::Write;
use kvcache_simulator::config::*;
use kvcache_simulator::driver;
use kvcache_simulator::replay::ReplayEvictPolicy;
/// Build a minimal simulator `Config` shared by all smoke tests: a tiny
/// 4-layer model, a single 4-instance cluster, and deterministic sim
/// settings. Only the trace path, output directory, and router `mode`
/// vary between invocations; everything else is fixed.
fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config {
    // Small model so the simulated cluster fills its HBM quickly and
    // cache behavior becomes observable within a short trace.
    let model = ModelConfig {
        name: "test".into(),
        num_layers: 4,
        num_kv_heads: 2,
        head_dim: 64,
        dtype_bytes: 2,
        block_size_tokens: 16,
        flops_per_token_prefill: Some(1.0e9),
        attn_quadratic_coeff: Some(64.0),
        ..Default::default()
    };
    let hardware = HardwareConfig {
        gpu_flops: 1.0e14,
        gpu_fp8_flops: 0.0,
        gpu_fp4_flops: 0.0,
        gpu_mem_bw: 1.0e12,
        hbm_bytes: 1.0e9,
        dram_bytes: 4.0e9,
        host_dram_bw: 5.0e11,
        pcie_bw: 32.0e9,
        pcie_latency_us: 1.0,
        rdma_bw: 12.0e9,
        rdma_latency_us: 5.0,
        intra_node_tp_bw: 9.0e11,
        intra_node_tp_latency_us: 2.0,
        tp_degree: 1,
        max_batch_slots: 32,
        prefill_chunk_tokens: 1024,
    };
    let cluster = ClusterConfig {
        num_instances: 4,
        // Long TTL so metadata never expires within the short trace window.
        meta_store: MetaStoreConfig { ttl_seconds: 1000.0 },
        router: RouterConfig {
            mode,
            precise_probe_latency_us: 10.0,
            precise_probe_topk: 4,
            load_alpha: 0.1,
            score_alpha: 1.0,
            score_beta: 0.1,
            prefix_k: 8,
            affinity_fan_out: 0,
        },
    };
    let sim = SimConfig {
        trace_path: trace_path.into(),
        max_requests: None,
        output_dir: out_dir.into(),
        sample_interval_s: 0.0,
        seed: 7,
        // No input-length bucketing in the smoke tests.
        input_length_min: None,
        input_length_max: None,
    };
    Config {
        model,
        hardware,
        calibration: CalibrationConfig::default(),
        cluster,
        sim,
    }
}
/// Write a small synthetic JSONL trace to `path`: 5 distinct conversations,
/// each with 8 turns. Within a conversation, turn k+1 reuses the full
/// block-hash prefix of turn k and appends 2 new blocks — the canonical
/// KV-prefix-cache reuse pattern.
///
/// # Panics
/// Panics if the file cannot be created or written (test helper).
fn write_synthetic_trace(path: &std::path::Path) {
    let mut f = std::fs::File::create(path).unwrap();
    let mut t = 0.0_f64;
    for conv in 0..5i64 {
        // Seed prefix: 10 blocks unique to this conversation.
        let mut prefix: Vec<i64> = (0..10).map(|i| conv * 1_000_000 + i).collect();
        for turn in 0..8 {
            let mut hashes = prefix.clone();
            // Append 2 new blocks unique to this turn.
            for j in 0..2 {
                let h = conv * 1_000_000 + 100 + (turn as i64) * 10 + j;
                hashes.push(h);
            }
            let line = serde_json::json!({
                "chat_id": conv,
                "parent_chat_id": -1,
                "timestamp": t,
                "input_length": (hashes.len() as i64) * 16,
                "output_length": 16, // 1 block of decode
                "type": "text",
                "turn": turn,
                "hash_ids": hashes,
            });
            writeln!(f, "{}", line).unwrap();
            // Next turn's prefix grows to include this turn's appended blocks.
            prefix = hashes;
            t += 0.05;
        }
    }
}
/// Run the simulator once with router `mode` over the trace at `trace_path`,
/// writing outputs under `out_root`, and return the resulting metrics summary.
///
/// # Panics
/// Panics if either path is not valid UTF-8 or the driver returns an error
/// (acceptable in a test helper).
fn run(
    mode: RouterMode,
    trace_path: &std::path::Path,
    out_root: &std::path::Path,
) -> kvcache_simulator::metrics::Summary {
    let cfg = base_config(trace_path.to_str().unwrap(), out_root.to_str().unwrap(), mode);
    driver::run(&cfg, Some(mode.as_str()))
        .expect("sim run")
        .summary
}
#[test]
fn ablation_hit_rate_ordering() {
    // Recreate a fresh scratch directory so stale outputs cannot leak in.
    let work_dir = std::env::temp_dir().join("kvcache_sim_smoke");
    let _ = std::fs::remove_dir_all(&work_dir);
    std::fs::create_dir_all(&work_dir).unwrap();
    let trace = work_dir.join("trace.jsonl");
    write_synthetic_trace(&trace);

    // One full simulation per router mode over the identical trace.
    let sum_random = run(RouterMode::Random, &trace, &work_dir);
    let sum_least = run(RouterMode::LeastLoaded, &trace, &work_dir);
    let sum_ttl = run(RouterMode::TtlAware, &trace, &work_dir);
    let sum_precise = run(RouterMode::Precise, &trace, &work_dir);

    // Aggregate hit rate across all tiers (local L0/L1 plus remote).
    fn total_hit(s: &kvcache_simulator::metrics::Summary) -> f64 {
        s.hit_rate_l0 + s.hit_rate_l1 + s.hit_rate_remote
    }
    let h_random = total_hit(&sum_random);
    let h_least = total_hit(&sum_least);
    let h_ttl = total_hit(&sum_ttl);
    let h_precise = total_hit(&sum_precise);

    eprintln!(
        "smoke: hit rates random={:.3} least_loaded={:.3} ttl={:.3} precise={:.3}",
        h_random, h_least, h_ttl, h_precise
    );
    eprintln!(
        " remote+local hit ratio L0/L1/remote: \
         random=({:.2},{:.2},{:.2}) precise=({:.2},{:.2},{:.2})",
        sum_random.hit_rate_l0,
        sum_random.hit_rate_l1,
        sum_random.hit_rate_remote,
        sum_precise.hit_rate_l0,
        sum_precise.hit_rate_l1,
        sum_precise.hit_rate_remote,
    );

    // ttl_aware and precise should outperform random / least_loaded for
    // a workload built entirely of shared-prefix conversations.
    const EPS: f64 = 1e-6;
    assert!(h_ttl + EPS >= h_random, "ttl_aware should >= random hit rate");
    assert!(h_precise + EPS >= h_random, "precise should >= random hit rate");
    assert!(
        h_precise + EPS >= h_least,
        "precise should >= least_loaded hit rate"
    );
}
#[test]
fn ablation_lru_preserves_ttft_fields() {
    // Fresh scratch directory for this test's trace and outputs.
    let scratch = std::env::temp_dir().join("kvcache_sim_replay");
    let _ = std::fs::remove_dir_all(&scratch);
    std::fs::create_dir_all(&scratch).unwrap();
    let trace = scratch.join("trace.jsonl");
    write_synthetic_trace(&trace);

    let cfg = base_config(
        trace.to_str().unwrap(),
        scratch.to_str().unwrap(),
        RouterMode::Random,
    );

    // Run the same workload twice: online simulation, then the
    // fixed-placement replay ablation with the LRU eviction policy.
    let online = driver::run(&cfg, Some("online_lru")).expect("online lru run");
    let rows = driver::ablate_fixed_placement(
        &cfg,
        &[RouterMode::Random],
        &[ReplayEvictPolicy::Lru],
    )
    .expect("ablate lru");
    assert_eq!(rows.len(), 1);
    let row = &rows[0];

    // The replay path must reproduce the online run's aggregate hit rate…
    let online_hit =
        online.summary.hit_rate_l0 + online.summary.hit_rate_l1 + online.summary.hit_rate_remote;
    let ablate_hit = row.hit_rate_l0 + row.hit_rate_l1 + row.hit_rate_remote;
    assert!(
        (ablate_hit - online_hit).abs() < 1e-9,
        "ablation lru should match online lru hit rate: online={online_hit} ablate={ablate_hit}"
    );
    // …and carry the TTFT distribution fields through unchanged.
    assert!((row.ttft_mean - online.summary.ttft_mean).abs() < 1e-9);
    assert!((row.ttft_p50 - online.summary.ttft_p50).abs() < 1e-9);
    assert!((row.ttft_p95 - online.summary.ttft_p95).abs() < 1e-9);
    assert!((row.ttft_p99 - online.summary.ttft_p99).abs() < 1e-9);
}
#[test]
fn ablate_rejects_belady_until_exact_algorithm_exists() {
    // Fresh scratch directory for this test's trace and outputs.
    let scratch = std::env::temp_dir().join("kvcache_sim_ablate_evict");
    let _ = std::fs::remove_dir_all(&scratch);
    std::fs::create_dir_all(&scratch).unwrap();
    let trace = scratch.join("trace.jsonl");
    write_synthetic_trace(&trace);

    let cfg = base_config(
        trace.to_str().unwrap(),
        scratch.to_str().unwrap(),
        RouterMode::Random,
    );

    // Requesting the Belady eviction policy must be refused by the driver
    // with an error that names the missing exact algorithm, rather than
    // silently falling back to an approximation.
    let err = driver::ablate_fixed_placement(
        &cfg,
        &[RouterMode::Random],
        &[ReplayEvictPolicy::Belady],
    )
    .expect_err("belady should be rejected");
    assert!(
        err.to_string().contains("exact belady"),
        "unexpected error: {err:#}"
    );
}