chore: update ablation and clean configs

2026-04-15 14:48:59 +08:00
parent eaf574cd4e
commit 365ceac3be
15 changed files with 879 additions and 324 deletions
--- a/README.md
+++ b/README.md
@@ -58,11 +58,19 @@ Prints `summary.json` to stdout and writes the full output directory
 target/release/kvcache-sim ablate \
    --config configs/glm5-8xb200-hf.yaml \
    --routers random,least_loaded,least_tokens,min_pd,prefix_affinity \
    --evict-policies lru \
    --output-dir runs/glm5_ablation
 ```
-Writes one subdirectory per router plus a combined
+Writes `ablation.json` with one row per `router x evict_policy`.
-`ablation.json` with side-by-side summaries.
+
 `ablate` currently supports only `lru` as a valid eviction policy. The
 aggregated output keeps the online prefill-time metrics
 (`ttft_mean/p50/p95/p99`) and omits `e2e`.
 The previous replay-based `belady` approximation has been removed from
 the CLI because it was not an exact full-hierarchy Belady algorithm and
 could produce misleading comparisons against `lru`.
 ### 3. Compute theoretical hit-rate ceilings (oracle)
@@ -115,7 +123,8 @@ so the same config can be reused across sweeps:
 | `--ttl-seconds <S>`      | `cluster.meta_store.ttl_seconds`          |
 `oracle` additionally takes `--capacity-blocks <N>` / `--per-instance`
-and `--out <PATH>`. `ablate` additionally takes `--routers <csv>`.
+and `--out <PATH>`. `ablate` additionally takes `--routers <csv>` and
 `--evict-policies <csv>` (currently only `lru`).
 ## Router modes
@@ -288,12 +297,8 @@ memory_time  = layers * weight_bytes_per_layer / gpu_mem_bw
 | Config | Model | Hardware | Instances | Trace |
 |--------|-------|----------|-----------|-------|
 | `glm5-8xb200-hf.yaml` | GLM-5 via HF config.json | 8xB200 preset | 32 | GLM coder blk512 |
-| `glm5-8xb200-blk512.yaml` | GLM-5 inline | 8xB200 inline | 64 | GLM coder blk512 |
+| `glm5-nvfp4-8xb300.yaml` | GLM-5-NVFP4 via HF config.json | 8xB300 preset | 8 | GLM coder blk512 |
 | `glm5-8xb200.yaml` | GLM-5 inline | 8xB200 inline | 8 | GLM coder blk512 |
 | `qwen3-coder-480b-8xh20.yaml` | Qwen3-Coder via HF | 8xH20 preset | 32 | Qwen coder blk16 |
 | `qwen2.5-coder-7b-h800.yaml` | Qwen2.5-7B inline | H800 inline | 16 | Qwen coder blk16 |
 | `qwen2.5-coder-7b-preset.yaml` | Qwen2.5-7B inline | H800 preset | 16 | Qwen coder blk16 |
 | `qwen2.5-coder-32b-h800.yaml` | Qwen2.5-32B inline | H800 inline | 16 | Qwen coder blk16 |
 ## Outputs
--- a/configs/glm5-8xb200-blk512.yaml
+++ b/configs/glm5-8xb200-blk512.yaml
@@ -1,68 +0,0 @@
 # GLM-5 (zai-org/GLM-5) on 8 x B200 SXM (192GB each).
 # Architecture from HuggingFace config.json — all roofline coefficients
 # are derived automatically.
 model:
  name: glm-5
  # Core architecture (from HF config.json)
  num_layers: 78
  hidden_size: 6144
  num_attention_heads: 64
  num_kv_heads: 64             # formalism; MLA overrides KV cache sizing
  head_dim: 64
  intermediate_size: 12288     # shared expert FFN width
  dtype_bytes: 2               # BF16
  block_size_tokens: 512       # matches bailian-traces blksz_512
  # MoE: 256 routed + 1 shared, 8 active per token
  moe:
    num_experts: 256
    num_active_experts: 8
    num_shared_experts: 1
    expert_intermediate_size: 2048   # moe_intermediate_size
  # MLA (Multi-head Latent Attention): compressed KV cache
  mla:
    kv_lora_rank: 512
    q_lora_rank: 2048
    qk_nope_head_dim: 192
    qk_rope_head_dim: 64
    v_head_dim: 256
  # DSA (DeepSeek Sparse Attention): sub-quadratic past dense_window
  attention:
    type: dsa
    dense_window: 4096
    sparse_stride: 8
    first_dense_layers: 3
 hardware:
  # Aggregate of 8 x B200 in one tensor-parallel group.
  gpu_flops:        1.80e16    # 8 * 2.25 PFLOPS BF16 dense
  gpu_mem_bw:       6.40e13    # 8 * 8 TB/s HBM3e
  # KV budget after FP8 weights + activations. GLM-5 FP8 ~744GB of 1536GB.
  hbm_bytes:        500.0e9
  dram_bytes:       1.5e12     # ~1.5 TB usable CPU DRAM / v6d per node
  pcie_bw:          128.0e9    # PCIe Gen6 x16
  pcie_latency_us:  4.0
  rdma_bw:          50.0e9     # ConnectX-7 400 Gbps
  rdma_latency_us:  6.0
  max_batch_slots:  256
  prefill_chunk_tokens: 4096
 cluster:
  num_instances: 64
  meta_store:
    ttl_seconds: 300.0
  router:
    mode: min_pd
    precise_probe_latency_us: 50.0
    precise_probe_topk: 4
    load_alpha: 1.0
 sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_8xb200_blk512
  sample_interval_s: 1.0
  seed: 42
--- a/configs/glm5-8xb200-hf.yaml
+++ b/configs/glm5-8xb200-hf.yaml
@@ -1,40 +0,0 @@
 # GLM-5 using HuggingFace config.json + hardware preset.
 #
 # This config demonstrates the simplified format:
 #   model.config_json  — loads architecture from HF config.json
 #   hardware.type      — loads GPU specs from built-in preset
 #
 # Only deployment-specific fields need to be set explicitly.
 # Any field from config_json or the preset can be overridden in YAML.
 model:
  # Auto-detect architecture: MoE, MLA, DSA, head dims, etc.
  config_json: ../models/GLM-5/config.json
  name: glm-5                    # override HF model_type
  dtype_bytes: 1                 # BF16 (not in HF config.json)
  block_size_tokens: 512         # matches bailian-traces blksz_512
 hardware:
  type: 8xb200                   # 8 x B200 SXM (192GB each)
  # Override preset values for this specific deployment:
  hbm_bytes: 500.0e9             # KV budget after FP8 weights + activations
  dram_bytes: 1.5e12             # ~1.5 TB usable CPU DRAM per node
  max_batch_slots: 256
 cluster:
  num_instances: 32
  meta_store:
    ttl_seconds: 300.0
  router:
    mode: min_pd
    precise_probe_latency_us: 50.0
    precise_probe_topk: 4
    load_alpha: 1.0
    prefix_k: 8
 sim:
  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_8xb200_hf
  sample_interval_s: 1.0
  seed: 42
--- a/configs/glm5-8xb200.yaml
+++ b/configs/glm5-8xb200.yaml
@@ -1,66 +1,39 @@
-# GLM-5 (zai-org/GLM-5) served as a single tensor-parallel instance on
+# GLM-5 using HuggingFace config.json + hardware preset.
 # 8 x NVIDIA B200 SXM (192GB each, 1.5 TB aggregate HBM).
 #
-# GLM-5 is a 744B-total / 40B-active Mixture-of-Experts model (BF16),
+# This config demonstrates the simplified format:
-# using DeepSeek Sparse Attention (DSA). The HF card does not publish
+#   model.config_json  — loads architecture from HF config.json
-# layer/head shapes, so the values below are reasonable estimates based
+#   hardware.type      — loads GPU specs from built-in preset
 # on the GLM-4.5 lineage; adjust once the official config.json is public.
 #
-# Hardware values below represent the *aggregate* of the 8-GPU TP group
+# Only deployment-specific fields need to be set explicitly.
-# (one simulated "instance" == one 8xB200 serving replica). This is how
+# Any field from config_json or the preset can be overridden in YAML.
 # the roofline in src/instance/compute.rs wants to see it: gpu_flops and
 # gpu_mem_bw are the effective peaks seen by the TP'd model.
 #
 # Calibrate `flops_per_token_prefill` and `attn_quadratic_coeff` against
 # measured prefill latency before trusting absolute TTFT numbers.
 model:
-  name: glm-5
+  # Auto-detect architecture: MoE, MLA, DSA, head dims, etc.
-  # --- estimates; refine from official config.json when available ---
+  config_json: ../models/GLM-5/config.json
-  num_layers: 92
+  name: glm-5                    # override HF model_type
-  num_kv_heads: 8              # GQA
+  dtype_bytes: 1                 # BF16 (not in HF config.json)
-  head_dim: 128
+  block_size_tokens: 512         # matches bailian-traces blksz_512
  dtype_bytes: 2               # BF16
  block_size_tokens: 16        # trace convention
  # Active-params-driven roofline: MoE activates ~40B params per token,
  # so non-attention prefill FLOPs/token ≈ 2 * 40e9 = 8e10.
  flops_per_token_prefill: 8.0e10
  # Quadratic attention term ≈ 2 * num_heads * head_dim. GLM-5 uses
  # DeepSeek Sparse Attention which is sub-quadratic in practice, so
  # this coefficient is an upper bound — lower it if your measurements
  # show DSA kicking in for long prompts.
  attn_quadratic_coeff:    2048.0
  bytes_per_token_prefill: 0.0
 hardware:
-  # Aggregate of 8 x B200 in one tensor-parallel group.
+  type: 8xb200                   # 8 x B200 SXM (192GB each)
-  gpu_flops:        1.80e16    # 8 * 2.25 PFLOPS BF16 dense
+  # Override preset values for this specific deployment:
-  gpu_mem_bw:       6.40e13    # 8 * 8 TB/s HBM3e
+  hbm_bytes: 500.0e9             # KV budget after FP8 weights + activations
-  # KV-cache budget after weights + activations. GLM-5 @ BF16 is ~1.49TB,
+  dram_bytes: 1.5e12             # ~1.5 TB usable CPU DRAM per node
  # which barely fits in 1.5TB HBM; realistic serving uses FP8 weights
  # (~744GB), leaving ~500GB for activations + KV cache. Adjust if your
  # deployment uses a different weight dtype.
  hbm_bytes:        500.0e9
  dram_bytes:       1.5e12     # ~1.5 TB usable CPU DRAM / v6d per node
  pcie_bw:          128.0e9    # PCIe Gen6 x16 ~ 128 GB/s per direction
  pcie_latency_us:  4.0
  rdma_bw:          50.0e9     # ConnectX-7 400 Gbps ≈ 50 GB/s
  rdma_latency_us:  6.0
  max_batch_slots: 256
  prefill_chunk_tokens: 2048
 cluster:
-  num_instances: 8             # 8 TP replicas -> 64 B200s cluster-wide
+  num_instances: 32
  meta_store:
-    ttl_seconds: 120.0
+    ttl_seconds: 300.0
  router:
-    mode: ttl_aware
+    mode: min_pd
    precise_probe_latency_us: 50.0
    precise_probe_topk: 4
    load_alpha: 1.0
    prefix_k: 8
 sim:
-  trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl
+  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/glm5_8xb200
  sample_interval_s: 1.0
--- a/configs/qwen2.5-coder-32b-h800.yaml
+++ b/configs/qwen2.5-coder-32b-h800.yaml
@@ -1,42 +0,0 @@
 # Qwen2.5-Coder-32B (dense, GQA) on H800 SXM (80GB).
 # Architecture from HuggingFace config.json — roofline auto-derived.
 model:
  name: qwen2.5-coder-32b
  num_layers: 64
  hidden_size: 5120
  num_attention_heads: 40
  num_kv_heads: 8              # GQA
  head_dim: 128
  intermediate_size: 27648     # SwiGLU FFN
  dtype_bytes: 2               # BF16
  block_size_tokens: 16
 hardware:
  gpu_flops:        9.89e14
  gpu_mem_bw:       3.35e12
  hbm_bytes:        20.0e9     # smaller budget: 32B weights are large
  dram_bytes:       512.0e9
  pcie_bw:          64.0e9
  pcie_latency_us:  5.0
  rdma_bw:          25.0e9
  rdma_latency_us:  8.0
  max_batch_slots:  128
  prefill_chunk_tokens: 1024
 cluster:
  num_instances: 16
  meta_store:
    ttl_seconds: 60.0
  router:
    mode: ttl_aware
    precise_probe_latency_us: 50.0
    precise_probe_topk: 4
    load_alpha: 1.0
 sim:
  trace_path: traces/qwen_coder_blksz_16.jsonl
  max_requests: null
  output_dir: runs/qwen32b
  sample_interval_s: 1.0
  seed: 42
--- a/configs/qwen2.5-coder-7b-h800.yaml
+++ b/configs/qwen2.5-coder-7b-h800.yaml
@@ -1,42 +0,0 @@
 # Qwen2.5-Coder-7B (dense, GQA) on a single H800 SXM (80GB).
 # Architecture from HuggingFace config.json — roofline auto-derived.
 model:
  name: qwen2.5-coder-7b
  num_layers: 28
  hidden_size: 3584
  num_attention_heads: 28
  num_kv_heads: 4              # GQA: 28 query heads, 4 KV heads
  head_dim: 128
  intermediate_size: 18944     # SwiGLU FFN
  dtype_bytes: 2               # BF16
  block_size_tokens: 16        # matches qwen_coder_blksz_16 trace
 hardware:
  gpu_flops:        9.89e14    # H800 bf16 dense
  gpu_mem_bw:       3.35e12    # 3.35 TB/s HBM3
  hbm_bytes:        60.0e9     # leave headroom for weights/activations
  dram_bytes:       512.0e9
  pcie_bw:          64.0e9     # PCIe Gen5 x16
  pcie_latency_us:  5.0
  rdma_bw:          25.0e9     # ~200 Gbps NIC
  rdma_latency_us:  8.0
  max_batch_slots:  256
  prefill_chunk_tokens: 2048
 cluster:
  num_instances: 16
  meta_store:
    ttl_seconds: 60.0
  router:
    mode: ttl_aware
    precise_probe_latency_us: 50.0
    precise_probe_topk: 4
    load_alpha: 1.0
 sim:
  trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl
  max_requests: null
  output_dir: runs/qwen7b
  sample_interval_s: 1.0
  seed: 42
--- a/configs/qwen2.5-coder-7b-preset.yaml
+++ b/configs/qwen2.5-coder-7b-preset.yaml
@@ -1,36 +0,0 @@
 # Qwen2.5-Coder-7B using hardware preset.
 #
 # Model architecture is specified inline (no config.json needed for simple
 # models). Hardware uses preset "h800" with a single override for hbm_bytes.
 model:
  name: qwen2.5-coder-7b
  num_layers: 28
  hidden_size: 3584
  num_attention_heads: 28
  num_kv_heads: 4
  head_dim: 128
  intermediate_size: 18944
  dtype_bytes: 2
  block_size_tokens: 16
 hardware:
  type: h800                     # single H800 SXM (80GB)
  hbm_bytes: 60.0e9             # KV budget after 7B model weights
 cluster:
  num_instances: 16
  meta_store:
    ttl_seconds: 60.0
  router:
    mode: ttl_aware
    precise_probe_latency_us: 50.0
    precise_probe_topk: 4
    load_alpha: 1.0
 sim:
  trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl
  max_requests: null
  output_dir: runs/qwen7b_preset
  sample_interval_s: 1.0
  seed: 42
--- a/configs/qwen3-coder-480b-8xh20.yaml
+++ b/configs/qwen3-coder-480b-8xh20.yaml
@@ -5,16 +5,17 @@ model:
  config_json: ../models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json
  name: qwen3-coder-480b
  dtype_bytes: 1               # FP8 inference
-  block_size_tokens: 16
+  block_size_tokens: 512
 hardware:
  type: 8xh20
  hbm_bytes: 400.0e9           # KV budget after FP8 weights on 8x96GB
  dram_bytes: 1.0e12             # ~1.0 TB usable CPU DRAM per node
 cluster:
-  num_instances: 32
+  num_instances: 128
  meta_store:
-    ttl_seconds: 120.0
+    ttl_seconds: 300.0
  router:
    mode: min_pd
    precise_probe_latency_us: 50.0
@@ -22,7 +23,7 @@ cluster:
    load_alpha: 1.0
 sim:
-  trace_path: traces/qwen_coder_blksz_16.jsonl
+  trace_path: bailian-traces/qwen3_coder_blksz_512_040915-040917.jsonl
  max_requests: null
  output_dir: runs/qwen3_coder_8xh20
  sample_interval_s: 1.0
--- a/src/driver.rs
+++ b/src/driver.rs
@@ -6,11 +6,13 @@ use std::collections::HashMap;
 use std::path::Path;
 use crate::cluster::Cluster;
-use crate::config::Config;
+use crate::config::{Config, RouterMode};
 use crate::metrics::ablation::AblationRow;
 use crate::metrics::per_request::{PerRequestRow, PerRequestWriter};
 use crate::metrics::routing_log::RoutingLogWriter;
 use crate::metrics::summary::Summary;
 use crate::metrics::timeseries::{TimeseriesRow, TimeseriesWriter};
 use crate::replay::ReplayEvictPolicy;
 use crate::sim::{Event, EventQueue};
 use crate::trace::{RequestRecord, TraceReader};
@@ -100,7 +102,12 @@ pub fn run(config: &Config, output_subdir: Option<&str>) -> Result<RunOutputs> {
                if !inst.tick_scheduled {
                    inst.tick_scheduled = true;
                    let when = stats.ready_at.max(now);
-                    q.schedule(when, Event::BatchTick { instance: stats.instance });
+                    q.schedule(
                        when,
                        Event::BatchTick {
                            instance: stats.instance,
                        },
                    );
                }
            }
            Event::BatchTick { instance } => {
@@ -168,3 +175,33 @@ pub fn run(config: &Config, output_subdir: Option<&str>) -> Result<RunOutputs> {
    Ok(RunOutputs { summary, rows })
 }
 pub fn ablate_fixed_placement(
    base: &Config,
    routers: &[RouterMode],
    evict_policies: &[ReplayEvictPolicy],
 ) -> Result<Vec<AblationRow>> {
    let mut out = Vec::new();
    for &policy in evict_policies {
        if policy != ReplayEvictPolicy::Lru {
            return Err(anyhow::anyhow!(
                "exact belady is not supported for fixed-placement full-hierarchy ablation; \
                 the previous replay-based approximation has been removed"
            ));
        }
    }
    for &mode in routers {
        let mut cfg = base.clone();
        cfg.cluster.router.mode = mode;
        let placement_run = run(&cfg, Some(&format!("{}__placement_lru", mode.as_str())))?;
        for &policy in evict_policies {
            out.push(AblationRow::from_summary(
                mode.as_str(),
                policy,
                "realized_lru",
                &placement_run.summary,
            ));
        }
    }
    Ok(out)
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,6 +7,7 @@ pub mod instance;
 pub mod metrics;
 pub mod network;
 pub mod oracle;
 pub mod replay;
 pub mod router;
 pub mod sim;
 pub mod trace;
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,6 +3,7 @@ use clap::{Args, Parser, Subcommand};
 use std::path::PathBuf;
 use kvcache_simulator::config::{Config, RouterMode};
 use kvcache_simulator::replay::ReplayEvictPolicy;
 use kvcache_simulator::{driver, oracle, trace::TraceReader};
 #[derive(Debug, Parser)]
@@ -74,7 +75,8 @@ enum Cmd {
        #[command(flatten)]
        overrides: ConfigOverrides,
    },
-    /// Run the same trace under multiple routers and compare summaries.
+    /// Run the same trace under multiple routers and fixed-placement eviction
    /// policies, then compare cache-hit summaries.
    Ablate {
        #[arg(short, long)]
        config: PathBuf,
@@ -85,6 +87,10 @@ enum Cmd {
            default_value = "random,least_loaded,least_tokens,ttl_aware,min_pd,cache_load,cache_score,estimated_ttft,prefix_affinity"
        )]
        routers: String,
        /// Comma-separated eviction policies for ablation aggregation.
        /// Currently only `lru` is supported.
        #[arg(long, default_value = "lru")]
        evict_policies: String,
        #[command(flatten)]
        overrides: ConfigOverrides,
    },
@@ -125,8 +131,9 @@ fn main() -> Result<()> {
        Cmd::Ablate {
            config,
            routers,
            evict_policies,
            overrides,
-        } => cmd_ablate(&config, &routers, &overrides),
+        } => cmd_ablate(&config, &routers, &evict_policies, &overrides),
        Cmd::Validate { config, overrides } => cmd_validate(&config, &overrides),
        Cmd::Oracle {
            config,
@@ -134,7 +141,13 @@ fn main() -> Result<()> {
            capacity_blocks,
            per_instance,
            out,
-        } => cmd_oracle(&config, &overrides, capacity_blocks, per_instance, out.as_deref()),
+        } => cmd_oracle(
            &config,
            &overrides,
            capacity_blocks,
            per_instance,
            out.as_deref(),
        ),
    }
 }
@@ -151,7 +164,12 @@ fn cmd_run(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
    Ok(())
 }
-fn cmd_ablate(path: &PathBuf, routers: &str, overrides: &ConfigOverrides) -> Result<()> {
+fn cmd_ablate(
    path: &PathBuf,
    routers: &str,
    evict_policies: &str,
    overrides: &ConfigOverrides,
 ) -> Result<()> {
    let base = load(path, overrides)?;
    let modes: Vec<RouterMode> = routers
        .split(',')
@@ -160,15 +178,27 @@ fn cmd_ablate(path: &PathBuf, routers: &str, overrides: &ConfigOverrides) -> Res
        .map(RouterMode::parse)
        .collect::<Result<Vec<_>>>()
        .with_context(|| format!("parsing --routers='{routers}'"))?;
-    let mut all = Vec::new();
+    let policies: Vec<ReplayEvictPolicy> = evict_policies
-    for mode in modes {
+        .split(',')
-        let mut cfg = base.clone();
+        .map(|s| s.trim())
-        cfg.cluster.router.mode = mode;
+        .filter(|s| !s.is_empty())
-        let sub = mode.as_str().to_string();
+        .map(ReplayEvictPolicy::parse)
-        eprintln!("[ablate] running router={}", sub);
+        .collect::<Result<Vec<_>>>()
-        let out = driver::run(&cfg, Some(&sub))?;
+        .with_context(|| format!("parsing --evict-policies='{evict_policies}'"))?;
-        all.push(out.summary);
+    eprintln!(
-    }
+        "[ablate] routers={} evict_policies={}",
        modes
            .iter()
            .map(RouterMode::as_str)
            .collect::<Vec<_>>()
            .join(","),
        policies
            .iter()
            .map(ReplayEvictPolicy::as_str)
            .collect::<Vec<_>>()
            .join(",")
    );
    let all = driver::ablate_fixed_placement(&base, &modes, &policies)?;
    let agg_path = std::path::Path::new(&base.sim.output_dir).join("ablation.json");
    std::fs::create_dir_all(&base.sim.output_dir)?;
    std::fs::write(&agg_path, serde_json::to_string_pretty(&all)?)?;
@@ -181,13 +211,25 @@ fn cmd_validate(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
    use kvcache_simulator::instance::compute::ComputeModel;
    let cfg = load(path, overrides)?;
    eprintln!("config OK: {}", cfg.model.name);
-    eprintln!("mode = {}", if cfg.model.is_arch_mode() { "architecture-derived" } else { "legacy manual" });
+    eprintln!(
        "mode = {}",
        if cfg.model.is_arch_mode() {
            "architecture-derived"
        } else {
            "legacy manual"
        }
    );
    let cm = ComputeModel::new(&cfg.model, &cfg.hardware);
    eprintln!("compute: {}", cm.describe());
-    eprintln!("kv_block_bytes = {} ({:.2} MB{})",
+    eprintln!(
        "kv_block_bytes = {} ({:.2} MB{})",
        cfg.model.kv_block_bytes(),
        cfg.model.kv_block_bytes() as f64 / 1e6,
-        if cfg.model.mla.is_some() { ", MLA compressed" } else { "" },
+        if cfg.model.mla.is_some() {
            ", MLA compressed"
        } else {
            ""
        },
    );
    let block_bytes = cfg.model.kv_block_bytes() as f64;
    let hbm_blocks = (cfg.hardware.hbm_bytes / block_bytes) as u64;
@@ -251,7 +293,11 @@ fn cmd_oracle(
        capacity,
        per_instance_blocks,
        cfg.cluster.num_instances,
-        if per_instance { ", per-instance mode" } else { "" }
+        if per_instance {
            ", per-instance mode"
        } else {
            ""
        }
    );
    let result = oracle::analyze(&records, capacity);
--- a/src/metrics/ablation.rs
+++ b/src/metrics/ablation.rs
@@ -0,0 +1,50 @@
 use serde::Serialize;
 use crate::metrics::Summary;
 use crate::replay::ReplayEvictPolicy;
 #[derive(Debug, Clone, Serialize)]
 pub struct AblationRow {
    pub router: String,
    pub evict_policy: String,
    pub placement_source: String,
    pub num_requests: u64,
    pub total_blocks: u64,
    pub ttft_mean: f64,
    pub ttft_p50: f64,
    pub ttft_p95: f64,
    pub ttft_p99: f64,
    pub hit_rate_l0: f64,
    pub hit_rate_l1: f64,
    pub hit_rate_remote: f64,
    pub miss_rate: f64,
    pub total_rdma_bytes: u64,
    pub total_pcie_bytes: u64,
 }
 impl AblationRow {
    pub fn from_summary(
        router: &str,
        policy: ReplayEvictPolicy,
        placement_source: &str,
        summary: &Summary,
    ) -> Self {
        Self {
            router: router.to_string(),
            evict_policy: policy.as_str().to_string(),
            placement_source: placement_source.to_string(),
            num_requests: summary.num_requests,
            total_blocks: summary.total_blocks,
            ttft_mean: summary.ttft_mean,
            ttft_p50: summary.ttft_p50,
            ttft_p95: summary.ttft_p95,
            ttft_p99: summary.ttft_p99,
            hit_rate_l0: summary.hit_rate_l0,
            hit_rate_l1: summary.hit_rate_l1,
            hit_rate_remote: summary.hit_rate_remote,
            miss_rate: summary.miss_rate,
            total_rdma_bytes: summary.total_rdma_bytes,
            total_pcie_bytes: summary.total_pcie_bytes,
        }
    }
 }
--- a/src/metrics/mod.rs
+++ b/src/metrics/mod.rs
@@ -1,7 +1,9 @@
 pub mod ablation;
 pub mod per_request;
 pub mod routing_log;
 pub mod summary;
 pub mod timeseries;
 pub use ablation::AblationRow;
 pub use per_request::PerRequestRow;
 pub use summary::Summary;
--- a/src/replay.rs
+++ b/src/replay.rs
@@ -0,0 +1,608 @@
 use ahash::{AHashMap, AHashSet};
 use anyhow::{anyhow, Result};
 use serde::Serialize;
 use std::cmp::min;
 use std::collections::BinaryHeap;
 use crate::config::Config;
 use crate::instance::kv_cache::LruBlocks;
 use crate::trace::RequestRecord;
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
 #[serde(rename_all = "snake_case")]
 pub enum ReplayEvictPolicy {
    Lru,
    Belady,
 }
 impl ReplayEvictPolicy {
    pub fn parse(s: &str) -> Result<Self> {
        match s {
            "lru" => Ok(Self::Lru),
            "belady" => Err(anyhow!(
                "exact belady is not supported for fixed-placement full-hierarchy ablation"
            )),
            other => Err(anyhow!("unknown evict policy: {other}")),
        }
    }
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Lru => "lru",
            Self::Belady => "belady",
        }
    }
 }
 #[derive(Debug, Clone)]
 pub struct PlacementEntry {
    pub req_id: u64,
    pub instance: u32,
 }
 #[derive(Debug, Clone, Serialize, Default)]
 pub struct ReplaySummary {
    pub num_requests: u64,
    pub total_blocks: u64,
    pub l0_hit_blocks: u64,
    pub l1_hit_blocks: u64,
    pub remote_hit_blocks: u64,
    pub miss_blocks: u64,
    pub hit_rate_l0: f64,
    pub hit_rate_l1: f64,
    pub hit_rate_remote: f64,
    pub miss_rate: f64,
    pub total_rdma_bytes: u64,
    pub total_pcie_bytes: u64,
 }
 impl ReplaySummary {
    fn from_counts(
        num_requests: usize,
        total_blocks: u64,
        l0_hit_blocks: u64,
        l1_hit_blocks: u64,
        remote_hit_blocks: u64,
        miss_blocks: u64,
        total_rdma_bytes: u64,
        total_pcie_bytes: u64,
    ) -> Self {
        let denom = total_blocks.max(1) as f64;
        Self {
            num_requests: num_requests as u64,
            total_blocks,
            l0_hit_blocks,
            l1_hit_blocks,
            remote_hit_blocks,
            miss_blocks,
            hit_rate_l0: l0_hit_blocks as f64 / denom,
            hit_rate_l1: l1_hit_blocks as f64 / denom,
            hit_rate_remote: remote_hit_blocks as f64 / denom,
            miss_rate: miss_blocks as f64 / denom,
            total_rdma_bytes,
            total_pcie_bytes,
        }
    }
 }
 #[derive(Debug, Clone, Copy)]
 enum FutureKind {
    L0,
    L1,
 }
 #[derive(Debug)]
 struct FutureIndex {
    local: AHashMap<(u32, u64), Vec<usize>>,
    global: AHashMap<u64, Vec<(usize, u32)>>,
 }
 impl FutureIndex {
    fn build(records: &[RequestRecord], placement: &[u32]) -> Self {
        let mut local: AHashMap<(u32, u64), Vec<usize>> = AHashMap::new();
        let mut global: AHashMap<u64, Vec<(usize, u32)>> = AHashMap::new();
        for (req_idx, record) in records.iter().enumerate() {
            let inst = placement[req_idx];
            let mut seen = AHashSet::new();
            for &block in &record.hash_ids {
                if !seen.insert(block) {
                    continue;
                }
                local.entry((inst, block)).or_default().push(req_idx);
                global.entry(block).or_default().push((req_idx, inst));
            }
        }
        Self { local, global }
    }
    fn next_local(&self, inst: u32, block: u64, current_req_idx: usize) -> usize {
        match self.local.get(&(inst, block)) {
            Some(indices) => next_after(indices, current_req_idx),
            None => usize::MAX,
        }
    }
    fn next_other(&self, inst: u32, block: u64, current_req_idx: usize) -> usize {
        let Some(indices) = self.global.get(&block) else {
            return usize::MAX;
        };
        let start = first_after_pair(indices, current_req_idx);
        for &(req_idx, owner_inst) in indices.iter().skip(start) {
            if owner_inst != inst {
                return req_idx;
            }
        }
        usize::MAX
    }
    fn next_use(&self, kind: FutureKind, inst: u32, block: u64, current_req_idx: usize) -> usize {
        match kind {
            FutureKind::L0 => self.next_local(inst, block, current_req_idx),
            FutureKind::L1 => min(
                self.next_local(inst, block, current_req_idx),
                self.next_other(inst, block, current_req_idx),
            ),
        }
    }
 }
 fn next_after(indices: &[usize], current_req_idx: usize) -> usize {
    let pos = indices.partition_point(|&idx| idx <= current_req_idx);
    indices.get(pos).copied().unwrap_or(usize::MAX)
 }
 fn first_after_pair(indices: &[(usize, u32)], current_req_idx: usize) -> usize {
    indices.partition_point(|&(idx, _)| idx <= current_req_idx)
 }
 #[derive(Debug)]
 struct BeladyTier {
    capacity: usize,
    resident: AHashSet<u64>,
    versions: AHashMap<u64, u64>,
    heap: BinaryHeap<(usize, u64, u64)>,
    next_version: u64,
 }
 impl BeladyTier {
    fn new(capacity: usize) -> Self {
        Self {
            capacity,
            resident: AHashSet::with_capacity(capacity),
            versions: AHashMap::with_capacity(capacity),
            heap: BinaryHeap::with_capacity(capacity),
            next_version: 0,
        }
    }
    fn contains(&self, key: u64) -> bool {
        self.resident.contains(&key)
    }
    fn remove(&mut self, key: u64) -> bool {
        if self.resident.remove(&key) {
            self.versions.remove(&key);
            true
        } else {
            false
        }
    }
    fn touch(
        &mut self,
        key: u64,
        current_req_idx: usize,
        kind: FutureKind,
        inst: u32,
        futures: &FutureIndex,
    ) -> bool {
        if !self.resident.contains(&key) {
            return false;
        }
        self.next_version += 1;
        let version = self.next_version;
        let next_use = futures.next_use(kind, inst, key, current_req_idx);
        self.versions.insert(key, version);
        self.heap.push((next_use, version, key));
        true
    }
    fn insert(
        &mut self,
        key: u64,
        current_req_idx: usize,
        kind: FutureKind,
        inst: u32,
        futures: &FutureIndex,
    ) -> Option<u64> {
        if self.touch(key, current_req_idx, kind, inst, futures) {
            return None;
        }
        if self.capacity == 0 {
            return Some(key);
        }
        let mut evicted = None;
        if self.resident.len() == self.capacity {
            evicted = self.evict(current_req_idx, kind, inst, futures);
        }
        self.next_version += 1;
        let version = self.next_version;
        let next_use = futures.next_use(kind, inst, key, current_req_idx);
        self.resident.insert(key);
        self.versions.insert(key, version);
        self.heap.push((next_use, version, key));
        evicted
    }
    fn evict(
        &mut self,
        current_req_idx: usize,
        kind: FutureKind,
        inst: u32,
        futures: &FutureIndex,
    ) -> Option<u64> {
        while let Some((stored_next_use, version, key)) = self.heap.pop() {
            if !self.resident.contains(&key) {
                continue;
            }
            let Some(current_version) = self.versions.get(&key).copied() else {
                continue;
            };
            if current_version != version {
                continue;
            }
            let actual_next_use = futures.next_use(kind, inst, key, current_req_idx);
            if actual_next_use != stored_next_use {
                self.next_version += 1;
                let new_version = self.next_version;
                self.versions.insert(key, new_version);
                self.heap.push((actual_next_use, new_version, key));
                continue;
            }
            self.resident.remove(&key);
            self.versions.remove(&key);
            return Some(key);
        }
        None
    }
 }
 #[derive(Debug)]
 enum Tier {
    Lru(LruBlocks),
    Belady(BeladyTier),
 }
 impl Tier {
    fn new(policy: ReplayEvictPolicy, capacity: usize) -> Self {
        match policy {
            ReplayEvictPolicy::Lru => Self::Lru(LruBlocks::new(capacity)),
            ReplayEvictPolicy::Belady => Self::Belady(BeladyTier::new(capacity)),
        }
    }
    fn contains(&self, key: u64) -> bool {
        match self {
            Self::Lru(tier) => tier.contains(key),
            Self::Belady(tier) => tier.contains(key),
        }
    }
    fn remove(&mut self, key: u64) -> bool {
        match self {
            Self::Lru(tier) => tier.remove(key),
            Self::Belady(tier) => tier.remove(key),
        }
    }
    fn touch(
        &mut self,
        key: u64,
        req_idx: usize,
        kind: FutureKind,
        inst: u32,
        futures: &FutureIndex,
    ) -> bool {
        match self {
            Self::Lru(tier) => tier.touch(key),
            Self::Belady(tier) => tier.touch(key, req_idx, kind, inst, futures),
        }
    }
    fn insert(
        &mut self,
        key: u64,
        req_idx: usize,
        kind: FutureKind,
        inst: u32,
        futures: &FutureIndex,
    ) -> Option<u64> {
        match self {
            Self::Lru(tier) => tier.insert_block(key),
            Self::Belady(tier) => tier.insert(key, req_idx, kind, inst, futures),
        }
    }
    fn longest_prefix_touch(
        &mut self,
        hashes: &[u64],
        req_idx: usize,
        kind: FutureKind,
        inst: u32,
        futures: &FutureIndex,
    ) -> usize {
        match self {
            Self::Lru(tier) => tier.longest_prefix(hashes),
            Self::Belady(tier) => {
                let mut matched = 0usize;
                for &hash in hashes {
                    if !tier.touch(hash, req_idx, kind, inst, futures) {
                        break;
                    }
                    matched += 1;
                }
                matched
            }
        }
    }
    fn longest_prefix_peek(&self, hashes: &[u64]) -> usize {
        match self {
            Self::Lru(tier) => tier.longest_prefix_peek(hashes),
            Self::Belady(tier) => {
                let mut matched = 0usize;
                for &hash in hashes {
                    if !tier.contains(hash) {
                        break;
                    }
                    matched += 1;
                }
                matched
            }
        }
    }
 }
 #[derive(Debug)]
 struct ReplayInstanceCache {
    l0: Tier,
    l1: Tier,
 }
 impl ReplayInstanceCache {
    fn new(policy: ReplayEvictPolicy, l0_cap: usize, l1_cap: usize) -> Self {
        Self {
            l0: Tier::new(policy, l0_cap),
            l1: Tier::new(policy, l1_cap),
        }
    }
    fn promote_l1_blocks_to_l0(
        &mut self,
        hashes: &[u64],
        req_idx: usize,
        inst: u32,
        futures: &FutureIndex,
        owners: &mut AHashMap<u64, AHashSet<u32>>,
    ) {
        for &hash in hashes {
            if self.l1.remove(hash) {
                remove_owner(owners, hash, inst);
            }
            self.insert_block_into_l0(hash, req_idx, inst, futures, owners);
        }
    }
    fn fetch_remote_blocks_to_l0(
        &mut self,
        hashes: &[u64],
        req_idx: usize,
        inst: u32,
        futures: &FutureIndex,
        owners: &mut AHashMap<u64, AHashSet<u32>>,
    ) {
        for &hash in hashes {
            self.stage_remote_block_in_l1(hash, req_idx, inst, futures, owners);
            if self.l1.remove(hash) {
                remove_owner(owners, hash, inst);
            }
            self.insert_block_into_l0(hash, req_idx, inst, futures, owners);
        }
    }
    fn insert_blocks_into_l0(
        &mut self,
        hashes: &[u64],
        req_idx: usize,
        inst: u32,
        futures: &FutureIndex,
        owners: &mut AHashMap<u64, AHashSet<u32>>,
    ) {
        for &hash in hashes {
            self.insert_block_into_l0(hash, req_idx, inst, futures, owners);
        }
    }
    fn insert_block_into_l0(
        &mut self,
        hash: u64,
        req_idx: usize,
        inst: u32,
        futures: &FutureIndex,
        owners: &mut AHashMap<u64, AHashSet<u32>>,
    ) {
        if self.l0.touch(hash, req_idx, FutureKind::L0, inst, futures) {
            return;
        }
        if self.l1.remove(hash) {
            remove_owner(owners, hash, inst);
        }
        if let Some(evicted_l0) = self.l0.insert(hash, req_idx, FutureKind::L0, inst, futures) {
            self.demote_into_l1(evicted_l0, req_idx, inst, futures, owners);
        }
    }
    fn stage_remote_block_in_l1(
        &mut self,
        hash: u64,
        req_idx: usize,
        inst: u32,
        futures: &FutureIndex,
        owners: &mut AHashMap<u64, AHashSet<u32>>,
    ) {
        if self.l0.contains(hash) || self.l1.contains(hash) {
            return;
        }
        if let Some(evicted_l1) = self.l1.insert(hash, req_idx, FutureKind::L1, inst, futures) {
            remove_owner(owners, evicted_l1, inst);
        }
        add_owner(owners, hash, inst);
    }
    fn demote_into_l1(
        &mut self,
        hash: u64,
        req_idx: usize,
        inst: u32,
        futures: &FutureIndex,
        owners: &mut AHashMap<u64, AHashSet<u32>>,
    ) {
        if self.l1.touch(hash, req_idx, FutureKind::L1, inst, futures) {
            return;
        }
        if let Some(evicted_l1) = self.l1.insert(hash, req_idx, FutureKind::L1, inst, futures) {
            remove_owner(owners, evicted_l1, inst);
        }
        add_owner(owners, hash, inst);
    }
 }
 fn add_owner(owners: &mut AHashMap<u64, AHashSet<u32>>, hash: u64, inst: u32) {
    owners.entry(hash).or_default().insert(inst);
 }
 fn remove_owner(owners: &mut AHashMap<u64, AHashSet<u32>>, hash: u64, inst: u32) {
    if let Some(bucket) = owners.get_mut(&hash) {
        bucket.remove(&inst);
        if bucket.is_empty() {
            owners.remove(&hash);
        }
    }
 }
 pub fn replay_fixed_placement(
    cfg: &Config,
    records: &[RequestRecord],
    placements: &[PlacementEntry],
    policy: ReplayEvictPolicy,
 ) -> Result<ReplaySummary> {
    if records.len() != placements.len() {
        return Err(anyhow!(
            "records/placements length mismatch: {} vs {}",
            records.len(),
            placements.len()
        ));
    }
    let placement_by_req: AHashMap<u64, u32> =
        placements.iter().map(|p| (p.req_id, p.instance)).collect();
    let ordered_placement: Vec<u32> = records
        .iter()
        .map(|r| {
            placement_by_req
                .get(&r.req_id)
                .copied()
                .ok_or_else(|| anyhow!("missing placement for req_id={}", r.req_id))
        })
        .collect::<Result<_>>()?;
    let futures = FutureIndex::build(records, &ordered_placement);
    let block_bytes = cfg.model.kv_block_bytes() as f64;
    let l0_cap = (cfg.hardware.hbm_bytes / block_bytes).max(1.0) as usize;
    let l1_cap = (cfg.hardware.dram_bytes / block_bytes).max(1.0) as usize;
    let num_instances = cfg.cluster.num_instances as usize;
    let mut caches: Vec<ReplayInstanceCache> = (0..num_instances)
        .map(|_| ReplayInstanceCache::new(policy, l0_cap, l1_cap))
        .collect();
    let mut owners: AHashMap<u64, AHashSet<u32>> = AHashMap::new();
    let mut total_blocks = 0u64;
    let mut l0_hit_blocks = 0u64;
    let mut l1_hit_blocks = 0u64;
    let mut remote_hit_blocks = 0u64;
    let mut miss_blocks = 0u64;
    let mut total_rdma_bytes = 0u64;
    let mut total_pcie_bytes = 0u64;
    for (req_idx, record) in records.iter().enumerate() {
        let inst = ordered_placement[req_idx];
        let cache = &mut caches[inst as usize];
        total_blocks += record.hash_ids.len() as u64;
        let l0_hits = cache.l0.longest_prefix_touch(
            &record.hash_ids,
            req_idx,
            FutureKind::L0,
            inst,
            &futures,
        );
        let suffix_after_l0 = &record.hash_ids[l0_hits..];
        let l1_hits = cache.l1.longest_prefix_peek(suffix_after_l0);
        if l1_hits > 0 {
            cache.promote_l1_blocks_to_l0(
                &suffix_after_l0[..l1_hits],
                req_idx,
                inst,
                &futures,
                &mut owners,
            );
        }
        let suffix_after_l1 = &suffix_after_l0[l1_hits..];
        let mut remote_hits = 0usize;
        for &hash in suffix_after_l1 {
            let any_remote = owners
                .get(&hash)
                .map(|bucket| bucket.iter().any(|owner| *owner != inst))
                .unwrap_or(false);
            if any_remote {
                remote_hits += 1;
            } else {
                break;
            }
        }
        if remote_hits > 0 {
            cache.fetch_remote_blocks_to_l0(
                &suffix_after_l1[..remote_hits],
                req_idx,
                inst,
                &futures,
                &mut owners,
            );
        }
        let misses = record.hash_ids.len() - l0_hits - l1_hits - remote_hits;
        let new_input = &record.hash_ids[(l0_hits + l1_hits + remote_hits)..];
        if !new_input.is_empty() {
            cache.insert_blocks_into_l0(new_input, req_idx, inst, &futures, &mut owners);
        }
        l0_hit_blocks += l0_hits as u64;
        l1_hit_blocks += l1_hits as u64;
        remote_hit_blocks += remote_hits as u64;
        miss_blocks += misses as u64;
        let kv_block_bytes = cfg.model.kv_block_bytes();
        total_rdma_bytes += (remote_hits as u64) * kv_block_bytes;
        total_pcie_bytes += ((l1_hits + remote_hits) as u64) * kv_block_bytes;
    }
    Ok(ReplaySummary::from_counts(
        records.len(),
        total_blocks,
        l0_hit_blocks,
        l1_hit_blocks,
        remote_hit_blocks,
        miss_blocks,
        total_rdma_bytes,
        total_pcie_bytes,
    ))
 }
--- a/tests/smoke.rs
+++ b/tests/smoke.rs
@@ -6,6 +6,7 @@ use std::io::Write;
 use kvcache_simulator::config::*;
 use kvcache_simulator::driver;
 use kvcache_simulator::replay::ReplayEvictPolicy;
 fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config {
    Config {
@@ -36,7 +37,9 @@ fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config {
        },
        cluster: ClusterConfig {
            num_instances: 4,
-            meta_store: MetaStoreConfig { ttl_seconds: 1000.0 },
+            meta_store: MetaStoreConfig {
                ttl_seconds: 1000.0,
            },
            router: RouterConfig {
                mode,
                precise_probe_latency_us: 10.0,
@@ -94,9 +97,11 @@ fn write_synthetic_trace(path: &std::path::Path) {
    }
 }
-fn run(mode: RouterMode, trace_path: &std::path::Path, out_root: &std::path::Path)
+fn run(
-    -> kvcache_simulator::metrics::Summary
+    mode: RouterMode,
-{
+    trace_path: &std::path::Path,
    out_root: &std::path::Path,
 ) -> kvcache_simulator::metrics::Summary {
    let cfg = base_config(
        trace_path.to_str().unwrap(),
        out_root.to_str().unwrap(),
@@ -119,9 +124,8 @@ fn ablation_hit_rate_ordering() {
    let s_ttl = run(RouterMode::TtlAware, &trace_path, &tmp);
    let s_prec = run(RouterMode::Precise, &trace_path, &tmp);
-    let total_hit = |s: &kvcache_simulator::metrics::Summary| {
+    let total_hit =
-        s.hit_rate_l0 + s.hit_rate_l1 + s.hit_rate_remote
+        |s: &kvcache_simulator::metrics::Summary| s.hit_rate_l0 + s.hit_rate_l1 + s.hit_rate_remote;
    };
    let h_rand = total_hit(&s_random);
    let h_ll = total_hit(&s_ll);
@@ -135,23 +139,79 @@ fn ablation_hit_rate_ordering() {
    eprintln!(
        "         remote+local hit ratio L0/L1/remote: \
         random=({:.2},{:.2},{:.2}) precise=({:.2},{:.2},{:.2})",
-        s_random.hit_rate_l0, s_random.hit_rate_l1, s_random.hit_rate_remote,
+        s_random.hit_rate_l0,
-        s_prec.hit_rate_l0, s_prec.hit_rate_l1, s_prec.hit_rate_remote,
+        s_random.hit_rate_l1,
        s_random.hit_rate_remote,
        s_prec.hit_rate_l0,
        s_prec.hit_rate_l1,
        s_prec.hit_rate_remote,
    );
    // ttl_aware and precise should outperform random / least_loaded for
    // a workload built entirely of shared-prefix conversations.
    let eps = 1e-6;
-    assert!(
+    assert!(h_ttl + eps >= h_rand, "ttl_aware should >= random hit rate");
-        h_ttl + eps >= h_rand,
+    assert!(h_prec + eps >= h_rand, "precise should >= random hit rate");
        "ttl_aware should >= random hit rate"
    );
    assert!(
        h_prec + eps >= h_rand,
        "precise should >= random hit rate"
    );
    assert!(
        h_prec + eps >= h_ll,
        "precise should >= least_loaded hit rate"
    );
 }
 #[test]
 fn ablation_lru_preserves_ttft_fields() {
    let tmp = std::env::temp_dir().join("kvcache_sim_replay");
    let _ = std::fs::remove_dir_all(&tmp);
    std::fs::create_dir_all(&tmp).unwrap();
    let trace_path = tmp.join("trace.jsonl");
    write_synthetic_trace(&trace_path);
    let cfg = base_config(
        trace_path.to_str().unwrap(),
        tmp.to_str().unwrap(),
        RouterMode::Random,
    );
    let online = driver::run(&cfg, Some("online_lru")).expect("online lru run");
    let out = driver::ablate_fixed_placement(&cfg, &[RouterMode::Random], &[ReplayEvictPolicy::Lru])
        .expect("ablate lru");
    assert_eq!(out.len(), 1);
    let row = &out[0];
    let online_hit = online.summary.hit_rate_l0 + online.summary.hit_rate_l1 + online.summary.hit_rate_remote;
    let ablate_hit = row.hit_rate_l0 + row.hit_rate_l1 + row.hit_rate_remote;
    assert!(
        (ablate_hit - online_hit).abs() < 1e-9,
        "ablation lru should match online lru hit rate: online={online_hit} ablate={ablate_hit}"
    );
    assert!((row.ttft_mean - online.summary.ttft_mean).abs() < 1e-9);
    assert!((row.ttft_p50 - online.summary.ttft_p50).abs() < 1e-9);
    assert!((row.ttft_p95 - online.summary.ttft_p95).abs() < 1e-9);
    assert!((row.ttft_p99 - online.summary.ttft_p99).abs() < 1e-9);
 }
 #[test]
 fn ablate_rejects_belady_until_exact_algorithm_exists() {
    let tmp = std::env::temp_dir().join("kvcache_sim_ablate_evict");
    let _ = std::fs::remove_dir_all(&tmp);
    std::fs::create_dir_all(&tmp).unwrap();
    let trace_path = tmp.join("trace.jsonl");
    write_synthetic_trace(&trace_path);
    let cfg = base_config(
        trace_path.to_str().unwrap(),
        tmp.to_str().unwrap(),
        RouterMode::Random,
    );
    let err = driver::ablate_fixed_placement(
        &cfg,
        &[RouterMode::Random],
        &[ReplayEvictPolicy::Belady],
    )
    .expect_err("belady should be rejected");
    assert!(
        err.to_string().contains("exact belady"),
        "unexpected error: {err:#}"
    );
 }