From ec73a95e0518cd58a70e6946e11b547301246ada Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Tue, 14 Apr 2026 01:16:02 +0800 Subject: [PATCH] KVCache simulator for LLM serving cluster routing research Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 --- .gitignore | 29 + .gitmodules | 3 + Cargo.lock | 729 ++++++++++++++++++ Cargo.toml | 28 + README.md | 174 +++++ configs/glm5-8xb200-blk512.yaml | 68 ++ configs/glm5-8xb200-hf.yaml | 40 + configs/glm5-8xb200.yaml | 67 ++ configs/qwen2.5-coder-32b-h800.yaml | 42 + configs/qwen2.5-coder-7b-h800.yaml | 42 + configs/qwen2.5-coder-7b-preset.yaml | 36 + configs/qwen3-coder-480b-8xh20.yaml | 29 + models/GLM-5/config.json | 59 ++ .../config.json | 41 + qwen-bailian-usagetraces-anon | 1 + src/cluster/cluster.rs | 167 ++++ src/cluster/meta_store.rs | 161 ++++ src/cluster/mod.rs | 6 + src/config.rs | 510 ++++++++++++ src/driver.rs | 170 ++++ src/hardware_presets.rs | 225 ++++++ src/hf_config.rs | 193 +++++ src/instance/compute.rs | 405 ++++++++++ src/instance/instance.rs | 191 +++++ src/instance/kv_cache.rs | 226 ++++++ src/instance/mod.rs | 6 + src/lib.rs | 13 + src/main.rs | 271 +++++++ src/metrics/mod.rs | 7 + src/metrics/per_request.rs | 42 + src/metrics/routing_log.rs | 29 + src/metrics/summary.rs | 80 ++ src/metrics/timeseries.rs | 34 + src/network.rs | 84 ++ 
src/oracle.rs | 279 +++++++ src/router/cache_load.rs | 89 +++ src/router/cache_score.rs | 111 +++ src/router/estimated_ttft.rs | 128 +++ src/router/least_loaded.rs | 54 ++ src/router/least_tokens.rs | 73 ++ src/router/min_pd.rs | 124 +++ src/router/mod.rs | 80 ++ src/router/precise_aware.rs | 120 +++ src/router/prefix_affinity.rs | 196 +++++ src/router/random.rs | 90 +++ src/router/ttl_aware.rs | 59 ++ src/sim/engine.rs | 113 +++ src/sim/events.rs | 15 + src/sim/mod.rs | 5 + src/trace.rs | 102 +++ src/types.rs | 4 + tests/smoke.rs | 155 ++++ 52 files changed, 6005 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 configs/glm5-8xb200-blk512.yaml create mode 100644 configs/glm5-8xb200-hf.yaml create mode 100644 configs/glm5-8xb200.yaml create mode 100644 configs/qwen2.5-coder-32b-h800.yaml create mode 100644 configs/qwen2.5-coder-7b-h800.yaml create mode 100644 configs/qwen2.5-coder-7b-preset.yaml create mode 100644 configs/qwen3-coder-480b-8xh20.yaml create mode 100644 models/GLM-5/config.json create mode 100644 models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json create mode 160000 qwen-bailian-usagetraces-anon create mode 100644 src/cluster/cluster.rs create mode 100644 src/cluster/meta_store.rs create mode 100644 src/cluster/mod.rs create mode 100644 src/config.rs create mode 100644 src/driver.rs create mode 100644 src/hardware_presets.rs create mode 100644 src/hf_config.rs create mode 100644 src/instance/compute.rs create mode 100644 src/instance/instance.rs create mode 100644 src/instance/kv_cache.rs create mode 100644 src/instance/mod.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/metrics/mod.rs create mode 100644 src/metrics/per_request.rs create mode 100644 src/metrics/routing_log.rs create mode 100644 src/metrics/summary.rs create mode 100644 src/metrics/timeseries.rs create mode 100644 
src/network.rs create mode 100644 src/oracle.rs create mode 100644 src/router/cache_load.rs create mode 100644 src/router/cache_score.rs create mode 100644 src/router/estimated_ttft.rs create mode 100644 src/router/least_loaded.rs create mode 100644 src/router/least_tokens.rs create mode 100644 src/router/min_pd.rs create mode 100644 src/router/mod.rs create mode 100644 src/router/precise_aware.rs create mode 100644 src/router/prefix_affinity.rs create mode 100644 src/router/random.rs create mode 100644 src/router/ttl_aware.rs create mode 100644 src/sim/engine.rs create mode 100644 src/sim/events.rs create mode 100644 src/sim/mod.rs create mode 100644 src/trace.rs create mode 100644 src/types.rs create mode 100644 tests/smoke.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f70b09b --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Trace files +bailian-traces + +# Rust build artifacts +/target/ +**/*.rs.bk + +# Simulation output +/runs/ + +# Editor / IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Profiling / perf +perf.data* +flamegraph.svg +*.prof + +# Temporary test files +/tmp/ +*.log diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..945f71e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "qwen-bailian-usagetraces-anon"] + path = qwen-bailian-usagetraces-anon + url = https://github.com/alibaba-edu/qwen-bailian-usagetraces-anon.git diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..bad079b --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,729 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys 0.59.0", +] + +[[package]] +name = "csv" 
+version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ 
+ "equivalent", + "hashbrown", +] + +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "js-sys" +version = "0.3.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "kvcache-simulator" +version = "0.1.0" +dependencies = [ + "ahash", + "anyhow", + "clap", + "csv", + "indicatif", + "ordered-float", + "rand", + "rand_chacha", + "serde", + "serde_json", + "serde_yaml", + "smallvec", + "thiserror", +] + +[[package]] +name = "libc" +version = "0.2.184" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "ordered-float" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" +dependencies = [ + "num-traits", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.8.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2" +dependencies = [ + "bumpalo", + "proc-macro2", + 
"quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..841e3d1 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "kvcache-simulator" +version = "0.1.0" +edition = "2021" +description = "Discrete-event simulator for cluster-level LLM serving with two-tier KV cache and KV-aware routing ablation." + +[[bin]] +name = "kvcache-sim" +path = "src/main.rs" + +[dependencies] +serde = { version = "1", features = ["derive"] } +serde_json = "1" +serde_yaml = "0.9" +clap = { version = "4", features = ["derive"] } +csv = "1" +ahash = "0.8" +ordered-float = "4" +anyhow = "1" +thiserror = "1" +indicatif = "0.17" +rand = "0.8" +rand_chacha = "0.3" +smallvec = { version = "1", features = ["union"] } + +[profile.release] +lto = "thin" +codegen-units = 1 diff --git a/README.md b/README.md new file mode 100644 index 0000000..1dde7d3 --- /dev/null +++ b/README.md @@ -0,0 +1,174 @@ +# kvcache-simulator + +Discrete-event simulator for cluster-level LLM **prefill** serving with a +two-tier KV cache (GPU HBM + CPU DRAM / v6d) and KV-aware request routing. +Replays real production traces against a synthetic cluster so you can +ablate routing strategies and cache sizing without spinning up any GPUs. + +Assumes **PD (prefill/decode) disaggregation** — only the prefill path is +modeled. + +## Build + +```bash +cargo build --release +# binary: target/release/kvcache-sim +``` + +Fetch the upstream trace (consumed as a git submodule): + +```bash +git submodule update --init --recursive +``` + +## Usage + +### 1. 
Run a single simulation + +```bash +target/release/kvcache-sim run --config configs/qwen2.5-coder-7b-h800.yaml +``` + +Prints `summary.json` to stdout and writes the full output directory +(see [Outputs](#outputs) below). + +### 2. Compare routers on the same trace (ablation) + +```bash +target/release/kvcache-sim ablate \ + --config configs/qwen2.5-coder-7b-h800.yaml \ + --num-instances 64 \ + --output-dir runs/qwen7b_n64 \ + --routers random,least_loaded,ttl_aware,precise +``` + +Writes one subdirectory per router plus a combined +`runs/qwen7b_n64/ablation.json` with side-by-side summaries. + +### 3. Compute theoretical hit-rate ceilings (oracle) + +```bash +# Cluster-aggregate capacity (default) +target/release/kvcache-sim oracle \ + --config configs/qwen2.5-coder-7b-h800.yaml --num-instances 64 + +# A single instance's HBM budget +target/release/kvcache-sim oracle \ + --config configs/qwen2.5-coder-7b-h800.yaml --per-instance + +# Explicit capacity in 16-token blocks +target/release/kvcache-sim oracle \ + --config configs/qwen2.5-coder-7b-h800.yaml --capacity-blocks 200000 +``` + +Reports three numbers: + +- `unlimited.hit_rate` — absolute ceiling (infinite cache) +- `belady_finite.hit_rate` — optimal-eviction ceiling at the given capacity +- `lru_finite.hit_rate` — production LRU at the same capacity + +Gap between `lru_finite` and `belady_finite` = headroom from a smarter +eviction policy. Gap between `belady_finite` and `unlimited` = headroom +only reachable by adding capacity. + +### 4. Validate a config without running + +```bash +target/release/kvcache-sim validate --config configs/qwen2.5-coder-7b-h800.yaml +``` + +Parses the YAML, prints derived per-instance block budgets, and dumps +the first 5 trace records so you can sanity-check the path. 
+ +## CLI overrides + +These flags work on **all** subcommands and override the YAML in place, +so the same config can be reused across sweeps: + +| Flag | Overrides | +|--------------------------|-------------------------------------------| +| `--num-instances ` | `cluster.num_instances` | +| `--max-requests ` | `sim.max_requests` | +| `--trace ` | `sim.trace_path` | +| `--output-dir ` | `sim.output_dir` | +| `--seed ` | `sim.seed` | +| `--precise-topk ` | `cluster.router.precise_probe_topk` | +| `--ttl-seconds ` | `cluster.meta_store.ttl_seconds` | + +`oracle` additionally takes `--capacity-blocks ` / `--per-instance` +and `--out `. `ablate` additionally takes `--routers `. + +## Router modes + +Set `cluster.router.mode` in the YAML or list in `--routers`: + +| Mode | What it does | +|----------------|--------------------------------------------------------------------| +| `random` | Uniform random. Baseline. | +| `round_robin` | Deterministic round-robin. Baseline. | +| `least_loaded` | `argmin(kv_blocks_used + alpha * queue_len)`. KV-blind. | +| `ttl_aware` | Picks instance with longest prefix in the global TTL meta store. | +| `precise` | Probes top-K least-loaded instances' actual caches; charges probe latency into TTFT. | + +Expected hit-rate ordering: `random ≲ least_loaded ≲ ttl_aware ≲ precise`. 
+ +## Outputs + +Each run writes a directory under `sim.output_dir`: + +| File | Contents | +|----------------------|----------------------------------------------------------------------------| +| `summary.json` | Router, throughput, TTFT p50/p95/p99, hit rates per tier, total RDMA/PCIe bytes | +| `per_request.csv` | `req_id,arrival,ttft,e2e,instance,total_blocks,l0_hit,l1_hit,remote_hit,miss,rdma_bytes,pcie_bytes,probe_overhead_s` | +| `instances.csv` | `t,instance,queue_len,kv_blocks_used,kv_blocks_total,busy` per sample | +| `routing_log.jsonl` | One JSON per request: all router candidates + chosen instance + reason | + +For `ablate`: an extra `ablation.json` with one summary per router. +For `oracle`: an `oracle.json` with the three hit-rate analyses. + +### Reading results quickly + +```bash +# Pretty-print the summary +cat runs/qwen7b/summary.json | jq . + +# Compare all routers from an ablation +cat runs/qwen7b_n64/ablation.json | jq '.[] | {router, ttft_p50, hit_rate_l0, total_rdma_bytes}' + +# Hit-rate ceilings vs LRU at the same capacity +cat runs/qwen7b/oracle.json | jq '{unlimited: .unlimited.hit_rate, belady: .belady_finite.hit_rate, lru: .lru_finite.hit_rate}' +``` + +## Config + +A config is a single YAML file with four sections. A working example +lives at +[`configs/qwen2.5-coder-7b-h800.yaml`](configs/qwen2.5-coder-7b-h800.yaml); +copy and edit for other models/hardware. + +```yaml +model: # shape + prefill roofline coefficients +hardware: # per-instance GPU/PCIe/RDMA capabilities + batch knobs +cluster: # num_instances, meta_store TTL, router mode +sim: # trace_path, max_requests, output_dir, seed +``` + +Only prefill-side model coefficients are used; any decode fields in +legacy YAMLs are accepted and ignored. + +## Trace format + +The simulator reads the Alibaba +[`qwen-bailian-usagetraces-anon`](https://github.com/alibaba-edu/qwen-bailian-usagetraces-anon) +JSONL schema. 
Each record has `chat_id`, `timestamp`, `input_length`, +`output_length`, and `hash_ids` (16-token block hashes). Only the +input side is used. + +## Testing + +```bash +cargo test --release +``` + +16 tests: 15 unit + 1 smoke that runs all four routers on a synthetic +shared-prefix trace and asserts the expected hit-rate ordering. diff --git a/configs/glm5-8xb200-blk512.yaml b/configs/glm5-8xb200-blk512.yaml new file mode 100644 index 0000000..cbf1be8 --- /dev/null +++ b/configs/glm5-8xb200-blk512.yaml @@ -0,0 +1,68 @@ +# GLM-5 (zai-org/GLM-5) on 8 x B200 SXM (192GB each). +# Architecture from HuggingFace config.json — all roofline coefficients +# are derived automatically. + +model: + name: glm-5 + # Core architecture (from HF config.json) + num_layers: 78 + hidden_size: 6144 + num_attention_heads: 64 + num_kv_heads: 64 # formalism; MLA overrides KV cache sizing + head_dim: 64 + intermediate_size: 12288 # shared expert FFN width + dtype_bytes: 2 # BF16 + block_size_tokens: 512 # matches bailian-traces blksz_512 + + # MoE: 256 routed + 1 shared, 8 active per token + moe: + num_experts: 256 + num_active_experts: 8 + num_shared_experts: 1 + expert_intermediate_size: 2048 # moe_intermediate_size + + # MLA (Multi-head Latent Attention): compressed KV cache + mla: + kv_lora_rank: 512 + q_lora_rank: 2048 + qk_nope_head_dim: 192 + qk_rope_head_dim: 64 + v_head_dim: 256 + + # DSA (DeepSeek Sparse Attention): sub-quadratic past dense_window + attention: + type: dsa + dense_window: 4096 + sparse_stride: 8 + first_dense_layers: 3 + +hardware: + # Aggregate of 8 x B200 in one tensor-parallel group. + gpu_flops: 1.80e16 # 8 * 2.25 PFLOPS BF16 dense + gpu_mem_bw: 6.40e13 # 8 * 8 TB/s HBM3e + # KV budget after FP8 weights + activations. GLM-5 FP8 ~744GB of 1536GB. 
+ hbm_bytes: 500.0e9 + dram_bytes: 1.5e12 # ~1.5 TB usable CPU DRAM / v6d per node + pcie_bw: 128.0e9 # PCIe Gen6 x16 + pcie_latency_us: 4.0 + rdma_bw: 50.0e9 # ConnectX-7 400 Gbps + rdma_latency_us: 6.0 + max_batch_slots: 256 + prefill_chunk_tokens: 4096 + +cluster: + num_instances: 64 + meta_store: + ttl_seconds: 300.0 + router: + mode: min_pd + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl + max_requests: null + output_dir: runs/glm5_8xb200_blk512 + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/glm5-8xb200-hf.yaml b/configs/glm5-8xb200-hf.yaml new file mode 100644 index 0000000..0cc32d2 --- /dev/null +++ b/configs/glm5-8xb200-hf.yaml @@ -0,0 +1,40 @@ +# GLM-5 using HuggingFace config.json + hardware preset. +# +# This config demonstrates the simplified format: +# model.config_json — loads architecture from HF config.json +# hardware.type — loads GPU specs from built-in preset +# +# Only deployment-specific fields need to be set explicitly. +# Any field from config_json or the preset can be overridden in YAML. + +model: + # Auto-detect architecture: MoE, MLA, DSA, head dims, etc. 
+ config_json: ../models/GLM-5/config.json + name: glm-5 # override HF model_type + dtype_bytes: 1 # FP8 KV cache (not in HF config.json; BF16 would be 2) + block_size_tokens: 512 # matches bailian-traces blksz_512 + +hardware: + type: 8xb200 # 8 x B200 SXM (192GB each) + # Override preset values for this specific deployment: + hbm_bytes: 500.0e9 # KV budget after FP8 weights + activations + dram_bytes: 1.5e12 # ~1.5 TB usable CPU DRAM per node + max_batch_slots: 256 + +cluster: + num_instances: 32 + meta_store: + ttl_seconds: 300.0 + router: + mode: min_pd + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + prefix_k: 8 + +sim: + trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl + max_requests: null + output_dir: runs/glm5_8xb200_hf + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/glm5-8xb200.yaml b/configs/glm5-8xb200.yaml new file mode 100644 index 0000000..0e3c542 --- /dev/null +++ b/configs/glm5-8xb200.yaml @@ -0,0 +1,67 @@ +# GLM-5 (zai-org/GLM-5) served as a single tensor-parallel instance on +# 8 x NVIDIA B200 SXM (192GB each, 1.5 TB aggregate HBM). +# +# GLM-5 is a 744B-total / 40B-active Mixture-of-Experts model (BF16), +# using DeepSeek Sparse Attention (DSA). The HF card does not publish +# layer/head shapes, so the values below are reasonable estimates based +# on the GLM-4.5 lineage; adjust once the official config.json is public. +# +# Hardware values below represent the *aggregate* of the 8-GPU TP group +# (one simulated "instance" == one 8xB200 serving replica). This is how +# the roofline in src/instance/compute.rs wants to see it: gpu_flops and +# gpu_mem_bw are the effective peaks seen by the TP'd model. +# +# Calibrate `flops_per_token_prefill` and `attn_quadratic_coeff` against +# measured prefill latency before trusting absolute TTFT numbers. 
+ +model: + name: glm-5 + # --- estimates; refine from official config.json when available --- + num_layers: 92 + num_kv_heads: 8 # GQA + head_dim: 128 + dtype_bytes: 2 # BF16 + block_size_tokens: 16 # trace convention + # Active-params-driven roofline: MoE activates ~40B params per token, + # so non-attention prefill FLOPs/token ≈ 2 * 40e9 = 8e10. + flops_per_token_prefill: 8.0e10 + # Quadratic attention term ≈ 2 * num_heads * head_dim. GLM-5 uses + # DeepSeek Sparse Attention which is sub-quadratic in practice, so + # this coefficient is an upper bound — lower it if your measurements + # show DSA kicking in for long prompts. + attn_quadratic_coeff: 2048.0 + bytes_per_token_prefill: 0.0 + +hardware: + # Aggregate of 8 x B200 in one tensor-parallel group. + gpu_flops: 1.80e16 # 8 * 2.25 PFLOPS BF16 dense + gpu_mem_bw: 6.40e13 # 8 * 8 TB/s HBM3e + # KV-cache budget after weights + activations. GLM-5 @ BF16 is ~1.49TB, + # which barely fits in 1.5TB HBM; realistic serving uses FP8 weights + # (~744GB), leaving ~500GB for activations + KV cache. Adjust if your + # deployment uses a different weight dtype. 
+ hbm_bytes: 500.0e9 + dram_bytes: 1.5e12 # ~1.5 TB usable CPU DRAM / v6d per node + pcie_bw: 128.0e9 # PCIe Gen6 x16 ~ 128 GB/s per direction + pcie_latency_us: 4.0 + rdma_bw: 50.0e9 # ConnectX-7 400 Gbps ≈ 50 GB/s + rdma_latency_us: 6.0 + max_batch_slots: 256 + prefill_chunk_tokens: 2048 + +cluster: + num_instances: 8 # 8 TP replicas -> 64 B200s cluster-wide + meta_store: + ttl_seconds: 120.0 + router: + mode: ttl_aware + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/glm5_8xb200 + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/qwen2.5-coder-32b-h800.yaml b/configs/qwen2.5-coder-32b-h800.yaml new file mode 100644 index 0000000..cc66942 --- /dev/null +++ b/configs/qwen2.5-coder-32b-h800.yaml @@ -0,0 +1,42 @@ +# Qwen2.5-Coder-32B (dense, GQA) on H800 SXM (80GB). +# Architecture from HuggingFace config.json — roofline auto-derived. + +model: + name: qwen2.5-coder-32b + num_layers: 64 + hidden_size: 5120 + num_attention_heads: 40 + num_kv_heads: 8 # GQA + head_dim: 128 + intermediate_size: 27648 # SwiGLU FFN + dtype_bytes: 2 # BF16 + block_size_tokens: 16 + +hardware: + gpu_flops: 9.89e14 + gpu_mem_bw: 3.35e12 + hbm_bytes: 20.0e9 # smaller budget: 32B weights are large + dram_bytes: 512.0e9 + pcie_bw: 64.0e9 + pcie_latency_us: 5.0 + rdma_bw: 25.0e9 + rdma_latency_us: 8.0 + max_batch_slots: 128 + prefill_chunk_tokens: 1024 + +cluster: + num_instances: 16 + meta_store: + ttl_seconds: 60.0 + router: + mode: ttl_aware + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: traces/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/qwen32b + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/qwen2.5-coder-7b-h800.yaml b/configs/qwen2.5-coder-7b-h800.yaml new file mode 100644 index 0000000..bea5226 --- /dev/null +++ b/configs/qwen2.5-coder-7b-h800.yaml @@ -0,0 +1,42 
@@ +# Qwen2.5-Coder-7B (dense, GQA) on a single H800 SXM (80GB). +# Architecture from HuggingFace config.json — roofline auto-derived. + +model: + name: qwen2.5-coder-7b + num_layers: 28 + hidden_size: 3584 + num_attention_heads: 28 + num_kv_heads: 4 # GQA: 28 query heads, 4 KV heads + head_dim: 128 + intermediate_size: 18944 # SwiGLU FFN + dtype_bytes: 2 # BF16 + block_size_tokens: 16 # matches qwen_coder_blksz_16 trace + +hardware: + gpu_flops: 9.89e14 # H800 bf16 dense + gpu_mem_bw: 3.35e12 # 3.35 TB/s HBM3 + hbm_bytes: 60.0e9 # leave headroom for weights/activations + dram_bytes: 512.0e9 + pcie_bw: 64.0e9 # PCIe Gen5 x16 + pcie_latency_us: 5.0 + rdma_bw: 25.0e9 # ~200 Gbps NIC + rdma_latency_us: 8.0 + max_batch_slots: 256 + prefill_chunk_tokens: 2048 + +cluster: + num_instances: 16 + meta_store: + ttl_seconds: 60.0 + router: + mode: ttl_aware + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/qwen7b + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/qwen2.5-coder-7b-preset.yaml b/configs/qwen2.5-coder-7b-preset.yaml new file mode 100644 index 0000000..a411662 --- /dev/null +++ b/configs/qwen2.5-coder-7b-preset.yaml @@ -0,0 +1,36 @@ +# Qwen2.5-Coder-7B using hardware preset. +# +# Model architecture is specified inline (no config.json needed for simple +# models). Hardware uses preset "h800" with a single override for hbm_bytes. 
+ +model: + name: qwen2.5-coder-7b + num_layers: 28 + hidden_size: 3584 + num_attention_heads: 28 + num_kv_heads: 4 + head_dim: 128 + intermediate_size: 18944 + dtype_bytes: 2 + block_size_tokens: 16 + +hardware: + type: h800 # single H800 SXM (80GB) + hbm_bytes: 60.0e9 # KV budget after 7B model weights + +cluster: + num_instances: 16 + meta_store: + ttl_seconds: 60.0 + router: + mode: ttl_aware + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/qwen7b_preset + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/qwen3-coder-480b-8xh20.yaml b/configs/qwen3-coder-480b-8xh20.yaml new file mode 100644 index 0000000..a56e16e --- /dev/null +++ b/configs/qwen3-coder-480b-8xh20.yaml @@ -0,0 +1,29 @@ +# Qwen3-Coder-480B-A35B (MoE, GQA) on 8 x H20 (96GB each). +# Architecture auto-loaded from HuggingFace config.json. + +model: + config_json: ../models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json + name: qwen3-coder-480b + dtype_bytes: 1 # FP8 inference + block_size_tokens: 16 + +hardware: + type: 8xh20 + hbm_bytes: 400.0e9 # KV budget after FP8 weights on 8x96GB + +cluster: + num_instances: 32 + meta_store: + ttl_seconds: 120.0 + router: + mode: min_pd + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: traces/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/qwen3_coder_8xh20 + sample_interval_s: 1.0 + seed: 42 diff --git a/models/GLM-5/config.json b/models/GLM-5/config.json new file mode 100644 index 0000000..a34ad26 --- /dev/null +++ b/models/GLM-5/config.json @@ -0,0 +1,59 @@ +{ + "architectures": [ + "GlmMoeDsaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "ep_size": 1, + "first_k_dense_replace": 3, + "hidden_act": "silu", + "head_dim": 64, + "hidden_size": 
6144, + "index_head_dim": 128, + "index_n_heads": 32, + "index_topk": 2048, + "indexer_rope_interleave": true, + "initializer_range": 0.02, + "intermediate_size": 12288, + "kv_lora_rank": 512, + "max_position_embeddings": 202752, + "moe_intermediate_size": 2048, + "moe_layer_freq": 1, + "model_type": "glm_moe_dsa", + "n_group": 1, + "n_routed_experts": 256, + "n_shared_experts": 1, + "norm_topk_prob": true, + "num_attention_heads": 64, + "num_experts_per_tok": 8, + "num_hidden_layers": 78, + "num_key_value_heads": 64, + "num_nextn_predict_layers": 1, + "pad_token_id": 154820, + "pretraining_tp": 1, + "q_lora_rank": 2048, + "qk_head_dim": 256, + "qk_nope_head_dim": 192, + "qk_rope_head_dim": 64, + "rms_norm_eps": 1e-05, + "rope_interleave": true, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "routed_scaling_factor": 2.5, + "scoring_func": "sigmoid", + "tie_word_embeddings": false, + "topk_group": 1, + "topk_method": "noaux_tc", + "transformers_version": "5.0.2.dev0", + "use_cache": true, + "v_head_dim": 256, + "vocab_size": 154880 +} diff --git a/models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json b/models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json new file mode 100644 index 0000000..b06dbe1 --- /dev/null +++ b/models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json @@ -0,0 +1,41 @@ +{ + "architectures": [ + "Qwen3MoeForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "decoder_sparse_step": 1, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 262144, + "max_window_layers": 28, + "mlp_only_layers": [], + "model_type": "qwen3_moe", + "moe_intermediate_size": 2560, + "norm_topk_prob": true, + "num_attention_heads": 96, + "num_experts": 160, + "num_experts_per_tok": 8, + "num_hidden_layers": 62, + "num_key_value_heads": 8, + "output_router_logits": false, + 
"qkv_bias": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "router_aux_loss_coef": 0.0, + "shared_expert_intermediate_size": 0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.4", + "use_cache": true, + "use_qk_norm": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/qwen-bailian-usagetraces-anon b/qwen-bailian-usagetraces-anon new file mode 160000 index 0000000..27cfe19 --- /dev/null +++ b/qwen-bailian-usagetraces-anon @@ -0,0 +1 @@ +Subproject commit 27cfe19920eea8c4debb2b915758b678ede7b861 diff --git a/src/cluster/cluster.rs b/src/cluster/cluster.rs new file mode 100644 index 0000000..233f947 --- /dev/null +++ b/src/cluster/cluster.rs @@ -0,0 +1,167 @@ +//! Cluster: routes arrivals, performs the L0 / L1 / remote-RDMA fetch chain +//! described in the design diagram, and bookkeeps the global meta store. + +use crate::cluster::meta_store::MetaStore; +use crate::config::{Config, ModelConfig}; +use crate::instance::instance::AdmittedRequest; +use crate::instance::Instance; +use crate::router::{self, RouteDecision, Router}; +use crate::trace::RequestRecord; +use crate::types::InstanceId; + +#[derive(Debug, Clone)] +pub struct AdmissionStats { + pub instance: InstanceId, + pub l0_hit_blocks: u32, + pub l1_hit_blocks: u32, + pub remote_hit_blocks: u32, + pub miss_blocks: u32, + pub rdma_bytes: u64, + pub pcie_bytes: u64, + pub fetch_time_s: f64, + pub probe_overhead_s: f64, + pub ready_at: f64, + pub decision: RouteDecision, +} + +pub struct Cluster { + pub instances: Vec, + pub meta_store: MetaStore, + pub router: Box, + pub block_size_tokens: u32, + pub kv_block_bytes: u64, +} + +impl Cluster { + pub fn new(config: &Config, model: &ModelConfig) -> Self { + let mut instances = Vec::with_capacity(config.cluster.num_instances as usize); + for id in 0..config.cluster.num_instances { + instances.push(Instance::new(id as InstanceId, model, 
&config.hardware)); + } + let meta_store = MetaStore::new(config.cluster.meta_store.ttl_seconds); + let router = router::build(config, config.sim.seed); + Self { + instances, + meta_store, + router, + block_size_tokens: model.block_size_tokens, + kv_block_bytes: model.kv_block_bytes(), + } + } + + /// Route + admit a request. Returns the chosen instance plus rich + /// per-request stats for metrics. Does NOT schedule the BatchTick — the + /// simulator driver does that based on the returned `ready_at`. + pub fn route_and_admit(&mut self, req: &RequestRecord, now: f64) -> AdmissionStats { + let decision = self.router.route(req, &self.instances, &self.meta_store, now); + let inst_id = decision.chosen; + let probe_overhead_s = decision.probe_overhead_s; + + // The router probe overhead delays the request's effective start time. + let effective_now = now + probe_overhead_s; + + let inst = &mut self.instances[inst_id as usize]; + let total_blocks = req.hash_ids.len() as u32; + + // 1. L0 lookup (touches matched blocks). + let l0_hits = inst.cache.l0.longest_prefix(&req.hash_ids) as u32; + + // 2. L1 lookup on the remaining suffix. + let suffix_after_l0 = &req.hash_ids[l0_hits as usize..]; + let l1_hits = inst.cache.l1.longest_prefix(suffix_after_l0) as u32; + // L1->L0 transfer cost + let l1_bytes = (l1_hits as u64) * self.kv_block_bytes; + let mut t = effective_now; + if l1_hits > 0 { + t = inst.links.pcie.reserve(t, l1_bytes); + // Promote those blocks into L0 + let mut evicted = Vec::new(); + inst.cache.l0.insert_blocks( + &suffix_after_l0[..l1_hits as usize], + &mut evicted, + ); + } + + // 3. Remote v6d lookup for the still-remaining suffix. + let suffix_after_l1 = &suffix_after_l0[l1_hits as usize..]; + let mut remote_hit_blocks: u32 = 0; + for &h in suffix_after_l1 { + // A block is remotely available iff some instance other than + // `inst_id` lists it (and not expired). 
+ let owners = self.meta_store.instances_for(h, now); + let any_remote = owners.iter().any(|o| *o != inst_id); + if any_remote { + remote_hit_blocks += 1; + } else { + break; // contiguous prefix - stop on first miss + } + } + let remote_bytes = (remote_hit_blocks as u64) * self.kv_block_bytes; + if remote_hit_blocks > 0 { + // RDMA from peer host -> local DRAM, then PCIe -> GPU + let inst = &mut self.instances[inst_id as usize]; + t = inst.links.rdma.reserve(t, remote_bytes); + t = inst.links.pcie.reserve(t, remote_bytes); + // Insert into local L1 (occupies LRU space) AND into L0 + let pulled = &suffix_after_l1[..remote_hit_blocks as usize]; + let mut evicted_l1 = Vec::new(); + inst.cache.l1.insert_blocks(pulled, &mut evicted_l1); + let mut evicted_l0 = Vec::new(); + inst.cache.l0.insert_blocks(pulled, &mut evicted_l0); + // The local instance now also owns these blocks - update meta_store. + for &h in pulled { + self.meta_store.insert(h, inst_id, now); + } + } + + // 4. Miss = remaining tokens to prefill from scratch. + let miss_blocks = total_blocks - l0_hits - l1_hits - remote_hit_blocks; + let miss_tokens = miss_blocks * self.block_size_tokens; + + // The newly-prefilled blocks (after the request runs) are inserted + // into L0 here, and into L1 / meta_store via async writeback. Doing + // this at admission time is OK because we're tracking presence, not + // actually moving bytes — the writeback latency is hidden behind + // request execution and we don't model meta_store inconsistency + // window beyond the TTL itself. + let inst = &mut self.instances[inst_id as usize]; + let new_input_blocks = &req.hash_ids[(l0_hits + l1_hits + remote_hit_blocks) as usize..]; + let mut evicted_l0 = Vec::new(); + inst.cache.l0.insert_blocks(new_input_blocks, &mut evicted_l0); + let mut evicted_l1 = Vec::new(); + inst.cache.l1.insert_blocks(new_input_blocks, &mut evicted_l1); + for &h in new_input_blocks { + self.meta_store.insert(h, inst_id, now); + } + + // 5. 
Reserve KV slots for this request's prefill residency. + // PD disaggregation: decode runs elsewhere, so only the input + // blocks occupy HBM on this instance. + let reserved_blocks = total_blocks; + let admitted = AdmittedRequest { + req_id: req.req_id, + arrival: req.arrival, + ready_at: t, + prefill_tokens_remaining: miss_tokens, + reserved_blocks, + }; + inst.admit(admitted); + + let pcie_bytes = l1_bytes + remote_bytes; + let fetch_time_s = (t - effective_now).max(0.0); + + AdmissionStats { + instance: inst_id, + l0_hit_blocks: l0_hits, + l1_hit_blocks: l1_hits, + remote_hit_blocks, + miss_blocks, + rdma_bytes: remote_bytes, + pcie_bytes, + fetch_time_s, + probe_overhead_s, + ready_at: t, + decision, + } + } +} diff --git a/src/cluster/meta_store.rs b/src/cluster/meta_store.rs new file mode 100644 index 0000000..2fe3b94 --- /dev/null +++ b/src/cluster/meta_store.rs @@ -0,0 +1,161 @@ +//! Global redis-like KV-cache index. +//! +//! Maps `block_hash -> SmallVec<(instance_id, expires_at)>`. TTL eviction is +//! lazy (on read). The TTL-aware router uses `score_prefix` to score each +//! instance's predicted longest prefix without probing instances directly. + +use ahash::AHashMap; +use smallvec::SmallVec; + +use crate::types::InstanceId; + +#[derive(Debug, Clone, Copy)] +struct Entry { + instance: InstanceId, + expires_at: f64, +} + +#[derive(Debug, Default)] +pub struct MetaStore { + ttl_seconds: f64, + map: AHashMap>, +} + +impl MetaStore { + pub fn new(ttl_seconds: f64) -> Self { + Self { + ttl_seconds, + map: AHashMap::with_capacity(1 << 16), + } + } + + pub fn ttl(&self) -> f64 { + self.ttl_seconds + } + + /// Record that `instance` now holds `block_hash`. 
+ pub fn insert(&mut self, block_hash: u64, instance: InstanceId, now: f64) { + let entry = Entry { + instance, + expires_at: now + self.ttl_seconds, + }; + let bucket = self.map.entry(block_hash).or_default(); + // refresh existing entry if present + for e in bucket.iter_mut() { + if e.instance == instance { + e.expires_at = entry.expires_at; + return; + } + } + bucket.push(entry); + } + + /// Score each candidate instance by the longest leading prefix of + /// `hash_ids` for which the meta store believes that instance still holds + /// every block. Returns scores indexed by instance id. + pub fn score_prefix(&self, hash_ids: &[u64], now: f64, num_instances: usize) -> Vec { + if hash_ids.is_empty() { + return vec![0; num_instances]; + } + // Walk hashes; at each step intersect the still-eligible instance set. + // Use a small bitset since num_instances is typically <= 1024. + let mut alive: Vec = vec![false; num_instances]; + // First block: seed alive set + let first = hash_ids[0]; + let mut any = false; + if let Some(bucket) = self.map.get(&first) { + for e in bucket { + if e.expires_at >= now { + let i = e.instance as usize; + if i < num_instances { + alive[i] = true; + any = true; + } + } + } + } + let mut scores = vec![0u32; num_instances]; + if !any { + return scores; + } + for i in 0..num_instances { + if alive[i] { + scores[i] = 1; + } + } + // Subsequent blocks: an instance survives only if the meta store still + // lists it for that block (and not expired). 
+ for (depth, &h) in hash_ids.iter().enumerate().skip(1) { + let bucket = match self.map.get(&h) { + Some(b) => b, + None => break, + }; + // mark instances present for this block + let mut present = vec![false; num_instances]; + let mut any2 = false; + for e in bucket { + if e.expires_at >= now { + let i = e.instance as usize; + if i < num_instances && alive[i] { + present[i] = true; + any2 = true; + } + } + } + if !any2 { + break; + } + for i in 0..num_instances { + if present[i] { + scores[i] = (depth + 1) as u32; + } else { + alive[i] = false; + } + } + } + scores + } + + /// Lookup which (alive) instances claim to hold a given block. + pub fn instances_for(&self, hash: u64, now: f64) -> SmallVec<[InstanceId; 4]> { + let mut out = SmallVec::new(); + if let Some(bucket) = self.map.get(&hash) { + for e in bucket { + if e.expires_at >= now { + out.push(e.instance); + } + } + } + out + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn score_prefix_basic() { + let mut m = MetaStore::new(60.0); + m.insert(10, 0, 0.0); + m.insert(11, 0, 0.0); + m.insert(12, 0, 0.0); + m.insert(10, 1, 0.0); + m.insert(11, 1, 0.0); + // instance 1 only has 10,11; instance 0 has 10,11,12 + let s = m.score_prefix(&[10, 11, 12, 13], 1.0, 4); + assert_eq!(s[0], 3); + assert_eq!(s[1], 2); + assert_eq!(s[2], 0); + } + + #[test] + fn ttl_expiry() { + let mut m = MetaStore::new(1.0); + m.insert(10, 0, 0.0); + let s_now = m.score_prefix(&[10], 0.5, 2); + assert_eq!(s_now[0], 1); + let s_later = m.score_prefix(&[10], 5.0, 2); + assert_eq!(s_later[0], 0); + } +} diff --git a/src/cluster/mod.rs b/src/cluster/mod.rs new file mode 100644 index 0000000..b6001dd --- /dev/null +++ b/src/cluster/mod.rs @@ -0,0 +1,6 @@ +pub mod meta_store; +#[allow(clippy::module_inception)] +pub mod cluster; + +pub use cluster::Cluster; +pub use meta_store::MetaStore; diff --git a/src/config.rs b/src/config.rs new file mode 100644 index 0000000..655ca94 --- /dev/null +++ b/src/config.rs @@ -0,0 +1,510 @@ 
+//! Top-level configuration loaded from YAML. +//! +//! Two config styles are supported: +//! +//! **Architecture-derived** (preferred): set `hidden_size`, `num_attention_heads`, +//! `intermediate_size` and the simulator derives all roofline coefficients, KV +//! block sizes, and weight-stream costs from the model architecture. Supports +//! MoE, MLA (Multi-head Latent Attention), and DSA (DeepSeek Sparse Attention). +//! +//! **Legacy manual**: omit the architecture fields and set +//! `flops_per_token_prefill` + `attn_quadratic_coeff` directly. Backward +//! compatible with older YAML configs. + +use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use std::path::Path; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Config { + pub model: ModelConfig, + pub hardware: HardwareConfig, + pub cluster: ClusterConfig, + pub sim: SimConfig, +} + +// --------------------------------------------------------------------------- +// Model +// --------------------------------------------------------------------------- + +/// Model architecture + roofline coefficients. +/// +/// If `hidden_size` is present the compute model is derived from architecture; +/// otherwise the legacy `flops_per_token_prefill` / `attn_quadratic_coeff` +/// fields are used directly. 
+#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct ModelConfig { + #[serde(default)] + pub name: String, + pub num_layers: u32, + pub num_kv_heads: u32, + pub head_dim: u32, + pub dtype_bytes: u32, + pub block_size_tokens: u32, + + // -- Architecture fields (enable auto-derivation when all three present) -- + #[serde(default)] + pub hidden_size: Option, + #[serde(default)] + pub num_attention_heads: Option, + #[serde(default)] + pub intermediate_size: Option, + + #[serde(default)] + pub moe: Option, + #[serde(default)] + pub mla: Option, + #[serde(default)] + pub attention: Option, + + // -- Legacy manual coefficients (used when hidden_size is absent) --------- + #[serde(default)] + pub flops_per_token_prefill: Option, + #[serde(default)] + pub attn_quadratic_coeff: Option, + #[serde(default)] + pub bytes_per_token_prefill: Option, + + #[serde(default, skip_serializing)] + #[allow(dead_code)] + pub flops_per_token_decode: Option, + #[serde(default, skip_serializing)] + #[allow(dead_code)] + pub bytes_per_token_decode: Option, +} + +/// Whether the config is architecture-derived or uses legacy manual knobs. +impl ModelConfig { + pub fn is_arch_mode(&self) -> bool { + self.hidden_size.is_some() + } + + /// Bytes of KV cache per block. 
+ /// + /// For standard / GQA: `2 * L * kv_heads * head_dim * dtype * block_tokens` + /// For MLA: `L * (kv_lora_rank + qk_rope_head_dim) * dtype * block_tokens` + pub fn kv_block_bytes(&self) -> u64 { + if let Some(mla) = &self.mla { + self.num_layers as u64 + * (mla.kv_lora_rank + mla.qk_rope_head_dim) as u64 + * self.dtype_bytes as u64 + * self.block_size_tokens as u64 + } else { + 2u64 + * self.num_layers as u64 + * self.num_kv_heads as u64 + * self.head_dim as u64 + * self.dtype_bytes as u64 + * self.block_size_tokens as u64 + } + } +} + +// -- Sub-configs for MoE / MLA / Attention ----------------------------------- + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MoeConfig { + pub num_experts: u32, + pub num_active_experts: u32, + #[serde(default)] + pub num_shared_experts: u32, + /// Per-expert FFN intermediate size (`moe_intermediate_size` in HF). + /// Falls back to parent `intermediate_size` if absent. + #[serde(default)] + pub expert_intermediate_size: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MlaConfig { + pub kv_lora_rank: u32, + pub q_lora_rank: u32, + pub qk_nope_head_dim: u32, + pub qk_rope_head_dim: u32, + pub v_head_dim: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum AttentionConfig { + Dense, + SlidingWindow { + window_size: u32, + }, + Dsa { + /// Tokens within this window attend fully. + dense_window: u32, + /// Beyond the window, attend to every `sparse_stride`-th token. + sparse_stride: u32, + /// Number of initial layers that use dense attention regardless. 
+ #[serde(default)] + first_dense_layers: u32, + }, +} + +// --------------------------------------------------------------------------- +// Hardware +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HardwareConfig { + pub gpu_flops: f64, + pub gpu_mem_bw: f64, + pub hbm_bytes: f64, + pub dram_bytes: f64, + pub pcie_bw: f64, + pub pcie_latency_us: f64, + pub rdma_bw: f64, + pub rdma_latency_us: f64, + #[serde(default = "default_max_batch_slots")] + pub max_batch_slots: u32, + #[serde(default = "default_prefill_chunk_tokens")] + pub prefill_chunk_tokens: u32, +} + +fn default_max_batch_slots() -> u32 { + 256 +} +fn default_prefill_chunk_tokens() -> u32 { + 2048 +} + +// --------------------------------------------------------------------------- +// Cluster +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ClusterConfig { + pub num_instances: u32, + pub meta_store: MetaStoreConfig, + pub router: RouterConfig, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetaStoreConfig { + pub ttl_seconds: f64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RouterConfig { + pub mode: RouterMode, + #[serde(default = "default_probe_latency_us")] + pub precise_probe_latency_us: f64, + #[serde(default = "default_probe_topk")] + pub precise_probe_topk: u32, + #[serde(default = "default_load_alpha")] + pub load_alpha: f64, + /// Weight for load (queue_len) in cache_score: `2^(α·load + β·miss)`. + #[serde(default = "default_score_alpha")] + pub score_alpha: f64, + /// Weight for cache miss in cache_score: `2^(α·load + β·miss)`. + #[serde(default = "default_score_beta")] + pub score_beta: f64, + /// Number of leading blocks for prefix fingerprint in prefix_affinity. 
+ #[serde(default = "default_prefix_k")] + pub prefix_k: usize, + /// Number of top-affinity instances to consider in prefix_affinity. + /// 0 means auto (n/8, min 2). + #[serde(default)] + pub affinity_fan_out: usize, +} + +fn default_probe_latency_us() -> f64 { + 50.0 +} +fn default_probe_topk() -> u32 { + 4 +} +fn default_load_alpha() -> f64 { + 1.0 +} +fn default_score_alpha() -> f64 { + 1.0 +} +fn default_score_beta() -> f64 { + 0.1 +} +fn default_prefix_k() -> usize { + 8 +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RouterMode { + Random, + RoundRobin, + LeastLoaded, + LeastTokens, + TtlAware, + Precise, + MinPd, + CacheLoad, + CacheScore, + EstimatedTtft, + PrefixAffinity, +} + +impl RouterMode { + pub fn parse(s: &str) -> Result { + match s { + "random" => Ok(Self::Random), + "round_robin" | "rr" => Ok(Self::RoundRobin), + "least_loaded" => Ok(Self::LeastLoaded), + "least_tokens" | "lt" => Ok(Self::LeastTokens), + "ttl_aware" | "ttl" => Ok(Self::TtlAware), + "precise" | "precise_aware" => Ok(Self::Precise), + "min_pd" | "minpd" | "pd" => Ok(Self::MinPd), + "cache_load" | "cl" => Ok(Self::CacheLoad), + "cache_score" | "cs" => Ok(Self::CacheScore), + "estimated_ttft" | "ettft" | "optimal" => Ok(Self::EstimatedTtft), + "prefix_affinity" | "affinity" | "pa" => Ok(Self::PrefixAffinity), + other => Err(anyhow::anyhow!("unknown router mode: {other}")), + } + } + + pub fn as_str(&self) -> &'static str { + match self { + Self::Random => "random", + Self::RoundRobin => "round_robin", + Self::LeastLoaded => "least_loaded", + Self::LeastTokens => "least_tokens", + Self::TtlAware => "ttl_aware", + Self::Precise => "precise", + Self::MinPd => "min_pd", + Self::CacheLoad => "cache_load", + Self::CacheScore => "cache_score", + Self::EstimatedTtft => "estimated_ttft", + Self::PrefixAffinity => "prefix_affinity", + } + } +} + +// 
---------------------------------------------------------------------------
// Sim
// ---------------------------------------------------------------------------

/// Simulation-run parameters (trace input, output location, sampling).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimConfig {
    // Path to the request trace to replay.
    pub trace_path: String,
    // Optional cap on how many trace records are consumed.
    // NOTE(review): integer width assumed u64 — confirm against trace.rs.
    #[serde(default)]
    pub max_requests: Option<u64>,
    // Directory where per-request/timeseries/summary outputs are written.
    pub output_dir: String,
    // Period (s) of the instance-timeseries sampler; values <= 0 disable it.
    #[serde(default = "default_sample_interval")]
    pub sample_interval_s: f64,
    // RNG seed (defaults to 0 via serde default).
    #[serde(default)]
    pub seed: u64,
}

fn default_sample_interval() -> f64 {
    1.0
}

impl Config {
    /// Load from a YAML file, resolving `config_json` (HF model config) and
    /// hardware `type` (preset) references if present.
    ///
    /// Relative `config_json` paths are resolved against the YAML file's
    /// own directory, not the process CWD.
    pub fn from_yaml_path<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path = path.as_ref();
        let raw_str = std::fs::read_to_string(path)
            .with_context(|| format!("reading config {}", path.display()))?;
        let raw: RawConfig = serde_yaml::from_str(&raw_str)
            .with_context(|| format!("parsing config {}", path.display()))?;
        let yaml_dir = path.parent().unwrap_or(Path::new("."));
        raw.resolve(yaml_dir)
            .with_context(|| format!("resolving config {}", path.display()))
    }
}

// ---------------------------------------------------------------------------
// Raw deserialization types — flexible YAML loading
// ---------------------------------------------------------------------------
//
// All model/hardware fields are `Option` so that `config_json` and `type`
// can supply base values, with explicit YAML fields acting as overrides.
// Existing YAML configs (no config_json / type) continue to work unchanged.

#[derive(Deserialize)]
struct RawConfig {
    model: RawModelConfig,
    hardware: RawHardwareConfig,
    cluster: ClusterConfig,
    sim: SimConfig,
}

// Mirror of ModelConfig with every field optional; see module comment above.
#[derive(Deserialize)]
struct RawModelConfig {
    /// Path to a HuggingFace `config.json`. Resolved relative to the YAML
    /// file's directory. When present, architecture fields are loaded from
    /// the JSON and any explicit YAML fields act as overrides.
    #[serde(default)]
    config_json: Option<String>,

    #[serde(default)]
    name: Option<String>,
    #[serde(default)]
    num_layers: Option<u32>,
    #[serde(default)]
    num_kv_heads: Option<u32>,
    #[serde(default)]
    head_dim: Option<u32>,
    #[serde(default)]
    dtype_bytes: Option<u32>,
    #[serde(default)]
    block_size_tokens: Option<u32>,
    #[serde(default)]
    hidden_size: Option<u32>,
    #[serde(default)]
    num_attention_heads: Option<u32>,
    #[serde(default)]
    intermediate_size: Option<u32>,
    #[serde(default)]
    moe: Option<MoeConfig>,
    #[serde(default)]
    mla: Option<MlaConfig>,
    #[serde(default)]
    attention: Option<AttentionConfig>,
    #[serde(default)]
    flops_per_token_prefill: Option<f64>,
    #[serde(default)]
    attn_quadratic_coeff: Option<f64>,
    #[serde(default)]
    bytes_per_token_prefill: Option<f64>,
    #[serde(default)]
    flops_per_token_decode: Option<f64>,
    #[serde(default)]
    bytes_per_token_decode: Option<f64>,
}

// Mirror of HardwareConfig with every field optional; see module comment.
#[derive(Deserialize)]
struct RawHardwareConfig {
    /// Hardware preset name (e.g. `"h100"`, `"8xb200"`). When present,
    /// specs are loaded from the built-in preset database and any explicit
    /// YAML fields override individual values.
    #[serde(default, rename = "type")]
    hw_type: Option<String>,

    #[serde(default)]
    gpu_flops: Option<f64>,
    #[serde(default)]
    gpu_mem_bw: Option<f64>,
    #[serde(default)]
    hbm_bytes: Option<f64>,
    #[serde(default)]
    dram_bytes: Option<f64>,
    #[serde(default)]
    pcie_bw: Option<f64>,
    #[serde(default)]
    pcie_latency_us: Option<f64>,
    #[serde(default)]
    rdma_bw: Option<f64>,
    #[serde(default)]
    rdma_latency_us: Option<f64>,
    #[serde(default)]
    max_batch_slots: Option<u32>,
    #[serde(default)]
    prefill_chunk_tokens: Option<u32>,
}

// -- Resolution (merge base + YAML overrides → final Config) ------------------

impl RawConfig {
    fn resolve(self, yaml_dir: &Path) -> Result<Config> {
        Ok(Config {
            model: self.model.resolve(yaml_dir)?,
            hardware: self.hardware.resolve()?,
            cluster: self.cluster,
            sim: self.sim,
        })
    }
}

impl RawModelConfig {
    // Merge order: HF config.json (if any) supplies the base, then each
    // explicitly-set YAML field overwrites it, then deployment-only fields
    // are validated (they can never come from config.json).
    fn resolve(self, yaml_dir: &Path) -> Result<ModelConfig> {
        // Start from HF config.json if specified, else empty default.
        let mut m = if let Some(ref cj) = self.config_json {
            let cj_path = if Path::new(cj).is_absolute() {
                std::path::PathBuf::from(cj)
            } else {
                yaml_dir.join(cj)
            };
            crate::hf_config::parse(&cj_path)?
        } else {
            ModelConfig::default()
        };

        // Overlay: explicit YAML fields override the base.
        if let Some(v) = self.name { m.name = v; }
        if let Some(v) = self.num_layers { m.num_layers = v; }
        if let Some(v) = self.num_kv_heads { m.num_kv_heads = v; }
        if let Some(v) = self.head_dim { m.head_dim = v; }
        if let Some(v) = self.dtype_bytes { m.dtype_bytes = v; }
        if let Some(v) = self.block_size_tokens { m.block_size_tokens = v; }
        if let Some(v) = self.hidden_size { m.hidden_size = Some(v); }
        if let Some(v) = self.num_attention_heads { m.num_attention_heads = Some(v); }
        if let Some(v) = self.intermediate_size { m.intermediate_size = Some(v); }
        if self.moe.is_some() { m.moe = self.moe; }
        if self.mla.is_some() { m.mla = self.mla; }
        if self.attention.is_some() { m.attention = self.attention; }
        if let Some(v) = self.flops_per_token_prefill { m.flops_per_token_prefill = Some(v); }
        if let Some(v) = self.attn_quadratic_coeff { m.attn_quadratic_coeff = Some(v); }
        if let Some(v) = self.bytes_per_token_prefill { m.bytes_per_token_prefill = Some(v); }
        if let Some(v) = self.flops_per_token_decode { m.flops_per_token_decode = Some(v); }
        if let Some(v) = self.bytes_per_token_decode { m.bytes_per_token_decode = Some(v); }

        // Validate deployment-specific fields that HF config.json never provides.
        anyhow::ensure!(
            m.dtype_bytes > 0,
            "model.dtype_bytes is required (not in HF config.json)"
        );
        anyhow::ensure!(
            m.block_size_tokens > 0,
            "model.block_size_tokens is required (not in HF config.json)"
        );

        Ok(m)
    }
}

impl RawHardwareConfig {
    // Merge order mirrors RawModelConfig::resolve: preset base (if `type`
    // given), then per-field YAML overrides, then minimum validation.
    fn resolve(self) -> Result<HardwareConfig> {
        // Start from preset if specified, else zeros (all must come from YAML).
        let mut hw = if let Some(ref t) = self.hw_type {
            crate::hardware_presets::resolve(t).ok_or_else(|| {
                anyhow::anyhow!(
                    "unknown hardware preset '{t}'. Available: {}",
                    crate::hardware_presets::AVAILABLE.join(", ")
                )
            })?
        } else {
            HardwareConfig {
                gpu_flops: 0.0,
                gpu_mem_bw: 0.0,
                hbm_bytes: 0.0,
                dram_bytes: 0.0,
                pcie_bw: 0.0,
                pcie_latency_us: 5.0,
                rdma_bw: 0.0,
                rdma_latency_us: 8.0,
                max_batch_slots: default_max_batch_slots(),
                prefill_chunk_tokens: default_prefill_chunk_tokens(),
            }
        };

        // Overlay: explicit YAML fields override the preset / defaults.
        if let Some(v) = self.gpu_flops { hw.gpu_flops = v; }
        if let Some(v) = self.gpu_mem_bw { hw.gpu_mem_bw = v; }
        if let Some(v) = self.hbm_bytes { hw.hbm_bytes = v; }
        if let Some(v) = self.dram_bytes { hw.dram_bytes = v; }
        if let Some(v) = self.pcie_bw { hw.pcie_bw = v; }
        if let Some(v) = self.pcie_latency_us { hw.pcie_latency_us = v; }
        if let Some(v) = self.rdma_bw { hw.rdma_bw = v; }
        if let Some(v) = self.rdma_latency_us { hw.rdma_latency_us = v; }
        if let Some(v) = self.max_batch_slots { hw.max_batch_slots = v; }
        if let Some(v) = self.prefill_chunk_tokens { hw.prefill_chunk_tokens = v; }

        // Validate minimum requirements.
        // NOTE(review): dram_bytes / pcie_bw / rdma_bw may legitimately stay
        // 0.0 here — confirm downstream code tolerates zero-bandwidth tiers.
        anyhow::ensure!(hw.gpu_flops > 0.0, "hardware.gpu_flops is required");
        anyhow::ensure!(hw.gpu_mem_bw > 0.0, "hardware.gpu_mem_bw is required");
        anyhow::ensure!(hw.hbm_bytes > 0.0, "hardware.hbm_bytes is required");

        Ok(hw)
    }
}
diff --git a/src/driver.rs b/src/driver.rs
new file mode 100644
index 0000000..03fd2cf
--- /dev/null
+++ b/src/driver.rs
@@ -0,0 +1,170 @@
//! Simulation driver: pulls trace records, advances the event queue, runs
//! instance batch ticks, and emits metrics.

use anyhow::Result;
use std::collections::HashMap;
use std::path::Path;

use crate::cluster::Cluster;
use crate::config::Config;
use crate::metrics::per_request::{PerRequestRow, PerRequestWriter};
use crate::metrics::routing_log::RoutingLogWriter;
use crate::metrics::summary::Summary;
use crate::metrics::timeseries::{TimeseriesRow, TimeseriesWriter};
use crate::sim::{Event, EventQueue};
use crate::trace::{RequestRecord, TraceReader};

/// In-memory results of one simulation run. The same data is also persisted
/// to the output directory (`summary.json`, `per_request.csv`, …).
pub struct RunOutputs {
    pub summary: Summary,
    pub rows: Vec<PerRequestRow>,
}

/// Admission-time bookkeeping for a routed-but-unfinished request; joined
/// with completion info in the `BatchTick` handler to build a PerRequestRow.
#[derive(Debug, Clone)]
struct InflightInfo {
    arrival: f64,           // trace arrival time (s)
    instance: u32,          // instance chosen by the router
    total_blocks: u32,      // prompt length in KV blocks (hash_ids count)
    l0_hit_blocks: u32,     // blocks hit in local GPU HBM (L0)
    l1_hit_blocks: u32,     // blocks hit in local CPU DRAM (L1)
    remote_hit_blocks: u32, // blocks fetched from a peer instance
    miss_blocks: u32,       // blocks that must be recomputed
    rdma_bytes: u64,        // bytes moved over RDMA for this request
    pcie_bytes: u64,        // bytes moved over PCIe for this request
    probe_overhead_s: f64,  // router probe latency charged to this request
}

/// Execute one simulation: replay the trace through the router/instances,
/// stream metrics into `output_dir` (optionally a subdirectory), and return
/// the aggregate summary plus all per-request rows.
pub fn run(config: &Config, output_subdir: Option<&str>) -> Result<RunOutputs> {
    let mut cluster = Cluster::new(config, &config.model);
    let mut q = EventQueue::new();

    // Output directory
    let base = Path::new(&config.sim.output_dir);
    let out_dir = match output_subdir {
        Some(s) => base.join(s),
        None => base.to_path_buf(),
    };
    std::fs::create_dir_all(&out_dir)?;

    let mut req_writer = PerRequestWriter::create(out_dir.join("per_request.csv"))?;
    let mut ts_writer = TimeseriesWriter::create(out_dir.join("instances.csv"))?;
    let mut rt_writer = RoutingLogWriter::create(out_dir.join("routing_log.jsonl"))?;

    let mut trace = TraceReader::open(&config.sim.trace_path, config.sim.max_requests)?;
    // Load all records (cheap for moderate traces) so we can index by req_id.
    // For very large traces a streaming approach with a peekable iterator
    // would be better; this keeps the driver simple.
    let records: Vec<RequestRecord> = (&mut trace).collect::<Result<Vec<_>, _>>()?;
    // NOTE(review): req_id key width assumed u64 — confirm against trace.rs.
    let mut by_id: HashMap<u64, RequestRecord> = HashMap::with_capacity(records.len());
    for r in &records {
        q.schedule(r.arrival, Event::Arrival { req_id: r.req_id });
        by_id.insert(r.req_id, r.clone());
    }
    // Periodic samples
    // Sampling extends 60s past the last arrival so requests that finish
    // after the final arrival still show up in the timeseries.
    if config.sim.sample_interval_s > 0.0 && !records.is_empty() {
        let max_t = records.iter().map(|r| r.arrival).fold(0.0_f64, f64::max);
        let mut t = 0.0;
        while t <= max_t + 60.0 {
            q.schedule(t, Event::Sample);
            t += config.sim.sample_interval_s;
        }
    }

    let mut inflight: HashMap<u64, InflightInfo> = HashMap::new();
    let mut rows: Vec<PerRequestRow> = Vec::with_capacity(records.len());

    while let Some((now, ev)) = q.pop() {
        match ev {
            Event::Arrival { req_id } => {
                let req = match by_id.get(&req_id) {
                    Some(r) => r.clone(),
                    None => continue,
                };
                // Routing + admission happen atomically at arrival time;
                // cache-hit / transfer stats are fixed here and only
                // reported when the request completes.
                let stats = cluster.route_and_admit(&req, now);
                rt_writer.write(&stats.decision)?;
                inflight.insert(
                    req_id,
                    InflightInfo {
                        arrival: req.arrival,
                        instance: stats.instance,
                        total_blocks: req.hash_ids.len() as u32,
                        l0_hit_blocks: stats.l0_hit_blocks,
                        l1_hit_blocks: stats.l1_hit_blocks,
                        remote_hit_blocks: stats.remote_hit_blocks,
                        miss_blocks: stats.miss_blocks,
                        rdma_bytes: stats.rdma_bytes,
                        pcie_bytes: stats.pcie_bytes,
                        probe_overhead_s: stats.probe_overhead_s,
                    },
                );
                // At most one BatchTick is outstanding per instance;
                // `tick_scheduled` dedupes the scheduling.
                let inst = &mut cluster.instances[stats.instance as usize];
                if !inst.tick_scheduled {
                    inst.tick_scheduled = true;
                    let when = stats.ready_at.max(now);
                    q.schedule(when, Event::BatchTick { instance: stats.instance });
                }
            }
            Event::BatchTick { instance } => {
                let inst = &mut cluster.instances[instance as usize];
                inst.tick_scheduled = false;
                let result = inst.step(now);
                // Join completion info with the admission-time record.
                for (rid, ttft, end) in result.completed {
                    if let Some(info) = inflight.remove(&rid) {
                        let row = PerRequestRow {
                            req_id: rid,
                            arrival: info.arrival,
                            ttft,
                            e2e: end - info.arrival,
                            instance: info.instance,
                            total_blocks: info.total_blocks,
                            l0_hit_blocks: info.l0_hit_blocks,
                            l1_hit_blocks: info.l1_hit_blocks,
                            remote_hit_blocks: info.remote_hit_blocks,
                            miss_blocks: info.miss_blocks,
                            rdma_bytes: info.rdma_bytes,
                            pcie_bytes: info.pcie_bytes,
                            probe_overhead_s: info.probe_overhead_s,
                        };
                        req_writer.write(&row)?;
                        rows.push(row);
                    }
                }
                // Re-arm the tick if the instance still has work queued.
                if let Some(next) = result.next_tick {
                    let inst = &mut cluster.instances[instance as usize];
                    if !inst.tick_scheduled {
                        inst.tick_scheduled = true;
                        q.schedule(next.max(now), Event::BatchTick { instance });
                    }
                }
            }
            Event::Sample => {
                for inst in &cluster.instances {
                    let busy = if inst.queue_len() > 0 { 1 } else { 0 };
                    ts_writer.write(&TimeseriesRow {
                        t: now,
                        instance: inst.id,
                        queue_len: inst.queue_len(),
                        kv_blocks_used: inst.kv_blocks_used,
                        kv_blocks_total: inst.hbm_block_budget,
                        busy,
                    })?;
                }
            }
            Event::Stop => break,
        }
    }

    req_writer.finish()?;
    ts_writer.finish()?;
    rt_writer.finish()?;

    // Simulated wall-clock span = latest completion time observed.
    let sim_duration_s = rows
        .iter()
        .map(|r| r.arrival + r.e2e)
        .fold(0.0_f64, f64::max);
    let router_name = config.cluster.router.mode.as_str().to_string();
    let summary = Summary::from_rows(&router_name, &rows, sim_duration_s);
    let summary_json = serde_json::to_string_pretty(&summary)?;
    std::fs::write(out_dir.join("summary.json"), summary_json)?;

    Ok(RunOutputs { summary, rows })
}
diff --git a/src/hardware_presets.rs b/src/hardware_presets.rs
new file mode 100644
index 0000000..b99fd6e
--- /dev/null
+++ b/src/hardware_presets.rs
@@ -0,0 +1,225 @@
//! Built-in hardware presets for common GPU configurations.
//!
//! Presets provide baseline specs for single GPUs and tensor-parallel (TP)
//! groups. All values can be overridden in the YAML config by specifying
//! explicit fields alongside `type`:
//!
//! ```yaml
//! hardware:
//!   type: 8xb200
//!   hbm_bytes: 500.0e9   # override total HBM with actual KV budget
//! ```

use crate::config::HardwareConfig;

/// All recognized preset names (for help/error messages).
+pub const AVAILABLE: &[&str] = &[ + "h100", + "h800", + "h20", + "a100-80gb", + "a100-40gb", + "b200", + "2xh100", + "4xh100", + "8xh100", + "2xh800", + "4xh800", + "8xh800", + "2xh20", + "4xh20", + "8xh20", + "2xb200", + "4xb200", + "8xb200", +]; + +/// Resolve a hardware preset by name. +/// +/// Case-insensitive; hyphens, underscores, and spaces are stripped before +/// matching. Accepts `NxGPU` patterns (e.g. `8xb200`). +pub fn resolve(name: &str) -> Option { + let key = normalize(name); + let (count, gpu) = parse_count_gpu(&key); + match gpu.as_str() { + "h100" => Some(make_config(count, &H100)), + "h800" => Some(make_config(count, &H800)), + "h20" => Some(make_config(count, &H20)), + "a10080gb" | "a100" => Some(make_config(count, &A100_80GB)), + "a10040gb" => Some(make_config(count, &A100_40GB)), + "b200" => Some(make_config(count, &B200)), + _ => None, + } +} + +// --------------------------------------------------------------------------- +// Internals +// --------------------------------------------------------------------------- + +fn normalize(s: &str) -> String { + s.to_ascii_lowercase().replace(['-', '_', ' '], "") +} + +/// Parse `"8xh100"` → `(8, "h100")`, `"h100"` → `(1, "h100")`. 
fn parse_count_gpu(s: &str) -> (u32, String) {
    // The multiplier is only honored when everything before the FIRST 'x'
    // parses as an integer; otherwise the whole key is the GPU name with an
    // implicit count of 1 (e.g. "h100", or the degenerate "xh100").
    if let Some((prefix, rest)) = s.split_once('x') {
        if let Ok(count) = prefix.parse::<u32>() {
            return (count, rest.to_string());
        }
    }
    (1, s.to_string())
}

// -- Per-GPU base specs (single die, BF16 dense) -----------------------------

/// Baseline spec of one GPU die; TP-group configs are scaled from this.
struct GpuBase {
    flops: f64,    // BF16 dense TFLOPS
    mem_bw: f64,   // HBM bandwidth (B/s)
    hbm: f64,      // Total HBM (bytes)
    pcie_gen: u32, // PCIe generation (4/5/6)
}

// 989 TFLOPS BF16, 3.35 TB/s HBM3, 80 GB.
const H100: GpuBase = GpuBase {
    flops: 9.89e14,
    mem_bw: 3.35e12,
    hbm: 80.0e9,
    pcie_gen: 5,
};

// Same die as the H100 (export variant); identical compute/memory specs.
const H800: GpuBase = GpuBase {
    flops: 9.89e14,
    mem_bw: 3.35e12,
    hbm: 80.0e9,
    pcie_gen: 5,
};

// China-export Hopper: 148 TFLOPS BF16 but 4.0 TB/s HBM3 and 96 GB.
const H20: GpuBase = GpuBase {
    flops: 1.48e14,
    mem_bw: 4.0e12,
    hbm: 96.0e9,
    pcie_gen: 5,
};

// 312 TFLOPS BF16, 2.0 TB/s HBM2e, 80 GB.
const A100_80GB: GpuBase = GpuBase {
    flops: 3.12e14,
    mem_bw: 2.0e12,
    hbm: 80.0e9,
    pcie_gen: 4,
};

// 312 TFLOPS BF16, 1.555 TB/s HBM2e, 40 GB.
const A100_40GB: GpuBase = GpuBase {
    flops: 3.12e14,
    mem_bw: 1.555e12,
    hbm: 40.0e9,
    pcie_gen: 4,
};

// 2250 TFLOPS BF16, 8.0 TB/s HBM3e, 192 GB.
const B200: GpuBase = GpuBase {
    flops: 2.25e15,
    mem_bw: 8.0e12,
    hbm: 192.0e9,
    pcie_gen: 6,
};

// Build a `HardwareConfig` from a base GPU spec × TP count.
//
// Compute, HBM bandwidth, and HBM capacity scale linearly with `n`.
// PCIe bandwidth scales linearly (one link per GPU). RDMA bandwidth
// assumes one NIC for ≤4 GPUs and two NICs for ≥8. Server DRAM is a
// reasonable default based on typical deployment sizes.
fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {
    let f = n as f64;

    // PCIe per-GPU bandwidth and latency by generation
    let (pcie_per_gpu, pcie_lat) = match base.pcie_gen {
        6 => (128.0e9, 4.0), // Gen6 x16
        5 => (64.0e9, 5.0),  // Gen5 x16
        _ => (32.0e9, 5.0),  // Gen4 x16
    };

    // RDMA: base NIC speed by PCIe gen, scaled for multi-NIC servers
    let (rdma_base, rdma_lat) = match base.pcie_gen {
        6 => (50.0e9, 6.0), // 400 Gbps NIC
        _ => (25.0e9, 8.0), // 200 Gbps NIC
    };
    // NOTE(review): `resolve` accepts arbitrary `NxGPU` counts, so e.g. n=6
    // takes the single-NIC branch here — confirm that is intended (built-in
    // presets only expose 1/2/4/8).
    let rdma_scale = if n >= 8 { 2.0 } else { 1.0 };

    // Server DRAM: rough defaults by deployment size
    let dram = match n {
        1 => 512.0e9,
        2..=4 => 1.0e12,
        _ => 1.5e12,
    };

    HardwareConfig {
        gpu_flops: base.flops * f,
        gpu_mem_bw: base.mem_bw * f,
        hbm_bytes: base.hbm * f,
        dram_bytes: dram,
        pcie_bw: pcie_per_gpu * f,
        pcie_latency_us: pcie_lat,
        rdma_bw: rdma_base * rdma_scale,
        rdma_latency_us: rdma_lat,
        max_batch_slots: 256,
        // Larger TP groups prefill in bigger chunks.
        prefill_chunk_tokens: if n >= 4 { 4096 } else { 2048 },
    }
}

// Unit tests: preset lookup, normalization, and linear TP scaling.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn resolve_single_gpu() {
        let hw = resolve("h100").unwrap();
        assert!((hw.gpu_flops - 9.89e14).abs() < 1e10);
        assert!((hw.hbm_bytes - 80e9).abs() < 1e6);
        assert_eq!(hw.prefill_chunk_tokens, 2048);
    }

    #[test]
    fn resolve_tp_group() {
        let hw = resolve("8xb200").unwrap();
        assert!((hw.gpu_flops - 2.25e15 * 8.0).abs() < 1e11);
        assert!((hw.hbm_bytes - 192e9 * 8.0).abs() < 1e6);
        assert!((hw.pcie_bw - 128e9 * 8.0).abs() < 1e6);
        assert_eq!(hw.prefill_chunk_tokens, 4096);
    }

    #[test]
    fn resolve_case_and_separator_insensitive() {
        assert!(resolve("H100").is_some());
        assert!(resolve("8xB200").is_some());
        assert!(resolve("8x-B200").is_some());
        assert!(resolve("a100-80gb").is_some());
        assert!(resolve("A100_80GB").is_some());
        assert!(resolve("a100_80gb").is_some());
    }

    #[test]
    fn resolve_unknown_returns_none() {
        assert!(resolve("v100").is_none());
        assert!(resolve("tpu-v5").is_none());
        assert!(resolve("").is_none());
    }

    #[test]
    fn a100_variants() {
        let a80 = resolve("a100-80gb").unwrap();
        let a40 = resolve("a100-40gb").unwrap();
        assert!((a80.hbm_bytes - 80e9).abs() < 1e6);
        assert!((a40.hbm_bytes - 40e9).abs() < 1e6);
        assert!(a80.gpu_mem_bw > a40.gpu_mem_bw);
    }

    #[test]
    fn scaling_is_linear() {
        let s1 = resolve("h100").unwrap();
        let s4 = resolve("4xh100").unwrap();
        let s8 = resolve("8xh100").unwrap();
        assert!((s4.gpu_flops - s1.gpu_flops * 4.0).abs() < 1.0);
        assert!((s8.gpu_flops - s1.gpu_flops * 8.0).abs() < 1.0);
        assert!((s4.gpu_mem_bw - s1.gpu_mem_bw * 4.0).abs() < 1.0);
        assert!((s8.hbm_bytes - s1.hbm_bytes * 8.0).abs() < 1.0);
    }
}
diff --git a/src/hf_config.rs b/src/hf_config.rs
new file mode 100644
index 0000000..8777751
--- /dev/null
+++ b/src/hf_config.rs
@@ -0,0 +1,193 @@
//! Parse a HuggingFace `config.json` into [`ModelConfig`] fields.
//!
//! Handles common architectures: standard transformer, GQA, MoE, MLA
//! (Multi-head Latent Attention), and DSA (DeepSeek Sparse Attention).

use anyhow::{Context, Result};
use serde_json::Value;
use std::path::Path;

use crate::config::{AttentionConfig, MlaConfig, MoeConfig, ModelConfig};

/// Parse a HuggingFace config.json and return a partially-populated
/// [`ModelConfig`]. The caller must still set `dtype_bytes` and
/// `block_size_tokens` (not part of the HF schema).
+pub fn parse(path: &Path) -> Result { + let raw = std::fs::read_to_string(path) + .with_context(|| format!("reading config.json at {}", path.display()))?; + let v: Value = serde_json::from_str(&raw) + .with_context(|| format!("parsing config.json at {}", path.display()))?; + parse_value(&v) +} + +fn u32_field(v: &Value, key: &str) -> Option { + v.get(key).and_then(|x| x.as_u64()).map(|x| x as u32) +} + +fn parse_value(v: &Value) -> Result { + let name = v + .get("model_type") + .and_then(|x| x.as_str()) + .unwrap_or("unknown") + .to_string(); + + let num_layers = u32_field(v, "num_hidden_layers"); + let hidden_size = u32_field(v, "hidden_size"); + let num_attention_heads = u32_field(v, "num_attention_heads"); + let num_kv_heads = u32_field(v, "num_key_value_heads") + .or(num_attention_heads); // default to MHA + let head_dim = u32_field(v, "head_dim").or_else(|| { + // Infer: hidden_size / num_attention_heads + match (hidden_size, num_attention_heads) { + (Some(h), Some(n)) if n > 0 => Some(h / n), + _ => None, + } + }); + let intermediate_size = u32_field(v, "intermediate_size"); + + // --- MoE detection --- + let moe = u32_field(v, "n_routed_experts") + .or_else(|| u32_field(v, "num_local_experts")) + .or_else(|| u32_field(v, "num_experts")) + .map(|num_experts| MoeConfig { + num_experts, + num_active_experts: u32_field(v, "num_experts_per_tok") + .or_else(|| u32_field(v, "num_experts_per_topk")) + .unwrap_or(2), + num_shared_experts: u32_field(v, "n_shared_experts").unwrap_or(0), + expert_intermediate_size: u32_field(v, "moe_intermediate_size"), + }); + + // --- MLA detection (kv_lora_rank present → MLA) --- + let mla = u32_field(v, "kv_lora_rank").and_then(|kv_lora_rank| { + Some(MlaConfig { + kv_lora_rank, + q_lora_rank: u32_field(v, "q_lora_rank")?, + qk_nope_head_dim: u32_field(v, "qk_nope_head_dim")?, + qk_rope_head_dim: u32_field(v, "qk_rope_head_dim")?, + v_head_dim: u32_field(v, "v_head_dim")?, + }) + }); + + // --- Attention pattern --- + let attention 
= + if let Some(first_dense) = u32_field(v, "first_k_dense_replace") { + // DSA-style model (GLM-5, DeepSeek-V3). + // dense_window and sparse_stride are typically not in config.json; + // use sensible defaults the user can override in YAML. + Some(AttentionConfig::Dsa { + dense_window: 4096, + sparse_stride: 8, + first_dense_layers: first_dense, + }) + } else if let Some(sw) = v + .get("sliding_window") + .and_then(|x| x.as_u64()) + .map(|x| x as u32) + { + Some(AttentionConfig::SlidingWindow { window_size: sw }) + } else { + None // dense by default + }; + + Ok(ModelConfig { + name, + num_layers: num_layers.unwrap_or(0), + num_kv_heads: num_kv_heads.unwrap_or(0), + head_dim: head_dim.unwrap_or(0), + hidden_size, + num_attention_heads, + intermediate_size, + moe, + mla, + attention, + // Deployment fields: must come from YAML + dtype_bytes: 0, + block_size_tokens: 0, + ..Default::default() + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_dense_model() { + let json = serde_json::json!({ + "model_type": "qwen2", + "num_hidden_layers": 28, + "hidden_size": 3584, + "num_attention_heads": 28, + "num_key_value_heads": 4, + "intermediate_size": 18944, + }); + let m = parse_value(&json).unwrap(); + assert_eq!(m.num_layers, 28); + assert_eq!(m.hidden_size, Some(3584)); + assert_eq!(m.num_kv_heads, 4); + assert_eq!(m.head_dim, 128); // 3584 / 28 + assert!(m.moe.is_none()); + assert!(m.mla.is_none()); + assert!(m.attention.is_none()); + } + + #[test] + fn parse_qwen3_moe() { + let json = serde_json::json!({ + "model_type": "qwen3_moe", + "num_hidden_layers": 62, + "hidden_size": 6144, + "num_attention_heads": 96, + "num_key_value_heads": 8, + "head_dim": 128, + "intermediate_size": 8192, + "num_experts": 160, + "num_experts_per_tok": 8, + "moe_intermediate_size": 2560, + }); + let m = parse_value(&json).unwrap(); + assert_eq!(m.num_layers, 62); + assert_eq!(m.num_kv_heads, 8); + assert_eq!(m.head_dim, 128); + let moe = m.moe.as_ref().unwrap(); + 
assert_eq!(moe.num_experts, 160); + assert_eq!(moe.num_active_experts, 8); + assert_eq!(moe.expert_intermediate_size, Some(2560)); + assert_eq!(moe.num_shared_experts, 0); + assert!(m.mla.is_none()); + assert!(m.attention.is_none()); + } + + #[test] + fn parse_moe_mla_dsa() { + let json = serde_json::json!({ + "model_type": "glm_moe_dsa", + "num_hidden_layers": 78, + "hidden_size": 6144, + "num_attention_heads": 64, + "num_key_value_heads": 64, + "head_dim": 64, + "intermediate_size": 12288, + "n_routed_experts": 256, + "num_experts_per_tok": 8, + "n_shared_experts": 1, + "moe_intermediate_size": 2048, + "kv_lora_rank": 512, + "q_lora_rank": 2048, + "qk_nope_head_dim": 192, + "qk_rope_head_dim": 64, + "v_head_dim": 256, + "first_k_dense_replace": 3, + }); + let m = parse_value(&json).unwrap(); + assert_eq!(m.num_layers, 78); + assert_eq!(m.head_dim, 64); + let moe = m.moe.as_ref().unwrap(); + assert_eq!(moe.num_experts, 256); + assert_eq!(moe.num_active_experts, 8); + let mla = m.mla.as_ref().unwrap(); + assert_eq!(mla.kv_lora_rank, 512); + assert!(matches!(m.attention, Some(AttentionConfig::Dsa { first_dense_layers: 3, .. }))); + } +} diff --git a/src/instance/compute.rs b/src/instance/compute.rs new file mode 100644 index 0000000..41f6aab --- /dev/null +++ b/src/instance/compute.rs @@ -0,0 +1,405 @@ +//! Roofline cost model for prefill (PD disaggregation — decode not modeled). +//! +//! Two construction modes: +//! +//! **Architecture-derived** (`ModelConfig.hidden_size` present): +//! All FLOPs, attention coefficients, and weight-stream costs are computed +//! from the model shape. Handles standard / GQA / MLA attention projections, +//! MoE routing, and DSA / sliding-window sub-quadratic attention patterns. +//! +//! **Legacy manual** (`hidden_size` absent): uses the raw +//! `flops_per_token_prefill` + `attn_quadratic_coeff` scalars from the YAML. +//! +//! ```text +//! prefill_time(N) = max(compute_time(N), mem_time) +//! +//! 
compute_time = sum over layers of: +//! (N * linear_flops + attn_coeff * N * effective_ctx(N)) / gpu_flops +//! +//! mem_time = num_layers * weight_bytes_per_layer / gpu_mem_bw +//! ``` +//! +//! `effective_ctx(N)` equals `N` for dense attention (→ O(N²) total) but +//! is sub-linear for DSA / sliding-window. + +use crate::config::{AttentionConfig, HardwareConfig, ModelConfig}; + +/// Resolved attention pattern used at runtime. +#[derive(Debug, Clone)] +pub enum AttentionPattern { + /// Full quadratic: effective_ctx = N. + Dense, + /// Sliding window: effective_ctx = min(N, window). + SlidingWindow { window: f64 }, + /// DeepSeek Sparse Attention: effective_ctx = min(N, dense_window) + + /// max(0, N - dense_window) / sparse_stride. + Dsa { dense_window: f64, sparse_stride: f64 }, +} + +#[derive(Debug, Clone)] +pub struct ComputeModel { + /// Total transformer layers. + pub num_layers: f64, + /// How many initial layers use dense attention (rest use `attn_pattern`). + /// For `Dense` pattern this equals `num_layers`. + pub first_dense_layers: f64, + /// Non-attention FLOPs per token per layer (QKV proj + output proj + MLP). + pub linear_flops_per_token: f64, + /// Attention score coefficient: per-layer attention FLOPs = + /// `attn_coeff * N * effective_ctx(N)`. + pub attn_coeff: f64, + /// Attention pattern for non-dense layers. + pub attn_pattern: AttentionPattern, + /// Weight bytes read from HBM per layer (for memory-bound check). + pub weight_bytes_per_layer: f64, + /// Peak GPU FLOPs (aggregate across TP group). + pub gpu_flops: f64, + /// Peak GPU memory bandwidth (aggregate across TP group). 
+ pub gpu_mem_bw: f64, +} + +impl ComputeModel { + pub fn new(model: &ModelConfig, hw: &HardwareConfig) -> Self { + if model.is_arch_mode() { + Self::from_arch(model, hw) + } else { + Self::from_manual(model, hw) + } + } + + // ----- Architecture-derived construction -------------------------------- + + fn from_arch(model: &ModelConfig, hw: &HardwareConfig) -> Self { + let h = model.hidden_size.unwrap() as f64; + let n_heads = model.num_attention_heads.unwrap_or(model.num_kv_heads) as f64; + let n_kv = model.num_kv_heads as f64; + let hd = model.head_dim as f64; + let inter = model.intermediate_size.unwrap_or(0) as f64; + let dtype = model.dtype_bytes as f64; + + // --- Attention linear FLOPs/token/layer --- + let attn_linear = if let Some(mla) = &model.mla { + let qlr = mla.q_lora_rank as f64; + let kvlr = mla.kv_lora_rank as f64; + let qk_hd = (mla.qk_nope_head_dim + mla.qk_rope_head_dim) as f64; + let qk_rd = mla.qk_rope_head_dim as f64; + let vhd = mla.v_head_dim as f64; + // Q: down-project + up-project + let q = 2.0 * h * qlr + 2.0 * qlr * n_heads * qk_hd; + // KV: down-project (compressed latent + RoPE key) + let kv = 2.0 * h * (kvlr + qk_rd); + // Output: up-project + let o = 2.0 * n_heads * vhd * h; + q + kv + o + } else { + // Standard / GQA + let qkv = 2.0 * h * (n_heads + 2.0 * n_kv) * hd; + let o = 2.0 * n_heads * hd * h; + qkv + o + }; + + // --- MLP FLOPs/token/layer (SwiGLU: gate + up + down = 3 matmuls) --- + let mlp = if let Some(moe) = &model.moe { + let expert_inter = moe.expert_intermediate_size + .unwrap_or(model.intermediate_size.unwrap_or(0)) as f64; + let active = moe.num_active_experts as f64; + let shared = moe.num_shared_experts as f64; + active * 6.0 * h * expert_inter + shared * 6.0 * h * inter + } else { + 6.0 * h * inter + }; + + let linear_flops = attn_linear + mlp; + + // --- Attention quadratic coefficient --- + // attn_flops_per_layer(N) = attn_coeff * N * effective_ctx(N) + let attn_coeff = if let Some(mla) = &model.mla { + let 
kvlr = mla.kv_lora_rank as f64; + let qk_rd = mla.qk_rope_head_dim as f64; + // Absorbed QK^T: each head dots over (kv_lora_rank + qk_rope_head_dim) dims. + // Absorbed V: each head dots over kv_lora_rank dims. + 2.0 * n_heads * (2.0 * kvlr + qk_rd) + } else { + // Standard: QK^T + attn@V, each 2 * n_heads * head_dim per pair. + 4.0 * n_heads * hd + }; + + // --- Weight bytes per layer (active params only for MoE) --- + let attn_wt = if let Some(mla) = &model.mla { + let qlr = mla.q_lora_rank as f64; + let kvlr = mla.kv_lora_rank as f64; + let qk_hd = (mla.qk_nope_head_dim + mla.qk_rope_head_dim) as f64; + let qk_rd = mla.qk_rope_head_dim as f64; + let vhd = mla.v_head_dim as f64; + (h * qlr + qlr * n_heads * qk_hd + + h * (kvlr + qk_rd) + + n_heads * vhd * h) + * dtype + } else { + ((n_heads + 2.0 * n_kv) * hd * h + n_heads * hd * h) * dtype + }; + let mlp_wt = if let Some(moe) = &model.moe { + let expert_inter = moe.expert_intermediate_size + .unwrap_or(model.intermediate_size.unwrap_or(0)) as f64; + let active = moe.num_active_experts as f64; + let shared = moe.num_shared_experts as f64; + (active * 3.0 * h * expert_inter + shared * 3.0 * h * inter) * dtype + } else { + 3.0 * h * inter * dtype + }; + let weight_bytes = attn_wt + mlp_wt; + + // --- Attention pattern --- + let (attn_pattern, first_dense) = match &model.attention { + Some(AttentionConfig::Dsa { + dense_window, + sparse_stride, + first_dense_layers, + }) => ( + AttentionPattern::Dsa { + dense_window: *dense_window as f64, + sparse_stride: *sparse_stride as f64, + }, + *first_dense_layers as f64, + ), + Some(AttentionConfig::SlidingWindow { window_size }) => ( + AttentionPattern::SlidingWindow { + window: *window_size as f64, + }, + 0.0, + ), + Some(AttentionConfig::Dense) | None => ( + AttentionPattern::Dense, + model.num_layers as f64, + ), + }; + + Self { + num_layers: model.num_layers as f64, + first_dense_layers: first_dense, + linear_flops_per_token: linear_flops, + attn_coeff, + attn_pattern, 
            weight_bytes_per_layer: weight_bytes,
            gpu_flops: hw.gpu_flops,
            gpu_mem_bw: hw.gpu_mem_bw,
        }
    }

    // ----- Legacy manual construction ---------------------------------------

    /// Build a model from explicitly supplied coefficients instead of deriving
    /// them from the architecture. Attention pattern is forced to dense and
    /// weight streaming is disabled (`weight_bytes_per_layer = 0`), so only
    /// the compute term of `prefill_time` is active in this mode.
    fn from_manual(model: &ModelConfig, hw: &HardwareConfig) -> Self {
        Self {
            num_layers: model.num_layers as f64,
            first_dense_layers: model.num_layers as f64,
            linear_flops_per_token: model.flops_per_token_prefill.unwrap_or(0.0),
            attn_coeff: model.attn_quadratic_coeff.unwrap_or(0.0),
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 0.0,
            gpu_flops: hw.gpu_flops,
            gpu_mem_bw: hw.gpu_mem_bw,
        }
    }

    // ----- Prefill time -----------------------------------------------------

    /// Effective context length a single token attends to at sequence length N.
    ///
    /// Dense layers (and the dense pattern) attend to all N tokens; sliding
    /// window caps at the window size; DSA attends densely inside
    /// `dense_window` and then to every `sparse_stride`-th token beyond it.
    fn effective_ctx(&self, n: f64, dense_layer: bool) -> f64 {
        if dense_layer {
            return n;
        }
        match &self.attn_pattern {
            AttentionPattern::Dense => n,
            AttentionPattern::SlidingWindow { window } => n.min(*window),
            AttentionPattern::Dsa {
                dense_window,
                sparse_stride,
            } => {
                if n <= *dense_window {
                    n
                } else {
                    *dense_window + (n - *dense_window) / *sparse_stride
                }
            }
        }
    }

    /// Time (s) to prefill `n` tokens.
    ///
    /// Roofline model: per-layer compute (linear + attention) summed over the
    /// dense and sparse layer groups, compared against the time to stream all
    /// layers' active weights from HBM once; the slower of the two wins.
    ///
    /// NOTE(review): `linear` is multiplied by the per-layer counts below, so
    /// `linear_flops_per_token` must be a *per-layer* figure; `from_manual`
    /// feeds it `flops_per_token_prefill` — confirm that config field is
    /// per-layer rather than whole-model, otherwise legacy mode overcounts
    /// by `num_layers`×.
    pub fn prefill_time(&self, n: u32) -> f64 {
        if n == 0 {
            return 0.0;
        }
        let n = n as f64;
        let linear = n * self.linear_flops_per_token;

        // Compute FLOPs across all layers (dense + sparse may differ).
        let dense_layers = self.first_dense_layers;
        let sparse_layers = self.num_layers - dense_layers;

        let dense_flops = dense_layers
            * (linear + self.attn_coeff * n * self.effective_ctx(n, true));
        let sparse_flops = sparse_layers
            * (linear + self.attn_coeff * n * self.effective_ctx(n, false));
        let total_flops = dense_flops + sparse_flops;

        let compute_time = total_flops / self.gpu_flops;
        // Weight stream: all layers' active weights read once from HBM.
        let mem_time = self.weight_bytes_per_layer * self.num_layers / self.gpu_mem_bw;

        compute_time.max(mem_time)
    }

    /// Print human-readable derived coefficients (for `validate` output).
    pub fn describe(&self) -> String {
        let pattern_str = match &self.attn_pattern {
            AttentionPattern::Dense => "dense".to_string(),
            AttentionPattern::SlidingWindow { window } => format!("sliding_window({})", *window as u64),
            AttentionPattern::Dsa {
                dense_window,
                sparse_stride,
            } => format!(
                "dsa(window={}, stride={}, {} dense layers)",
                *dense_window as u64, *sparse_stride as u64, self.first_dense_layers as u64
            ),
        };
        format!(
            "linear_flops/tok/layer={:.3e}, attn_coeff={:.0}, pattern={}, \
             weight_bytes/layer={:.2e}",
            self.linear_flops_per_token, self.attn_coeff, pattern_str,
            self.weight_bytes_per_layer,
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // A small dense legacy-mode model used by several tests below.
    fn cm_legacy() -> ComputeModel {
        ComputeModel {
            num_layers: 28.0,
            first_dense_layers: 28.0,
            linear_flops_per_token: 1.4e10,
            attn_coeff: 1024.0,
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 0.0,
            gpu_flops: 9.89e14,
            gpu_mem_bw: 3.35e12,
        }
    }

    #[test]
    fn prefill_monotonic_in_n() {
        let m = cm_legacy();
        let mut prev = 0.0;
        for &n in &[1u32, 8, 64, 512, 4096, 32768] {
            let t = m.prefill_time(n);
            assert!(t > prev, "prefill_time should be monotonic; n={n} t={t}");
            prev = t;
        }
    }

    #[test]
    fn quadratic_dominates_for_long_prompt() {
        // A 32x longer prompt must cost more than 32x (superlinear attention).
        let m = cm_legacy();
        let lin = m.prefill_time(1024);
        let big = m.prefill_time(32768);
        assert!(big / lin > 32.0);
    }

    #[test]
    fn zero_tokens_is_free() {
        let m = cm_legacy();
        assert_eq!(m.prefill_time(0), 0.0);
    }

    #[test]
    fn dsa_subquadratic() {
        // With DSA (window=4096, stride=8) the cost at 128k should be
        // MUCH less than pure quadratic.
        let dense = ComputeModel {
            num_layers: 78.0,
            first_dense_layers: 78.0,
            linear_flops_per_token: 1.0e9,
            attn_coeff: 139264.0,
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 0.0,
            gpu_flops: 1.8e16,
            gpu_mem_bw: 6.4e13,
        };
        let dsa = ComputeModel {
            attn_pattern: AttentionPattern::Dsa {
                dense_window: 4096.0,
                sparse_stride: 8.0,
            },
            first_dense_layers: 3.0,
            ..dense.clone()
        };
        let n = 131072; // 128k tokens
        let t_dense = dense.prefill_time(n);
        let t_dsa = dsa.prefill_time(n);
        // DSA should be dramatically cheaper at long context.
        assert!(
            t_dsa < t_dense * 0.3,
            "DSA should be <30% of dense at 128k: dense={t_dense:.3} dsa={t_dsa:.3}"
        );
        // But still monotonic.
        assert!(t_dsa > dsa.prefill_time(n / 2));
    }

    #[test]
    fn mem_bound_short_prefill() {
        // With very heavy weights and a short prompt, memory should dominate.
        let m = ComputeModel {
            num_layers: 10.0,
            first_dense_layers: 10.0,
            linear_flops_per_token: 1.0e6, // tiny compute
            attn_coeff: 1.0,
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 1.0e12, // 1 TB per layer
            gpu_flops: 1.0e15,
            gpu_mem_bw: 1.0e12,
        };
        let t1 = m.prefill_time(1);
        let t8 = m.prefill_time(8);
        // Memory time = 10 * 1e12 / 1e12 = 10s, should dominate.
        assert!((t1 - 10.0).abs() < 0.01);
        // Doubling tokens shouldn't change time much (mem-bound).
        assert!((t8 - t1).abs() / t1 < 0.01);
    }

    #[test]
    fn arch_derives_from_model_config() {
        // Minimal dense model: verify from_arch produces something sensible.
        let model = ModelConfig {
            name: "test".into(),
            num_layers: 4,
            num_kv_heads: 2,
            head_dim: 64,
            dtype_bytes: 2,
            block_size_tokens: 16,
            hidden_size: Some(256),
            num_attention_heads: Some(4),
            intermediate_size: Some(512),
            ..Default::default()
        };
        let hw = HardwareConfig {
            gpu_flops: 1e14,
            gpu_mem_bw: 1e12,
            hbm_bytes: 1e9,
            dram_bytes: 4e9,
            pcie_bw: 32e9,
            pcie_latency_us: 1.0,
            rdma_bw: 12e9,
            rdma_latency_us: 5.0,
            max_batch_slots: 32,
            prefill_chunk_tokens: 1024,
        };
        let cm = ComputeModel::new(&model, &hw);
        assert!(cm.linear_flops_per_token > 0.0);
        assert!(cm.attn_coeff > 0.0);
        assert!(cm.weight_bytes_per_layer > 0.0);
        let t = cm.prefill_time(1024);
        assert!(t > 0.0);
    }
}
diff --git a/src/instance/instance.rs b/src/instance/instance.rs
new file mode 100644
index 0000000..c694d3c
--- /dev/null
+++ b/src/instance/instance.rs
@@ -0,0 +1,191 @@
//! One simulated **prefill** serving instance.
//!
//! This simulator assumes **PD (prefill/decode) disaggregation**: prefill
//! and decode run on dedicated instance pools, and only prefill instances
//! are modeled here. The decode side is invisible to the KV-cache-aware
//! routing problem we are studying — once prefill finishes, the KV cache is
//! shipped to a decode instance via a separate (out-of-scope) path.
//!
//! As a result this `Instance`:
//! * Owns a two-tier KV cache (L0 = HBM, L1 = DRAM / v6d) used only for
//!   prefill prefix reuse.
//! * Owns the PCIe / RDMA links used for cache fetches.
//! * Runs a simple FCFS chunked-prefill scheduler: one request's prefill
//!   chunk per step, up to `prefill_chunk_tokens` per chunk.
//!
//! The cluster (`crate::cluster`) is responsible for routing arrivals,
//! consulting the global meta store, and inserting fetched blocks into the
//! instance's caches before handing the request off via `admit`.
+ +use std::collections::VecDeque; + +use crate::config::{HardwareConfig, ModelConfig}; +use crate::instance::compute::ComputeModel; +use crate::instance::kv_cache::TwoTierCache; +use crate::network::InstanceLinks; +use crate::types::{InstanceId, ReqId}; + +#[derive(Debug, Clone)] +pub struct AdmittedRequest { + pub req_id: ReqId, + pub arrival: f64, + /// Earliest time at which the KV fetch chain (L1 + RDMA + PCIe) for this + /// request has completed, so its prefill compute can begin. + pub ready_at: f64, + /// Tokens still needing prefill compute (after cache hits accounted for). + pub prefill_tokens_remaining: u32, + /// KV blocks reserved on this instance's HBM for the lifetime of this + /// request's prefill (= number of input blocks). + pub reserved_blocks: u32, +} + +#[derive(Debug)] +pub struct StepResult { + pub completed: Vec<(ReqId, f64, f64)>, // (req_id, ttft, end_time) + pub next_tick: Option, +} + +pub struct Instance { + pub id: InstanceId, + pub cache: TwoTierCache, + pub links: InstanceLinks, + pub compute: ComputeModel, + pub block_size_tokens: u32, + pub hbm_block_budget: u32, + pub dram_block_budget: u32, + pub max_batch_slots: u32, + pub prefill_chunk_tokens: u32, + + pub kv_blocks_used: u32, + + /// Admitted but not yet ready (waiting for fetch chain to land). + pending: VecDeque, + /// Ready and currently being prefilled (FCFS, one at a time per step). + prefilling: VecDeque, + + /// True if a BatchTick is already on the global queue for us. 
+ pub tick_scheduled: bool, +} + +impl Instance { + pub fn new(id: InstanceId, model: &ModelConfig, hw: &HardwareConfig) -> Self { + let block_bytes = model.kv_block_bytes() as f64; + let hbm_blocks = (hw.hbm_bytes / block_bytes).max(1.0) as u32; + let dram_blocks = (hw.dram_bytes / block_bytes).max(1.0) as u32; + Self { + id, + cache: TwoTierCache::new(hbm_blocks as usize, dram_blocks as usize), + links: InstanceLinks::from_hw(hw), + compute: ComputeModel::new(model, hw), + block_size_tokens: model.block_size_tokens, + hbm_block_budget: hbm_blocks, + dram_block_budget: dram_blocks, + max_batch_slots: hw.max_batch_slots, + prefill_chunk_tokens: hw.prefill_chunk_tokens, + kv_blocks_used: 0, + pending: VecDeque::new(), + prefilling: VecDeque::new(), + tick_scheduled: false, + } + } + + pub fn queue_len(&self) -> u32 { + (self.pending.len() + self.prefilling.len()) as u32 + } + + /// Total prefill tokens remaining across all pending and prefilling requests. + pub fn waiting_tokens(&self) -> u64 { + self.pending + .iter() + .chain(self.prefilling.iter()) + .map(|r| r.prefill_tokens_remaining as u64) + .sum() + } + + /// Estimated wall-clock time to drain all currently queued requests. + /// + /// Sums `compute.prefill_time(tokens_j)` for each queued request, + /// capturing the non-linear (quadratic / DSA) cost accurately. + pub fn estimated_drain_time(&self) -> f64 { + self.pending + .iter() + .chain(self.prefilling.iter()) + .map(|r| self.compute.prefill_time(r.prefill_tokens_remaining)) + .sum() + } + + pub fn admit(&mut self, req: AdmittedRequest) { + self.pending.push_back(req); + } + + /// Run one batch step. Returns any requests that finished prefill during + /// this step plus the next wakeup time for the instance. + pub fn step(&mut self, now: f64) -> StepResult { + let mut completed = Vec::new(); + + // 1. Drain ready pending requests into prefilling, respecting KV + // budget and slot cap. 
A request whose fetch chain is complete + // *and* has zero prefill tokens (full cache hit) finishes + // immediately at `now`. + while let Some(front) = self.pending.front() { + if front.ready_at > now { + break; + } + if self.prefilling.len() as u32 >= self.max_batch_slots { + break; + } + if self.kv_blocks_used + front.reserved_blocks > self.hbm_block_budget { + break; + } + let r = self.pending.pop_front().unwrap(); + self.kv_blocks_used += r.reserved_blocks; + if r.prefill_tokens_remaining == 0 { + // Full cache hit: nothing to compute. TTFT == fetch time. + let ttft = now - r.arrival; + self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks); + completed.push((r.req_id, ttft, now)); + } else { + self.prefilling.push_back(r); + } + } + + // 2. Run one chunked-prefill step on the head of `prefilling`. + let chunk_tokens = self + .prefilling + .front() + .map(|r| r.prefill_tokens_remaining.min(self.prefill_chunk_tokens)) + .unwrap_or(0); + + if chunk_tokens == 0 { + // Nothing compute-bound in flight right now. + return StepResult { + completed, + next_tick: self.next_wakeup(now), + }; + } + + let dt = self.compute.prefill_time(chunk_tokens); + let t_end = now + dt; + + let head = self.prefilling.front_mut().unwrap(); + head.prefill_tokens_remaining -= chunk_tokens; + if head.prefill_tokens_remaining == 0 { + let done = self.prefilling.pop_front().unwrap(); + let ttft = t_end - done.arrival; + self.kv_blocks_used = self.kv_blocks_used.saturating_sub(done.reserved_blocks); + completed.push((done.req_id, ttft, t_end)); + } + + StepResult { + completed, + next_tick: self.next_wakeup(t_end), + } + } + + fn next_wakeup(&self, after: f64) -> Option { + if !self.prefilling.is_empty() { + return Some(after); + } + self.pending.front().map(|r| r.ready_at.max(after)) + } +} diff --git a/src/instance/kv_cache.rs b/src/instance/kv_cache.rs new file mode 100644 index 0000000..9daaa56 --- /dev/null +++ b/src/instance/kv_cache.rs @@ -0,0 +1,226 @@ +//! 
Two-tier LRU KV cache (L0 = GPU HBM, L1 = CPU DRAM / v6d). +//! +//! Each tier stores block hashes; the unit of accounting is one 16-token +//! block. `longest_prefix` walks the hash slice front-to-back and returns the +//! count of leading blocks present in the tier (and touches them so they +//! stay hot). +//! +//! On insert, evicted block hashes are returned so the caller (instance) can +//! propagate them to the global meta store if desired. + +use ahash::AHashMap; + +/// Doubly-linked-list-backed LRU keyed by block hash. +#[derive(Debug)] +pub struct LruBlocks { + capacity: usize, + map: AHashMap, + nodes: Vec, + head: Option, // most recently used + tail: Option, // least recently used + free: Vec, +} + +#[derive(Debug, Clone, Copy)] +struct Node { + key: u64, + prev: Option, + next: Option, +} + +impl LruBlocks { + pub fn new(capacity: usize) -> Self { + Self { + capacity, + map: AHashMap::with_capacity(capacity), + nodes: Vec::with_capacity(capacity), + head: None, + tail: None, + free: Vec::new(), + } + } + + pub fn capacity(&self) -> usize { + self.capacity + } + + pub fn len(&self) -> usize { + self.map.len() + } + + pub fn is_empty(&self) -> bool { + self.map.is_empty() + } + + pub fn contains(&self, key: u64) -> bool { + self.map.contains_key(&key) + } + + /// Touch (move to MRU) if present. Returns whether the key was present. + pub fn touch(&mut self, key: u64) -> bool { + if let Some(&idx) = self.map.get(&key) { + self.move_to_head(idx); + true + } else { + false + } + } + + /// Insert blocks; evicted hashes appended to `evicted_out`. Reinserting an + /// existing block just touches it. + pub fn insert_blocks(&mut self, hashes: &[u64], evicted_out: &mut Vec) { + for &h in hashes { + if self.touch(h) { + continue; + } + // need to make room? 
+ if self.map.len() == self.capacity { + if let Some(tail_idx) = self.tail { + let tail_key = self.nodes[tail_idx].key; + self.detach(tail_idx); + self.map.remove(&tail_key); + self.free.push(tail_idx); + evicted_out.push(tail_key); + } + } + // allocate node + let idx = if let Some(i) = self.free.pop() { + self.nodes[i] = Node { key: h, prev: None, next: None }; + i + } else { + let i = self.nodes.len(); + self.nodes.push(Node { key: h, prev: None, next: None }); + i + }; + self.map.insert(h, idx); + self.attach_to_head(idx); + } + } + + /// Longest leading prefix of `hashes` present; touches the matched blocks. + pub fn longest_prefix(&mut self, hashes: &[u64]) -> usize { + let mut n = 0usize; + for &h in hashes { + if !self.touch(h) { + break; + } + n += 1; + } + n + } + + /// Read-only longest prefix without LRU updates (used for routing probes). + pub fn longest_prefix_peek(&self, hashes: &[u64]) -> usize { + let mut n = 0usize; + for &h in hashes { + if !self.map.contains_key(&h) { + break; + } + n += 1; + } + n + } + + fn move_to_head(&mut self, idx: usize) { + if Some(idx) == self.head { + return; + } + self.detach(idx); + self.attach_to_head(idx); + } + + fn detach(&mut self, idx: usize) { + let (prev, next) = { + let n = &self.nodes[idx]; + (n.prev, n.next) + }; + if let Some(p) = prev { + self.nodes[p].next = next; + } else { + // it was the head + self.head = next; + } + if let Some(nx) = next { + self.nodes[nx].prev = prev; + } else { + // it was the tail + self.tail = prev; + } + self.nodes[idx].prev = None; + self.nodes[idx].next = None; + } + + fn attach_to_head(&mut self, idx: usize) { + let old_head = self.head; + self.nodes[idx].prev = None; + self.nodes[idx].next = old_head; + if let Some(h) = old_head { + self.nodes[h].prev = Some(idx); + } + self.head = Some(idx); + if self.tail.is_none() { + self.tail = Some(idx); + } + } +} + +/// Two-tier (HBM, DRAM) cache. 
#[derive(Debug)]
pub struct TwoTierCache {
    // L0 = GPU HBM tier.
    pub l0: LruBlocks,
    // L1 = CPU DRAM tier.
    pub l1: LruBlocks,
}

impl TwoTierCache {
    /// Build both tiers with their respective capacities (in blocks).
    pub fn new(l0_cap: usize, l1_cap: usize) -> Self {
        Self {
            l0: LruBlocks::new(l0_cap),
            l1: LruBlocks::new(l1_cap),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lcp_full_partial_empty() {
        let mut c = LruBlocks::new(8);
        let mut ev = Vec::new();
        c.insert_blocks(&[1, 2, 3, 4], &mut ev);
        assert_eq!(c.longest_prefix(&[1, 2, 3, 4, 5, 6]), 4);
        assert_eq!(c.longest_prefix(&[1, 2, 9]), 2);
        assert_eq!(c.longest_prefix(&[99, 1]), 0);
    }

    #[test]
    fn lru_eviction_order() {
        let mut c = LruBlocks::new(3);
        let mut ev = Vec::new();
        c.insert_blocks(&[1, 2, 3], &mut ev);
        assert!(ev.is_empty());
        // touch 1 -> MRU
        c.touch(1);
        // insert 4 -> evicts LRU which should be 2
        c.insert_blocks(&[4], &mut ev);
        assert_eq!(ev, vec![2]);
        assert!(c.contains(1));
        assert!(c.contains(3));
        assert!(c.contains(4));
        assert!(!c.contains(2));
    }

    #[test]
    fn longest_prefix_touches_blocks() {
        let mut c = LruBlocks::new(3);
        let mut ev = Vec::new();
        c.insert_blocks(&[1, 2, 3], &mut ev);
        // touch 1 via prefix lookup (only the first matching block: 1)
        assert_eq!(c.longest_prefix(&[1, 99]), 1);
        // now insert 4 -> LRU should be 2 (since 3 was just inserted MRU after 2,
        // 1 is freshest, then 3, then 2)
        c.insert_blocks(&[4], &mut ev);
        assert_eq!(ev, vec![2]);
    }
}
diff --git a/src/instance/mod.rs b/src/instance/mod.rs
new file mode 100644
index 0000000..8cbb697
--- /dev/null
+++ b/src/instance/mod.rs
@@ -0,0 +1,6 @@
pub mod compute;
pub mod kv_cache;
#[allow(clippy::module_inception)]
pub mod instance;

pub use instance::Instance;
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..36f9e98
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,13 @@
pub mod cluster;
pub mod config;
pub mod driver;
pub mod hardware_presets;
pub mod hf_config;
pub mod instance;
pub mod metrics;
pub
mod network;
pub mod oracle;
pub mod router;
pub mod sim;
pub mod trace;
pub mod types;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..14289b2
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,271 @@
use anyhow::{Context, Result};
use clap::{Args, Parser, Subcommand};
use std::path::PathBuf;

use kvcache_simulator::config::{Config, RouterMode};
use kvcache_simulator::{driver, oracle, trace::TraceReader};

// Top-level CLI: one subcommand per workflow (run / ablate / validate / oracle).
#[derive(Debug, Parser)]
#[command(name = "kvcache-sim", about = "Cluster-level KV cache simulator")]
struct Cli {
    #[command(subcommand)]
    cmd: Cmd,
}

/// Optional CLI overrides applied on top of the YAML config so the same
/// config can be reused across sweeps without editing the file.
#[derive(Debug, Args, Clone, Default)]
struct ConfigOverrides {
    /// Override `cluster.num_instances`.
    #[arg(long)]
    num_instances: Option,
    /// Override `sim.max_requests` (cap on processed trace records).
    #[arg(long)]
    max_requests: Option,
    /// Override `sim.trace_path`.
    #[arg(long)]
    trace: Option,
    /// Override `sim.output_dir`.
    #[arg(long)]
    output_dir: Option,
    /// Override `sim.seed`.
    #[arg(long)]
    seed: Option,
    /// Override `cluster.router.precise_probe_topk`.
    #[arg(long)]
    precise_topk: Option,
    /// Override `cluster.meta_store.ttl_seconds`.
    #[arg(long)]
    ttl_seconds: Option,
}

impl ConfigOverrides {
    // Copy every override that was actually supplied into the parsed config.
    fn apply(&self, cfg: &mut Config) {
        if let Some(n) = self.num_instances {
            cfg.cluster.num_instances = n;
        }
        if let Some(m) = self.max_requests {
            cfg.sim.max_requests = Some(m);
        }
        if let Some(t) = &self.trace {
            cfg.sim.trace_path = t.to_string_lossy().into_owned();
        }
        if let Some(o) = &self.output_dir {
            cfg.sim.output_dir = o.to_string_lossy().into_owned();
        }
        if let Some(s) = self.seed {
            cfg.sim.seed = s;
        }
        if let Some(k) = self.precise_topk {
            cfg.cluster.router.precise_probe_topk = k;
        }
        if let Some(ttl) = self.ttl_seconds {
            cfg.cluster.meta_store.ttl_seconds = ttl;
        }
    }
}

#[derive(Debug, Subcommand)]
enum Cmd {
    /// Run a single simulation with the router specified in the config.
    Run {
        #[arg(short, long)]
        config: PathBuf,
        #[command(flatten)]
        overrides: ConfigOverrides,
    },
    /// Run the same trace under multiple routers and compare summaries.
    Ablate {
        #[arg(short, long)]
        config: PathBuf,
        /// Comma-separated router modes
        #[arg(
            short,
            long,
            default_value = "random,least_loaded,least_tokens,ttl_aware,min_pd,cache_load,cache_score,estimated_ttft,prefix_affinity"
        )]
        routers: String,
        #[command(flatten)]
        overrides: ConfigOverrides,
    },
    /// Parse the config and trace head; do not run a simulation.
    Validate {
        #[arg(short, long)]
        config: PathBuf,
        #[command(flatten)]
        overrides: ConfigOverrides,
    },
    /// Offline oracle analysis: theoretical hit-rate ceilings (unlimited
    /// cache and offline-optimal Belady eviction at finite capacity), plus
    /// LRU at the same capacity for comparison.
    Oracle {
        #[arg(short, long)]
        config: PathBuf,
        #[command(flatten)]
        overrides: ConfigOverrides,
        /// Cache capacity (in 16-token blocks) used for the Belady and LRU
        /// analyses. Defaults to `num_instances * per_instance_HBM_blocks`
        /// (the cluster-aggregate capacity).
        #[arg(long)]
        capacity_blocks: Option,
        /// Use the per-instance HBM block budget instead of the
        /// cluster-aggregate. Mutually exclusive with --capacity-blocks.
        #[arg(long, default_value_t = false)]
        per_instance: bool,
        /// Optional output JSON path. Defaults to `<output_dir>/oracle.json`.
        #[arg(long)]
        out: Option,
    },
}

fn main() -> Result<()> {
    let cli = Cli::parse();
    match cli.cmd {
        Cmd::Run { config, overrides } => cmd_run(&config, &overrides),
        Cmd::Ablate {
            config,
            routers,
            overrides,
        } => cmd_ablate(&config, &routers, &overrides),
        Cmd::Validate { config, overrides } => cmd_validate(&config, &overrides),
        Cmd::Oracle {
            config,
            overrides,
            capacity_blocks,
            per_instance,
            out,
        } => cmd_oracle(&config, &overrides, capacity_blocks, per_instance, out.as_deref()),
    }
}

// Parse the YAML config and layer any CLI overrides on top.
fn load(config: &PathBuf, overrides: &ConfigOverrides) -> Result {
    let mut cfg = Config::from_yaml_path(config)?;
    overrides.apply(&mut cfg);
    Ok(cfg)
}

fn cmd_run(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
    let cfg = load(path, overrides)?;
    let out = driver::run(&cfg, None)?;
    println!("{}", serde_json::to_string_pretty(&out.summary)?);
    Ok(())
}

// Run the identical trace once per router mode; aggregate summaries into
// `<output_dir>/ablation.json` and echo them to stdout.
fn cmd_ablate(path: &PathBuf, routers: &str, overrides: &ConfigOverrides) -> Result<()> {
    let base = load(path, overrides)?;
    let modes: Vec = routers
        .split(',')
        .map(|s| s.trim())
        .filter(|s| !s.is_empty())
        .map(RouterMode::parse)
        .collect::>>()
        .with_context(|| format!("parsing --routers='{routers}'"))?;
    let mut all = Vec::new();
    for mode in modes {
        let mut cfg = base.clone();
        cfg.cluster.router.mode = mode;
        let sub = mode.as_str().to_string();
        eprintln!("[ablate] running router={}", sub);
        let out = driver::run(&cfg, Some(&sub))?;
        all.push(out.summary);
    }
    let agg_path = std::path::Path::new(&base.sim.output_dir).join("ablation.json");
    std::fs::create_dir_all(&base.sim.output_dir)?;
    std::fs::write(&agg_path, serde_json::to_string_pretty(&all)?)?;
    println!("{}", serde_json::to_string_pretty(&all)?);
    eprintln!("[ablate] wrote {}", agg_path.display());
    Ok(())
}

// Sanity-check a config without simulating: derived compute coefficients,
// cache geometry, sample prefill times, and the first five trace records.
fn cmd_validate(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
    use kvcache_simulator::instance::compute::ComputeModel;
    let cfg = load(path, overrides)?;
    eprintln!("config OK: {}", cfg.model.name);
    eprintln!("mode = {}", if cfg.model.is_arch_mode() { "architecture-derived" } else { "legacy manual" });
    let cm = ComputeModel::new(&cfg.model, &cfg.hardware);
    eprintln!("compute: {}", cm.describe());
    eprintln!("kv_block_bytes = {} ({:.2} MB{})",
        cfg.model.kv_block_bytes(),
        cfg.model.kv_block_bytes() as f64 / 1e6,
        if cfg.model.mla.is_some() { ", MLA compressed" } else { "" },
    );
    let block_bytes = cfg.model.kv_block_bytes() as f64;
    let hbm_blocks = (cfg.hardware.hbm_bytes / block_bytes) as u64;
    let dram_blocks = (cfg.hardware.dram_bytes / block_bytes) as u64;
    eprintln!("per-instance HBM blocks = {hbm_blocks}, DRAM blocks = {dram_blocks}");
    eprintln!("num_instances = {}", cfg.cluster.num_instances);
    // Sample prefill times at a few prompt lengths.
    eprintln!("prefill_time samples:");
    for &n in &[256, 1024, 4096, 16384, 65536, 131072] {
        let t = cm.prefill_time(n);
        eprintln!("  N={n:>7} -> {t:.4} s");
    }
    let reader = TraceReader::open(&cfg.sim.trace_path, Some(5))?;
    for rec in reader {
        let rec = rec?;
        eprintln!(
            "  req {} chat={} t={:.3}s in={} out={} blocks={}",
            rec.req_id,
            rec.chat_id,
            rec.arrival,
            rec.input_len,
            rec.output_len,
            rec.hash_ids.len()
        );
    }
    Ok(())
}

// Load the whole trace and run the offline hit-rate ceilings (unlimited /
// Belady / LRU) at the chosen capacity, writing the JSON result to disk.
fn cmd_oracle(
    path: &PathBuf,
    overrides: &ConfigOverrides,
    capacity_blocks: Option,
    per_instance: bool,
    out_path: Option<&std::path::Path>,
) -> Result<()> {
    let cfg = load(path, overrides)?;
    let block_bytes = cfg.model.kv_block_bytes() as f64;
    let per_instance_blocks = (cfg.hardware.hbm_bytes / block_bytes).max(1.0) as u64;
    let aggregate_blocks = per_instance_blocks * cfg.cluster.num_instances as u64;
    let capacity = match (capacity_blocks, per_instance) {
        (Some(_), true) => {
            return Err(anyhow::anyhow!(
                "--capacity-blocks and --per-instance are mutually exclusive"
            ))
        }
        (Some(c), false) => c,
        (None, true) => per_instance_blocks,
        (None, false) => aggregate_blocks,
    };

    eprintln!(
        "[oracle] loading trace {} (max_requests={:?})",
        cfg.sim.trace_path, cfg.sim.max_requests
    );
    let reader = TraceReader::open(&cfg.sim.trace_path, cfg.sim.max_requests)?;
    let records: Vec<_> = reader.collect::, _>>()?;
    eprintln!(
        "[oracle] loaded {} requests; analyzing with capacity = {} blocks \
         ({} per-instance × {} instances{})",
        records.len(),
        capacity,
        per_instance_blocks,
        cfg.cluster.num_instances,
        if per_instance { ", per-instance mode" } else { "" }
    );

    let result = oracle::analyze(&records, capacity);
    let json = serde_json::to_string_pretty(&result)?;
    println!("{}", json);

    let target = match out_path {
        Some(p) => p.to_path_buf(),
        None => std::path::Path::new(&cfg.sim.output_dir).join("oracle.json"),
    };
    if let Some(parent) = target.parent() {
        std::fs::create_dir_all(parent)?;
    }
    std::fs::write(&target, &json)?;
    eprintln!("[oracle] wrote {}", target.display());
    Ok(())
}
diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs
new file mode 100644
index 0000000..86ea2cf
--- /dev/null
+++ b/src/metrics/mod.rs
@@ -0,0 +1,7 @@
pub mod per_request;
pub mod routing_log;
pub mod summary;
pub mod timeseries;

pub use per_request::PerRequestRow;
pub use summary::Summary;
diff --git a/src/metrics/per_request.rs b/src/metrics/per_request.rs
new file mode 100644
index 0000000..b4ccdd8
--- /dev/null
+++ b/src/metrics/per_request.rs
@@ -0,0 +1,42 @@
use anyhow::Result;
use serde::Serialize;
use std::path::Path;

// One CSV row of per-request metrics (serialized via serde into the writer).
#[derive(Debug, Clone, Serialize)]
pub struct PerRequestRow {
    pub req_id: u64,
    pub arrival: f64,
    pub ttft: f64,
    pub e2e: f64,
    pub instance: u32,
    pub total_blocks: u32,
    pub l0_hit_blocks: u32,
    pub l1_hit_blocks: u32,
    pub remote_hit_blocks: u32,
    pub miss_blocks: u32,
    pub rdma_bytes: u64,
    pub pcie_bytes: u64,
    pub probe_overhead_s: f64,
}

// Thin CSV writer for PerRequestRow records.
pub struct PerRequestWriter {
    inner: csv::Writer,
}

impl PerRequestWriter {
    pub fn create>(path: P) -> Result {
        let f = std::fs::File::create(path)?;
        let inner = csv::Writer::from_writer(f);
        Ok(Self { inner })
    }

    pub fn write(&mut self, row: &PerRequestRow) -> Result<()> {
        self.inner.serialize(row)?;
        Ok(())
    }

    // Consume the writer and flush buffered rows to disk.
    pub fn finish(mut self) -> Result<()> {
        self.inner.flush()?;
        Ok(())
    }
}
diff --git a/src/metrics/routing_log.rs b/src/metrics/routing_log.rs
new file mode 100644
index 0000000..5e3817a
--- /dev/null
+++ b/src/metrics/routing_log.rs
@@ -0,0 +1,29 @@
use anyhow::Result;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;

use crate::router::RouteDecision;

// JSON-lines writer for per-request routing decisions.
pub struct RoutingLogWriter {
    inner: BufWriter,
}

impl RoutingLogWriter {
    pub fn create>(path: P) -> Result {
        let f = File::create(path)?;
        Ok(Self { inner: BufWriter::new(f) })
    }

    pub fn write(&mut
self, decision: &RouteDecision) -> Result<()> {
        // One JSON object per line (JSONL).
        let line = serde_json::to_string(decision)?;
        self.inner.write_all(line.as_bytes())?;
        self.inner.write_all(b"\n")?;
        Ok(())
    }

    pub fn finish(mut self) -> Result<()> {
        self.inner.flush()?;
        Ok(())
    }
}
diff --git a/src/metrics/summary.rs b/src/metrics/summary.rs
new file mode 100644
index 0000000..95d45ca
--- /dev/null
+++ b/src/metrics/summary.rs
@@ -0,0 +1,80 @@
use serde::Serialize;

use crate::metrics::per_request::PerRequestRow;

// Run-level aggregate over all PerRequestRow records: latency percentiles,
// block hit-rate breakdown, and total link traffic.
#[derive(Debug, Clone, Serialize, Default)]
pub struct Summary {
    pub router: String,
    pub num_requests: u64,
    pub sim_duration_s: f64,
    pub throughput_req_per_s: f64,
    pub ttft_mean: f64,
    pub ttft_p50: f64,
    pub ttft_p95: f64,
    pub ttft_p99: f64,
    pub e2e_mean: f64,
    pub e2e_p50: f64,
    pub e2e_p95: f64,
    pub e2e_p99: f64,
    pub total_blocks: u64,
    pub hit_rate_l0: f64,
    pub hit_rate_l1: f64,
    pub hit_rate_remote: f64,
    pub miss_rate: f64,
    pub total_rdma_bytes: u64,
    pub total_pcie_bytes: u64,
}

impl Summary {
    /// Aggregate per-request rows into one summary; an empty slice yields a
    /// zeroed summary carrying only the router name.
    pub fn from_rows(router: &str, rows: &[PerRequestRow], sim_duration_s: f64) -> Self {
        if rows.is_empty() {
            return Summary {
                router: router.to_string(),
                ..Default::default()
            };
        }
        let mut ttfts: Vec = rows.iter().map(|r| r.ttft).collect();
        let mut e2es: Vec = rows.iter().map(|r| r.e2e).collect();
        ttfts.sort_by(|a, b| a.partial_cmp(b).unwrap());
        e2es.sort_by(|a, b| a.partial_cmp(b).unwrap());
        // Nearest-rank percentile on a sorted slice, clamped to the last index.
        let pct = |v: &[f64], q: f64| -> f64 {
            let n = v.len();
            let idx = ((n as f64 - 1.0) * q).round() as usize;
            v[idx.min(n - 1)]
        };
        let mean = |v: &[f64]| -> f64 {
            if v.is_empty() {
                0.0
            } else {
                v.iter().sum::() / v.len() as f64
            }
        };
        let total_blocks: u64 = rows.iter().map(|r| r.total_blocks as u64).sum();
        let l0: u64 = rows.iter().map(|r| r.l0_hit_blocks as u64).sum();
        let l1: u64 = rows.iter().map(|r| r.l1_hit_blocks as u64).sum();
        let remote: u64 = rows.iter().map(|r| r.remote_hit_blocks as u64).sum();
        let miss: u64 = rows.iter().map(|r| r.miss_blocks as u64).sum();
        // Guard against division by zero when no blocks were recorded.
        let denom = total_blocks.max(1) as f64;
        Summary {
            router: router.to_string(),
            num_requests: rows.len() as u64,
            sim_duration_s,
            throughput_req_per_s: rows.len() as f64 / sim_duration_s.max(1e-9),
            ttft_mean: mean(&ttfts),
            ttft_p50: pct(&ttfts, 0.50),
            ttft_p95: pct(&ttfts, 0.95),
            ttft_p99: pct(&ttfts, 0.99),
            e2e_mean: mean(&e2es),
            e2e_p50: pct(&e2es, 0.50),
            e2e_p95: pct(&e2es, 0.95),
            e2e_p99: pct(&e2es, 0.99),
            total_blocks,
            hit_rate_l0: l0 as f64 / denom,
            hit_rate_l1: l1 as f64 / denom,
            hit_rate_remote: remote as f64 / denom,
            miss_rate: miss as f64 / denom,
            total_rdma_bytes: rows.iter().map(|r| r.rdma_bytes).sum(),
            total_pcie_bytes: rows.iter().map(|r| r.pcie_bytes).sum(),
        }
    }
}
diff --git a/src/metrics/timeseries.rs b/src/metrics/timeseries.rs
new file mode 100644
index 0000000..327cc0c
--- /dev/null
+++ b/src/metrics/timeseries.rs
@@ -0,0 +1,34 @@
use anyhow::Result;
use serde::Serialize;
use std::path::Path;

// One sampled point of per-instance state (queue depth / KV occupancy).
#[derive(Debug, Clone, Serialize)]
pub struct TimeseriesRow {
    pub t: f64,
    pub instance: u32,
    pub queue_len: u32,
    pub kv_blocks_used: u32,
    pub kv_blocks_total: u32,
    pub busy: u8,
}

// Thin CSV writer for TimeseriesRow records.
pub struct TimeseriesWriter {
    inner: csv::Writer,
}

impl TimeseriesWriter {
    pub fn create>(path: P) -> Result {
        let f = std::fs::File::create(path)?;
        Ok(Self { inner: csv::Writer::from_writer(f) })
    }

    pub fn write(&mut self, row: &TimeseriesRow) -> Result<()> {
        self.inner.serialize(row)?;
        Ok(())
    }

    pub fn finish(mut self) -> Result<()> {
        self.inner.flush()?;
        Ok(())
    }
}
diff --git a/src/network.rs b/src/network.rs
new file mode 100644
index 0000000..0bc1984
--- /dev/null
+++ b/src/network.rs
@@ -0,0 +1,84 @@
//! Network cost models for RDMA (cross-instance) and PCIe (host<->GPU).
//!
//! Each link is modeled as a token bucket via a `next_free` cursor: a fetch of
//! `bytes` starting at `now` waits until `next_free`, then advances the cursor
//! by `bytes / bw`. Latency is added on top of transfer time. This captures
//! contention without simulating individual packets.

use crate::config::HardwareConfig;

// A single serialized link: `next_free` is the absolute time at which the
// link's queue drains.
#[derive(Debug, Clone)]
pub struct LinkModel {
    pub bw_bytes_per_s: f64,
    pub latency_s: f64,
    next_free: f64,
}

impl LinkModel {
    // NOTE(review): a zero `bw_bytes_per_s` makes `reserve`/`cost` divide by
    // zero (infinite times) — confirm configs always supply positive
    // bandwidths.
    pub fn new(bw_bytes_per_s: f64, latency_s: f64) -> Self {
        Self {
            bw_bytes_per_s,
            latency_s,
            next_free: 0.0,
        }
    }

    /// Reserve a transfer of `bytes` starting at `now`. Returns the absolute
    /// time at which the bytes have all arrived (advances internal cursor).
    pub fn reserve(&mut self, now: f64, bytes: u64) -> f64 {
        if bytes == 0 {
            // Zero-byte fetch: pay only latency, and do not occupy the link.
            return now + self.latency_s;
        }
        let xfer = bytes as f64 / self.bw_bytes_per_s;
        let start = self.next_free.max(now);
        self.next_free = start + xfer;
        self.next_free + self.latency_s
    }

    /// Pure cost (no contention): how long to push `bytes` over this link.
    pub fn cost(&self, bytes: u64) -> f64 {
        if bytes == 0 {
            self.latency_s
        } else {
            self.latency_s + bytes as f64 / self.bw_bytes_per_s
        }
    }
}

/// Per-instance bundle of links: PCIe (host<->GPU) and RDMA (host<->remote).
#[derive(Debug, Clone)]
pub struct InstanceLinks {
    pub pcie: LinkModel,
    pub rdma: LinkModel,
}

impl InstanceLinks {
    // Build both links from the hardware config (latencies given in µs).
    pub fn from_hw(hw: &HardwareConfig) -> Self {
        Self {
            pcie: LinkModel::new(hw.pcie_bw, hw.pcie_latency_us * 1e-6),
            rdma: LinkModel::new(hw.rdma_bw, hw.rdma_latency_us * 1e-6),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn link_cost_matches_formula() {
        let l = LinkModel::new(1.0e9, 1.0e-6);
        // 1 GB / (1 GB/s) = 1s, plus 1us latency
        let t = l.cost(1_000_000_000);
        assert!((t - (1.0 + 1e-6)).abs() < 1e-9);
    }

    #[test]
    fn reserve_serializes_concurrent_transfers() {
        let mut l = LinkModel::new(1.0e9, 0.0);
        let t1 = l.reserve(0.0, 500_000_000); // 0.5s
        let t2 = l.reserve(0.0, 500_000_000); // contended -> 1.0s
        assert!((t1 - 0.5).abs() < 1e-9);
        assert!((t2 - 1.0).abs() < 1e-9);
    }
}
diff --git a/src/oracle.rs b/src/oracle.rs
new file mode 100644
index 0000000..75523b6
--- /dev/null
+++ b/src/oracle.rs
@@ -0,0 +1,279 @@
//! Offline oracle analyzers for upper-bound KV-cache hit rates.
//!
//! Two analyses, both treating the cluster as a single aggregated cache so
//! the result is independent of routing — i.e. they answer the question
//! "what is the best the cluster could possibly do?":
//!
//! 1. **Unlimited capacity**: longest-prefix-match against an unbounded
//!    cache. The only misses are blocks that the prefix walk encounters for
//!    the first time. Sets the absolute ceiling.
//!
//! 2. **Belady (offline optimal eviction) at finite capacity**: classic
//!    OPT replacement — evict the cached block whose *next* access is
//!    furthest in the future. Run alongside an LRU baseline at the same
//!    capacity so the gap tells you how much room LRU is leaving.
//!
//! Hit accounting uses prefix-match semantics matching the rest of the
//! simulator: a block at position k in a request counts as a hit only if
//! all positions 0..k are also in the cache.
+ +use ahash::{AHashMap, AHashSet}; +use serde::Serialize; +use std::collections::BinaryHeap; + +use crate::instance::kv_cache::LruBlocks; +use crate::trace::RequestRecord; + +#[derive(Debug, Clone, Serialize)] +pub struct OracleResult { + pub num_requests: u64, + pub total_blocks: u64, + pub unique_blocks: u64, + pub unlimited: TierResult, + pub belady_finite: TierResult, + pub lru_finite: TierResult, +} + +#[derive(Debug, Clone, Serialize, Default)] +pub struct TierResult { + pub label: String, + pub capacity_blocks: u64, + pub hits: u64, + pub misses: u64, + pub hit_rate: f64, +} + +impl TierResult { + fn from_counts(label: &str, capacity_blocks: u64, hits: u64, total: u64) -> Self { + let misses = total.saturating_sub(hits); + TierResult { + label: label.to_string(), + capacity_blocks, + hits, + misses, + hit_rate: if total == 0 { 0.0 } else { hits as f64 / total as f64 }, + } + } +} + +pub fn analyze(records: &[RequestRecord], capacity_blocks: u64) -> OracleResult { + // total / unique counters + let total_blocks: u64 = records.iter().map(|r| r.hash_ids.len() as u64).sum(); + let mut unique = AHashSet::new(); + for r in records { + for &h in &r.hash_ids { + unique.insert(h); + } + } + + // 1. Unlimited cache + let unlimited_hits = run_unlimited(records); + let unlimited = TierResult::from_counts( + "unlimited", + u64::MAX, + unlimited_hits, + total_blocks, + ); + + // 2. Precompute next-use index for Belady + let next_use = build_next_use(records); + + // 3. Belady at the given capacity + let belady_hits = run_belady(records, &next_use, capacity_blocks as usize); + let belady = TierResult::from_counts("belady", capacity_blocks, belady_hits, total_blocks); + + // 4. 
LRU baseline at the same capacity
+    let lru_hits = run_lru(records, capacity_blocks as usize);
+    let lru = TierResult::from_counts("lru", capacity_blocks, lru_hits, total_blocks);
+
+    OracleResult {
+        num_requests: records.len() as u64,
+        total_blocks,
+        unique_blocks: unique.len() as u64,
+        unlimited,
+        belady_finite: belady,
+        lru_finite: lru,
+    }
+}
+
+fn run_unlimited(records: &[RequestRecord]) -> u64 {
+    let mut seen: AHashSet<u64> = AHashSet::with_capacity(1 << 18);
+    let mut hits: u64 = 0;
+    for r in records {
+        // Longest prefix match against `seen`
+        for &h in &r.hash_ids {
+            if seen.contains(&h) {
+                hits += 1;
+            } else {
+                break;
+            }
+        }
+        for &h in &r.hash_ids {
+            seen.insert(h);
+        }
+    }
+    hits
+}
+
+fn run_lru(records: &[RequestRecord], capacity: usize) -> u64 {
+    if capacity == 0 {
+        return 0;
+    }
+    let mut cache = LruBlocks::new(capacity);
+    let mut hits: u64 = 0;
+    let mut evicted = Vec::new();
+    for r in records {
+        hits += cache.longest_prefix(&r.hash_ids) as u64;
+        evicted.clear();
+        cache.insert_blocks(&r.hash_ids, &mut evicted);
+    }
+    hits
+}
+
+/// For each (request_idx, position_in_hash_ids) compute the next request
+/// index whose `hash_ids` contains the same block (`u32::MAX` if none).
+fn build_next_use(records: &[RequestRecord]) -> Vec<Vec<u32>> {
+    let n = records.len();
+    let mut next_use: Vec<Vec<u32>> = Vec::with_capacity(n);
+    for r in records {
+        next_use.push(vec![u32::MAX; r.hash_ids.len()]);
+    }
+    let mut last_seen: AHashMap<u64, u32> = AHashMap::with_capacity(1 << 18);
+    for i in (0..n).rev() {
+        let r = &records[i];
+        for (j, &h) in r.hash_ids.iter().enumerate() {
+            next_use[i][j] = *last_seen.get(&h).unwrap_or(&u32::MAX);
+        }
+        for &h in &r.hash_ids {
+            last_seen.insert(h, i as u32);
+        }
+    }
+    next_use
+}
+
+/// Belady (offline OPT) eviction over the trace.
+///
+/// Implementation: lazy-deletion max-heap keyed by next-use index. Each
+/// cache entry has a version; the heap may contain stale entries from
+/// previous insertions, which we skip on pop.
+fn run_belady(records: &[RequestRecord], next_use: &[Vec<u32>], capacity: usize) -> u64 {
+    if capacity == 0 {
+        return 0;
+    }
+    // block_hash -> (current_version, current_next_use)
+    let mut in_cache: AHashMap<u64, (u64, u32)> = AHashMap::with_capacity(capacity);
+    // (next_use, version, block_hash) — BinaryHeap is max-heap, which is what
+    // we want for "evict the entry whose next access is furthest".
+    let mut heap: BinaryHeap<(u32, u64, u64)> = BinaryHeap::with_capacity(capacity);
+    let mut version: u64 = 0;
+    let mut hits: u64 = 0;
+
+    for (i, r) in records.iter().enumerate() {
+        // 1. Longest-prefix hit accounting against current cache.
+        for &h in &r.hash_ids {
+            if in_cache.contains_key(&h) {
+                hits += 1;
+            } else {
+                break;
+            }
+        }
+
+        // 2. Insert / update each block in the request with its new next-use.
+        for (j, &h) in r.hash_ids.iter().enumerate() {
+            let nu = next_use[i][j];
+            if let Some(slot) = in_cache.get_mut(&h) {
+                version += 1;
+                slot.0 = version;
+                slot.1 = nu;
+                heap.push((nu, version, h));
+                continue;
+            }
+            // Need to make room?
+            if in_cache.len() == capacity {
+                // Evict max next_use entry, skipping stale heap entries.
+                loop {
+                    let (nu_top, ver_top, h_top) = match heap.pop() {
+                        Some(x) => x,
+                        None => break,
+                    };
+                    if let Some(&(cur_ver, cur_nu)) = in_cache.get(&h_top) {
+                        if cur_ver == ver_top && cur_nu == nu_top {
+                            in_cache.remove(&h_top);
+                            break;
+                        }
+                    }
+                    // stale; loop
+                }
+            }
+            version += 1;
+            in_cache.insert(h, (version, nu));
+            heap.push((nu, version, h));
+        }
+    }
+
+    hits
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn req(id: u64, t: f64, hashes: Vec<u64>) -> RequestRecord {
+        RequestRecord {
+            req_id: id,
+            chat_id: id as i64,
+            arrival: t,
+            input_len: (hashes.len() as u32) * 16,
+            output_len: 16,
+            hash_ids: hashes,
+        }
+    }
+
+    #[test]
+    fn unlimited_first_occurrence_misses() {
+        let recs = vec![
+            req(0, 0.0, vec![1, 2, 3]),
+            req(1, 1.0, vec![1, 2, 3, 4]),
+            req(2, 2.0, vec![1, 2, 3, 4, 5]),
+        ];
+        let out = analyze(&recs, 100);
+        // total = 3 + 4 + 5 = 12
+        assert_eq!(out.total_blocks, 12);
+        // unique = {1,2,3,4,5} = 5
+        assert_eq!(out.unique_blocks, 5);
+        // unlimited hits = 0 (req 0 all miss) + 3 (req 1 has [1,2,3] cached, then 4 miss) + 4
+        assert_eq!(out.unlimited.hits, 7);
+        assert!((out.unlimited.hit_rate - 7.0 / 12.0).abs() < 1e-9);
+    }
+
+    #[test]
+    fn belady_beats_lru_when_lru_thrashes() {
+        // Capacity 2. Pattern designed so LRU thrashes but Belady keeps the
+        // useful block: A B A C A B A C A ...
+ let mut recs = Vec::new(); + let pattern = [1u64, 2, 1, 3, 1, 2, 1, 3]; + for (i, &h) in pattern.iter().enumerate() { + recs.push(req(i as u64, i as f64, vec![h])); + } + let out = analyze(&recs, 2); + assert!( + out.belady_finite.hits >= out.lru_finite.hits, + "belady should be at least as good as lru: belady={} lru={}", + out.belady_finite.hits, + out.lru_finite.hits + ); + } + + #[test] + fn unlimited_is_upper_bound() { + let recs = vec![ + req(0, 0.0, vec![10, 20, 30]), + req(1, 1.0, vec![10, 20, 30, 40, 50]), + req(2, 2.0, vec![60]), + req(3, 3.0, vec![10, 20, 30, 40, 50, 60]), + ]; + let out = analyze(&recs, 3); + assert!(out.unlimited.hit_rate >= out.belady_finite.hit_rate); + assert!(out.belady_finite.hit_rate >= out.lru_finite.hit_rate - 1e-9); + } +} diff --git a/src/router/cache_load.rs b/src/router/cache_load.rs new file mode 100644 index 0000000..b919100 --- /dev/null +++ b/src/router/cache_load.rs @@ -0,0 +1,89 @@ +//! Load-filtered cache-aware routing. +//! +//! **Step 1** — filter: sort all instances by `queue_len` ascending and take the +//! least-loaded quarter (≥ 2 instances). +//! +//! **Step 2** — select: among that pool, pick the instance with the highest +//! meta-store prefix score. Tiebreak on lowest `queue_len`. +//! +//! This cleanly separates concerns: step 1 guarantees the request won't land +//! on a saturated instance, while step 2 maximises cache reuse within the +//! load-safe pool. The 1/4 fraction keeps the pool large enough that good +//! cache candidates are rarely excluded. 
+
+use crate::cluster::meta_store::MetaStore;
+use crate::instance::Instance;
+use crate::router::{CandidateInfo, RouteDecision, Router};
+use crate::trace::RequestRecord;
+
+pub struct CacheLoadRouter;
+
+impl CacheLoadRouter {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl Default for CacheLoadRouter {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Router for CacheLoadRouter {
+    fn name(&self) -> &'static str {
+        "cache_load"
+    }
+
+    fn route(
+        &mut self,
+        req: &RequestRecord,
+        instances: &[Instance],
+        meta: &MetaStore,
+        now: f64,
+    ) -> RouteDecision {
+        let n = instances.len();
+        let scores = meta.score_prefix(&req.hash_ids, now, n);
+
+        // Step 1: least-loaded 1/4 of instances (by queue_len).
+        let pool_size = (n / 4).max(2).min(n);
+        let mut indices: Vec<usize> = (0..n).collect();
+        indices.sort_by_key(|&i| instances[i].queue_len());
+        let pool = &indices[..pool_size];
+
+        // Step 2: among the pool, pick highest prefix score.
+        // Tiebreak: lowest queue_len.
+        let mut best_idx = pool[0];
+        let mut best_prefix = scores[pool[0]];
+        let mut best_queue = instances[pool[0]].queue_len();
+
+        for &i in &pool[1..] {
+            let p = scores[i];
+            let q = instances[i].queue_len();
+            if p > best_prefix || (p == best_prefix && q < best_queue) {
+                best_idx = i;
+                best_prefix = p;
+                best_queue = q;
+            }
+        }
+
+        let mut candidates = Vec::with_capacity(pool_size);
+        for &i in pool {
+            candidates.push(CandidateInfo {
+                instance: instances[i].id,
+                predicted_prefix: scores[i],
+                load_blocks: instances[i].kv_blocks_used,
+                queue_len: instances[i].queue_len(),
+            });
+        }
+
+        RouteDecision {
+            req_id: req.req_id,
+            mode: "cache_load",
+            chosen: instances[best_idx].id,
+            probe_overhead_s: 0.0,
+            candidates,
+            reason: "least-loaded 1/4, then best prefix",
+        }
+    }
+}
diff --git a/src/router/cache_score.rs b/src/router/cache_score.rs
new file mode 100644
index 0000000..495d5b1
--- /dev/null
+++ b/src/router/cache_score.rs
@@ -0,0 +1,111 @@
+//!
Combined-score cache-aware routing with exponential weighting. +//! +//! Each instance is scored by: +//! +//! ```text +//! score_i = 2^(α · load_i + β · miss_i) +//! ``` +//! +//! where +//! +//! - `load_i = queue_len()` — requests pending or prefilling on instance i, +//! - `miss_i = input_blocks − prefix_blocks` — cache-miss blocks, +//! - `α` = `score_alpha` (YAML config, default 1.0), +//! - `β` = `score_beta` (YAML config, default 0.1). +//! +//! The instance with the **lowest** score is chosen. Since `2^x` is +//! monotonic, this is equivalent to minimising the linear exponent +//! `α·load + β·miss`, but the exponential framing highlights that +//! differences are amplified exponentially — a small edge in the exponent +//! creates a large gap in score. +//! +//! **Tuning guide**: +//! +//! - `α` controls how aggressively load is penalised. +//! - `β` controls how aggressively cache misses are penalised. +//! - Ratio `α/β` is what matters: higher → more load-sensitive. +//! - Defaults (`α=1.0, β=0.1`): 1 extra queue position ≈ 10 extra miss +//! blocks, which is a good starting point when block_size is large (512) +//! and queues are short (0–10). +//! +//! Ties are broken by fewest `queue_len`, then highest `prefix`. 
+ +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct CacheScoreRouter { + alpha: f64, + beta: f64, +} + +impl CacheScoreRouter { + pub fn new(alpha: f64, beta: f64) -> Self { + Self { alpha, beta } + } +} + +impl Router for CacheScoreRouter { + fn name(&self) -> &'static str { + "cache_score" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let scores = meta.score_prefix(&req.hash_ids, now, n); + let input_blocks = req.hash_ids.len() as f64; + + let mut best_idx: usize = 0; + let mut best_exp = f64::INFINITY; + let mut best_queue = u32::MAX; + let mut best_prefix = 0u32; + let mut candidates = Vec::with_capacity(n); + + for (i, inst) in instances.iter().enumerate() { + let prefix = scores[i] as f64; + let miss = (input_blocks - prefix).max(0.0); + let q = inst.queue_len() as f64; + + // Minimise the exponent: α·load + β·miss + // (equivalent to minimising 2^exponent) + let exponent = self.alpha * q + self.beta * miss; + + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: scores[i], + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + + // Minimise (exponent, queue_len ASC, prefix DESC). 
+ let better = exponent < best_exp + || (exponent == best_exp && inst.queue_len() < best_queue) + || (exponent == best_exp + && inst.queue_len() == best_queue + && scores[i] > best_prefix); + + if better { + best_exp = exponent; + best_idx = i; + best_queue = inst.queue_len(); + best_prefix = scores[i]; + } + } + + RouteDecision { + req_id: req.req_id, + mode: "cache_score", + chosen: instances[best_idx].id, + probe_overhead_s: 0.0, + candidates, + reason: "argmin 2^(α·load + β·miss)", + } + } +} diff --git a/src/router/estimated_ttft.rs b/src/router/estimated_ttft.rs new file mode 100644 index 0000000..314872e --- /dev/null +++ b/src/router/estimated_ttft.rs @@ -0,0 +1,128 @@ +//! First-principles TTFT-optimal routing. +//! +//! Estimates the actual time-to-first-token for each candidate instance: +//! +//! `TTFT(r,i) = drain(i) + fetch(r,i) + prefill(miss)` +//! +//! - **drain** — exact queue drain time: sum of per-request `prefill_time()` +//! using the architecture-aware compute model (quadratic / DSA). +//! +//! - **fetch** — RDMA fetch time for blocks cached elsewhere in the cluster +//! but not on instance `i` locally. +//! +//! - **prefill** — compute for cluster-wide cache-miss tokens (constant +//! across instances, cancels in the argmin). +//! +//! The router minimises `drain(i) + fetch(r,i)`, with ties broken by +//! lowest `queue_len` then most local cache. The fetch overlap with queue +//! drain is handled by keeping the additive form: this gives double +//! incentive to prefer instances with local cache, which empirically +//! outperforms the `max(drain, fetch)` alternative because even small +//! RDMA savings compound across thousands of routing decisions. + +use crate::cluster::meta_store::MetaStore; +use crate::config::Config; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct EstimatedTtftRouter { + /// Bytes per KV block (for RDMA cost estimation). 
+ kv_block_bytes: f64, + /// RDMA bandwidth in bytes/s. + rdma_bw: f64, + /// RDMA per-transfer latency in seconds. + rdma_latency_s: f64, +} + +impl EstimatedTtftRouter { + pub fn new(config: &Config) -> Self { + Self { + kv_block_bytes: config.model.kv_block_bytes() as f64, + rdma_bw: config.hardware.rdma_bw, + rdma_latency_s: config.hardware.rdma_latency_us * 1e-6, + } + } + + /// Estimate RDMA fetch time for `remote_blocks` blocks. + fn fetch_time(&self, remote_blocks: u32) -> f64 { + if remote_blocks == 0 { + return 0.0; + } + let bytes = remote_blocks as f64 * self.kv_block_bytes; + bytes / self.rdma_bw + self.rdma_latency_s + } +} + +impl Router for EstimatedTtftRouter { + fn name(&self) -> &'static str { + "estimated_ttft" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let scores = meta.score_prefix(&req.hash_ids, now, n); + + // Cluster-wide max prefix: blocks reachable via RDMA from any peer. + let cluster_prefix = scores.iter().copied().max().unwrap_or(0); + + let mut best: u32 = 0; + let mut best_cost = f64::INFINITY; + let mut best_queue = u32::MAX; + let mut best_local = 0u32; + let mut candidates = Vec::with_capacity(n); + + for inst in instances { + let i = inst.id as usize; + let local_prefix = scores[i]; + + // 1. Exact queue drain time (architecture-aware, per-request sum). + let drain = inst.estimated_drain_time(); + + // 2. RDMA fetch cost for blocks not locally cached. + let remote_blocks = cluster_prefix.saturating_sub(local_prefix); + let fetch = self.fetch_time(remote_blocks); + + // Additive cost: drain + fetch. + // The additive form gives explicit incentive to prefer local cache + // (lower fetch) even when the queue is non-empty, which reduces + // total RDMA traffic and improves TTFT in aggregate. 
+ let cost = drain + fetch; + + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: local_prefix, + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + + // Minimise (cost, queue_len, -local_prefix). + let ql = inst.queue_len(); + let better = cost < best_cost + || (cost == best_cost && ql < best_queue) + || (cost == best_cost && ql == best_queue && local_prefix > best_local); + + if better { + best_cost = cost; + best = inst.id; + best_queue = ql; + best_local = local_prefix; + } + } + + RouteDecision { + req_id: req.req_id, + mode: "estimated_ttft", + chosen: best, + probe_overhead_s: 0.0, + candidates, + reason: "argmin(drain_time + fetch_time)", + } + } +} diff --git a/src/router/least_loaded.rs b/src/router/least_loaded.rs new file mode 100644 index 0000000..7a722aa --- /dev/null +++ b/src/router/least_loaded.rs @@ -0,0 +1,54 @@ +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct LeastLoadedRouter { + pub alpha: f64, +} + +impl LeastLoadedRouter { + pub fn new(alpha: f64) -> Self { + Self { alpha } + } +} + +impl Router for LeastLoadedRouter { + fn name(&self) -> &'static str { + "least_loaded" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + _meta: &MetaStore, + _now: f64, + ) -> RouteDecision { + let mut best = 0u32; + let mut best_score = f64::INFINITY; + let mut candidates = Vec::with_capacity(instances.len()); + for inst in instances { + let load = inst.kv_blocks_used as f64 + + self.alpha * inst.queue_len() as f64; + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: 0, + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + if load < best_score { + best_score = load; + best = inst.id; + } + } + RouteDecision { + req_id: req.req_id, + mode: "least_loaded", + chosen: best, + probe_overhead_s: 0.0, + candidates, 
+ reason: "argmin(kv_used + alpha * queue_len)", + } + } +} diff --git a/src/router/least_tokens.rs b/src/router/least_tokens.rs new file mode 100644 index 0000000..effdad7 --- /dev/null +++ b/src/router/least_tokens.rs @@ -0,0 +1,73 @@ +//! Least-waiting-tokens routing. +//! +//! Pure load-balancing baseline that picks the instance with the fewest +//! total prefill tokens remaining across its pending and prefilling queues. +//! Unlike `least_loaded` (which mixes KV memory pressure with queue depth), +//! this directly minimises the expected wait time by accounting for the +//! actual compute backlog in tokens. +//! +//! Tiebreak: fewest `queue_len`, then lowest instance ID. + +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct LeastTokensRouter; + +impl LeastTokensRouter { + pub fn new() -> Self { + Self + } +} + +impl Default for LeastTokensRouter { + fn default() -> Self { + Self::new() + } +} + +impl Router for LeastTokensRouter { + fn name(&self) -> &'static str { + "least_tokens" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + _meta: &MetaStore, + _now: f64, + ) -> RouteDecision { + let mut best: u32 = 0; + let mut best_key: (u64, u32) = (u64::MAX, u32::MAX); + let mut candidates = Vec::with_capacity(instances.len()); + + for inst in instances { + let wt = inst.waiting_tokens(); + let ql = inst.queue_len(); + + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: 0, + load_blocks: inst.kv_blocks_used, + queue_len: ql, + }); + + let key = (wt, ql); + if key < best_key { + best_key = key; + best = inst.id; + } + } + + RouteDecision { + req_id: req.req_id, + mode: "least_tokens", + chosen: best, + probe_overhead_s: 0.0, + candidates, + reason: "argmin(waiting_prefill_tokens)", + } + } +} diff --git a/src/router/min_pd.rs b/src/router/min_pd.rs new file mode 100644 index 
0000000..87a3d4f --- /dev/null +++ b/src/router/min_pd.rs @@ -0,0 +1,124 @@ +//! Minimum P*D routing. +//! +//! For each instance compute: +//! - `P` = real prefill tokens this request will do if routed there +//! - `D` = ongoing requests currently on that instance +//! (pending + prefilling) +//! +//! Score = `P * D`, pick the instance that minimizes it. +//! +//! `P` accounts for the **actual** prefill work after the cluster fetch +//! chain runs: the fetch chain serves any block cached anywhere in the +//! cluster (L0 → L1 → remote v6d via RDMA), so prefill compute only runs +//! for blocks that are absent cluster-wide *and* for blocks past the +//! instance-local prefix (the cluster only fetches a contiguous leading +//! prefix — any gap ends the fetch chain and the rest must be prefilled). +//! +//! Concretely, for instance `i`: +//! +//! ```text +//! local_prefix_i = meta_store.score_prefix(req, now)[i] // blocks +//! cluster_prefix = max over all j of meta_store_score[j] // blocks +//! effective_prefix_i = min(cluster_prefix, input_blocks) +//! - if local_prefix_i == cluster_prefix the fetch chain stays local, +//! - otherwise the prefill still skips cluster_prefix blocks because +//! the missing tail is fetched via RDMA from a peer. +//! P_i = (input_blocks - effective_prefix_i) * block_size_tokens +//! ``` +//! +//! This makes `P` nearly instance-independent on well-populated clusters +//! (so `min_pd` degenerates to balanced load with a cache-affinity +//! tiebreak), which is exactly what you want when RDMA is cheap relative +//! to prefill compute. +//! +//! Tiebreaks (essential on 128-instance clusters where many instances are +//! idle and the raw product collapses to zero): +//! 1. minimum `P*D` +//! 2. then minimum `D` — prefer the less-loaded instance +//! 3. then maximum `local_prefix_i` — prefer local affinity to avoid +//! 
paying the RDMA fetch cost when P and D are already tied + +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct MinPdRouter; + +impl MinPdRouter { + pub fn new() -> Self { + Self + } +} + +impl Default for MinPdRouter { + fn default() -> Self { + Self::new() + } +} + +impl Router for MinPdRouter { + fn name(&self) -> &'static str { + "min_pd" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let scores = meta.score_prefix(&req.hash_ids, now, n); + let block_size = instances[0].block_size_tokens as u64; + let input_blocks = req.hash_ids.len() as u64; + + // Cluster-wide max prefix: longest contiguous prefix that EXISTS + // somewhere in the cluster (and will be fetched via remote RDMA if + // not local). This determines the effective prefill work for every + // candidate, not just the one that owns the blocks. + let cluster_prefix_blocks = scores.iter().copied().max().unwrap_or(0) as u64; + let effective_prefix_blocks = cluster_prefix_blocks.min(input_blocks); + let miss_blocks = input_blocks.saturating_sub(effective_prefix_blocks); + let p_base = miss_blocks.saturating_mul(block_size); // tokens to prefill + + let mut candidates = Vec::with_capacity(n); + let mut best: u32 = instances[0].id; + // Minimize (P*D, D, -local_prefix). + // P is nearly instance-independent; D is the real discriminator. + // When tied on D, prefer the instance with the best local prefix + // (avoids the RDMA fetch cost). 
+        let mut best_key: (u128, u64, i64) = (u128::MAX, u64::MAX, i64::MAX);
+
+        for inst in instances {
+            let i = inst.id as usize;
+            let d = inst.queue_len() as u64;
+            let pd = p_base as u128 * d as u128;
+            let local_prefix = scores[i] as i64;
+
+            candidates.push(CandidateInfo {
+                instance: inst.id,
+                predicted_prefix: scores[i],
+                load_blocks: inst.kv_blocks_used,
+                queue_len: inst.queue_len(),
+            });
+
+            // minimize (pd, d, -local_prefix)
+            let key = (pd, d, -local_prefix);
+            if key < best_key {
+                best_key = key;
+                best = inst.id;
+            }
+        }
+
+        RouteDecision {
+            req_id: req.req_id,
+            mode: "min_pd",
+            chosen: best,
+            probe_overhead_s: 0.0,
+            candidates,
+            reason: "argmin(P*D), P=cluster-wide miss tokens, D=ongoing reqs",
+        }
+    }
+}
diff --git a/src/router/mod.rs b/src/router/mod.rs
new file mode 100644
index 0000000..3203382
--- /dev/null
+++ b/src/router/mod.rs
@@ -0,0 +1,80 @@
+//! Cluster-level routing strategies.
+
+pub mod cache_load;
+pub mod cache_score;
+pub mod estimated_ttft;
+pub mod least_loaded;
+pub mod least_tokens;
+pub mod min_pd;
+pub mod precise_aware;
+pub mod prefix_affinity;
+pub mod random;
+pub mod ttl_aware;
+
+use serde::Serialize;
+
+use crate::cluster::meta_store::MetaStore;
+use crate::config::Config;
+use crate::instance::Instance;
+use crate::trace::RequestRecord;
+use crate::types::InstanceId;
+
+#[derive(Debug, Clone, Serialize)]
+pub struct CandidateInfo {
+    pub instance: InstanceId,
+    pub predicted_prefix: u32,
+    pub load_blocks: u32,
+    pub queue_len: u32,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct RouteDecision {
+    pub req_id: u64,
+    pub mode: &'static str,
+    pub chosen: InstanceId,
+    pub probe_overhead_s: f64,
+    pub candidates: Vec<CandidateInfo>,
+    pub reason: &'static str,
+}
+
+pub trait Router: Send {
+    fn name(&self) -> &'static str;
+    fn route(
+        &mut self,
+        req: &RequestRecord,
+        instances: &[Instance],
+        meta: &MetaStore,
+        now: f64,
+    ) -> RouteDecision;
+}
+
+pub fn build(full: &Config, seed: u64) -> Box<dyn Router> {
+    use crate::config::RouterMode::*;
+    let cfg = &full.cluster.router;
+    match cfg.mode {
+        Random => Box::new(random::RandomRouter::new(seed)) as Box<dyn Router>,
+        RoundRobin => Box::new(random::RoundRobinRouter::new()) as Box<dyn Router>,
+        LeastLoaded => {
+            Box::new(least_loaded::LeastLoadedRouter::new(cfg.load_alpha)) as Box<dyn Router>
+        }
+        TtlAware => Box::new(ttl_aware::TtlAwareRouter::new(cfg.load_alpha)) as Box<dyn Router>,
+        Precise => Box::new(precise_aware::PreciseRouter::new(
+            cfg.precise_probe_topk,
+            cfg.precise_probe_latency_us * 1e-6,
+            cfg.load_alpha,
+        )) as Box<dyn Router>,
+        MinPd => Box::new(min_pd::MinPdRouter::new()) as Box<dyn Router>,
+        LeastTokens => Box::new(least_tokens::LeastTokensRouter::new()) as Box<dyn Router>,
+        CacheLoad => Box::new(cache_load::CacheLoadRouter::new()) as Box<dyn Router>,
+        CacheScore => {
+            Box::new(cache_score::CacheScoreRouter::new(cfg.score_alpha, cfg.score_beta))
+                as Box<dyn Router>
+        }
+        EstimatedTtft => {
+            Box::new(estimated_ttft::EstimatedTtftRouter::new(full)) as Box<dyn Router>
+        }
+        PrefixAffinity => {
+            Box::new(prefix_affinity::PrefixAffinityRouter::new(full)) as Box<dyn Router>
+        }
+    }
+}
diff --git a/src/router/precise_aware.rs b/src/router/precise_aware.rs
new file mode 100644
index 0000000..9a973d6
--- /dev/null
+++ b/src/router/precise_aware.rs
@@ -0,0 +1,120 @@
+//! KV-aware routing via meta-store candidate selection + precise probing.
+//!
+//! The global meta store is used as a *candidate pre-filter*: we score
+//! every instance's predicted prefix from the store, take the top-K by
+//! (predicted_prefix DESC, load ASC), and then exact-probe those K
+//! candidates' actual L0+L1 caches to get the true longest prefix. This
+//! catches two cases where the meta store is wrong:
+//!
+//! - the store is stale (block evicted from L0/L1 but TTL not yet up),
+//! - the store undercounts because some blocks' TTL expired individually.
+//!
+//! Because the candidate set is sourced from the meta store rather than
+//! from a load ranking, this router is a strict superset of `ttl_aware`:
+//!
any instance the meta store would pick is a candidate here, and the
+//! exact probe can only move the decision toward a truthfully-better
+//! instance. Each probe adds `probe_latency_s` to the request's
+//! effective arrival time.
+//!
+//! If the meta store returns zero-prefix for every instance (e.g. cold
+//! start, or a request whose blocks have never been seen), we fall back
+//! to the top-K least-loaded instances so we still place the request.
+
+use crate::cluster::meta_store::MetaStore;
+use crate::instance::Instance;
+use crate::router::{CandidateInfo, RouteDecision, Router};
+use crate::trace::RequestRecord;
+
+pub struct PreciseRouter {
+    pub topk: u32,
+    pub probe_latency_s: f64,
+    pub alpha: f64,
+}
+
+impl PreciseRouter {
+    pub fn new(topk: u32, probe_latency_s: f64, alpha: f64) -> Self {
+        Self { topk, probe_latency_s, alpha }
+    }
+
+    fn load_of(&self, inst: &Instance) -> f64 {
+        inst.kv_blocks_used as f64 + self.alpha * inst.queue_len() as f64
+    }
+}
+
+impl Router for PreciseRouter {
+    fn name(&self) -> &'static str {
+        "precise"
+    }
+
+    fn route(
+        &mut self,
+        req: &RequestRecord,
+        instances: &[Instance],
+        meta: &MetaStore,
+        now: f64,
+    ) -> RouteDecision {
+        let n = instances.len();
+        let k = (self.topk as usize).min(n).max(1);
+
+        // 1. Meta-store candidate set: rank all instances by
+        //    (predicted_prefix DESC, load ASC) and take the top-K.
+        let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
+        let any_meta_hit = meta_scores.iter().any(|&p| p > 0);
+
+        let mut ranked: Vec<usize> = (0..n).collect();
+        if any_meta_hit {
+            ranked.sort_by(|&a, &b| {
+                let pa = meta_scores[a];
+                let pb = meta_scores[b];
+                // prefix desc, then load asc
+                pb.cmp(&pa)
+                    .then_with(|| {
+                        self.load_of(&instances[a])
+                            .partial_cmp(&self.load_of(&instances[b]))
+                            .unwrap_or(std::cmp::Ordering::Equal)
+                    })
+            });
+        } else {
+            // Cold start fallback: pure load order.
+ ranked.sort_by(|&a, &b| { + self.load_of(&instances[a]) + .partial_cmp(&self.load_of(&instances[b])) + .unwrap_or(std::cmp::Ordering::Equal) + }); + } + let probed = &ranked[..k]; + + // 2. Exact probe each candidate and pick + // argmax(exact_prefix, tiebreak: -load). + let mut candidates = Vec::with_capacity(k); + let mut best = probed[0] as u32; + let mut best_key: (i64, f64) = (i64::MIN, f64::INFINITY); + for &i in probed { + let inst = &instances[i]; + let l0 = inst.cache.l0.longest_prefix_peek(&req.hash_ids); + let l1 = inst.cache.l1.longest_prefix_peek(&req.hash_ids[l0..]); + let predicted = (l0 + l1) as u32; + let load = self.load_of(inst); + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: predicted, + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + let key = (predicted as i64, -load); + if key > (best_key.0, -best_key.1) { + best_key = (predicted as i64, load); + best = inst.id; + } + } + + RouteDecision { + req_id: req.req_id, + mode: "precise", + chosen: best, + probe_overhead_s: k as f64 * self.probe_latency_s, + candidates, + reason: "exact-probe top-K meta-store candidates", + } + } +} diff --git a/src/router/prefix_affinity.rs b/src/router/prefix_affinity.rs new file mode 100644 index 0000000..66e6a31 --- /dev/null +++ b/src/router/prefix_affinity.rs @@ -0,0 +1,196 @@ +//! Prefix-affinity routing with load-aware fallback. +//! +//! **Key insight**: in real LLM traces, 99%+ of requests share a common +//! system-prompt prefix (dozens to hundreds of 16-token blocks). If we +//! *consistently* route requests with the same prefix to the same small set +//! of instances, L0 (HBM) cache hit rates increase dramatically because the +//! working set per instance is concentrated rather than scattered. +//! +//! Algorithm (rendezvous hashing + drain-time-aware selection): +//! +//! 1. **Fingerprint**: hash the first `K` blocks of the request to produce a +//! 
prefix fingerprint that captures the system prompt identity. +//! +//! 2. **Rendezvous ranking**: for each instance `i`, compute +//! `rendezvous(fingerprint, i)` — a deterministic pseudo-random score. +//! Sort instances by this score descending to get a stable, per-prefix +//! ordering. +//! +//! 3. **Select from top candidates**: among the top `fan_out` instances in +//! the rendezvous ranking, pick the one with the lowest estimated drain +//! time (architecture-aware, per-request sum). This accounts for +//! heterogeneous request sizes in the queue. +//! +//! 4. **Overload fallback**: if all top candidates have queue length above a +//! threshold, expand to the full instance set and use estimated-TTFT +//! scoring (drain + fetch) for the best selection. +//! +//! The combination ensures: +//! - **Cache locality**: same-prefix requests cluster on a few instances, +//! building strong L0 cache entries that benefit subsequent requests. +//! - **Load balance**: within the affinity group, drain-time-aware selection +//! avoids hot-spotting from large-prompt requests. +//! - **Zero overhead**: no per-instance probes needed; fingerprint + +//! rendezvous are pure arithmetic. + +use crate::cluster::meta_store::MetaStore; +use crate::config::Config; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct PrefixAffinityRouter { + /// Number of leading block hashes used for the prefix fingerprint. + prefix_k: usize, + /// Number of top-affinity instances to consider before fallback. + fan_out: usize, + /// Queue-length threshold: if all top candidates exceed this, expand to + /// the full instance set. + overload_threshold: u32, + /// Bytes per KV block (for RDMA cost estimation in fallback path). + kv_block_bytes: f64, + /// RDMA bandwidth in bytes/s. + rdma_bw: f64, + /// RDMA per-transfer latency in seconds. 
+ rdma_latency_s: f64, +} + +impl PrefixAffinityRouter { + pub fn new(config: &Config) -> Self { + let n = config.cluster.num_instances as usize; + let cfg_fan = config.cluster.router.affinity_fan_out; + // fan_out: if configured, use it; otherwise auto = max(2, n/8). + let fan_out = if cfg_fan > 0 { + cfg_fan.min(n) + } else { + (n / 8).max(2).min(n) + }; + Self { + prefix_k: config.cluster.router.prefix_k, + fan_out, + overload_threshold: 4, + kv_block_bytes: config.model.kv_block_bytes() as f64, + rdma_bw: config.hardware.rdma_bw, + rdma_latency_s: config.hardware.rdma_latency_us * 1e-6, + } + } + + /// Compute a prefix fingerprint from the first K block hashes. + fn fingerprint(hash_ids: &[u64], k: usize) -> u64 { + let n = hash_ids.len().min(k); + let mut fp: u64 = 0xcbf29ce484222325; // FNV offset basis + for &h in &hash_ids[..n] { + fp ^= h; + fp = fp.wrapping_mul(0x100000001b3); // FNV prime + } + fp + } + + /// Rendezvous hash: deterministic pseudo-random score for (fingerprint, instance_id). + /// Higher score = higher affinity. + fn rendezvous_score(fp: u64, instance_id: u32) -> u64 { + let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15); + // Splitmix64 finalizer + h = h.wrapping_add(0x9e3779b97f4a7c15); + h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9); + h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb); + h ^ (h >> 31) + } + + /// Estimate RDMA fetch time for `remote_blocks` blocks. 
+ fn fetch_time(&self, remote_blocks: u32) -> f64 { + if remote_blocks == 0 { + return 0.0; + } + let bytes = remote_blocks as f64 * self.kv_block_bytes; + bytes / self.rdma_bw + self.rdma_latency_s + } +} + +impl Router for PrefixAffinityRouter { + fn name(&self) -> &'static str { + "prefix_affinity" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let fp = Self::fingerprint(&req.hash_ids, self.prefix_k); + + // Build rendezvous-ranked list of (score, index). + let mut ranked: Vec<(u64, usize)> = (0..n) + .map(|i| (Self::rendezvous_score(fp, instances[i].id), i)) + .collect(); + ranked.sort_unstable_by(|a, b| b.0.cmp(&a.0)); // descending score + + // Collect candidate info for logging (also needed for fallback). + let scores = meta.score_prefix(&req.hash_ids, now, n); + let candidates: Vec = instances + .iter() + .map(|inst| CandidateInfo { + instance: inst.id, + predicted_prefix: scores[inst.id as usize], + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }) + .collect(); + + // Phase 1: among top fan_out instances, pick lowest drain time. + let top_k = self.fan_out.min(n); + let mut best_idx = ranked[0].1; + let mut best_drain = instances[best_idx].estimated_drain_time(); + let mut best_ql = instances[best_idx].queue_len(); + let mut all_overloaded = best_ql > self.overload_threshold; + + for &(_, idx) in &ranked[1..top_k] { + let drain = instances[idx].estimated_drain_time(); + let ql = instances[idx].queue_len(); + if drain < best_drain || (drain == best_drain && ql < best_ql) { + best_idx = idx; + best_drain = drain; + best_ql = ql; + } + if ql <= self.overload_threshold { + all_overloaded = false; + } + } + + // Phase 2: if all top candidates are overloaded, search globally + // using estimated-TTFT (drain + fetch) for optimal fallback. 
+ let reason; + if all_overloaded { + reason = "affinity fallback: min(drain+fetch)"; + let cluster_prefix = scores.iter().copied().max().unwrap_or(0); + let mut best_cost = f64::INFINITY; + for &(_, idx) in ranked.iter() { + let inst = &instances[idx]; + let drain = inst.estimated_drain_time(); + let local_prefix = scores[idx]; + let remote_blocks = cluster_prefix.saturating_sub(local_prefix); + let cost = drain + self.fetch_time(remote_blocks); + let ql = inst.queue_len(); + if cost < best_cost || (cost == best_cost && ql < best_ql) { + best_cost = cost; + best_idx = idx; + best_ql = ql; + } + } + } else { + reason = "prefix affinity: top-K min drain"; + } + + RouteDecision { + req_id: req.req_id, + mode: "prefix_affinity", + chosen: instances[best_idx].id, + probe_overhead_s: 0.0, + candidates, + reason, + } + } +} diff --git a/src/router/random.rs b/src/router/random.rs new file mode 100644 index 0000000..8457504 --- /dev/null +++ b/src/router/random.rs @@ -0,0 +1,90 @@ +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; +use crate::types::InstanceId; + +pub struct RandomRouter { + rng: ChaCha8Rng, +} + +impl RandomRouter { + pub fn new(seed: u64) -> Self { + Self { rng: ChaCha8Rng::seed_from_u64(seed) } + } +} + +impl Router for RandomRouter { + fn name(&self) -> &'static str { + "random" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + _meta: &MetaStore, + _now: f64, + ) -> RouteDecision { + let n = instances.len(); + let chosen = self.rng.gen_range(0..n) as InstanceId; + RouteDecision { + req_id: req.req_id, + mode: "random", + chosen, + probe_overhead_s: 0.0, + candidates: vec![CandidateInfo { + instance: chosen, + predicted_prefix: 0, + load_blocks: instances[chosen as usize].kv_blocks_used, + queue_len: instances[chosen as usize].queue_len(), + 
}], + reason: "uniform random", + } + } +} + +#[derive(Default)] +pub struct RoundRobinRouter { + next: u32, +} + +impl RoundRobinRouter { + pub fn new() -> Self { + Self::default() + } +} + +impl Router for RoundRobinRouter { + fn name(&self) -> &'static str { + "round_robin" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + _meta: &MetaStore, + _now: f64, + ) -> RouteDecision { + let n = instances.len() as u32; + let chosen = self.next % n; + self.next = self.next.wrapping_add(1); + RouteDecision { + req_id: req.req_id, + mode: "round_robin", + chosen, + probe_overhead_s: 0.0, + candidates: vec![CandidateInfo { + instance: chosen, + predicted_prefix: 0, + load_blocks: instances[chosen as usize].kv_blocks_used, + queue_len: instances[chosen as usize].queue_len(), + }], + reason: "round robin", + } + } +} diff --git a/src/router/ttl_aware.rs b/src/router/ttl_aware.rs new file mode 100644 index 0000000..bbdcec5 --- /dev/null +++ b/src/router/ttl_aware.rs @@ -0,0 +1,59 @@ +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct TtlAwareRouter { + pub alpha: f64, +} + +impl TtlAwareRouter { + pub fn new(alpha: f64) -> Self { + Self { alpha } + } +} + +impl Router for TtlAwareRouter { + fn name(&self) -> &'static str { + "ttl_aware" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let scores = meta.score_prefix(&req.hash_ids, now, n); + let mut best = 0u32; + let mut best_key = (i64::MIN, f64::INFINITY); // maximize prefix, then minimize load + let mut candidates = Vec::with_capacity(n); + for inst in instances { + let p = scores[inst.id as usize]; + let load = inst.kv_blocks_used as f64 + + self.alpha * inst.queue_len() as f64; + candidates.push(CandidateInfo { + instance: inst.id, + 
predicted_prefix: p, + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + let key = (p as i64, -load); + // we want max prefix, min load -> compare (p, -load) lexicographically max + if key > (best_key.0, -best_key.1) { + best_key = (p as i64, load); + best = inst.id; + } + } + RouteDecision { + req_id: req.req_id, + mode: "ttl_aware", + chosen: best, + probe_overhead_s: 0.0, + candidates, + reason: "max meta_store prefix, tie -> least loaded", + } + } +} diff --git a/src/sim/engine.rs b/src/sim/engine.rs new file mode 100644 index 0000000..ee0d548 --- /dev/null +++ b/src/sim/engine.rs @@ -0,0 +1,113 @@ +//! Discrete-event engine. +//! +//! Single-threaded virtual time `f64` seconds. Events are stored in a min-heap +//! keyed by `(time, seq)` so equal-time events fire in insertion order. + +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +use super::events::Event; + +#[derive(Debug)] +struct Slot { + time: f64, + seq: u64, + event: Event, +} + +impl Eq for Slot {} +impl PartialEq for Slot { + fn eq(&self, other: &Self) -> bool { + self.time == other.time && self.seq == other.seq + } +} +impl Ord for Slot { + fn cmp(&self, other: &Self) -> Ordering { + // Reverse so BinaryHeap acts as a min-heap. 
+ other + .time + .partial_cmp(&self.time) + .unwrap_or(Ordering::Equal) + .then_with(|| other.seq.cmp(&self.seq)) + } +} +impl PartialOrd for Slot { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +#[derive(Debug, Default)] +pub struct EventQueue { + heap: BinaryHeap, + seq: u64, + now: f64, +} + +impl EventQueue { + pub fn new() -> Self { + Self::default() + } + + pub fn now(&self) -> f64 { + self.now + } + + pub fn schedule(&mut self, time: f64, event: Event) { + let t = time.max(self.now); + self.seq += 1; + self.heap.push(Slot { time: t, seq: self.seq, event }); + } + + pub fn pop(&mut self) -> Option<(f64, Event)> { + let slot = self.heap.pop()?; + if slot.time > self.now { + self.now = slot.time; + } + Some((slot.time, slot.event)) + } + + pub fn len(&self) -> usize { + self.heap.len() + } + + pub fn is_empty(&self) -> bool { + self.heap.is_empty() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::InstanceId; + + #[test] + fn pops_in_time_order() { + let mut q = EventQueue::new(); + q.schedule(2.0, Event::BatchTick { instance: 0 as InstanceId }); + q.schedule(1.0, Event::BatchTick { instance: 1 }); + q.schedule(1.5, Event::BatchTick { instance: 2 }); + let (t1, _) = q.pop().unwrap(); + let (t2, _) = q.pop().unwrap(); + let (t3, _) = q.pop().unwrap(); + assert!(t1 <= t2 && t2 <= t3); + assert!((t1 - 1.0).abs() < 1e-12); + assert!((t3 - 2.0).abs() < 1e-12); + } + + #[test] + fn equal_time_fifo() { + let mut q = EventQueue::new(); + q.schedule(1.0, Event::BatchTick { instance: 7 }); + q.schedule(1.0, Event::BatchTick { instance: 8 }); + let (_, e1) = q.pop().unwrap(); + let (_, e2) = q.pop().unwrap(); + match (e1, e2) { + (Event::BatchTick { instance: a }, Event::BatchTick { instance: b }) => { + assert_eq!(a, 7); + assert_eq!(b, 8); + } + _ => panic!("wrong events"), + } + } +} diff --git a/src/sim/events.rs b/src/sim/events.rs new file mode 100644 index 0000000..e369fa2 --- /dev/null +++ 
b/src/sim/events.rs
//! Event types for the discrete-event engine.

use crate::types::{InstanceId, ReqId};

/// All events the engine can dispatch.
#[derive(Debug)]
pub enum Event {
    /// New trace request arrives at the cluster router.
    Arrival { req_id: ReqId },
    /// Per-instance scheduler tick (continuous batching).
    BatchTick { instance: InstanceId },
    /// Periodic time-series sample of all instances.
    Sample,
    /// Stop the simulation early (used internally).
    Stop,
}
diff --git a/src/sim/mod.rs b/src/sim/mod.rs
new file mode 100644
index 0000000..938ff5b
--- /dev/null
+++ b/src/sim/mod.rs
pub mod engine;
pub mod events;

pub use engine::EventQueue;
pub use events::Event;
diff --git a/src/trace.rs b/src/trace.rs
new file mode 100644
index 0000000..236c0b5
--- /dev/null
+++ b/src/trace.rs
//! Streaming JSONL reader for the qwen-bailian trace format.
//!
//! Schema (per upstream README):
//!   chat_id: i64
//!   parent_chat_id: i64 (-1 = root)
//!   timestamp: f64 (seconds since trace start)
//!   input_length: i64
//!   output_length: i64
//!   type: string (text/search/image/file)
//!   turn: i64
//!   hash_ids: [i64] (16-token blocks, salted SipHash)

use anyhow::{Context, Result};
use serde::Deserialize;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// Raw on-disk record; unknown fields are ignored, missing ones default.
#[derive(Debug, Clone, Deserialize)]
struct RawRecord {
    #[serde(default)]
    chat_id: i64,
    #[serde(default)]
    timestamp: f64,
    #[serde(default)]
    input_length: i64,
    #[serde(default)]
    output_length: i64,
    #[serde(default)]
    hash_ids: Vec<i64>,
}

/// Normalized request as used by the simulator.
#[derive(Debug, Clone)]
pub struct RequestRecord {
    pub req_id: u64,
    pub chat_id: i64,
    pub arrival: f64,
    pub input_len: u32,
    pub output_len: u32,
    pub hash_ids: Vec<u64>,
}

/// Streaming reader: yields one record per JSONL line, reusing a single
/// line buffer so no per-line allocation occurs.
pub struct TraceReader {
    inner: BufReader<File>,
    next_id: u64,
    line_buf: String,
    max_requests: Option<u64>,
}

impl TraceReader {
    /// Open a trace file; `max_requests` caps how many records are yielded.
    pub fn open<P: AsRef<Path>>(path: P, max_requests: Option<u64>) -> Result<Self> {
        let path = path.as_ref();
        let f = File::open(path)
            .with_context(|| format!("opening trace {}", path.display()))?;
        Ok(Self {
            inner: BufReader::with_capacity(1 << 20, f),
            next_id: 0,
            line_buf: String::with_capacity(4096),
            max_requests,
        })
    }
}

impl Iterator for TraceReader {
    type Item = Result<RequestRecord>;

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(cap) = self.max_requests {
            if self.next_id >= cap {
                return None;
            }
        }
        loop {
            self.line_buf.clear();
            match self.inner.read_line(&mut self.line_buf) {
                Ok(0) => return None, // EOF
                Ok(_) => {
                    let trimmed = self.line_buf.trim();
                    // Skip blank lines rather than failing the whole trace.
                    if trimmed.is_empty() {
                        continue;
                    }
                    let parsed: serde_json::Result<RawRecord> = serde_json::from_str(trimmed);
                    let raw = match parsed {
                        Ok(r) => r,
                        Err(e) => return Some(Err(anyhow::anyhow!("trace parse: {e}"))),
                    };
                    let id = self.next_id;
                    self.next_id += 1;
                    return Some(Ok(RequestRecord {
                        req_id: id,
                        chat_id: raw.chat_id,
                        arrival: raw.timestamp,
                        // Negative lengths in the trace are clamped to 0.
                        input_len: raw.input_length.max(0) as u32,
                        output_len: raw.output_length.max(0) as u32,
                        hash_ids: raw.hash_ids.into_iter().map(|h| h as u64).collect(),
                    }));
                }
                Err(e) => return Some(Err(e.into())),
            }
        }
    }
}
diff --git a/src/types.rs b/src/types.rs new file mode 100644 index 0000000..9bcab09 --- /dev/null +++ b/src/types.rs @@ -0,0 +1,4 @@ +//! Shared simple types. + +pub type InstanceId = u32; +pub type ReqId = u64; diff --git a/tests/smoke.rs b/tests/smoke.rs new file mode 100644 index 0000000..471ff9f --- /dev/null +++ b/tests/smoke.rs @@ -0,0 +1,155 @@ +//! Smoke test: synthesize a small trace with shared prefixes and assert that +//! the cache hit rate is monotonic in router sophistication: +//! random <= least_loaded <= ttl_aware <= precise + +use std::io::Write; + +use kvcache_simulator::config::*; +use kvcache_simulator::driver; + +fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config { + Config { + model: ModelConfig { + name: "test".into(), + num_layers: 4, + num_kv_heads: 2, + head_dim: 64, + dtype_bytes: 2, + block_size_tokens: 16, + flops_per_token_prefill: Some(1.0e9), + attn_quadratic_coeff: Some(64.0), + ..Default::default() + }, + hardware: HardwareConfig { + gpu_flops: 1.0e14, + gpu_mem_bw: 1.0e12, + hbm_bytes: 1.0e9, + dram_bytes: 4.0e9, + pcie_bw: 32.0e9, + pcie_latency_us: 1.0, + rdma_bw: 12.0e9, + rdma_latency_us: 5.0, + max_batch_slots: 32, + prefill_chunk_tokens: 1024, + }, + cluster: ClusterConfig { + num_instances: 4, + meta_store: MetaStoreConfig { ttl_seconds: 1000.0 }, + router: RouterConfig { + mode, + precise_probe_latency_us: 10.0, + precise_probe_topk: 4, + load_alpha: 0.1, + score_alpha: 1.0, + score_beta: 0.1, + prefix_k: 8, + affinity_fan_out: 0, + }, + }, + sim: SimConfig { + trace_path: trace_path.into(), + max_requests: None, + output_dir: out_dir.into(), + sample_interval_s: 0.0, + seed: 7, + }, + } +} + +fn write_synthetic_trace(path: &std::path::Path) { + // 5 distinct conversations, each with 8 turns. Within a conversation, + // turn k+1 reuses the prefix of turn k (shared first ~10 blocks) and + // appends a few new blocks. This is the canonical KV-prefix-cache pattern. 
+ let mut f = std::fs::File::create(path).unwrap(); + let mut t = 0.0_f64; + let mut req_id_counter: i64 = 0; + for conv in 0..5i64 { + let mut prefix: Vec = (0..10).map(|i| conv * 1_000_000 + i).collect(); + for turn in 0..8 { + let mut hashes = prefix.clone(); + // Append 2 new blocks unique to this turn + for j in 0..2 { + let h = conv * 1_000_000 + 100 + (turn as i64) * 10 + j; + hashes.push(h); + } + req_id_counter += 1; + let line = serde_json::json!({ + "chat_id": conv, + "parent_chat_id": -1, + "timestamp": t, + "input_length": (hashes.len() as i64) * 16, + "output_length": 16, // 1 block of decode + "type": "text", + "turn": turn, + "hash_ids": hashes, + }); + writeln!(f, "{}", line).unwrap(); + // Next turn's prefix grows to include this turn's appended blocks + prefix = hashes; + t += 0.05; + } + let _ = req_id_counter; + } +} + +fn run(mode: RouterMode, trace_path: &std::path::Path, out_root: &std::path::Path) + -> kvcache_simulator::metrics::Summary +{ + let cfg = base_config( + trace_path.to_str().unwrap(), + out_root.to_str().unwrap(), + mode, + ); + let res = driver::run(&cfg, Some(mode.as_str())).expect("sim run"); + res.summary +} + +#[test] +fn ablation_hit_rate_ordering() { + let tmp = std::env::temp_dir().join("kvcache_sim_smoke"); + let _ = std::fs::remove_dir_all(&tmp); + std::fs::create_dir_all(&tmp).unwrap(); + let trace_path = tmp.join("trace.jsonl"); + write_synthetic_trace(&trace_path); + + let s_random = run(RouterMode::Random, &trace_path, &tmp); + let s_ll = run(RouterMode::LeastLoaded, &trace_path, &tmp); + let s_ttl = run(RouterMode::TtlAware, &trace_path, &tmp); + let s_prec = run(RouterMode::Precise, &trace_path, &tmp); + + let total_hit = |s: &kvcache_simulator::metrics::Summary| { + s.hit_rate_l0 + s.hit_rate_l1 + s.hit_rate_remote + }; + + let h_rand = total_hit(&s_random); + let h_ll = total_hit(&s_ll); + let h_ttl = total_hit(&s_ttl); + let h_prec = total_hit(&s_prec); + + eprintln!( + "smoke: hit rates random={:.3} 
least_loaded={:.3} ttl={:.3} precise={:.3}", + h_rand, h_ll, h_ttl, h_prec + ); + eprintln!( + " remote+local hit ratio L0/L1/remote: \ + random=({:.2},{:.2},{:.2}) precise=({:.2},{:.2},{:.2})", + s_random.hit_rate_l0, s_random.hit_rate_l1, s_random.hit_rate_remote, + s_prec.hit_rate_l0, s_prec.hit_rate_l1, s_prec.hit_rate_remote, + ); + + // ttl_aware and precise should outperform random / least_loaded for + // a workload built entirely of shared-prefix conversations. + let eps = 1e-6; + assert!( + h_ttl + eps >= h_rand, + "ttl_aware should >= random hit rate" + ); + assert!( + h_prec + eps >= h_rand, + "precise should >= random hit rate" + ); + assert!( + h_prec + eps >= h_ll, + "precise should >= least_loaded hit rate" + ); +}