From ec73a95e0518cd58a70e6946e11b547301246ada Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Tue, 14 Apr 2026 01:16:02 +0800 Subject: [PATCH] KVCache simulator for LLM serving cluster routing research Discrete-event simulator for evaluating KV cache-aware routing policies in prefill-disaggregated LLM serving clusters. Models a two-tier KV cache hierarchy (L0 GPU HBM + L1 CPU DRAM) with RDMA/PCIe link contention, architecture-derived roofline compute (MoE, MLA, DSA), and a cluster-wide meta-store for prefix-aware routing decisions. Includes 11 routing policies (random, round_robin, least_loaded, least_tokens, ttl_aware, precise, min_pd, cache_load, cache_score, estimated_ttft, prefix_affinity), HuggingFace config.json auto-parsing, built-in GPU hardware presets (H100/H800/H20/A100/B200), and ablation tooling for systematic policy comparison across real Alibaba serving traces. Co-Authored-By: Claude Opus 4.6 --- .gitignore | 29 + .gitmodules | 3 + Cargo.lock | 729 ++++++++++++++++++ Cargo.toml | 28 + README.md | 174 +++++ configs/glm5-8xb200-blk512.yaml | 68 ++ configs/glm5-8xb200-hf.yaml | 40 + configs/glm5-8xb200.yaml | 67 ++ configs/qwen2.5-coder-32b-h800.yaml | 42 + configs/qwen2.5-coder-7b-h800.yaml | 42 + configs/qwen2.5-coder-7b-preset.yaml | 36 + configs/qwen3-coder-480b-8xh20.yaml | 29 + models/GLM-5/config.json | 59 ++ .../config.json | 41 + qwen-bailian-usagetraces-anon | 1 + src/cluster/cluster.rs | 167 ++++ src/cluster/meta_store.rs | 161 ++++ src/cluster/mod.rs | 6 + src/config.rs | 510 ++++++++++++ src/driver.rs | 170 ++++ src/hardware_presets.rs | 225 ++++++ src/hf_config.rs | 193 +++++ src/instance/compute.rs | 405 ++++++++++ src/instance/instance.rs | 191 +++++ src/instance/kv_cache.rs | 226 ++++++ src/instance/mod.rs | 6 + src/lib.rs | 13 + src/main.rs | 271 +++++++ src/metrics/mod.rs | 7 + src/metrics/per_request.rs | 42 + src/metrics/routing_log.rs | 29 + src/metrics/summary.rs | 80 ++ src/metrics/timeseries.rs | 34 + src/network.rs | 84 ++ 
src/oracle.rs | 279 +++++++ src/router/cache_load.rs | 89 +++ src/router/cache_score.rs | 111 +++ src/router/estimated_ttft.rs | 128 +++ src/router/least_loaded.rs | 54 ++ src/router/least_tokens.rs | 73 ++ src/router/min_pd.rs | 124 +++ src/router/mod.rs | 80 ++ src/router/precise_aware.rs | 120 +++ src/router/prefix_affinity.rs | 196 +++++ src/router/random.rs | 90 +++ src/router/ttl_aware.rs | 59 ++ src/sim/engine.rs | 113 +++ src/sim/events.rs | 15 + src/sim/mod.rs | 5 + src/trace.rs | 102 +++ src/types.rs | 4 + tests/smoke.rs | 155 ++++ 52 files changed, 6005 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 configs/glm5-8xb200-blk512.yaml create mode 100644 configs/glm5-8xb200-hf.yaml create mode 100644 configs/glm5-8xb200.yaml create mode 100644 configs/qwen2.5-coder-32b-h800.yaml create mode 100644 configs/qwen2.5-coder-7b-h800.yaml create mode 100644 configs/qwen2.5-coder-7b-preset.yaml create mode 100644 configs/qwen3-coder-480b-8xh20.yaml create mode 100644 models/GLM-5/config.json create mode 100644 models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json create mode 160000 qwen-bailian-usagetraces-anon create mode 100644 src/cluster/cluster.rs create mode 100644 src/cluster/meta_store.rs create mode 100644 src/cluster/mod.rs create mode 100644 src/config.rs create mode 100644 src/driver.rs create mode 100644 src/hardware_presets.rs create mode 100644 src/hf_config.rs create mode 100644 src/instance/compute.rs create mode 100644 src/instance/instance.rs create mode 100644 src/instance/kv_cache.rs create mode 100644 src/instance/mod.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/metrics/mod.rs create mode 100644 src/metrics/per_request.rs create mode 100644 src/metrics/routing_log.rs create mode 100644 src/metrics/summary.rs create mode 100644 src/metrics/timeseries.rs create mode 100644 
src/network.rs create mode 100644 src/oracle.rs create mode 100644 src/router/cache_load.rs create mode 100644 src/router/cache_score.rs create mode 100644 src/router/estimated_ttft.rs create mode 100644 src/router/least_loaded.rs create mode 100644 src/router/least_tokens.rs create mode 100644 src/router/min_pd.rs create mode 100644 src/router/mod.rs create mode 100644 src/router/precise_aware.rs create mode 100644 src/router/prefix_affinity.rs create mode 100644 src/router/random.rs create mode 100644 src/router/ttl_aware.rs create mode 100644 src/sim/engine.rs create mode 100644 src/sim/events.rs create mode 100644 src/sim/mod.rs create mode 100644 src/trace.rs create mode 100644 src/types.rs create mode 100644 tests/smoke.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f70b09b --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Trace files +bailian-traces + +# Rust build artifacts +/target/ +**/*.rs.bk + +# Simulation output +/runs/ + +# Editor / IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Profiling / perf +perf.data* +flamegraph.svg +*.prof + +# Temporary test files +/tmp/ +*.log diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..945f71e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "qwen-bailian-usagetraces-anon"] + path = qwen-bailian-usagetraces-anon + url = https://github.com/alibaba-edu/qwen-bailian-usagetraces-anon.git diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..bad079b --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,729 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys 0.59.0", +] + +[[package]] +name = "csv" 
+version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ 
+ "equivalent", + "hashbrown", +] + +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "js-sys" +version = "0.3.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "kvcache-simulator" +version = "0.1.0" +dependencies = [ + "ahash", + "anyhow", + "clap", + "csv", + "indicatif", + "ordered-float", + "rand", + "rand_chacha", + "serde", + "serde_json", + "serde_yaml", + "smallvec", + "thiserror", +] + +[[package]] +name = "libc" +version = "0.2.184" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "ordered-float" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" +dependencies = [ + "num-traits", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.8.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2" +dependencies = [ + "bumpalo", + "proc-macro2", + 
"quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..841e3d1 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "kvcache-simulator" +version = "0.1.0" +edition = "2021" +description = "Discrete-event simulator for cluster-level LLM serving with two-tier KV cache and KV-aware routing ablation." + +[[bin]] +name = "kvcache-sim" +path = "src/main.rs" + +[dependencies] +serde = { version = "1", features = ["derive"] } +serde_json = "1" +serde_yaml = "0.9" +clap = { version = "4", features = ["derive"] } +csv = "1" +ahash = "0.8" +ordered-float = "4" +anyhow = "1" +thiserror = "1" +indicatif = "0.17" +rand = "0.8" +rand_chacha = "0.3" +smallvec = { version = "1", features = ["union"] } + +[profile.release] +lto = "thin" +codegen-units = 1 diff --git a/README.md b/README.md new file mode 100644 index 0000000..1dde7d3 --- /dev/null +++ b/README.md @@ -0,0 +1,174 @@ +# kvcache-simulator + +Discrete-event simulator for cluster-level LLM **prefill** serving with a +two-tier KV cache (GPU HBM + CPU DRAM / v6d) and KV-aware request routing. +Replays real production traces against a synthetic cluster so you can +ablate routing strategies and cache sizing without spinning up any GPUs. + +Assumes **PD (prefill/decode) disaggregation** — only the prefill path is +modeled. + +## Build + +```bash +cargo build --release +# binary: target/release/kvcache-sim +``` + +Fetch the upstream trace (consumed as a git submodule): + +```bash +git submodule update --init --recursive +``` + +## Usage + +### 1. 
Run a single simulation + +```bash +target/release/kvcache-sim run --config configs/qwen2.5-coder-7b-h800.yaml +``` + +Prints `summary.json` to stdout and writes the full output directory +(see [Outputs](#outputs) below). + +### 2. Compare routers on the same trace (ablation) + +```bash +target/release/kvcache-sim ablate \ + --config configs/qwen2.5-coder-7b-h800.yaml \ + --num-instances 64 \ + --output-dir runs/qwen7b_n64 \ + --routers random,least_loaded,ttl_aware,precise +``` + +Writes one subdirectory per router plus a combined +`runs/qwen7b_n64/ablation.json` with side-by-side summaries. + +### 3. Compute theoretical hit-rate ceilings (oracle) + +```bash +# Cluster-aggregate capacity (default) +target/release/kvcache-sim oracle \ + --config configs/qwen2.5-coder-7b-h800.yaml --num-instances 64 + +# A single instance's HBM budget +target/release/kvcache-sim oracle \ + --config configs/qwen2.5-coder-7b-h800.yaml --per-instance + +# Explicit capacity in 16-token blocks +target/release/kvcache-sim oracle \ + --config configs/qwen2.5-coder-7b-h800.yaml --capacity-blocks 200000 +``` + +Reports three numbers: + +- `unlimited.hit_rate` — absolute ceiling (infinite cache) +- `belady_finite.hit_rate` — optimal-eviction ceiling at the given capacity +- `lru_finite.hit_rate` — production LRU at the same capacity + +Gap between `lru_finite` and `belady_finite` = headroom from a smarter +eviction policy. Gap between `belady_finite` and `unlimited` = headroom +only reachable by adding capacity. + +### 4. Validate a config without running + +```bash +target/release/kvcache-sim validate --config configs/qwen2.5-coder-7b-h800.yaml +``` + +Parses the YAML, prints derived per-instance block budgets, and dumps +the first 5 trace records so you can sanity-check the path. 
+ +## CLI overrides + +These flags work on **all** subcommands and override the YAML in place, +so the same config can be reused across sweeps: + +| Flag | Overrides | +|--------------------------|-------------------------------------------| +| `--num-instances ` | `cluster.num_instances` | +| `--max-requests ` | `sim.max_requests` | +| `--trace ` | `sim.trace_path` | +| `--output-dir ` | `sim.output_dir` | +| `--seed ` | `sim.seed` | +| `--precise-topk ` | `cluster.router.precise_probe_topk` | +| `--ttl-seconds ` | `cluster.meta_store.ttl_seconds` | + +`oracle` additionally takes `--capacity-blocks ` / `--per-instance` +and `--out `. `ablate` additionally takes `--routers `. + +## Router modes + +Set `cluster.router.mode` in the YAML or list in `--routers`: + +| Mode | What it does | +|----------------|--------------------------------------------------------------------| +| `random` | Uniform random. Baseline. | +| `round_robin` | Deterministic round-robin. Baseline. | +| `least_loaded` | `argmin(kv_blocks_used + alpha * queue_len)`. KV-blind. | +| `ttl_aware` | Picks instance with longest prefix in the global TTL meta store. | +| `precise` | Probes top-K least-loaded instances' actual caches; charges probe latency into TTFT. | + +Expected hit-rate ordering: `random ≲ least_loaded ≲ ttl_aware ≲ precise`. 
+ +## Outputs + +Each run writes a directory under `sim.output_dir`: + +| File | Contents | +|----------------------|----------------------------------------------------------------------------| +| `summary.json` | Router, throughput, TTFT p50/p95/p99, hit rates per tier, total RDMA/PCIe bytes | +| `per_request.csv` | `req_id,arrival,ttft,e2e,instance,total_blocks,l0_hit,l1_hit,remote_hit,miss,rdma_bytes,pcie_bytes,probe_overhead_s` | +| `instances.csv` | `t,instance,queue_len,kv_blocks_used,kv_blocks_total,busy` per sample | +| `routing_log.jsonl` | One JSON per request: all router candidates + chosen instance + reason | + +For `ablate`: an extra `ablation.json` with one summary per router. +For `oracle`: an `oracle.json` with the three hit-rate analyses. + +### Reading results quickly + +```bash +# Pretty-print the summary +cat runs/qwen7b/summary.json | jq . + +# Compare all routers from an ablation +cat runs/qwen7b_n64/ablation.json | jq '.[] | {router, ttft_p50, hit_rate_l0, total_rdma_bytes}' + +# Hit-rate ceilings vs LRU at the same capacity +cat runs/qwen7b/oracle.json | jq '{unlimited: .unlimited.hit_rate, belady: .belady_finite.hit_rate, lru: .lru_finite.hit_rate}' +``` + +## Config + +A config is a single YAML file with four sections. A working example +lives at +[`configs/qwen2.5-coder-7b-h800.yaml`](configs/qwen2.5-coder-7b-h800.yaml); +copy and edit for other models/hardware. + +```yaml +model: # shape + prefill roofline coefficients +hardware: # per-instance GPU/PCIe/RDMA capabilities + batch knobs +cluster: # num_instances, meta_store TTL, router mode +sim: # trace_path, max_requests, output_dir, seed +``` + +Only prefill-side model coefficients are used; any decode fields in +legacy YAMLs are accepted and ignored. + +## Trace format + +The simulator reads the Alibaba +[`qwen-bailian-usagetraces-anon`](https://github.com/alibaba-edu/qwen-bailian-usagetraces-anon) +JSONL schema. 
Each record has `chat_id`, `timestamp`, `input_length`, +`output_length`, and `hash_ids` (16-token block hashes). Only the +input side is used. + +## Testing + +```bash +cargo test --release +``` + +16 tests: 15 unit + 1 smoke that runs all four routers on a synthetic +shared-prefix trace and asserts the expected hit-rate ordering. diff --git a/configs/glm5-8xb200-blk512.yaml b/configs/glm5-8xb200-blk512.yaml new file mode 100644 index 0000000..cbf1be8 --- /dev/null +++ b/configs/glm5-8xb200-blk512.yaml @@ -0,0 +1,68 @@ +# GLM-5 (zai-org/GLM-5) on 8 x B200 SXM (192GB each). +# Architecture from HuggingFace config.json — all roofline coefficients +# are derived automatically. + +model: + name: glm-5 + # Core architecture (from HF config.json) + num_layers: 78 + hidden_size: 6144 + num_attention_heads: 64 + num_kv_heads: 64 # formalism; MLA overrides KV cache sizing + head_dim: 64 + intermediate_size: 12288 # shared expert FFN width + dtype_bytes: 2 # BF16 + block_size_tokens: 512 # matches bailian-traces blksz_512 + + # MoE: 256 routed + 1 shared, 8 active per token + moe: + num_experts: 256 + num_active_experts: 8 + num_shared_experts: 1 + expert_intermediate_size: 2048 # moe_intermediate_size + + # MLA (Multi-head Latent Attention): compressed KV cache + mla: + kv_lora_rank: 512 + q_lora_rank: 2048 + qk_nope_head_dim: 192 + qk_rope_head_dim: 64 + v_head_dim: 256 + + # DSA (DeepSeek Sparse Attention): sub-quadratic past dense_window + attention: + type: dsa + dense_window: 4096 + sparse_stride: 8 + first_dense_layers: 3 + +hardware: + # Aggregate of 8 x B200 in one tensor-parallel group. + gpu_flops: 1.80e16 # 8 * 2.25 PFLOPS BF16 dense + gpu_mem_bw: 6.40e13 # 8 * 8 TB/s HBM3e + # KV budget after FP8 weights + activations. GLM-5 FP8 ~744GB of 1536GB. 
+ hbm_bytes: 500.0e9 + dram_bytes: 1.5e12 # ~1.5 TB usable CPU DRAM / v6d per node + pcie_bw: 128.0e9 # PCIe Gen6 x16 + pcie_latency_us: 4.0 + rdma_bw: 50.0e9 # ConnectX-7 400 Gbps + rdma_latency_us: 6.0 + max_batch_slots: 256 + prefill_chunk_tokens: 4096 + +cluster: + num_instances: 64 + meta_store: + ttl_seconds: 300.0 + router: + mode: min_pd + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl + max_requests: null + output_dir: runs/glm5_8xb200_blk512 + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/glm5-8xb200-hf.yaml b/configs/glm5-8xb200-hf.yaml new file mode 100644 index 0000000..0cc32d2 --- /dev/null +++ b/configs/glm5-8xb200-hf.yaml @@ -0,0 +1,40 @@ +# GLM-5 using HuggingFace config.json + hardware preset. +# +# This config demonstrates the simplified format: +# model.config_json — loads architecture from HF config.json +# hardware.type — loads GPU specs from built-in preset +# +# Only deployment-specific fields need to be set explicitly. +# Any field from config_json or the preset can be overridden in YAML. + +model: + # Auto-detect architecture: MoE, MLA, DSA, head dims, etc. 
+ config_json: ../models/GLM-5/config.json + name: glm-5 # override HF model_type + dtype_bytes: 1 # FP8 KV cache (not in HF config.json; BF16 would be 2) + block_size_tokens: 512 # matches bailian-traces blksz_512 + +hardware: + type: 8xb200 # 8 x B200 SXM (192GB each) + # Override preset values for this specific deployment: + hbm_bytes: 500.0e9 # KV budget after FP8 weights + activations + dram_bytes: 1.5e12 # ~1.5 TB usable CPU DRAM per node + max_batch_slots: 256 + +cluster: + num_instances: 32 + meta_store: + ttl_seconds: 300.0 + router: + mode: min_pd + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + prefix_k: 8 + +sim: + trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl + max_requests: null + output_dir: runs/glm5_8xb200_hf + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/glm5-8xb200.yaml b/configs/glm5-8xb200.yaml new file mode 100644 index 0000000..0e3c542 --- /dev/null +++ b/configs/glm5-8xb200.yaml @@ -0,0 +1,67 @@ +# GLM-5 (zai-org/GLM-5) served as a single tensor-parallel instance on +# 8 x NVIDIA B200 SXM (192GB each, 1.5 TB aggregate HBM). +# +# GLM-5 is a 744B-total / 40B-active Mixture-of-Experts model (BF16), +# using DeepSeek Sparse Attention (DSA). The HF card does not publish +# layer/head shapes, so the values below are reasonable estimates based +# on the GLM-4.5 lineage; adjust once the official config.json is public. +# +# Hardware values below represent the *aggregate* of the 8-GPU TP group +# (one simulated "instance" == one 8xB200 serving replica). This is how +# the roofline in src/instance/compute.rs wants to see it: gpu_flops and +# gpu_mem_bw are the effective peaks seen by the TP'd model. +# +# Calibrate `flops_per_token_prefill` and `attn_quadratic_coeff` against +# measured prefill latency before trusting absolute TTFT numbers. 
+ +model: + name: glm-5 + # --- estimates; refine from official config.json when available --- + num_layers: 92 + num_kv_heads: 8 # GQA + head_dim: 128 + dtype_bytes: 2 # BF16 + block_size_tokens: 16 # trace convention + # Active-params-driven roofline: MoE activates ~40B params per token, + # so non-attention prefill FLOPs/token ≈ 2 * 40e9 = 8e10. + flops_per_token_prefill: 8.0e10 + # Quadratic attention term ≈ 2 * num_heads * head_dim. GLM-5 uses + # DeepSeek Sparse Attention which is sub-quadratic in practice, so + # this coefficient is an upper bound — lower it if your measurements + # show DSA kicking in for long prompts. + attn_quadratic_coeff: 2048.0 + bytes_per_token_prefill: 0.0 + +hardware: + # Aggregate of 8 x B200 in one tensor-parallel group. + gpu_flops: 1.80e16 # 8 * 2.25 PFLOPS BF16 dense + gpu_mem_bw: 6.40e13 # 8 * 8 TB/s HBM3e + # KV-cache budget after weights + activations. GLM-5 @ BF16 is ~1.49TB, + # which barely fits in 1.5TB HBM; realistic serving uses FP8 weights + # (~744GB), leaving ~500GB for activations + KV cache. Adjust if your + # deployment uses a different weight dtype. 
+ hbm_bytes: 500.0e9 + dram_bytes: 1.5e12 # ~1.5 TB usable CPU DRAM / v6d per node + pcie_bw: 128.0e9 # PCIe Gen6 x16 ~ 128 GB/s per direction + pcie_latency_us: 4.0 + rdma_bw: 50.0e9 # ConnectX-7 400 Gbps ≈ 50 GB/s + rdma_latency_us: 6.0 + max_batch_slots: 256 + prefill_chunk_tokens: 2048 + +cluster: + num_instances: 8 # 8 TP replicas -> 64 B200s cluster-wide + meta_store: + ttl_seconds: 120.0 + router: + mode: ttl_aware + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/glm5_8xb200 + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/qwen2.5-coder-32b-h800.yaml b/configs/qwen2.5-coder-32b-h800.yaml new file mode 100644 index 0000000..cc66942 --- /dev/null +++ b/configs/qwen2.5-coder-32b-h800.yaml @@ -0,0 +1,42 @@ +# Qwen2.5-Coder-32B (dense, GQA) on H800 SXM (80GB). +# Architecture from HuggingFace config.json — roofline auto-derived. + +model: + name: qwen2.5-coder-32b + num_layers: 64 + hidden_size: 5120 + num_attention_heads: 40 + num_kv_heads: 8 # GQA + head_dim: 128 + intermediate_size: 27648 # SwiGLU FFN + dtype_bytes: 2 # BF16 + block_size_tokens: 16 + +hardware: + gpu_flops: 9.89e14 + gpu_mem_bw: 3.35e12 + hbm_bytes: 20.0e9 # smaller budget: 32B weights are large + dram_bytes: 512.0e9 + pcie_bw: 64.0e9 + pcie_latency_us: 5.0 + rdma_bw: 25.0e9 + rdma_latency_us: 8.0 + max_batch_slots: 128 + prefill_chunk_tokens: 1024 + +cluster: + num_instances: 16 + meta_store: + ttl_seconds: 60.0 + router: + mode: ttl_aware + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: traces/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/qwen32b + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/qwen2.5-coder-7b-h800.yaml b/configs/qwen2.5-coder-7b-h800.yaml new file mode 100644 index 0000000..bea5226 --- /dev/null +++ b/configs/qwen2.5-coder-7b-h800.yaml @@ -0,0 +1,42 
@@ +# Qwen2.5-Coder-7B (dense, GQA) on a single H800 SXM (80GB). +# Architecture from HuggingFace config.json — roofline auto-derived. + +model: + name: qwen2.5-coder-7b + num_layers: 28 + hidden_size: 3584 + num_attention_heads: 28 + num_kv_heads: 4 # GQA: 28 query heads, 4 KV heads + head_dim: 128 + intermediate_size: 18944 # SwiGLU FFN + dtype_bytes: 2 # BF16 + block_size_tokens: 16 # matches qwen_coder_blksz_16 trace + +hardware: + gpu_flops: 9.89e14 # H800 bf16 dense + gpu_mem_bw: 3.35e12 # 3.35 TB/s HBM3 + hbm_bytes: 60.0e9 # leave headroom for weights/activations + dram_bytes: 512.0e9 + pcie_bw: 64.0e9 # PCIe Gen5 x16 + pcie_latency_us: 5.0 + rdma_bw: 25.0e9 # ~200 Gbps NIC + rdma_latency_us: 8.0 + max_batch_slots: 256 + prefill_chunk_tokens: 2048 + +cluster: + num_instances: 16 + meta_store: + ttl_seconds: 60.0 + router: + mode: ttl_aware + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/qwen7b + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/qwen2.5-coder-7b-preset.yaml b/configs/qwen2.5-coder-7b-preset.yaml new file mode 100644 index 0000000..a411662 --- /dev/null +++ b/configs/qwen2.5-coder-7b-preset.yaml @@ -0,0 +1,36 @@ +# Qwen2.5-Coder-7B using hardware preset. +# +# Model architecture is specified inline (no config.json needed for simple +# models). Hardware uses preset "h800" with a single override for hbm_bytes. 
+ +model: + name: qwen2.5-coder-7b + num_layers: 28 + hidden_size: 3584 + num_attention_heads: 28 + num_kv_heads: 4 + head_dim: 128 + intermediate_size: 18944 + dtype_bytes: 2 + block_size_tokens: 16 + +hardware: + type: h800 # single H800 SXM (80GB) + hbm_bytes: 60.0e9 # KV budget after 7B model weights + +cluster: + num_instances: 16 + meta_store: + ttl_seconds: 60.0 + router: + mode: ttl_aware + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: qwen-bailian-usagetraces-anon/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/qwen7b_preset + sample_interval_s: 1.0 + seed: 42 diff --git a/configs/qwen3-coder-480b-8xh20.yaml b/configs/qwen3-coder-480b-8xh20.yaml new file mode 100644 index 0000000..a56e16e --- /dev/null +++ b/configs/qwen3-coder-480b-8xh20.yaml @@ -0,0 +1,29 @@ +# Qwen3-Coder-480B-A35B (MoE, GQA) on 8 x H20 (96GB each). +# Architecture auto-loaded from HuggingFace config.json. + +model: + config_json: ../models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json + name: qwen3-coder-480b + dtype_bytes: 1 # FP8 inference + block_size_tokens: 16 + +hardware: + type: 8xh20 + hbm_bytes: 400.0e9 # KV budget after FP8 weights on 8x96GB + +cluster: + num_instances: 32 + meta_store: + ttl_seconds: 120.0 + router: + mode: min_pd + precise_probe_latency_us: 50.0 + precise_probe_topk: 4 + load_alpha: 1.0 + +sim: + trace_path: traces/qwen_coder_blksz_16.jsonl + max_requests: null + output_dir: runs/qwen3_coder_8xh20 + sample_interval_s: 1.0 + seed: 42 diff --git a/models/GLM-5/config.json b/models/GLM-5/config.json new file mode 100644 index 0000000..a34ad26 --- /dev/null +++ b/models/GLM-5/config.json @@ -0,0 +1,59 @@ +{ + "architectures": [ + "GlmMoeDsaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "ep_size": 1, + "first_k_dense_replace": 3, + "hidden_act": "silu", + "head_dim": 64, + "hidden_size": 
6144, + "index_head_dim": 128, + "index_n_heads": 32, + "index_topk": 2048, + "indexer_rope_interleave": true, + "initializer_range": 0.02, + "intermediate_size": 12288, + "kv_lora_rank": 512, + "max_position_embeddings": 202752, + "moe_intermediate_size": 2048, + "moe_layer_freq": 1, + "model_type": "glm_moe_dsa", + "n_group": 1, + "n_routed_experts": 256, + "n_shared_experts": 1, + "norm_topk_prob": true, + "num_attention_heads": 64, + "num_experts_per_tok": 8, + "num_hidden_layers": 78, + "num_key_value_heads": 64, + "num_nextn_predict_layers": 1, + "pad_token_id": 154820, + "pretraining_tp": 1, + "q_lora_rank": 2048, + "qk_head_dim": 256, + "qk_nope_head_dim": 192, + "qk_rope_head_dim": 64, + "rms_norm_eps": 1e-05, + "rope_interleave": true, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "routed_scaling_factor": 2.5, + "scoring_func": "sigmoid", + "tie_word_embeddings": false, + "topk_group": 1, + "topk_method": "noaux_tc", + "transformers_version": "5.0.2.dev0", + "use_cache": true, + "v_head_dim": 256, + "vocab_size": 154880 +} diff --git a/models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json b/models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json new file mode 100644 index 0000000..b06dbe1 --- /dev/null +++ b/models/Qwen3-Coder-480B-A35B-Instruct-FP8/config.json @@ -0,0 +1,41 @@ +{ + "architectures": [ + "Qwen3MoeForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "decoder_sparse_step": 1, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 262144, + "max_window_layers": 28, + "mlp_only_layers": [], + "model_type": "qwen3_moe", + "moe_intermediate_size": 2560, + "norm_topk_prob": true, + "num_attention_heads": 96, + "num_experts": 160, + "num_experts_per_tok": 8, + "num_hidden_layers": 62, + "num_key_value_heads": 8, + "output_router_logits": false, + 
"qkv_bias": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "router_aux_loss_coef": 0.0, + "shared_expert_intermediate_size": 0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.4", + "use_cache": true, + "use_qk_norm": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/qwen-bailian-usagetraces-anon b/qwen-bailian-usagetraces-anon new file mode 160000 index 0000000..27cfe19 --- /dev/null +++ b/qwen-bailian-usagetraces-anon @@ -0,0 +1 @@ +Subproject commit 27cfe19920eea8c4debb2b915758b678ede7b861 diff --git a/src/cluster/cluster.rs b/src/cluster/cluster.rs new file mode 100644 index 0000000..233f947 --- /dev/null +++ b/src/cluster/cluster.rs @@ -0,0 +1,167 @@ +//! Cluster: routes arrivals, performs the L0 / L1 / remote-RDMA fetch chain +//! described in the design diagram, and bookkeeps the global meta store. + +use crate::cluster::meta_store::MetaStore; +use crate::config::{Config, ModelConfig}; +use crate::instance::instance::AdmittedRequest; +use crate::instance::Instance; +use crate::router::{self, RouteDecision, Router}; +use crate::trace::RequestRecord; +use crate::types::InstanceId; + +#[derive(Debug, Clone)] +pub struct AdmissionStats { + pub instance: InstanceId, + pub l0_hit_blocks: u32, + pub l1_hit_blocks: u32, + pub remote_hit_blocks: u32, + pub miss_blocks: u32, + pub rdma_bytes: u64, + pub pcie_bytes: u64, + pub fetch_time_s: f64, + pub probe_overhead_s: f64, + pub ready_at: f64, + pub decision: RouteDecision, +} + +pub struct Cluster { + pub instances: Vec, + pub meta_store: MetaStore, + pub router: Box, + pub block_size_tokens: u32, + pub kv_block_bytes: u64, +} + +impl Cluster { + pub fn new(config: &Config, model: &ModelConfig) -> Self { + let mut instances = Vec::with_capacity(config.cluster.num_instances as usize); + for id in 0..config.cluster.num_instances { + instances.push(Instance::new(id as InstanceId, model, 
&config.hardware)); + } + let meta_store = MetaStore::new(config.cluster.meta_store.ttl_seconds); + let router = router::build(config, config.sim.seed); + Self { + instances, + meta_store, + router, + block_size_tokens: model.block_size_tokens, + kv_block_bytes: model.kv_block_bytes(), + } + } + + /// Route + admit a request. Returns the chosen instance plus rich + /// per-request stats for metrics. Does NOT schedule the BatchTick — the + /// simulator driver does that based on the returned `ready_at`. + pub fn route_and_admit(&mut self, req: &RequestRecord, now: f64) -> AdmissionStats { + let decision = self.router.route(req, &self.instances, &self.meta_store, now); + let inst_id = decision.chosen; + let probe_overhead_s = decision.probe_overhead_s; + + // The router probe overhead delays the request's effective start time. + let effective_now = now + probe_overhead_s; + + let inst = &mut self.instances[inst_id as usize]; + let total_blocks = req.hash_ids.len() as u32; + + // 1. L0 lookup (touches matched blocks). + let l0_hits = inst.cache.l0.longest_prefix(&req.hash_ids) as u32; + + // 2. L1 lookup on the remaining suffix. + let suffix_after_l0 = &req.hash_ids[l0_hits as usize..]; + let l1_hits = inst.cache.l1.longest_prefix(suffix_after_l0) as u32; + // L1->L0 transfer cost + let l1_bytes = (l1_hits as u64) * self.kv_block_bytes; + let mut t = effective_now; + if l1_hits > 0 { + t = inst.links.pcie.reserve(t, l1_bytes); + // Promote those blocks into L0 + let mut evicted = Vec::new(); + inst.cache.l0.insert_blocks( + &suffix_after_l0[..l1_hits as usize], + &mut evicted, + ); + } + + // 3. Remote v6d lookup for the still-remaining suffix. + let suffix_after_l1 = &suffix_after_l0[l1_hits as usize..]; + let mut remote_hit_blocks: u32 = 0; + for &h in suffix_after_l1 { + // A block is remotely available iff some instance other than + // `inst_id` lists it (and not expired). 
+ let owners = self.meta_store.instances_for(h, now); + let any_remote = owners.iter().any(|o| *o != inst_id); + if any_remote { + remote_hit_blocks += 1; + } else { + break; // contiguous prefix - stop on first miss + } + } + let remote_bytes = (remote_hit_blocks as u64) * self.kv_block_bytes; + if remote_hit_blocks > 0 { + // RDMA from peer host -> local DRAM, then PCIe -> GPU + let inst = &mut self.instances[inst_id as usize]; + t = inst.links.rdma.reserve(t, remote_bytes); + t = inst.links.pcie.reserve(t, remote_bytes); + // Insert into local L1 (occupies LRU space) AND into L0 + let pulled = &suffix_after_l1[..remote_hit_blocks as usize]; + let mut evicted_l1 = Vec::new(); + inst.cache.l1.insert_blocks(pulled, &mut evicted_l1); + let mut evicted_l0 = Vec::new(); + inst.cache.l0.insert_blocks(pulled, &mut evicted_l0); + // The local instance now also owns these blocks - update meta_store. + for &h in pulled { + self.meta_store.insert(h, inst_id, now); + } + } + + // 4. Miss = remaining tokens to prefill from scratch. + let miss_blocks = total_blocks - l0_hits - l1_hits - remote_hit_blocks; + let miss_tokens = miss_blocks * self.block_size_tokens; + + // The newly-prefilled blocks (after the request runs) are inserted + // into L0 here, and into L1 / meta_store via async writeback. Doing + // this at admission time is OK because we're tracking presence, not + // actually moving bytes — the writeback latency is hidden behind + // request execution and we don't model meta_store inconsistency + // window beyond the TTL itself. + let inst = &mut self.instances[inst_id as usize]; + let new_input_blocks = &req.hash_ids[(l0_hits + l1_hits + remote_hit_blocks) as usize..]; + let mut evicted_l0 = Vec::new(); + inst.cache.l0.insert_blocks(new_input_blocks, &mut evicted_l0); + let mut evicted_l1 = Vec::new(); + inst.cache.l1.insert_blocks(new_input_blocks, &mut evicted_l1); + for &h in new_input_blocks { + self.meta_store.insert(h, inst_id, now); + } + + // 5. 
Reserve KV slots for this request's prefill residency. + // PD disaggregation: decode runs elsewhere, so only the input + // blocks occupy HBM on this instance. + let reserved_blocks = total_blocks; + let admitted = AdmittedRequest { + req_id: req.req_id, + arrival: req.arrival, + ready_at: t, + prefill_tokens_remaining: miss_tokens, + reserved_blocks, + }; + inst.admit(admitted); + + let pcie_bytes = l1_bytes + remote_bytes; + let fetch_time_s = (t - effective_now).max(0.0); + + AdmissionStats { + instance: inst_id, + l0_hit_blocks: l0_hits, + l1_hit_blocks: l1_hits, + remote_hit_blocks, + miss_blocks, + rdma_bytes: remote_bytes, + pcie_bytes, + fetch_time_s, + probe_overhead_s, + ready_at: t, + decision, + } + } +} diff --git a/src/cluster/meta_store.rs b/src/cluster/meta_store.rs new file mode 100644 index 0000000..2fe3b94 --- /dev/null +++ b/src/cluster/meta_store.rs @@ -0,0 +1,161 @@ +//! Global redis-like KV-cache index. +//! +//! Maps `block_hash -> SmallVec<(instance_id, expires_at)>`. TTL eviction is +//! lazy (on read). The TTL-aware router uses `score_prefix` to score each +//! instance's predicted longest prefix without probing instances directly. + +use ahash::AHashMap; +use smallvec::SmallVec; + +use crate::types::InstanceId; + +#[derive(Debug, Clone, Copy)] +struct Entry { + instance: InstanceId, + expires_at: f64, +} + +#[derive(Debug, Default)] +pub struct MetaStore { + ttl_seconds: f64, + map: AHashMap>, +} + +impl MetaStore { + pub fn new(ttl_seconds: f64) -> Self { + Self { + ttl_seconds, + map: AHashMap::with_capacity(1 << 16), + } + } + + pub fn ttl(&self) -> f64 { + self.ttl_seconds + } + + /// Record that `instance` now holds `block_hash`. 
+ pub fn insert(&mut self, block_hash: u64, instance: InstanceId, now: f64) { + let entry = Entry { + instance, + expires_at: now + self.ttl_seconds, + }; + let bucket = self.map.entry(block_hash).or_default(); + // refresh existing entry if present + for e in bucket.iter_mut() { + if e.instance == instance { + e.expires_at = entry.expires_at; + return; + } + } + bucket.push(entry); + } + + /// Score each candidate instance by the longest leading prefix of + /// `hash_ids` for which the meta store believes that instance still holds + /// every block. Returns scores indexed by instance id. + pub fn score_prefix(&self, hash_ids: &[u64], now: f64, num_instances: usize) -> Vec { + if hash_ids.is_empty() { + return vec![0; num_instances]; + } + // Walk hashes; at each step intersect the still-eligible instance set. + // Use a small bitset since num_instances is typically <= 1024. + let mut alive: Vec = vec![false; num_instances]; + // First block: seed alive set + let first = hash_ids[0]; + let mut any = false; + if let Some(bucket) = self.map.get(&first) { + for e in bucket { + if e.expires_at >= now { + let i = e.instance as usize; + if i < num_instances { + alive[i] = true; + any = true; + } + } + } + } + let mut scores = vec![0u32; num_instances]; + if !any { + return scores; + } + for i in 0..num_instances { + if alive[i] { + scores[i] = 1; + } + } + // Subsequent blocks: an instance survives only if the meta store still + // lists it for that block (and not expired). 
+ for (depth, &h) in hash_ids.iter().enumerate().skip(1) { + let bucket = match self.map.get(&h) { + Some(b) => b, + None => break, + }; + // mark instances present for this block + let mut present = vec![false; num_instances]; + let mut any2 = false; + for e in bucket { + if e.expires_at >= now { + let i = e.instance as usize; + if i < num_instances && alive[i] { + present[i] = true; + any2 = true; + } + } + } + if !any2 { + break; + } + for i in 0..num_instances { + if present[i] { + scores[i] = (depth + 1) as u32; + } else { + alive[i] = false; + } + } + } + scores + } + + /// Lookup which (alive) instances claim to hold a given block. + pub fn instances_for(&self, hash: u64, now: f64) -> SmallVec<[InstanceId; 4]> { + let mut out = SmallVec::new(); + if let Some(bucket) = self.map.get(&hash) { + for e in bucket { + if e.expires_at >= now { + out.push(e.instance); + } + } + } + out + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn score_prefix_basic() { + let mut m = MetaStore::new(60.0); + m.insert(10, 0, 0.0); + m.insert(11, 0, 0.0); + m.insert(12, 0, 0.0); + m.insert(10, 1, 0.0); + m.insert(11, 1, 0.0); + // instance 1 only has 10,11; instance 0 has 10,11,12 + let s = m.score_prefix(&[10, 11, 12, 13], 1.0, 4); + assert_eq!(s[0], 3); + assert_eq!(s[1], 2); + assert_eq!(s[2], 0); + } + + #[test] + fn ttl_expiry() { + let mut m = MetaStore::new(1.0); + m.insert(10, 0, 0.0); + let s_now = m.score_prefix(&[10], 0.5, 2); + assert_eq!(s_now[0], 1); + let s_later = m.score_prefix(&[10], 5.0, 2); + assert_eq!(s_later[0], 0); + } +} diff --git a/src/cluster/mod.rs b/src/cluster/mod.rs new file mode 100644 index 0000000..b6001dd --- /dev/null +++ b/src/cluster/mod.rs @@ -0,0 +1,6 @@ +pub mod meta_store; +#[allow(clippy::module_inception)] +pub mod cluster; + +pub use cluster::Cluster; +pub use meta_store::MetaStore; diff --git a/src/config.rs b/src/config.rs new file mode 100644 index 0000000..655ca94 --- /dev/null +++ b/src/config.rs @@ -0,0 +1,510 @@ 
+//! Top-level configuration loaded from YAML. +//! +//! Two config styles are supported: +//! +//! **Architecture-derived** (preferred): set `hidden_size`, `num_attention_heads`, +//! `intermediate_size` and the simulator derives all roofline coefficients, KV +//! block sizes, and weight-stream costs from the model architecture. Supports +//! MoE, MLA (Multi-head Latent Attention), and DSA (DeepSeek Sparse Attention). +//! +//! **Legacy manual**: omit the architecture fields and set +//! `flops_per_token_prefill` + `attn_quadratic_coeff` directly. Backward +//! compatible with older YAML configs. + +use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use std::path::Path; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Config { + pub model: ModelConfig, + pub hardware: HardwareConfig, + pub cluster: ClusterConfig, + pub sim: SimConfig, +} + +// --------------------------------------------------------------------------- +// Model +// --------------------------------------------------------------------------- + +/// Model architecture + roofline coefficients. +/// +/// If `hidden_size` is present the compute model is derived from architecture; +/// otherwise the legacy `flops_per_token_prefill` / `attn_quadratic_coeff` +/// fields are used directly. 
+#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct ModelConfig { + #[serde(default)] + pub name: String, + pub num_layers: u32, + pub num_kv_heads: u32, + pub head_dim: u32, + pub dtype_bytes: u32, + pub block_size_tokens: u32, + + // -- Architecture fields (enable auto-derivation when all three present) -- + #[serde(default)] + pub hidden_size: Option, + #[serde(default)] + pub num_attention_heads: Option, + #[serde(default)] + pub intermediate_size: Option, + + #[serde(default)] + pub moe: Option, + #[serde(default)] + pub mla: Option, + #[serde(default)] + pub attention: Option, + + // -- Legacy manual coefficients (used when hidden_size is absent) --------- + #[serde(default)] + pub flops_per_token_prefill: Option, + #[serde(default)] + pub attn_quadratic_coeff: Option, + #[serde(default)] + pub bytes_per_token_prefill: Option, + + #[serde(default, skip_serializing)] + #[allow(dead_code)] + pub flops_per_token_decode: Option, + #[serde(default, skip_serializing)] + #[allow(dead_code)] + pub bytes_per_token_decode: Option, +} + +/// Whether the config is architecture-derived or uses legacy manual knobs. +impl ModelConfig { + pub fn is_arch_mode(&self) -> bool { + self.hidden_size.is_some() + } + + /// Bytes of KV cache per block. 
+ /// + /// For standard / GQA: `2 * L * kv_heads * head_dim * dtype * block_tokens` + /// For MLA: `L * (kv_lora_rank + qk_rope_head_dim) * dtype * block_tokens` + pub fn kv_block_bytes(&self) -> u64 { + if let Some(mla) = &self.mla { + self.num_layers as u64 + * (mla.kv_lora_rank + mla.qk_rope_head_dim) as u64 + * self.dtype_bytes as u64 + * self.block_size_tokens as u64 + } else { + 2u64 + * self.num_layers as u64 + * self.num_kv_heads as u64 + * self.head_dim as u64 + * self.dtype_bytes as u64 + * self.block_size_tokens as u64 + } + } +} + +// -- Sub-configs for MoE / MLA / Attention ----------------------------------- + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MoeConfig { + pub num_experts: u32, + pub num_active_experts: u32, + #[serde(default)] + pub num_shared_experts: u32, + /// Per-expert FFN intermediate size (`moe_intermediate_size` in HF). + /// Falls back to parent `intermediate_size` if absent. + #[serde(default)] + pub expert_intermediate_size: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MlaConfig { + pub kv_lora_rank: u32, + pub q_lora_rank: u32, + pub qk_nope_head_dim: u32, + pub qk_rope_head_dim: u32, + pub v_head_dim: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum AttentionConfig { + Dense, + SlidingWindow { + window_size: u32, + }, + Dsa { + /// Tokens within this window attend fully. + dense_window: u32, + /// Beyond the window, attend to every `sparse_stride`-th token. + sparse_stride: u32, + /// Number of initial layers that use dense attention regardless. 
+ #[serde(default)] + first_dense_layers: u32, + }, +} + +// --------------------------------------------------------------------------- +// Hardware +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HardwareConfig { + pub gpu_flops: f64, + pub gpu_mem_bw: f64, + pub hbm_bytes: f64, + pub dram_bytes: f64, + pub pcie_bw: f64, + pub pcie_latency_us: f64, + pub rdma_bw: f64, + pub rdma_latency_us: f64, + #[serde(default = "default_max_batch_slots")] + pub max_batch_slots: u32, + #[serde(default = "default_prefill_chunk_tokens")] + pub prefill_chunk_tokens: u32, +} + +fn default_max_batch_slots() -> u32 { + 256 +} +fn default_prefill_chunk_tokens() -> u32 { + 2048 +} + +// --------------------------------------------------------------------------- +// Cluster +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ClusterConfig { + pub num_instances: u32, + pub meta_store: MetaStoreConfig, + pub router: RouterConfig, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetaStoreConfig { + pub ttl_seconds: f64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RouterConfig { + pub mode: RouterMode, + #[serde(default = "default_probe_latency_us")] + pub precise_probe_latency_us: f64, + #[serde(default = "default_probe_topk")] + pub precise_probe_topk: u32, + #[serde(default = "default_load_alpha")] + pub load_alpha: f64, + /// Weight for load (queue_len) in cache_score: `2^(α·load + β·miss)`. + #[serde(default = "default_score_alpha")] + pub score_alpha: f64, + /// Weight for cache miss in cache_score: `2^(α·load + β·miss)`. + #[serde(default = "default_score_beta")] + pub score_beta: f64, + /// Number of leading blocks for prefix fingerprint in prefix_affinity. 
+ #[serde(default = "default_prefix_k")] + pub prefix_k: usize, + /// Number of top-affinity instances to consider in prefix_affinity. + /// 0 means auto (n/8, min 2). + #[serde(default)] + pub affinity_fan_out: usize, +} + +fn default_probe_latency_us() -> f64 { + 50.0 +} +fn default_probe_topk() -> u32 { + 4 +} +fn default_load_alpha() -> f64 { + 1.0 +} +fn default_score_alpha() -> f64 { + 1.0 +} +fn default_score_beta() -> f64 { + 0.1 +} +fn default_prefix_k() -> usize { + 8 +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RouterMode { + Random, + RoundRobin, + LeastLoaded, + LeastTokens, + TtlAware, + Precise, + MinPd, + CacheLoad, + CacheScore, + EstimatedTtft, + PrefixAffinity, +} + +impl RouterMode { + pub fn parse(s: &str) -> Result { + match s { + "random" => Ok(Self::Random), + "round_robin" | "rr" => Ok(Self::RoundRobin), + "least_loaded" => Ok(Self::LeastLoaded), + "least_tokens" | "lt" => Ok(Self::LeastTokens), + "ttl_aware" | "ttl" => Ok(Self::TtlAware), + "precise" | "precise_aware" => Ok(Self::Precise), + "min_pd" | "minpd" | "pd" => Ok(Self::MinPd), + "cache_load" | "cl" => Ok(Self::CacheLoad), + "cache_score" | "cs" => Ok(Self::CacheScore), + "estimated_ttft" | "ettft" | "optimal" => Ok(Self::EstimatedTtft), + "prefix_affinity" | "affinity" | "pa" => Ok(Self::PrefixAffinity), + other => Err(anyhow::anyhow!("unknown router mode: {other}")), + } + } + + pub fn as_str(&self) -> &'static str { + match self { + Self::Random => "random", + Self::RoundRobin => "round_robin", + Self::LeastLoaded => "least_loaded", + Self::LeastTokens => "least_tokens", + Self::TtlAware => "ttl_aware", + Self::Precise => "precise", + Self::MinPd => "min_pd", + Self::CacheLoad => "cache_load", + Self::CacheScore => "cache_score", + Self::EstimatedTtft => "estimated_ttft", + Self::PrefixAffinity => "prefix_affinity", + } + } +} + +// 
---------------------------------------------------------------------------
// Sim
// ---------------------------------------------------------------------------

/// Simulation-run parameters (trace input, output location, sampling).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimConfig {
    // Path to the request trace to replay.
    pub trace_path: String,
    // Optional cap on how many trace records are consumed.
    // NOTE(review): integer width assumed u64 — confirm against trace.rs.
    #[serde(default)]
    pub max_requests: Option<u64>,
    // Directory where per-request/timeseries/summary outputs are written.
    pub output_dir: String,
    // Period (s) of the instance-timeseries sampler; values <= 0 disable it.
    #[serde(default = "default_sample_interval")]
    pub sample_interval_s: f64,
    // RNG seed (defaults to 0 via serde default).
    #[serde(default)]
    pub seed: u64,
}

fn default_sample_interval() -> f64 {
    1.0
}

impl Config {
    /// Load from a YAML file, resolving `config_json` (HF model config) and
    /// hardware `type` (preset) references if present.
    ///
    /// Relative `config_json` paths are resolved against the YAML file's
    /// own directory, not the process CWD.
    pub fn from_yaml_path<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path = path.as_ref();
        let raw_str = std::fs::read_to_string(path)
            .with_context(|| format!("reading config {}", path.display()))?;
        let raw: RawConfig = serde_yaml::from_str(&raw_str)
            .with_context(|| format!("parsing config {}", path.display()))?;
        let yaml_dir = path.parent().unwrap_or(Path::new("."));
        raw.resolve(yaml_dir)
            .with_context(|| format!("resolving config {}", path.display()))
    }
}

// ---------------------------------------------------------------------------
// Raw deserialization types — flexible YAML loading
// ---------------------------------------------------------------------------
//
// All model/hardware fields are `Option` so that `config_json` and `type`
// can supply base values, with explicit YAML fields acting as overrides.
// Existing YAML configs (no config_json / type) continue to work unchanged.

#[derive(Deserialize)]
struct RawConfig {
    model: RawModelConfig,
    hardware: RawHardwareConfig,
    cluster: ClusterConfig,
    sim: SimConfig,
}

// Mirror of ModelConfig with every field optional; see module comment above.
#[derive(Deserialize)]
struct RawModelConfig {
    /// Path to a HuggingFace `config.json`. Resolved relative to the YAML
    /// file's directory. When present, architecture fields are loaded from
    /// the JSON and any explicit YAML fields act as overrides.
    #[serde(default)]
    config_json: Option<String>,

    #[serde(default)]
    name: Option<String>,
    #[serde(default)]
    num_layers: Option<u32>,
    #[serde(default)]
    num_kv_heads: Option<u32>,
    #[serde(default)]
    head_dim: Option<u32>,
    #[serde(default)]
    dtype_bytes: Option<u32>,
    #[serde(default)]
    block_size_tokens: Option<u32>,
    #[serde(default)]
    hidden_size: Option<u32>,
    #[serde(default)]
    num_attention_heads: Option<u32>,
    #[serde(default)]
    intermediate_size: Option<u32>,
    #[serde(default)]
    moe: Option<MoeConfig>,
    #[serde(default)]
    mla: Option<MlaConfig>,
    #[serde(default)]
    attention: Option<AttentionConfig>,
    #[serde(default)]
    flops_per_token_prefill: Option<f64>,
    #[serde(default)]
    attn_quadratic_coeff: Option<f64>,
    #[serde(default)]
    bytes_per_token_prefill: Option<f64>,
    #[serde(default)]
    flops_per_token_decode: Option<f64>,
    #[serde(default)]
    bytes_per_token_decode: Option<f64>,
}

// Mirror of HardwareConfig with every field optional; see module comment.
#[derive(Deserialize)]
struct RawHardwareConfig {
    /// Hardware preset name (e.g. `"h100"`, `"8xb200"`). When present,
    /// specs are loaded from the built-in preset database and any explicit
    /// YAML fields override individual values.
    #[serde(default, rename = "type")]
    hw_type: Option<String>,

    #[serde(default)]
    gpu_flops: Option<f64>,
    #[serde(default)]
    gpu_mem_bw: Option<f64>,
    #[serde(default)]
    hbm_bytes: Option<f64>,
    #[serde(default)]
    dram_bytes: Option<f64>,
    #[serde(default)]
    pcie_bw: Option<f64>,
    #[serde(default)]
    pcie_latency_us: Option<f64>,
    #[serde(default)]
    rdma_bw: Option<f64>,
    #[serde(default)]
    rdma_latency_us: Option<f64>,
    #[serde(default)]
    max_batch_slots: Option<u32>,
    #[serde(default)]
    prefill_chunk_tokens: Option<u32>,
}

// -- Resolution (merge base + YAML overrides → final Config) ------------------

impl RawConfig {
    fn resolve(self, yaml_dir: &Path) -> Result<Config> {
        Ok(Config {
            model: self.model.resolve(yaml_dir)?,
            hardware: self.hardware.resolve()?,
            cluster: self.cluster,
            sim: self.sim,
        })
    }
}

impl RawModelConfig {
    // Merge order: HF config.json (if any) supplies the base, then each
    // explicitly-set YAML field overwrites it, then deployment-only fields
    // are validated (they can never come from config.json).
    fn resolve(self, yaml_dir: &Path) -> Result<ModelConfig> {
        // Start from HF config.json if specified, else empty default.
        let mut m = if let Some(ref cj) = self.config_json {
            let cj_path = if Path::new(cj).is_absolute() {
                std::path::PathBuf::from(cj)
            } else {
                yaml_dir.join(cj)
            };
            crate::hf_config::parse(&cj_path)?
        } else {
            ModelConfig::default()
        };

        // Overlay: explicit YAML fields override the base.
        if let Some(v) = self.name { m.name = v; }
        if let Some(v) = self.num_layers { m.num_layers = v; }
        if let Some(v) = self.num_kv_heads { m.num_kv_heads = v; }
        if let Some(v) = self.head_dim { m.head_dim = v; }
        if let Some(v) = self.dtype_bytes { m.dtype_bytes = v; }
        if let Some(v) = self.block_size_tokens { m.block_size_tokens = v; }
        if let Some(v) = self.hidden_size { m.hidden_size = Some(v); }
        if let Some(v) = self.num_attention_heads { m.num_attention_heads = Some(v); }
        if let Some(v) = self.intermediate_size { m.intermediate_size = Some(v); }
        if self.moe.is_some() { m.moe = self.moe; }
        if self.mla.is_some() { m.mla = self.mla; }
        if self.attention.is_some() { m.attention = self.attention; }
        if let Some(v) = self.flops_per_token_prefill { m.flops_per_token_prefill = Some(v); }
        if let Some(v) = self.attn_quadratic_coeff { m.attn_quadratic_coeff = Some(v); }
        if let Some(v) = self.bytes_per_token_prefill { m.bytes_per_token_prefill = Some(v); }
        if let Some(v) = self.flops_per_token_decode { m.flops_per_token_decode = Some(v); }
        if let Some(v) = self.bytes_per_token_decode { m.bytes_per_token_decode = Some(v); }

        // Validate deployment-specific fields that HF config.json never provides.
        anyhow::ensure!(
            m.dtype_bytes > 0,
            "model.dtype_bytes is required (not in HF config.json)"
        );
        anyhow::ensure!(
            m.block_size_tokens > 0,
            "model.block_size_tokens is required (not in HF config.json)"
        );

        Ok(m)
    }
}

impl RawHardwareConfig {
    // Merge order mirrors RawModelConfig::resolve: preset base (if `type`
    // given), then per-field YAML overrides, then minimum validation.
    fn resolve(self) -> Result<HardwareConfig> {
        // Start from preset if specified, else zeros (all must come from YAML).
        let mut hw = if let Some(ref t) = self.hw_type {
            crate::hardware_presets::resolve(t).ok_or_else(|| {
                anyhow::anyhow!(
                    "unknown hardware preset '{t}'. Available: {}",
                    crate::hardware_presets::AVAILABLE.join(", ")
                )
            })?
        } else {
            HardwareConfig {
                gpu_flops: 0.0,
                gpu_mem_bw: 0.0,
                hbm_bytes: 0.0,
                dram_bytes: 0.0,
                pcie_bw: 0.0,
                pcie_latency_us: 5.0,
                rdma_bw: 0.0,
                rdma_latency_us: 8.0,
                max_batch_slots: default_max_batch_slots(),
                prefill_chunk_tokens: default_prefill_chunk_tokens(),
            }
        };

        // Overlay: explicit YAML fields override the preset / defaults.
        if let Some(v) = self.gpu_flops { hw.gpu_flops = v; }
        if let Some(v) = self.gpu_mem_bw { hw.gpu_mem_bw = v; }
        if let Some(v) = self.hbm_bytes { hw.hbm_bytes = v; }
        if let Some(v) = self.dram_bytes { hw.dram_bytes = v; }
        if let Some(v) = self.pcie_bw { hw.pcie_bw = v; }
        if let Some(v) = self.pcie_latency_us { hw.pcie_latency_us = v; }
        if let Some(v) = self.rdma_bw { hw.rdma_bw = v; }
        if let Some(v) = self.rdma_latency_us { hw.rdma_latency_us = v; }
        if let Some(v) = self.max_batch_slots { hw.max_batch_slots = v; }
        if let Some(v) = self.prefill_chunk_tokens { hw.prefill_chunk_tokens = v; }

        // Validate minimum requirements.
        // NOTE(review): dram_bytes / pcie_bw / rdma_bw may legitimately stay
        // 0.0 here — confirm downstream code tolerates zero-bandwidth tiers.
        anyhow::ensure!(hw.gpu_flops > 0.0, "hardware.gpu_flops is required");
        anyhow::ensure!(hw.gpu_mem_bw > 0.0, "hardware.gpu_mem_bw is required");
        anyhow::ensure!(hw.hbm_bytes > 0.0, "hardware.hbm_bytes is required");

        Ok(hw)
    }
}
diff --git a/src/driver.rs b/src/driver.rs
new file mode 100644
index 0000000..03fd2cf
--- /dev/null
+++ b/src/driver.rs
@@ -0,0 +1,170 @@
//! Simulation driver: pulls trace records, advances the event queue, runs
//! instance batch ticks, and emits metrics.

use anyhow::Result;
use std::collections::HashMap;
use std::path::Path;

use crate::cluster::Cluster;
use crate::config::Config;
use crate::metrics::per_request::{PerRequestRow, PerRequestWriter};
use crate::metrics::routing_log::RoutingLogWriter;
use crate::metrics::summary::Summary;
use crate::metrics::timeseries::{TimeseriesRow, TimeseriesWriter};
use crate::sim::{Event, EventQueue};
use crate::trace::{RequestRecord, TraceReader};

/// In-memory results of one simulation run. The same data is also persisted
/// to the output directory (`summary.json`, `per_request.csv`, …).
pub struct RunOutputs {
    pub summary: Summary,
    pub rows: Vec<PerRequestRow>,
}

/// Admission-time bookkeeping for a routed-but-unfinished request; joined
/// with completion info in the `BatchTick` handler to build a PerRequestRow.
#[derive(Debug, Clone)]
struct InflightInfo {
    arrival: f64,           // trace arrival time (s)
    instance: u32,          // instance chosen by the router
    total_blocks: u32,      // prompt length in KV blocks (hash_ids count)
    l0_hit_blocks: u32,     // blocks hit in local GPU HBM (L0)
    l1_hit_blocks: u32,     // blocks hit in local CPU DRAM (L1)
    remote_hit_blocks: u32, // blocks fetched from a peer instance
    miss_blocks: u32,       // blocks that must be recomputed
    rdma_bytes: u64,        // bytes moved over RDMA for this request
    pcie_bytes: u64,        // bytes moved over PCIe for this request
    probe_overhead_s: f64,  // router probe latency charged to this request
}

/// Execute one simulation: replay the trace through the router/instances,
/// stream metrics into `output_dir` (optionally a subdirectory), and return
/// the aggregate summary plus all per-request rows.
pub fn run(config: &Config, output_subdir: Option<&str>) -> Result<RunOutputs> {
    let mut cluster = Cluster::new(config, &config.model);
    let mut q = EventQueue::new();

    // Output directory
    let base = Path::new(&config.sim.output_dir);
    let out_dir = match output_subdir {
        Some(s) => base.join(s),
        None => base.to_path_buf(),
    };
    std::fs::create_dir_all(&out_dir)?;

    let mut req_writer = PerRequestWriter::create(out_dir.join("per_request.csv"))?;
    let mut ts_writer = TimeseriesWriter::create(out_dir.join("instances.csv"))?;
    let mut rt_writer = RoutingLogWriter::create(out_dir.join("routing_log.jsonl"))?;

    let mut trace = TraceReader::open(&config.sim.trace_path, config.sim.max_requests)?;
    // Load all records (cheap for moderate traces) so we can index by req_id.
    // For very large traces a streaming approach with a peekable iterator
    // would be better; this keeps the driver simple.
    let records: Vec<RequestRecord> = (&mut trace).collect::<Result<Vec<_>, _>>()?;
    // NOTE(review): req_id key width assumed u64 — confirm against trace.rs.
    let mut by_id: HashMap<u64, RequestRecord> = HashMap::with_capacity(records.len());
    for r in &records {
        q.schedule(r.arrival, Event::Arrival { req_id: r.req_id });
        by_id.insert(r.req_id, r.clone());
    }
    // Periodic samples
    // Sampling extends 60s past the last arrival so requests that finish
    // after the final arrival still show up in the timeseries.
    if config.sim.sample_interval_s > 0.0 && !records.is_empty() {
        let max_t = records.iter().map(|r| r.arrival).fold(0.0_f64, f64::max);
        let mut t = 0.0;
        while t <= max_t + 60.0 {
            q.schedule(t, Event::Sample);
            t += config.sim.sample_interval_s;
        }
    }

    let mut inflight: HashMap<u64, InflightInfo> = HashMap::new();
    let mut rows: Vec<PerRequestRow> = Vec::with_capacity(records.len());

    while let Some((now, ev)) = q.pop() {
        match ev {
            Event::Arrival { req_id } => {
                let req = match by_id.get(&req_id) {
                    Some(r) => r.clone(),
                    None => continue,
                };
                // Routing + admission happen atomically at arrival time;
                // cache-hit / transfer stats are fixed here and only
                // reported when the request completes.
                let stats = cluster.route_and_admit(&req, now);
                rt_writer.write(&stats.decision)?;
                inflight.insert(
                    req_id,
                    InflightInfo {
                        arrival: req.arrival,
                        instance: stats.instance,
                        total_blocks: req.hash_ids.len() as u32,
                        l0_hit_blocks: stats.l0_hit_blocks,
                        l1_hit_blocks: stats.l1_hit_blocks,
                        remote_hit_blocks: stats.remote_hit_blocks,
                        miss_blocks: stats.miss_blocks,
                        rdma_bytes: stats.rdma_bytes,
                        pcie_bytes: stats.pcie_bytes,
                        probe_overhead_s: stats.probe_overhead_s,
                    },
                );
                // At most one BatchTick is outstanding per instance;
                // `tick_scheduled` dedupes the scheduling.
                let inst = &mut cluster.instances[stats.instance as usize];
                if !inst.tick_scheduled {
                    inst.tick_scheduled = true;
                    let when = stats.ready_at.max(now);
                    q.schedule(when, Event::BatchTick { instance: stats.instance });
                }
            }
            Event::BatchTick { instance } => {
                let inst = &mut cluster.instances[instance as usize];
                inst.tick_scheduled = false;
                let result = inst.step(now);
                // Join completion info with the admission-time record.
                for (rid, ttft, end) in result.completed {
                    if let Some(info) = inflight.remove(&rid) {
                        let row = PerRequestRow {
                            req_id: rid,
                            arrival: info.arrival,
                            ttft,
                            e2e: end - info.arrival,
                            instance: info.instance,
                            total_blocks: info.total_blocks,
                            l0_hit_blocks: info.l0_hit_blocks,
                            l1_hit_blocks: info.l1_hit_blocks,
                            remote_hit_blocks: info.remote_hit_blocks,
                            miss_blocks: info.miss_blocks,
                            rdma_bytes: info.rdma_bytes,
                            pcie_bytes: info.pcie_bytes,
                            probe_overhead_s: info.probe_overhead_s,
                        };
                        req_writer.write(&row)?;
                        rows.push(row);
                    }
                }
                // Re-arm the tick if the instance still has work queued.
                if let Some(next) = result.next_tick {
                    let inst = &mut cluster.instances[instance as usize];
                    if !inst.tick_scheduled {
                        inst.tick_scheduled = true;
                        q.schedule(next.max(now), Event::BatchTick { instance });
                    }
                }
            }
            Event::Sample => {
                for inst in &cluster.instances {
                    let busy = if inst.queue_len() > 0 { 1 } else { 0 };
                    ts_writer.write(&TimeseriesRow {
                        t: now,
                        instance: inst.id,
                        queue_len: inst.queue_len(),
                        kv_blocks_used: inst.kv_blocks_used,
                        kv_blocks_total: inst.hbm_block_budget,
                        busy,
                    })?;
                }
            }
            Event::Stop => break,
        }
    }

    req_writer.finish()?;
    ts_writer.finish()?;
    rt_writer.finish()?;

    // Simulated wall-clock span = latest completion time observed.
    let sim_duration_s = rows
        .iter()
        .map(|r| r.arrival + r.e2e)
        .fold(0.0_f64, f64::max);
    let router_name = config.cluster.router.mode.as_str().to_string();
    let summary = Summary::from_rows(&router_name, &rows, sim_duration_s);
    let summary_json = serde_json::to_string_pretty(&summary)?;
    std::fs::write(out_dir.join("summary.json"), summary_json)?;

    Ok(RunOutputs { summary, rows })
}
diff --git a/src/hardware_presets.rs b/src/hardware_presets.rs
new file mode 100644
index 0000000..b99fd6e
--- /dev/null
+++ b/src/hardware_presets.rs
@@ -0,0 +1,225 @@
//! Built-in hardware presets for common GPU configurations.
//!
//! Presets provide baseline specs for single GPUs and tensor-parallel (TP)
//! groups. All values can be overridden in the YAML config by specifying
//! explicit fields alongside `type`:
//!
//! ```yaml
//! hardware:
//!   type: 8xb200
//!   hbm_bytes: 500.0e9   # override total HBM with actual KV budget
//! ```

use crate::config::HardwareConfig;

/// All recognized preset names (for help/error messages).
+pub const AVAILABLE: &[&str] = &[ + "h100", + "h800", + "h20", + "a100-80gb", + "a100-40gb", + "b200", + "2xh100", + "4xh100", + "8xh100", + "2xh800", + "4xh800", + "8xh800", + "2xh20", + "4xh20", + "8xh20", + "2xb200", + "4xb200", + "8xb200", +]; + +/// Resolve a hardware preset by name. +/// +/// Case-insensitive; hyphens, underscores, and spaces are stripped before +/// matching. Accepts `NxGPU` patterns (e.g. `8xb200`). +pub fn resolve(name: &str) -> Option { + let key = normalize(name); + let (count, gpu) = parse_count_gpu(&key); + match gpu.as_str() { + "h100" => Some(make_config(count, &H100)), + "h800" => Some(make_config(count, &H800)), + "h20" => Some(make_config(count, &H20)), + "a10080gb" | "a100" => Some(make_config(count, &A100_80GB)), + "a10040gb" => Some(make_config(count, &A100_40GB)), + "b200" => Some(make_config(count, &B200)), + _ => None, + } +} + +// --------------------------------------------------------------------------- +// Internals +// --------------------------------------------------------------------------- + +fn normalize(s: &str) -> String { + s.to_ascii_lowercase().replace(['-', '_', ' '], "") +} + +/// Parse `"8xh100"` → `(8, "h100")`, `"h100"` → `(1, "h100")`. 
fn parse_count_gpu(s: &str) -> (u32, String) {
    // The multiplier is only honored when everything before the FIRST 'x'
    // parses as an integer; otherwise the whole key is the GPU name with an
    // implicit count of 1 (e.g. "h100", or the degenerate "xh100").
    if let Some((prefix, rest)) = s.split_once('x') {
        if let Ok(count) = prefix.parse::<u32>() {
            return (count, rest.to_string());
        }
    }
    (1, s.to_string())
}

// -- Per-GPU base specs (single die, BF16 dense) -----------------------------

/// Baseline spec of one GPU die; TP-group configs are scaled from this.
struct GpuBase {
    flops: f64,    // BF16 dense TFLOPS
    mem_bw: f64,   // HBM bandwidth (B/s)
    hbm: f64,      // Total HBM (bytes)
    pcie_gen: u32, // PCIe generation (4/5/6)
}

// 989 TFLOPS BF16, 3.35 TB/s HBM3, 80 GB.
const H100: GpuBase = GpuBase {
    flops: 9.89e14,
    mem_bw: 3.35e12,
    hbm: 80.0e9,
    pcie_gen: 5,
};

// Same die as the H100 (export variant); identical compute/memory specs.
const H800: GpuBase = GpuBase {
    flops: 9.89e14,
    mem_bw: 3.35e12,
    hbm: 80.0e9,
    pcie_gen: 5,
};

// China-export Hopper: 148 TFLOPS BF16 but 4.0 TB/s HBM3 and 96 GB.
const H20: GpuBase = GpuBase {
    flops: 1.48e14,
    mem_bw: 4.0e12,
    hbm: 96.0e9,
    pcie_gen: 5,
};

// 312 TFLOPS BF16, 2.0 TB/s HBM2e, 80 GB.
const A100_80GB: GpuBase = GpuBase {
    flops: 3.12e14,
    mem_bw: 2.0e12,
    hbm: 80.0e9,
    pcie_gen: 4,
};

// 312 TFLOPS BF16, 1.555 TB/s HBM2e, 40 GB.
const A100_40GB: GpuBase = GpuBase {
    flops: 3.12e14,
    mem_bw: 1.555e12,
    hbm: 40.0e9,
    pcie_gen: 4,
};

// 2250 TFLOPS BF16, 8.0 TB/s HBM3e, 192 GB.
const B200: GpuBase = GpuBase {
    flops: 2.25e15,
    mem_bw: 8.0e12,
    hbm: 192.0e9,
    pcie_gen: 6,
};

// Build a `HardwareConfig` from a base GPU spec × TP count.
//
// Compute, HBM bandwidth, and HBM capacity scale linearly with `n`.
// PCIe bandwidth scales linearly (one link per GPU). RDMA bandwidth
// assumes one NIC for ≤4 GPUs and two NICs for ≥8. Server DRAM is a
// reasonable default based on typical deployment sizes.
fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {
    let f = n as f64;

    // PCIe per-GPU bandwidth and latency by generation
    let (pcie_per_gpu, pcie_lat) = match base.pcie_gen {
        6 => (128.0e9, 4.0), // Gen6 x16
        5 => (64.0e9, 5.0),  // Gen5 x16
        _ => (32.0e9, 5.0),  // Gen4 x16
    };

    // RDMA: base NIC speed by PCIe gen, scaled for multi-NIC servers
    let (rdma_base, rdma_lat) = match base.pcie_gen {
        6 => (50.0e9, 6.0), // 400 Gbps NIC
        _ => (25.0e9, 8.0), // 200 Gbps NIC
    };
    // NOTE(review): `resolve` accepts arbitrary `NxGPU` counts, so e.g. n=6
    // takes the single-NIC branch here — confirm that is intended (built-in
    // presets only expose 1/2/4/8).
    let rdma_scale = if n >= 8 { 2.0 } else { 1.0 };

    // Server DRAM: rough defaults by deployment size
    let dram = match n {
        1 => 512.0e9,
        2..=4 => 1.0e12,
        _ => 1.5e12,
    };

    HardwareConfig {
        gpu_flops: base.flops * f,
        gpu_mem_bw: base.mem_bw * f,
        hbm_bytes: base.hbm * f,
        dram_bytes: dram,
        pcie_bw: pcie_per_gpu * f,
        pcie_latency_us: pcie_lat,
        rdma_bw: rdma_base * rdma_scale,
        rdma_latency_us: rdma_lat,
        max_batch_slots: 256,
        // Larger TP groups prefill in bigger chunks.
        prefill_chunk_tokens: if n >= 4 { 4096 } else { 2048 },
    }
}

// Unit tests: preset lookup, normalization, and linear TP scaling.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn resolve_single_gpu() {
        let hw = resolve("h100").unwrap();
        assert!((hw.gpu_flops - 9.89e14).abs() < 1e10);
        assert!((hw.hbm_bytes - 80e9).abs() < 1e6);
        assert_eq!(hw.prefill_chunk_tokens, 2048);
    }

    #[test]
    fn resolve_tp_group() {
        let hw = resolve("8xb200").unwrap();
        assert!((hw.gpu_flops - 2.25e15 * 8.0).abs() < 1e11);
        assert!((hw.hbm_bytes - 192e9 * 8.0).abs() < 1e6);
        assert!((hw.pcie_bw - 128e9 * 8.0).abs() < 1e6);
        assert_eq!(hw.prefill_chunk_tokens, 4096);
    }

    #[test]
    fn resolve_case_and_separator_insensitive() {
        assert!(resolve("H100").is_some());
        assert!(resolve("8xB200").is_some());
        assert!(resolve("8x-B200").is_some());
        assert!(resolve("a100-80gb").is_some());
        assert!(resolve("A100_80GB").is_some());
        assert!(resolve("a100_80gb").is_some());
    }

    #[test]
    fn resolve_unknown_returns_none() {
        assert!(resolve("v100").is_none());
        assert!(resolve("tpu-v5").is_none());
        assert!(resolve("").is_none());
    }

    #[test]
    fn a100_variants() {
        let a80 = resolve("a100-80gb").unwrap();
        let a40 = resolve("a100-40gb").unwrap();
        assert!((a80.hbm_bytes - 80e9).abs() < 1e6);
        assert!((a40.hbm_bytes - 40e9).abs() < 1e6);
        assert!(a80.gpu_mem_bw > a40.gpu_mem_bw);
    }

    #[test]
    fn scaling_is_linear() {
        let s1 = resolve("h100").unwrap();
        let s4 = resolve("4xh100").unwrap();
        let s8 = resolve("8xh100").unwrap();
        assert!((s4.gpu_flops - s1.gpu_flops * 4.0).abs() < 1.0);
        assert!((s8.gpu_flops - s1.gpu_flops * 8.0).abs() < 1.0);
        assert!((s4.gpu_mem_bw - s1.gpu_mem_bw * 4.0).abs() < 1.0);
        assert!((s8.hbm_bytes - s1.hbm_bytes * 8.0).abs() < 1.0);
    }
}
diff --git a/src/hf_config.rs b/src/hf_config.rs
new file mode 100644
index 0000000..8777751
--- /dev/null
+++ b/src/hf_config.rs
@@ -0,0 +1,193 @@
//! Parse a HuggingFace `config.json` into [`ModelConfig`] fields.
//!
//! Handles common architectures: standard transformer, GQA, MoE, MLA
//! (Multi-head Latent Attention), and DSA (DeepSeek Sparse Attention).

use anyhow::{Context, Result};
use serde_json::Value;
use std::path::Path;

use crate::config::{AttentionConfig, MlaConfig, MoeConfig, ModelConfig};

/// Parse a HuggingFace config.json and return a partially-populated
/// [`ModelConfig`]. The caller must still set `dtype_bytes` and
/// `block_size_tokens` (not part of the HF schema).
+pub fn parse(path: &Path) -> Result { + let raw = std::fs::read_to_string(path) + .with_context(|| format!("reading config.json at {}", path.display()))?; + let v: Value = serde_json::from_str(&raw) + .with_context(|| format!("parsing config.json at {}", path.display()))?; + parse_value(&v) +} + +fn u32_field(v: &Value, key: &str) -> Option { + v.get(key).and_then(|x| x.as_u64()).map(|x| x as u32) +} + +fn parse_value(v: &Value) -> Result { + let name = v + .get("model_type") + .and_then(|x| x.as_str()) + .unwrap_or("unknown") + .to_string(); + + let num_layers = u32_field(v, "num_hidden_layers"); + let hidden_size = u32_field(v, "hidden_size"); + let num_attention_heads = u32_field(v, "num_attention_heads"); + let num_kv_heads = u32_field(v, "num_key_value_heads") + .or(num_attention_heads); // default to MHA + let head_dim = u32_field(v, "head_dim").or_else(|| { + // Infer: hidden_size / num_attention_heads + match (hidden_size, num_attention_heads) { + (Some(h), Some(n)) if n > 0 => Some(h / n), + _ => None, + } + }); + let intermediate_size = u32_field(v, "intermediate_size"); + + // --- MoE detection --- + let moe = u32_field(v, "n_routed_experts") + .or_else(|| u32_field(v, "num_local_experts")) + .or_else(|| u32_field(v, "num_experts")) + .map(|num_experts| MoeConfig { + num_experts, + num_active_experts: u32_field(v, "num_experts_per_tok") + .or_else(|| u32_field(v, "num_experts_per_topk")) + .unwrap_or(2), + num_shared_experts: u32_field(v, "n_shared_experts").unwrap_or(0), + expert_intermediate_size: u32_field(v, "moe_intermediate_size"), + }); + + // --- MLA detection (kv_lora_rank present → MLA) --- + let mla = u32_field(v, "kv_lora_rank").and_then(|kv_lora_rank| { + Some(MlaConfig { + kv_lora_rank, + q_lora_rank: u32_field(v, "q_lora_rank")?, + qk_nope_head_dim: u32_field(v, "qk_nope_head_dim")?, + qk_rope_head_dim: u32_field(v, "qk_rope_head_dim")?, + v_head_dim: u32_field(v, "v_head_dim")?, + }) + }); + + // --- Attention pattern --- + let attention 
= + if let Some(first_dense) = u32_field(v, "first_k_dense_replace") { + // DSA-style model (GLM-5, DeepSeek-V3). + // dense_window and sparse_stride are typically not in config.json; + // use sensible defaults the user can override in YAML. + Some(AttentionConfig::Dsa { + dense_window: 4096, + sparse_stride: 8, + first_dense_layers: first_dense, + }) + } else if let Some(sw) = v + .get("sliding_window") + .and_then(|x| x.as_u64()) + .map(|x| x as u32) + { + Some(AttentionConfig::SlidingWindow { window_size: sw }) + } else { + None // dense by default + }; + + Ok(ModelConfig { + name, + num_layers: num_layers.unwrap_or(0), + num_kv_heads: num_kv_heads.unwrap_or(0), + head_dim: head_dim.unwrap_or(0), + hidden_size, + num_attention_heads, + intermediate_size, + moe, + mla, + attention, + // Deployment fields: must come from YAML + dtype_bytes: 0, + block_size_tokens: 0, + ..Default::default() + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_dense_model() { + let json = serde_json::json!({ + "model_type": "qwen2", + "num_hidden_layers": 28, + "hidden_size": 3584, + "num_attention_heads": 28, + "num_key_value_heads": 4, + "intermediate_size": 18944, + }); + let m = parse_value(&json).unwrap(); + assert_eq!(m.num_layers, 28); + assert_eq!(m.hidden_size, Some(3584)); + assert_eq!(m.num_kv_heads, 4); + assert_eq!(m.head_dim, 128); // 3584 / 28 + assert!(m.moe.is_none()); + assert!(m.mla.is_none()); + assert!(m.attention.is_none()); + } + + #[test] + fn parse_qwen3_moe() { + let json = serde_json::json!({ + "model_type": "qwen3_moe", + "num_hidden_layers": 62, + "hidden_size": 6144, + "num_attention_heads": 96, + "num_key_value_heads": 8, + "head_dim": 128, + "intermediate_size": 8192, + "num_experts": 160, + "num_experts_per_tok": 8, + "moe_intermediate_size": 2560, + }); + let m = parse_value(&json).unwrap(); + assert_eq!(m.num_layers, 62); + assert_eq!(m.num_kv_heads, 8); + assert_eq!(m.head_dim, 128); + let moe = m.moe.as_ref().unwrap(); + 
assert_eq!(moe.num_experts, 160); + assert_eq!(moe.num_active_experts, 8); + assert_eq!(moe.expert_intermediate_size, Some(2560)); + assert_eq!(moe.num_shared_experts, 0); + assert!(m.mla.is_none()); + assert!(m.attention.is_none()); + } + + #[test] + fn parse_moe_mla_dsa() { + let json = serde_json::json!({ + "model_type": "glm_moe_dsa", + "num_hidden_layers": 78, + "hidden_size": 6144, + "num_attention_heads": 64, + "num_key_value_heads": 64, + "head_dim": 64, + "intermediate_size": 12288, + "n_routed_experts": 256, + "num_experts_per_tok": 8, + "n_shared_experts": 1, + "moe_intermediate_size": 2048, + "kv_lora_rank": 512, + "q_lora_rank": 2048, + "qk_nope_head_dim": 192, + "qk_rope_head_dim": 64, + "v_head_dim": 256, + "first_k_dense_replace": 3, + }); + let m = parse_value(&json).unwrap(); + assert_eq!(m.num_layers, 78); + assert_eq!(m.head_dim, 64); + let moe = m.moe.as_ref().unwrap(); + assert_eq!(moe.num_experts, 256); + assert_eq!(moe.num_active_experts, 8); + let mla = m.mla.as_ref().unwrap(); + assert_eq!(mla.kv_lora_rank, 512); + assert!(matches!(m.attention, Some(AttentionConfig::Dsa { first_dense_layers: 3, .. }))); + } +} diff --git a/src/instance/compute.rs b/src/instance/compute.rs new file mode 100644 index 0000000..41f6aab --- /dev/null +++ b/src/instance/compute.rs @@ -0,0 +1,405 @@ +//! Roofline cost model for prefill (PD disaggregation — decode not modeled). +//! +//! Two construction modes: +//! +//! **Architecture-derived** (`ModelConfig.hidden_size` present): +//! All FLOPs, attention coefficients, and weight-stream costs are computed +//! from the model shape. Handles standard / GQA / MLA attention projections, +//! MoE routing, and DSA / sliding-window sub-quadratic attention patterns. +//! +//! **Legacy manual** (`hidden_size` absent): uses the raw +//! `flops_per_token_prefill` + `attn_quadratic_coeff` scalars from the YAML. +//! +//! ```text +//! prefill_time(N) = max(compute_time(N), mem_time) +//! +//! 
compute_time = sum over layers of: +//! (N * linear_flops + attn_coeff * N * effective_ctx(N)) / gpu_flops +//! +//! mem_time = num_layers * weight_bytes_per_layer / gpu_mem_bw +//! ``` +//! +//! `effective_ctx(N)` equals `N` for dense attention (→ O(N²) total) but +//! is sub-linear for DSA / sliding-window. + +use crate::config::{AttentionConfig, HardwareConfig, ModelConfig}; + +/// Resolved attention pattern used at runtime. +#[derive(Debug, Clone)] +pub enum AttentionPattern { + /// Full quadratic: effective_ctx = N. + Dense, + /// Sliding window: effective_ctx = min(N, window). + SlidingWindow { window: f64 }, + /// DeepSeek Sparse Attention: effective_ctx = min(N, dense_window) + + /// max(0, N - dense_window) / sparse_stride. + Dsa { dense_window: f64, sparse_stride: f64 }, +} + +#[derive(Debug, Clone)] +pub struct ComputeModel { + /// Total transformer layers. + pub num_layers: f64, + /// How many initial layers use dense attention (rest use `attn_pattern`). + /// For `Dense` pattern this equals `num_layers`. + pub first_dense_layers: f64, + /// Non-attention FLOPs per token per layer (QKV proj + output proj + MLP). + pub linear_flops_per_token: f64, + /// Attention score coefficient: per-layer attention FLOPs = + /// `attn_coeff * N * effective_ctx(N)`. + pub attn_coeff: f64, + /// Attention pattern for non-dense layers. + pub attn_pattern: AttentionPattern, + /// Weight bytes read from HBM per layer (for memory-bound check). + pub weight_bytes_per_layer: f64, + /// Peak GPU FLOPs (aggregate across TP group). + pub gpu_flops: f64, + /// Peak GPU memory bandwidth (aggregate across TP group). 
+ pub gpu_mem_bw: f64, +} + +impl ComputeModel { + pub fn new(model: &ModelConfig, hw: &HardwareConfig) -> Self { + if model.is_arch_mode() { + Self::from_arch(model, hw) + } else { + Self::from_manual(model, hw) + } + } + + // ----- Architecture-derived construction -------------------------------- + + fn from_arch(model: &ModelConfig, hw: &HardwareConfig) -> Self { + let h = model.hidden_size.unwrap() as f64; + let n_heads = model.num_attention_heads.unwrap_or(model.num_kv_heads) as f64; + let n_kv = model.num_kv_heads as f64; + let hd = model.head_dim as f64; + let inter = model.intermediate_size.unwrap_or(0) as f64; + let dtype = model.dtype_bytes as f64; + + // --- Attention linear FLOPs/token/layer --- + let attn_linear = if let Some(mla) = &model.mla { + let qlr = mla.q_lora_rank as f64; + let kvlr = mla.kv_lora_rank as f64; + let qk_hd = (mla.qk_nope_head_dim + mla.qk_rope_head_dim) as f64; + let qk_rd = mla.qk_rope_head_dim as f64; + let vhd = mla.v_head_dim as f64; + // Q: down-project + up-project + let q = 2.0 * h * qlr + 2.0 * qlr * n_heads * qk_hd; + // KV: down-project (compressed latent + RoPE key) + let kv = 2.0 * h * (kvlr + qk_rd); + // Output: up-project + let o = 2.0 * n_heads * vhd * h; + q + kv + o + } else { + // Standard / GQA + let qkv = 2.0 * h * (n_heads + 2.0 * n_kv) * hd; + let o = 2.0 * n_heads * hd * h; + qkv + o + }; + + // --- MLP FLOPs/token/layer (SwiGLU: gate + up + down = 3 matmuls) --- + let mlp = if let Some(moe) = &model.moe { + let expert_inter = moe.expert_intermediate_size + .unwrap_or(model.intermediate_size.unwrap_or(0)) as f64; + let active = moe.num_active_experts as f64; + let shared = moe.num_shared_experts as f64; + active * 6.0 * h * expert_inter + shared * 6.0 * h * inter + } else { + 6.0 * h * inter + }; + + let linear_flops = attn_linear + mlp; + + // --- Attention quadratic coefficient --- + // attn_flops_per_layer(N) = attn_coeff * N * effective_ctx(N) + let attn_coeff = if let Some(mla) = &model.mla { + let 
kvlr = mla.kv_lora_rank as f64; + let qk_rd = mla.qk_rope_head_dim as f64; + // Absorbed QK^T: each head dots over (kv_lora_rank + qk_rope_head_dim) dims. + // Absorbed V: each head dots over kv_lora_rank dims. + 2.0 * n_heads * (2.0 * kvlr + qk_rd) + } else { + // Standard: QK^T + attn@V, each 2 * n_heads * head_dim per pair. + 4.0 * n_heads * hd + }; + + // --- Weight bytes per layer (active params only for MoE) --- + let attn_wt = if let Some(mla) = &model.mla { + let qlr = mla.q_lora_rank as f64; + let kvlr = mla.kv_lora_rank as f64; + let qk_hd = (mla.qk_nope_head_dim + mla.qk_rope_head_dim) as f64; + let qk_rd = mla.qk_rope_head_dim as f64; + let vhd = mla.v_head_dim as f64; + (h * qlr + qlr * n_heads * qk_hd + + h * (kvlr + qk_rd) + + n_heads * vhd * h) + * dtype + } else { + ((n_heads + 2.0 * n_kv) * hd * h + n_heads * hd * h) * dtype + }; + let mlp_wt = if let Some(moe) = &model.moe { + let expert_inter = moe.expert_intermediate_size + .unwrap_or(model.intermediate_size.unwrap_or(0)) as f64; + let active = moe.num_active_experts as f64; + let shared = moe.num_shared_experts as f64; + (active * 3.0 * h * expert_inter + shared * 3.0 * h * inter) * dtype + } else { + 3.0 * h * inter * dtype + }; + let weight_bytes = attn_wt + mlp_wt; + + // --- Attention pattern --- + let (attn_pattern, first_dense) = match &model.attention { + Some(AttentionConfig::Dsa { + dense_window, + sparse_stride, + first_dense_layers, + }) => ( + AttentionPattern::Dsa { + dense_window: *dense_window as f64, + sparse_stride: *sparse_stride as f64, + }, + *first_dense_layers as f64, + ), + Some(AttentionConfig::SlidingWindow { window_size }) => ( + AttentionPattern::SlidingWindow { + window: *window_size as f64, + }, + 0.0, + ), + Some(AttentionConfig::Dense) | None => ( + AttentionPattern::Dense, + model.num_layers as f64, + ), + }; + + Self { + num_layers: model.num_layers as f64, + first_dense_layers: first_dense, + linear_flops_per_token: linear_flops, + attn_coeff, + attn_pattern, 
            weight_bytes_per_layer: weight_bytes,
            gpu_flops: hw.gpu_flops,
            gpu_mem_bw: hw.gpu_mem_bw,
        }
    }

    // ----- Legacy manual construction ---------------------------------------

    /// Build a model from explicitly supplied coefficients instead of deriving
    /// them from the architecture. Attention pattern is forced to dense and
    /// weight streaming is disabled (`weight_bytes_per_layer = 0`), so only
    /// the compute term of `prefill_time` is active in this mode.
    fn from_manual(model: &ModelConfig, hw: &HardwareConfig) -> Self {
        Self {
            num_layers: model.num_layers as f64,
            first_dense_layers: model.num_layers as f64,
            linear_flops_per_token: model.flops_per_token_prefill.unwrap_or(0.0),
            attn_coeff: model.attn_quadratic_coeff.unwrap_or(0.0),
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 0.0,
            gpu_flops: hw.gpu_flops,
            gpu_mem_bw: hw.gpu_mem_bw,
        }
    }

    // ----- Prefill time -----------------------------------------------------

    /// Effective context length a single token attends to at sequence length N.
    ///
    /// Dense layers (and the dense pattern) attend to all N tokens; sliding
    /// window caps at the window size; DSA attends densely inside
    /// `dense_window` and then to every `sparse_stride`-th token beyond it.
    fn effective_ctx(&self, n: f64, dense_layer: bool) -> f64 {
        if dense_layer {
            return n;
        }
        match &self.attn_pattern {
            AttentionPattern::Dense => n,
            AttentionPattern::SlidingWindow { window } => n.min(*window),
            AttentionPattern::Dsa {
                dense_window,
                sparse_stride,
            } => {
                if n <= *dense_window {
                    n
                } else {
                    *dense_window + (n - *dense_window) / *sparse_stride
                }
            }
        }
    }

    /// Time (s) to prefill `n` tokens.
    ///
    /// Roofline model: per-layer compute (linear + attention) summed over the
    /// dense and sparse layer groups, compared against the time to stream all
    /// layers' active weights from HBM once; the slower of the two wins.
    ///
    /// NOTE(review): `linear` is multiplied by the per-layer counts below, so
    /// `linear_flops_per_token` must be a *per-layer* figure; `from_manual`
    /// feeds it `flops_per_token_prefill` — confirm that config field is
    /// per-layer rather than whole-model, otherwise legacy mode overcounts
    /// by `num_layers`×.
    pub fn prefill_time(&self, n: u32) -> f64 {
        if n == 0 {
            return 0.0;
        }
        let n = n as f64;
        let linear = n * self.linear_flops_per_token;

        // Compute FLOPs across all layers (dense + sparse may differ).
        let dense_layers = self.first_dense_layers;
        let sparse_layers = self.num_layers - dense_layers;

        let dense_flops = dense_layers
            * (linear + self.attn_coeff * n * self.effective_ctx(n, true));
        let sparse_flops = sparse_layers
            * (linear + self.attn_coeff * n * self.effective_ctx(n, false));
        let total_flops = dense_flops + sparse_flops;

        let compute_time = total_flops / self.gpu_flops;
        // Weight stream: all layers' active weights read once from HBM.
        let mem_time = self.weight_bytes_per_layer * self.num_layers / self.gpu_mem_bw;

        compute_time.max(mem_time)
    }

    /// Print human-readable derived coefficients (for `validate` output).
    pub fn describe(&self) -> String {
        let pattern_str = match &self.attn_pattern {
            AttentionPattern::Dense => "dense".to_string(),
            AttentionPattern::SlidingWindow { window } => format!("sliding_window({})", *window as u64),
            AttentionPattern::Dsa {
                dense_window,
                sparse_stride,
            } => format!(
                "dsa(window={}, stride={}, {} dense layers)",
                *dense_window as u64, *sparse_stride as u64, self.first_dense_layers as u64
            ),
        };
        format!(
            "linear_flops/tok/layer={:.3e}, attn_coeff={:.0}, pattern={}, \
             weight_bytes/layer={:.2e}",
            self.linear_flops_per_token, self.attn_coeff, pattern_str,
            self.weight_bytes_per_layer,
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // A small dense legacy-mode model used by several tests below.
    fn cm_legacy() -> ComputeModel {
        ComputeModel {
            num_layers: 28.0,
            first_dense_layers: 28.0,
            linear_flops_per_token: 1.4e10,
            attn_coeff: 1024.0,
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 0.0,
            gpu_flops: 9.89e14,
            gpu_mem_bw: 3.35e12,
        }
    }

    #[test]
    fn prefill_monotonic_in_n() {
        let m = cm_legacy();
        let mut prev = 0.0;
        for &n in &[1u32, 8, 64, 512, 4096, 32768] {
            let t = m.prefill_time(n);
            assert!(t > prev, "prefill_time should be monotonic; n={n} t={t}");
            prev = t;
        }
    }

    #[test]
    fn quadratic_dominates_for_long_prompt() {
        // A 32x longer prompt must cost more than 32x (superlinear attention).
        let m = cm_legacy();
        let lin = m.prefill_time(1024);
        let big = m.prefill_time(32768);
        assert!(big / lin > 32.0);
    }

    #[test]
    fn zero_tokens_is_free() {
        let m = cm_legacy();
        assert_eq!(m.prefill_time(0), 0.0);
    }

    #[test]
    fn dsa_subquadratic() {
        // With DSA (window=4096, stride=8) the cost at 128k should be
        // MUCH less than pure quadratic.
        let dense = ComputeModel {
            num_layers: 78.0,
            first_dense_layers: 78.0,
            linear_flops_per_token: 1.0e9,
            attn_coeff: 139264.0,
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 0.0,
            gpu_flops: 1.8e16,
            gpu_mem_bw: 6.4e13,
        };
        let dsa = ComputeModel {
            attn_pattern: AttentionPattern::Dsa {
                dense_window: 4096.0,
                sparse_stride: 8.0,
            },
            first_dense_layers: 3.0,
            ..dense.clone()
        };
        let n = 131072; // 128k tokens
        let t_dense = dense.prefill_time(n);
        let t_dsa = dsa.prefill_time(n);
        // DSA should be dramatically cheaper at long context.
        assert!(
            t_dsa < t_dense * 0.3,
            "DSA should be <30% of dense at 128k: dense={t_dense:.3} dsa={t_dsa:.3}"
        );
        // But still monotonic.
        assert!(t_dsa > dsa.prefill_time(n / 2));
    }

    #[test]
    fn mem_bound_short_prefill() {
        // With very heavy weights and a short prompt, memory should dominate.
        let m = ComputeModel {
            num_layers: 10.0,
            first_dense_layers: 10.0,
            linear_flops_per_token: 1.0e6, // tiny compute
            attn_coeff: 1.0,
            attn_pattern: AttentionPattern::Dense,
            weight_bytes_per_layer: 1.0e12, // 1 TB per layer
            gpu_flops: 1.0e15,
            gpu_mem_bw: 1.0e12,
        };
        let t1 = m.prefill_time(1);
        let t8 = m.prefill_time(8);
        // Memory time = 10 * 1e12 / 1e12 = 10s, should dominate.
        assert!((t1 - 10.0).abs() < 0.01);
        // Doubling tokens shouldn't change time much (mem-bound).
        assert!((t8 - t1).abs() / t1 < 0.01);
    }

    #[test]
    fn arch_derives_from_model_config() {
        // Minimal dense model: verify from_arch produces something sensible.
        let model = ModelConfig {
            name: "test".into(),
            num_layers: 4,
            num_kv_heads: 2,
            head_dim: 64,
            dtype_bytes: 2,
            block_size_tokens: 16,
            hidden_size: Some(256),
            num_attention_heads: Some(4),
            intermediate_size: Some(512),
            ..Default::default()
        };
        let hw = HardwareConfig {
            gpu_flops: 1e14,
            gpu_mem_bw: 1e12,
            hbm_bytes: 1e9,
            dram_bytes: 4e9,
            pcie_bw: 32e9,
            pcie_latency_us: 1.0,
            rdma_bw: 12e9,
            rdma_latency_us: 5.0,
            max_batch_slots: 32,
            prefill_chunk_tokens: 1024,
        };
        let cm = ComputeModel::new(&model, &hw);
        assert!(cm.linear_flops_per_token > 0.0);
        assert!(cm.attn_coeff > 0.0);
        assert!(cm.weight_bytes_per_layer > 0.0);
        let t = cm.prefill_time(1024);
        assert!(t > 0.0);
    }
}
diff --git a/src/instance/instance.rs b/src/instance/instance.rs
new file mode 100644
index 0000000..c694d3c
--- /dev/null
+++ b/src/instance/instance.rs
@@ -0,0 +1,191 @@
//! One simulated **prefill** serving instance.
//!
//! This simulator assumes **PD (prefill/decode) disaggregation**: prefill
//! and decode run on dedicated instance pools, and only prefill instances
//! are modeled here. The decode side is invisible to the KV-cache-aware
//! routing problem we are studying — once prefill finishes, the KV cache is
//! shipped to a decode instance via a separate (out-of-scope) path.
//!
//! As a result this `Instance`:
//! * Owns a two-tier KV cache (L0 = HBM, L1 = DRAM / v6d) used only for
//!   prefill prefix reuse.
//! * Owns the PCIe / RDMA links used for cache fetches.
//! * Runs a simple FCFS chunked-prefill scheduler: one request's prefill
//!   chunk per step, up to `prefill_chunk_tokens` per chunk.
//!
//! The cluster (`crate::cluster`) is responsible for routing arrivals,
//! consulting the global meta store, and inserting fetched blocks into the
//! instance's caches before handing the request off via `admit`.
+ +use std::collections::VecDeque; + +use crate::config::{HardwareConfig, ModelConfig}; +use crate::instance::compute::ComputeModel; +use crate::instance::kv_cache::TwoTierCache; +use crate::network::InstanceLinks; +use crate::types::{InstanceId, ReqId}; + +#[derive(Debug, Clone)] +pub struct AdmittedRequest { + pub req_id: ReqId, + pub arrival: f64, + /// Earliest time at which the KV fetch chain (L1 + RDMA + PCIe) for this + /// request has completed, so its prefill compute can begin. + pub ready_at: f64, + /// Tokens still needing prefill compute (after cache hits accounted for). + pub prefill_tokens_remaining: u32, + /// KV blocks reserved on this instance's HBM for the lifetime of this + /// request's prefill (= number of input blocks). + pub reserved_blocks: u32, +} + +#[derive(Debug)] +pub struct StepResult { + pub completed: Vec<(ReqId, f64, f64)>, // (req_id, ttft, end_time) + pub next_tick: Option, +} + +pub struct Instance { + pub id: InstanceId, + pub cache: TwoTierCache, + pub links: InstanceLinks, + pub compute: ComputeModel, + pub block_size_tokens: u32, + pub hbm_block_budget: u32, + pub dram_block_budget: u32, + pub max_batch_slots: u32, + pub prefill_chunk_tokens: u32, + + pub kv_blocks_used: u32, + + /// Admitted but not yet ready (waiting for fetch chain to land). + pending: VecDeque, + /// Ready and currently being prefilled (FCFS, one at a time per step). + prefilling: VecDeque, + + /// True if a BatchTick is already on the global queue for us. 
+ pub tick_scheduled: bool, +} + +impl Instance { + pub fn new(id: InstanceId, model: &ModelConfig, hw: &HardwareConfig) -> Self { + let block_bytes = model.kv_block_bytes() as f64; + let hbm_blocks = (hw.hbm_bytes / block_bytes).max(1.0) as u32; + let dram_blocks = (hw.dram_bytes / block_bytes).max(1.0) as u32; + Self { + id, + cache: TwoTierCache::new(hbm_blocks as usize, dram_blocks as usize), + links: InstanceLinks::from_hw(hw), + compute: ComputeModel::new(model, hw), + block_size_tokens: model.block_size_tokens, + hbm_block_budget: hbm_blocks, + dram_block_budget: dram_blocks, + max_batch_slots: hw.max_batch_slots, + prefill_chunk_tokens: hw.prefill_chunk_tokens, + kv_blocks_used: 0, + pending: VecDeque::new(), + prefilling: VecDeque::new(), + tick_scheduled: false, + } + } + + pub fn queue_len(&self) -> u32 { + (self.pending.len() + self.prefilling.len()) as u32 + } + + /// Total prefill tokens remaining across all pending and prefilling requests. + pub fn waiting_tokens(&self) -> u64 { + self.pending + .iter() + .chain(self.prefilling.iter()) + .map(|r| r.prefill_tokens_remaining as u64) + .sum() + } + + /// Estimated wall-clock time to drain all currently queued requests. + /// + /// Sums `compute.prefill_time(tokens_j)` for each queued request, + /// capturing the non-linear (quadratic / DSA) cost accurately. + pub fn estimated_drain_time(&self) -> f64 { + self.pending + .iter() + .chain(self.prefilling.iter()) + .map(|r| self.compute.prefill_time(r.prefill_tokens_remaining)) + .sum() + } + + pub fn admit(&mut self, req: AdmittedRequest) { + self.pending.push_back(req); + } + + /// Run one batch step. Returns any requests that finished prefill during + /// this step plus the next wakeup time for the instance. + pub fn step(&mut self, now: f64) -> StepResult { + let mut completed = Vec::new(); + + // 1. Drain ready pending requests into prefilling, respecting KV + // budget and slot cap. 
A request whose fetch chain is complete + // *and* has zero prefill tokens (full cache hit) finishes + // immediately at `now`. + while let Some(front) = self.pending.front() { + if front.ready_at > now { + break; + } + if self.prefilling.len() as u32 >= self.max_batch_slots { + break; + } + if self.kv_blocks_used + front.reserved_blocks > self.hbm_block_budget { + break; + } + let r = self.pending.pop_front().unwrap(); + self.kv_blocks_used += r.reserved_blocks; + if r.prefill_tokens_remaining == 0 { + // Full cache hit: nothing to compute. TTFT == fetch time. + let ttft = now - r.arrival; + self.kv_blocks_used = self.kv_blocks_used.saturating_sub(r.reserved_blocks); + completed.push((r.req_id, ttft, now)); + } else { + self.prefilling.push_back(r); + } + } + + // 2. Run one chunked-prefill step on the head of `prefilling`. + let chunk_tokens = self + .prefilling + .front() + .map(|r| r.prefill_tokens_remaining.min(self.prefill_chunk_tokens)) + .unwrap_or(0); + + if chunk_tokens == 0 { + // Nothing compute-bound in flight right now. + return StepResult { + completed, + next_tick: self.next_wakeup(now), + }; + } + + let dt = self.compute.prefill_time(chunk_tokens); + let t_end = now + dt; + + let head = self.prefilling.front_mut().unwrap(); + head.prefill_tokens_remaining -= chunk_tokens; + if head.prefill_tokens_remaining == 0 { + let done = self.prefilling.pop_front().unwrap(); + let ttft = t_end - done.arrival; + self.kv_blocks_used = self.kv_blocks_used.saturating_sub(done.reserved_blocks); + completed.push((done.req_id, ttft, t_end)); + } + + StepResult { + completed, + next_tick: self.next_wakeup(t_end), + } + } + + fn next_wakeup(&self, after: f64) -> Option { + if !self.prefilling.is_empty() { + return Some(after); + } + self.pending.front().map(|r| r.ready_at.max(after)) + } +} diff --git a/src/instance/kv_cache.rs b/src/instance/kv_cache.rs new file mode 100644 index 0000000..9daaa56 --- /dev/null +++ b/src/instance/kv_cache.rs @@ -0,0 +1,226 @@ +//! 
Two-tier LRU KV cache (L0 = GPU HBM, L1 = CPU DRAM / v6d). +//! +//! Each tier stores block hashes; the unit of accounting is one 16-token +//! block. `longest_prefix` walks the hash slice front-to-back and returns the +//! count of leading blocks present in the tier (and touches them so they +//! stay hot). +//! +//! On insert, evicted block hashes are returned so the caller (instance) can +//! propagate them to the global meta store if desired. + +use ahash::AHashMap; + +/// Doubly-linked-list-backed LRU keyed by block hash. +#[derive(Debug)] +pub struct LruBlocks { + capacity: usize, + map: AHashMap, + nodes: Vec, + head: Option, // most recently used + tail: Option, // least recently used + free: Vec, +} + +#[derive(Debug, Clone, Copy)] +struct Node { + key: u64, + prev: Option, + next: Option, +} + +impl LruBlocks { + pub fn new(capacity: usize) -> Self { + Self { + capacity, + map: AHashMap::with_capacity(capacity), + nodes: Vec::with_capacity(capacity), + head: None, + tail: None, + free: Vec::new(), + } + } + + pub fn capacity(&self) -> usize { + self.capacity + } + + pub fn len(&self) -> usize { + self.map.len() + } + + pub fn is_empty(&self) -> bool { + self.map.is_empty() + } + + pub fn contains(&self, key: u64) -> bool { + self.map.contains_key(&key) + } + + /// Touch (move to MRU) if present. Returns whether the key was present. + pub fn touch(&mut self, key: u64) -> bool { + if let Some(&idx) = self.map.get(&key) { + self.move_to_head(idx); + true + } else { + false + } + } + + /// Insert blocks; evicted hashes appended to `evicted_out`. Reinserting an + /// existing block just touches it. + pub fn insert_blocks(&mut self, hashes: &[u64], evicted_out: &mut Vec) { + for &h in hashes { + if self.touch(h) { + continue; + } + // need to make room? 
+ if self.map.len() == self.capacity { + if let Some(tail_idx) = self.tail { + let tail_key = self.nodes[tail_idx].key; + self.detach(tail_idx); + self.map.remove(&tail_key); + self.free.push(tail_idx); + evicted_out.push(tail_key); + } + } + // allocate node + let idx = if let Some(i) = self.free.pop() { + self.nodes[i] = Node { key: h, prev: None, next: None }; + i + } else { + let i = self.nodes.len(); + self.nodes.push(Node { key: h, prev: None, next: None }); + i + }; + self.map.insert(h, idx); + self.attach_to_head(idx); + } + } + + /// Longest leading prefix of `hashes` present; touches the matched blocks. + pub fn longest_prefix(&mut self, hashes: &[u64]) -> usize { + let mut n = 0usize; + for &h in hashes { + if !self.touch(h) { + break; + } + n += 1; + } + n + } + + /// Read-only longest prefix without LRU updates (used for routing probes). + pub fn longest_prefix_peek(&self, hashes: &[u64]) -> usize { + let mut n = 0usize; + for &h in hashes { + if !self.map.contains_key(&h) { + break; + } + n += 1; + } + n + } + + fn move_to_head(&mut self, idx: usize) { + if Some(idx) == self.head { + return; + } + self.detach(idx); + self.attach_to_head(idx); + } + + fn detach(&mut self, idx: usize) { + let (prev, next) = { + let n = &self.nodes[idx]; + (n.prev, n.next) + }; + if let Some(p) = prev { + self.nodes[p].next = next; + } else { + // it was the head + self.head = next; + } + if let Some(nx) = next { + self.nodes[nx].prev = prev; + } else { + // it was the tail + self.tail = prev; + } + self.nodes[idx].prev = None; + self.nodes[idx].next = None; + } + + fn attach_to_head(&mut self, idx: usize) { + let old_head = self.head; + self.nodes[idx].prev = None; + self.nodes[idx].next = old_head; + if let Some(h) = old_head { + self.nodes[h].prev = Some(idx); + } + self.head = Some(idx); + if self.tail.is_none() { + self.tail = Some(idx); + } + } +} + +/// Two-tier (HBM, DRAM) cache. 
#[derive(Debug)]
pub struct TwoTierCache {
    // L0 = GPU HBM tier.
    pub l0: LruBlocks,
    // L1 = CPU DRAM tier.
    pub l1: LruBlocks,
}

impl TwoTierCache {
    /// Build both tiers with their respective capacities (in blocks).
    pub fn new(l0_cap: usize, l1_cap: usize) -> Self {
        Self {
            l0: LruBlocks::new(l0_cap),
            l1: LruBlocks::new(l1_cap),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lcp_full_partial_empty() {
        let mut c = LruBlocks::new(8);
        let mut ev = Vec::new();
        c.insert_blocks(&[1, 2, 3, 4], &mut ev);
        assert_eq!(c.longest_prefix(&[1, 2, 3, 4, 5, 6]), 4);
        assert_eq!(c.longest_prefix(&[1, 2, 9]), 2);
        assert_eq!(c.longest_prefix(&[99, 1]), 0);
    }

    #[test]
    fn lru_eviction_order() {
        let mut c = LruBlocks::new(3);
        let mut ev = Vec::new();
        c.insert_blocks(&[1, 2, 3], &mut ev);
        assert!(ev.is_empty());
        // touch 1 -> MRU
        c.touch(1);
        // insert 4 -> evicts LRU which should be 2
        c.insert_blocks(&[4], &mut ev);
        assert_eq!(ev, vec![2]);
        assert!(c.contains(1));
        assert!(c.contains(3));
        assert!(c.contains(4));
        assert!(!c.contains(2));
    }

    #[test]
    fn longest_prefix_touches_blocks() {
        let mut c = LruBlocks::new(3);
        let mut ev = Vec::new();
        c.insert_blocks(&[1, 2, 3], &mut ev);
        // touch 1 via prefix lookup (only the first matching block: 1)
        assert_eq!(c.longest_prefix(&[1, 99]), 1);
        // now insert 4 -> LRU should be 2 (since 3 was just inserted MRU after 2,
        // 1 is freshest, then 3, then 2)
        c.insert_blocks(&[4], &mut ev);
        assert_eq!(ev, vec![2]);
    }
}
diff --git a/src/instance/mod.rs b/src/instance/mod.rs
new file mode 100644
index 0000000..8cbb697
--- /dev/null
+++ b/src/instance/mod.rs
@@ -0,0 +1,6 @@
pub mod compute;
pub mod kv_cache;
#[allow(clippy::module_inception)]
pub mod instance;

pub use instance::Instance;
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..36f9e98
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,13 @@
pub mod cluster;
pub mod config;
pub mod driver;
pub mod hardware_presets;
pub mod hf_config;
pub mod instance;
pub mod metrics;
pub
mod network;
pub mod oracle;
pub mod router;
pub mod sim;
pub mod trace;
pub mod types;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..14289b2
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,271 @@
use anyhow::{Context, Result};
use clap::{Args, Parser, Subcommand};
use std::path::PathBuf;

use kvcache_simulator::config::{Config, RouterMode};
use kvcache_simulator::{driver, oracle, trace::TraceReader};

// Top-level CLI: one subcommand per workflow (run / ablate / validate / oracle).
#[derive(Debug, Parser)]
#[command(name = "kvcache-sim", about = "Cluster-level KV cache simulator")]
struct Cli {
    #[command(subcommand)]
    cmd: Cmd,
}

/// Optional CLI overrides applied on top of the YAML config so the same
/// config can be reused across sweeps without editing the file.
#[derive(Debug, Args, Clone, Default)]
struct ConfigOverrides {
    /// Override `cluster.num_instances`.
    #[arg(long)]
    num_instances: Option,
    /// Override `sim.max_requests` (cap on processed trace records).
    #[arg(long)]
    max_requests: Option,
    /// Override `sim.trace_path`.
    #[arg(long)]
    trace: Option,
    /// Override `sim.output_dir`.
    #[arg(long)]
    output_dir: Option,
    /// Override `sim.seed`.
    #[arg(long)]
    seed: Option,
    /// Override `cluster.router.precise_probe_topk`.
    #[arg(long)]
    precise_topk: Option,
    /// Override `cluster.meta_store.ttl_seconds`.
    #[arg(long)]
    ttl_seconds: Option,
}

impl ConfigOverrides {
    // Copy every override that was actually supplied into the parsed config.
    fn apply(&self, cfg: &mut Config) {
        if let Some(n) = self.num_instances {
            cfg.cluster.num_instances = n;
        }
        if let Some(m) = self.max_requests {
            cfg.sim.max_requests = Some(m);
        }
        if let Some(t) = &self.trace {
            cfg.sim.trace_path = t.to_string_lossy().into_owned();
        }
        if let Some(o) = &self.output_dir {
            cfg.sim.output_dir = o.to_string_lossy().into_owned();
        }
        if let Some(s) = self.seed {
            cfg.sim.seed = s;
        }
        if let Some(k) = self.precise_topk {
            cfg.cluster.router.precise_probe_topk = k;
        }
        if let Some(ttl) = self.ttl_seconds {
            cfg.cluster.meta_store.ttl_seconds = ttl;
        }
    }
}

#[derive(Debug, Subcommand)]
enum Cmd {
    /// Run a single simulation with the router specified in the config.
    Run {
        #[arg(short, long)]
        config: PathBuf,
        #[command(flatten)]
        overrides: ConfigOverrides,
    },
    /// Run the same trace under multiple routers and compare summaries.
    Ablate {
        #[arg(short, long)]
        config: PathBuf,
        /// Comma-separated router modes
        #[arg(
            short,
            long,
            default_value = "random,least_loaded,least_tokens,ttl_aware,min_pd,cache_load,cache_score,estimated_ttft,prefix_affinity"
        )]
        routers: String,
        #[command(flatten)]
        overrides: ConfigOverrides,
    },
    /// Parse the config and trace head; do not run a simulation.
    Validate {
        #[arg(short, long)]
        config: PathBuf,
        #[command(flatten)]
        overrides: ConfigOverrides,
    },
    /// Offline oracle analysis: theoretical hit-rate ceilings (unlimited
    /// cache and offline-optimal Belady eviction at finite capacity), plus
    /// LRU at the same capacity for comparison.
    Oracle {
        #[arg(short, long)]
        config: PathBuf,
        #[command(flatten)]
        overrides: ConfigOverrides,
        /// Cache capacity (in 16-token blocks) used for the Belady and LRU
        /// analyses. Defaults to `num_instances * per_instance_HBM_blocks`
        /// (the cluster-aggregate capacity).
        #[arg(long)]
        capacity_blocks: Option,
        /// Use the per-instance HBM block budget instead of the
        /// cluster-aggregate. Mutually exclusive with --capacity-blocks.
        #[arg(long, default_value_t = false)]
        per_instance: bool,
        /// Optional output JSON path. Defaults to `<output_dir>/oracle.json`.
        #[arg(long)]
        out: Option,
    },
}

fn main() -> Result<()> {
    let cli = Cli::parse();
    match cli.cmd {
        Cmd::Run { config, overrides } => cmd_run(&config, &overrides),
        Cmd::Ablate {
            config,
            routers,
            overrides,
        } => cmd_ablate(&config, &routers, &overrides),
        Cmd::Validate { config, overrides } => cmd_validate(&config, &overrides),
        Cmd::Oracle {
            config,
            overrides,
            capacity_blocks,
            per_instance,
            out,
        } => cmd_oracle(&config, &overrides, capacity_blocks, per_instance, out.as_deref()),
    }
}

// Parse the YAML config and layer any CLI overrides on top.
fn load(config: &PathBuf, overrides: &ConfigOverrides) -> Result {
    let mut cfg = Config::from_yaml_path(config)?;
    overrides.apply(&mut cfg);
    Ok(cfg)
}

fn cmd_run(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
    let cfg = load(path, overrides)?;
    let out = driver::run(&cfg, None)?;
    println!("{}", serde_json::to_string_pretty(&out.summary)?);
    Ok(())
}

// Run the identical trace once per router mode; aggregate summaries into
// `<output_dir>/ablation.json` and echo them to stdout.
fn cmd_ablate(path: &PathBuf, routers: &str, overrides: &ConfigOverrides) -> Result<()> {
    let base = load(path, overrides)?;
    let modes: Vec = routers
        .split(',')
        .map(|s| s.trim())
        .filter(|s| !s.is_empty())
        .map(RouterMode::parse)
        .collect::>>()
        .with_context(|| format!("parsing --routers='{routers}'"))?;
    let mut all = Vec::new();
    for mode in modes {
        let mut cfg = base.clone();
        cfg.cluster.router.mode = mode;
        let sub = mode.as_str().to_string();
        eprintln!("[ablate] running router={}", sub);
        let out = driver::run(&cfg, Some(&sub))?;
        all.push(out.summary);
    }
    let agg_path = std::path::Path::new(&base.sim.output_dir).join("ablation.json");
    std::fs::create_dir_all(&base.sim.output_dir)?;
    std::fs::write(&agg_path, serde_json::to_string_pretty(&all)?)?;
    println!("{}", serde_json::to_string_pretty(&all)?);
    eprintln!("[ablate] wrote {}", agg_path.display());
    Ok(())
}

// Sanity-check a config without simulating: derived compute coefficients,
// cache geometry, sample prefill times, and the first five trace records.
fn cmd_validate(path: &PathBuf, overrides: &ConfigOverrides) -> Result<()> {
    use kvcache_simulator::instance::compute::ComputeModel;
    let cfg = load(path, overrides)?;
    eprintln!("config OK: {}", cfg.model.name);
    eprintln!("mode = {}", if cfg.model.is_arch_mode() { "architecture-derived" } else { "legacy manual" });
    let cm = ComputeModel::new(&cfg.model, &cfg.hardware);
    eprintln!("compute: {}", cm.describe());
    eprintln!("kv_block_bytes = {} ({:.2} MB{})",
        cfg.model.kv_block_bytes(),
        cfg.model.kv_block_bytes() as f64 / 1e6,
        if cfg.model.mla.is_some() { ", MLA compressed" } else { "" },
    );
    let block_bytes = cfg.model.kv_block_bytes() as f64;
    let hbm_blocks = (cfg.hardware.hbm_bytes / block_bytes) as u64;
    let dram_blocks = (cfg.hardware.dram_bytes / block_bytes) as u64;
    eprintln!("per-instance HBM blocks = {hbm_blocks}, DRAM blocks = {dram_blocks}");
    eprintln!("num_instances = {}", cfg.cluster.num_instances);
    // Sample prefill times at a few prompt lengths.
    eprintln!("prefill_time samples:");
    for &n in &[256, 1024, 4096, 16384, 65536, 131072] {
        let t = cm.prefill_time(n);
        eprintln!("  N={n:>7} -> {t:.4} s");
    }
    let reader = TraceReader::open(&cfg.sim.trace_path, Some(5))?;
    for rec in reader {
        let rec = rec?;
        eprintln!(
            "  req {} chat={} t={:.3}s in={} out={} blocks={}",
            rec.req_id,
            rec.chat_id,
            rec.arrival,
            rec.input_len,
            rec.output_len,
            rec.hash_ids.len()
        );
    }
    Ok(())
}

// Load the whole trace and run the offline hit-rate ceilings (unlimited /
// Belady / LRU) at the chosen capacity, writing the JSON result to disk.
fn cmd_oracle(
    path: &PathBuf,
    overrides: &ConfigOverrides,
    capacity_blocks: Option,
    per_instance: bool,
    out_path: Option<&std::path::Path>,
) -> Result<()> {
    let cfg = load(path, overrides)?;
    let block_bytes = cfg.model.kv_block_bytes() as f64;
    let per_instance_blocks = (cfg.hardware.hbm_bytes / block_bytes).max(1.0) as u64;
    let aggregate_blocks = per_instance_blocks * cfg.cluster.num_instances as u64;
    let capacity = match (capacity_blocks, per_instance) {
        (Some(_), true) => {
            return Err(anyhow::anyhow!(
                "--capacity-blocks and --per-instance are mutually exclusive"
            ))
        }
        (Some(c), false) => c,
        (None, true) => per_instance_blocks,
        (None, false) => aggregate_blocks,
    };

    eprintln!(
        "[oracle] loading trace {} (max_requests={:?})",
        cfg.sim.trace_path, cfg.sim.max_requests
    );
    let reader = TraceReader::open(&cfg.sim.trace_path, cfg.sim.max_requests)?;
    let records: Vec<_> = reader.collect::, _>>()?;
    eprintln!(
        "[oracle] loaded {} requests; analyzing with capacity = {} blocks \
         ({} per-instance × {} instances{})",
        records.len(),
        capacity,
        per_instance_blocks,
        cfg.cluster.num_instances,
        if per_instance { ", per-instance mode" } else { "" }
    );

    let result = oracle::analyze(&records, capacity);
    let json = serde_json::to_string_pretty(&result)?;
    println!("{}", json);

    let target = match out_path {
        Some(p) => p.to_path_buf(),
        None => std::path::Path::new(&cfg.sim.output_dir).join("oracle.json"),
    };
    if let Some(parent) = target.parent() {
        std::fs::create_dir_all(parent)?;
    }
    std::fs::write(&target, &json)?;
    eprintln!("[oracle] wrote {}", target.display());
    Ok(())
}
diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs
new file mode 100644
index 0000000..86ea2cf
--- /dev/null
+++ b/src/metrics/mod.rs
@@ -0,0 +1,7 @@
pub mod per_request;
pub mod routing_log;
pub mod summary;
pub mod timeseries;

pub use per_request::PerRequestRow;
pub use summary::Summary;
diff --git a/src/metrics/per_request.rs b/src/metrics/per_request.rs
new file mode 100644
index 0000000..b4ccdd8
--- /dev/null
+++ b/src/metrics/per_request.rs
@@ -0,0 +1,42 @@
use anyhow::Result;
use serde::Serialize;
use std::path::Path;

// One CSV row of per-request metrics (serialized via serde into the writer).
#[derive(Debug, Clone, Serialize)]
pub struct PerRequestRow {
    pub req_id: u64,
    pub arrival: f64,
    pub ttft: f64,
    pub e2e: f64,
    pub instance: u32,
    pub total_blocks: u32,
    pub l0_hit_blocks: u32,
    pub l1_hit_blocks: u32,
    pub remote_hit_blocks: u32,
    pub miss_blocks: u32,
    pub rdma_bytes: u64,
    pub pcie_bytes: u64,
    pub probe_overhead_s: f64,
}

// Thin CSV writer for PerRequestRow records.
pub struct PerRequestWriter {
    inner: csv::Writer,
}

impl PerRequestWriter {
    pub fn create>(path: P) -> Result {
        let f = std::fs::File::create(path)?;
        let inner = csv::Writer::from_writer(f);
        Ok(Self { inner })
    }

    pub fn write(&mut self, row: &PerRequestRow) -> Result<()> {
        self.inner.serialize(row)?;
        Ok(())
    }

    // Consume the writer and flush buffered rows to disk.
    pub fn finish(mut self) -> Result<()> {
        self.inner.flush()?;
        Ok(())
    }
}
diff --git a/src/metrics/routing_log.rs b/src/metrics/routing_log.rs
new file mode 100644
index 0000000..5e3817a
--- /dev/null
+++ b/src/metrics/routing_log.rs
@@ -0,0 +1,29 @@
use anyhow::Result;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;

use crate::router::RouteDecision;

// JSON-lines writer for per-request routing decisions.
pub struct RoutingLogWriter {
    inner: BufWriter,
}

impl RoutingLogWriter {
    pub fn create>(path: P) -> Result {
        let f = File::create(path)?;
        Ok(Self { inner: BufWriter::new(f) })
    }

    pub fn write(&mut
self, decision: &RouteDecision) -> Result<()> {
        // One JSON object per line (JSONL).
        let line = serde_json::to_string(decision)?;
        self.inner.write_all(line.as_bytes())?;
        self.inner.write_all(b"\n")?;
        Ok(())
    }

    pub fn finish(mut self) -> Result<()> {
        self.inner.flush()?;
        Ok(())
    }
}
diff --git a/src/metrics/summary.rs b/src/metrics/summary.rs
new file mode 100644
index 0000000..95d45ca
--- /dev/null
+++ b/src/metrics/summary.rs
@@ -0,0 +1,80 @@
use serde::Serialize;

use crate::metrics::per_request::PerRequestRow;

// Run-level aggregate over all PerRequestRow records: latency percentiles,
// block hit-rate breakdown, and total link traffic.
#[derive(Debug, Clone, Serialize, Default)]
pub struct Summary {
    pub router: String,
    pub num_requests: u64,
    pub sim_duration_s: f64,
    pub throughput_req_per_s: f64,
    pub ttft_mean: f64,
    pub ttft_p50: f64,
    pub ttft_p95: f64,
    pub ttft_p99: f64,
    pub e2e_mean: f64,
    pub e2e_p50: f64,
    pub e2e_p95: f64,
    pub e2e_p99: f64,
    pub total_blocks: u64,
    pub hit_rate_l0: f64,
    pub hit_rate_l1: f64,
    pub hit_rate_remote: f64,
    pub miss_rate: f64,
    pub total_rdma_bytes: u64,
    pub total_pcie_bytes: u64,
}

impl Summary {
    /// Aggregate per-request rows into one summary; an empty slice yields a
    /// zeroed summary carrying only the router name.
    pub fn from_rows(router: &str, rows: &[PerRequestRow], sim_duration_s: f64) -> Self {
        if rows.is_empty() {
            return Summary {
                router: router.to_string(),
                ..Default::default()
            };
        }
        let mut ttfts: Vec = rows.iter().map(|r| r.ttft).collect();
        let mut e2es: Vec = rows.iter().map(|r| r.e2e).collect();
        ttfts.sort_by(|a, b| a.partial_cmp(b).unwrap());
        e2es.sort_by(|a, b| a.partial_cmp(b).unwrap());
        // Nearest-rank percentile on a sorted slice, clamped to the last index.
        let pct = |v: &[f64], q: f64| -> f64 {
            let n = v.len();
            let idx = ((n as f64 - 1.0) * q).round() as usize;
            v[idx.min(n - 1)]
        };
        let mean = |v: &[f64]| -> f64 {
            if v.is_empty() {
                0.0
            } else {
                v.iter().sum::() / v.len() as f64
            }
        };
        let total_blocks: u64 = rows.iter().map(|r| r.total_blocks as u64).sum();
        let l0: u64 = rows.iter().map(|r| r.l0_hit_blocks as u64).sum();
        let l1: u64 = rows.iter().map(|r| r.l1_hit_blocks as u64).sum();
        let remote: u64 = rows.iter().map(|r| r.remote_hit_blocks as u64).sum();
        let miss: u64 = rows.iter().map(|r| r.miss_blocks as u64).sum();
        // Guard against division by zero when no blocks were recorded.
        let denom = total_blocks.max(1) as f64;
        Summary {
            router: router.to_string(),
            num_requests: rows.len() as u64,
            sim_duration_s,
            throughput_req_per_s: rows.len() as f64 / sim_duration_s.max(1e-9),
            ttft_mean: mean(&ttfts),
            ttft_p50: pct(&ttfts, 0.50),
            ttft_p95: pct(&ttfts, 0.95),
            ttft_p99: pct(&ttfts, 0.99),
            e2e_mean: mean(&e2es),
            e2e_p50: pct(&e2es, 0.50),
            e2e_p95: pct(&e2es, 0.95),
            e2e_p99: pct(&e2es, 0.99),
            total_blocks,
            hit_rate_l0: l0 as f64 / denom,
            hit_rate_l1: l1 as f64 / denom,
            hit_rate_remote: remote as f64 / denom,
            miss_rate: miss as f64 / denom,
            total_rdma_bytes: rows.iter().map(|r| r.rdma_bytes).sum(),
            total_pcie_bytes: rows.iter().map(|r| r.pcie_bytes).sum(),
        }
    }
}
diff --git a/src/metrics/timeseries.rs b/src/metrics/timeseries.rs
new file mode 100644
index 0000000..327cc0c
--- /dev/null
+++ b/src/metrics/timeseries.rs
@@ -0,0 +1,34 @@
use anyhow::Result;
use serde::Serialize;
use std::path::Path;

// One sampled point of per-instance state (queue depth / KV occupancy).
#[derive(Debug, Clone, Serialize)]
pub struct TimeseriesRow {
    pub t: f64,
    pub instance: u32,
    pub queue_len: u32,
    pub kv_blocks_used: u32,
    pub kv_blocks_total: u32,
    pub busy: u8,
}

// Thin CSV writer for TimeseriesRow records.
pub struct TimeseriesWriter {
    inner: csv::Writer,
}

impl TimeseriesWriter {
    pub fn create>(path: P) -> Result {
        let f = std::fs::File::create(path)?;
        Ok(Self { inner: csv::Writer::from_writer(f) })
    }

    pub fn write(&mut self, row: &TimeseriesRow) -> Result<()> {
        self.inner.serialize(row)?;
        Ok(())
    }

    pub fn finish(mut self) -> Result<()> {
        self.inner.flush()?;
        Ok(())
    }
}
diff --git a/src/network.rs b/src/network.rs
new file mode 100644
index 0000000..0bc1984
--- /dev/null
+++ b/src/network.rs
@@ -0,0 +1,84 @@
//! Network cost models for RDMA (cross-instance) and PCIe (host<->GPU).
//!
//! Each link is modeled as a token bucket via a `next_free` cursor: a fetch of
//! `bytes` starting at `now` waits until `next_free`, then advances the cursor
//! by `bytes / bw`. Latency is added on top of transfer time. This captures
//! contention without simulating individual packets.

use crate::config::HardwareConfig;

// A single serialized link: `next_free` is the absolute time at which the
// link's queue drains.
#[derive(Debug, Clone)]
pub struct LinkModel {
    pub bw_bytes_per_s: f64,
    pub latency_s: f64,
    next_free: f64,
}

impl LinkModel {
    // NOTE(review): a zero `bw_bytes_per_s` makes `reserve`/`cost` divide by
    // zero (infinite times) — confirm configs always supply positive
    // bandwidths.
    pub fn new(bw_bytes_per_s: f64, latency_s: f64) -> Self {
        Self {
            bw_bytes_per_s,
            latency_s,
            next_free: 0.0,
        }
    }

    /// Reserve a transfer of `bytes` starting at `now`. Returns the absolute
    /// time at which the bytes have all arrived (advances internal cursor).
    pub fn reserve(&mut self, now: f64, bytes: u64) -> f64 {
        if bytes == 0 {
            // Zero-byte fetch: pay only latency, and do not occupy the link.
            return now + self.latency_s;
        }
        let xfer = bytes as f64 / self.bw_bytes_per_s;
        let start = self.next_free.max(now);
        self.next_free = start + xfer;
        self.next_free + self.latency_s
    }

    /// Pure cost (no contention): how long to push `bytes` over this link.
    pub fn cost(&self, bytes: u64) -> f64 {
        if bytes == 0 {
            self.latency_s
        } else {
            self.latency_s + bytes as f64 / self.bw_bytes_per_s
        }
    }
}

/// Per-instance bundle of links: PCIe (host<->GPU) and RDMA (host<->remote).
#[derive(Debug, Clone)]
pub struct InstanceLinks {
    pub pcie: LinkModel,
    pub rdma: LinkModel,
}

impl InstanceLinks {
    // Build both links from the hardware config (latencies given in µs).
    pub fn from_hw(hw: &HardwareConfig) -> Self {
        Self {
            pcie: LinkModel::new(hw.pcie_bw, hw.pcie_latency_us * 1e-6),
            rdma: LinkModel::new(hw.rdma_bw, hw.rdma_latency_us * 1e-6),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn link_cost_matches_formula() {
        let l = LinkModel::new(1.0e9, 1.0e-6);
        // 1 GB / (1 GB/s) = 1s, plus 1us latency
        let t = l.cost(1_000_000_000);
        assert!((t - (1.0 + 1e-6)).abs() < 1e-9);
    }

    #[test]
    fn reserve_serializes_concurrent_transfers() {
        let mut l = LinkModel::new(1.0e9, 0.0);
        let t1 = l.reserve(0.0, 500_000_000); // 0.5s
        let t2 = l.reserve(0.0, 500_000_000); // contended -> 1.0s
        assert!((t1 - 0.5).abs() < 1e-9);
        assert!((t2 - 1.0).abs() < 1e-9);
    }
}
diff --git a/src/oracle.rs b/src/oracle.rs
new file mode 100644
index 0000000..75523b6
--- /dev/null
+++ b/src/oracle.rs
@@ -0,0 +1,279 @@
//! Offline oracle analyzers for upper-bound KV-cache hit rates.
//!
//! Two analyses, both treating the cluster as a single aggregated cache so
//! the result is independent of routing — i.e. they answer the question
//! "what is the best the cluster could possibly do?":
//!
//! 1. **Unlimited capacity**: longest-prefix-match against an unbounded
//!    cache. The only misses are blocks that the prefix walk encounters for
//!    the first time. Sets the absolute ceiling.
//!
//! 2. **Belady (offline optimal eviction) at finite capacity**: classic
//!    OPT replacement — evict the cached block whose *next* access is
//!    furthest in the future. Run alongside an LRU baseline at the same
//!    capacity so the gap tells you how much room LRU is leaving.
//!
//! Hit accounting uses prefix-match semantics matching the rest of the
//! simulator: a block at position k in a request counts as a hit only if
//! all positions 0..k are also in the cache.
+ +use ahash::{AHashMap, AHashSet}; +use serde::Serialize; +use std::collections::BinaryHeap; + +use crate::instance::kv_cache::LruBlocks; +use crate::trace::RequestRecord; + +#[derive(Debug, Clone, Serialize)] +pub struct OracleResult { + pub num_requests: u64, + pub total_blocks: u64, + pub unique_blocks: u64, + pub unlimited: TierResult, + pub belady_finite: TierResult, + pub lru_finite: TierResult, +} + +#[derive(Debug, Clone, Serialize, Default)] +pub struct TierResult { + pub label: String, + pub capacity_blocks: u64, + pub hits: u64, + pub misses: u64, + pub hit_rate: f64, +} + +impl TierResult { + fn from_counts(label: &str, capacity_blocks: u64, hits: u64, total: u64) -> Self { + let misses = total.saturating_sub(hits); + TierResult { + label: label.to_string(), + capacity_blocks, + hits, + misses, + hit_rate: if total == 0 { 0.0 } else { hits as f64 / total as f64 }, + } + } +} + +pub fn analyze(records: &[RequestRecord], capacity_blocks: u64) -> OracleResult { + // total / unique counters + let total_blocks: u64 = records.iter().map(|r| r.hash_ids.len() as u64).sum(); + let mut unique = AHashSet::new(); + for r in records { + for &h in &r.hash_ids { + unique.insert(h); + } + } + + // 1. Unlimited cache + let unlimited_hits = run_unlimited(records); + let unlimited = TierResult::from_counts( + "unlimited", + u64::MAX, + unlimited_hits, + total_blocks, + ); + + // 2. Precompute next-use index for Belady + let next_use = build_next_use(records); + + // 3. Belady at the given capacity + let belady_hits = run_belady(records, &next_use, capacity_blocks as usize); + let belady = TierResult::from_counts("belady", capacity_blocks, belady_hits, total_blocks); + + // 4. 
LRU baseline at the same capacity
+    let lru_hits = run_lru(records, capacity_blocks as usize);
+    let lru = TierResult::from_counts("lru", capacity_blocks, lru_hits, total_blocks);
+
+    OracleResult {
+        num_requests: records.len() as u64,
+        total_blocks,
+        unique_blocks: unique.len() as u64,
+        unlimited,
+        belady_finite: belady,
+        lru_finite: lru,
+    }
+}
+
+fn run_unlimited(records: &[RequestRecord]) -> u64 {
+    let mut seen: AHashSet<u64> = AHashSet::with_capacity(1 << 18);
+    let mut hits: u64 = 0;
+    for r in records {
+        // Longest prefix match against `seen`
+        for &h in &r.hash_ids {
+            if seen.contains(&h) {
+                hits += 1;
+            } else {
+                break;
+            }
+        }
+        for &h in &r.hash_ids {
+            seen.insert(h);
+        }
+    }
+    hits
+}
+
+fn run_lru(records: &[RequestRecord], capacity: usize) -> u64 {
+    if capacity == 0 {
+        return 0;
+    }
+    let mut cache = LruBlocks::new(capacity);
+    let mut hits: u64 = 0;
+    let mut evicted = Vec::new();
+    for r in records {
+        hits += cache.longest_prefix(&r.hash_ids) as u64;
+        evicted.clear();
+        cache.insert_blocks(&r.hash_ids, &mut evicted);
+    }
+    hits
+}
+
+/// For each (request_idx, position_in_hash_ids) compute the next request
+/// index whose `hash_ids` contains the same block (`u32::MAX` if none).
+fn build_next_use(records: &[RequestRecord]) -> Vec<Vec<u32>> {
+    let n = records.len();
+    let mut next_use: Vec<Vec<u32>> = Vec::with_capacity(n);
+    for r in records {
+        next_use.push(vec![u32::MAX; r.hash_ids.len()]);
+    }
+    let mut last_seen: AHashMap<u64, u32> = AHashMap::with_capacity(1 << 18);
+    for i in (0..n).rev() {
+        let r = &records[i];
+        for (j, &h) in r.hash_ids.iter().enumerate() {
+            next_use[i][j] = *last_seen.get(&h).unwrap_or(&u32::MAX);
+        }
+        for &h in &r.hash_ids {
+            last_seen.insert(h, i as u32);
+        }
+    }
+    next_use
+}
+
+/// Belady (offline OPT) eviction over the trace.
+///
+/// Implementation: lazy-deletion max-heap keyed by next-use index. Each
+/// cache entry has a version; the heap may contain stale entries from
+/// previous insertions, which we skip on pop.
+fn run_belady(records: &[RequestRecord], next_use: &[Vec<u32>], capacity: usize) -> u64 {
+    if capacity == 0 {
+        return 0;
+    }
+    // block_hash -> (current_version, current_next_use)
+    let mut in_cache: AHashMap<u64, (u64, u32)> = AHashMap::with_capacity(capacity);
+    // (next_use, version, block_hash) — BinaryHeap is max-heap, which is what
+    // we want for "evict the entry whose next access is furthest".
+    let mut heap: BinaryHeap<(u32, u64, u64)> = BinaryHeap::with_capacity(capacity);
+    let mut version: u64 = 0;
+    let mut hits: u64 = 0;
+
+    for (i, r) in records.iter().enumerate() {
+        // 1. Longest-prefix hit accounting against current cache.
+        for &h in &r.hash_ids {
+            if in_cache.contains_key(&h) {
+                hits += 1;
+            } else {
+                break;
+            }
+        }
+
+        // 2. Insert / update each block in the request with its new next-use.
+        for (j, &h) in r.hash_ids.iter().enumerate() {
+            let nu = next_use[i][j];
+            if let Some(slot) = in_cache.get_mut(&h) {
+                version += 1;
+                slot.0 = version;
+                slot.1 = nu;
+                heap.push((nu, version, h));
+                continue;
+            }
+            // Need to make room?
+            if in_cache.len() == capacity {
+                // Evict max next_use entry, skipping stale heap entries.
+                loop {
+                    let (nu_top, ver_top, h_top) = match heap.pop() {
+                        Some(x) => x,
+                        None => break,
+                    };
+                    if let Some(&(cur_ver, cur_nu)) = in_cache.get(&h_top) {
+                        if cur_ver == ver_top && cur_nu == nu_top {
+                            in_cache.remove(&h_top);
+                            break;
+                        }
+                    }
+                    // stale; loop
+                }
+            }
+            version += 1;
+            in_cache.insert(h, (version, nu));
+            heap.push((nu, version, h));
+        }
+    }
+
+    hits
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn req(id: u64, t: f64, hashes: Vec<u64>) -> RequestRecord {
+        RequestRecord {
+            req_id: id,
+            chat_id: id as i64,
+            arrival: t,
+            input_len: (hashes.len() as u32) * 16,
+            output_len: 16,
+            hash_ids: hashes,
+        }
+    }
+
+    #[test]
+    fn unlimited_first_occurrence_misses() {
+        let recs = vec![
+            req(0, 0.0, vec![1, 2, 3]),
+            req(1, 1.0, vec![1, 2, 3, 4]),
+            req(2, 2.0, vec![1, 2, 3, 4, 5]),
+        ];
+        let out = analyze(&recs, 100);
+        // total = 3 + 4 + 5 = 12
+        assert_eq!(out.total_blocks, 12);
+        // unique = {1,2,3,4,5} = 5
+        assert_eq!(out.unique_blocks, 5);
+        // unlimited hits = 0 (req 0 all miss) + 3 (req 1 has [1,2,3] cached, then 4 miss) + 4
+        assert_eq!(out.unlimited.hits, 7);
+        assert!((out.unlimited.hit_rate - 7.0 / 12.0).abs() < 1e-9);
+    }
+
+    #[test]
+    fn belady_beats_lru_when_lru_thrashes() {
+        // Capacity 2. Pattern designed so LRU thrashes but Belady keeps the
+        // useful block: A B A C A B A C A ...
+ let mut recs = Vec::new(); + let pattern = [1u64, 2, 1, 3, 1, 2, 1, 3]; + for (i, &h) in pattern.iter().enumerate() { + recs.push(req(i as u64, i as f64, vec![h])); + } + let out = analyze(&recs, 2); + assert!( + out.belady_finite.hits >= out.lru_finite.hits, + "belady should be at least as good as lru: belady={} lru={}", + out.belady_finite.hits, + out.lru_finite.hits + ); + } + + #[test] + fn unlimited_is_upper_bound() { + let recs = vec![ + req(0, 0.0, vec![10, 20, 30]), + req(1, 1.0, vec![10, 20, 30, 40, 50]), + req(2, 2.0, vec![60]), + req(3, 3.0, vec![10, 20, 30, 40, 50, 60]), + ]; + let out = analyze(&recs, 3); + assert!(out.unlimited.hit_rate >= out.belady_finite.hit_rate); + assert!(out.belady_finite.hit_rate >= out.lru_finite.hit_rate - 1e-9); + } +} diff --git a/src/router/cache_load.rs b/src/router/cache_load.rs new file mode 100644 index 0000000..b919100 --- /dev/null +++ b/src/router/cache_load.rs @@ -0,0 +1,89 @@ +//! Load-filtered cache-aware routing. +//! +//! **Step 1** — filter: sort all instances by `queue_len` ascending and take the +//! least-loaded quarter (≥ 2 instances). +//! +//! **Step 2** — select: among that pool, pick the instance with the highest +//! meta-store prefix score. Tiebreak on lowest `queue_len`. +//! +//! This cleanly separates concerns: step 1 guarantees the request won't land +//! on a saturated instance, while step 2 maximises cache reuse within the +//! load-safe pool. The 1/4 fraction keeps the pool large enough that good +//! cache candidates are rarely excluded. 
+
+use crate::cluster::meta_store::MetaStore;
+use crate::instance::Instance;
+use crate::router::{CandidateInfo, RouteDecision, Router};
+use crate::trace::RequestRecord;
+
+pub struct CacheLoadRouter;
+
+impl CacheLoadRouter {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl Default for CacheLoadRouter {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Router for CacheLoadRouter {
+    fn name(&self) -> &'static str {
+        "cache_load"
+    }
+
+    fn route(
+        &mut self,
+        req: &RequestRecord,
+        instances: &[Instance],
+        meta: &MetaStore,
+        now: f64,
+    ) -> RouteDecision {
+        let n = instances.len();
+        let scores = meta.score_prefix(&req.hash_ids, now, n);
+
+        // Step 1: least-loaded 1/4 of instances (by queue_len).
+        let pool_size = (n / 4).max(2).min(n);
+        let mut indices: Vec<usize> = (0..n).collect();
+        indices.sort_by_key(|&i| instances[i].queue_len());
+        let pool = &indices[..pool_size];
+
+        // Step 2: among the pool, pick highest prefix score.
+        // Tiebreak: lowest queue_len.
+        let mut best_idx = pool[0];
+        let mut best_prefix = scores[pool[0]];
+        let mut best_queue = instances[pool[0]].queue_len();
+
+        for &i in &pool[1..] {
+            let p = scores[i];
+            let q = instances[i].queue_len();
+            if p > best_prefix || (p == best_prefix && q < best_queue) {
+                best_idx = i;
+                best_prefix = p;
+                best_queue = q;
+            }
+        }
+
+        let mut candidates = Vec::with_capacity(pool_size);
+        for &i in pool {
+            candidates.push(CandidateInfo {
+                instance: instances[i].id,
+                predicted_prefix: scores[i],
+                load_blocks: instances[i].kv_blocks_used,
+                queue_len: instances[i].queue_len(),
+            });
+        }
+
+        RouteDecision {
+            req_id: req.req_id,
+            mode: "cache_load",
+            chosen: instances[best_idx].id,
+            probe_overhead_s: 0.0,
+            candidates,
+            reason: "least-loaded 1/4, then best prefix",
+        }
+    }
+}
diff --git a/src/router/cache_score.rs b/src/router/cache_score.rs
new file mode 100644
index 0000000..495d5b1
--- /dev/null
+++ b/src/router/cache_score.rs
@@ -0,0 +1,111 @@
+//!
Combined-score cache-aware routing with exponential weighting. +//! +//! Each instance is scored by: +//! +//! ```text +//! score_i = 2^(α · load_i + β · miss_i) +//! ``` +//! +//! where +//! +//! - `load_i = queue_len()` — requests pending or prefilling on instance i, +//! - `miss_i = input_blocks − prefix_blocks` — cache-miss blocks, +//! - `α` = `score_alpha` (YAML config, default 1.0), +//! - `β` = `score_beta` (YAML config, default 0.1). +//! +//! The instance with the **lowest** score is chosen. Since `2^x` is +//! monotonic, this is equivalent to minimising the linear exponent +//! `α·load + β·miss`, but the exponential framing highlights that +//! differences are amplified exponentially — a small edge in the exponent +//! creates a large gap in score. +//! +//! **Tuning guide**: +//! +//! - `α` controls how aggressively load is penalised. +//! - `β` controls how aggressively cache misses are penalised. +//! - Ratio `α/β` is what matters: higher → more load-sensitive. +//! - Defaults (`α=1.0, β=0.1`): 1 extra queue position ≈ 10 extra miss +//! blocks, which is a good starting point when block_size is large (512) +//! and queues are short (0–10). +//! +//! Ties are broken by fewest `queue_len`, then highest `prefix`. 
+ +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct CacheScoreRouter { + alpha: f64, + beta: f64, +} + +impl CacheScoreRouter { + pub fn new(alpha: f64, beta: f64) -> Self { + Self { alpha, beta } + } +} + +impl Router for CacheScoreRouter { + fn name(&self) -> &'static str { + "cache_score" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let scores = meta.score_prefix(&req.hash_ids, now, n); + let input_blocks = req.hash_ids.len() as f64; + + let mut best_idx: usize = 0; + let mut best_exp = f64::INFINITY; + let mut best_queue = u32::MAX; + let mut best_prefix = 0u32; + let mut candidates = Vec::with_capacity(n); + + for (i, inst) in instances.iter().enumerate() { + let prefix = scores[i] as f64; + let miss = (input_blocks - prefix).max(0.0); + let q = inst.queue_len() as f64; + + // Minimise the exponent: α·load + β·miss + // (equivalent to minimising 2^exponent) + let exponent = self.alpha * q + self.beta * miss; + + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: scores[i], + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + + // Minimise (exponent, queue_len ASC, prefix DESC). 
+ let better = exponent < best_exp + || (exponent == best_exp && inst.queue_len() < best_queue) + || (exponent == best_exp + && inst.queue_len() == best_queue + && scores[i] > best_prefix); + + if better { + best_exp = exponent; + best_idx = i; + best_queue = inst.queue_len(); + best_prefix = scores[i]; + } + } + + RouteDecision { + req_id: req.req_id, + mode: "cache_score", + chosen: instances[best_idx].id, + probe_overhead_s: 0.0, + candidates, + reason: "argmin 2^(α·load + β·miss)", + } + } +} diff --git a/src/router/estimated_ttft.rs b/src/router/estimated_ttft.rs new file mode 100644 index 0000000..314872e --- /dev/null +++ b/src/router/estimated_ttft.rs @@ -0,0 +1,128 @@ +//! First-principles TTFT-optimal routing. +//! +//! Estimates the actual time-to-first-token for each candidate instance: +//! +//! `TTFT(r,i) = drain(i) + fetch(r,i) + prefill(miss)` +//! +//! - **drain** — exact queue drain time: sum of per-request `prefill_time()` +//! using the architecture-aware compute model (quadratic / DSA). +//! +//! - **fetch** — RDMA fetch time for blocks cached elsewhere in the cluster +//! but not on instance `i` locally. +//! +//! - **prefill** — compute for cluster-wide cache-miss tokens (constant +//! across instances, cancels in the argmin). +//! +//! The router minimises `drain(i) + fetch(r,i)`, with ties broken by +//! lowest `queue_len` then most local cache. The fetch overlap with queue +//! drain is handled by keeping the additive form: this gives double +//! incentive to prefer instances with local cache, which empirically +//! outperforms the `max(drain, fetch)` alternative because even small +//! RDMA savings compound across thousands of routing decisions. + +use crate::cluster::meta_store::MetaStore; +use crate::config::Config; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct EstimatedTtftRouter { + /// Bytes per KV block (for RDMA cost estimation). 
+ kv_block_bytes: f64, + /// RDMA bandwidth in bytes/s. + rdma_bw: f64, + /// RDMA per-transfer latency in seconds. + rdma_latency_s: f64, +} + +impl EstimatedTtftRouter { + pub fn new(config: &Config) -> Self { + Self { + kv_block_bytes: config.model.kv_block_bytes() as f64, + rdma_bw: config.hardware.rdma_bw, + rdma_latency_s: config.hardware.rdma_latency_us * 1e-6, + } + } + + /// Estimate RDMA fetch time for `remote_blocks` blocks. + fn fetch_time(&self, remote_blocks: u32) -> f64 { + if remote_blocks == 0 { + return 0.0; + } + let bytes = remote_blocks as f64 * self.kv_block_bytes; + bytes / self.rdma_bw + self.rdma_latency_s + } +} + +impl Router for EstimatedTtftRouter { + fn name(&self) -> &'static str { + "estimated_ttft" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let scores = meta.score_prefix(&req.hash_ids, now, n); + + // Cluster-wide max prefix: blocks reachable via RDMA from any peer. + let cluster_prefix = scores.iter().copied().max().unwrap_or(0); + + let mut best: u32 = 0; + let mut best_cost = f64::INFINITY; + let mut best_queue = u32::MAX; + let mut best_local = 0u32; + let mut candidates = Vec::with_capacity(n); + + for inst in instances { + let i = inst.id as usize; + let local_prefix = scores[i]; + + // 1. Exact queue drain time (architecture-aware, per-request sum). + let drain = inst.estimated_drain_time(); + + // 2. RDMA fetch cost for blocks not locally cached. + let remote_blocks = cluster_prefix.saturating_sub(local_prefix); + let fetch = self.fetch_time(remote_blocks); + + // Additive cost: drain + fetch. + // The additive form gives explicit incentive to prefer local cache + // (lower fetch) even when the queue is non-empty, which reduces + // total RDMA traffic and improves TTFT in aggregate. 
+ let cost = drain + fetch; + + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: local_prefix, + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + + // Minimise (cost, queue_len, -local_prefix). + let ql = inst.queue_len(); + let better = cost < best_cost + || (cost == best_cost && ql < best_queue) + || (cost == best_cost && ql == best_queue && local_prefix > best_local); + + if better { + best_cost = cost; + best = inst.id; + best_queue = ql; + best_local = local_prefix; + } + } + + RouteDecision { + req_id: req.req_id, + mode: "estimated_ttft", + chosen: best, + probe_overhead_s: 0.0, + candidates, + reason: "argmin(drain_time + fetch_time)", + } + } +} diff --git a/src/router/least_loaded.rs b/src/router/least_loaded.rs new file mode 100644 index 0000000..7a722aa --- /dev/null +++ b/src/router/least_loaded.rs @@ -0,0 +1,54 @@ +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct LeastLoadedRouter { + pub alpha: f64, +} + +impl LeastLoadedRouter { + pub fn new(alpha: f64) -> Self { + Self { alpha } + } +} + +impl Router for LeastLoadedRouter { + fn name(&self) -> &'static str { + "least_loaded" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + _meta: &MetaStore, + _now: f64, + ) -> RouteDecision { + let mut best = 0u32; + let mut best_score = f64::INFINITY; + let mut candidates = Vec::with_capacity(instances.len()); + for inst in instances { + let load = inst.kv_blocks_used as f64 + + self.alpha * inst.queue_len() as f64; + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: 0, + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + if load < best_score { + best_score = load; + best = inst.id; + } + } + RouteDecision { + req_id: req.req_id, + mode: "least_loaded", + chosen: best, + probe_overhead_s: 0.0, + candidates, 
+ reason: "argmin(kv_used + alpha * queue_len)", + } + } +} diff --git a/src/router/least_tokens.rs b/src/router/least_tokens.rs new file mode 100644 index 0000000..effdad7 --- /dev/null +++ b/src/router/least_tokens.rs @@ -0,0 +1,73 @@ +//! Least-waiting-tokens routing. +//! +//! Pure load-balancing baseline that picks the instance with the fewest +//! total prefill tokens remaining across its pending and prefilling queues. +//! Unlike `least_loaded` (which mixes KV memory pressure with queue depth), +//! this directly minimises the expected wait time by accounting for the +//! actual compute backlog in tokens. +//! +//! Tiebreak: fewest `queue_len`, then lowest instance ID. + +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct LeastTokensRouter; + +impl LeastTokensRouter { + pub fn new() -> Self { + Self + } +} + +impl Default for LeastTokensRouter { + fn default() -> Self { + Self::new() + } +} + +impl Router for LeastTokensRouter { + fn name(&self) -> &'static str { + "least_tokens" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + _meta: &MetaStore, + _now: f64, + ) -> RouteDecision { + let mut best: u32 = 0; + let mut best_key: (u64, u32) = (u64::MAX, u32::MAX); + let mut candidates = Vec::with_capacity(instances.len()); + + for inst in instances { + let wt = inst.waiting_tokens(); + let ql = inst.queue_len(); + + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: 0, + load_blocks: inst.kv_blocks_used, + queue_len: ql, + }); + + let key = (wt, ql); + if key < best_key { + best_key = key; + best = inst.id; + } + } + + RouteDecision { + req_id: req.req_id, + mode: "least_tokens", + chosen: best, + probe_overhead_s: 0.0, + candidates, + reason: "argmin(waiting_prefill_tokens)", + } + } +} diff --git a/src/router/min_pd.rs b/src/router/min_pd.rs new file mode 100644 index 
0000000..87a3d4f --- /dev/null +++ b/src/router/min_pd.rs @@ -0,0 +1,124 @@ +//! Minimum P*D routing. +//! +//! For each instance compute: +//! - `P` = real prefill tokens this request will do if routed there +//! - `D` = ongoing requests currently on that instance +//! (pending + prefilling) +//! +//! Score = `P * D`, pick the instance that minimizes it. +//! +//! `P` accounts for the **actual** prefill work after the cluster fetch +//! chain runs: the fetch chain serves any block cached anywhere in the +//! cluster (L0 → L1 → remote v6d via RDMA), so prefill compute only runs +//! for blocks that are absent cluster-wide *and* for blocks past the +//! instance-local prefix (the cluster only fetches a contiguous leading +//! prefix — any gap ends the fetch chain and the rest must be prefilled). +//! +//! Concretely, for instance `i`: +//! +//! ```text +//! local_prefix_i = meta_store.score_prefix(req, now)[i] // blocks +//! cluster_prefix = max over all j of meta_store_score[j] // blocks +//! effective_prefix_i = min(cluster_prefix, input_blocks) +//! - if local_prefix_i == cluster_prefix the fetch chain stays local, +//! - otherwise the prefill still skips cluster_prefix blocks because +//! the missing tail is fetched via RDMA from a peer. +//! P_i = (input_blocks - effective_prefix_i) * block_size_tokens +//! ``` +//! +//! This makes `P` nearly instance-independent on well-populated clusters +//! (so `min_pd` degenerates to balanced load with a cache-affinity +//! tiebreak), which is exactly what you want when RDMA is cheap relative +//! to prefill compute. +//! +//! Tiebreaks (essential on 128-instance clusters where many instances are +//! idle and the raw product collapses to zero): +//! 1. minimum `P*D` +//! 2. then minimum `D` — prefer the less-loaded instance +//! 3. then maximum `local_prefix_i` — prefer local affinity to avoid +//! 
paying the RDMA fetch cost when P and D are already tied + +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct MinPdRouter; + +impl MinPdRouter { + pub fn new() -> Self { + Self + } +} + +impl Default for MinPdRouter { + fn default() -> Self { + Self::new() + } +} + +impl Router for MinPdRouter { + fn name(&self) -> &'static str { + "min_pd" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let scores = meta.score_prefix(&req.hash_ids, now, n); + let block_size = instances[0].block_size_tokens as u64; + let input_blocks = req.hash_ids.len() as u64; + + // Cluster-wide max prefix: longest contiguous prefix that EXISTS + // somewhere in the cluster (and will be fetched via remote RDMA if + // not local). This determines the effective prefill work for every + // candidate, not just the one that owns the blocks. + let cluster_prefix_blocks = scores.iter().copied().max().unwrap_or(0) as u64; + let effective_prefix_blocks = cluster_prefix_blocks.min(input_blocks); + let miss_blocks = input_blocks.saturating_sub(effective_prefix_blocks); + let p_base = miss_blocks.saturating_mul(block_size); // tokens to prefill + + let mut candidates = Vec::with_capacity(n); + let mut best: u32 = instances[0].id; + // Minimize (P*D, D, -local_prefix). + // P is nearly instance-independent; D is the real discriminator. + // When tied on D, prefer the instance with the best local prefix + // (avoids the RDMA fetch cost). 
+        let mut best_key: (u128, u64, i64) = (u128::MAX, u64::MAX, i64::MAX);
+
+        for inst in instances {
+            let i = inst.id as usize;
+            let d = inst.queue_len() as u64;
+            let pd = p_base as u128 * d as u128;
+            let local_prefix = scores[i] as i64;
+
+            candidates.push(CandidateInfo {
+                instance: inst.id,
+                predicted_prefix: scores[i],
+                load_blocks: inst.kv_blocks_used,
+                queue_len: inst.queue_len(),
+            });
+
+            // minimize (pd, d, -local_prefix)
+            let key = (pd, d, -local_prefix);
+            if key < best_key {
+                best_key = key;
+                best = inst.id;
+            }
+        }
+
+        RouteDecision {
+            req_id: req.req_id,
+            mode: "min_pd",
+            chosen: best,
+            probe_overhead_s: 0.0,
+            candidates,
+            reason: "argmin(P*D), P=cluster-wide miss tokens, D=ongoing reqs",
+        }
+    }
+}
diff --git a/src/router/mod.rs b/src/router/mod.rs
new file mode 100644
index 0000000..3203382
--- /dev/null
+++ b/src/router/mod.rs
@@ -0,0 +1,80 @@
+//! Cluster-level routing strategies.
+
+pub mod cache_load;
+pub mod cache_score;
+pub mod estimated_ttft;
+pub mod least_loaded;
+pub mod least_tokens;
+pub mod min_pd;
+pub mod precise_aware;
+pub mod prefix_affinity;
+pub mod random;
+pub mod ttl_aware;
+
+use serde::Serialize;
+
+use crate::cluster::meta_store::MetaStore;
+use crate::config::Config;
+use crate::instance::Instance;
+use crate::trace::RequestRecord;
+use crate::types::InstanceId;
+
+#[derive(Debug, Clone, Serialize)]
+pub struct CandidateInfo {
+    pub instance: InstanceId,
+    pub predicted_prefix: u32,
+    pub load_blocks: u32,
+    pub queue_len: u32,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct RouteDecision {
+    pub req_id: u64,
+    pub mode: &'static str,
+    pub chosen: InstanceId,
+    pub probe_overhead_s: f64,
+    pub candidates: Vec<CandidateInfo>,
+    pub reason: &'static str,
+}
+
+pub trait Router: Send {
+    fn name(&self) -> &'static str;
+    fn route(
+        &mut self,
+        req: &RequestRecord,
+        instances: &[Instance],
+        meta: &MetaStore,
+        now: f64,
+    ) -> RouteDecision;
+}
+
+pub fn build(full: &Config, seed: u64) -> Box<dyn Router> {
+    use crate::config::RouterMode::*;
+    let cfg = &full.cluster.router;
+    match cfg.mode {
+        Random => Box::new(random::RandomRouter::new(seed)) as Box<dyn Router>,
+        RoundRobin => Box::new(random::RoundRobinRouter::new()) as Box<dyn Router>,
+        LeastLoaded => {
+            Box::new(least_loaded::LeastLoadedRouter::new(cfg.load_alpha)) as Box<dyn Router>
+        }
+        TtlAware => Box::new(ttl_aware::TtlAwareRouter::new(cfg.load_alpha)) as Box<dyn Router>,
+        Precise => Box::new(precise_aware::PreciseRouter::new(
+            cfg.precise_probe_topk,
+            cfg.precise_probe_latency_us * 1e-6,
+            cfg.load_alpha,
+        )) as Box<dyn Router>,
+        MinPd => Box::new(min_pd::MinPdRouter::new()) as Box<dyn Router>,
+        LeastTokens => Box::new(least_tokens::LeastTokensRouter::new()) as Box<dyn Router>,
+        CacheLoad => Box::new(cache_load::CacheLoadRouter::new()) as Box<dyn Router>,
+        CacheScore => {
+            Box::new(cache_score::CacheScoreRouter::new(cfg.score_alpha, cfg.score_beta))
+                as Box<dyn Router>
+        }
+        EstimatedTtft => {
+            Box::new(estimated_ttft::EstimatedTtftRouter::new(full)) as Box<dyn Router>
+        }
+        PrefixAffinity => {
+            Box::new(prefix_affinity::PrefixAffinityRouter::new(full)) as Box<dyn Router>
+        }
+    }
+}
diff --git a/src/router/precise_aware.rs b/src/router/precise_aware.rs
new file mode 100644
index 0000000..9a973d6
--- /dev/null
+++ b/src/router/precise_aware.rs
@@ -0,0 +1,120 @@
+//! KV-aware routing via meta-store candidate selection + precise probing.
+//!
+//! The global meta store is used as a *candidate pre-filter*: we score
+//! every instance's predicted prefix from the store, take the top-K by
+//! (predicted_prefix DESC, load ASC), and then exact-probe those K
+//! candidates' actual L0+L1 caches to get the true longest prefix. This
+//! catches two cases where the meta store is wrong:
+//!
+//! - the store is stale (block evicted from L0/L1 but TTL not yet up),
+//! - the store undercounts because some blocks' TTL expired individually.
+//!
+//! Because the candidate set is sourced from the meta store rather than
+//! from a load ranking, this router is a strict superset of `ttl_aware`:
+//!
any instance the meta store would pick is a candidate here, and the
+//! exact probe can only move the decision toward a truthfully-better
+//! instance. Each probe adds `probe_latency_s` to the request's
+//! effective arrival time.
+//!
+//! If the meta store returns zero-prefix for every instance (e.g. cold
+//! start, or a request whose blocks have never been seen), we fall back
+//! to the top-K least-loaded instances so we still place the request.
+
+use crate::cluster::meta_store::MetaStore;
+use crate::instance::Instance;
+use crate::router::{CandidateInfo, RouteDecision, Router};
+use crate::trace::RequestRecord;
+
+pub struct PreciseRouter {
+    pub topk: u32,
+    pub probe_latency_s: f64,
+    pub alpha: f64,
+}
+
+impl PreciseRouter {
+    pub fn new(topk: u32, probe_latency_s: f64, alpha: f64) -> Self {
+        Self { topk, probe_latency_s, alpha }
+    }
+
+    fn load_of(&self, inst: &Instance) -> f64 {
+        inst.kv_blocks_used as f64 + self.alpha * inst.queue_len() as f64
+    }
+}
+
+impl Router for PreciseRouter {
+    fn name(&self) -> &'static str {
+        "precise"
+    }
+
+    fn route(
+        &mut self,
+        req: &RequestRecord,
+        instances: &[Instance],
+        meta: &MetaStore,
+        now: f64,
+    ) -> RouteDecision {
+        let n = instances.len();
+        let k = (self.topk as usize).min(n).max(1);
+
+        // 1. Meta-store candidate set: rank all instances by
+        //    (predicted_prefix DESC, load ASC) and take the top-K.
+        let meta_scores = meta.score_prefix(&req.hash_ids, now, n);
+        let any_meta_hit = meta_scores.iter().any(|&p| p > 0);
+
+        let mut ranked: Vec<usize> = (0..n).collect();
+        if any_meta_hit {
+            ranked.sort_by(|&a, &b| {
+                let pa = meta_scores[a];
+                let pb = meta_scores[b];
+                // prefix desc, then load asc
+                pb.cmp(&pa)
+                    .then_with(|| {
+                        self.load_of(&instances[a])
+                            .partial_cmp(&self.load_of(&instances[b]))
+                            .unwrap_or(std::cmp::Ordering::Equal)
+                    })
+            });
+        } else {
+            // Cold start fallback: pure load order.
+ ranked.sort_by(|&a, &b| { + self.load_of(&instances[a]) + .partial_cmp(&self.load_of(&instances[b])) + .unwrap_or(std::cmp::Ordering::Equal) + }); + } + let probed = &ranked[..k]; + + // 2. Exact probe each candidate and pick + // argmax(exact_prefix, tiebreak: -load). + let mut candidates = Vec::with_capacity(k); + let mut best = probed[0] as u32; + let mut best_key: (i64, f64) = (i64::MIN, f64::INFINITY); + for &i in probed { + let inst = &instances[i]; + let l0 = inst.cache.l0.longest_prefix_peek(&req.hash_ids); + let l1 = inst.cache.l1.longest_prefix_peek(&req.hash_ids[l0..]); + let predicted = (l0 + l1) as u32; + let load = self.load_of(inst); + candidates.push(CandidateInfo { + instance: inst.id, + predicted_prefix: predicted, + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + let key = (predicted as i64, -load); + if key > (best_key.0, -best_key.1) { + best_key = (predicted as i64, load); + best = inst.id; + } + } + + RouteDecision { + req_id: req.req_id, + mode: "precise", + chosen: best, + probe_overhead_s: k as f64 * self.probe_latency_s, + candidates, + reason: "exact-probe top-K meta-store candidates", + } + } +} diff --git a/src/router/prefix_affinity.rs b/src/router/prefix_affinity.rs new file mode 100644 index 0000000..66e6a31 --- /dev/null +++ b/src/router/prefix_affinity.rs @@ -0,0 +1,196 @@ +//! Prefix-affinity routing with load-aware fallback. +//! +//! **Key insight**: in real LLM traces, 99%+ of requests share a common +//! system-prompt prefix (dozens to hundreds of 16-token blocks). If we +//! *consistently* route requests with the same prefix to the same small set +//! of instances, L0 (HBM) cache hit rates increase dramatically because the +//! working set per instance is concentrated rather than scattered. +//! +//! Algorithm (rendezvous hashing + drain-time-aware selection): +//! +//! 1. **Fingerprint**: hash the first `K` blocks of the request to produce a +//! 
prefix fingerprint that captures the system prompt identity. +//! +//! 2. **Rendezvous ranking**: for each instance `i`, compute +//! `rendezvous(fingerprint, i)` — a deterministic pseudo-random score. +//! Sort instances by this score descending to get a stable, per-prefix +//! ordering. +//! +//! 3. **Select from top candidates**: among the top `fan_out` instances in +//! the rendezvous ranking, pick the one with the lowest estimated drain +//! time (architecture-aware, per-request sum). This accounts for +//! heterogeneous request sizes in the queue. +//! +//! 4. **Overload fallback**: if all top candidates have queue length above a +//! threshold, expand to the full instance set and use estimated-TTFT +//! scoring (drain + fetch) for the best selection. +//! +//! The combination ensures: +//! - **Cache locality**: same-prefix requests cluster on a few instances, +//! building strong L0 cache entries that benefit subsequent requests. +//! - **Load balance**: within the affinity group, drain-time-aware selection +//! avoids hot-spotting from large-prompt requests. +//! - **Zero overhead**: no per-instance probes needed; fingerprint + +//! rendezvous are pure arithmetic. + +use crate::cluster::meta_store::MetaStore; +use crate::config::Config; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct PrefixAffinityRouter { + /// Number of leading block hashes used for the prefix fingerprint. + prefix_k: usize, + /// Number of top-affinity instances to consider before fallback. + fan_out: usize, + /// Queue-length threshold: if all top candidates exceed this, expand to + /// the full instance set. + overload_threshold: u32, + /// Bytes per KV block (for RDMA cost estimation in fallback path). + kv_block_bytes: f64, + /// RDMA bandwidth in bytes/s. + rdma_bw: f64, + /// RDMA per-transfer latency in seconds. 
+ rdma_latency_s: f64, +} + +impl PrefixAffinityRouter { + pub fn new(config: &Config) -> Self { + let n = config.cluster.num_instances as usize; + let cfg_fan = config.cluster.router.affinity_fan_out; + // fan_out: if configured, use it; otherwise auto = max(2, n/8). + let fan_out = if cfg_fan > 0 { + cfg_fan.min(n) + } else { + (n / 8).max(2).min(n) + }; + Self { + prefix_k: config.cluster.router.prefix_k, + fan_out, + overload_threshold: 4, + kv_block_bytes: config.model.kv_block_bytes() as f64, + rdma_bw: config.hardware.rdma_bw, + rdma_latency_s: config.hardware.rdma_latency_us * 1e-6, + } + } + + /// Compute a prefix fingerprint from the first K block hashes. + fn fingerprint(hash_ids: &[u64], k: usize) -> u64 { + let n = hash_ids.len().min(k); + let mut fp: u64 = 0xcbf29ce484222325; // FNV offset basis + for &h in &hash_ids[..n] { + fp ^= h; + fp = fp.wrapping_mul(0x100000001b3); // FNV prime + } + fp + } + + /// Rendezvous hash: deterministic pseudo-random score for (fingerprint, instance_id). + /// Higher score = higher affinity. + fn rendezvous_score(fp: u64, instance_id: u32) -> u64 { + let mut h = fp ^ (instance_id as u64).wrapping_mul(0x9e3779b97f4a7c15); + // Splitmix64 finalizer + h = h.wrapping_add(0x9e3779b97f4a7c15); + h = (h ^ (h >> 30)).wrapping_mul(0xbf58476d1ce4e5b9); + h = (h ^ (h >> 27)).wrapping_mul(0x94d049bb133111eb); + h ^ (h >> 31) + } + + /// Estimate RDMA fetch time for `remote_blocks` blocks. 
+ fn fetch_time(&self, remote_blocks: u32) -> f64 { + if remote_blocks == 0 { + return 0.0; + } + let bytes = remote_blocks as f64 * self.kv_block_bytes; + bytes / self.rdma_bw + self.rdma_latency_s + } +} + +impl Router for PrefixAffinityRouter { + fn name(&self) -> &'static str { + "prefix_affinity" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let fp = Self::fingerprint(&req.hash_ids, self.prefix_k); + + // Build rendezvous-ranked list of (score, index). + let mut ranked: Vec<(u64, usize)> = (0..n) + .map(|i| (Self::rendezvous_score(fp, instances[i].id), i)) + .collect(); + ranked.sort_unstable_by(|a, b| b.0.cmp(&a.0)); // descending score + + // Collect candidate info for logging (also needed for fallback). + let scores = meta.score_prefix(&req.hash_ids, now, n); + let candidates: Vec = instances + .iter() + .map(|inst| CandidateInfo { + instance: inst.id, + predicted_prefix: scores[inst.id as usize], + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }) + .collect(); + + // Phase 1: among top fan_out instances, pick lowest drain time. + let top_k = self.fan_out.min(n); + let mut best_idx = ranked[0].1; + let mut best_drain = instances[best_idx].estimated_drain_time(); + let mut best_ql = instances[best_idx].queue_len(); + let mut all_overloaded = best_ql > self.overload_threshold; + + for &(_, idx) in &ranked[1..top_k] { + let drain = instances[idx].estimated_drain_time(); + let ql = instances[idx].queue_len(); + if drain < best_drain || (drain == best_drain && ql < best_ql) { + best_idx = idx; + best_drain = drain; + best_ql = ql; + } + if ql <= self.overload_threshold { + all_overloaded = false; + } + } + + // Phase 2: if all top candidates are overloaded, search globally + // using estimated-TTFT (drain + fetch) for optimal fallback. 
+ let reason; + if all_overloaded { + reason = "affinity fallback: min(drain+fetch)"; + let cluster_prefix = scores.iter().copied().max().unwrap_or(0); + let mut best_cost = f64::INFINITY; + for &(_, idx) in ranked.iter() { + let inst = &instances[idx]; + let drain = inst.estimated_drain_time(); + let local_prefix = scores[idx]; + let remote_blocks = cluster_prefix.saturating_sub(local_prefix); + let cost = drain + self.fetch_time(remote_blocks); + let ql = inst.queue_len(); + if cost < best_cost || (cost == best_cost && ql < best_ql) { + best_cost = cost; + best_idx = idx; + best_ql = ql; + } + } + } else { + reason = "prefix affinity: top-K min drain"; + } + + RouteDecision { + req_id: req.req_id, + mode: "prefix_affinity", + chosen: instances[best_idx].id, + probe_overhead_s: 0.0, + candidates, + reason, + } + } +} diff --git a/src/router/random.rs b/src/router/random.rs new file mode 100644 index 0000000..8457504 --- /dev/null +++ b/src/router/random.rs @@ -0,0 +1,90 @@ +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; +use crate::types::InstanceId; + +pub struct RandomRouter { + rng: ChaCha8Rng, +} + +impl RandomRouter { + pub fn new(seed: u64) -> Self { + Self { rng: ChaCha8Rng::seed_from_u64(seed) } + } +} + +impl Router for RandomRouter { + fn name(&self) -> &'static str { + "random" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + _meta: &MetaStore, + _now: f64, + ) -> RouteDecision { + let n = instances.len(); + let chosen = self.rng.gen_range(0..n) as InstanceId; + RouteDecision { + req_id: req.req_id, + mode: "random", + chosen, + probe_overhead_s: 0.0, + candidates: vec![CandidateInfo { + instance: chosen, + predicted_prefix: 0, + load_blocks: instances[chosen as usize].kv_blocks_used, + queue_len: instances[chosen as usize].queue_len(), + 
}], + reason: "uniform random", + } + } +} + +#[derive(Default)] +pub struct RoundRobinRouter { + next: u32, +} + +impl RoundRobinRouter { + pub fn new() -> Self { + Self::default() + } +} + +impl Router for RoundRobinRouter { + fn name(&self) -> &'static str { + "round_robin" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + _meta: &MetaStore, + _now: f64, + ) -> RouteDecision { + let n = instances.len() as u32; + let chosen = self.next % n; + self.next = self.next.wrapping_add(1); + RouteDecision { + req_id: req.req_id, + mode: "round_robin", + chosen, + probe_overhead_s: 0.0, + candidates: vec![CandidateInfo { + instance: chosen, + predicted_prefix: 0, + load_blocks: instances[chosen as usize].kv_blocks_used, + queue_len: instances[chosen as usize].queue_len(), + }], + reason: "round robin", + } + } +} diff --git a/src/router/ttl_aware.rs b/src/router/ttl_aware.rs new file mode 100644 index 0000000..bbdcec5 --- /dev/null +++ b/src/router/ttl_aware.rs @@ -0,0 +1,59 @@ +use crate::cluster::meta_store::MetaStore; +use crate::instance::Instance; +use crate::router::{CandidateInfo, RouteDecision, Router}; +use crate::trace::RequestRecord; + +pub struct TtlAwareRouter { + pub alpha: f64, +} + +impl TtlAwareRouter { + pub fn new(alpha: f64) -> Self { + Self { alpha } + } +} + +impl Router for TtlAwareRouter { + fn name(&self) -> &'static str { + "ttl_aware" + } + + fn route( + &mut self, + req: &RequestRecord, + instances: &[Instance], + meta: &MetaStore, + now: f64, + ) -> RouteDecision { + let n = instances.len(); + let scores = meta.score_prefix(&req.hash_ids, now, n); + let mut best = 0u32; + let mut best_key = (i64::MIN, f64::INFINITY); // maximize prefix, then minimize load + let mut candidates = Vec::with_capacity(n); + for inst in instances { + let p = scores[inst.id as usize]; + let load = inst.kv_blocks_used as f64 + + self.alpha * inst.queue_len() as f64; + candidates.push(CandidateInfo { + instance: inst.id, + 
predicted_prefix: p, + load_blocks: inst.kv_blocks_used, + queue_len: inst.queue_len(), + }); + let key = (p as i64, -load); + // we want max prefix, min load -> compare (p, -load) lexicographically max + if key > (best_key.0, -best_key.1) { + best_key = (p as i64, load); + best = inst.id; + } + } + RouteDecision { + req_id: req.req_id, + mode: "ttl_aware", + chosen: best, + probe_overhead_s: 0.0, + candidates, + reason: "max meta_store prefix, tie -> least loaded", + } + } +} diff --git a/src/sim/engine.rs b/src/sim/engine.rs new file mode 100644 index 0000000..ee0d548 --- /dev/null +++ b/src/sim/engine.rs @@ -0,0 +1,113 @@ +//! Discrete-event engine. +//! +//! Single-threaded virtual time `f64` seconds. Events are stored in a min-heap +//! keyed by `(time, seq)` so equal-time events fire in insertion order. + +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +use super::events::Event; + +#[derive(Debug)] +struct Slot { + time: f64, + seq: u64, + event: Event, +} + +impl Eq for Slot {} +impl PartialEq for Slot { + fn eq(&self, other: &Self) -> bool { + self.time == other.time && self.seq == other.seq + } +} +impl Ord for Slot { + fn cmp(&self, other: &Self) -> Ordering { + // Reverse so BinaryHeap acts as a min-heap. 
+ other + .time + .partial_cmp(&self.time) + .unwrap_or(Ordering::Equal) + .then_with(|| other.seq.cmp(&self.seq)) + } +} +impl PartialOrd for Slot { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +#[derive(Debug, Default)] +pub struct EventQueue { + heap: BinaryHeap, + seq: u64, + now: f64, +} + +impl EventQueue { + pub fn new() -> Self { + Self::default() + } + + pub fn now(&self) -> f64 { + self.now + } + + pub fn schedule(&mut self, time: f64, event: Event) { + let t = time.max(self.now); + self.seq += 1; + self.heap.push(Slot { time: t, seq: self.seq, event }); + } + + pub fn pop(&mut self) -> Option<(f64, Event)> { + let slot = self.heap.pop()?; + if slot.time > self.now { + self.now = slot.time; + } + Some((slot.time, slot.event)) + } + + pub fn len(&self) -> usize { + self.heap.len() + } + + pub fn is_empty(&self) -> bool { + self.heap.is_empty() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::InstanceId; + + #[test] + fn pops_in_time_order() { + let mut q = EventQueue::new(); + q.schedule(2.0, Event::BatchTick { instance: 0 as InstanceId }); + q.schedule(1.0, Event::BatchTick { instance: 1 }); + q.schedule(1.5, Event::BatchTick { instance: 2 }); + let (t1, _) = q.pop().unwrap(); + let (t2, _) = q.pop().unwrap(); + let (t3, _) = q.pop().unwrap(); + assert!(t1 <= t2 && t2 <= t3); + assert!((t1 - 1.0).abs() < 1e-12); + assert!((t3 - 2.0).abs() < 1e-12); + } + + #[test] + fn equal_time_fifo() { + let mut q = EventQueue::new(); + q.schedule(1.0, Event::BatchTick { instance: 7 }); + q.schedule(1.0, Event::BatchTick { instance: 8 }); + let (_, e1) = q.pop().unwrap(); + let (_, e2) = q.pop().unwrap(); + match (e1, e2) { + (Event::BatchTick { instance: a }, Event::BatchTick { instance: b }) => { + assert_eq!(a, 7); + assert_eq!(b, 8); + } + _ => panic!("wrong events"), + } + } +} diff --git a/src/sim/events.rs b/src/sim/events.rs new file mode 100644 index 0000000..e369fa2 --- /dev/null +++ 
b/src/sim/events.rs
//! Event types for the discrete-event engine.

use crate::types::{InstanceId, ReqId};

/// All events the engine can dispatch.
#[derive(Debug)]
pub enum Event {
    /// New trace request arrives at the cluster router.
    Arrival { req_id: ReqId },
    /// Per-instance scheduler tick (continuous batching).
    BatchTick { instance: InstanceId },
    /// Periodic time-series sample of all instances.
    Sample,
    /// Stop the simulation early (used internally).
    Stop,
}
diff --git a/src/sim/mod.rs b/src/sim/mod.rs
new file mode 100644
index 0000000..938ff5b
--- /dev/null
+++ b/src/sim/mod.rs
pub mod engine;
pub mod events;

pub use engine::EventQueue;
pub use events::Event;
diff --git a/src/trace.rs b/src/trace.rs
new file mode 100644
index 0000000..236c0b5
--- /dev/null
+++ b/src/trace.rs
//! Streaming JSONL reader for the qwen-bailian trace format.
//!
//! Schema (per upstream README):
//!   chat_id: i64
//!   parent_chat_id: i64 (-1 = root)
//!   timestamp: f64 (seconds since trace start)
//!   input_length: i64
//!   output_length: i64
//!   type: string (text/search/image/file)
//!   turn: i64
//!   hash_ids: [i64] (16-token blocks, salted SipHash)

use anyhow::{Context, Result};
use serde::Deserialize;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// Raw on-disk record; unknown fields are ignored, missing ones default.
#[derive(Debug, Clone, Deserialize)]
struct RawRecord {
    #[serde(default)]
    chat_id: i64,
    #[serde(default)]
    timestamp: f64,
    #[serde(default)]
    input_length: i64,
    #[serde(default)]
    output_length: i64,
    #[serde(default)]
    hash_ids: Vec<i64>,
}

/// Normalized request as used by the simulator.
#[derive(Debug, Clone)]
pub struct RequestRecord {
    pub req_id: u64,
    pub chat_id: i64,
    pub arrival: f64,
    pub input_len: u32,
    pub output_len: u32,
    pub hash_ids: Vec<u64>,
}

/// Streaming reader: yields one record per JSONL line, reusing a single
/// line buffer so no per-line allocation occurs.
pub struct TraceReader {
    inner: BufReader<File>,
    next_id: u64,
    line_buf: String,
    max_requests: Option<u64>,
}

impl TraceReader {
    /// Open a trace file; `max_requests` caps how many records are yielded.
    pub fn open<P: AsRef<Path>>(path: P, max_requests: Option<u64>) -> Result<Self> {
        let path = path.as_ref();
        let f = File::open(path)
            .with_context(|| format!("opening trace {}", path.display()))?;
        Ok(Self {
            inner: BufReader::with_capacity(1 << 20, f),
            next_id: 0,
            line_buf: String::with_capacity(4096),
            max_requests,
        })
    }
}

impl Iterator for TraceReader {
    type Item = Result<RequestRecord>;

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(cap) = self.max_requests {
            if self.next_id >= cap {
                return None;
            }
        }
        loop {
            self.line_buf.clear();
            match self.inner.read_line(&mut self.line_buf) {
                Ok(0) => return None, // EOF
                Ok(_) => {
                    let trimmed = self.line_buf.trim();
                    // Skip blank lines rather than failing the whole trace.
                    if trimmed.is_empty() {
                        continue;
                    }
                    let parsed: serde_json::Result<RawRecord> = serde_json::from_str(trimmed);
                    let raw = match parsed {
                        Ok(r) => r,
                        Err(e) => return Some(Err(anyhow::anyhow!("trace parse: {e}"))),
                    };
                    let id = self.next_id;
                    self.next_id += 1;
                    return Some(Ok(RequestRecord {
                        req_id: id,
                        chat_id: raw.chat_id,
                        arrival: raw.timestamp,
                        // Negative lengths in the trace are clamped to 0.
                        input_len: raw.input_length.max(0) as u32,
                        output_len: raw.output_length.max(0) as u32,
                        hash_ids: raw.hash_ids.into_iter().map(|h| h as u64).collect(),
                    }));
                }
                Err(e) => return Some(Err(e.into())),
            }
        }
    }
}
diff --git a/src/types.rs b/src/types.rs new file mode 100644 index 0000000..9bcab09 --- /dev/null +++ b/src/types.rs @@ -0,0 +1,4 @@ +//! Shared simple types. + +pub type InstanceId = u32; +pub type ReqId = u64; diff --git a/tests/smoke.rs b/tests/smoke.rs new file mode 100644 index 0000000..471ff9f --- /dev/null +++ b/tests/smoke.rs @@ -0,0 +1,155 @@ +//! Smoke test: synthesize a small trace with shared prefixes and assert that +//! the cache hit rate is monotonic in router sophistication: +//! random <= least_loaded <= ttl_aware <= precise + +use std::io::Write; + +use kvcache_simulator::config::*; +use kvcache_simulator::driver; + +fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config { + Config { + model: ModelConfig { + name: "test".into(), + num_layers: 4, + num_kv_heads: 2, + head_dim: 64, + dtype_bytes: 2, + block_size_tokens: 16, + flops_per_token_prefill: Some(1.0e9), + attn_quadratic_coeff: Some(64.0), + ..Default::default() + }, + hardware: HardwareConfig { + gpu_flops: 1.0e14, + gpu_mem_bw: 1.0e12, + hbm_bytes: 1.0e9, + dram_bytes: 4.0e9, + pcie_bw: 32.0e9, + pcie_latency_us: 1.0, + rdma_bw: 12.0e9, + rdma_latency_us: 5.0, + max_batch_slots: 32, + prefill_chunk_tokens: 1024, + }, + cluster: ClusterConfig { + num_instances: 4, + meta_store: MetaStoreConfig { ttl_seconds: 1000.0 }, + router: RouterConfig { + mode, + precise_probe_latency_us: 10.0, + precise_probe_topk: 4, + load_alpha: 0.1, + score_alpha: 1.0, + score_beta: 0.1, + prefix_k: 8, + affinity_fan_out: 0, + }, + }, + sim: SimConfig { + trace_path: trace_path.into(), + max_requests: None, + output_dir: out_dir.into(), + sample_interval_s: 0.0, + seed: 7, + }, + } +} + +fn write_synthetic_trace(path: &std::path::Path) { + // 5 distinct conversations, each with 8 turns. Within a conversation, + // turn k+1 reuses the prefix of turn k (shared first ~10 blocks) and + // appends a few new blocks. This is the canonical KV-prefix-cache pattern. 
+ let mut f = std::fs::File::create(path).unwrap(); + let mut t = 0.0_f64; + let mut req_id_counter: i64 = 0; + for conv in 0..5i64 { + let mut prefix: Vec = (0..10).map(|i| conv * 1_000_000 + i).collect(); + for turn in 0..8 { + let mut hashes = prefix.clone(); + // Append 2 new blocks unique to this turn + for j in 0..2 { + let h = conv * 1_000_000 + 100 + (turn as i64) * 10 + j; + hashes.push(h); + } + req_id_counter += 1; + let line = serde_json::json!({ + "chat_id": conv, + "parent_chat_id": -1, + "timestamp": t, + "input_length": (hashes.len() as i64) * 16, + "output_length": 16, // 1 block of decode + "type": "text", + "turn": turn, + "hash_ids": hashes, + }); + writeln!(f, "{}", line).unwrap(); + // Next turn's prefix grows to include this turn's appended blocks + prefix = hashes; + t += 0.05; + } + let _ = req_id_counter; + } +} + +fn run(mode: RouterMode, trace_path: &std::path::Path, out_root: &std::path::Path) + -> kvcache_simulator::metrics::Summary +{ + let cfg = base_config( + trace_path.to_str().unwrap(), + out_root.to_str().unwrap(), + mode, + ); + let res = driver::run(&cfg, Some(mode.as_str())).expect("sim run"); + res.summary +} + +#[test] +fn ablation_hit_rate_ordering() { + let tmp = std::env::temp_dir().join("kvcache_sim_smoke"); + let _ = std::fs::remove_dir_all(&tmp); + std::fs::create_dir_all(&tmp).unwrap(); + let trace_path = tmp.join("trace.jsonl"); + write_synthetic_trace(&trace_path); + + let s_random = run(RouterMode::Random, &trace_path, &tmp); + let s_ll = run(RouterMode::LeastLoaded, &trace_path, &tmp); + let s_ttl = run(RouterMode::TtlAware, &trace_path, &tmp); + let s_prec = run(RouterMode::Precise, &trace_path, &tmp); + + let total_hit = |s: &kvcache_simulator::metrics::Summary| { + s.hit_rate_l0 + s.hit_rate_l1 + s.hit_rate_remote + }; + + let h_rand = total_hit(&s_random); + let h_ll = total_hit(&s_ll); + let h_ttl = total_hit(&s_ttl); + let h_prec = total_hit(&s_prec); + + eprintln!( + "smoke: hit rates random={:.3} 
least_loaded={:.3} ttl={:.3} precise={:.3}", + h_rand, h_ll, h_ttl, h_prec + ); + eprintln!( + " remote+local hit ratio L0/L1/remote: \ + random=({:.2},{:.2},{:.2}) precise=({:.2},{:.2},{:.2})", + s_random.hit_rate_l0, s_random.hit_rate_l1, s_random.hit_rate_remote, + s_prec.hit_rate_l0, s_prec.hit_rate_l1, s_prec.hit_rate_remote, + ); + + // ttl_aware and precise should outperform random / least_loaded for + // a workload built entirely of shared-prefix conversations. + let eps = 1e-6; + assert!( + h_ttl + eps >= h_rand, + "ttl_aware should >= random hit rate" + ); + assert!( + h_prec + eps >= h_rand, + "precise should >= random hit rate" + ); + assert!( + h_prec + eps >= h_ll, + "precise should >= least_loaded hit rate" + ); +}