fix: kvcache evict workflow

Support compute_dtype for FP4/FP8 tensor core FLOPS selection
Add `compute_dtype` field to ModelConfig ("bf16", "fp8", "fp4"), which controls two things:

- GPU FLOPS tier: auto-selects from preset FP4/FP8/BF16 TFLOPS
- Weight bytes: uses 0.5/1.0/2.0 bytes per param for the memory-bound check

Hardware presets now include per-GPU FP8 and FP4 dense FLOPS for all GPUs that support them (H100/H800/H20: FP8; B200/B300: FP8+FP4). Config resolution auto-selects the right FLOPS when compute_dtype is set and the user hasn't explicitly overridden gpu_flops.

GLM-5-NVFP4 on 8xB300 now correctly uses 13.5 PFLOPS/GPU FP4 (6x faster prefill) and 0.5 bytes/param weights (halved memory footprint).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 15:46:36 +08:00 · 2026-04-14 11:54:10 +08:00 · 2026-04-14 11:37:20 +08:00
9 changed files with 459 additions and 87 deletions
--- a/configs/glm5-nvfp4-8xb300.yaml
+++ b/configs/glm5-nvfp4-8xb300.yaml
@@ -0,0 +1,33 @@
+# GLM-5-NVFP4 (nvidia/GLM-5-NVFP4) on 8 x B300 (Blackwell Ultra, 288GB each).
+# Architecture auto-loaded from HuggingFace config.json.
+#
+# FP4 weights: ~744B params * 0.5 bytes = ~372 GB across 8 GPUs.
+# Total HBM: 8 * 288 GB = 2304 GB.  KV budget: ~1900 GB after weights.
+
+model:
+  config_json: ../models/GLM-5-NVFP4/config.json
+  name: glm-5-nvfp4
+  compute_dtype: fp4            # FP4 weights → selects FP4 tensor core FLOPS
+  dtype_bytes: 1                # FP8 KV cache
+  block_size_tokens: 512
+
+hardware:
+  type: 8xb300
+  hbm_bytes: 1900.0e9          # KV budget after FP4 weights (~372 GB)
+  dram_bytes: 1.5e12             # ~1.5 TB usable CPU DRAM per node
+
+cluster:
+  num_instances: 8
+  meta_store:
+    ttl_seconds: 300.0
+  router:
+    mode: prefix_affinity
+    prefix_k: 8
+    load_alpha: 1.0
+
+sim:
+  trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
+  max_requests: null
+  output_dir: runs/glm5_nvfp4_8xb300
+  sample_interval_s: 1.0
+  seed: 42
--- a/models/GLM-5-NVFP4/config.json
+++ b/models/GLM-5-NVFP4/config.json
@@ -0,0 +1,59 @@
+{
+  "architectures": [
+    "GlmMoeDsaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": [
+    154820,
+    154827,
+    154829
+  ],
+  "ep_size": 1,
+  "first_k_dense_replace": 3,
+  "hidden_act": "silu",
+  "head_dim": 64,
+  "hidden_size": 6144,
+  "index_head_dim": 128,
+  "index_n_heads": 32,
+  "index_topk": 2048,
+  "indexer_rope_interleave": true,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "kv_lora_rank": 512,
+  "max_position_embeddings": 202752,
+  "moe_intermediate_size": 2048,
+  "moe_layer_freq": 1,
+  "model_type": "glm_moe_dsa",
+  "n_group": 1,
+  "n_routed_experts": 256,
+  "n_shared_experts": 1,
+  "norm_topk_prob": true,
+  "num_attention_heads": 64,
+  "num_experts_per_tok": 8,
+  "num_hidden_layers": 78,
+  "num_key_value_heads": 64,
+  "num_nextn_predict_layers": 1,
+  "pad_token_id": 154820,
+  "pretraining_tp": 1,
+  "q_lora_rank": 2048,
+  "qk_head_dim": 256,
+  "qk_nope_head_dim": 192,
+  "qk_rope_head_dim": 64,
+  "rms_norm_eps": 1e-05,
+  "rope_interleave": true,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "routed_scaling_factor": 2.5,
+  "scoring_func": "sigmoid",
+  "tie_word_embeddings": false,
+  "topk_group": 1,
+  "topk_method": "noaux_tc",
+  "transformers_version": "5.0.2.dev0",
+  "use_cache": true,
+  "v_head_dim": 256,
+  "vocab_size": 154880
+}
--- a/src/cluster/cluster.rs
+++ b/src/cluster/cluster.rs
@@ -4,6 +4,7 @@
 use crate::cluster::meta_store::MetaStore;
 use crate::config::{Config, ModelConfig};
 use crate::instance::instance::AdmittedRequest;
+use crate::instance::kv_cache::L1Change;
 use crate::instance::Instance;
 use crate::router::{self, RouteDecision, Router};
 use crate::trace::RequestRecord;
@@ -53,7 +54,9 @@ impl Cluster {
    /// per-request stats for metrics. Does NOT schedule the BatchTick — the
    /// simulator driver does that based on the returned `ready_at`.
    pub fn route_and_admit(&mut self, req: &RequestRecord, now: f64) -> AdmissionStats {
-        let decision = self.router.route(req, &self.instances, &self.meta_store, now);
+        let decision = self
+            .router
+            .route(req, &self.instances, &self.meta_store, now);
        let inst_id = decision.chosen;
        let probe_overhead_s = decision.probe_overhead_s;

@@ -68,19 +71,18 @@ impl Cluster {

        // 2. L1 lookup on the remaining suffix.
        let suffix_after_l0 = &req.hash_ids[l0_hits as usize..];
-        let l1_hits = inst.cache.l1.longest_prefix(suffix_after_l0) as u32;
+        let l1_hits = inst.cache.l1.longest_prefix_peek(suffix_after_l0) as u32;
        // L1->L0 transfer cost
        let l1_bytes = (l1_hits as u64) * self.kv_block_bytes;
        let mut t = effective_now;
+        let mut l1_changes = Vec::new();
        if l1_hits > 0 {
            t = inst.links.pcie.reserve(t, l1_bytes);
-            // Promote those blocks into L0
-            let mut evicted = Vec::new();
-            inst.cache.l0.insert_blocks(
-                &suffix_after_l0[..l1_hits as usize],
-                &mut evicted,
-            );
+            l1_changes = inst
+                .cache
+                .promote_l1_blocks_to_l0(&suffix_after_l0[..l1_hits as usize]);
        }
+        Self::apply_l1_changes(&mut self.meta_store, inst_id, now, &l1_changes);

        // 3. Remote v6d lookup for the still-remaining suffix.
        let suffix_after_l1 = &suffix_after_l0[l1_hits as usize..];
@@ -98,20 +100,14 @@ impl Cluster {
        }
        let remote_bytes = (remote_hit_blocks as u64) * self.kv_block_bytes;
        if remote_hit_blocks > 0 {
-            // RDMA from peer host -> local DRAM, then PCIe -> GPU
+            let pulled = &suffix_after_l1[..remote_hit_blocks as usize];
+            let l1_changes = {
                let inst = &mut self.instances[inst_id as usize];
                t = inst.links.rdma.reserve(t, remote_bytes);
                t = inst.links.pcie.reserve(t, remote_bytes);
-            // Insert into local L1 (occupies LRU space) AND into L0
-            let pulled = &suffix_after_l1[..remote_hit_blocks as usize];
-            let mut evicted_l1 = Vec::new();
-            inst.cache.l1.insert_blocks(pulled, &mut evicted_l1);
-            let mut evicted_l0 = Vec::new();
-            inst.cache.l0.insert_blocks(pulled, &mut evicted_l0);
-            // The local instance now also owns these blocks - update meta_store.
-            for &h in pulled {
-                self.meta_store.insert(h, inst_id, now);
-            }
+                inst.cache.fetch_remote_blocks_to_l0(pulled)
+            };
+            Self::apply_l1_changes(&mut self.meta_store, inst_id, now, &l1_changes);
        }

        // 4. Miss = remaining tokens to prefill from scratch.
@@ -119,20 +115,14 @@ impl Cluster {
        let miss_tokens = miss_blocks * self.block_size_tokens;

        // The newly-prefilled blocks (after the request runs) are inserted
-        // into L0 here, and into L1 / meta_store via async writeback. Doing
-        // this at admission time is OK because we're tracking presence, not
-        // actually moving bytes — the writeback latency is hidden behind
-        // request execution and we don't model meta_store inconsistency
-        // window beyond the TTL itself.
-        let inst = &mut self.instances[inst_id as usize];
+        // into L0 here. Only later L0 evictions become remotely visible by
+        // landing in L1 and being published to the meta store.
        let new_input_blocks = &req.hash_ids[(l0_hits + l1_hits + remote_hit_blocks) as usize..];
-        let mut evicted_l0 = Vec::new();
-        inst.cache.l0.insert_blocks(new_input_blocks, &mut evicted_l0);
-        let mut evicted_l1 = Vec::new();
-        inst.cache.l1.insert_blocks(new_input_blocks, &mut evicted_l1);
-        for &h in new_input_blocks {
-            self.meta_store.insert(h, inst_id, now);
-        }
+        let l1_changes = {
+            let inst = &mut self.instances[inst_id as usize];
+            inst.cache.insert_blocks_into_l0(new_input_blocks)
+        };
+        Self::apply_l1_changes(&mut self.meta_store, inst_id, now, &l1_changes);

        // 5. Reserve KV slots for this request's prefill residency.
        //    PD disaggregation: decode runs elsewhere, so only the input
@@ -145,6 +135,7 @@ impl Cluster {
            prefill_tokens_remaining: miss_tokens,
            reserved_blocks,
        };
+        let inst = &mut self.instances[inst_id as usize];
        inst.admit(admitted);

        let pcie_bytes = l1_bytes + remote_bytes;
@@ -164,4 +155,18 @@ impl Cluster {
            decision,
        }
    }
+
+    fn apply_l1_changes(
+        meta_store: &mut MetaStore,
+        inst_id: InstanceId,
+        now: f64,
+        changes: &[L1Change],
+    ) {
+        for change in changes {
+            match *change {
+                L1Change::Added(hash) => meta_store.insert(hash, inst_id, now),
+                L1Change::Removed(hash) => meta_store.remove(hash, inst_id),
+            }
+        }
+    }
 }
--- a/src/cluster/meta_store.rs
+++ b/src/cluster/meta_store.rs
@@ -116,6 +116,21 @@ impl MetaStore {
        scores
    }

+    /// Remove `instance`'s entry for `block_hash` (e.g. after L1 eviction).
+    ///
+    /// The meta-store must reflect **L1 (DRAM) presence only**, because remote
+    /// RDMA fetch can only reach CPU DRAM, never GPU HBM.  Whenever the L1
+    /// tier evicts a block, the caller must invoke this so the meta-store
+    /// stops advertising the block as remotely available on this instance.
+    pub fn remove(&mut self, block_hash: u64, instance: InstanceId) {
+        if let Some(bucket) = self.map.get_mut(&block_hash) {
+            bucket.retain(|e| e.instance != instance);
+            if bucket.is_empty() {
+                self.map.remove(&block_hash);
+            }
+        }
+    }
+
    /// Lookup which (alive) instances claim to hold a given block.
    pub fn instances_for(&self, hash: u64, now: f64) -> SmallVec<[InstanceId; 4]> {
        let mut out = SmallVec::new();
@@ -149,6 +164,34 @@ mod tests {
        assert_eq!(s[2], 0);
    }

+    #[test]
+    fn remove_cleans_up() {
+        let mut m = MetaStore::new(60.0);
+        m.insert(10, 0, 0.0);
+        m.insert(10, 1, 0.0);
+        m.insert(11, 0, 0.0);
+
+        // instance 0 has both blocks, instance 1 has block 10 only
+        let owners = m.instances_for(10, 0.5);
+        assert_eq!(owners.len(), 2);
+
+        // Remove instance 0's entry for block 10
+        m.remove(10, 0);
+        let owners = m.instances_for(10, 0.5);
+        assert_eq!(owners.len(), 1);
+        assert_eq!(owners[0], 1);
+
+        // Instance 0 still owns block 11
+        let owners = m.instances_for(11, 0.5);
+        assert_eq!(owners.len(), 1);
+        assert_eq!(owners[0], 0);
+
+        // Remove last owner of a block -> entry fully cleaned
+        m.remove(10, 1);
+        let owners = m.instances_for(10, 0.5);
+        assert!(owners.is_empty());
+    }
+
    #[test]
    fn ttl_expiry() {
        let mut m = MetaStore::new(1.0);
--- a/src/config.rs
+++ b/src/config.rs
@@ -57,6 +57,13 @@ pub struct ModelConfig {
    #[serde(default)]
    pub attention: Option<AttentionConfig>,

+    /// Compute / weight precision: `"bf16"` (default), `"fp8"`, or `"fp4"`.
+    /// Controls which hardware FLOPS tier to use (`gpu_fp4_flops`, etc.) and
+    /// the weight-bytes-per-parameter for the memory-bound roofline check.
+    /// Independent of `dtype_bytes`, which sizes the KV cache.
+    #[serde(default)]
+    pub compute_dtype: Option<String>,
+
    // -- Legacy manual coefficients (used when hidden_size is absent) ---------
    #[serde(default)]
    pub flops_per_token_prefill: Option<f64>,
@@ -79,6 +86,20 @@ impl ModelConfig {
        self.hidden_size.is_some()
    }

+    /// Bytes per parameter for weight storage, derived from `compute_dtype`.
+    ///
+    /// - `"fp4"` → 0.5
+    /// - `"fp8"` → 1.0
+    /// - `"bf16"` → 2.0; absent or unrecognized → `dtype_bytes` (backward-compatible)
+    pub fn weight_dtype_bytes(&self) -> f64 {
+        match self.compute_dtype.as_deref() {
+            Some("fp4") => 0.5,
+            Some("fp8") => 1.0,
+            Some("bf16") => 2.0,
+            _ => self.dtype_bytes as f64, // backward compat
+        }
+    }
+
    /// Bytes of KV cache per block.
    ///
    /// For standard / GQA: `2 * L * kv_heads * head_dim * dtype * block_tokens`
@@ -147,7 +168,14 @@ pub enum AttentionConfig {

 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct HardwareConfig {
+    /// Active GPU FLOPS (selected from bf16/fp8/fp4 based on compute_dtype).
    pub gpu_flops: f64,
+    /// FP8 tensor core FLOPS (0 if not populated by preset).
+    #[serde(default)]
+    pub gpu_fp8_flops: f64,
+    /// FP4 tensor core FLOPS (0 if not populated by preset).
+    #[serde(default)]
+    pub gpu_fp4_flops: f64,
    pub gpu_mem_bw: f64,
    pub hbm_bytes: f64,
    pub dram_bytes: f64,
@@ -368,6 +396,8 @@ struct RawModelConfig {
    #[serde(default)]
    bytes_per_token_prefill: Option<f64>,
    #[serde(default)]
+    compute_dtype: Option<String>,
+    #[serde(default)]
    flops_per_token_decode: Option<f64>,
    #[serde(default)]
    bytes_per_token_decode: Option<f64>,
@@ -407,12 +437,25 @@ struct RawHardwareConfig {

 impl RawConfig {
    fn resolve(self, yaml_dir: &Path) -> Result<Config> {
-        Ok(Config {
-            model: self.model.resolve(yaml_dir)?,
-            hardware: self.hardware.resolve()?,
-            cluster: self.cluster,
-            sim: self.sim,
-        })
+        let model = self.model.resolve(yaml_dir)?;
+        let user_set_gpu_flops = self.hardware.gpu_flops.is_some();
+        let mut hardware = self.hardware.resolve()?;
+
+        // Auto-select gpu_flops tier based on model's compute_dtype,
+        // but only if the user did NOT explicitly override gpu_flops in YAML.
+        if !user_set_gpu_flops {
+            match model.compute_dtype.as_deref() {
+                Some("fp4") if hardware.gpu_fp4_flops > 0.0 => {
+                    hardware.gpu_flops = hardware.gpu_fp4_flops;
+                }
+                Some("fp8") if hardware.gpu_fp8_flops > 0.0 => {
+                    hardware.gpu_flops = hardware.gpu_fp8_flops;
+                }
+                _ => {} // keep BF16 (also when the preset has no FLOPS figure for the requested dtype)
+            }
+        }
+
+        Ok(Config { model, hardware, cluster: self.cluster, sim: self.sim })
    }
 }

@@ -446,6 +489,7 @@ impl RawModelConfig {
        if let Some(v) = self.flops_per_token_prefill { m.flops_per_token_prefill = Some(v); }
        if let Some(v) = self.attn_quadratic_coeff { m.attn_quadratic_coeff = Some(v); }
        if let Some(v) = self.bytes_per_token_prefill { m.bytes_per_token_prefill = Some(v); }
+        if self.compute_dtype.is_some() { m.compute_dtype = self.compute_dtype; }
        if let Some(v) = self.flops_per_token_decode { m.flops_per_token_decode = Some(v); }
        if let Some(v) = self.bytes_per_token_decode { m.bytes_per_token_decode = Some(v); }

@@ -476,6 +520,8 @@ impl RawHardwareConfig {
        } else {
            HardwareConfig {
                gpu_flops: 0.0,
+                gpu_fp8_flops: 0.0,
+                gpu_fp4_flops: 0.0,
                gpu_mem_bw: 0.0,
                hbm_bytes: 0.0,
                dram_bytes: 0.0,
--- a/src/hardware_presets.rs
+++ b/src/hardware_presets.rs
@@ -20,6 +20,7 @@ pub const AVAILABLE: &[&str] = &[
    "a100-80gb",
    "a100-40gb",
    "b200",
+    "b300",
    "2xh100",
    "4xh100",
    "8xh100",
@@ -32,6 +33,9 @@ pub const AVAILABLE: &[&str] = &[
    "2xb200",
    "4xb200",
    "8xb200",
+    "2xb300",
+    "4xb300",
+    "8xb300",
 ];

 /// Resolve a hardware preset by name.
@@ -48,6 +52,7 @@ pub fn resolve(name: &str) -> Option<HardwareConfig> {
        "a10080gb" | "a100" => Some(make_config(count, &A100_80GB)),
        "a10040gb" => Some(make_config(count, &A100_40GB)),
        "b200" => Some(make_config(count, &B200)),
+        "b300" => Some(make_config(count, &B300)),
        _ => None,
    }
 }
@@ -73,14 +78,18 @@ fn parse_count_gpu(s: &str) -> (u32, String) {
 // -- Per-GPU base specs (single die, BF16 dense) -----------------------------

 struct GpuBase {
-    flops: f64,   // BF16 dense TFLOPS
+    flops: f64,      // BF16 dense FLOPS
+    fp8_flops: f64,  // FP8 dense FLOPS (0 = not supported)
+    fp4_flops: f64,  // FP4 dense FLOPS (0 = not supported)
    mem_bw: f64,     // HBM bandwidth (B/s)
    hbm: f64,        // Total HBM (bytes)
    pcie_gen: u32,   // PCIe generation (4/5/6)
 }

 const H100: GpuBase = GpuBase {
-    flops: 9.89e14,  // 989 TFLOPS BF16
+    flops: 9.89e14,    // 989 TFLOPS BF16 dense
+    fp8_flops: 1.979e15, // 1979 TFLOPS FP8 dense
+    fp4_flops: 0.0,    // not supported
    mem_bw: 3.35e12,   // 3.35 TB/s HBM3
    hbm: 80.0e9,       // 80 GB
    pcie_gen: 5,
@@ -88,6 +97,8 @@ const H100: GpuBase = GpuBase {

 const H800: GpuBase = GpuBase {
    flops: 9.89e14,    // same die as H100
+    fp8_flops: 1.979e15,
+    fp4_flops: 0.0,
    mem_bw: 3.35e12,   // 3.35 TB/s HBM3
    hbm: 80.0e9,       // 80 GB
    pcie_gen: 5,
@@ -95,6 +106,8 @@ const H800: GpuBase = GpuBase {

 const H20: GpuBase = GpuBase {
    flops: 1.48e14,    // 148 TFLOPS BF16 (China-export Hopper)
+    fp8_flops: 2.96e14, // 296 TFLOPS FP8
+    fp4_flops: 0.0,    // not supported
    mem_bw: 4.0e12,    // 4.0 TB/s HBM3
    hbm: 96.0e9,       // 96 GB
    pcie_gen: 5,
@@ -102,6 +115,8 @@ const H20: GpuBase = GpuBase {

 const A100_80GB: GpuBase = GpuBase {
    flops: 3.12e14,    // 312 TFLOPS BF16
+    fp8_flops: 0.0,    // A100 has no FP8 tensor cores
+    fp4_flops: 0.0,
    mem_bw: 2.0e12,    // 2.0 TB/s HBM2e
    hbm: 80.0e9,       // 80 GB
    pcie_gen: 4,
@@ -109,18 +124,33 @@ const A100_80GB: GpuBase = GpuBase {

 const A100_40GB: GpuBase = GpuBase {
    flops: 3.12e14,    // 312 TFLOPS BF16
+    fp8_flops: 0.0,
+    fp4_flops: 0.0,
    mem_bw: 1.555e12,  // 1.555 TB/s HBM2e
    hbm: 40.0e9,       // 40 GB
    pcie_gen: 4,
 };

+// DGX B200 (8 GPU) specs: BF16 18 PFLOPS, FP8 36 PFLOPS, FP4 72 PFLOPS (dense)
 const B200: GpuBase = GpuBase {
-    flops: 2.25e15,  // 2250 TFLOPS BF16
+    flops: 2.25e15,    // 2250 TFLOPS BF16 dense
+    fp8_flops: 4.5e15, // 4500 TFLOPS FP8 dense
+    fp4_flops: 9.0e15, // 9000 TFLOPS FP4 dense
    mem_bw: 8.0e12,    // 8.0 TB/s HBM3e
    hbm: 192.0e9,      // 192 GB
    pcie_gen: 6,
 };

+// DGX B300 (8 GPU) specs: BF16 18 PFLOPS, FP8 ~54 PFLOPS, FP4 108 PFLOPS (dense)
+const B300: GpuBase = GpuBase {
+    flops: 2.25e15,      // 2250 TFLOPS BF16 dense (same BF16 rate as B200)
+    fp8_flops: 6.75e15,  // 6750 TFLOPS FP8 dense (estimated from FP4/2)
+    fp4_flops: 13.5e15,  // 13500 TFLOPS FP4 dense (Blackwell Ultra enhanced)
+    mem_bw: 12.0e12,     // 12 TB/s HBM3e 12-Hi
+    hbm: 288.0e9,        // 288 GB HBM3e 12-Hi
+    pcie_gen: 6,
+};
+
 /// Build a [`HardwareConfig`] from a base GPU spec × TP count.
 ///
 /// Compute, HBM bandwidth, and HBM capacity scale linearly with `n`.
@@ -153,6 +183,8 @@ fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {

    HardwareConfig {
        gpu_flops: base.flops * f,
+        gpu_fp8_flops: base.fp8_flops * f,
+        gpu_fp4_flops: base.fp4_flops * f,
        gpu_mem_bw: base.mem_bw * f,
        hbm_bytes: base.hbm * f,
        dram_bytes: dram,
--- a/src/instance/compute.rs
+++ b/src/instance/compute.rs
@@ -75,7 +75,8 @@ impl ComputeModel {
        let n_kv = model.num_kv_heads as f64;
        let hd = model.head_dim as f64;
        let inter = model.intermediate_size.unwrap_or(0) as f64;
-        let dtype = model.dtype_bytes as f64;
+        // Weight dtype for memory-bound check (separate from KV cache dtype).
+        let wdtype = model.weight_dtype_bytes();

        // --- Attention linear FLOPs/token/layer ---
        let attn_linear = if let Some(mla) = &model.mla {
@@ -134,18 +135,18 @@ impl ComputeModel {
            (h * qlr + qlr * n_heads * qk_hd
                + h * (kvlr + qk_rd)
                + n_heads * vhd * h)
-                * dtype
+                * wdtype
        } else {
-            ((n_heads + 2.0 * n_kv) * hd * h + n_heads * hd * h) * dtype
+            ((n_heads + 2.0 * n_kv) * hd * h + n_heads * hd * h) * wdtype
        };
        let mlp_wt = if let Some(moe) = &model.moe {
            let expert_inter = moe.expert_intermediate_size
                .unwrap_or(model.intermediate_size.unwrap_or(0)) as f64;
            let active = moe.num_active_experts as f64;
            let shared = moe.num_shared_experts as f64;
-            (active * 3.0 * h * expert_inter + shared * 3.0 * h * inter) * dtype
+            (active * 3.0 * h * expert_inter + shared * 3.0 * h * inter) * wdtype
        } else {
-            3.0 * h * inter * dtype
+            3.0 * h * inter * wdtype
        };
        let weight_bytes = attn_wt + mlp_wt;

@@ -385,6 +386,8 @@ mod tests {
        };
        let hw = HardwareConfig {
            gpu_flops: 1e14,
+            gpu_fp8_flops: 0.0,
+            gpu_fp4_flops: 0.0,
            gpu_mem_bw: 1e12,
            hbm_bytes: 1e9,
            dram_bytes: 4e9,
--- a/src/instance/kv_cache.rs
+++ b/src/instance/kv_cache.rs
@@ -10,6 +10,12 @@

 use ahash::AHashMap;

+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum L1Change {
+    Added(u64),
+    Removed(u64),
+}
+
 /// Doubly-linked-list-backed LRU keyed by block hash.
 #[derive(Debug)]
 pub struct LruBlocks {
@@ -56,6 +62,16 @@ impl LruBlocks {
        self.map.contains_key(&key)
    }

+    pub fn remove(&mut self, key: u64) -> bool {
+        if let Some(idx) = self.map.remove(&key) {
+            self.detach(idx);
+            self.free.push(idx);
+            true
+        } else {
+            false
+        }
+    }
+
    /// Touch (move to MRU) if present. Returns whether the key was present.
    pub fn touch(&mut self, key: u64) -> bool {
        if let Some(&idx) = self.map.get(&key) {
@@ -70,31 +86,45 @@ impl LruBlocks {
    /// existing block just touches it.
    pub fn insert_blocks(&mut self, hashes: &[u64], evicted_out: &mut Vec<u64>) {
        for &h in hashes {
-            if self.touch(h) {
-                continue;
+            if let Some(evicted) = self.insert_block(h) {
+                evicted_out.push(evicted);
            }
-            // need to make room?
+        }
+    }
+
+    pub fn insert_block(&mut self, key: u64) -> Option<u64> {
+        if self.touch(key) {
+            return None;
+        }
+        let mut evicted = None;
        if self.map.len() == self.capacity {
            if let Some(tail_idx) = self.tail {
                let tail_key = self.nodes[tail_idx].key;
                self.detach(tail_idx);
                self.map.remove(&tail_key);
                self.free.push(tail_idx);
-                    evicted_out.push(tail_key);
+                evicted = Some(tail_key);
            }
        }
-            // allocate node
        let idx = if let Some(i) = self.free.pop() {
-                self.nodes[i] = Node { key: h, prev: None, next: None };
+            self.nodes[i] = Node {
+                key,
+                prev: None,
+                next: None,
+            };
            i
        } else {
            let i = self.nodes.len();
-                self.nodes.push(Node { key: h, prev: None, next: None });
+            self.nodes.push(Node {
+                key,
+                prev: None,
+                next: None,
+            });
            i
        };
-            self.map.insert(h, idx);
+        self.map.insert(key, idx);
        self.attach_to_head(idx);
-        }
+        evicted
    }

    /// Longest leading prefix of `hashes` present; touches the matched blocks.
@@ -178,6 +208,68 @@ impl TwoTierCache {
            l1: LruBlocks::new(l1_cap),
        }
    }
+
+    pub fn insert_blocks_into_l0(&mut self, hashes: &[u64]) -> Vec<L1Change> {
+        let mut changes = Vec::new();
+        for &h in hashes {
+            self.insert_block_into_l0(h, &mut changes);
+        }
+        changes
+    }
+
+    pub fn promote_l1_blocks_to_l0(&mut self, hashes: &[u64]) -> Vec<L1Change> {
+        let mut changes = Vec::new();
+        for &h in hashes {
+            if self.l1.remove(h) {
+                changes.push(L1Change::Removed(h));
+            }
+            self.insert_block_into_l0(h, &mut changes);
+        }
+        changes
+    }
+
+    pub fn fetch_remote_blocks_to_l0(&mut self, hashes: &[u64]) -> Vec<L1Change> {
+        let mut changes = Vec::new();
+        for &h in hashes {
+            self.stage_remote_block_in_l1(h, &mut changes);
+            let removed = self.l1.remove(h);
+            debug_assert!(removed, "staged remote block must be present in l1");
+            self.insert_block_into_l0(h, &mut changes);
+        }
+        changes
+    }
+
+    fn insert_block_into_l0(&mut self, hash: u64, changes: &mut Vec<L1Change>) {
+        if self.l0.touch(hash) {
+            return;
+        }
+        if self.l1.remove(hash) {
+            changes.push(L1Change::Removed(hash));
+        }
+        if let Some(evicted_l0) = self.l0.insert_block(hash) {
+            self.demote_into_l1(evicted_l0, changes);
+        }
+    }
+
+    fn stage_remote_block_in_l1(&mut self, hash: u64, changes: &mut Vec<L1Change>) {
+        if self.l0.contains(hash) || self.l1.contains(hash) {
+            return;
+        }
+        if let Some(evicted_l1) = self.l1.insert_block(hash) {
+            changes.push(L1Change::Removed(evicted_l1));
+        }
+    }
+
+    fn demote_into_l1(&mut self, hash: u64, changes: &mut Vec<L1Change>) {
+        debug_assert!(!self.l0.contains(hash));
+        if self.l1.touch(hash) {
+            return;
+        }
+        if let Some(evicted_l1) = self.l1.insert_block(hash) {
+            changes.push(L1Change::Removed(evicted_l1));
+        }
+        changes.push(L1Change::Added(hash));
+    }
 }

 #[cfg(test)]
@@ -223,4 +315,61 @@ mod tests {
        c.insert_blocks(&[4], &mut ev);
        assert_eq!(ev, vec![2]);
    }
+
+    #[test]
+    fn two_tier_cache_demotes_l0_evictions_into_l1() {
+        let mut c = TwoTierCache::new(2, 2);
+
+        assert!(c.insert_blocks_into_l0(&[1, 2]).is_empty());
+        let changes = c.insert_blocks_into_l0(&[3]);
+
+        assert!(c.l0.contains(2));
+        assert!(c.l0.contains(3));
+        assert!(!c.l0.contains(1));
+        assert!(c.l1.contains(1));
+        assert_eq!(changes, vec![L1Change::Added(1)]);
+    }
+
+    #[test]
+    fn promoting_l1_blocks_to_l0_keeps_tiers_exclusive() {
+        let mut c = TwoTierCache::new(2, 2);
+        c.insert_blocks_into_l0(&[1, 2, 3]);
+
+        let changes = c.promote_l1_blocks_to_l0(&[1]);
+
+        assert!(c.l0.contains(1));
+        assert!(c.l0.contains(3));
+        assert!(!c.l0.contains(2));
+        assert!(!c.l1.contains(1));
+        assert!(c.l1.contains(2));
+        assert_eq!(changes, vec![L1Change::Removed(1), L1Change::Added(2)]);
+    }
+
+    #[test]
+    fn reinserting_block_into_l0_removes_duplicate_from_l1() {
+        let mut c = TwoTierCache::new(2, 2);
+        c.insert_blocks_into_l0(&[1, 2, 3]);
+
+        let changes = c.insert_blocks_into_l0(&[1]);
+
+        assert!(c.l0.contains(1));
+        assert!(c.l0.contains(3));
+        assert!(!c.l1.contains(1));
+        assert!(c.l1.contains(2));
+        assert_eq!(changes, vec![L1Change::Removed(1), L1Change::Added(2)]);
+    }
+
+    #[test]
+    fn remote_fetch_uses_l1_capacity_before_promoting_to_l0() {
+        let mut c = TwoTierCache::new(2, 1);
+        c.insert_blocks_into_l0(&[1, 2, 3]);
+
+        let changes = c.fetch_remote_blocks_to_l0(&[4]);
+
+        assert!(c.l0.contains(3));
+        assert!(c.l0.contains(4));
+        assert!(!c.l1.contains(1));
+        assert!(c.l1.contains(2));
+        assert_eq!(changes, vec![L1Change::Removed(1), L1Change::Added(2)]);
+    }
 }
--- a/tests/smoke.rs
+++ b/tests/smoke.rs
@@ -22,6 +22,8 @@ fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config {
        },
        hardware: HardwareConfig {
            gpu_flops: 1.0e14,
+            gpu_fp8_flops: 0.0,
+            gpu_fp4_flops: 0.0,
            gpu_mem_bw: 1.0e12,
            hbm_bytes: 1.0e9,
            dram_bytes: 4.0e9,