speculative: EAGLE3 draft head implementation (Phase 25 step 1)

- eagle3.rs: Eagle3Head struct loads AngelSlim/Qwen3-8B_eagle3 safetensors, runs a single draft step via fc(concat(h_low, h_mid, h_high)) + concat(input_norm(emb), hidden_norm(fused_h)) → 1 midlayer → norm → lm_head → argmax in draft_vocab(32000) → d2t → target_vocab.
- qwen3.rs: new decode_core_with_hidden method that mirrors decode_core but captures hidden states at 3 configurable layer indices (default [11, 23, 35] for the 36-layer Qwen3-8B). Also expose embed_tokens_tensor and (in eagle3) map_draft_to_target as public accessors.
- loader.rs: make_tensor now pub(crate) so eagle3 can reuse it.
- bin/check-eagle3.rs: sanity binary that loads target + EAGLE, runs one prefill + one decode + one EAGLE step, prints the top-5 EAGLE predictions.

Verified on dash5 with prompt "The capital of France is":
  target says: " Paris" then "."
  EAGLE top-5: "," / " Paris" / " Madrid" / "." / " Berlin"

Weights load correctly, d2t mapping works, hidden state hooks are the right shape ([1, 4096]), and EAGLE produces thematically-relevant tokens. The top-1 pick "," doesn't match target's "." at this position, but that's expected: this test uses hidden states from a single decode step with no recursive chaining. A full speculative loop still needs the γ≥2 verify + accept path wired up (next step).
2026-07-01 17:23:22 +08:00
parent 6485c87c5b
commit e04a8ffb18
5 changed files with 577 additions and 1 deletions
--- a/crates/xserv-model/src/bin/check-eagle3.rs
+++ b/crates/xserv-model/src/bin/check-eagle3.rs
@@ -0,0 +1,152 @@
 //! EAGLE3 sanity check: load weights, run one draft step, print top-5 predictions.
 //!
 //! This verifies that:
 //! - Eagle3Head weights load without shape mismatches
 //! - Target hidden states can be captured via decode_core_with_hidden
 //! - Eagle3Head::step produces a valid token id (in target vocab)
 //!
 //! Does NOT measure speedup — that requires a full γ≥2 speculative loop, which
 //! is more complex integration work.
 use std::path::PathBuf;
 use xserv_model::eagle3::{EAGLE_HOOK_LAYERS, Eagle3Head};
 use xserv_model::{BLOCK_SIZE, ModelConfig, PagedKVCache, Qwen3, loader};
 use xserv_tensor::{DType, Device, Tensor};
 use xserv_tokenizer::Tokenizer;
 /// Sanity-check driver: loads the target model and the EAGLE3 head, runs one
 /// prefill and one decode step on the target (capturing the hook-layer hidden
 /// states), runs one EAGLE3 draft step, and prints both predictions plus the
 /// draft head's top-5 candidates.
 ///
 /// Usage: `check-eagle3 <target-model-dir> <eagle3-model-dir> [prompt]`.
 /// Exits with status 1 on bad arguments or a prompt that does not fit the
 /// fixed-size KV cache used here.
 fn main() {
    let args: Vec<String> = std::env::args().collect();
    if args.len() < 3 {
        eprintln!("Usage: check-eagle3 <target-model-dir> <eagle3-model-dir> [prompt]");
        std::process::exit(1);
    }
    let target_dir = PathBuf::from(&args[1]);
    let eagle_dir = PathBuf::from(&args[2]);
    let prompt = args
        .get(3)
        .cloned()
        .unwrap_or_else(|| "The capital of France is".to_string());
    let device: u32 = 0;
    xserv_cuda::device::set_device(device).unwrap();
    let target_config = ModelConfig::from_file(&target_dir.join("config.json"));
    eprintln!("Loading target Qwen3-8B...");
    let target_weights = loader::load_model_dir(&target_dir, Device::Cuda(device));
    let target = Qwen3::from_weights(target_config.clone(), target_weights);
    xserv_cuda::allocator::cached_trim();
    eprintln!("Loading EAGLE3 head from {}", eagle_dir.display());
    let eagle = Eagle3Head::load(&eagle_dir, device);
    xserv_cuda::allocator::cached_trim();
    let tokenizer = Tokenizer::from_file(&target_dir.join("tokenizer.json"));
    let embed_tokens = target.embed_tokens_tensor();
    let ids = tokenizer.encode(&prompt);
    let max_seq_len = 512;
    // The prompt is user-supplied: the prefill stores every prompt token and the
    // decode step below stores one more, so reject prompts that are empty (the
    // prefill argmax would have nothing to read) or too long for the cache.
    if ids.is_empty() || ids.len() + 1 > max_seq_len {
        eprintln!(
            "Prompt must encode to between 1 and {} tokens (got {})",
            max_seq_len - 1,
            ids.len()
        );
        std::process::exit(1);
    }
    let num_blocks = (max_seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE + 2;
    let mut cache = PagedKVCache::new(
        &target_config,
        num_blocks,
        0,
        1,
        num_blocks,
        DType::BF16,
        device,
    );
    cache.register_sequence(0).unwrap();
    // Prefill target.
    let logits = target.forward_prefill_paged(&ids, 0, &mut cache);
    let target_first = *xserv_kernels::argmax_bf16_to_host(&logits).last().unwrap();
    let target_first_text = tokenizer.decode(&[target_first]);
    println!("Prompt: {:?}", prompt);
    println!(
        "Target argmax after prefill: {} ({:?})",
        target_first, target_first_text
    );
    // Now run one target decode step with target_first to get hidden states at the
    // hook layers.
    let pos = cache.seq_len(0);
    target.decode_prepare(&[pos], &[0], &mut cache);
    let ids_gpu = upload_u32(&[target_first]);
    // `pos` is bounded by the prompt-length check above, so the narrowing cast is lossless.
    let pos_gpu = upload_u32(&[pos as u32]);
    let (target_next_logits, hooks) = target.decode_core_with_hidden(
        ids_gpu.as_ptr() as *const std::ffi::c_void,
        pos_gpu.as_ptr() as *const std::ffi::c_void,
        1,
        &[0],
        &mut cache,
        &EAGLE_HOOK_LAYERS,
    );
    let target_next = xserv_kernels::argmax_bf16_single(&target_next_logits);
    let target_next_text = tokenizer.decode(&[target_next]);
    println!(
        "Target argmax after 1 decode step: {} ({:?})",
        target_next, target_next_text
    );
    for (i, h) in hooks.iter().enumerate() {
        println!(
            "hook[{}] (layer {}): shape={:?} dtype={:?}",
            i,
            EAGLE_HOOK_LAYERS[i],
            h.shape(),
            h.dtype()
        );
    }
    // Ask EAGLE what it thinks the NEXT token is (given target_first as prev_token
    // and the hidden states from the position where target_first lives).
    // EAGLE should predict target_next (or close to it) to be useful.
    let (eagle_pred, eagle_logits) = eagle.step(&hooks, embed_tokens, target_first, pos);
    let eagle_pred_text = tokenizer.decode(&[eagle_pred]);
    println!(
        "EAGLE draft prediction: {} ({:?})",
        eagle_pred, eagle_pred_text
    );
    if eagle_pred == target_next {
        println!("MATCH: EAGLE agrees with target on next token.");
    } else {
        println!(
            "MISMATCH: EAGLE draft={} vs target={} (this is fine per-step; check top-5 below)",
            eagle_pred, target_next
        );
    }
    // Show top-5 from eagle logits (in draft vocab space, mapped to target).
    print_top5(&eagle_logits, "EAGLE draft top-5", &eagle, &tokenizer);
 }
 /// Copy a host slice of `u32` values into a buffer obtained from
 /// `xserv_cuda::allocator::cached_alloc`.
 ///
 /// The values are transferred as their raw in-memory bytes (4 per element).
 /// Panics if either the allocation or `copy_from_host` returns an error.
 fn upload_u32(vals: &[u32]) -> xserv_cuda::GpuBuffer {
    let byte_len = vals.len() * 4;
    // SAFETY: a `&[u32]` of `len` elements is exactly `len * 4` initialized,
    // readable bytes, and `u8` has no alignment requirement.
    let host_bytes = unsafe { std::slice::from_raw_parts(vals.as_ptr().cast::<u8>(), byte_len) };
    let mut device_buf = xserv_cuda::allocator::cached_alloc(host_bytes.len()).unwrap();
    device_buf.copy_from_host(host_bytes).unwrap();
    device_buf
 }
 /// Print the five highest-valued entries of `logits`, headed by `label`.
 ///
 /// `logits` is copied to the CPU and read as a flat `bf16` slice whose index is
 /// treated as a draft-vocab id. Each printed row shows the draft id, the target
 /// id obtained through `eagle.map_draft_to_target`, the value, and the decoded
 /// text of the target id.
 fn print_top5(logits: &Tensor, label: &str, eagle: &Eagle3Head, tokenizer: &Tokenizer) {
    use half::bf16;
    let cpu = logits.to_device(Device::Cpu);
    let data = cpu.as_slice::<bf16>();
    let mut vals: Vec<(usize, f32)> = data
        .iter()
        .enumerate()
        .map(|(i, v)| (i, v.to_f32()))
        .collect();
    // Descending by value. `total_cmp` is a total order over f32, so a NaN logit
    // no longer panics this diagnostic (as `partial_cmp(..).unwrap()` did);
    // positive NaNs sort first, which makes a broken forward pass obvious.
    vals.sort_by(|a, b| b.1.total_cmp(&a.1));
    println!("{label}:");
    for (i, val) in vals.iter().take(5) {
        let target_id = eagle.map_draft_to_target(*i as u32);
        let text = tokenizer.decode(&[target_id]);
        println!(
            "  draft_id={} target_id={} val={:.3} text={:?}",
            i, target_id, val, text
        );
    }
 }
--- a/crates/xserv-model/src/eagle3.rs
+++ b/crates/xserv-model/src/eagle3.rs
@@ -0,0 +1,312 @@
 //! EAGLE3 speculative draft head for Qwen3-8B (Phase 25).
 //!
 //! Loads the AngelSlim/Qwen3-8B_eagle3 weights from `model.safetensors`
 //! (converted from the upstream pytorch_model.bin) and provides a
 //! single-step forward pass that takes 3 target hidden states + the previous
 //! token and returns a draft token in the target vocabulary.
 //!
 //! Architecture (from weights):
 //! - fc:           [hidden, 3*hidden] → fuse 3 target hidden states
 //! - midlayer:     1 decoder layer (attn input dim = 2*hidden)
 //! - norm + lm_head: → [draft_vocab_size=32000]
 //! - d2t:          draft_id → target_id offset mapping
 use std::collections::HashMap;
 use std::path::Path;
 use xserv_kernels::*;
 use xserv_tensor::{DType, Device, Tensor};
 /// Target-model layer indices whose outputs are captured and handed to the
 /// draft head as the low / mid / high hidden states (in that order).
 pub const EAGLE_HOOK_LAYERS: [usize; 3] = [11, 23, 35];
 /// Number of entries in the draft vocabulary; the loaded `d2t` table must have
 /// exactly this length.
 const DRAFT_VOCAB_SIZE: usize = 32000;
 /// Multiply two rank-2 tensors with the cuBLAS GEMM backend.
 ///
 /// Panics if either operand is not 2-dimensional.
 fn matmul_2d(a: &Tensor, b: &Tensor) -> Tensor {
    for operand in [a, b] {
        assert_eq!(operand.ndim(), 2);
    }
    matmul(a, b, GemmBackend::CuBlas)
 }
 /// Weights and fixed dimensions of the EAGLE3 draft head: the fusion
 /// projection (`fc`), one decoder "midlayer" (attention + MLP), the final norm,
 /// the draft-vocab LM head and the draft→target id offset table.
 ///
 /// Every `*_wt` matrix is stored transposed relative to the checkpoint (see
 /// `Eagle3Head::load`) so activations multiply as `x · W`. The shapes in the
 /// field comments below are the checkpoint layouts, i.e. BEFORE that transpose.
 pub struct Eagle3Head {
    fc_wt: Tensor,                    // checkpoint [hidden, 3*hidden]; fuses the 3 hook states
    hidden_norm: Tensor,              // [hidden] RMSNorm weight applied to the fused hidden state
    input_layernorm: Tensor,          // [hidden] RMSNorm weight applied to the token embedding
    q_proj_wt: Tensor,                // checkpoint [num_heads*head_dim, 2*hidden]
    k_proj_wt: Tensor,                // checkpoint [num_kv_heads*head_dim, 2*hidden]
    v_proj_wt: Tensor,                // checkpoint [num_kv_heads*head_dim, 2*hidden]
    o_proj_wt: Tensor,                // checkpoint [hidden, num_heads*head_dim]
    gate_proj_wt: Tensor,             // checkpoint [intermediate, hidden]
    up_proj_wt: Tensor,               // checkpoint [intermediate, hidden]
    down_proj_wt: Tensor,             // checkpoint [hidden, intermediate]
    post_attention_layernorm: Tensor, // [hidden] RMSNorm weight applied before the MLP
    norm: Tensor,                     // [hidden] final RMSNorm weight before lm_head
    lm_head_wt: Tensor,               // checkpoint [draft_vocab, hidden]
    d2t: Vec<i64>,                    // [draft_vocab] signed offset: target_id = draft_id + d2t[draft_id]
    hidden_size: usize,
    num_heads: usize,
    num_kv_heads: usize,
    head_dim: usize,
    rope_cache: RopeCache,
 }
 impl Eagle3Head {
    /// Load the EAGLE3 head from `dir/model.safetensors` onto CUDA device `device`.
    ///
    /// The attention dimensions are hard-coded for the Qwen3-8B draft head
    /// (hidden 4096, 32 query heads, 8 KV heads, head_dim 128). The MLP width is
    /// whatever the checkpoint's gate/up/down projections carry. All projection
    /// matrices are transposed once here so the forward pass can compute `x · W`.
    ///
    /// # Panics
    /// - if a required weight is missing from the checkpoint;
    /// - if a projection whose shape the forward pass depends on does not match
    ///   the hard-coded dimensions (checked here so a wrong checkpoint fails at
    ///   load time rather than inside a kernel);
    /// - if `d2t` does not have `DRAFT_VOCAB_SIZE` entries.
    pub fn load(dir: &Path, device: u32) -> Self {
        let (weights, d2t) = load_eagle3_weights(dir, device);
        let hidden_size = 4096;
        let num_heads = 32;
        let num_kv_heads = 8;
        let head_dim = 128;
        let max_seq_len = 2048;
        let rope_theta = 1_000_000.0f32;
        let get = |name: &str| -> Tensor {
            weights
                .get(name)
                .unwrap_or_else(|| panic!("missing eagle3 weight: {name}"))
                .clone()
        };
        // Checkpoint stores [out, in]; the forward pass wants [in, out].
        let get_t = |name: &str| -> Tensor { get(name).transpose(0, 1).contiguous() };
        // Verify a transposed weight against the dimensions assumed by `step`.
        let expect_shape = |name: &str, t: &Tensor, rows: usize, cols: usize| {
            assert!(
                t.ndim() == 2 && t.shape()[0] == rows && t.shape()[1] == cols,
                "eagle3 weight {name}: expected transposed shape [{rows}, {cols}], got {:?}",
                t.shape()
            );
        };
        let fc_wt = get_t("fc.weight");
        let q_proj_wt = get_t("midlayer.self_attn.q_proj.weight");
        let k_proj_wt = get_t("midlayer.self_attn.k_proj.weight");
        let v_proj_wt = get_t("midlayer.self_attn.v_proj.weight");
        let o_proj_wt = get_t("midlayer.self_attn.o_proj.weight");
        let gate_proj_wt = get_t("midlayer.mlp.gate_proj.weight");
        let up_proj_wt = get_t("midlayer.mlp.up_proj.weight");
        let down_proj_wt = get_t("midlayer.mlp.down_proj.weight");
        let hidden_norm = get("midlayer.hidden_norm.weight");
        let input_layernorm = get("midlayer.input_layernorm.weight");
        let post_attention_layernorm = get("midlayer.post_attention_layernorm.weight");
        let norm = get("norm.weight");
        let lm_head_wt = get_t("lm_head.weight");
        expect_shape("fc.weight", &fc_wt, 3 * hidden_size, hidden_size);
        expect_shape(
            "midlayer.self_attn.q_proj.weight",
            &q_proj_wt,
            2 * hidden_size,
            num_heads * head_dim,
        );
        expect_shape(
            "midlayer.self_attn.k_proj.weight",
            &k_proj_wt,
            2 * hidden_size,
            num_kv_heads * head_dim,
        );
        expect_shape(
            "midlayer.self_attn.v_proj.weight",
            &v_proj_wt,
            2 * hidden_size,
            num_kv_heads * head_dim,
        );
        expect_shape(
            "midlayer.self_attn.o_proj.weight",
            &o_proj_wt,
            num_heads * head_dim,
            hidden_size,
        );
        // The argmax over lm_head output indexes `d2t`, so both must agree on the
        // draft vocabulary size.
        expect_shape("lm_head.weight", &lm_head_wt, hidden_size, DRAFT_VOCAB_SIZE);
        assert_eq!(d2t.len(), DRAFT_VOCAB_SIZE);
        let rope_cache = RopeCache::new(max_seq_len, head_dim, rope_theta);
        Self {
            fc_wt,
            hidden_norm,
            input_layernorm,
            q_proj_wt,
            k_proj_wt,
            v_proj_wt,
            o_proj_wt,
            gate_proj_wt,
            up_proj_wt,
            down_proj_wt,
            post_attention_layernorm,
            norm,
            lm_head_wt,
            d2t,
            hidden_size,
            num_heads,
            num_kv_heads,
            head_dim,
            rope_cache,
        }
    }
    /// One draft step: produce a token in target vocabulary space.
    ///
    /// - `target_hidden`: 3 tensors [1, hidden_size] from target hook layers
    /// - `embed_table`: the target model's embed_tokens (shared, not copied)
    /// - `prev_token`: the previous committed token
    /// - `position`: the decode position for RoPE
    ///
    /// Pipeline: fuse the three hook states with `fc`, RMS-normalize the token
    /// embedding and the fused state separately, concatenate them as the
    /// attention input, run the single midlayer (attention + MLP, each with a
    /// residual connection), apply the final norm and the draft LM head, take the
    /// argmax and map it through `d2t`.
    ///
    /// There is no draft KV cache yet, so attention only ever sees the current
    /// token and its output reduces to the (GQA-expanded) value projection.
    ///
    /// Returns (draft_token_in_target_vocab, draft_logits_tensor).
    pub fn step(
        &self,
        target_hidden: &[Tensor; 3],
        embed_table: &Tensor,
        prev_token: u32,
        position: usize,
    ) -> (u32, Tensor) {
        let eps = 1e-6f32;
        // 1. Fuse target hidden states: concat [h_low, h_mid, h_high] → fc
        let h_cat = concat_hidden(target_hidden);
        let fused_h = matmul_2d(&h_cat, &self.fc_wt); // [1, hidden]
        // 2. Embed previous token (shared with target)
        let emb = embedding(embed_table, &[prev_token]); // [1, hidden]
        // 3. Concat normalized: [norm(emb), norm(fused_h)] → [1, 2*hidden]
        let emb_normed = rmsnorm(&emb, &self.input_layernorm, eps);
        let h_normed = rmsnorm(&fused_h, &self.hidden_norm, eps);
        let attn_in = concat_last_dim(&emb_normed, &h_normed); // [1, 2*hidden]
        // 4. Self-attention (no KV cache for simplicity in v0 — single query)
        let q = matmul_2d(&attn_in, &self.q_proj_wt); // [1, num_heads*head_dim]
        let k = matmul_2d(&attn_in, &self.k_proj_wt); // [1, num_kv*head_dim]
        let v = matmul_2d(&attn_in, &self.v_proj_wt); // [1, num_kv*head_dim]
        // Q and K are projected and rotated to keep the layer wiring complete,
        // but with a single key the softmax weight is 1.0, so they do not affect
        // the output until a draft KV cache exists.
        let q_3d = q.reshape(&[1, self.num_heads, self.head_dim]);
        let k_3d = k.reshape(&[1, self.num_kv_heads, self.head_dim]);
        let positions = [position as u32];
        rope_inplace(&q_3d, &self.rope_cache, &positions);
        rope_inplace(&k_3d, &self.rope_cache, &positions);
        // Single-token attention: Q·K^T / sqrt(d) → softmax → V
        // With seq_len=1, attention is trivial: output = V (weight=1.0)
        let attn_out = v.reshape(&[1, self.num_kv_heads, self.head_dim]);
        let attn_out = if self.num_heads != self.num_kv_heads {
            repeat_kv_for_single_token(&attn_out, self.num_heads / self.num_kv_heads)
        } else {
            attn_out
        };
        let attn_merged = attn_out.reshape(&[1, self.num_heads * self.head_dim]);
        let attn_proj = matmul_2d(&attn_merged, &self.o_proj_wt); // [1, hidden]
        // Residual from the fused hidden state (pre-norm). The EAGLE3 midlayer
        // takes its residual from the hidden-state stream; the token embedding
        // only enters through the concatenated attention input.
        let x = add(&attn_proj, &fused_h);
        // 5. MLP
        let normed = rmsnorm(&x, &self.post_attention_layernorm, eps);
        let gate = matmul_2d(&normed, &self.gate_proj_wt);
        let up = matmul_2d(&normed, &self.up_proj_wt);
        let mlp_out = silu_mul(&gate, &up);
        let down = matmul_2d(&mlp_out, &self.down_proj_wt);
        let x = add(&x, &down);
        // 6. Final norm + lm_head
        let x = rmsnorm(&x, &self.norm, eps);
        let logits = matmul_2d(&x, &self.lm_head_wt); // [1, draft_vocab]
        // 7. Argmax in draft vocab → map to target vocab
        let draft_id = argmax_bf16_single(&logits);
        let target_id = self.map_draft_to_target(draft_id);
        (target_id, logits)
    }
    /// Translate a draft-vocabulary id into the target vocabulary.
    ///
    /// `d2t` holds a signed offset per draft id; the result is
    /// `draft_id + d2t[draft_id]` narrowed back to `u32`.
    /// Panics if `draft_id` is not a valid index into `d2t`.
    pub fn map_draft_to_target(&self, draft_id: u32) -> u32 {
        let offset = self.d2t[draft_id as usize];
        let shifted = draft_id as i64 + offset;
        shifted as u32
    }
 }
 fn d2d(dst: *mut u8, src: *const u8, bytes: usize) {
    unsafe {
        xserv_cuda::ffi::cudaMemcpy(dst, src, bytes, xserv_cuda::ffi::CUDA_MEMCPY_D2D);
    }
 }
 /// Concatenate three `[1, h]` tensors along the last dimension into a newly
 /// allocated `[1, 3*h]` tensor, using raw device-to-device copies.
 ///
 /// Width, dtype and device are taken from `hidden[0]`.
 ///
 /// # Panics
 /// If any input is non-contiguous, is not shaped `[1, h]`, or has a different
 /// element size from `hidden[0]`. The copies below are raw byte copies of
 /// `h * elem_bytes` per input, so a mismatched input would otherwise read out
 /// of bounds or silently drop rows.
 fn concat_hidden(hidden: &[Tensor; 3]) -> Tensor {
    let h = hidden[0].shape()[1];
    let dtype = hidden[0].dtype();
    let device = hidden[0].device();
    let elem_bytes = dtype.size_bytes();
    let row_bytes = h * elem_bytes;
    let out = Tensor::empty(&[1, 3 * h], dtype, device);
    for (i, t) in hidden.iter().enumerate() {
        assert!(t.is_contiguous());
        assert!(
            t.ndim() == 2 && t.shape()[0] == 1 && t.shape()[1] == h,
            "concat_hidden: hidden[{i}] has shape {:?}, expected [1, {h}]",
            t.shape()
        );
        assert_eq!(
            t.dtype().size_bytes(),
            elem_bytes,
            "concat_hidden: hidden[{i}] element size differs from hidden[0]"
        );
        // SAFETY: `out` holds 3 * row_bytes bytes and i < 3, so the offset stays
        // inside the allocation.
        let dst = unsafe { (out.data_ptr() as *mut u8).add(i * row_bytes) };
        d2d(dst, t.data_ptr() as *const u8, row_bytes);
    }
    out
 }
 /// Concatenate two single-row tensors `a: [1, da]` and `b: [1, db]` along the
 /// last dimension into a newly allocated `[1, da + db]` tensor, using raw
 /// device-to-device copies.
 ///
 /// Dtype and device are taken from `a`.
 ///
 /// # Panics
 /// If either input is non-contiguous or not shaped `[1, d]`, or if `b`'s
 /// element size differs from `a`'s. The copy is a raw byte copy, so these
 /// conditions are required for it to be correct (same checks as
 /// `concat_hidden`).
 fn concat_last_dim(a: &Tensor, b: &Tensor) -> Tensor {
    for (label, t) in [("a", a), ("b", b)] {
        assert!(
            t.is_contiguous(),
            "concat_last_dim: `{label}` must be contiguous"
        );
        assert!(
            t.ndim() == 2 && t.shape()[0] == 1,
            "concat_last_dim: `{label}` has shape {:?}, expected [1, d]",
            t.shape()
        );
    }
    let da = a.shape()[1];
    let db = b.shape()[1];
    let dtype = a.dtype();
    let device = a.device();
    let elem_bytes = dtype.size_bytes();
    assert_eq!(
        b.dtype().size_bytes(),
        elem_bytes,
        "concat_last_dim: `a` and `b` have different element sizes"
    );
    let out = Tensor::empty(&[1, da + db], dtype, device);
    d2d(
        out.data_ptr() as *mut u8,
        a.data_ptr() as *const u8,
        da * elem_bytes,
    );
    // SAFETY: `out` holds (da + db) * elem_bytes bytes, so offsetting by
    // da * elem_bytes stays inside the allocation.
    let dst = unsafe { (out.data_ptr() as *mut u8).add(da * elem_bytes) };
    d2d(dst, b.data_ptr() as *const u8, db * elem_bytes);
    out
 }
 /// Expand the head axis of a single-token KV tensor `[1, nkv, d]` to
 /// `[1, nkv * repeats, d]` by writing each source head `repeats` times in a
 /// row (head 0 × repeats, then head 1 × repeats, ...).
 ///
 /// With `repeats == 1` the input is returned as a clone. Otherwise each output
 /// head is filled by one device-to-device copy from the matching source head;
 /// the byte offsets used assume `kv` is laid out contiguously.
 fn repeat_kv_for_single_token(kv: &Tensor, repeats: usize) -> Tensor {
    if repeats == 1 {
        return kv.clone();
    }
    let (kv_heads, head_dim) = (kv.shape()[1], kv.shape()[2]);
    let dtype = kv.dtype();
    let head_bytes = head_dim * dtype.size_bytes();
    let expanded = Tensor::empty(&[1, kv_heads * repeats, head_dim], dtype, kv.device());
    let src_base = kv.data_ptr() as *const u8;
    let dst_base = expanded.data_ptr() as *mut u8;
    // Output head `o` is a copy of source head `o / repeats`.
    for out_head in 0..kv_heads * repeats {
        let src = unsafe { src_base.add((out_head / repeats) * head_bytes) };
        let dst = unsafe { dst_base.add(out_head * head_bytes) };
        d2d(dst, src, head_bytes);
    }
    expanded
 }
 /// Read `dir/model.safetensors` and split it into device tensors plus the
 /// `d2t` offset table.
 ///
 /// - `t2d` is ignored.
 /// - `d2t` must be int64; it is decoded on the host as little-endian `i64`s
 ///   and returned as a `Vec` instead of a tensor.
 /// - BF16 / F32 / F16 tensors are built with `loader::make_tensor` and moved
 ///   to `Device::Cuda(device)`; any other dtype is skipped with a message on
 ///   stderr.
 ///
 /// Panics if the file is missing, unreadable or unparsable, if `d2t` has the
 /// wrong dtype, or if no (non-empty) `d2t` was found.
 fn load_eagle3_weights(dir: &Path, device: u32) -> (HashMap<String, Tensor>, Vec<i64>) {
    let st_path = dir.join("model.safetensors");
    assert!(
        st_path.exists(),
        "Eagle3 model.safetensors not found in {}. Convert with:\n\
         python3 -c \"import torch; from safetensors.torch import save_file; \
         sd=torch.load('pytorch_model.bin', map_location='cpu', weights_only=False); \
         save_file(sd, 'model.safetensors')\"",
        dir.display()
    );
    let file_bytes = match std::fs::read(&st_path) {
        Ok(bytes) => bytes,
        Err(e) => panic!("failed to read {}: {e}", st_path.display()),
    };
    let archive = match safetensors::SafeTensors::deserialize(&file_bytes) {
        Ok(parsed) => parsed,
        Err(e) => panic!("failed to parse {}: {e}", st_path.display()),
    };
    let mut device_tensors = HashMap::new();
    let mut offsets: Vec<i64> = Vec::new();
    for (name, view) in archive.tensors() {
        if name == "t2d" {
            continue;
        }
        if name == "d2t" {
            assert_eq!(view.dtype(), safetensors::Dtype::I64);
            // Eight bytes per entry; any trailing partial chunk is ignored.
            offsets = view
                .data()
                .chunks_exact(8)
                .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap()))
                .collect();
            continue;
        }
        let dtype = match view.dtype() {
            safetensors::Dtype::BF16 => DType::BF16,
            safetensors::Dtype::F32 => DType::F32,
            safetensors::Dtype::F16 => DType::F16,
            other => {
                eprintln!("eagle3: skipping {name} with unsupported dtype {other:?}");
                continue;
            }
        };
        let dims: Vec<usize> = view.shape().to_vec();
        let host_tensor = crate::loader::make_tensor(view.data(), &dims, dtype);
        device_tensors.insert(name.to_string(), host_tensor.to_device(Device::Cuda(device)));
    }
    assert!(
        !offsets.is_empty(),
        "d2t tensor not found in eagle3 weights"
    );
    (device_tensors, offsets)
 }
--- a/crates/xserv-model/src/lib.rs
+++ b/crates/xserv-model/src/lib.rs
@@ -1,5 +1,6 @@
 pub mod config;
 pub mod decode_graph;
 pub mod eagle3;
 pub mod gpt2;
 pub mod gpt_oss;
 pub mod gpt_oss_graph;
--- a/crates/xserv-model/src/loader.rs
+++ b/crates/xserv-model/src/loader.rs
@@ -68,7 +68,7 @@ pub fn load_model_dir(dir: &Path, device: Device) -> HashMap<String, Tensor> {
    all_tensors
 }
-fn make_tensor(raw_bytes: &[u8], shape: &[usize], dtype: DType) -> Tensor {
+pub(crate) fn make_tensor(raw_bytes: &[u8], shape: &[usize], dtype: DType) -> Tensor {
    match dtype {
        DType::F32 => {
            let floats: &[f32] = unsafe {
--- a/crates/xserv-model/src/qwen3.rs
+++ b/crates/xserv-model/src/qwen3.rs
@@ -825,6 +825,111 @@ impl Qwen3 {
        matmul_2d(&x, &self.lm_head_t)
    }
    /// Like `decode_core` but also captures hidden states at 3 specified layer
    /// indices (after residual+MLP output). Used by EAGLE3 speculative drafting
    /// to feed the draft head with low/mid/high target representations.
    ///
    /// - `ids_gpu` / `pos_gpu`: device pointers to the token ids and positions
    ///   for this step (passed through to the embedding and RoPE kernels).
    /// - `batch`: number of sequences decoded in this step.
    /// - `seq_slots`: cache slots whose sequence length is advanced by one.
    /// - `hook_layers`: zero-based layer indices to capture; each captured
    ///   tensor is the layer's output `x` (residual + MLP down-projection).
    ///
    /// Returns `(logits, [hidden; 3])` with the hidden states in `hook_layers`
    /// order.
    ///
    /// # Panics
    /// If any entry of `hook_layers` is not a valid layer index. This is checked
    /// before any work is done, so a bad index cannot leave the KV cache with
    /// tokens appended for a step that then fails.
    pub fn decode_core_with_hidden(
        &self,
        ids_gpu: *const std::ffi::c_void,
        pos_gpu: *const std::ffi::c_void,
        batch: usize,
        seq_slots: &[usize],
        paged_cache: &mut PagedKVCache,
        hook_layers: &[usize; 3],
    ) -> (Tensor, [Tensor; 3]) {
        let num_layers = self.layers.len();
        for (h_idx, &h_layer) in hook_layers.iter().enumerate() {
            assert!(
                h_layer < num_layers,
                "hook_layers[{h_idx}] = {h_layer} is out of range for a {num_layers}-layer model"
            );
        }
        let num_heads = self.local_num_heads;
        let num_kv_heads = self.local_num_kv_heads;
        let head_dim = self.config.head_dim();
        let eps = self.config.rms_norm_eps.unwrap_or(1e-6) as f32;
        let bt_ptr = paged_cache.block_table_gpu().as_ptr() as *const i32;
        let cl_ptr = paged_cache.context_lens_gpu().as_ptr() as *const i32;
        let max_blocks = paged_cache.max_blocks_per_seq();
        let mut x = embedding_device_ids(&self.embed_tokens, ids_gpu, batch);
        let mut hooks: [Option<Tensor>; 3] = [None, None, None];
        for (layer_idx, layer) in self.layers.iter().enumerate() {
            // --- attention block ---
            let residual = x.clone();
            let normed = rmsnorm(&x, &layer.input_norm, eps);
            // Fused QKV projection, split along the feature axis.
            let qkv = matmul_2d(&normed, &layer.qkv_proj_wt);
            let q_dim = num_heads * head_dim;
            let kv_dim = num_kv_heads * head_dim;
            let q_all = qkv.narrow(1, 0, q_dim);
            let k_all = qkv.narrow(1, q_dim, kv_dim);
            let v_all = qkv.narrow(1, q_dim + kv_dim, kv_dim);
            // Per-head RMSNorm on Q and K, then RoPE at the device-side positions.
            let q_flat = q_all.contiguous().reshape(&[batch * num_heads, head_dim]);
            let k_flat = k_all
                .contiguous()
                .reshape(&[batch * num_kv_heads, head_dim]);
            let q_normed = rmsnorm(&q_flat, &layer.q_norm, eps);
            let k_normed = rmsnorm(&k_flat, &layer.k_norm, eps);
            let q_3d = q_normed.reshape(&[batch, num_heads, head_dim]);
            let k_3d = k_normed.reshape(&[batch, num_kv_heads, head_dim]);
            rope_inplace_device_pos(&q_3d, &self.rope_cache, pos_gpu);
            rope_inplace_device_pos(&k_3d, &self.rope_cache, pos_gpu);
            let v_3d = v_all.contiguous().reshape(&[batch, num_kv_heads, head_dim]);
            // Append this step's K/V to the paged cache before attending over it.
            paged_cache.append_tokens_batched(layer_idx, &k_3d, &v_3d, batch);
            let q_4d = q_3d.reshape(&[batch, num_heads, 1, head_dim]);
            let k_pool_ptr = paged_cache.k_pool(layer_idx).as_ptr() as *const std::ffi::c_void;
            let v_pool_ptr = paged_cache.v_pool(layer_idx).as_ptr() as *const std::ffi::c_void;
            let attn_out = xserv_kernels::paged_decode_attention(
                &q_4d,
                k_pool_ptr,
                v_pool_ptr,
                bt_ptr,
                cl_ptr,
                batch,
                num_heads,
                num_kv_heads,
                head_dim,
                max_blocks,
            );
            let attn_merged = attn_out.reshape(&[batch, num_heads * head_dim]);
            let attn_proj = matmul_2d(&attn_merged, &layer.o_proj_wt);
            self.all_reduce(&attn_proj);
            // --- MLP block ---
            // Fused residual add + post-attention RMSNorm.
            let (normed, x_new) =
                xserv_kernels::add_rmsnorm(&attn_proj, &residual, &layer.post_norm, eps);
            let residual = x_new.clone();
            let gate_up = matmul_2d(&normed, &layer.gate_up_proj_wt);
            let ffn_dim = gate_up.shape()[1] / 2;
            let gate = gate_up.narrow(1, 0, ffn_dim).contiguous();
            let up = gate_up.narrow(1, ffn_dim, ffn_dim).contiguous();
            let hidden_states = xserv_kernels::silu_mul(&gate, &up);
            let down = matmul_2d(&hidden_states, &layer.down_proj_wt);
            self.all_reduce(&down);
            x = add_any(&residual, &down);
            // Capture this layer's output for every hook slot that names it.
            for (h_idx, &h_layer) in hook_layers.iter().enumerate() {
                if layer_idx == h_layer {
                    hooks[h_idx] = Some(x.clone());
                }
            }
        }
        for &slot in seq_slots {
            paged_cache.advance_seq_len(slot, 1);
        }
        let x = rmsnorm(&x, &self.norm, eps);
        let logits = matmul_2d(&x, &self.lm_head_t);
        // Every hook index was validated against the layer count above, so each
        // slot has been filled by the loop.
        let hidden_arr = [
            hooks[0].take().expect("hook layer 0 not reached"),
            hooks[1].take().expect("hook layer 1 not reached"),
            hooks[2].take().expect("hook layer 2 not reached"),
        ];
        (logits, hidden_arr)
    }
    /// Paged prefill: write a sequence of `new_tokens` K/V into the paged
    /// cache for `slot`, run flash attention via gathered contiguous K/V.
    /// Returns logits [new_tokens, vocab_size].
@@ -1074,6 +1179,12 @@ impl Qwen3 {
        matmul_2d(&x, &self.lm_head_t)
    }
    /// Borrow the target model's token-embedding table.
    ///
    /// Returns a reference to the model's own tensor (nothing is copied), so
    /// speculative draft heads like EAGLE3 can embed tokens with the same table.
    pub fn embed_tokens_tensor(&self) -> &Tensor {
        &self.embed_tokens
    }
    /// Extract weight pointers for CUDA Graph capture.
    pub fn layer_weight_ptrs(&self) -> Vec<crate::decode_graph::LayerWeightPtrs> {
        self.layers