phase19: MoE support — gpt-oss-20b end-to-end inference with TP=2

Add Mixture-of-Experts support for the gpt-oss-20b model (20.9B params, 32 experts × top-4 routing). Key additions: - ModelConfig: MoE fields (num_local_experts, layer_types, sliding_window, attention_bias, explicit head_dim, rope_scaling, swiglu_limit) - YaRN RoPE: RopeCache::new_yarn() with correct frequency interpolation and attention_scaling = 0.1*ln(factor)+1 - Custom GLU kernel: gpt_oss_glu_bf16 (clamped sigmoid gate activation) - Paged attention with sinks + sliding window kernel variant - GptOss model struct with expert-parallel TP (split 32 experts across ranks) - bench-gpt-oss binary for TP inference benchmarking Verified on dash5 with 2x RTX 5090: 63.6 tok/s decode, ~160ms TTFT. Model generates topically-coherent output (needs chat template for quality). Known issues: - Custom GEMV kernel produces NaN with small N (workaround: pad to M=2) - Prefill doesn't use attention sinks (uses standard flash attention) - Output quality requires chat template formatting Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-30 15:18:01 +08:00
parent 46bfb59f30
commit 9ad91a4a92
12 changed files with 1390 additions and 44 deletions
--- a/crates/xserv-kernels/src/activation.rs
+++ b/crates/xserv-kernels/src/activation.rs
@@ -13,6 +13,8 @@ unsafe extern "C" {
    fn launch_mul_f32(a: *const c_void, b: *const c_void, out: *mut c_void, n: i32, stream: *mut c_void);
    fn launch_mul_bf16(a: *const c_void, b: *const c_void, out: *mut c_void, n: i32, stream: *mut c_void);
    fn launch_silu_mul_bf16(gate: *const c_void, up: *const c_void, out: *mut c_void, n: i32, stream: *mut c_void);
    fn launch_gpt_oss_glu_bf16(gate_up: *const c_void, out: *mut c_void, n_elements: i32,
                                alpha: f32, limit: f32, stream: *mut c_void);
 }
 fn dispatch_unary(x: &Tensor, f32_fn: unsafe extern "C" fn(*const c_void, *mut c_void, i32, *mut c_void),
@@ -97,3 +99,31 @@ pub fn silu_mul(gate: &Tensor, up: &Tensor) -> Tensor {
    }
    out
 }
 /// gpt-oss fused GLU activation (BF16 only).
 /// Input: gate_up [rows, 2*D] with interleaved columns (gate=even, up=odd).
 /// Output: [rows, D]
 /// Computes: gate.clamp(max=limit) * sigmoid(gate * alpha) * (up.clamp(-limit,limit) + 1)
 pub fn gpt_oss_glu(gate_up: &Tensor, alpha: f32, limit: f32) -> Tensor {
    assert!(gate_up.is_contiguous());
    assert!(matches!(gate_up.device(), Device::Cuda(_)));
    assert_eq!(gate_up.dtype(), DType::BF16, "gpt_oss_glu requires BF16");
    assert_eq!(gate_up.ndim(), 2);
    let rows = gate_up.shape()[0];
    let cols = gate_up.shape()[1];
    assert_eq!(cols % 2, 0);
    let d = cols / 2;
    let out = Tensor::empty(&[rows, d], gate_up.dtype(), gate_up.device());
    let n_elements = (rows * d) as i32;
    unsafe {
        launch_gpt_oss_glu_bf16(
            gate_up.data_ptr() as *const c_void,
            out.data_ptr() as *mut c_void,
            n_elements,
            alpha,
            limit,
            std::ptr::null_mut(),
        );
    }
    out
 }
--- a/crates/xserv-kernels/src/attention.rs
+++ b/crates/xserv-kernels/src/attention.rs
@@ -33,6 +33,18 @@ unsafe extern "C" {
        head_dim: i32, max_blocks_per_seq: i32,
        scale: f32, stream: *mut c_void,
    );
    fn launch_paged_decode_attention_sinks_bf16(
        q: *const c_void,
        k_cache: *const c_void,
        v_cache: *const c_void,
        o: *mut c_void,
        block_tables: *const i32,
        context_lens: *const i32,
        sinks: *const c_void,
        batch: i32, num_q_heads: i32, num_kv_heads: i32,
        head_dim: i32, max_blocks_per_seq: i32,
        scale: f32, window_size: i32, stream: *mut c_void,
    );
    fn launch_reshape_and_cache_bf16(
        k_src: *const c_void, v_src: *const c_void,
        k_pool: *mut c_void, v_pool: *mut c_void,
@@ -337,3 +349,58 @@ pub fn paged_decode_attention(
    output
 }
 /// Paged decode attention with attention sinks and optional sliding window.
 ///
 /// sinks_ptr: pointer to [num_q_heads] BF16 on GPU (or null for no sinks)
 /// window_size: 0 = full attention, >0 = sliding window
 #[allow(clippy::too_many_arguments)]
 pub fn paged_decode_attention_sinks(
    q: &Tensor,
    k_cache_ptr: *const c_void,
    v_cache_ptr: *const c_void,
    block_tables_ptr: *const i32,
    context_lens_ptr: *const i32,
    sinks_ptr: *const c_void,
    batch: usize,
    num_q_heads: usize,
    num_kv_heads: usize,
    head_dim: usize,
    max_blocks_per_seq: usize,
    window_size: usize,
 ) -> Tensor {
    assert_eq!(q.ndim(), 4);
    assert_eq!(q.shape()[2], 1);
    assert_eq!(q.dtype(), DType::BF16);
    assert!(num_q_heads % num_kv_heads == 0);
    assert!(head_dim <= 128);
    let scale = 1.0 / (head_dim as f32).sqrt();
    let output = Tensor::empty(
        &[batch, num_q_heads, 1, head_dim],
        DType::BF16,
        q.device(),
    );
    unsafe {
        launch_paged_decode_attention_sinks_bf16(
            q.data_ptr() as *const c_void,
            k_cache_ptr,
            v_cache_ptr,
            output.data_ptr() as *mut c_void,
            block_tables_ptr,
            context_lens_ptr,
            sinks_ptr,
            batch as i32,
            num_q_heads as i32,
            num_kv_heads as i32,
            head_dim as i32,
            max_blocks_per_seq as i32,
            scale,
            window_size as i32,
            std::ptr::null_mut(),
        );
    }
    output
 }
--- a/crates/xserv-kernels/src/lib.rs
+++ b/crates/xserv-kernels/src/lib.rs
@@ -10,10 +10,10 @@ pub mod rope;
 pub mod softmax;
 pub mod transpose;
-pub use activation::{add, gelu, mul, scale, silu, silu_mul};
+pub use activation::{add, gelu, gpt_oss_glu, mul, scale, silu, silu_mul};
 pub use argmax::{argmax_bf16_single, argmax_bf16_to_host};
 pub use transpose::{merge_heads_gpu, repeat_kv_gpu, reshape_heads_gpu, strided_to_contiguous_gpu, transpose_for_rope_gpu, transpose_from_rope_gpu};
-pub use attention::{attention, decode_attention, flash_attention, paged_decode_attention, reshape_and_cache_bf16, reshape_and_cache_batched_bf16};
+pub use attention::{attention, decode_attention, flash_attention, paged_decode_attention, paged_decode_attention_sinks, reshape_and_cache_bf16, reshape_and_cache_batched_bf16};
 pub use embedding::embedding;
 pub use gemm::{batched_matmul, matmul, GemmBackend};
 pub use layernorm::layernorm;
--- a/crates/xserv-kernels/src/rope.rs
+++ b/crates/xserv-kernels/src/rope.rs
@@ -37,6 +37,81 @@ impl RopeCache {
        Self { cos, sin, max_seq_len, half_dim }
    }
    /// YaRN (Yet another RoPE extensioN) RoPE cache. Applies frequency-dependent
    /// interpolation so the model can extrapolate beyond its training context.
    pub fn new_yarn(
        max_seq_len: usize,
        head_dim: usize,
        theta: f64,
        factor: f64,
        original_max_pos: usize,
        beta_fast: f64,
        beta_slow: f64,
    ) -> Self {
        let half_dim = head_dim / 2;
        let dim = head_dim as f64;
        // find_correction_dim: inverse formula to find dimension from number of rotations
        let find_correction_dim = |num_rotations: f64| -> f64 {
            dim * (original_max_pos as f64 / (num_rotations * 2.0 * std::f64::consts::PI)).ln()
                / (2.0 * theta.ln())
        };
        let low_raw = find_correction_dim(beta_fast);
        let high_raw = find_correction_dim(beta_slow);
        // config has truncate=false, so use raw values (no floor/ceil)
        let low = low_raw.max(0.0);
        let high = high_raw.min((half_dim - 1) as f64);
        // Compute inv_freq with YaRN interpolation
        let mut inv_freq = vec![0.0f64; half_dim];
        for i in 0..half_dim {
            let pos_freq = theta.powf((2 * i) as f64 / dim);
            let inv_freq_extrapolation = 1.0 / pos_freq;  // original
            let inv_freq_interpolation = 1.0 / (factor * pos_freq);  // scaled
            // Linear ramp: 0 where we keep original, 1 where we interpolate
            let ramp = if (high - low).abs() < 0.001 {
                0.5
            } else {
                ((i as f64 - low) / (high - low)).clamp(0.0, 1.0)
            };
            let extrapolation_factor = 1.0 - ramp;
            inv_freq[i] = inv_freq_interpolation * (1.0 - extrapolation_factor)
                + inv_freq_extrapolation * extrapolation_factor;
        }
        // Attention scaling factor for YaRN: 0.1 * ln(factor) + 1.0
        let attn_factor = 0.1 * factor.ln() + 1.0;
        // Build cos/sin cache on CPU then upload
        let total = max_seq_len * half_dim;
        let mut cos_host = vec![0.0f32; total];
        let mut sin_host = vec![0.0f32; total];
        for pos in 0..max_seq_len {
            for i in 0..half_dim {
                let angle = pos as f64 * inv_freq[i];
                cos_host[pos * half_dim + i] = (angle.cos() * attn_factor) as f32;
                sin_host[pos * half_dim + i] = (angle.sin() * attn_factor) as f32;
            }
        }
        let nbytes = total * std::mem::size_of::<f32>();
        let mut cos = GpuBuffer::alloc(nbytes).expect("alloc yarn cos_cache");
        let mut sin = GpuBuffer::alloc(nbytes).expect("alloc yarn sin_cache");
        let cos_bytes = unsafe {
            std::slice::from_raw_parts(cos_host.as_ptr() as *const u8, nbytes)
        };
        let sin_bytes = unsafe {
            std::slice::from_raw_parts(sin_host.as_ptr() as *const u8, nbytes)
        };
        cos.copy_from_host(cos_bytes).unwrap();
        sin.copy_from_host(sin_bytes).unwrap();
        Self { cos, sin, max_seq_len, half_dim }
    }
 }
 /// Apply RoPE in-place to x.
--- a/crates/xserv-model/src/bin/bench-gpt-oss.rs
+++ b/crates/xserv-model/src/bin/bench-gpt-oss.rs
@@ -0,0 +1,231 @@
 use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::Instant;
 use xserv_distributed::{TpContext, UniqueId, get_unique_id};
 use xserv_model::{loader, GptOss, ModelConfig, PagedKVCache, BLOCK_SIZE};
 use xserv_tensor::{DType, Device};
 use xserv_tokenizer::Tokenizer;
 fn main() {
    let args: Vec<String> = std::env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: bench-gpt-oss <model-dir> [--max-tokens N] [--tp N]");
        std::process::exit(1);
    }
    let model_dir = PathBuf::from(&args[1]);
    let max_tokens: usize = get_arg(&args, "--max-tokens").unwrap_or(32);
    let world: usize = get_arg(&args, "--tp").unwrap_or(2);
    let config = ModelConfig::from_file(&model_dir.join("config.json"));
    let tokenizer = Tokenizer::from_file(&model_dir.join("tokenizer.json"));
    eprintln!(
        "gpt-oss-20b: layers={}, hidden={}, heads={}/{} kv, experts={}, top_k={}, vocab={}",
        config.num_layers(), config.hidden(), config.num_heads(),
        config.num_kv_heads(), config.num_experts(), config.experts_per_token(),
        config.vocab_size
    );
    eprintln!("TP world={world}, max_tokens={max_tokens}");
    let max_seq_len: usize = 2048;
    let max_blocks_per_seq = (max_seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // TP setup
    let uid = get_unique_id();
    let local_kv = config.num_kv_heads() / world;
    // Spawn worker threads for ranks 1..world
    let mut worker_handles = Vec::new();
    let mut worker_txs = Vec::new();
    for rank in 1..world {
        let (tx, rx) = std::sync::mpsc::channel::<WorkerCmd>();
        let (ack_tx, ack_rx) = std::sync::mpsc::channel::<()>();
        let cfg = config.clone();
        let md = model_dir.clone();
        let uid_copy = uid;
        worker_handles.push((
            std::thread::spawn(move || {
                worker_loop(rank, world, uid_copy, md, cfg, max_seq_len, rx, ack_tx);
            }),
            ack_rx,
        ));
        worker_txs.push(tx);
    }
    // Rank 0 setup
    xserv_cuda::device::set_device(0).unwrap();
    let tp0 = Arc::new(TpContext::init(0, world, uid, 0));
    eprintln!("[rank 0] Loading weights...");
    let weights = loader::load_model_dir(&model_dir, Device::Cpu);
    eprintln!("[rank 0] Loaded {} tensors, building model...", weights.len());
    let model = GptOss::from_weights_tp(config.clone(), weights, 0, world, 0, Some(tp0));
    let total_blocks = max_blocks_per_seq + 64;
    let mut cache = PagedKVCache::new_tp(
        &config, local_kv, total_blocks, 0, 4, max_blocks_per_seq, DType::BF16, 0,
    );
    eprintln!("[rank 0] Ready.");
    // Prompt
    let prompt = "What is the meaning of life?";
    let token_ids = tokenizer.encode(prompt);
    eprintln!("Prompt ({} tokens): {prompt}", token_ids.len());
    // Register sequence
    let slot = 0;
    cache.register_sequence(slot).unwrap();
    broadcast_cmd(&worker_txs, &worker_handles, WorkerCmd::Register(slot));
    // Prefill
    let t0 = Instant::now();
    broadcast_cmd(&worker_txs, &worker_handles, WorkerCmd::Prefill {
        tokens: token_ids.clone(), slot,
    });
    let logits = model.forward_prefill_paged(&token_ids, slot, &mut cache);
    wait_workers(&worker_handles);
    let ttft = t0.elapsed();
    let mut next = sample_greedy_last(&logits);
    let mut output_tokens = vec![next];
    eprintln!("TTFT: {:.1}ms", ttft.as_secs_f64() * 1000.0);
    print!("{prompt}");
    // Decode
    let decode_start = Instant::now();
    for _ in 1..max_tokens {
        let text = tokenizer.decode(&[next]);
        print!("{text}");
        if tokenizer.eos_token_id() == Some(next) { break; }
        let pos = cache.seq_len(slot);
        broadcast_cmd(&worker_txs, &worker_handles, WorkerCmd::Decode {
            tokens: vec![next], positions: vec![pos], slots: vec![slot],
        });
        let logits = model.forward_decode_paged(&[next], &[pos], &[slot], &mut cache);
        wait_workers(&worker_handles);
        next = sample_greedy_last(&logits);
        output_tokens.push(next);
    }
    let decode_elapsed = decode_start.elapsed();
    println!();
    let gen_tokens = output_tokens.len();
    let full_text = tokenizer.decode(&output_tokens);
    eprintln!("\nGenerated text: {full_text}");
    eprintln!("Token IDs: {:?}", &output_tokens[..output_tokens.len().min(20)]);
    let tpot = if gen_tokens > 1 {
        decode_elapsed.as_secs_f64() * 1000.0 / (gen_tokens - 1) as f64
    } else { 0.0 };
    let tok_s = if gen_tokens > 1 {
        (gen_tokens - 1) as f64 / decode_elapsed.as_secs_f64()
    } else { 0.0 };
    eprintln!("\n--- Performance ---");
    eprintln!("Generated: {} tokens", gen_tokens);
    eprintln!("TTFT: {:.1}ms", ttft.as_secs_f64() * 1000.0);
    eprintln!("TPOT: {:.1}ms", tpot);
    eprintln!("Throughput: {:.1} tok/s", tok_s);
    // Cleanup
    broadcast_cmd(&worker_txs, &worker_handles, WorkerCmd::Shutdown);
    for (h, _) in worker_handles {
        h.join().unwrap();
    }
 }
 // --- Worker infrastructure ---
 #[derive(Clone)]
 enum WorkerCmd {
    Register(usize),
    Prefill { tokens: Vec<u32>, slot: usize },
    Decode { tokens: Vec<u32>, positions: Vec<usize>, slots: Vec<usize> },
    Shutdown,
 }
 fn worker_loop(
    rank: usize,
    world: usize,
    uid: UniqueId,
    model_dir: PathBuf,
    config: ModelConfig,
    max_seq_len: usize,
    rx: std::sync::mpsc::Receiver<WorkerCmd>,
    ack_tx: std::sync::mpsc::Sender<()>,
 ) {
    xserv_cuda::device::set_device(rank as u32).unwrap();
    let tp = Arc::new(TpContext::init(rank, world, uid, rank as u32));
    eprintln!("[rank {rank}] Loading weights...");
    let weights = loader::load_model_dir(&model_dir, Device::Cpu);
    let model = GptOss::from_weights_tp(config.clone(), weights, rank, world, rank as u32, Some(tp));
    let local_kv = config.num_kv_heads() / world;
    let max_blocks_per_seq = (max_seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let total_blocks = max_blocks_per_seq + 64;
    let mut cache = PagedKVCache::new_tp(
        &config, local_kv, total_blocks, 0, 4, max_blocks_per_seq, DType::BF16, rank as u32,
    );
    eprintln!("[rank {rank}] Ready.");
    ack_tx.send(()).unwrap();
    while let Ok(cmd) = rx.recv() {
        match cmd {
            WorkerCmd::Register(slot) => {
                let _ = cache.register_sequence(slot);
            }
            WorkerCmd::Prefill { tokens, slot } => {
                let _ = model.forward_prefill_paged(&tokens, slot, &mut cache);
            }
            WorkerCmd::Decode { tokens, positions, slots } => {
                let _ = model.forward_decode_paged(&tokens, &positions, &slots, &mut cache);
            }
            WorkerCmd::Shutdown => break,
        }
        ack_tx.send(()).unwrap();
    }
 }
 fn broadcast_cmd(
    txs: &[std::sync::mpsc::Sender<WorkerCmd>],
    _handles: &[(std::thread::JoinHandle<()>, std::sync::mpsc::Receiver<()>)],
    cmd: WorkerCmd,
 ) {
    for tx in txs {
        tx.send(cmd.clone()).unwrap();
    }
 }
 fn wait_workers(handles: &[(std::thread::JoinHandle<()>, std::sync::mpsc::Receiver<()>)]) {
    for (_, rx) in handles {
        rx.recv().unwrap();
    }
 }
 fn sample_greedy_last(logits: &xserv_tensor::Tensor) -> u32 {
    use half::bf16;
    assert_eq!(logits.ndim(), 2);
    let logits_cpu = logits.to_device(Device::Cpu);
    let vocab_size = logits.shape()[1];
    let seq_len = logits.shape()[0];
    let data = logits_cpu.as_slice::<bf16>();
    let last = &data[(seq_len - 1) * vocab_size..seq_len * vocab_size];
    last.iter().enumerate()
        .max_by(|a, b| {
            let af = a.1.to_f32();
            let bf = b.1.to_f32();
            af.partial_cmp(&bf).unwrap_or(std::cmp::Ordering::Equal)
        })
        .map(|(i, _)| i as u32).unwrap()
 }
 fn get_arg<T: std::str::FromStr>(args: &[String], flag: &str) -> Option<T> {
    args.iter()
        .position(|a| a == flag)
        .and_then(|i| args.get(i + 1))
        .and_then(|s| s.parse().ok())
 }
--- a/crates/xserv-model/src/bin/xserv-cli.rs
+++ b/crates/xserv-model/src/bin/xserv-cli.rs
@@ -1,6 +1,6 @@
 use std::io::{self, Write};
 use std::path::PathBuf;
-use xserv_model::{loader, KVCache, ModelConfig};
+use xserv_model::{loader, KVCache, ModelConfig, PagedKVCache, BLOCK_SIZE};
 use xserv_tensor::{DType, Device};
 use xserv_tokenizer::Tokenizer;
@@ -36,14 +36,18 @@ fn main() {
    eprintln!("Loaded {} tensors", weights.len());
    let is_qwen3 = model_type.contains("qwen");
-    let dtype = if is_qwen3 { DType::BF16 } else { DType::F32 };
+    let is_gpt_oss = model_type.contains("gpt_oss");
    let dtype = if is_qwen3 || is_gpt_oss { DType::BF16 } else { DType::F32 };
    // Build model
    enum Model {
        GPT2(xserv_model::GPT2),
        Qwen3(xserv_model::Qwen3),
        GptOss(xserv_model::GptOss),
    }
-    let model = if is_qwen3 {
+    let model = if is_gpt_oss {
        Model::GptOss(xserv_model::GptOss::from_weights(config.clone(), weights))
    } else if is_qwen3 {
        Model::Qwen3(xserv_model::Qwen3::from_weights(config.clone(), weights))
    } else {
        Model::GPT2(xserv_model::GPT2::from_weights(config.clone(), weights))
@@ -62,40 +66,92 @@ fn main() {
        if input == "quit" || input == "exit" { break; }
        let token_ids = tokenizer.encode(input);
        let kv_heads = if is_qwen3 { config.num_kv_heads() } else { config.num_heads() };
        let mut cache = KVCache::new(
            config.num_layers(), kv_heads, config.head_dim(), dtype, Device::Cuda(0),
        );
-        // Prefill + decode
+        if is_gpt_oss {
-        let logits = match &model {
+            // GptOss uses paged KV cache
-            Model::GPT2(m) => m.forward_with_cache(&token_ids, &mut cache),
+            let max_seq = 2048;
-            Model::Qwen3(m) => m.forward_with_cache(&token_ids, &mut cache),
+            let max_blocks_per_seq = (max_seq + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        };
+            let total_blocks = max_blocks_per_seq + 64;
-        let mut next = match &model {
+            let mut paged_cache = PagedKVCache::new(
-            Model::GPT2(_) => xserv_model::gpt2::sample_greedy(&logits),
+                &config, total_blocks, 0, 4, max_blocks_per_seq, DType::BF16, 0,
-            Model::Qwen3(_) => xserv_model::qwen3::sample_greedy(&logits),
+            );
-        };
+            let slot = 0;
            paged_cache.register_sequence(slot).expect("register slot");
-        print!("{input}");
+            let model = match &model { Model::GptOss(m) => m, _ => unreachable!() };
-        io::stdout().flush().unwrap();
+            let logits = model.forward_prefill_paged(&token_ids, slot, &mut paged_cache);
            let mut next = sample_greedy_last(&logits);
-        for _ in 0..max_tokens {
+            print!("{input}");
            let text = tokenizer.decode(&[next]);
            print!("{text}");
            io::stdout().flush().unwrap();
-            if tokenizer.eos_token_id() == Some(next) { break; }
+            for _ in 0..max_tokens {
                let text = tokenizer.decode(&[next]);
                print!("{text}");
                io::stdout().flush().unwrap();
                if tokenizer.eos_token_id() == Some(next) { break; }
                let pos = paged_cache.seq_len(slot);
                let logits = model.forward_decode_paged(
                    &[next], &[pos], &[slot], &mut paged_cache,
                );
                next = sample_greedy_last(&logits);
            }
            println!();
            paged_cache.free_sequence(slot);
        } else {
            let kv_heads = if is_qwen3 { config.num_kv_heads() } else { config.num_heads() };
            let mut cache = KVCache::new(
                config.num_layers(), kv_heads, config.head_dim(), dtype, Device::Cuda(0),
            );
            let logits = match &model {
-                Model::GPT2(m) => m.forward_with_cache(&[next], &mut cache),
+                Model::GPT2(m) => m.forward_with_cache(&token_ids, &mut cache),
-                Model::Qwen3(m) => m.forward_with_cache(&[next], &mut cache),
+                Model::Qwen3(m) => m.forward_with_cache(&token_ids, &mut cache),
                Model::GptOss(_) => unreachable!(),
            };
-            next = match &model {
+            let mut next = match &model {
                Model::GPT2(_) => xserv_model::gpt2::sample_greedy(&logits),
                Model::Qwen3(_) => xserv_model::qwen3::sample_greedy(&logits),
                Model::GptOss(_) => unreachable!(),
            };
            print!("{input}");
            io::stdout().flush().unwrap();
            for _ in 0..max_tokens {
                let text = tokenizer.decode(&[next]);
                print!("{text}");
                io::stdout().flush().unwrap();
                if tokenizer.eos_token_id() == Some(next) { break; }
                let logits = match &model {
                    Model::GPT2(m) => m.forward_with_cache(&[next], &mut cache),
                    Model::Qwen3(m) => m.forward_with_cache(&[next], &mut cache),
                    Model::GptOss(_) => unreachable!(),
                };
                next = match &model {
                    Model::GPT2(_) => xserv_model::gpt2::sample_greedy(&logits),
                    Model::Qwen3(_) => xserv_model::qwen3::sample_greedy(&logits),
                    Model::GptOss(_) => unreachable!(),
                };
            }
            println!();
        }
        println!();
    }
 }
 fn sample_greedy_last(logits: &xserv_tensor::Tensor) -> u32 {
    use half::bf16;
    assert_eq!(logits.ndim(), 2);
    let logits_cpu = logits.to_device(Device::Cpu);
    let vocab_size = logits.shape()[1];
    let seq_len = logits.shape()[0];
    let data = logits_cpu.as_slice::<bf16>();
    let last = &data[(seq_len - 1) * vocab_size..seq_len * vocab_size];
    last.iter().enumerate()
        .max_by(|a, b| a.1.to_f32().partial_cmp(&b.1.to_f32()).unwrap())
        .map(|(i, _)| i as u32).unwrap()
 }
--- a/crates/xserv-model/src/config.rs
+++ b/crates/xserv-model/src/config.rs
@@ -1,6 +1,15 @@
 use serde::Deserialize;
 use std::path::Path;
 #[derive(Debug, Clone, Deserialize)]
 pub struct RopeScaling {
    pub rope_type: Option<String>,
    pub factor: Option<f64>,
    pub original_max_position_embeddings: Option<usize>,
    pub beta_fast: Option<f64>,
    pub beta_slow: Option<f64>,
 }
 #[derive(Debug, Clone, Deserialize)]
 pub struct ModelConfig {
    pub architectures: Option<Vec<String>>,
@@ -46,6 +55,24 @@ pub struct ModelConfig {
    pub rope_theta: Option<f64>,
    #[serde(default)]
    pub tie_word_embeddings: Option<bool>,
    // MoE (gpt-oss)
    #[serde(default)]
    pub num_local_experts: Option<usize>,
    #[serde(default)]
    pub num_experts_per_tok: Option<usize>,
    #[serde(default)]
    pub layer_types: Option<Vec<String>>,
    #[serde(default)]
    pub sliding_window: Option<usize>,
    #[serde(default)]
    pub attention_bias: Option<bool>,
    #[serde(default, rename = "head_dim")]
    pub explicit_head_dim: Option<usize>,
    #[serde(default)]
    pub rope_scaling: Option<RopeScaling>,
    #[serde(default)]
    pub swiglu_limit: Option<f64>,
 }
 impl ModelConfig {
@@ -81,7 +108,7 @@ impl ModelConfig {
    }
    pub fn head_dim(&self) -> usize {
-        self.hidden() / self.num_heads()
+        self.explicit_head_dim.unwrap_or_else(|| self.hidden() / self.num_heads())
    }
    pub fn ln_eps(&self) -> f32 {
@@ -93,4 +120,28 @@ impl ModelConfig {
    pub fn tied_embeddings(&self) -> bool {
        self.tie_word_embeddings.unwrap_or(true)
    }
    pub fn num_experts(&self) -> usize {
        self.num_local_experts.unwrap_or(0)
    }
    pub fn experts_per_token(&self) -> usize {
        self.num_experts_per_tok.unwrap_or(1)
    }
    pub fn is_moe(&self) -> bool {
        self.num_local_experts.unwrap_or(0) > 1
    }
    pub fn is_sliding_layer(&self, layer_idx: usize) -> bool {
        self.layer_types
            .as_ref()
            .and_then(|lt| lt.get(layer_idx))
            .map(|t| t == "sliding_attention")
            .unwrap_or(false)
    }
    pub fn window_size(&self) -> usize {
        self.sliding_window.unwrap_or(0)
    }
 }
--- a/crates/xserv-model/src/gpt_oss.rs
+++ b/crates/xserv-model/src/gpt_oss.rs
@@ -0,0 +1,594 @@
 use std::collections::HashMap;
 use std::ffi::c_void;
 use half::bf16;
 use xserv_kernels::*;
 use xserv_tensor::{Device, Tensor};
 use crate::config::ModelConfig;
 use crate::paged_kv_cache::PagedKVCache;
 pub struct GptOss {
    pub config: ModelConfig,
    embed_tokens: Tensor,
    layers: Vec<GptOssBlock>,
    norm: Tensor,
    lm_head_t: Tensor,
    rope_cache: RopeCache,
    tp: Option<std::sync::Arc<xserv_distributed::TpContext>>,
    local_num_heads: usize,
    local_num_kv_heads: usize,
 }
 struct GptOssBlock {
    input_norm: Tensor,
    // Attention (with bias)
    q_proj_wt: Tensor,
    q_proj_bias: Tensor,
    k_proj_wt: Tensor,
    k_proj_bias: Tensor,
    v_proj_wt: Tensor,
    v_proj_bias: Tensor,
    o_proj_wt: Tensor,
    o_proj_bias: Tensor,
    sinks: Tensor,
    #[allow(dead_code)]
    is_sliding: bool,
    window_size: usize,
    // MoE MLP
    post_norm: Tensor,
    router_wt: Tensor,
    router_bias: Tensor,
    expert_gate_up_wt: Vec<Tensor>,
    expert_gate_up_bias: Vec<Tensor>,
    expert_down_wt: Vec<Tensor>,
    expert_down_bias: Vec<Tensor>,
    // Activation params
    glu_alpha: f32,
    glu_limit: f32,
 }
 impl GptOss {
    pub fn from_weights(config: ModelConfig, w: HashMap<String, Tensor>) -> Self {
        Self::from_weights_tp(config, w, 0, 1, 0, None)
    }
    pub fn from_weights_tp(
        config: ModelConfig,
        mut w: HashMap<String, Tensor>,
        rank: usize,
        world: usize,
        device: u32,
        tp: Option<std::sync::Arc<xserv_distributed::TpContext>>,
    ) -> Self {
        crate::init_kernels();
        let dev = Device::Cuda(device);
        let take = |w: &mut HashMap<String, Tensor>, name: &str| -> Tensor {
            w.remove(name).unwrap_or_else(|| panic!("missing weight: {name}"))
        };
        let repl = |t: Tensor| -> Tensor { t.to_device(dev) };
        // column-parallel: shard rows of [out, in], transpose → [in, out/world]
        let col = |t: Tensor| -> Tensor {
            shard_rows(&t, rank, world).to_device(dev).transpose(0, 1).contiguous()
        };
        // row-parallel: shard cols of [out, in], transpose → [in/world, out]
        let row = |t: Tensor| -> Tensor {
            shard_cols(&t, rank, world).to_device(dev).transpose(0, 1).contiguous()
        };
        // Bias sharding helpers
        let col_bias = |t: Tensor| -> Tensor { shard_1d(&t, rank, world).to_device(dev) };
        let repl_bias = |t: Tensor| -> Tensor { t.to_device(dev) };
        let embed_tokens = repl(take(&mut w, "model.embed_tokens.weight"));
        let norm = repl(take(&mut w, "model.norm.weight"));
        let lm_head_t = repl(take(&mut w, "lm_head.weight")).transpose(0, 1).contiguous();
        let head_dim = config.head_dim();
        let rope_theta = config.rope_theta.unwrap_or(150000.0);
        let max_seq_len = config.max_seq_len().min(8192); // cap for memory
        let rope_cache = if let Some(ref rs) = config.rope_scaling {
            if rs.rope_type.as_deref() == Some("yarn") {
                RopeCache::new_yarn(
                    max_seq_len,
                    head_dim,
                    rope_theta,
                    rs.factor.unwrap_or(1.0),
                    rs.original_max_position_embeddings.unwrap_or(4096),
                    rs.beta_fast.unwrap_or(32.0),
                    rs.beta_slow.unwrap_or(1.0),
                )
            } else {
                RopeCache::new(max_seq_len, head_dim, rope_theta as f32)
            }
        } else {
            RopeCache::new(max_seq_len, head_dim, rope_theta as f32)
        };
        let num_layers = config.num_layers();
        let num_experts = config.num_experts();
        let glu_alpha = 1.702f32;
        let glu_limit = config.swiglu_limit.unwrap_or(7.0) as f32;
        let mut layers = Vec::with_capacity(num_layers);
        if rank == 0 {
            eprintln!(
                "Loading gpt-oss weights: {} layers, {} experts, world={world}...",
                num_layers, num_experts
            );
        }
        for i in 0..num_layers {
            let p = format!("model.layers.{i}");
            // Attention weights — column-parallel for Q/K/V, row-parallel for O
            let q_proj_wt = col(take(&mut w, &format!("{p}.self_attn.q_proj.weight")));
            let q_proj_bias = col_bias(take(&mut w, &format!("{p}.self_attn.q_proj.bias")));
            let k_proj_wt = col(take(&mut w, &format!("{p}.self_attn.k_proj.weight")));
            let k_proj_bias = col_bias(take(&mut w, &format!("{p}.self_attn.k_proj.bias")));
            let v_proj_wt = col(take(&mut w, &format!("{p}.self_attn.v_proj.weight")));
            let v_proj_bias = col_bias(take(&mut w, &format!("{p}.self_attn.v_proj.bias")));
            let o_proj_wt = row(take(&mut w, &format!("{p}.self_attn.o_proj.weight")));
            let o_proj_bias = repl_bias(take(&mut w, &format!("{p}.self_attn.o_proj.bias")));
            // Sinks: shard per-head across TP ranks
            let sinks_full = take(&mut w, &format!("{p}.self_attn.sinks"));
            let sinks = shard_1d(&sinks_full, rank, world).to_device(dev);
            let is_sliding = config.is_sliding_layer(i);
            let window_size = if is_sliding { config.window_size() } else { 0 };
            // MoE weights — router replicated, experts split across TP ranks
            let router_wt_raw = take(&mut w, &format!("{p}.mlp.router.weight"));
            let router_wt = router_wt_raw.to_device(dev).transpose(0, 1).contiguous();
            let router_bias = repl_bias(take(&mut w, &format!("{p}.mlp.router.bias")));
            // Expert weights: [num_experts, hidden, 2*inter] — stored as 3D tensors
            // Expert parallelism: rank owns experts [rank*E/world .. (rank+1)*E/world)
            let gate_up_3d = take(&mut w, &format!("{p}.mlp.experts.gate_up_proj"));
            let gate_up_bias_2d = take(&mut w, &format!("{p}.mlp.experts.gate_up_proj_bias"));
            let down_3d = take(&mut w, &format!("{p}.mlp.experts.down_proj"));
            let down_bias_2d = take(&mut w, &format!("{p}.mlp.experts.down_proj_bias"));
            let local_experts = num_experts / world;
            let expert_start = rank * local_experts;
            let mut expert_gate_up_wt = Vec::with_capacity(local_experts);
            let mut expert_gate_up_bias = Vec::with_capacity(local_experts);
            let mut expert_down_wt = Vec::with_capacity(local_experts);
            let mut expert_down_bias = Vec::with_capacity(local_experts);
            let inter2 = gate_up_3d.shape()[2]; // 2 * intermediate_size
            let hidden = gate_up_3d.shape()[1];
            let inter = down_3d.shape()[1]; // intermediate_size
            for local_e in 0..local_experts {
                let e = expert_start + local_e;
                let gu_slice = slice_expert_3d(&gate_up_3d, e, hidden, inter2);
                expert_gate_up_wt.push(gu_slice.to_device(dev));
                let gu_bias = slice_expert_2d(&gate_up_bias_2d, e, inter2);
                expert_gate_up_bias.push(gu_bias.to_device(dev));
                let d_slice = slice_expert_3d(&down_3d, e, inter, hidden);
                expert_down_wt.push(d_slice.to_device(dev));
                let d_bias = slice_expert_2d(&down_bias_2d, e, hidden);
                expert_down_bias.push(d_bias.to_device(dev));
            }
            xserv_cuda::allocator::cached_trim();
            layers.push(GptOssBlock {
                input_norm: repl(take(&mut w, &format!("{p}.input_layernorm.weight"))),
                q_proj_wt,
                q_proj_bias,
                k_proj_wt,
                k_proj_bias,
                v_proj_wt,
                v_proj_bias,
                o_proj_wt,
                o_proj_bias,
                sinks,
                is_sliding,
                window_size,
                post_norm: repl(take(&mut w, &format!("{p}.post_attention_layernorm.weight"))),
                router_wt,
                router_bias,
                expert_gate_up_wt,
                expert_gate_up_bias,
                expert_down_wt,
                expert_down_bias,
                glu_alpha,
                glu_limit,
            });
        }
        let local_num_heads = config.num_heads() / world;
        let local_num_kv_heads = config.num_kv_heads() / world;
        Self {
            config,
            embed_tokens,
            layers,
            norm,
            lm_head_t,
            rope_cache,
            tp,
            local_num_heads,
            local_num_kv_heads,
        }
    }
    #[inline]
    fn all_reduce(&self, t: &Tensor) {
        if let Some(tp) = &self.tp {
            if tp.world > 1 {
                let ptr = t.storage().gpu_buffer().as_ptr() as *mut c_void;
                tp.all_reduce_sum_bf16_ptr(ptr, t.numel());
            }
        }
    }
    /// Paged decode: process one token per sequence using paged KV cache.
    pub fn forward_decode_paged(
        &self,
        tokens: &[u32],
        positions: &[usize],
        seq_slots: &[usize],
        paged_cache: &mut PagedKVCache,
    ) -> Tensor {
        let batch = tokens.len();
        assert_eq!(positions.len(), batch);
        assert_eq!(seq_slots.len(), batch);
        assert!(batch > 0);
        let num_heads = self.local_num_heads;
        let num_kv_heads = self.local_num_kv_heads;
        let head_dim = self.config.head_dim();
        let eps = self.config.rms_norm_eps.unwrap_or(1e-5) as f32;
        let kv_lens: Vec<i32> = positions.iter().map(|&p| (p + 1) as i32).collect();
        for (b, &slot) in seq_slots.iter().enumerate() {
            paged_cache.ensure_capacity(slot, positions[b] + 1);
        }
        paged_cache.sync_active_batch_with_lens(seq_slots, &kv_lens);
        let bt_ptr = paged_cache.block_table_gpu().as_ptr() as *const i32;
        let cl_ptr = paged_cache.context_lens_gpu().as_ptr() as *const i32;
        let max_blocks = paged_cache.max_blocks_per_seq();
        let positions_u32: Vec<u32> = positions.iter().map(|&p| p as u32).collect();
        let mut x = embedding(&self.embed_tokens, tokens);
        for (layer_idx, layer) in self.layers.iter().enumerate() {
            let residual = x.clone();
            let normed = rmsnorm(&x, &layer.input_norm, eps);
            // Q/K/V projections with bias
            let q_all = add_bias(&matmul_2d(&normed, &layer.q_proj_wt), &layer.q_proj_bias);
            let k_all = add_bias(&matmul_2d(&normed, &layer.k_proj_wt), &layer.k_proj_bias);
            let v_all = add_bias(&matmul_2d(&normed, &layer.v_proj_wt), &layer.v_proj_bias);
            // Reshape for RoPE: [B, H*D] → [B, H, D]
            let q_3d = q_all.reshape(&[batch, num_heads, head_dim]);
            let k_3d = k_all.reshape(&[batch, num_kv_heads, head_dim]);
            // RoPE (no QK-norm for gpt-oss)
            rope_inplace(&q_3d, &self.rope_cache, &positions_u32);
            rope_inplace(&k_3d, &self.rope_cache, &positions_u32);
            let v_3d = v_all.reshape(&[batch, num_kv_heads, head_dim]);
            // KV cache scatter
            paged_cache.append_tokens_batched(layer_idx, &k_3d, &v_3d, batch);
            // Paged attention with sinks + sliding window
            let q_4d = q_3d.reshape(&[batch, num_heads, 1, head_dim]);
            let k_pool_ptr = paged_cache.k_pool(layer_idx).as_ptr() as *const c_void;
            let v_pool_ptr = paged_cache.v_pool(layer_idx).as_ptr() as *const c_void;
            let sinks_ptr = layer.sinks.data_ptr() as *const c_void;
            let attn_out = paged_decode_attention_sinks(
                &q_4d, k_pool_ptr, v_pool_ptr, bt_ptr, cl_ptr,
                sinks_ptr,
                batch, num_heads, num_kv_heads, head_dim, max_blocks,
                layer.window_size,
            );
            let attn_merged = attn_out.reshape(&[batch, num_heads * head_dim]);
            let attn_proj = matmul_2d(&attn_merged, &layer.o_proj_wt);
            self.all_reduce(&attn_proj);
            let attn_proj = add_bias(&attn_proj, &layer.o_proj_bias);
            // Residual + post-norm
            let (normed, x_new) = add_rmsnorm(&attn_proj, &residual, &layer.post_norm, eps);
            let residual = x_new;
            let normed = normed.contiguous();
            // MoE MLP
            let moe_out = self.moe_forward(&normed, layer, batch);
            x = xserv_kernels::add(&residual, &moe_out);
        }
        // Advance KV cache
        for &slot in seq_slots {
            paged_cache.advance_seq_len(slot, 1);
        }
        unsafe { xserv_cuda::ffi::cudaDeviceSynchronize(); }
        let x = rmsnorm(&x, &self.norm, eps);
        unsafe { xserv_cuda::ffi::cudaDeviceSynchronize(); }
        let logits = matmul_2d(&x, &self.lm_head_t);
        unsafe { xserv_cuda::ffi::cudaDeviceSynchronize(); }
        logits
    }
    /// Paged prefill: process full prompt tokens.
    pub fn forward_prefill_paged(
        &self,
        token_ids: &[u32],
        slot: usize,
        paged_cache: &mut PagedKVCache,
    ) -> Tensor {
        let new_tokens = token_ids.len();
        let pos_offset = paged_cache.seq_len(slot);
        let num_heads = self.local_num_heads;
        let num_kv_heads = self.local_num_kv_heads;
        let head_dim = self.config.head_dim();
        let eps = self.config.rms_norm_eps.unwrap_or(1e-5) as f32;
        paged_cache.ensure_capacity(slot, pos_offset + new_tokens);
        paged_cache.advance_seq_len(slot, new_tokens);
        let mut x = embedding(&self.embed_tokens, token_ids);
        let positions: Vec<u32> = (pos_offset..pos_offset + new_tokens).map(|p| p as u32).collect();
        for (layer_idx, layer) in self.layers.iter().enumerate() {
            let residual = x.clone();
            let normed = rmsnorm(&x, &layer.input_norm, eps);
            let q = add_bias(&matmul_2d(&normed, &layer.q_proj_wt), &layer.q_proj_bias);
            let k = add_bias(&matmul_2d(&normed, &layer.k_proj_wt), &layer.k_proj_bias);
            let v = add_bias(&matmul_2d(&normed, &layer.v_proj_wt), &layer.v_proj_bias);
            let q = reshape_heads_gpu(&q, new_tokens, num_heads, head_dim);
            let k = reshape_heads_gpu(&k, new_tokens, num_kv_heads, head_dim);
            let v = reshape_heads_gpu(&v, new_tokens, num_kv_heads, head_dim);
            // RoPE
            let q = transpose_for_rope_gpu(&q, new_tokens, num_heads, head_dim);
            let k = transpose_for_rope_gpu(&k, new_tokens, num_kv_heads, head_dim);
            rope_inplace(&q, &self.rope_cache, &positions);
            rope_inplace(&k, &self.rope_cache, &positions);
            let q = transpose_from_rope_gpu(&q, new_tokens, num_heads, head_dim);
            let k = transpose_from_rope_gpu(&k, new_tokens, num_kv_heads, head_dim);
            // KV cache
            paged_cache.append_tokens(slot, layer_idx, &k, &v, new_tokens, pos_offset);
            let (k_full, v_full) = paged_cache.gather_kv_contiguous(slot, layer_idx);
            // Flash attention for prefill (sinks handled post-hoc for simplicity)
            // TODO: integrate sinks into flash attention for exact match
            let attn_out = flash_attention(&q, &k_full, &v_full, true);
            let attn_merged = merge_heads_gpu(&attn_out, new_tokens, num_heads, head_dim);
            let attn_proj = matmul_2d(&attn_merged, &layer.o_proj_wt);
            self.all_reduce(&attn_proj);
            let attn_proj = add_bias(&attn_proj, &layer.o_proj_bias);
            let (normed, x_new) = add_rmsnorm(&attn_proj, &residual, &layer.post_norm, eps);
            let residual = x_new;
            // MoE MLP
            let moe_out = self.moe_forward(&normed, layer, new_tokens);
            x = xserv_kernels::add(&residual, &moe_out);
        }
        let x = rmsnorm(&x, &self.norm, eps);
        let logits = matmul_2d(&x, &self.lm_head_t);
        unsafe { xserv_cuda::ffi::cudaDeviceSynchronize(); }
        logits
    }
    /// MoE forward pass for one layer with expert parallelism.
    /// Each rank owns `num_experts / world` experts. Tokens routed to non-local
    /// experts get zero contribution from this rank; AllReduce sums all ranks.
    /// Input: [tokens, hidden], Output: [tokens, hidden]
    fn moe_forward(&self, x: &Tensor, layer: &GptOssBlock, num_tokens: usize) -> Tensor {
        let hidden = self.config.hidden();
        let num_experts = self.config.num_experts();
        let top_k = self.config.experts_per_token();
        let world = self.tp.as_ref().map(|tp| tp.world).unwrap_or(1);
        let rank = self.tp.as_ref().map(|tp| tp.rank).unwrap_or(0);
        let local_experts = num_experts / world;
        let expert_start = rank * local_experts;
        // Router: [tokens, hidden] @ [hidden, num_experts] + bias → [tokens, num_experts]
        unsafe { xserv_cuda::ffi::cudaDeviceSynchronize(); }
        // Pad to 2 rows to avoid GEMV path (workaround for GEMV NaN bug with small N)
        let x_padded = if num_tokens == 1 {
            let x_cpu_tmp = x.to_device(Device::Cpu);
            let xd = x_cpu_tmp.as_slice::<bf16>();
            let mut padded = xd.to_vec();
            padded.extend(vec![bf16::ZERO; hidden]);
            Tensor::from_slice(&padded, &[2, hidden]).to_device(x.device())
        } else {
            x.clone()
        };
        let router_logits_full = add_bias(
            &matmul_2d(&x_padded, &layer.router_wt),
            &layer.router_bias,
        );
        let router_logits = if num_tokens == 1 {
            router_logits_full.narrow(0, 0, 1).contiguous()
        } else {
            router_logits_full
        };
        unsafe { xserv_cuda::ffi::cudaDeviceSynchronize(); }
        let router_cpu = router_logits.to_device(Device::Cpu);
        let router_data = router_cpu.as_slice::<bf16>();
        // Copy x to CPU after all GPU ops are synced
        let x_cpu = x.to_device(Device::Cpu);
        let x_data = x_cpu.as_slice::<bf16>();
        let mut output_acc = vec![0.0f32; num_tokens * hidden];
        for t in 0..num_tokens {
            let row = &router_data[t * num_experts..(t + 1) * num_experts];
            // Find top-k expert indices (global)
            let mut indices: Vec<(usize, f32)> = row.iter()
                .enumerate()
                .map(|(i, &v)| (i, v.to_f32()))
                .collect();
            indices.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
            let top_indices: Vec<(usize, f32)> = indices[..top_k].to_vec();
            // Softmax over top-k logits
            let max_val = top_indices.iter().map(|x| x.1).fold(f32::NEG_INFINITY, f32::max);
            let exp_sum: f32 = top_indices.iter().map(|x| (x.1 - max_val).exp()).sum();
            let weights: Vec<f32> = top_indices.iter()
                .map(|x| (x.1 - max_val).exp() / exp_sum)
                .collect();
            // Fresh GPU upload of token data — immune to cached allocator buffer reuse
            let token_slice = &x_data[t * hidden..(t + 1) * hidden];
            let token_tensor = Tensor::from_slice(token_slice, &[1, hidden]).to_device(x.device());
            for (k_idx, &(expert_id, _)) in top_indices.iter().enumerate() {
                // Only process experts owned by this rank
                if expert_id < expert_start || expert_id >= expert_start + local_experts {
                    continue;
                }
                let local_id = expert_id - expert_start;
                let weight = weights[k_idx];
                let gate_up_raw = matmul_2d(&token_tensor, &layer.expert_gate_up_wt[local_id]);
                let gate_up = add_bias(&gate_up_raw, &layer.expert_gate_up_bias[local_id]);
                let activated = gpt_oss_glu(&gate_up, layer.glu_alpha, layer.glu_limit);
                let down_raw = matmul_2d(&activated, &layer.expert_down_wt[local_id]);
                let down = add_bias(&down_raw, &layer.expert_down_bias[local_id]);
                let down_cpu = down.to_device(Device::Cpu);
                let down_data = down_cpu.as_slice::<bf16>();
                for d in 0..hidden {
                    output_acc[t * hidden + d] += weight * down_data[d].to_f32();
                }
            }
        }
        // Convert accumulated output to BF16 tensor on GPU
        let output_bf16: Vec<bf16> = output_acc.iter().map(|&v| bf16::from_f32(v)).collect();
        let moe_out = Tensor::from_slice(&output_bf16, &[num_tokens, hidden]).to_device(x.device());
        self.all_reduce(&moe_out);
        moe_out
    }
 }
 // --- Helpers ---
 fn matmul_2d(a: &Tensor, b: &Tensor) -> Tensor {
    assert_eq!(a.ndim(), 2);
    assert_eq!(b.ndim(), 2);
    matmul(a, b, GemmBackend::CuBlas)
 }
 /// Add bias to a 2D tensor: [rows, cols] + [cols] → [rows, cols]
 fn add_bias(x: &Tensor, bias: &Tensor) -> Tensor {
    assert_eq!(x.ndim(), 2);
    assert_eq!(bias.ndim(), 1);
    let rows = x.shape()[0];
    let cols = x.shape()[1];
    assert_eq!(bias.shape()[0], cols, "bias size {} != cols {}", bias.shape()[0], cols);
    // Broadcast bias to each row using GPU kernels.
    // Tile bias [cols] into [rows, cols] by repeating rows, then add element-wise.
    let bias_cpu = bias.to_device(Device::Cpu);
    let bias_data = bias_cpu.as_slice::<bf16>();
    let mut tiled = Vec::with_capacity(rows * cols);
    for _ in 0..rows {
        tiled.extend_from_slice(bias_data);
    }
    let bias_tiled = Tensor::from_slice(&tiled, &[rows, cols]).to_device(x.device());
    let x_c = x.contiguous();
    xserv_kernels::add(&x_c, &bias_tiled)
 }
 fn shard_rows(t: &Tensor, rank: usize, world: usize) -> Tensor {
    if world == 1 { return t.clone(); }
    let shape = t.shape();
    assert_eq!(shape.len(), 2);
    let (rows, cols) = (shape[0], shape[1]);
    assert!(rows % world == 0, "rows {rows} not divisible by world {world}");
    let local = rows / world;
    let host = t.to_device(Device::Cpu);
    let data = host.as_slice::<bf16>();
    let start = rank * local * cols;
    let shard = data[start..start + local * cols].to_vec();
    Tensor::from_slice(&shard, &[local, cols])
 }
 fn shard_cols(t: &Tensor, rank: usize, world: usize) -> Tensor {
    if world == 1 { return t.clone(); }
    let shape = t.shape();
    assert_eq!(shape.len(), 2);
    let (rows, cols) = (shape[0], shape[1]);
    assert!(cols % world == 0, "cols {cols} not divisible by world {world}");
    let local = cols / world;
    let c0 = rank * local;
    let host = t.to_device(Device::Cpu);
    let data = host.as_slice::<bf16>();
    let mut shard = Vec::with_capacity(rows * local);
    for r in 0..rows {
        let base = r * cols + c0;
        shard.extend_from_slice(&data[base..base + local]);
    }
    Tensor::from_slice(&shard, &[rows, local])
 }
 fn shard_1d(t: &Tensor, rank: usize, world: usize) -> Tensor {
    if world == 1 { return t.clone(); }
    let shape = t.shape();
    assert_eq!(shape.len(), 1);
    let total = shape[0];
    assert!(total % world == 0, "dim {total} not divisible by world {world}");
    let local = total / world;
    let host = t.to_device(Device::Cpu);
    let data = host.as_slice::<bf16>();
    let start = rank * local;
    let shard = data[start..start + local].to_vec();
    Tensor::from_slice(&shard, &[local])
 }
 /// Extract expert `e` from a [num_experts, rows, cols] 3D tensor → [rows, cols] 2D
 fn slice_expert_3d(t: &Tensor, e: usize, rows: usize, cols: usize) -> Tensor {
    assert_eq!(t.ndim(), 3);
    let host = t.to_device(Device::Cpu);
    let data = host.as_slice::<bf16>();
    let stride = rows * cols;
    let start = e * stride;
    let slice = data[start..start + stride].to_vec();
    Tensor::from_slice(&slice, &[rows, cols])
 }
 /// Extract expert `e` from a [num_experts, dim] 2D tensor → [dim] 1D
 fn slice_expert_2d(t: &Tensor, e: usize, dim: usize) -> Tensor {
    assert_eq!(t.ndim(), 2);
    let host = t.to_device(Device::Cpu);
    let data = host.as_slice::<bf16>();
    let start = e * dim;
    let slice = data[start..start + dim].to_vec();
    Tensor::from_slice(&slice, &[dim])
 }
--- a/crates/xserv-model/src/lib.rs
+++ b/crates/xserv-model/src/lib.rs
@@ -1,6 +1,7 @@
 pub mod config;
 pub mod decode_graph;
 pub mod gpt2;
 pub mod gpt_oss;
 pub mod kv_cache;
 pub mod loader;
 pub mod paged_kv_cache;
@@ -10,6 +11,7 @@ pub mod sampling;
 pub use config::ModelConfig;
 pub use decode_graph::{DecodeGraphState, LayerWeightPtrs};
 pub use gpt2::{GPT2, KVCache};
 pub use gpt_oss::GptOss;
 pub use kv_cache::GpuKVCache;
 pub use paged_kv_cache::{BlockAllocator, Location, PagedKVCache, BLOCK_SIZE};
 pub use qwen3::Qwen3;
--- a/crates/xserv-model/src/qwen3.rs
+++ b/crates/xserv-model/src/qwen3.rs
@@ -198,17 +198,27 @@ impl Qwen3 {
        );
        for i in lo..hi {
            let p = format!("model.layers.{i}");
            let q_proj_wt = wt(take(&mut w, &format!("{p}.self_attn.q_proj.weight")));
            let k_proj_wt = wt(take(&mut w, &format!("{p}.self_attn.k_proj.weight")));
            let v_proj_wt = wt(take(&mut w, &format!("{p}.self_attn.v_proj.weight")));
            let q_dim = q_proj_wt.shape()[1];
            let kv_dim = k_proj_wt.shape()[1];
            let qkv_proj_wt = cat_cols(&[&q_proj_wt, &k_proj_wt, &v_proj_wt]);
            drop((q_proj_wt, k_proj_wt, v_proj_wt));
            let gate_proj_wt = wt(take(&mut w, &format!("{p}.mlp.gate_proj.weight")));
            let up_proj_wt = wt(take(&mut w, &format!("{p}.mlp.up_proj.weight")));
            let gate_up_proj_wt = cat_cols(&[&gate_proj_wt, &up_proj_wt]);
            drop((gate_proj_wt, up_proj_wt));
            layers.push(Qwen3Block {
                input_norm: repl(take(&mut w, &format!("{p}.input_layernorm.weight"))),
-                q_proj_wt: wt(take(&mut w, &format!("{p}.self_attn.q_proj.weight"))),
+                qkv_proj_wt,
-                k_proj_wt: wt(take(&mut w, &format!("{p}.self_attn.k_proj.weight"))),
+                q_dim,
-                v_proj_wt: wt(take(&mut w, &format!("{p}.self_attn.v_proj.weight"))),
+                kv_dim,
                o_proj_wt: wt(take(&mut w, &format!("{p}.self_attn.o_proj.weight"))),
                q_norm: repl(take(&mut w, &format!("{p}.self_attn.q_norm.weight"))),
                k_norm: repl(take(&mut w, &format!("{p}.self_attn.k_norm.weight"))),
                post_norm: repl(take(&mut w, &format!("{p}.post_attention_layernorm.weight"))),
-                gate_proj_wt: wt(take(&mut w, &format!("{p}.mlp.gate_proj.weight"))),
+                gate_up_proj_wt,
                up_proj_wt: wt(take(&mut w, &format!("{p}.mlp.up_proj.weight"))),
                down_proj_wt: wt(take(&mut w, &format!("{p}.mlp.down_proj.weight"))),
            });
        }
@@ -272,9 +282,10 @@ impl Qwen3 {
            let residual = x.clone();
            let normed = rmsnorm(&x, &layer.input_norm, eps);
-            let q = matmul_2d(&normed, &layer.q_proj_wt);
+            let qkv = matmul_2d(&normed, &layer.qkv_proj_wt);
-            let k = matmul_2d(&normed, &layer.k_proj_wt);
+            let q = qkv.narrow(1, 0, layer.q_dim).contiguous();
-            let v = matmul_2d(&normed, &layer.v_proj_wt);
+            let k = qkv.narrow(1, layer.q_dim, layer.kv_dim).contiguous();
            let v = qkv.narrow(1, layer.q_dim + layer.kv_dim, layer.kv_dim).contiguous();
            let q = xserv_kernels::reshape_heads_gpu(&q, new_tokens, num_heads, head_dim);
            let k = xserv_kernels::reshape_heads_gpu(&k, new_tokens, num_kv_heads, head_dim);
@@ -300,8 +311,10 @@ impl Qwen3 {
            let (normed, x_new) = xserv_kernels::add_rmsnorm(&attn_proj, &residual, &layer.post_norm, eps);
            let residual = x_new.clone();
-            let gate = matmul_2d(&normed, &layer.gate_proj_wt);
+            let gate_up = matmul_2d(&normed, &layer.gate_up_proj_wt);
-            let up = matmul_2d(&normed, &layer.up_proj_wt);
+            let ffn_dim = gate_up.shape()[1] / 2;
            let gate = gate_up.narrow(1, 0, ffn_dim).contiguous();
            let up = gate_up.narrow(1, ffn_dim, ffn_dim).contiguous();
            let hidden_states = xserv_kernels::silu_mul(&gate, &up);
            let down = matmul_2d(&hidden_states, &layer.down_proj_wt);
            x = add_any(&residual, &down);
@@ -340,9 +353,10 @@ impl Qwen3 {
            let residual = x.clone();
            let normed = rmsnorm(&x, &layer.input_norm, eps);
-            let q_all = matmul_2d(&normed, &layer.q_proj_wt);
+            let qkv_all = matmul_2d(&normed, &layer.qkv_proj_wt);
-            let k_all = matmul_2d(&normed, &layer.k_proj_wt);
+            let q_all = qkv_all.narrow(1, 0, layer.q_dim).contiguous();
-            let v_all = matmul_2d(&normed, &layer.v_proj_wt);
+            let k_all = qkv_all.narrow(1, layer.q_dim, layer.kv_dim).contiguous();
            let v_all = qkv_all.narrow(1, layer.q_dim + layer.kv_dim, layer.kv_dim).contiguous();
            let mut q_rows: Vec<Tensor> = Vec::with_capacity(batch);
            for b in 0..batch {
@@ -390,8 +404,10 @@ impl Qwen3 {
            let (normed, x_new) = xserv_kernels::add_rmsnorm(&attn_proj, &residual, &layer.post_norm, eps);
            let residual = x_new.clone();
-            let gate = matmul_2d(&normed, &layer.gate_proj_wt);
+            let gate_up = matmul_2d(&normed, &layer.gate_up_proj_wt);
-            let up = matmul_2d(&normed, &layer.up_proj_wt);
+            let ffn_dim = gate_up.shape()[1] / 2;
            let gate = gate_up.narrow(1, 0, ffn_dim).contiguous();
            let up = gate_up.narrow(1, ffn_dim, ffn_dim).contiguous();
            let hidden_states = xserv_kernels::silu_mul(&gate, &up);
            let down = matmul_2d(&hidden_states, &layer.down_proj_wt);
            x = add_any(&residual, &down);
--- a/csrc/activation/activations.cu
+++ b/csrc/activation/activations.cu
@@ -58,6 +58,25 @@ __global__ void silu_mul_bf16_kernel(const __nv_bfloat16* gate, const __nv_bfloa
    }
 }
 // gpt-oss GLU: gate_up is [N, 2*D] with interleaved columns (gate=even, up=odd).
 // gate = gate_up[::2].clamp(max=limit)
 // up   = gate_up[1::2].clamp(-limit, limit)
 // glu  = gate * sigmoid(gate * alpha)
 // out  = (up + 1) * glu
 // Output: [N, D]
 __global__ void gpt_oss_glu_bf16_kernel(const __nv_bfloat16* gate_up, __nv_bfloat16* out,
                                         int n_elements, float alpha, float limit) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n_elements) {
        float g = __bfloat162float(gate_up[idx * 2]);
        float u = __bfloat162float(gate_up[idx * 2 + 1]);
        g = fminf(g, limit);
        u = fmaxf(fminf(u, limit), -limit);
        float glu = g / (1.0f + expf(-g * alpha));
        out[idx] = __float2bfloat16((u + 1.0f) * glu);
    }
 }
 // Element-wise add: out = a + b
 __global__ void add_f32_kernel(const float* a, const float* b, float* out, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -163,4 +182,13 @@ void launch_silu_mul_bf16(const void* gate, const void* up, void* out, int n, vo
    CUDA_CHECK_LAST_ERROR();
 }
 void launch_gpt_oss_glu_bf16(const void* gate_up, void* out, int n_elements,
                              float alpha, float limit, void* stream) {
    int block = 256;
    int grid = (n_elements + block - 1) / block;
    gpt_oss_glu_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)gate_up, (__nv_bfloat16*)out, n_elements, alpha, limit);
    CUDA_CHECK_LAST_ERROR();
 }
 }
--- a/csrc/attention/paged_attention.cu
+++ b/csrc/attention/paged_attention.cu
@@ -183,6 +183,173 @@ __global__ void paged_decode_attention_bf16_kernel(
    }
 }
 // Extended paged decode attention with attention sinks and sliding window.
 // sinks: [num_q_heads] BF16 — per-head extra logit appended before softmax.
 // window_size: >0 = sliding window (only attend to last `window_size` positions), 0 = full.
 __global__ void paged_decode_attention_sinks_bf16_kernel(
    const __nv_bfloat16* __restrict__ Q,
    const __nv_bfloat16* __restrict__ K_cache,
    const __nv_bfloat16* __restrict__ V_cache,
    __nv_bfloat16* __restrict__ O,
    const int* __restrict__ block_tables,
    const int* __restrict__ context_lens,
    const __nv_bfloat16* __restrict__ sinks, // [num_q_heads] or NULL
    int num_q_heads, int num_kv_heads,
    int head_dim, int max_blocks_per_seq,
    float scale, int window_size
 ) {
    int seq_idx = blockIdx.y;
    int q_head  = blockIdx.x;
    int tid     = threadIdx.x;
    int kv_len = context_lens[seq_idx];
    if (kv_len <= 0) {
        if (tid < head_dim) {
            O[((long long)seq_idx * num_q_heads + q_head) * head_dim + tid] =
                __float2bfloat16(0.0f);
        }
        return;
    }
    int heads_per_group = num_q_heads / num_kv_heads;
    int kv_head = q_head / heads_per_group;
    const __nv_bfloat16* Q_ptr = Q +
        ((long long)seq_idx * num_q_heads + q_head) * head_dim;
    __nv_bfloat16* O_ptr = O +
        ((long long)seq_idx * num_q_heads + q_head) * head_dim;
    const int* bt = block_tables + (long long)seq_idx * max_blocks_per_seq;
    // Sliding window: only attend to positions [kv_len - window_size, kv_len)
    int start_pos = 0;
    if (window_size > 0 && kv_len > window_size) {
        start_pos = kv_len - window_size;
    }
    float q_reg[PAGED_HEAD_DIM_MAX];
    for (int d = 0; d < head_dim; d++) {
        q_reg[d] = __bfloat162float(Q_ptr[d]);
    }
    float local_max = -INFINITY;
    float local_sum = 0.0f;
    float local_O[PAGED_HEAD_DIM_MAX];
    for (int d = 0; d < head_dim; d++) local_O[d] = 0.0f;
    int kv_stride_block = num_kv_heads * PAGED_BLOCK_SIZE * head_dim;
    int kv_stride_head  = PAGED_BLOCK_SIZE * head_dim;
    int attend_len = kv_len - start_pos;
    for (int rel = tid; rel < attend_len; rel += PAGED_THREADS) {
        int pos = start_pos + rel;
        int logical_blk = pos / PAGED_BLOCK_SIZE;
        int slot_in_blk = pos % PAGED_BLOCK_SIZE;
        int phys_blk = bt[logical_blk];
        const __nv_bfloat16* K_pos = K_cache
            + (long long)phys_blk * kv_stride_block
            + kv_head * kv_stride_head
            + slot_in_blk * head_dim;
        const __nv_bfloat16* V_pos = V_cache
            + (long long)phys_blk * kv_stride_block
            + kv_head * kv_stride_head
            + slot_in_blk * head_dim;
        float dot = 0.0f;
        for (int d = 0; d < head_dim; d++) {
            dot += q_reg[d] * __bfloat162float(K_pos[d]);
        }
        float s = dot * scale;
        float new_max = fmaxf(local_max, s);
        float correction = expf(local_max - new_max);
        float p = expf(s - new_max);
        local_sum = local_sum * correction + p;
        for (int d = 0; d < head_dim; d++) local_O[d] *= correction;
        for (int d = 0; d < head_dim; d++) {
            local_O[d] += p * __bfloat162float(V_pos[d]);
        }
        local_max = new_max;
    }
    // Include the sink logit (only thread 0 handles it to avoid double-counting)
    float sink_logit = -INFINITY;
    if (sinks != nullptr && tid == 0) {
        sink_logit = __bfloat162float(sinks[q_head]);
        float new_max = fmaxf(local_max, sink_logit);
        float correction = expf(local_max - new_max);
        float p = expf(sink_logit - new_max);
        local_sum = local_sum * correction + p;
        for (int d = 0; d < head_dim; d++) local_O[d] *= correction;
        // Sink absorbs probability but produces no value output (p * 0)
        local_max = new_max;
    }
    // ---- Block-level online softmax reduction (same as base kernel) ----
    __shared__ float smem_max[32];
    __shared__ float smem_sum[32];
    __shared__ float smem_O[PAGED_HEAD_DIM_MAX];
    int lane = tid & 31;
    int warp_id = tid >> 5;
    int num_warps = PAGED_THREADS >> 5;
    float warp_max = local_max;
    #pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1)
        warp_max = fmaxf(warp_max, __shfl_down_sync(0xffffffff, warp_max, offset));
    if (lane == 0) smem_max[warp_id] = warp_max;
    __syncthreads();
    float global_max;
    if (tid == 0) {
        global_max = smem_max[0];
        for (int i = 1; i < num_warps; i++)
            global_max = fmaxf(global_max, smem_max[i]);
        smem_max[0] = global_max;
    }
    __syncthreads();
    global_max = smem_max[0];
    float rescale = (local_max == -INFINITY) ? 0.0f : expf(local_max - global_max);
    local_sum *= rescale;
    for (int d = 0; d < head_dim; d++) local_O[d] *= rescale;
    float warp_sum = local_sum;
    #pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1)
        warp_sum += __shfl_down_sync(0xffffffff, warp_sum, offset);
    if (lane == 0) smem_sum[warp_id] = warp_sum;
    __syncthreads();
    float global_sum;
    if (tid == 0) {
        global_sum = 0.0f;
        for (int i = 0; i < num_warps; i++) global_sum += smem_sum[i];
        smem_sum[0] = global_sum;
    }
    __syncthreads();
    global_sum = smem_sum[0];
    for (int d = tid; d < head_dim; d += PAGED_THREADS) smem_O[d] = 0.0f;
    __syncthreads();
    for (int d = 0; d < head_dim; d++) {
        float val = local_O[d];
        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down_sync(0xffffffff, val, offset);
        if (lane == 0) atomicAdd(&smem_O[d], val);
    }
    __syncthreads();
    float inv_sum = (global_sum > 0.0f) ? (1.0f / global_sum) : 0.0f;
    for (int d = tid; d < head_dim; d += PAGED_THREADS) {
        O_ptr[d] = __float2bfloat16(smem_O[d] * inv_sum);
    }
 }
 extern "C" {
 void launch_paged_decode_attention_bf16(
@@ -212,4 +379,33 @@ void launch_paged_decode_attention_bf16(
    CUDA_CHECK_LAST_ERROR();
 }
 void launch_paged_decode_attention_sinks_bf16(
    const void* Q,
    const void* K_cache,
    const void* V_cache,
    void* O,
    const int* block_tables,
    const int* context_lens,
    const void* sinks,
    int batch, int num_q_heads, int num_kv_heads,
    int head_dim, int max_blocks_per_seq,
    float scale, int window_size, void* stream
 ) {
    dim3 grid(num_q_heads, batch);
    int block = PAGED_THREADS;
    paged_decode_attention_sinks_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)Q,
        (const __nv_bfloat16*)K_cache,
        (const __nv_bfloat16*)V_cache,
        (__nv_bfloat16*)O,
        block_tables, context_lens,
        (const __nv_bfloat16*)sinks,
        num_q_heads, num_kv_heads,
        head_dim, max_blocks_per_seq,
        scale, window_size
    );
    CUDA_CHECK_LAST_ERROR();
 }
 }