diff --git a/crates/xserv-kernels/src/embedding.rs b/crates/xserv-kernels/src/embedding.rs
index 42d37eb..6655e37 100644
--- a/crates/xserv-kernels/src/embedding.rs
+++ b/crates/xserv-kernels/src/embedding.rs
@@ -1,5 +1,4 @@
 use std::ffi::c_void;
-use xserv_cuda::GpuBuffer;
 use xserv_tensor::{DType, Device, Tensor};
 
 unsafe extern "C" {
diff --git a/crates/xserv-kernels/src/moe.rs b/crates/xserv-kernels/src/moe.rs
index cc25996..4dd37e1 100644
--- a/crates/xserv-kernels/src/moe.rs
+++ b/crates/xserv-kernels/src/moe.rs
@@ -1,5 +1,5 @@
 use std::ffi::c_void;
-use xserv_tensor::{DType, Device, Tensor};
+use xserv_tensor::{DType, Tensor};
 
 use crate::gemm::{cublas_handle, CublasHandle};
 
@@ -88,6 +88,9 @@ pub fn moe_topk_softmax(
     let num_tokens = router_logits.shape()[0];
     assert_eq!(router_logits.shape()[1], num_experts);
 
+    // NOTE: topk_ids actually holds i32 expert indices; DType has no I32, so
+    // this is a raw 4-byte buffer mislabeled F32. Never read it as floats —
+    // all consumers (weighted-sum / sparse GEMV kernels) cast to int*.
     let topk_ids = Tensor::empty(&[num_tokens, top_k], DType::F32, router_logits.device());
     let topk_weights = Tensor::empty(&[num_tokens, top_k], DType::F32, router_logits.device());
 
@@ -201,8 +204,12 @@ pub fn moe_sparse_gemv_fp8(
 ) -> Tensor {
     assert_eq!(x.dtype(), DType::BF16);
     assert!(x.is_contiguous());
+    assert_eq!(w_fp8_t.dtype(), DType::FP8E4M3);
     let n = w_fp8_t.shape()[1];
     let k = w_fp8_t.shape()[2];
+    // The kernel reads weights as uint4 (16 FP8 values per lane) and would
+    // silently skip a K%16 tail.
+    assert_eq!(k % 16, 0, "sparse FP8 GEMV requires K % 16 == 0, got {k}");
     assert_eq!(x.shape()[x.ndim() - 1], k);
     assert_eq!(x.shape()[0], if x_per_slot { num_tokens * top_k } else { num_tokens });
 
@@ -233,6 +240,8 @@ pub fn moe_sparse_gemv_mxfp4(
 ) -> Tensor {
     assert_eq!(x.dtype(), DType::BF16);
     assert!(x.is_contiguous());
+    // 32-element MXFP4 blocks, read as uint4 (32 nibbles) per lane.
+    assert_eq!(k % 32, 0, "sparse MXFP4 GEMV requires K % 32 == 0, got {k}");
     assert_eq!(x.shape()[x.ndim() - 1], k);
     assert_eq!(x.shape()[0], if x_per_slot { num_tokens * top_k } else { num_tokens });
 
diff --git a/crates/xserv-kernels/src/quantization.rs b/crates/xserv-kernels/src/quantization.rs
index 292d24b..a937926 100644
--- a/crates/xserv-kernels/src/quantization.rs
+++ b/crates/xserv-kernels/src/quantization.rs
@@ -107,17 +107,11 @@ const CUDA_R_8F_E4M3: i32 = 28;
 // MatmulDesc attributes
 const CUBLASLT_MATMUL_DESC_A_SCALE_POINTER: i32 = 17;
 const CUBLASLT_MATMUL_DESC_B_SCALE_POINTER: i32 = 18;
-const CUBLASLT_MATMUL_DESC_A_SCALE_MODE: i32 = 31;
-const CUBLASLT_MATMUL_DESC_B_SCALE_MODE: i32 = 32;
 
 // MatrixLayout attributes
 const CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT: i32 = 5;
 const CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET: i32 = 6;
 
-// Scale modes
-const CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR: i32 = 0;
-const CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F: i32 = 3;
-
 // MatmulPreference attributes
 const CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES: i32 = 1;
 
diff --git a/crates/xserv-model/src/gpt2.rs b/crates/xserv-model/src/gpt2.rs
index 4d575e2..f44c3c4 100644
--- a/crates/xserv-model/src/gpt2.rs
+++ b/crates/xserv-model/src/gpt2.rs
@@ -199,7 +199,7 @@ impl GPT2 {
         layer: &GPT2Block,
         x: &Tensor,
         cache: Option<(&mut KVCache, usize)>,
-        pos_offset: usize,
+        _pos_offset: usize,
         new_tokens: usize,
         num_heads: usize,
         head_dim: usize,
diff --git a/crates/xserv-model/src/kv_cache.rs b/crates/xserv-model/src/kv_cache.rs
index ef5ef2d..5daf6cd 100644
--- a/crates/xserv-model/src/kv_cache.rs
+++ b/crates/xserv-model/src/kv_cache.rs
@@ -1,5 +1,5 @@
 use xserv_cuda::GpuBuffer;
-use xserv_tensor::{DType, Device, Tensor};
+use xserv_tensor::{DType, Tensor};
 use crate::config::ModelConfig;
 
 /// GPU-resident KV cache. Pre-allocates max_seq_len on GPU,
diff --git a/crates/xserv-model/src/qwen3.rs b/crates/xserv-model/src/qwen3.rs
index 0137b9a..30998ef 100644
--- a/crates/xserv-model/src/qwen3.rs
+++ b/crates/xserv-model/src/qwen3.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 use half::bf16;
 use xserv_kernels::*;
-use xserv_tensor::{DType, Device, Tensor};
+use xserv_tensor::{Device, Tensor};
 
 use crate::config::ModelConfig;
 use crate::gpt2::KVCache;
@@ -798,7 +798,7 @@ impl Qwen3 {
     pub fn forward_gpu_cache(&self, token_ids: &[u32], cache: &mut GpuKVCache) -> Tensor {
         let new_tokens = token_ids.len();
         let pos_offset = cache.seq_len();
-        let hidden = self.config.hidden();
+
         let num_heads = self.config.num_heads();
         let num_kv_heads = self.config.num_kv_heads();
         let head_dim = self.config.head_dim();
diff --git a/crates/xserv-server/src/main.rs b/crates/xserv-server/src/main.rs
index 497ca8d..f7937fa 100644
--- a/crates/xserv-server/src/main.rs
+++ b/crates/xserv-server/src/main.rs
@@ -65,6 +65,13 @@ async fn main() {
         std::process::exit(1);
     }
     let model_config = ModelConfig::from_file(&model_dir.join("config.json"));
+    // gpt-oss is only implemented in the TP engine; route it there even at
+    // tp=1 (single-rank world) so quantized models can serve on one GPU.
+    let is_gpt_oss = model_config.model_type.as_deref() == Some("gpt_oss");
+    if pp > 1 && is_gpt_oss {
+        eprintln!("gpt-oss is not supported by the pipeline-parallel engine (Qwen3 only); use --tp instead");
+        std::process::exit(1);
+    }
     let model_max_seq_len = model_config.max_seq_len();
     if model_max_seq_len == 0 {
         eprintln!("model config has invalid max_seq_len=0");
@@ -87,9 +94,6 @@ async fn main() {
     let (tx, rx) = mpsc::channel::<GenerateRequest>();
 
     let model_dir_clone = model_dir.clone();
-    // gpt-oss is only implemented in the TP engine; route it there even at
-    // tp=1 (single-rank world) so quantized models can serve on one GPU.
-    let is_gpt_oss = model_config.model_type.as_deref() == Some("gpt_oss");
     std::thread::spawn(move || {
         if pp > 1 {
             // Pipeline-parallel path: stage-0 coordinator + worker stage threads.
diff --git a/csrc/moe/moe_kernels.cu b/csrc/moe/moe_kernels.cu
index 9b58552..441cd7c 100644
--- a/csrc/moe/moe_kernels.cu
+++ b/csrc/moe/moe_kernels.cu
@@ -93,11 +93,9 @@ __global__ void moe_replicate_bf16_kernel(
     int total = local_experts * num_tokens * hidden;
     if (idx >= total) return;
 
-    int expert = idx / (num_tokens * hidden);
     int remainder = idx % (num_tokens * hidden);
     // x_rep[expert, token, dim] = x[token, dim]
     x_rep[idx] = x[remainder];
-    (void)expert; // suppress unused warning
 }
 
 // ============================================================