From cfbd64d20695e2a760b18ed34438dd0630d22986 Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Wed, 1 Jul 2026 12:37:21 +0800
Subject: [PATCH] cuda: fix int32 overflow in MoE dense kernels; surface launch
 errors in release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dense MoE kernels (moe_replicate, moe_bias_add_3d, moe_weighted_sum)
computed total / expert_stride / element indices in int32. gpt-oss prefill
runs the whole prompt through the dense path unchunked (SPARSE_MAX_TOKENS=8),
so local_experts*num_tokens*hidden (and batch*num_tokens*dim, and
local_id*expert_stride) overflow int32 at ~3.6k-23k prefill tokens
(TP-dependent) — well inside the supported context window. The launch then
fails silently because CUDA_CHECK_LAST_ERROR was ((void)0) under NDEBUG, so
the bias / weighted-sum simply never runs and the forward pass is corrupted
with no error reported.

Fix: switch the three kernels and their launchers to long long, mirroring the
(long long) indexing already used in moe_sparse.cu. Also make
CUDA_CHECK_LAST_ERROR always-on — cudaGetLastError does not sync, so the
per-launch host cost is negligible, and a silent launch failure is exactly
the class of bug this one was.

Verified on dash5 (RTX 5090): a direct kernel test at 2.21B elements (>2^31)
for both moe_replicate and moe_bias_add_3d produces correct results with no
launch error; bench-gpt-oss TP=2 holds at 5.9ms TPOT, output unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 csrc/common.cuh         | 11 ++++-----
 csrc/moe/moe_kernels.cu | 49 ++++++++++++++++++++++++-----------------
 2 files changed, 35 insertions(+), 25 deletions(-)
diff --git a/csrc/common.cuh b/csrc/common.cuh
index 27b6ae1..40565b4 100644
--- a/csrc/common.cuh
+++ b/csrc/common.cuh
@@ -49,10 +49,12 @@ __device__ __forceinline__ float block_reduce_max(float val) {
     return val;
 }
 
-// --- Launch error checking (debug builds only) ---
-#ifdef NDEBUG
-#define CUDA_CHECK_LAST_ERROR() ((void)0)
-#else
+// --- Launch error checking ---
+// Always on, including release builds. A launch with an invalid config
+// (e.g. 32-bit overflow in grid/index math) is otherwise silent and produces
+// garbage with no clue — the MoE int32-overflow bug was found exactly because
+// release swallowed the launch failure. `cudaGetLastError()` does not
+// synchronize the stream, so the per-launch host cost is negligible.
 #include <cstdio>
 #define CUDA_CHECK_LAST_ERROR() do { \
     cudaError_t err = cudaGetLastError(); \
@@ -61,4 +63,3 @@ __device__ __forceinline__ float block_reduce_max(float val) {
                 __FILE__, __LINE__, cudaGetErrorString(err)); \
     } \
 } while(0)
-#endif
diff --git a/csrc/moe/moe_kernels.cu b/csrc/moe/moe_kernels.cu
index 441cd7c..4a1ab99 100644
--- a/csrc/moe/moe_kernels.cu
+++ b/csrc/moe/moe_kernels.cu
@@ -89,13 +89,17 @@ __global__ void moe_replicate_bf16_kernel(
     __nv_bfloat16* __restrict__ x_rep,
     int num_tokens, int hidden, int local_experts
 ) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int total = local_experts * num_tokens * hidden;
+    // 64-bit index: local_experts * num_tokens * hidden overflows int32 at
+    // ~2.3k prefill tokens (gpt-oss TP=1, 32 experts), which is inside the
+    // supported context window. A 32-bit `total` silently wraps, the launch
+    // fails, and (in release) the error is invisible — see common.cuh.
+    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
+    long long total = (long long)local_experts * num_tokens * hidden;
     if (idx >= total) return;
 
-    int remainder = idx % (num_tokens * hidden);
     // x_rep[expert, token, dim] = x[token, dim]
-    x_rep[idx] = x[remainder];
+    long long row_stride = (long long)num_tokens * hidden;
+    x_rep[idx] = x[idx % row_stride];
 }
 
 // ============================================================
@@ -112,13 +116,16 @@ __global__ void moe_bias_add_3d_bf16_kernel(
     const __nv_bfloat16* __restrict__ bias,
     int batch, int num_tokens, int dim
 ) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int total = batch * num_tokens * dim;
+    // 64-bit index: batch * num_tokens * dim overflows int32 at ~3.6k prefill
+    // tokens (gpt-oss TP=1, 32 experts, 2*intermediate dim) — see moe_replicate.
+    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
+    long long total = (long long)batch * num_tokens * dim;
     if (idx >= total) return;
 
-    int b = idx / (num_tokens * dim);
-    int d = idx % dim;
-    float v = __bfloat162float(x[idx]) + __bfloat162float(bias[b * dim + d]);
+    long long td = (long long)num_tokens * dim;
+    int b = (int)(idx / td);   // < batch (small)
+    int d = (int)(idx % dim);  // < dim
+    float v = __bfloat162float(x[idx]) + __bfloat162float(bias[(long long)b * dim + d]);
     x[idx] = __float2bfloat16(v);
 }
 
@@ -151,14 +158,16 @@ __global__ void moe_weighted_sum_bf16_kernel(
     int num_tokens, int hidden, int top_k,
     int expert_start, int local_experts
 ) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int total = num_tokens * hidden;
+    // 64-bit index: `local_id * expert_stride` overflows int32 for long prefills
+    // (expert_stride = num_tokens * hidden), reading the wrong expert element.
+    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
+    long long total = (long long)num_tokens * hidden;
     if (idx >= total) return;
 
-    int token = idx / hidden;
-    int dim = idx % hidden;
+    long long token = idx / hidden;
+    int dim = (int)(idx % hidden);
 
-    int expert_stride = num_tokens * hidden; // stride between experts in expert_out
+    long long expert_stride = (long long)num_tokens * hidden; // stride between experts in expert_out
 
     float sum = 0.0f;
     for (int k = 0; k < top_k; k++) {
@@ -196,9 +205,9 @@ void launch_moe_replicate_bf16(
     int num_tokens, int hidden, int local_experts,
     void* stream
 ) {
-    int total = local_experts * num_tokens * hidden;
+    long long total = (long long)local_experts * num_tokens * hidden;
     int block = 256;
-    int grid = (total + block - 1) / block;
+    int grid = (int)((total + block - 1) / block);
     moe_replicate_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
         (const __nv_bfloat16*)x, (__nv_bfloat16*)x_rep,
         num_tokens, hidden, local_experts
@@ -211,9 +220,9 @@ void launch_moe_bias_add_3d_bf16(
     int batch, int num_tokens, int dim,
     void* stream
 ) {
-    int total = batch * num_tokens * dim;
+    long long total = (long long)batch * num_tokens * dim;
     int block = 256;
-    int grid = (total + block - 1) / block;
+    int grid = (int)((total + block - 1) / block);
     moe_bias_add_3d_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
         (__nv_bfloat16*)x, (const __nv_bfloat16*)bias,
         batch, num_tokens, dim
@@ -229,9 +238,9 @@ void launch_moe_weighted_sum_bf16(
     int expert_start, int local_experts,
     void* stream
 ) {
-    int total = num_tokens * hidden;
+    long long total = (long long)num_tokens * hidden;
     int block = 256;
-    int grid = (total + block - 1) / block;
+    int grid = (int)((total + block - 1) / block);
     moe_weighted_sum_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
         (const __nv_bfloat16*)expert_out,
         (const int*)topk_ids, (const float*)topk_weights,