cuda: fix int32 overflow in MoE dense kernels; surface launch errors in release

The dense MoE kernels (moe_replicate, moe_bias_add_3d, moe_weighted_sum) computed total / expert_stride / element indices in int32. gpt-oss prefill runs the whole prompt through the dense path unchunked (SPARSE_MAX_TOKENS=8), so local_experts*num_tokens*hidden (and batch*num_tokens*dim, and local_id*expert_stride) overflow int32 at ~3.6k-23k prefill tokens (TP-dependent) — well inside the supported context window. The launch then fails silently because CUDA_CHECK_LAST_ERROR was ((void)0) under NDEBUG, so the bias / weighted-sum simply never runs and the forward pass is corrupted with no error reported. Fix: switch the three kernels and their launchers to long long, mirroring the (long long) indexing already used in moe_sparse.cu. Also make CUDA_CHECK_LAST_ERROR always-on — cudaGetLastError does not sync, so the per-launch host cost is negligible, and a silent launch failure is exactly the class of bug this one was. Verified on dash5 (RTX 5090): a direct kernel test at 2.21B elements (>2^31) for both moe_replicate and moe_bias_add_3d produces correct results with no launch error; bench-gpt-oss TP=2 holds at 5.9ms TPOT, output unchanged. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-01 12:37:21 +08:00
parent 531cd3fe08
commit cfbd64d206
2 changed files with 35 additions and 25 deletions
--- a/csrc/common.cuh
+++ b/csrc/common.cuh
@@ -49,10 +49,12 @@ __device__ __forceinline__ float block_reduce_max(float val) {
    return val;
 }
-// --- Launch error checking (debug builds only) ---
+// --- Launch error checking ---
-#ifdef NDEBUG
+// Always on, including release builds. A launch with an invalid config
-#define CUDA_CHECK_LAST_ERROR() ((void)0)
+// (e.g. 32-bit overflow in grid/index math) is otherwise silent and produces
-#else
+// garbage with no clue — the MoE int32-overflow bug was found exactly because
 // release swallowed the launch failure. `cudaGetLastError()` does not
 // synchronize the stream, so the per-launch host cost is negligible.
 #include <cstdio>
 #define CUDA_CHECK_LAST_ERROR() do { \
    cudaError_t err = cudaGetLastError(); \
@@ -61,4 +63,3 @@ __device__ __forceinline__ float block_reduce_max(float val) {
                __FILE__, __LINE__, cudaGetErrorString(err)); \
    } \
 } while(0)
 #endif
--- a/csrc/moe/moe_kernels.cu
+++ b/csrc/moe/moe_kernels.cu
@@ -89,13 +89,17 @@ __global__ void moe_replicate_bf16_kernel(
    __nv_bfloat16* __restrict__ x_rep,
    int num_tokens, int hidden, int local_experts
 ) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    // 64-bit index: local_experts * num_tokens * hidden overflows int32 at
-    int total = local_experts * num_tokens * hidden;
+    // ~2.3k prefill tokens (gpt-oss TP=1, 32 experts), which is inside the
    // supported context window. A 32-bit `total` silently wraps, the launch
    // fails, and (in release) the error is invisible — see common.cuh.
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    long long total = (long long)local_experts * num_tokens * hidden;
    if (idx >= total) return;
    int remainder = idx % (num_tokens * hidden);
    // x_rep[expert, token, dim] = x[token, dim]
-    x_rep[idx] = x[remainder];
+    long long row_stride = (long long)num_tokens * hidden;
    x_rep[idx] = x[idx % row_stride];
 }
 // ============================================================
@@ -112,13 +116,16 @@ __global__ void moe_bias_add_3d_bf16_kernel(
    const __nv_bfloat16* __restrict__ bias,
    int batch, int num_tokens, int dim
 ) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    // 64-bit index: batch * num_tokens * dim overflows int32 at ~3.6k prefill
-    int total = batch * num_tokens * dim;
+    // tokens (gpt-oss TP=1, 32 experts, 2*intermediate dim) — see moe_replicate.
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    long long total = (long long)batch * num_tokens * dim;
    if (idx >= total) return;
-    int b = idx / (num_tokens * dim);
+    long long td = (long long)num_tokens * dim;
-    int d = idx % dim;
+    int b = (int)(idx / td);   // < batch (small)
-    float v = __bfloat162float(x[idx]) + __bfloat162float(bias[b * dim + d]);
+    int d = (int)(idx % dim);  // < dim
    float v = __bfloat162float(x[idx]) + __bfloat162float(bias[(long long)b * dim + d]);
    x[idx] = __float2bfloat16(v);
 }
@@ -151,14 +158,16 @@ __global__ void moe_weighted_sum_bf16_kernel(
    int num_tokens, int hidden, int top_k,
    int expert_start, int local_experts
 ) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    // 64-bit index: `local_id * expert_stride` overflows int32 for long prefills
-    int total = num_tokens * hidden;
+    // (expert_stride = num_tokens * hidden), reading the wrong expert element.
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    long long total = (long long)num_tokens * hidden;
    if (idx >= total) return;
-    int token = idx / hidden;
+    long long token = idx / hidden;
-    int dim = idx % hidden;
+    int dim = (int)(idx % hidden);
-    int expert_stride = num_tokens * hidden; // stride between experts in expert_out
+    long long expert_stride = (long long)num_tokens * hidden; // stride between experts in expert_out
    float sum = 0.0f;
    for (int k = 0; k < top_k; k++) {
@@ -196,9 +205,9 @@ void launch_moe_replicate_bf16(
    int num_tokens, int hidden, int local_experts,
    void* stream
 ) {
-    int total = local_experts * num_tokens * hidden;
+    long long total = (long long)local_experts * num_tokens * hidden;
    int block = 256;
-    int grid = (total + block - 1) / block;
+    int grid = (int)((total + block - 1) / block);
    moe_replicate_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)x, (__nv_bfloat16*)x_rep,
        num_tokens, hidden, local_experts
@@ -211,9 +220,9 @@ void launch_moe_bias_add_3d_bf16(
    int batch, int num_tokens, int dim,
    void* stream
 ) {
-    int total = batch * num_tokens * dim;
+    long long total = (long long)batch * num_tokens * dim;
    int block = 256;
-    int grid = (total + block - 1) / block;
+    int grid = (int)((total + block - 1) / block);
    moe_bias_add_3d_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
        (__nv_bfloat16*)x, (const __nv_bfloat16*)bias,
        batch, num_tokens, dim
@@ -229,9 +238,9 @@ void launch_moe_weighted_sum_bf16(
    int expert_start, int local_experts,
    void* stream
 ) {
-    int total = num_tokens * hidden;
+    long long total = (long long)num_tokens * hidden;
    int block = 256;
-    int grid = (total + block - 1) / block;
+    int grid = (int)((total + block - 1) / block);
    moe_weighted_sum_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)expert_out,
        (const int*)topk_ids, (const float*)topk_weights,