From cfbd64d20695e2a760b18ed34438dd0630d22986 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Wed, 1 Jul 2026 12:37:21 +0800 Subject: [PATCH] cuda: fix int32 overflow in MoE dense kernels; surface launch errors in release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dense MoE kernels (moe_replicate, moe_bias_add_3d, moe_weighted_sum) computed total / expert_stride / element indices in int32. gpt-oss prefill runs the whole prompt through the dense path unchunked (SPARSE_MAX_TOKENS=8), so local_experts*num_tokens*hidden (and batch*num_tokens*dim, and local_id*expert_stride) overflow int32 at ~3.6k-23k prefill tokens (TP-dependent) — well inside the supported context window. The launch then fails silently because CUDA_CHECK_LAST_ERROR was ((void)0) under NDEBUG, so the bias / weighted-sum simply never runs and the forward pass is corrupted with no error reported. Fix: switch the three kernels and their launchers to long long, mirroring the (long long) indexing already used in moe_sparse.cu. Also make CUDA_CHECK_LAST_ERROR always-on — cudaGetLastError does not sync, so the per-launch host cost is negligible, and a silent launch failure is exactly the class of bug this one was. Verified on dash5 (RTX 5090): a direct kernel test at 2.21B elements (>2^31) for both moe_replicate and moe_bias_add_3d produces correct results with no launch error; bench-gpt-oss TP=2 holds at 5.9ms TPOT, output unchanged. Co-Authored-By: Claude Opus 4.8 --- csrc/common.cuh | 11 ++++----- csrc/moe/moe_kernels.cu | 49 ++++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/csrc/common.cuh b/csrc/common.cuh index 27b6ae1..40565b4 100644 --- a/csrc/common.cuh +++ b/csrc/common.cuh @@ -49,10 +49,12 @@ __device__ __forceinline__ float block_reduce_max(float val) { return val; } -// --- Launch error checking (debug builds only) --- -#ifdef NDEBUG -#define CUDA_CHECK_LAST_ERROR() ((void)0) -#else +// --- Launch error checking --- +// Always on, including release builds. A launch with an invalid config +// (e.g. 32-bit overflow in grid/index math) is otherwise silent and produces +// garbage with no clue — the MoE int32-overflow bug was found exactly because +// release swallowed the launch failure. `cudaGetLastError()` does not +// synchronize the stream, so the per-launch host cost is negligible. #include #define CUDA_CHECK_LAST_ERROR() do { \ cudaError_t err = cudaGetLastError(); \ @@ -61,4 +63,3 @@ __device__ __forceinline__ float block_reduce_max(float val) { __FILE__, __LINE__, cudaGetErrorString(err)); \ } \ } while(0) -#endif diff --git a/csrc/moe/moe_kernels.cu b/csrc/moe/moe_kernels.cu index 441cd7c..4a1ab99 100644 --- a/csrc/moe/moe_kernels.cu +++ b/csrc/moe/moe_kernels.cu @@ -89,13 +89,17 @@ __global__ void moe_replicate_bf16_kernel( __nv_bfloat16* __restrict__ x_rep, int num_tokens, int hidden, int local_experts ) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int total = local_experts * num_tokens * hidden; + // 64-bit index: local_experts * num_tokens * hidden overflows int32 at + // ~2.3k prefill tokens (gpt-oss TP=1, 32 experts), which is inside the + // supported context window. A 32-bit `total` silently wraps, the launch + // fails, and (in release) the error is invisible — see common.cuh. + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + long long total = (long long)local_experts * num_tokens * hidden; if (idx >= total) return; - int remainder = idx % (num_tokens * hidden); // x_rep[expert, token, dim] = x[token, dim] - x_rep[idx] = x[remainder]; + long long row_stride = (long long)num_tokens * hidden; + x_rep[idx] = x[idx % row_stride]; } // ============================================================ @@ -112,13 +116,16 @@ __global__ void moe_bias_add_3d_bf16_kernel( const __nv_bfloat16* __restrict__ bias, int batch, int num_tokens, int dim ) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int total = batch * num_tokens * dim; + // 64-bit index: batch * num_tokens * dim overflows int32 at ~3.6k prefill + // tokens (gpt-oss TP=1, 32 experts, 2*intermediate dim) — see moe_replicate. + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + long long total = (long long)batch * num_tokens * dim; if (idx >= total) return; - int b = idx / (num_tokens * dim); - int d = idx % dim; - float v = __bfloat162float(x[idx]) + __bfloat162float(bias[b * dim + d]); + long long td = (long long)num_tokens * dim; + int b = (int)(idx / td); // < batch (small) + int d = (int)(idx % dim); // < dim + float v = __bfloat162float(x[idx]) + __bfloat162float(bias[(long long)b * dim + d]); x[idx] = __float2bfloat16(v); } @@ -151,14 +158,16 @@ __global__ void moe_weighted_sum_bf16_kernel( int num_tokens, int hidden, int top_k, int expert_start, int local_experts ) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int total = num_tokens * hidden; + // 64-bit index: `local_id * expert_stride` overflows int32 for long prefills + // (expert_stride = num_tokens * hidden), reading the wrong expert element. + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + long long total = (long long)num_tokens * hidden; if (idx >= total) return; - int token = idx / hidden; - int dim = idx % hidden; + long long token = idx / hidden; + int dim = (int)(idx % hidden); - int expert_stride = num_tokens * hidden; // stride between experts in expert_out + long long expert_stride = (long long)num_tokens * hidden; // stride between experts in expert_out float sum = 0.0f; for (int k = 0; k < top_k; k++) { @@ -196,9 +205,9 @@ void launch_moe_replicate_bf16( int num_tokens, int hidden, int local_experts, void* stream ) { - int total = local_experts * num_tokens * hidden; + long long total = (long long)local_experts * num_tokens * hidden; int block = 256; - int grid = (total + block - 1) / block; + int grid = (int)((total + block - 1) / block); moe_replicate_bf16_kernel<<>>( (const __nv_bfloat16*)x, (__nv_bfloat16*)x_rep, num_tokens, hidden, local_experts @@ -211,9 +220,9 @@ void launch_moe_bias_add_3d_bf16( int batch, int num_tokens, int dim, void* stream ) { - int total = batch * num_tokens * dim; + long long total = (long long)batch * num_tokens * dim; int block = 256; - int grid = (total + block - 1) / block; + int grid = (int)((total + block - 1) / block); moe_bias_add_3d_bf16_kernel<<>>( (__nv_bfloat16*)x, (const __nv_bfloat16*)bias, batch, num_tokens, dim @@ -229,9 +238,9 @@ void launch_moe_weighted_sum_bf16( int expert_start, int local_experts, void* stream ) { - int total = num_tokens * hidden; + long long total = (long long)num_tokens * hidden; int block = 256; - int grid = (total + block - 1) / block; + int grid = (int)((total + block - 1) / block); moe_weighted_sum_bf16_kernel<<>>( (const __nv_bfloat16*)expert_out, (const int*)topk_ids, (const float*)topk_weights,