cuda: fix int32 overflow in MoE dense kernels; surface launch errors in release

The dense MoE kernels (moe_replicate, moe_bias_add_3d, moe_weighted_sum)
computed total / expert_stride / element indices in int32. gpt-oss prefill
runs the whole prompt through the dense path unchunked (SPARSE_MAX_TOKENS=8),
so local_experts*num_tokens*hidden (and batch*num_tokens*dim, and
local_id*expert_stride) overflow int32 at ~3.6k-23k prefill tokens
(TP-dependent) — well inside the supported context window. The launch then
fails silently because CUDA_CHECK_LAST_ERROR was ((void)0) under NDEBUG, so
the bias / weighted-sum simply never runs and the forward pass is corrupted
with no error reported.

Fix: switch the three kernels and their launchers to long long, mirroring the
(long long) indexing already used in moe_sparse.cu. Also make
CUDA_CHECK_LAST_ERROR always-on — cudaGetLastError does not sync, so the
per-launch host cost is negligible, and a silent launch failure is exactly
the class of bug this one was.

Verified on dash5 (RTX 5090): a direct kernel test at 2.21B elements (>2^31)
for both moe_replicate and moe_bias_add_3d produces correct results with no
launch error; bench-gpt-oss TP=2 holds at 5.9ms TPOT, output unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-07-01 12:37:21 +08:00
parent 531cd3fe08
commit cfbd64d206
2 changed files with 35 additions and 25 deletions

View File

@@ -49,10 +49,12 @@ __device__ __forceinline__ float block_reduce_max(float val) {
return val; return val;
} }
// --- Launch error checking (debug builds only) --- // --- Launch error checking ---
#ifdef NDEBUG // Always on, including release builds. A launch with an invalid config
#define CUDA_CHECK_LAST_ERROR() ((void)0) // (e.g. 32-bit overflow in grid/index math) is otherwise silent and produces
#else // garbage with no clue — the MoE int32-overflow bug was found exactly because
// release swallowed the launch failure. `cudaGetLastError()` does not
// synchronize the stream, so the per-launch host cost is negligible.
#include <cstdio> #include <cstdio>
#define CUDA_CHECK_LAST_ERROR() do { \ #define CUDA_CHECK_LAST_ERROR() do { \
cudaError_t err = cudaGetLastError(); \ cudaError_t err = cudaGetLastError(); \
@@ -61,4 +63,3 @@ __device__ __forceinline__ float block_reduce_max(float val) {
__FILE__, __LINE__, cudaGetErrorString(err)); \ __FILE__, __LINE__, cudaGetErrorString(err)); \
} \ } \
} while(0) } while(0)
#endif

View File

@@ -89,13 +89,17 @@ __global__ void moe_replicate_bf16_kernel(
__nv_bfloat16* __restrict__ x_rep, __nv_bfloat16* __restrict__ x_rep,
int num_tokens, int hidden, int local_experts int num_tokens, int hidden, int local_experts
) { ) {
int idx = blockIdx.x * blockDim.x + threadIdx.x; // 64-bit index: local_experts * num_tokens * hidden overflows int32 at
int total = local_experts * num_tokens * hidden; // ~2.3k prefill tokens (gpt-oss TP=1, 32 experts), which is inside the
// supported context window. A 32-bit `total` silently wraps, the launch
// fails, and (in release) the error is invisible — see common.cuh.
long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
long long total = (long long)local_experts * num_tokens * hidden;
if (idx >= total) return; if (idx >= total) return;
int remainder = idx % (num_tokens * hidden);
// x_rep[expert, token, dim] = x[token, dim] // x_rep[expert, token, dim] = x[token, dim]
x_rep[idx] = x[remainder]; long long row_stride = (long long)num_tokens * hidden;
x_rep[idx] = x[idx % row_stride];
} }
// ============================================================ // ============================================================
@@ -112,13 +116,16 @@ __global__ void moe_bias_add_3d_bf16_kernel(
const __nv_bfloat16* __restrict__ bias, const __nv_bfloat16* __restrict__ bias,
int batch, int num_tokens, int dim int batch, int num_tokens, int dim
) { ) {
int idx = blockIdx.x * blockDim.x + threadIdx.x; // 64-bit index: batch * num_tokens * dim overflows int32 at ~3.6k prefill
int total = batch * num_tokens * dim; // tokens (gpt-oss TP=1, 32 experts, 2*intermediate dim) — see moe_replicate.
long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
long long total = (long long)batch * num_tokens * dim;
if (idx >= total) return; if (idx >= total) return;
int b = idx / (num_tokens * dim); long long td = (long long)num_tokens * dim;
int d = idx % dim; int b = (int)(idx / td); // < batch (small)
float v = __bfloat162float(x[idx]) + __bfloat162float(bias[b * dim + d]); int d = (int)(idx % dim); // < dim
float v = __bfloat162float(x[idx]) + __bfloat162float(bias[(long long)b * dim + d]);
x[idx] = __float2bfloat16(v); x[idx] = __float2bfloat16(v);
} }
@@ -151,14 +158,16 @@ __global__ void moe_weighted_sum_bf16_kernel(
int num_tokens, int hidden, int top_k, int num_tokens, int hidden, int top_k,
int expert_start, int local_experts int expert_start, int local_experts
) { ) {
int idx = blockIdx.x * blockDim.x + threadIdx.x; // 64-bit index: `local_id * expert_stride` overflows int32 for long prefills
int total = num_tokens * hidden; // (expert_stride = num_tokens * hidden), reading the wrong expert element.
long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
long long total = (long long)num_tokens * hidden;
if (idx >= total) return; if (idx >= total) return;
int token = idx / hidden; long long token = idx / hidden;
int dim = idx % hidden; int dim = (int)(idx % hidden);
int expert_stride = num_tokens * hidden; // stride between experts in expert_out long long expert_stride = (long long)num_tokens * hidden; // stride between experts in expert_out
float sum = 0.0f; float sum = 0.0f;
for (int k = 0; k < top_k; k++) { for (int k = 0; k < top_k; k++) {
@@ -196,9 +205,9 @@ void launch_moe_replicate_bf16(
int num_tokens, int hidden, int local_experts, int num_tokens, int hidden, int local_experts,
void* stream void* stream
) { ) {
int total = local_experts * num_tokens * hidden; long long total = (long long)local_experts * num_tokens * hidden;
int block = 256; int block = 256;
int grid = (total + block - 1) / block; int grid = (int)((total + block - 1) / block);
moe_replicate_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>( moe_replicate_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
(const __nv_bfloat16*)x, (__nv_bfloat16*)x_rep, (const __nv_bfloat16*)x, (__nv_bfloat16*)x_rep,
num_tokens, hidden, local_experts num_tokens, hidden, local_experts
@@ -211,9 +220,9 @@ void launch_moe_bias_add_3d_bf16(
int batch, int num_tokens, int dim, int batch, int num_tokens, int dim,
void* stream void* stream
) { ) {
int total = batch * num_tokens * dim; long long total = (long long)batch * num_tokens * dim;
int block = 256; int block = 256;
int grid = (total + block - 1) / block; int grid = (int)((total + block - 1) / block);
moe_bias_add_3d_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>( moe_bias_add_3d_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
(__nv_bfloat16*)x, (const __nv_bfloat16*)bias, (__nv_bfloat16*)x, (const __nv_bfloat16*)bias,
batch, num_tokens, dim batch, num_tokens, dim
@@ -229,9 +238,9 @@ void launch_moe_weighted_sum_bf16(
int expert_start, int local_experts, int expert_start, int local_experts,
void* stream void* stream
) { ) {
int total = num_tokens * hidden; long long total = (long long)num_tokens * hidden;
int block = 256; int block = 256;
int grid = (total + block - 1) / block; int grid = (int)((total + block - 1) / block);
moe_weighted_sum_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>( moe_weighted_sum_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
(const __nv_bfloat16*)expert_out, (const __nv_bfloat16*)expert_out,
(const int*)topk_ids, (const float*)topk_weights, (const int*)topk_ids, (const float*)topk_weights,