cuda: fix int32 overflow in MoE dense kernels; surface launch errors in release
The dense MoE kernels (moe_replicate, moe_bias_add_3d, moe_weighted_sum) computed total / expert_stride / element indices in int32. gpt-oss prefill runs the whole prompt through the dense path unchunked (SPARSE_MAX_TOKENS=8), so local_experts*num_tokens*hidden (and batch*num_tokens*dim, and local_id*expert_stride) overflow int32 at ~3.6k-23k prefill tokens (TP-dependent) — well inside the supported context window. The launch then fails silently because CUDA_CHECK_LAST_ERROR was ((void)0) under NDEBUG, so the bias / weighted-sum simply never runs and the forward pass is corrupted with no error reported. Fix: switch the three kernels and their launchers to long long, mirroring the (long long) indexing already used in moe_sparse.cu. Also make CUDA_CHECK_LAST_ERROR always-on — cudaGetLastError does not sync, so the per-launch host cost is negligible, and a silent launch failure is exactly the class of bug this one was. Verified on dash5 (RTX 5090): a direct kernel test at 2.21B elements (>2^31) for both moe_replicate and moe_bias_add_3d produces correct results with no launch error; bench-gpt-oss TP=2 holds at 5.9ms TPOT, output unchanged. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -49,10 +49,12 @@ __device__ __forceinline__ float block_reduce_max(float val) {
|
|||||||
return val;
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Launch error checking (debug builds only) ---
|
// --- Launch error checking ---
|
||||||
#ifdef NDEBUG
|
// Always on, including release builds. A launch with an invalid config
|
||||||
#define CUDA_CHECK_LAST_ERROR() ((void)0)
|
// (e.g. 32-bit overflow in grid/index math) is otherwise silent and produces
|
||||||
#else
|
// garbage with no clue — the MoE int32-overflow bug was found exactly because
|
||||||
|
// release swallowed the launch failure. `cudaGetLastError()` does not
|
||||||
|
// synchronize the stream, so the per-launch host cost is negligible.
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#define CUDA_CHECK_LAST_ERROR() do { \
|
#define CUDA_CHECK_LAST_ERROR() do { \
|
||||||
cudaError_t err = cudaGetLastError(); \
|
cudaError_t err = cudaGetLastError(); \
|
||||||
@@ -61,4 +63,3 @@ __device__ __forceinline__ float block_reduce_max(float val) {
|
|||||||
__FILE__, __LINE__, cudaGetErrorString(err)); \
|
__FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||||
} \
|
} \
|
||||||
} while(0)
|
} while(0)
|
||||||
#endif
|
|
||||||
|
|||||||
@@ -89,13 +89,17 @@ __global__ void moe_replicate_bf16_kernel(
|
|||||||
__nv_bfloat16* __restrict__ x_rep,
|
__nv_bfloat16* __restrict__ x_rep,
|
||||||
int num_tokens, int hidden, int local_experts
|
int num_tokens, int hidden, int local_experts
|
||||||
) {
|
) {
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
// 64-bit index: local_experts * num_tokens * hidden overflows int32 at
|
||||||
int total = local_experts * num_tokens * hidden;
|
// ~2.3k prefill tokens (gpt-oss TP=1, 32 experts), which is inside the
|
||||||
|
// supported context window. A 32-bit `total` silently wraps, the launch
|
||||||
|
// fails, and (in release) the error is invisible — see common.cuh.
|
||||||
|
long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
long long total = (long long)local_experts * num_tokens * hidden;
|
||||||
if (idx >= total) return;
|
if (idx >= total) return;
|
||||||
|
|
||||||
int remainder = idx % (num_tokens * hidden);
|
|
||||||
// x_rep[expert, token, dim] = x[token, dim]
|
// x_rep[expert, token, dim] = x[token, dim]
|
||||||
x_rep[idx] = x[remainder];
|
long long row_stride = (long long)num_tokens * hidden;
|
||||||
|
x_rep[idx] = x[idx % row_stride];
|
||||||
}
|
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
@@ -112,13 +116,16 @@ __global__ void moe_bias_add_3d_bf16_kernel(
|
|||||||
const __nv_bfloat16* __restrict__ bias,
|
const __nv_bfloat16* __restrict__ bias,
|
||||||
int batch, int num_tokens, int dim
|
int batch, int num_tokens, int dim
|
||||||
) {
|
) {
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
// 64-bit index: batch * num_tokens * dim overflows int32 at ~3.6k prefill
|
||||||
int total = batch * num_tokens * dim;
|
// tokens (gpt-oss TP=1, 32 experts, 2*intermediate dim) — see moe_replicate.
|
||||||
|
long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
long long total = (long long)batch * num_tokens * dim;
|
||||||
if (idx >= total) return;
|
if (idx >= total) return;
|
||||||
|
|
||||||
int b = idx / (num_tokens * dim);
|
long long td = (long long)num_tokens * dim;
|
||||||
int d = idx % dim;
|
int b = (int)(idx / td); // < batch (small)
|
||||||
float v = __bfloat162float(x[idx]) + __bfloat162float(bias[b * dim + d]);
|
int d = (int)(idx % dim); // < dim
|
||||||
|
float v = __bfloat162float(x[idx]) + __bfloat162float(bias[(long long)b * dim + d]);
|
||||||
x[idx] = __float2bfloat16(v);
|
x[idx] = __float2bfloat16(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,14 +158,16 @@ __global__ void moe_weighted_sum_bf16_kernel(
|
|||||||
int num_tokens, int hidden, int top_k,
|
int num_tokens, int hidden, int top_k,
|
||||||
int expert_start, int local_experts
|
int expert_start, int local_experts
|
||||||
) {
|
) {
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
// 64-bit index: `local_id * expert_stride` overflows int32 for long prefills
|
||||||
int total = num_tokens * hidden;
|
// (expert_stride = num_tokens * hidden), reading the wrong expert element.
|
||||||
|
long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
long long total = (long long)num_tokens * hidden;
|
||||||
if (idx >= total) return;
|
if (idx >= total) return;
|
||||||
|
|
||||||
int token = idx / hidden;
|
long long token = idx / hidden;
|
||||||
int dim = idx % hidden;
|
int dim = (int)(idx % hidden);
|
||||||
|
|
||||||
int expert_stride = num_tokens * hidden; // stride between experts in expert_out
|
long long expert_stride = (long long)num_tokens * hidden; // stride between experts in expert_out
|
||||||
|
|
||||||
float sum = 0.0f;
|
float sum = 0.0f;
|
||||||
for (int k = 0; k < top_k; k++) {
|
for (int k = 0; k < top_k; k++) {
|
||||||
@@ -196,9 +205,9 @@ void launch_moe_replicate_bf16(
|
|||||||
int num_tokens, int hidden, int local_experts,
|
int num_tokens, int hidden, int local_experts,
|
||||||
void* stream
|
void* stream
|
||||||
) {
|
) {
|
||||||
int total = local_experts * num_tokens * hidden;
|
long long total = (long long)local_experts * num_tokens * hidden;
|
||||||
int block = 256;
|
int block = 256;
|
||||||
int grid = (total + block - 1) / block;
|
int grid = (int)((total + block - 1) / block);
|
||||||
moe_replicate_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
|
moe_replicate_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
|
||||||
(const __nv_bfloat16*)x, (__nv_bfloat16*)x_rep,
|
(const __nv_bfloat16*)x, (__nv_bfloat16*)x_rep,
|
||||||
num_tokens, hidden, local_experts
|
num_tokens, hidden, local_experts
|
||||||
@@ -211,9 +220,9 @@ void launch_moe_bias_add_3d_bf16(
|
|||||||
int batch, int num_tokens, int dim,
|
int batch, int num_tokens, int dim,
|
||||||
void* stream
|
void* stream
|
||||||
) {
|
) {
|
||||||
int total = batch * num_tokens * dim;
|
long long total = (long long)batch * num_tokens * dim;
|
||||||
int block = 256;
|
int block = 256;
|
||||||
int grid = (total + block - 1) / block;
|
int grid = (int)((total + block - 1) / block);
|
||||||
moe_bias_add_3d_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
|
moe_bias_add_3d_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
|
||||||
(__nv_bfloat16*)x, (const __nv_bfloat16*)bias,
|
(__nv_bfloat16*)x, (const __nv_bfloat16*)bias,
|
||||||
batch, num_tokens, dim
|
batch, num_tokens, dim
|
||||||
@@ -229,9 +238,9 @@ void launch_moe_weighted_sum_bf16(
|
|||||||
int expert_start, int local_experts,
|
int expert_start, int local_experts,
|
||||||
void* stream
|
void* stream
|
||||||
) {
|
) {
|
||||||
int total = num_tokens * hidden;
|
long long total = (long long)num_tokens * hidden;
|
||||||
int block = 256;
|
int block = 256;
|
||||||
int grid = (total + block - 1) / block;
|
int grid = (int)((total + block - 1) / block);
|
||||||
moe_weighted_sum_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
|
moe_weighted_sum_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
|
||||||
(const __nv_bfloat16*)expert_out,
|
(const __nv_bfloat16*)expert_out,
|
||||||
(const int*)topk_ids, (const float*)topk_weights,
|
(const int*)topk_ids, (const float*)topk_weights,
|
||||||
|
|||||||
Reference in New Issue
Block a user