Files
xserv/csrc/common.cuh
Gahow Wang cfbd64d206 cuda: fix int32 overflow in MoE dense kernels; surface launch errors in release
The dense MoE kernels (moe_replicate, moe_bias_add_3d, moe_weighted_sum)
computed total / expert_stride / element indices in int32. gpt-oss prefill
runs the whole prompt through the dense path unchunked (SPARSE_MAX_TOKENS=8),
so local_experts*num_tokens*hidden (and batch*num_tokens*dim, and
local_id*expert_stride) overflow int32 at ~3.6k-23k prefill tokens
(TP-dependent) — well inside the supported context window. The launch then
fails silently because CUDA_CHECK_LAST_ERROR was ((void)0) under NDEBUG, so
the bias / weighted-sum simply never runs and the forward pass is corrupted
with no error reported.

Fix: switch the three kernels and their launchers to long long, mirroring the
(long long) indexing already used in moe_sparse.cu. Also make
CUDA_CHECK_LAST_ERROR always-on — cudaGetLastError does not sync, so the
per-launch host cost is negligible, and a silent launch failure is exactly
the class of bug this one was.

Verified on dash5 (RTX 5090): a direct kernel test at 2.21B elements (>2^31)
for both moe_replicate and moe_bias_add_3d produces correct results with no
launch error; bench-gpt-oss TP=2 holds at 5.9ms TPOT, output unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-01 12:37:21 +08:00

66 lines
2.1 KiB
Plaintext

#pragma once
#include <cuda_bf16.h>
// --- Warp-level reductions (no shared memory needed) ---
__device__ __forceinline__ float warp_reduce_sum(float val) {
#pragma unroll
for (int offset = 16; offset > 0; offset >>= 1)
val += __shfl_down_sync(0xffffffff, val, offset);
return val;
}
__device__ __forceinline__ float warp_reduce_max(float val) {
#pragma unroll
for (int offset = 16; offset > 0; offset >>= 1)
val = fmaxf(val, __shfl_down_sync(0xffffffff, val, offset));
return val;
}
// --- Block-level reductions ---
__device__ __forceinline__ float block_reduce_sum(float val) {
__shared__ float shared[32];
int lane = threadIdx.x & 31;
int warp_id = threadIdx.x >> 5;
int num_warps = (blockDim.x + 31) >> 5;
val = warp_reduce_sum(val);
if (lane == 0) shared[warp_id] = val;
__syncthreads();
val = (threadIdx.x < num_warps) ? shared[threadIdx.x] : 0.0f;
if (warp_id == 0) val = warp_reduce_sum(val);
return val;
}
__device__ __forceinline__ float block_reduce_max(float val) {
__shared__ float shared[32];
int lane = threadIdx.x & 31;
int warp_id = threadIdx.x >> 5;
int num_warps = (blockDim.x + 31) >> 5;
val = warp_reduce_max(val);
if (lane == 0) shared[warp_id] = val;
__syncthreads();
val = (threadIdx.x < num_warps) ? shared[threadIdx.x] : -INFINITY;
if (warp_id == 0) val = warp_reduce_max(val);
return val;
}
// --- Launch error checking ---
// Always on, including release builds. A launch with an invalid config
// (e.g. 32-bit overflow in grid/index math) is otherwise silent and produces
// garbage with no clue — the MoE int32-overflow bug was found exactly because
// release swallowed the launch failure. `cudaGetLastError()` does not
// synchronize the stream, so the per-launch host cost is negligible.
#include <cstdio>
#define CUDA_CHECK_LAST_ERROR() do { \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA kernel launch error at %s:%d: %s\n", \
__FILE__, __LINE__, cudaGetErrorString(err)); \
} \
} while(0)