The dense MoE kernels (moe_replicate, moe_bias_add_3d, moe_weighted_sum) computed total / expert_stride / element indices in int32. gpt-oss prefill runs the whole prompt through the dense path unchunked (SPARSE_MAX_TOKENS=8), so local_experts*num_tokens*hidden (and batch*num_tokens*dim, and local_id*expert_stride) overflow int32 at ~3.6k-23k prefill tokens (TP-dependent) — well inside the supported context window. The launch then fails silently because CUDA_CHECK_LAST_ERROR was ((void)0) under NDEBUG, so the bias / weighted-sum simply never runs and the forward pass is corrupted with no error reported. Fix: switch the three kernels and their launchers to long long, mirroring the (long long) indexing already used in moe_sparse.cu. Also make CUDA_CHECK_LAST_ERROR always-on — cudaGetLastError does not sync, so the per-launch host cost is negligible, and a silent launch failure is exactly the class of bug this one was. Verified on dash5 (RTX 5090): a direct kernel test at 2.21B elements (>2^31) for both moe_replicate and moe_bias_add_3d produces correct results with no launch error; bench-gpt-oss TP=2 holds at 5.9ms TPOT, output unchanged. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
66 lines
2.1 KiB
Plaintext
66 lines
2.1 KiB
Plaintext
#pragma once
|
|
#include <cuda_bf16.h>
|
|
|
|
// --- Warp-level reductions (no shared memory needed) ---
|
|
|
|
__device__ __forceinline__ float warp_reduce_sum(float val) {
|
|
#pragma unroll
|
|
for (int offset = 16; offset > 0; offset >>= 1)
|
|
val += __shfl_down_sync(0xffffffff, val, offset);
|
|
return val;
|
|
}
|
|
|
|
__device__ __forceinline__ float warp_reduce_max(float val) {
|
|
#pragma unroll
|
|
for (int offset = 16; offset > 0; offset >>= 1)
|
|
val = fmaxf(val, __shfl_down_sync(0xffffffff, val, offset));
|
|
return val;
|
|
}
|
|
|
|
// --- Block-level reductions ---
|
|
|
|
__device__ __forceinline__ float block_reduce_sum(float val) {
|
|
__shared__ float shared[32];
|
|
int lane = threadIdx.x & 31;
|
|
int warp_id = threadIdx.x >> 5;
|
|
int num_warps = (blockDim.x + 31) >> 5;
|
|
|
|
val = warp_reduce_sum(val);
|
|
if (lane == 0) shared[warp_id] = val;
|
|
__syncthreads();
|
|
|
|
val = (threadIdx.x < num_warps) ? shared[threadIdx.x] : 0.0f;
|
|
if (warp_id == 0) val = warp_reduce_sum(val);
|
|
return val;
|
|
}
|
|
|
|
__device__ __forceinline__ float block_reduce_max(float val) {
|
|
__shared__ float shared[32];
|
|
int lane = threadIdx.x & 31;
|
|
int warp_id = threadIdx.x >> 5;
|
|
int num_warps = (blockDim.x + 31) >> 5;
|
|
|
|
val = warp_reduce_max(val);
|
|
if (lane == 0) shared[warp_id] = val;
|
|
__syncthreads();
|
|
|
|
val = (threadIdx.x < num_warps) ? shared[threadIdx.x] : -INFINITY;
|
|
if (warp_id == 0) val = warp_reduce_max(val);
|
|
return val;
|
|
}
|
|
|
|
// --- Launch error checking ---
|
|
// Always on, including release builds. A launch with an invalid config
|
|
// (e.g. 32-bit overflow in grid/index math) is otherwise silent and produces
|
|
// garbage with no clue — the MoE int32-overflow bug was found exactly because
|
|
// release swallowed the launch failure. `cudaGetLastError()` does not
|
|
// synchronize the stream, so the per-launch host cost is negligible.
|
|
#include <cstdio>
|
|
#define CUDA_CHECK_LAST_ERROR() do { \
|
|
cudaError_t err = cudaGetLastError(); \
|
|
if (err != cudaSuccess) { \
|
|
fprintf(stderr, "CUDA kernel launch error at %s:%d: %s\n", \
|
|
__FILE__, __LINE__, cudaGetErrorString(err)); \
|
|
} \
|
|
} while(0)
|