Files
xserv/csrc/normalization/rmsnorm.cu
Gahow Wang c8e8153702 phase 4: transformer core kernels
CUDA kernels (csrc/):
- common.cuh: shared warp_reduce_sum/max, block_reduce_sum/max
- normalization/rmsnorm.cu: RMSNorm (F32 + BF16)
- normalization/layernorm.cu: LayerNorm with Welford (F32 + BF16)
- activation/activations.cu: GELU tanh-approx + SiLU (F32 + BF16)
- reduce/softmax.cu: safe softmax, 3-pass (F32 + BF16)
- embedding/embedding.cu: gather lookup (F32 + BF16)
- embedding/rope.cu: RoPE in-place + precomputed cos/sin cache (F32 + BF16)

Rust wrappers (xserv-kernels/src/):
- rmsnorm.rs, layernorm.rs, activation.rs, softmax.rs, embedding.rs, rope.rs
- RopeCache struct with GPU-side precomputation

Tests: 12 new tests (ops_test.rs), all passing with good precision:
- F32: max_err 1e-6 ~ 1e-9
- BF16: max_err 2e-3 ~ 7e-3
Total: 29 kernel tests + 27 prior = 56 tests passing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-21 21:07:24 +08:00

84 lines
2.5 KiB
Plaintext

#include "../common.cuh"
// RMSNorm: y[i] = x[i] * rsqrt(mean(x²) + eps) * gamma[i]
// Each block processes one row of shape [hidden_size].
__global__ void rmsnorm_f32(
const float* __restrict__ x,
const float* __restrict__ gamma,
float* __restrict__ out,
int hidden_size, float eps
) {
int row = blockIdx.x;
const float* x_row = x + row * hidden_size;
float* out_row = out + row * hidden_size;
float sum_sq = 0.0f;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float v = x_row[i];
sum_sq += v * v;
}
sum_sq = block_reduce_sum(sum_sq);
__shared__ float s_rms_inv;
if (threadIdx.x == 0) {
s_rms_inv = rsqrtf(sum_sq / hidden_size + eps);
}
__syncthreads();
float rms_inv = s_rms_inv;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
out_row[i] = x_row[i] * rms_inv * gamma[i];
}
}
__global__ void rmsnorm_bf16(
const __nv_bfloat16* __restrict__ x,
const __nv_bfloat16* __restrict__ gamma,
__nv_bfloat16* __restrict__ out,
int hidden_size, float eps
) {
int row = blockIdx.x;
const __nv_bfloat16* x_row = x + row * hidden_size;
__nv_bfloat16* out_row = out + row * hidden_size;
float sum_sq = 0.0f;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float v = __bfloat162float(x_row[i]);
sum_sq += v * v;
}
sum_sq = block_reduce_sum(sum_sq);
__shared__ float s_rms_inv;
if (threadIdx.x == 0) {
s_rms_inv = rsqrtf(sum_sq / hidden_size + eps);
}
__syncthreads();
float rms_inv = s_rms_inv;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float v = __bfloat162float(x_row[i]);
float g = __bfloat162float(gamma[i]);
out_row[i] = __float2bfloat16(v * rms_inv * g);
}
}
extern "C" {
void launch_rmsnorm_f32(const void* x, const void* gamma, void* out,
int rows, int hidden_size, float eps, void* stream) {
int block = (hidden_size < 1024) ? hidden_size : 1024;
rmsnorm_f32<<<rows, block, 0, (cudaStream_t)stream>>>(
(const float*)x, (const float*)gamma, (float*)out, hidden_size, eps);
}
void launch_rmsnorm_bf16(const void* x, const void* gamma, void* out,
int rows, int hidden_size, float eps, void* stream) {
int block = (hidden_size < 1024) ? hidden_size : 1024;
rmsnorm_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
(const __nv_bfloat16*)x, (const __nv_bfloat16*)gamma,
(__nv_bfloat16*)out, hidden_size, eps);
}
}