phase 4: transformer core kernels

CUDA kernels (csrc/): - common.cuh: shared warp_reduce_sum/max, block_reduce_sum/max - normalization/rmsnorm.cu: RMSNorm (F32 + BF16) - normalization/layernorm.cu: LayerNorm with Welford (F32 + BF16) - activation/activations.cu: GELU tanh-approx + SiLU (F32 + BF16) - reduce/softmax.cu: safe softmax, 3-pass (F32 + BF16) - embedding/embedding.cu: gather lookup (F32 + BF16) - embedding/rope.cu: RoPE in-place + precomputed cos/sin cache (F32 + BF16) Rust wrappers (xserv-kernels/src/): - rmsnorm.rs, layernorm.rs, activation.rs, softmax.rs, embedding.rs, rope.rs - RopeCache struct with GPU-side precomputation Tests: 12 new tests (ops_test.rs), all passing with good precision: - F32: max_err 1e-6 ~ 1e-9 - BF16: max_err 2e-3 ~ 7e-3 Total: 29 kernel tests + 27 prior = 56 tests passing Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-21 21:07:24 +08:00
parent 51a0f2eb14
commit c8e8153702
17 changed files with 1402 additions and 3 deletions
--- a/csrc/embedding/embedding.cu
+++ b/csrc/embedding/embedding.cu
@@ -0,0 +1,55 @@
+#include <cuda_bf16.h>
+
+// Embedding lookup: out[seq_idx] = table[token_ids[seq_idx]]
+// Grid: one block per token (gridDim.x = num_tokens). Block: its threads stride over that token's hidden_size elements.
+
+// Copies one embedding row per block:
+//   out[blockIdx.x, :] = table[token_ids[blockIdx.x], :]
+//
+// Launch layout: gridDim.x must equal the number of tokens (one block per
+// token). Any blockDim.x >= 1 works because the threads of a block stride
+// over the row. No shared memory is used.
+//
+// Preconditions (not checked here, since the vocabulary size is not passed
+// in): every entry of token_ids must be a valid row index into `table`.
+__global__ void embedding_f32(
+    const float* __restrict__ table,    // [vocab_size, hidden_size]
+    const int* __restrict__ token_ids,  // [num_tokens]
+    float* __restrict__ out,            // [num_tokens, hidden_size]
+    int hidden_size
+) {
+    const int token_idx = blockIdx.x;
+    const int tid = token_ids[token_idx];
+
+    // Widen to 64 bits before multiplying: row_index * hidden_size can exceed
+    // INT_MAX for large vocabularies or long token batches, and a wrapped
+    // 32-bit product would address the wrong memory.
+    const float* row = table + (long long)tid * hidden_size;
+    float* dst = out + (long long)token_idx * hidden_size;
+
+    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+        dst[i] = row[i];
+    }
+}
+
+// BF16 variant of the embedding gather:
+//   out[blockIdx.x, :] = table[token_ids[blockIdx.x], :]
+//
+// Launch layout: gridDim.x must equal the number of tokens (one block per
+// token). Any blockDim.x >= 1 works because the threads of a block stride
+// over the row. No shared memory is used.
+//
+// Preconditions (not checked here, since the vocabulary size is not passed
+// in): every entry of token_ids must be a valid row index into `table`.
+__global__ void embedding_bf16(
+    const __nv_bfloat16* __restrict__ table,  // rows of hidden_size elements
+    const int* __restrict__ token_ids,        // one row index per block
+    __nv_bfloat16* __restrict__ out,          // rows of hidden_size elements
+    int hidden_size
+) {
+    const int token_idx = blockIdx.x;
+    const int tid = token_ids[token_idx];
+
+    // Widen to 64 bits before multiplying: row_index * hidden_size can exceed
+    // INT_MAX for large vocabularies or long token batches, and a wrapped
+    // 32-bit product would address the wrong memory.
+    const __nv_bfloat16* row = table + (long long)tid * hidden_size;
+    __nv_bfloat16* dst = out + (long long)token_idx * hidden_size;
+
+    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+        dst[i] = row[i];
+    }
+}
+
+extern "C" {
+
+// Enqueues embedding_f32 on `stream` (C ABI entry point).
+//
+//   table     : device-accessible float buffer, read as rows of hidden_size
+//   token_ids : device-accessible int buffer with num_tokens entries
+//   out       : device-accessible float buffer, num_tokens rows of hidden_size
+//   stream    : a cudaStream_t passed as an opaque pointer
+//
+// One block is launched per token, with min(hidden_size, 256) threads.
+// The launch is asynchronous and this function reports no status; callers
+// that need one must query the CUDA runtime themselves.
+void launch_embedding_f32(const void* table, const void* token_ids, void* out,
+                          int num_tokens, int hidden_size, void* stream) {
+    // A grid or block dimension of zero is an invalid launch configuration,
+    // so an empty batch (or zero-width rows) is treated as a no-op instead.
+    if (num_tokens <= 0 || hidden_size <= 0) return;
+
+    int block = (hidden_size < 256) ? hidden_size : 256;
+    embedding_f32<<<num_tokens, block, 0, (cudaStream_t)stream>>>(
+        (const float*)table, (const int*)token_ids, (float*)out, hidden_size);
+}
+
+// Enqueues embedding_bf16 on `stream` (C ABI entry point).
+//
+//   table     : device-accessible __nv_bfloat16 buffer, rows of hidden_size
+//   token_ids : device-accessible int buffer with num_tokens entries
+//   out       : device-accessible __nv_bfloat16 buffer, num_tokens rows
+//   stream    : a cudaStream_t passed as an opaque pointer
+//
+// One block is launched per token, with min(hidden_size, 256) threads.
+// The launch is asynchronous and this function reports no status; callers
+// that need one must query the CUDA runtime themselves.
+void launch_embedding_bf16(const void* table, const void* token_ids, void* out,
+                           int num_tokens, int hidden_size, void* stream) {
+    // A grid or block dimension of zero is an invalid launch configuration,
+    // so an empty batch (or zero-width rows) is treated as a no-op instead.
+    if (num_tokens <= 0 || hidden_size <= 0) return;
+
+    int block = (hidden_size < 256) ? hidden_size : 256;
+    embedding_bf16<<<num_tokens, block, 0, (cudaStream_t)stream>>>(
+        (const __nv_bfloat16*)table, (const int*)token_ids,
+        (__nv_bfloat16*)out, hidden_size);
+}
+
+}
--- a/csrc/embedding/rope.cu
+++ b/csrc/embedding/rope.cu
@@ -0,0 +1,116 @@
+#include <cuda_bf16.h>
+#include <math.h>
+
+// RoPE: Rotary Position Embedding
+// For each pair (x[2i], x[2i+1]) at position `pos`:
+//   y[2i]   = x[2i] * cos - x[2i+1] * sin
+//   y[2i+1] = x[2i] * sin + x[2i+1] * cos
+// where cos/sin come from precomputed cos_cache/sin_cache.
+//
+// cos_cache[pos][i] = cos(pos * freq[i])
+// sin_cache[pos][i] = sin(pos * freq[i])
+// freq[i] = 1.0 / (theta ^ (2i / head_dim))
+
+// Apply RoPE in-place to Q or K tensor.
+// x shape: [num_tokens, num_heads, head_dim]
+// cos_cache, sin_cache shape: [max_seq_len, head_dim/2]
+// positions: [num_tokens] — the position index for each token
+
+// Rotates the adjacent element pairs (x[2i], x[2i+1]) of one head in place:
+//   y[2i]   = x[2i] * cos - x[2i+1] * sin
+//   y[2i+1] = x[2i] * sin + x[2i+1] * cos
+//
+// Launch layout: blockIdx.x selects the token and blockIdx.y the head, so the
+// grid must be (num_tokens, num_heads). Threads of a block stride over the
+// head_dim / 2 pairs, so any blockDim.x >= 1 is valid. No shared memory.
+// If head_dim is odd, the trailing element is left untouched.
+//
+// Precondition (not checked here, since the cache length is not passed in):
+// positions[token] must be a valid row of cos_cache / sin_cache.
+__global__ void rope_f32(
+    float* __restrict__ x,                    // [num_tokens, num_heads, head_dim]
+    const float* __restrict__ cos_cache,      // [max_seq_len, half_dim]
+    const float* __restrict__ sin_cache,      // [max_seq_len, half_dim]
+    const int* __restrict__ positions,        // [num_tokens]
+    int num_heads, int head_dim
+) {
+    const int token_idx = blockIdx.x;
+    const int head_idx = blockIdx.y;
+    const int half_dim = head_dim / 2;
+
+    // Both offsets are uniform across the block. They are computed in 64 bits
+    // because token * heads * head_dim can exceed INT_MAX for long batches,
+    // and a wrapped 32-bit product would address the wrong memory.
+    const long long cache_row = (long long)positions[token_idx] * half_dim;
+    const long long base =
+        ((long long)token_idx * num_heads + head_idx) * head_dim;
+
+    // Each pair is owned by exactly one thread, so the in-place update needs
+    // no synchronization.
+    for (int pair_idx = threadIdx.x; pair_idx < half_dim; pair_idx += blockDim.x) {
+        const float cos_val = cos_cache[cache_row + pair_idx];
+        const float sin_val = sin_cache[cache_row + pair_idx];
+
+        const long long even = base + 2 * pair_idx;
+        const float x0 = x[even];
+        const float x1 = x[even + 1];
+
+        x[even]     = x0 * cos_val - x1 * sin_val;
+        x[even + 1] = x0 * sin_val + x1 * cos_val;
+    }
+}
+
+// BF16 variant of the in-place rotary embedding. Each pair (x[2i], x[2i+1])
+// is converted to float, rotated with the float cos/sin tables, and rounded
+// back to __nv_bfloat16:
+//   y[2i]   = x[2i] * cos - x[2i+1] * sin
+//   y[2i+1] = x[2i] * sin + x[2i+1] * cos
+//
+// Launch layout: blockIdx.x selects the token and blockIdx.y the head, so the
+// grid must be (num_tokens, num_heads). Threads of a block stride over the
+// head_dim / 2 pairs, so any blockDim.x >= 1 is valid. No shared memory.
+// If head_dim is odd, the trailing element is left untouched.
+//
+// Precondition (not checked here, since the cache length is not passed in):
+// positions[token] must be a valid row of cos_cache / sin_cache.
+__global__ void rope_bf16(
+    __nv_bfloat16* __restrict__ x,        // indexed as [token, head, head_dim]
+    const float* __restrict__ cos_cache,  // indexed as [position, half_dim]
+    const float* __restrict__ sin_cache,  // indexed as [position, half_dim]
+    const int* __restrict__ positions,    // one position per token
+    int num_heads, int head_dim
+) {
+    const int token_idx = blockIdx.x;
+    const int head_idx = blockIdx.y;
+    const int half_dim = head_dim / 2;
+
+    // Both offsets are uniform across the block. They are computed in 64 bits
+    // because token * heads * head_dim can exceed INT_MAX for long batches,
+    // and a wrapped 32-bit product would address the wrong memory.
+    const long long cache_row = (long long)positions[token_idx] * half_dim;
+    const long long base =
+        ((long long)token_idx * num_heads + head_idx) * head_dim;
+
+    // Each pair is owned by exactly one thread, so the in-place update needs
+    // no synchronization.
+    for (int pair_idx = threadIdx.x; pair_idx < half_dim; pair_idx += blockDim.x) {
+        const float cos_val = cos_cache[cache_row + pair_idx];
+        const float sin_val = sin_cache[cache_row + pair_idx];
+
+        const long long even = base + 2 * pair_idx;
+        const float x0 = __bfloat162float(x[even]);
+        const float x1 = __bfloat162float(x[even + 1]);
+
+        x[even]     = __float2bfloat16(x0 * cos_val - x1 * sin_val);
+        x[even + 1] = __float2bfloat16(x0 * sin_val + x1 * cos_val);
+    }
+}
+
+// Precompute cos/sin cache on GPU
+// Fills the RoPE lookup tables: block `blockIdx.x` writes the row for that
+// position, and thread `threadIdx.x` writes the entry for one frequency:
+//   freq  = 1 / theta^(2i / (2 * half_dim))
+//   cos_cache[pos][i] = cos(pos * freq),  sin_cache[pos][i] = sin(pos * freq)
+//
+// Launch layout: one block per position and at least half_dim threads per
+// block; surplus threads exit without writing. `max_seq_len` is accepted but
+// not read by the kernel body.
+__global__ void compute_rope_cache(
+    float* __restrict__ cos_cache,  // [max_seq_len, half_dim]
+    float* __restrict__ sin_cache,
+    int max_seq_len, int half_dim, float theta
+) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    if (col < half_dim) {
+        // Exponent is 2i / head_dim, with head_dim written as 2 * half_dim.
+        const float exponent = (float)(2 * col) / (float)(2 * half_dim);
+        const float inv_freq = 1.0f / powf(theta, exponent);
+        const float phase = (float)row * inv_freq;
+
+        const int slot = row * half_dim + col;
+        cos_cache[slot] = cosf(phase);
+        sin_cache[slot] = sinf(phase);
+    }
+}
+
+extern "C" {
+
+// Enqueues rope_f32 on `stream` (C ABI entry point).
+//
+//   x                    : device-accessible float buffer, rotated in place
+//   cos_cache, sin_cache : device-accessible float tables
+//   positions            : device-accessible int buffer, num_tokens entries
+//   stream               : a cudaStream_t passed as an opaque pointer
+//
+// Launches a (num_tokens, num_heads) grid with head_dim / 2 threads per
+// block, i.e. one thread per rotated pair. The launch is asynchronous and
+// this function reports no status.
+void launch_rope_f32(void* x, const void* cos_cache, const void* sin_cache,
+                     const void* positions, int num_tokens, int num_heads,
+                     int head_dim, void* stream) {
+    // A zero grid or block dimension is an invalid launch configuration.
+    // With no tokens, no heads, or no complete pair there is nothing to
+    // rotate, so return without launching.
+    if (num_tokens <= 0 || num_heads <= 0 || head_dim < 2) return;
+
+    dim3 grid(num_tokens, num_heads);
+    int block = head_dim / 2;
+    rope_f32<<<grid, block, 0, (cudaStream_t)stream>>>(
+        (float*)x, (const float*)cos_cache, (const float*)sin_cache,
+        (const int*)positions, num_heads, head_dim);
+}
+
+// Enqueues rope_bf16 on `stream` (C ABI entry point).
+//
+//   x                    : device-accessible __nv_bfloat16 buffer, rotated in place
+//   cos_cache, sin_cache : device-accessible float tables
+//   positions            : device-accessible int buffer, num_tokens entries
+//   stream               : a cudaStream_t passed as an opaque pointer
+//
+// Launches a (num_tokens, num_heads) grid with head_dim / 2 threads per
+// block, i.e. one thread per rotated pair. The launch is asynchronous and
+// this function reports no status.
+void launch_rope_bf16(void* x, const void* cos_cache, const void* sin_cache,
+                      const void* positions, int num_tokens, int num_heads,
+                      int head_dim, void* stream) {
+    // A zero grid or block dimension is an invalid launch configuration.
+    // With no tokens, no heads, or no complete pair there is nothing to
+    // rotate, so return without launching.
+    if (num_tokens <= 0 || num_heads <= 0 || head_dim < 2) return;
+
+    dim3 grid(num_tokens, num_heads);
+    int block = head_dim / 2;
+    rope_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
+        (__nv_bfloat16*)x, (const float*)cos_cache, (const float*)sin_cache,
+        (const int*)positions, num_heads, head_dim);
+}
+
+// Enqueues compute_rope_cache on `stream` (C ABI entry point): one block per
+// position (max_seq_len blocks) and one thread per frequency (half_dim
+// threads). Both output pointers are handed to the kernel as float tables;
+// `stream` is a cudaStream_t passed as an opaque pointer. The launch is
+// asynchronous and this function reports no status.
+void launch_compute_rope_cache(void* cos_cache, void* sin_cache,
+                               int max_seq_len, int half_dim, float theta,
+                               void* stream) {
+    float* cos_out = static_cast<float*>(cos_cache);
+    float* sin_out = static_cast<float*>(sin_cache);
+    cudaStream_t cuda_stream = static_cast<cudaStream_t>(stream);
+
+    const dim3 grid(max_seq_len);
+    const dim3 block(half_dim);
+    compute_rope_cache<<<grid, block, 0, cuda_stream>>>(
+        cos_out, sin_out, max_seq_len, half_dim, theta);
+}
+
+}