CUDA layer for the paged-KV + swap work: - csrc: new paged_attention.cu plus updates across attention/gemm/norm/activation/embedding/reduce kernels and common.cuh. - xserv-kernels: new dispatch module and kernel-binding updates. - xserv-cuda: cudaMallocHost/FreeHost bindings + PinnedBuffer (host swap pool backing) and offset-aware D2H/H2D copies used to move KV blocks between the GPU pool and pinned host memory. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
121 lines
4.4 KiB
Plaintext
121 lines
4.4 KiB
Plaintext
#include <cuda_bf16.h>
|
|
#include <math.h>
|
|
#include "../common.cuh"
|
|
|
|
// RoPE: Rotary Position Embedding, using the Qwen/Llama rotate_half layout.
|
|
// For each dimension i in the first half at position `pos`:
|
|
// y[i] = x[i] * cos - x[i + half_dim] * sin
|
|
// y[i + half_dim] = x[i + half_dim] * cos + x[i] * sin
|
|
// where cos/sin come from precomputed cos_cache/sin_cache.
|
|
//
|
|
// cos_cache[pos][i] = cos(pos * freq[i])
|
|
// sin_cache[pos][i] = sin(pos * freq[i])
|
|
// freq[i] = 1.0 / (theta ^ (2i / head_dim))
|
|
|
|
// Apply RoPE in-place to Q or K tensor.
|
|
// x shape: [num_tokens, num_heads, head_dim]
|
|
// cos_cache, sin_cache shape: [max_seq_len, head_dim/2]
|
|
// positions: [num_tokens] — the position index for each token
|
|
|
|
// rope_f32: applies the rotate_half RoPE rotation in place to an FP32 tensor.
//
// Launch layout: blockIdx.x selects the token and blockIdx.y selects the head.
// The threads of a block stride over the head_dim / 2 rotation pairs, so every
// pair is processed for any blockDim.x >= 1 (one pair per thread when
// blockDim.x == head_dim / 2). No shared memory is used.
//
// Parameters:
//   x          [num_tokens, num_heads, head_dim], rotated in place
//   cos_cache  [max_seq_len, head_dim / 2]
//   sin_cache  [max_seq_len, head_dim / 2]
//   positions  [num_tokens], row of the cos/sin caches to use for each token
//
// Preconditions (not checked here): positions[token] must index a valid row of
// both caches. When head_dim is odd the last element of each head is untouched.
__global__ void rope_f32(
    float* __restrict__ x,                 // [num_tokens, num_heads, head_dim]
    const float* __restrict__ cos_cache,   // [max_seq_len, half_dim]
    const float* __restrict__ sin_cache,   // [max_seq_len, half_dim]
    const int* __restrict__ positions,     // [num_tokens]
    int num_heads, int head_dim
) {
    const int half_dim = head_dim / 2;
    const size_t token = blockIdx.x;
    const size_t head = blockIdx.y;

    // Offsets are computed in 64 bits: token * num_heads * head_dim exceeds the
    // range of a 32-bit int for large batches / long sequences.
    const size_t base = (token * (size_t)num_heads + head) * (size_t)head_dim;
    const long long cache_row = (long long)positions[token] * (long long)half_dim;

    // Block-stride loop over the rotation pairs of this (token, head).
    for (int pair = threadIdx.x; pair < half_dim; pair += blockDim.x) {
        const float c = cos_cache[cache_row + pair];
        const float s = sin_cache[cache_row + pair];

        const size_t lo = base + (size_t)pair;       // element i of the first half
        const size_t hi = lo + (size_t)half_dim;     // its partner i + half_dim

        // Read both inputs before writing either output: the update is in place.
        const float x0 = x[lo];
        const float x1 = x[hi];

        x[lo] = x0 * c - x1 * s;
        x[hi] = x1 * c + x0 * s;
    }
}
|
|
|
|
// rope_bf16: applies the rotate_half RoPE rotation in place to a BF16 tensor.
// Each pair is widened to FP32, rotated with the FP32 cos/sin values, and
// converted back to BF16 on store.
//
// Launch layout: blockIdx.x selects the token and blockIdx.y selects the head.
// The threads of a block stride over the head_dim / 2 rotation pairs, so every
// pair is processed for any blockDim.x >= 1 (one pair per thread when
// blockDim.x == head_dim / 2). No shared memory is used.
//
// Parameters:
//   x          [num_tokens, num_heads, head_dim], rotated in place
//   cos_cache  [max_seq_len, head_dim / 2], FP32
//   sin_cache  [max_seq_len, head_dim / 2], FP32
//   positions  [num_tokens], row of the cos/sin caches to use for each token
//
// Preconditions (not checked here): positions[token] must index a valid row of
// both caches. When head_dim is odd the last element of each head is untouched.
__global__ void rope_bf16(
    __nv_bfloat16* __restrict__ x,
    const float* __restrict__ cos_cache,
    const float* __restrict__ sin_cache,
    const int* __restrict__ positions,
    int num_heads, int head_dim
) {
    const int half_dim = head_dim / 2;
    const size_t token = blockIdx.x;
    const size_t head = blockIdx.y;

    // Offsets are computed in 64 bits: token * num_heads * head_dim exceeds the
    // range of a 32-bit int for large batches / long sequences.
    const size_t base = (token * (size_t)num_heads + head) * (size_t)head_dim;
    const long long cache_row = (long long)positions[token] * (long long)half_dim;

    // Block-stride loop over the rotation pairs of this (token, head).
    for (int pair = threadIdx.x; pair < half_dim; pair += blockDim.x) {
        const float c = cos_cache[cache_row + pair];
        const float s = sin_cache[cache_row + pair];

        const size_t lo = base + (size_t)pair;       // element i of the first half
        const size_t hi = lo + (size_t)half_dim;     // its partner i + half_dim

        // Read both inputs before writing either output: the update is in place.
        const float x0 = __bfloat162float(x[lo]);
        const float x1 = __bfloat162float(x[hi]);

        x[lo] = __float2bfloat16(x0 * c - x1 * s);
        x[hi] = __float2bfloat16(x1 * c + x0 * s);
    }
}
|
|
|
|
// Precompute cos/sin cache on GPU
|
|
// compute_rope_cache: fills the RoPE cos/sin lookup tables on the GPU.
//
//   freq[i]           = 1 / theta^(2i / (2 * half_dim))
//   cos_cache[pos][i] = cos(pos * freq[i])
//   sin_cache[pos][i] = sin(pos * freq[i])
//
// Launch layout: blockIdx.x is the position (table row); the threads of a block
// stride over the half_dim columns, so any blockDim.x >= 1 fills a full row.
// Blocks with blockIdx.x >= max_seq_len exit without writing, so a grid larger
// than the table is harmless. No shared memory is used.
//
// Parameters:
//   cos_cache, sin_cache  [max_seq_len, half_dim] output tables
//   max_seq_len           number of rows to fill
//   half_dim              number of columns per row (head_dim / 2)
//   theta                 RoPE base
__global__ void compute_rope_cache(
    float* __restrict__ cos_cache,   // [max_seq_len, half_dim]
    float* __restrict__ sin_cache,
    int max_seq_len, int half_dim, float theta
) {
    const int pos = blockIdx.x;
    // Never write past the last row of the tables.
    if (pos >= max_seq_len) return;

    // 64-bit row offset so pos * half_dim cannot wrap a 32-bit int.
    const size_t row = (size_t)pos * (size_t)half_dim;

    for (int i = threadIdx.x; i < half_dim; i += blockDim.x) {
        const float freq = 1.0f / powf(theta, (float)(2 * i) / (float)(2 * half_dim));
        const float angle = (float)pos * freq;
        cos_cache[row + i] = cosf(angle);
        sin_cache[row + i] = sinf(angle);
    }
}
|
|
|
|
extern "C" {
|
|
|
|
// Host launcher for rope_f32: rotates an FP32 tensor in place on `stream`.
//
//   x          device pointer, [num_tokens, num_heads, head_dim] floats
//   cos_cache  device pointer, [max_seq_len, head_dim / 2] floats
//   sin_cache  device pointer, [max_seq_len, head_dim / 2] floats
//   positions  device pointer, [num_tokens] ints
//   stream     cudaStream_t passed as an opaque pointer
//
// Launches one block per (token, head) with head_dim / 2 threads. The call is
// asynchronous; only launch errors are checked here.
void launch_rope_f32(void* x, const void* cos_cache, const void* sin_cache,
                     const void* positions, int num_tokens, int num_heads,
                     int head_dim, void* stream) {
    const int pairs_per_head = head_dim / 2;

    // Nothing to rotate. A zero-sized grid or block is an invalid launch
    // configuration, so an empty batch must be a no-op rather than an error.
    if (num_tokens <= 0 || num_heads <= 0 || pairs_per_head <= 0) return;

    const dim3 grid(num_tokens, num_heads);
    rope_f32<<<grid, pairs_per_head, 0, (cudaStream_t)stream>>>(
        (float*)x, (const float*)cos_cache, (const float*)sin_cache,
        (const int*)positions, num_heads, head_dim);
    CUDA_CHECK_LAST_ERROR();
}
|
|
|
|
// Host launcher for rope_bf16: rotates a BF16 tensor in place on `stream`.
//
//   x          device pointer, [num_tokens, num_heads, head_dim] __nv_bfloat16
//   cos_cache  device pointer, [max_seq_len, head_dim / 2] floats
//   sin_cache  device pointer, [max_seq_len, head_dim / 2] floats
//   positions  device pointer, [num_tokens] ints
//   stream     cudaStream_t passed as an opaque pointer
//
// Launches one block per (token, head) with head_dim / 2 threads. The call is
// asynchronous; only launch errors are checked here.
void launch_rope_bf16(void* x, const void* cos_cache, const void* sin_cache,
                      const void* positions, int num_tokens, int num_heads,
                      int head_dim, void* stream) {
    const int pairs_per_head = head_dim / 2;

    // Nothing to rotate. A zero-sized grid or block is an invalid launch
    // configuration, so an empty batch must be a no-op rather than an error.
    if (num_tokens <= 0 || num_heads <= 0 || pairs_per_head <= 0) return;

    const dim3 grid(num_tokens, num_heads);
    rope_bf16<<<grid, pairs_per_head, 0, (cudaStream_t)stream>>>(
        (__nv_bfloat16*)x, (const float*)cos_cache, (const float*)sin_cache,
        (const int*)positions, num_heads, head_dim);
    CUDA_CHECK_LAST_ERROR();
}
|
|
|
|
// Host launcher for compute_rope_cache: fills the cos/sin tables on `stream`.
//
//   cos_cache, sin_cache  device pointers, [max_seq_len, half_dim] floats
//   max_seq_len           number of table rows (one block per row)
//   half_dim              number of columns per row (one thread per column)
//   theta                 RoPE base
//   stream                cudaStream_t passed as an opaque pointer
//
// The call is asynchronous; only launch errors are checked here.
void launch_compute_rope_cache(void* cos_cache, void* sin_cache,
                               int max_seq_len, int half_dim, float theta,
                               void* stream) {
    // An empty table has nothing to fill. A zero-sized grid or block is an
    // invalid launch configuration, so treat it as a no-op instead of an error.
    if (max_seq_len <= 0 || half_dim <= 0) return;

    compute_rope_cache<<<max_seq_len, half_dim, 0, (cudaStream_t)stream>>>(
        (float*)cos_cache, (float*)sin_cache, max_seq_len, half_dim, theta);
    CUDA_CHECK_LAST_ERROR();
}
|
|
|
|
}
|