CUDA layer for the paged-KV + swap work: (1) csrc: new paged_attention.cu plus updates across the attention/gemm/norm/activation/embedding/reduce kernels and common.cuh; (2) xserv-kernels: new dispatch module and kernel-binding updates; (3) xserv-cuda: cudaMallocHost/FreeHost bindings + PinnedBuffer (host swap pool backing) and offset-aware D2H/H2D copies used to move KV blocks between the GPU pool and pinned host memory. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
63 lines
2.1 KiB
Plaintext
63 lines
2.1 KiB
Plaintext
#include <cuda_bf16.h>
|
|
#include "../common.cuh"
|
|
|
|
// Embedding lookup: out[t, :] = table[token_ids[t], :]
//
// Launch layout: one block per token (blockIdx.x selects the entry of
// token_ids and the destination row of out); the threads of the block step
// through the hidden_size elements of that row in strides of blockDim.x.
// No shared memory is used.
//
// Parameters:
//   table       - [vocab_size, hidden_size] embedding rows, stored contiguously
//   token_ids   - [num_tokens] row indices into table
//   out         - [num_tokens, hidden_size] destination rows
//   hidden_size - number of elements per row
//   vocab_size  - number of rows in table; used to reject out-of-range ids
//
// A token id outside [0, vocab_size) makes the whole block return without
// writing, so the matching row of out keeps whatever it held before.
__global__ void embedding_f32(
    const float* __restrict__ table,      // [vocab_size, hidden_size]
    const int* __restrict__ token_ids,    // [num_tokens]
    float* __restrict__ out,              // [num_tokens, hidden_size]
    int hidden_size,
    int vocab_size
) {
    const int token_idx = blockIdx.x;
    const int tid = token_ids[token_idx];
    if (tid < 0 || tid >= vocab_size) return;

    // Widen to size_t before multiplying: row_index * hidden_size can exceed
    // INT_MAX for large tables or long batches, and a wrapped 32-bit product
    // would point outside the buffers.
    const size_t row_len = static_cast<size_t>(hidden_size);
    const float* row = table + static_cast<size_t>(tid) * row_len;
    float* dst = out + static_cast<size_t>(token_idx) * row_len;

    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        dst[i] = row[i];
    }
}
|
|
|
|
// bfloat16 embedding lookup: out[t, :] = table[token_ids[t], :]
//
// Launch layout: one block per token (blockIdx.x selects the entry of
// token_ids and the destination row of out); the threads of the block step
// through the hidden_size elements of that row in strides of blockDim.x.
// No shared memory is used.
//
// Parameters:
//   table       - embedding rows of hidden_size elements each, indexed by token id
//   token_ids   - one row index per launched block
//   out         - destination rows of hidden_size elements each, one per block
//   hidden_size - number of elements per row
//   vocab_size  - upper bound (exclusive) for valid token ids
//
// A token id outside [0, vocab_size) makes the whole block return without
// writing, so the matching row of out keeps whatever it held before.
__global__ void embedding_bf16(
    const __nv_bfloat16* __restrict__ table,
    const int* __restrict__ token_ids,
    __nv_bfloat16* __restrict__ out,
    int hidden_size,
    int vocab_size
) {
    const int token_idx = blockIdx.x;
    const int tid = token_ids[token_idx];
    if (tid < 0 || tid >= vocab_size) return;

    // Widen to size_t before multiplying: row_index * hidden_size can exceed
    // INT_MAX for large tables or long batches, and a wrapped 32-bit product
    // would point outside the buffers.
    const size_t row_len = static_cast<size_t>(hidden_size);
    const __nv_bfloat16* row = table + static_cast<size_t>(tid) * row_len;
    __nv_bfloat16* dst = out + static_cast<size_t>(token_idx) * row_len;

    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        dst[i] = row[i];
    }
}
|
|
|
|
extern "C" {
|
|
|
|
// Host-side launcher for embedding_f32.
//
// Launches one block per token with min(hidden_size, 256) threads on the
// given stream. The launch is asynchronous; this function does not
// synchronize the stream.
//
// Parameters:
//   table       - device pointer, reinterpreted as const float*
//   token_ids   - device pointer, reinterpreted as const int*
//   out         - device pointer, reinterpreted as float*
//   num_tokens  - number of blocks to launch (one per token)
//   hidden_size - elements per embedding row
//   vocab_size  - forwarded to the kernel for token-id range checking
//   stream      - opaque handle, cast to cudaStream_t
void launch_embedding_f32(const void* table, const void* token_ids, void* out,
                          int num_tokens, int hidden_size, int vocab_size, void* stream) {
    // An empty batch or zero-width rows means there is nothing to copy, and a
    // grid/block dimension of 0 is an invalid launch configuration, so skip
    // the launch instead of raising a spurious error. Negative sizes are left
    // to fail at launch so caller bugs remain visible.
    if (num_tokens == 0 || hidden_size == 0) return;

    const int block = (hidden_size < 256) ? hidden_size : 256;
    embedding_f32<<<num_tokens, block, 0, (cudaStream_t)stream>>>(
        (const float*)table, (const int*)token_ids, (float*)out, hidden_size, vocab_size);
    // Project macro (presumably from common.cuh); expected to surface
    // launch-configuration errors from the call above.
    CUDA_CHECK_LAST_ERROR();
}
|
|
|
|
// Host-side launcher for embedding_bf16.
//
// Launches one block per token with min(hidden_size, 256) threads on the
// given stream. The launch is asynchronous; this function does not
// synchronize the stream.
//
// Parameters:
//   table       - device pointer, reinterpreted as const __nv_bfloat16*
//   token_ids   - device pointer, reinterpreted as const int*
//   out         - device pointer, reinterpreted as __nv_bfloat16*
//   num_tokens  - number of blocks to launch (one per token)
//   hidden_size - elements per embedding row
//   vocab_size  - forwarded to the kernel for token-id range checking
//   stream      - opaque handle, cast to cudaStream_t
void launch_embedding_bf16(const void* table, const void* token_ids, void* out,
                           int num_tokens, int hidden_size, int vocab_size, void* stream) {
    // An empty batch or zero-width rows means there is nothing to copy, and a
    // grid/block dimension of 0 is an invalid launch configuration, so skip
    // the launch instead of raising a spurious error. Negative sizes are left
    // to fail at launch so caller bugs remain visible.
    if (num_tokens == 0 || hidden_size == 0) return;

    const int block = (hidden_size < 256) ? hidden_size : 256;
    embedding_bf16<<<num_tokens, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)table, (const int*)token_ids,
        (__nv_bfloat16*)out, hidden_size, vocab_size);
    // Project macro (presumably from common.cuh); expected to surface
    // launch-configuration errors from the call above.
    CUDA_CHECK_LAST_ERROR();
}
|
|
|
|
}
|