Files
xserv/csrc/attention/causal_mask.cu
Gahow Wang 4c3f914459 kernels/cuda: paged-attention kernel, dispatch, pinned host memory
CUDA layer for the paged-KV + swap work:
- csrc: new paged_attention.cu plus updates across attention/gemm/norm/
  activation/embedding/reduce kernels and common.cuh.
- xserv-kernels: new dispatch module and kernel-binding updates.
- xserv-cuda: cudaMallocHost/FreeHost bindings + PinnedBuffer (host swap
  pool backing) and offset-aware D2H/H2D copies used to move KV blocks
  between the GPU pool and pinned host memory.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 19:58:36 +08:00

56 lines
1.7 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <cuda_bf16.h>
#include "../common.cuh"
// Apply causal mask: set scores[row][col] = -inf where col > row + offset.
// offset is used for KV cache: when query starts at position `offset`,
// we allow attending to positions [0, offset + row].
// scores: [batch, rows, cols] (flattened batch×heads)
__global__ void causal_mask_f32(
float* __restrict__ scores,
int rows, int cols, int offset
) {
int batch_idx = blockIdx.z;
int row = blockIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (col < cols && col > row + offset) {
scores[batch_idx * rows * cols + row * cols + col] = -INFINITY;
}
}
__global__ void causal_mask_bf16(
__nv_bfloat16* __restrict__ scores,
int rows, int cols, int offset
) {
int batch_idx = blockIdx.z;
int row = blockIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (col < cols && col > row + offset) {
scores[batch_idx * rows * cols + row * cols + col] = __float2bfloat16(-INFINITY);
}
}
extern "C" {
void launch_causal_mask_f32(void* scores, int batch, int rows, int cols,
int offset, void* stream) {
int block = 256;
dim3 grid((cols + block - 1) / block, rows, batch);
causal_mask_f32<<<grid, block, 0, (cudaStream_t)stream>>>(
(float*)scores, rows, cols, offset);
CUDA_CHECK_LAST_ERROR();
}
void launch_causal_mask_bf16(void* scores, int batch, int rows, int cols,
int offset, void* stream) {
int block = 256;
dim3 grid((cols + block - 1) / block, rows, batch);
causal_mask_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
(__nv_bfloat16*)scores, rows, cols, offset);
CUDA_CHECK_LAST_ERROR();
}
}