// xserv/csrc/attention/causal_mask.cu

#include <cuda_bf16.h>
#include "../common.cuh"

// Apply causal mask: set scores[row][col] = -inf where col > row + offset.
// offset is used for KV cache: when query starts at position `offset`,
// we allow attending to positions [0, offset + row].
// scores: [batch, rows, cols]  (flattened batch×heads)

// In-place causal mask for fp32 scores: writes -inf to every element whose
// column index is greater than `row + offset`; all other elements are left
// untouched.
//
// Launch layout (as read from the indexing below):
//   blockIdx.z                             -> batch slice
//   blockIdx.y                             -> row
//   blockIdx.x * blockDim.x + threadIdx.x  -> column
// Threads whose column falls at or beyond `cols` do nothing, so the x extent
// of the grid may be rounded up to a whole number of blocks.
//
// scores : buffer indexed as [batch][rows][cols], row-major, modified in place
// rows   : number of rows per batch slice (used as the batch stride factor)
// cols   : number of columns per row
// offset : shift added to the row index before comparing against the column
__global__ void causal_mask_f32(
    float* __restrict__ scores,
    int rows, int cols, int offset
) {
    const int batch_idx = blockIdx.z;
    const int row = blockIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (col < cols && col > row + offset) {
        // Build the flattened index in 64 bits: batch * rows * cols can exceed
        // INT_MAX for large score tensors, and a wrapped 32-bit index would
        // write outside the buffer.
        const size_t idx =
            ((size_t)batch_idx * (size_t)rows + (size_t)row) * (size_t)cols
            + (size_t)col;
        scores[idx] = -INFINITY;
    }
}

// In-place causal mask for bfloat16 scores: writes -inf (converted to
// bfloat16) to every element whose column index is greater than
// `row + offset`; all other elements are left untouched.
//
// Launch layout (as read from the indexing below):
//   blockIdx.z                             -> batch slice
//   blockIdx.y                             -> row
//   blockIdx.x * blockDim.x + threadIdx.x  -> column
// Threads whose column falls at or beyond `cols` do nothing, so the x extent
// of the grid may be rounded up to a whole number of blocks.
//
// scores : buffer indexed as [batch][rows][cols], row-major, modified in place
// rows   : number of rows per batch slice (used as the batch stride factor)
// cols   : number of columns per row
// offset : shift added to the row index before comparing against the column
__global__ void causal_mask_bf16(
    __nv_bfloat16* __restrict__ scores,
    int rows, int cols, int offset
) {
    const int batch_idx = blockIdx.z;
    const int row = blockIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (col < cols && col > row + offset) {
        // Build the flattened index in 64 bits: batch * rows * cols can exceed
        // INT_MAX for large score tensors, and a wrapped 32-bit index would
        // write outside the buffer.
        const size_t idx =
            ((size_t)batch_idx * (size_t)rows + (size_t)row) * (size_t)cols
            + (size_t)col;
        scores[idx] = __float2bfloat16(-INFINITY);
    }
}

extern "C" {

// Host entry point: applies the fp32 causal mask to `scores` in place by
// launching causal_mask_f32 on the given stream.
//
// scores : device pointer, reinterpreted as float*, indexed [batch][rows][cols]
// batch  : number of batch slices
// rows   : rows per slice
// cols   : columns per row
// offset : row shift forwarded to the kernel (col > row + offset is masked)
// stream : opaque handle, cast to cudaStream_t
//
// Empty tensors (any dimension <= 0) are a no-op: a grid dimension of zero is
// an invalid launch configuration, so nothing is launched in that case.
//
// gridDim.y and gridDim.z are limited to 65535, so rows and batch are covered
// in tiles of at most that size. Each tile launches the kernel on a shifted
// base pointer; a row tile starting at r0 also passes `offset + r0`, which
// keeps the kernel's `col > row + offset` test equal to the test for the
// absolute row. When both dimensions fit, exactly one launch is issued.
void launch_causal_mask_f32(void* scores, int batch, int rows, int cols,
                            int offset, void* stream) {
    if (batch <= 0 || rows <= 0 || cols <= 0) {
        return;
    }

    const int block = 256;
    const int max_grid_yz = 65535;  // hardware limit for gridDim.y / gridDim.z
    const unsigned int grid_x = (unsigned int)((cols + block - 1) / block);

    float* base = (float*)scores;
    const size_t slice_elems = (size_t)rows * (size_t)cols;

    for (int b0 = 0; b0 < batch; ) {
        const int nb = (batch - b0 < max_grid_yz) ? (batch - b0) : max_grid_yz;
        for (int r0 = 0; r0 < rows; ) {
            const int nr = (rows - r0 < max_grid_yz) ? (rows - r0) : max_grid_yz;
            dim3 grid(grid_x, (unsigned int)nr, (unsigned int)nb);
            // `rows` stays the full row count: the kernel uses it as the
            // batch stride, which does not change with the tile.
            causal_mask_f32<<<grid, block, 0, (cudaStream_t)stream>>>(
                base + (size_t)b0 * slice_elems + (size_t)r0 * (size_t)cols,
                rows, cols, offset + r0);
            CUDA_CHECK_LAST_ERROR();
            r0 += nr;
        }
        b0 += nb;
    }
}

// Host entry point: applies the bfloat16 causal mask to `scores` in place by
// launching causal_mask_bf16 on the given stream.
//
// scores : device pointer, reinterpreted as __nv_bfloat16*, indexed
//          [batch][rows][cols]
// batch  : number of batch slices
// rows   : rows per slice
// cols   : columns per row
// offset : row shift forwarded to the kernel (col > row + offset is masked)
// stream : opaque handle, cast to cudaStream_t
//
// Empty tensors (any dimension <= 0) are a no-op: a grid dimension of zero is
// an invalid launch configuration, so nothing is launched in that case.
//
// gridDim.y and gridDim.z are limited to 65535, so rows and batch are covered
// in tiles of at most that size. Each tile launches the kernel on a shifted
// base pointer; a row tile starting at r0 also passes `offset + r0`, which
// keeps the kernel's `col > row + offset` test equal to the test for the
// absolute row. When both dimensions fit, exactly one launch is issued.
void launch_causal_mask_bf16(void* scores, int batch, int rows, int cols,
                             int offset, void* stream) {
    if (batch <= 0 || rows <= 0 || cols <= 0) {
        return;
    }

    const int block = 256;
    const int max_grid_yz = 65535;  // hardware limit for gridDim.y / gridDim.z
    const unsigned int grid_x = (unsigned int)((cols + block - 1) / block);

    __nv_bfloat16* base = (__nv_bfloat16*)scores;
    const size_t slice_elems = (size_t)rows * (size_t)cols;

    for (int b0 = 0; b0 < batch; ) {
        const int nb = (batch - b0 < max_grid_yz) ? (batch - b0) : max_grid_yz;
        for (int r0 = 0; r0 < rows; ) {
            const int nr = (rows - r0 < max_grid_yz) ? (rows - r0) : max_grid_yz;
            dim3 grid(grid_x, (unsigned int)nr, (unsigned int)nb);
            // `rows` stays the full row count: the kernel uses it as the
            // batch stride, which does not change with the tile.
            causal_mask_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
                base + (size_t)b0 * slice_elems + (size_t)r0 * (size_t)cols,
                rows, cols, offset + r0);
            CUDA_CHECK_LAST_ERROR();
            r0 += nr;
        }
        b0 += nb;
    }
}

}