// xserv/csrc/attention/causal_mask.cu

#include <cuda_bf16.h>
#include "../common.cuh"

// Apply a causal mask: set scores[row][col] = -inf wherever col > row + offset.
// `offset` supports the KV cache: when the query starts at position `offset`,
// query row `row` is allowed to attend to positions [0, offset + row].
// scores: [batch, rows, cols], where the leading dim is the flattened batch × heads.

// Causal-mask kernel for float scores.
//
// Each thread owns one (batch, row, col) element:
//   col   = blockIdx.x * blockDim.x + threadIdx.x
//   row   = blockIdx.y
//   batch = blockIdx.z
// An element is overwritten with -inf when it lies inside the row
// (col < cols) and strictly to the right of the shifted diagonal
// (col > row + offset). Every other element is left untouched.
//
// scores is indexed as a contiguous [gridDim.z, rows, cols] array; `rows`
// is only used as the per-batch stride.
__global__ void causal_mask_f32(
    float* __restrict__ scores,
    int rows, int cols, int offset
) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads in the tail of the last column block have no element.
    if (col >= cols) {
        return;
    }

    const int row = blockIdx.y;
    const int last_visible_col = row + offset;
    // Columns up to and including the shifted diagonal stay visible.
    if (col <= last_visible_col) {
        return;
    }

    // Promote to 64 bits before multiplying: batch * rows * cols exceeds
    // int32 for large batches with long context
    // (e.g. batch=128 * heads=28 * seq=32768).
    const long long batch = blockIdx.z;
    const long long flat_row = batch * rows + row;
    scores[flat_row * cols + col] = -INFINITY;
}

// Causal-mask kernel for bfloat16 scores.
//
// Thread-to-element mapping:
//   col   = blockIdx.x * blockDim.x + threadIdx.x
//   row   = blockIdx.y
//   batch = blockIdx.z
// Writes bf16(-inf) to every in-range element with col > row + offset and
// leaves all other elements untouched. scores is indexed as a contiguous
// [gridDim.z, rows, cols] array.
__global__ void causal_mask_bf16(
    __nv_bfloat16* __restrict__ scores,
    int rows, int cols, int offset
) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y;

    const bool in_range = col < cols;
    const bool is_future = col > row + offset;
    if (!(in_range && is_future)) {
        return;
    }

    // 64-bit flat index so that batch * rows * cols cannot wrap int32.
    const long long batch = blockIdx.z;
    const long long flat = (batch * rows + row) * cols + col;
    scores[flat] = __float2bfloat16(-INFINITY);
}

extern "C" {

// Host launcher for causal_mask_f32.
//
//   scores : device pointer, reinterpreted as float*; addressed by the
//            kernel as a contiguous [batch, rows, cols] array.
//   batch, rows, cols : extents of that array. If any of them is <= 0
//            there is nothing to mask and the call returns without
//            launching.
//   offset : forwarded to the kernel; elements with col > row + offset are
//            set to -inf.
//   stream : reinterpreted as cudaStream_t; all launches are enqueued on it.
//
// Launch layout: 256 threads per block along columns, one grid row per
// score row (blockIdx.y) and one grid slice per batch entry (blockIdx.z).
// CUDA limits gridDim.y and gridDim.z to 65535, so larger `rows` or `batch`
// are covered by several launches over tiles of at most 65535 x 65535.
// When both fit, exactly one launch is issued.
void launch_causal_mask_f32(void* scores, int batch, int rows, int cols,
                            int offset, void* stream) {
    // A zero-sized grid is an invalid launch configuration; treat an empty
    // tensor as a no-op instead of raising a launch error.
    if (batch <= 0 || rows <= 0 || cols <= 0) {
        return;
    }

    const int block = 256;
    const int max_grid_yz = 65535;  // upper bound for gridDim.y / gridDim.z
    // Unsigned ceil-div so cols + block - 1 cannot wrap for very large cols.
    const unsigned int grid_x = ((unsigned int)cols + block - 1) / block;
    float* const base = (float*)scores;
    cudaStream_t cu_stream = (cudaStream_t)stream;

    for (int b0 = 0; b0 < batch;) {
        const int nb = (batch - b0 < max_grid_yz) ? (batch - b0) : max_grid_yz;
        for (int r0 = 0; r0 < rows;) {
            const int nr = (rows - r0 < max_grid_yz) ? (rows - r0) : max_grid_yz;
            // The kernel addresses element (b, r, c) at (b * rows + r) * cols + c
            // relative to the pointer it receives. Pointing it at element
            // (b0, r0, 0) and adding r0 to the offset makes tile-local
            // (b, r) behave exactly like global (b0 + b, r0 + r).
            float* tile = base + ((long long)b0 * rows + r0) * cols;
            dim3 grid(grid_x, nr, nb);
            causal_mask_f32<<<grid, block, 0, cu_stream>>>(
                tile, rows, cols, offset + r0);
            CUDA_CHECK_LAST_ERROR();
            r0 += nr;
        }
        b0 += nb;
    }
}

// Host launcher for causal_mask_bf16.
//
//   scores : device pointer, reinterpreted as __nv_bfloat16*; addressed by
//            the kernel as a contiguous [batch, rows, cols] array.
//   batch, rows, cols : extents of that array. If any of them is <= 0
//            there is nothing to mask and the call returns without
//            launching.
//   offset : forwarded to the kernel; elements with col > row + offset are
//            set to -inf.
//   stream : reinterpreted as cudaStream_t; all launches are enqueued on it.
//
// Launch layout: 256 threads per block along columns, one grid row per
// score row (blockIdx.y) and one grid slice per batch entry (blockIdx.z).
// CUDA limits gridDim.y and gridDim.z to 65535, so larger `rows` or `batch`
// are covered by several launches over tiles of at most 65535 x 65535.
// When both fit, exactly one launch is issued.
void launch_causal_mask_bf16(void* scores, int batch, int rows, int cols,
                             int offset, void* stream) {
    // A zero-sized grid is an invalid launch configuration; treat an empty
    // tensor as a no-op instead of raising a launch error.
    if (batch <= 0 || rows <= 0 || cols <= 0) {
        return;
    }

    const int block = 256;
    const int max_grid_yz = 65535;  // upper bound for gridDim.y / gridDim.z
    // Unsigned ceil-div so cols + block - 1 cannot wrap for very large cols.
    const unsigned int grid_x = ((unsigned int)cols + block - 1) / block;
    __nv_bfloat16* const base = (__nv_bfloat16*)scores;
    cudaStream_t cu_stream = (cudaStream_t)stream;

    for (int b0 = 0; b0 < batch;) {
        const int nb = (batch - b0 < max_grid_yz) ? (batch - b0) : max_grid_yz;
        for (int r0 = 0; r0 < rows;) {
            const int nr = (rows - r0 < max_grid_yz) ? (rows - r0) : max_grid_yz;
            // The kernel addresses element (b, r, c) at (b * rows + r) * cols + c
            // relative to the pointer it receives. Pointing it at element
            // (b0, r0, 0) and adding r0 to the offset makes tile-local
            // (b, r) behave exactly like global (b0 + b, r0 + r).
            __nv_bfloat16* tile = base + ((long long)b0 * rows + r0) * cols;
            dim3 grid(grid_x, nr, nb);
            causal_mask_bf16<<<grid, block, 0, cu_stream>>>(
                tile, rows, cols, offset + r0);
            CUDA_CHECK_LAST_ERROR();
            r0 += nr;
        }
        b0 += nb;
    }
}

}