phase 14: Flash Attention 2 for SM120 (RTX 5090)

Implement Flash Attention 2 forward kernel targeting SM120 (CC 12.0). FA4 requires TMEM (only on data-center Blackwell SM100), so FA2 is the correct target for consumer Blackwell GPUs like the RTX 5090.

CUDA kernel (csrc/attention/flash_attention.cu):
- Online softmax with tiled Q/K/V — O(1) extra memory, no S×S matrix
- Tile sizes: BR=BC=64, head_dim up to 128 (runtime parameter)
- BF16 input, FP32 accumulation, BF16 output
- Native GQA: kv_head = q_head / (num_q_heads / num_kv_heads)
- Causal mask with tile-level skip optimization
- Shared memory: 32 KB (Q_tile 16KB + KV_tile 16KB, fits in 48KB default)
- Grid: (q_tiles, batch × num_q_heads), Block: 128 threads

Integration:
- flash_attention() Rust wrapper in xserv-kernels with shape/dtype validation
- Qwen3 forward_gpu_cache uses flash_attention directly (no repeat_kv_gpu)
- Eliminates repeat_kv memory allocation + copy per layer per step
- Naive attention() preserved for testing/comparison

Validated on dash5 (RTX 5090, CUDA 12.9):
- Correctness: 9/10 top-1 match vs HF (identical to pre-FA baseline)
- Throughput: 12.9 tok/s (up from 10.3, +25% improvement)
- Now at 35% of HF transformers baseline (up from 30%)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
225
csrc/attention/flash_attention.cu
Normal file
225
csrc/attention/flash_attention.cu
Normal file
@@ -0,0 +1,225 @@
|
||||
#include <cuda_bf16.h>
#include <float.h>
#include <stdio.h>
|
||||
|
||||
// Flash Attention 2 forward kernel for BF16 with FP32 accumulation.
|
||||
//
|
||||
// Algorithm: outer loop over Q tiles (BR rows), inner loop over K/V tiles (BC rows).
|
||||
// Uses online softmax — no O(S^2) memory.
|
||||
//
|
||||
// Layout: Q [batch, num_q_heads, q_len, head_dim]
|
||||
// K [batch, num_kv_heads, kv_len, head_dim]
|
||||
// V [batch, num_kv_heads, kv_len, head_dim]
|
||||
// O [batch, num_q_heads, q_len, head_dim]
|
||||
//
|
||||
// Shared memory (BF16):
|
||||
// smem_q[BR][head_dim] — 64 * 128 * 2 = 16 KB (loaded once per Q tile)
|
||||
// smem_kv[BC][head_dim] — 64 * 128 * 2 = 16 KB (alternates K and V)
|
||||
// Total: 32 KB (fits in default 48 KB shared memory)
|
||||
|
||||
// BR: number of Q rows handled by one thread block (one Q tile). The kernel
// assigns one thread per row of the tile, so BR must not exceed THREADS_PER_BLOCK.
#define BR 64
|
||||
// BC: number of K/V rows staged into shared memory per inner-loop iteration
// (one K/V tile). Also sizes the per-thread score/probability buffer P[BC].
#define BC 64
|
||||
// THREADS_PER_BLOCK: block size used by the launcher and the stride of the
// cooperative shared-memory load loops; the kernel assumes blockDim.x equals it.
#define THREADS_PER_BLOCK 128
|
||||
|
||||
// Cooperatively stages one tile into shared memory.
//
// Copies the first `valid_rows * head_dim` contiguous BF16 elements from `src`
// into `dst`, then zero-fills the remainder up to `tile_rows * head_dim`.
// Every thread of the block must call this (work is strided by
// THREADS_PER_BLOCK); the caller is responsible for the __syncthreads() that
// makes the tile visible.
static __device__ __forceinline__ void fa2_stage_tile(
    __nv_bfloat16* dst,
    const __nv_bfloat16* __restrict__ src,
    int valid_rows, int tile_rows, int head_dim, int tid
) {
    const int valid_elems = valid_rows * head_dim;
    const int tile_elems  = tile_rows * head_dim;

    for (int i = tid; i < valid_elems; i += THREADS_PER_BLOCK) {
        dst[i] = src[i];
    }
    // Zero-pad rows past the end of the sequence.
    for (int i = valid_elems + tid; i < tile_elems; i += THREADS_PER_BLOCK) {
        dst[i] = __float2bfloat16(0.0f);
    }
}

// Flash Attention 2 forward pass, BF16 in/out with FP32 accumulation.
//
// Launch contract:
//   grid  = (ceil(q_len / BR), batch * num_q_heads)
//   block = THREADS_PER_BLOCK threads (1-D)
//   dynamic shared memory = (BR + BC) * head_dim * sizeof(__nv_bfloat16)
//
// Parameters:
//   Q, O          [batch, num_q_heads,  q_len,  head_dim], contiguous
//   K, V          [batch, num_kv_heads, kv_len, head_dim], contiguous
//   num_q_heads   must be a multiple of num_kv_heads (GQA group mapping)
//   head_dim      1..128; larger values would overflow the per-thread
//                 accumulator, so the kernel returns without writing O
//   scale         multiplier applied to Q.K^T before softmax
//   causal        non-zero: query i attends to keys <= i + (kv_len - q_len)
//
// Each block processes one Q tile of one (batch, head). Thread t < q_tile_rows
// owns Q row t and keeps its running max / sum / output in registers; all
// threads of the block cooperate on the shared-memory loads.
// Rows that have no visible key at all (possible with causal masking when
// kv_len < q_len) produce an all-zero output row.
__global__ void flash_attention_bf16_kernel(
    const __nv_bfloat16* __restrict__ Q,
    const __nv_bfloat16* __restrict__ K,
    const __nv_bfloat16* __restrict__ V,
    __nv_bfloat16* __restrict__ O,
    int num_q_heads, int num_kv_heads,
    int q_len, int kv_len, int head_dim,
    float scale, int causal
) {
    // Capacity of the per-thread FP32 output accumulator.
    const int MAX_HEAD_DIM = 128;

    // Block-uniform guard (taken before any barrier): indexing O_acc with a
    // larger head_dim would write past the end of the array.
    if (head_dim <= 0 || head_dim > MAX_HEAD_DIM) return;

    // Grid: (ceil(q_len / BR), batch * num_q_heads)
    int q_tile_idx = blockIdx.x;
    int bh         = blockIdx.y;
    int batch_idx  = bh / num_q_heads;
    int q_head     = bh % num_q_heads;

    // GQA: map Q head to KV head
    int heads_per_group = num_q_heads / num_kv_heads;
    int kv_head         = q_head / heads_per_group;

    int q_tile_start = q_tile_idx * BR;
    if (q_tile_start >= q_len) return;
    int q_tile_rows = min(BR, q_len - q_tile_start);

    // Pointers to this batch/head's data (64-bit offsets: the product of all
    // four dimensions can exceed INT_MAX).
    const __nv_bfloat16* Q_head = Q + ((long long)batch_idx * num_q_heads  + q_head)  * q_len  * head_dim;
    const __nv_bfloat16* K_head = K + ((long long)batch_idx * num_kv_heads + kv_head) * kv_len * head_dim;
    const __nv_bfloat16* V_head = V + ((long long)batch_idx * num_kv_heads + kv_head) * kv_len * head_dim;
    __nv_bfloat16*       O_head = O + ((long long)batch_idx * num_q_heads  + q_head)  * q_len  * head_dim;

    int tid = threadIdx.x;

    // Dynamic shared memory: Q tile followed by a K/V tile buffer.
    extern __shared__ __nv_bfloat16 smem[];
    __nv_bfloat16* smem_q  = smem;                   // BR * head_dim elements
    __nv_bfloat16* smem_kv = smem + BR * head_dim;   // BC * head_dim elements

    // ---- Load Q tile into shared memory (cooperative, zero-padded) ----
    fa2_stage_tile(smem_q, Q_head + (long long)q_tile_start * head_dim,
                   q_tile_rows, BR, head_dim, tid);
    __syncthreads();

    // Thread t (0 <= t < q_tile_rows) owns Q row t; the remaining threads only
    // help with the cooperative loads.
    bool owns_row = (tid < q_tile_rows);

    // Per-thread online-softmax state.
    float O_acc[MAX_HEAD_DIM];   // un-normalised output row
    float m_val = -INFINITY;     // running row maximum
    float l_val = 0.0f;          // running softmax denominator
    if (owns_row) {
        for (int d = 0; d < head_dim; d++) {
            O_acc[d] = 0.0f;
        }
    }

    // kv_offset handles cached KV longer than Q (decode step)
    int kv_offset    = kv_len - q_len;
    int num_kv_tiles = (kv_len + BC - 1) / BC;

    // ---- Inner loop over K/V tiles ----
    for (int j = 0; j < num_kv_tiles; j++) {
        int kv_tile_start = j * BC;
        int kv_tile_cols  = min(BC, kv_len - kv_tile_start);

        // Causal: skip the tile if every K position is in the future of the
        // last Q row of this block. The condition is block-uniform, so all
        // threads skip the barriers below together.
        if (causal) {
            int max_allowed_kv = (q_tile_start + q_tile_rows - 1) + kv_offset;
            if (kv_tile_start > max_allowed_kv) {
                continue;
            }
        }

        // ---- Load K tile into smem_kv ----
        fa2_stage_tile(smem_kv, K_head + (long long)kv_tile_start * head_dim,
                       kv_tile_cols, BC, head_dim, tid);
        __syncthreads();

        // ---- Compute S = Q @ K^T * scale, causal mask, online softmax ----
        float P[BC];

        if (owns_row) {
            float row_max = -INFINITY;
            for (int c = 0; c < kv_tile_cols; c++) {
                float dot = 0.0f;
                for (int d = 0; d < head_dim; d++) {
                    dot += __bfloat162float(smem_q[tid * head_dim + d])
                         * __bfloat162float(smem_kv[c * head_dim + d]);
                }
                float s = dot * scale;

                if (causal) {
                    int q_pos  = q_tile_start + tid;
                    int kv_pos = kv_tile_start + c;
                    if (kv_pos > q_pos + kv_offset) {
                        s = -INFINITY;
                    }
                }

                P[c] = s;  // store score temporarily in P
                row_max = fmaxf(row_max, s);
            }

            float m_new = fmaxf(m_val, row_max);

            if (m_new == -INFINITY) {
                // No key seen so far is visible to this row. Evaluating
                // expf(-inf - (-inf)) would yield NaN and poison l_val/O_acc,
                // so contribute nothing and leave the running state untouched.
                for (int c = 0; c < kv_tile_cols; c++) {
                    P[c] = 0.0f;
                }
            } else {
                // Online softmax: P = exp(S - m_new), l_new
                float psum = 0.0f;
                for (int c = 0; c < kv_tile_cols; c++) {
                    P[c] = expf(P[c] - m_new);
                    psum += P[c];
                }

                // Rescale previous accumulator to the new maximum. When
                // m_val is still -inf this is expf(-inf) == 0, which is exact
                // because l_val and O_acc are zero in that case.
                float correction = expf(m_val - m_new);
                l_val = correction * l_val + psum;

                for (int d = 0; d < head_dim; d++) {
                    O_acc[d] *= correction;
                }

                m_val = m_new;
            }
        }

        // Sync before overwriting smem_kv with V tile
        __syncthreads();

        // ---- Load V tile (reuse smem_kv) ----
        fa2_stage_tile(smem_kv, V_head + (long long)kv_tile_start * head_dim,
                       kv_tile_cols, BC, head_dim, tid);
        __syncthreads();

        // ---- Accumulate O += P @ V_tile ----
        if (owns_row) {
            for (int c = 0; c < kv_tile_cols; c++) {
                float p = P[c];
                if (p != 0.0f) {  // masked / underflowed entries contribute nothing
                    for (int d = 0; d < head_dim; d++) {
                        O_acc[d] += p * __bfloat162float(smem_kv[c * head_dim + d]);
                    }
                }
            }
        }

        // All readers must finish with the V tile before the next K load.
        __syncthreads();
    }

    // ---- Final normalize and write output (convert FP32 -> BF16) ----
    if (owns_row) {
        // l_val == 0 means the row saw no visible key: emit zeros.
        float inv_l = (l_val > 0.0f) ? (1.0f / l_val) : 0.0f;
        long long row_base = (long long)(q_tile_start + tid) * head_dim;
        for (int d = 0; d < head_dim; d++) {
            O_head[row_base + d] = __float2bfloat16(O_acc[d] * inv_l);
        }
    }
}
|
||||
|
||||
extern "C" {

// Host-side launcher for flash_attention_bf16_kernel (C ABI).
//
// Parameters:
//   Q, O      device pointers, BF16, [batch, num_q_heads,  q_len,  head_dim]
//   K, V      device pointers, BF16, [batch, num_kv_heads, kv_len, head_dim]
//   scale     multiplier applied to Q.K^T before softmax
//   causal    non-zero enables causal masking
//   stream    a cudaStream_t passed as an opaque pointer; the launch is
//             asynchronous with respect to the host
//
// Behaviour:
//   * batch, num_q_heads or q_len equal to 0: nothing to compute, returns
//     without launching.
//   * Unsupported arguments (negative sizes, head_dim outside 1..128,
//     num_kv_heads <= 0, num_q_heads not a multiple of num_kv_heads, or
//     batch * num_q_heads above the gridDim.y limit): prints a diagnostic to
//     stderr and returns without launching, because the kernel would otherwise
//     index out of bounds or the launch would be rejected.
//   * A failed launch is reported on stderr; the CUDA error is only peeked at,
//     not cleared, so the caller can still observe it.
void launch_flash_attention_bf16(
    const void* Q, const void* K, const void* V, void* O,
    int batch, int num_q_heads, int num_kv_heads,
    int q_len, int kv_len, int head_dim,
    float scale, int causal, void* stream
) {
    // Must match the size of the kernel's per-thread accumulator (O_acc[128]).
    const int max_head_dim = 128;
    // CUDA limit on gridDim.y.
    const long long max_grid_y = 65535;

    if (batch < 0 || num_q_heads < 0 || q_len < 0 || kv_len < 0 ||
        num_kv_heads <= 0 ||
        head_dim <= 0 || head_dim > max_head_dim ||
        num_q_heads % num_kv_heads != 0) {
        fprintf(stderr,
                "launch_flash_attention_bf16: unsupported arguments "
                "(batch=%d, num_q_heads=%d, num_kv_heads=%d, q_len=%d, "
                "kv_len=%d, head_dim=%d); kernel not launched\n",
                batch, num_q_heads, num_kv_heads, q_len, kv_len, head_dim);
        return;
    }

    // Empty output tensor: a zero-sized grid is an invalid launch configuration.
    if (batch == 0 || num_q_heads == 0 || q_len == 0) {
        return;
    }

    // 64-bit product so a large batch cannot wrap before the range check.
    const long long grid_y = (long long)batch * num_q_heads;
    if (grid_y > max_grid_y) {
        fprintf(stderr,
                "launch_flash_attention_bf16: batch * num_q_heads = %lld exceeds "
                "the gridDim.y limit of %lld; kernel not launched\n",
                grid_y, max_grid_y);
        return;
    }

    // ceil(q_len / BR) without the overflow risk of (q_len + BR - 1).
    const int q_tiles = q_len / BR + (q_len % BR != 0 ? 1 : 0);
    dim3 grid((unsigned int)q_tiles, (unsigned int)grid_y);
    dim3 block(THREADS_PER_BLOCK);

    // Shared memory: smem_q[BR * head_dim] + smem_kv[BC * head_dim], all BF16.
    // With head_dim <= 128 this is at most 32 KB, below the 48 KB default cap.
    const size_t smem_bytes = (size_t)(BR + BC) * head_dim * sizeof(__nv_bfloat16);

    flash_attention_bf16_kernel<<<grid, block, smem_bytes, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)Q,
        (const __nv_bfloat16*)K,
        (const __nv_bfloat16*)V,
        (__nv_bfloat16*)O,
        num_q_heads, num_kv_heads,
        q_len, kv_len, head_dim,
        scale, causal
    );

    // Launch-configuration errors are only visible through the last-error
    // state. Peek (do not clear) so the caller's own error check still sees it.
    cudaError_t err = cudaPeekAtLastError();
    if (err != cudaSuccess) {
        fprintf(stderr,
                "launch_flash_attention_bf16: CUDA error pending after launch: %s\n",
                cudaGetErrorString(err));
    }
}

}
|
||||
Reference in New Issue
Block a user