fix: 12 bug fixes from comprehensive review — 51 tok/s verified on RTX 5090
P0 fixes (blocking usability):
- FIX-01: thread-local cuBLAS handle (was creating/destroying per matmul)
- FIX-16: EOS token no longer leaks into API responses
- FIX-17: max_seq_len configurable via --max-seq-len (default 2048, was hardcoded 256)
- FIX-18: max_tokens clamped to available seq space, prompt overflow returns 400

P1 fixes (bugs & performance):
- FIX-07: CachingAllocator wired into all hot paths (to_device, embedding, rope, concat)
- FIX-08: CudaDeviceProp buffer increased to 32KB for CUDA 12.9 safety
- FIX-09: tokenizer byte_fallback graceful degradation (was panic)
- FIX-19: causal mask uses -INFINITY instead of -1e9 (BF16 supports inf)
- FIX-20: LayerNorm rewritten to numerically stable two-pass algorithm
- FIX-21: min block size guard (32 threads) for LayerNorm/RMSNorm launches

P2 fixes (improvements):
- FIX-22: Option<GpuKVCache> + take() eliminates dummy KV cache allocations
- FIX-23: RoPE cache no longer artificially capped at 8192 positions

Verified on dash5 (RTX 5090): 51 tok/s batch=1, 74 tok/s 2-concurrent, 1.7-3.3x HF transformers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -27,8 +27,7 @@ __global__ void causal_mask_bf16(
|
||||
int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (col < cols && col > row + offset) {
|
||||
// BF16 doesn't have proper -inf literal, use a very large negative
|
||||
scores[batch_idx * rows * cols + row * cols + col] = __float2bfloat16(-1e9f);
|
||||
scores[batch_idx * rows * cols + row * cols + col] = __float2bfloat16(-INFINITY);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,27 +14,34 @@ __global__ void layernorm_f32(
|
||||
const float* x_row = x + row * hidden_size;
|
||||
float* out_row = out + row * hidden_size;
|
||||
|
||||
// Welford online: compute mean and variance in one pass
|
||||
// Pass 1: compute mean
|
||||
float local_sum = 0.0f;
|
||||
float local_sum_sq = 0.0f;
|
||||
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float v = x_row[i];
|
||||
local_sum += v;
|
||||
local_sum_sq += v * v;
|
||||
local_sum += x_row[i];
|
||||
}
|
||||
local_sum = block_reduce_sum(local_sum);
|
||||
local_sum_sq = block_reduce_sum(local_sum_sq);
|
||||
|
||||
__shared__ float s_mean, s_inv_std;
|
||||
if (threadIdx.x == 0) {
|
||||
float mean = local_sum / hidden_size;
|
||||
float var = local_sum_sq / hidden_size - mean * mean;
|
||||
s_mean = mean;
|
||||
s_inv_std = rsqrtf(var + eps);
|
||||
s_mean = local_sum / hidden_size;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
float mean = s_mean;
|
||||
|
||||
// Pass 2: compute variance = sum((x - mean)^2) / N
|
||||
float local_var = 0.0f;
|
||||
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float d = x_row[i] - mean;
|
||||
local_var += d * d;
|
||||
}
|
||||
local_var = block_reduce_sum(local_var);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
s_inv_std = rsqrtf(local_var / hidden_size + eps);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
float inv_std = s_inv_std;
|
||||
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
out_row[i] = gamma[i] * (x_row[i] - mean) * inv_std + beta[i];
|
||||
@@ -52,26 +59,34 @@ __global__ void layernorm_bf16(
|
||||
const __nv_bfloat16* x_row = x + row * hidden_size;
|
||||
__nv_bfloat16* out_row = out + row * hidden_size;
|
||||
|
||||
// Pass 1: compute mean
|
||||
float local_sum = 0.0f;
|
||||
float local_sum_sq = 0.0f;
|
||||
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float v = __bfloat162float(x_row[i]);
|
||||
local_sum += v;
|
||||
local_sum_sq += v * v;
|
||||
local_sum += __bfloat162float(x_row[i]);
|
||||
}
|
||||
local_sum = block_reduce_sum(local_sum);
|
||||
local_sum_sq = block_reduce_sum(local_sum_sq);
|
||||
|
||||
__shared__ float s_mean, s_inv_std;
|
||||
if (threadIdx.x == 0) {
|
||||
float mean = local_sum / hidden_size;
|
||||
float var = local_sum_sq / hidden_size - mean * mean;
|
||||
s_mean = mean;
|
||||
s_inv_std = rsqrtf(var + eps);
|
||||
s_mean = local_sum / hidden_size;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
float mean = s_mean;
|
||||
|
||||
// Pass 2: compute variance = sum((x - mean)^2) / N
|
||||
float local_var = 0.0f;
|
||||
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float d = __bfloat162float(x_row[i]) - mean;
|
||||
local_var += d * d;
|
||||
}
|
||||
local_var = block_reduce_sum(local_var);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
s_inv_std = rsqrtf(local_var / hidden_size + eps);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
float inv_std = s_inv_std;
|
||||
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float v = __bfloat162float(x_row[i]);
|
||||
@@ -86,6 +101,7 @@ extern "C" {
|
||||
void launch_layernorm_f32(const void* x, const void* gamma, const void* beta,
|
||||
void* out, int rows, int hidden_size, float eps, void* stream) {
|
||||
int block = (hidden_size < 1024) ? hidden_size : 1024;
|
||||
if (block < 32) block = 32;
|
||||
layernorm_f32<<<rows, block, 0, (cudaStream_t)stream>>>(
|
||||
(const float*)x, (const float*)gamma, (const float*)beta,
|
||||
(float*)out, hidden_size, eps);
|
||||
@@ -94,6 +110,7 @@ void launch_layernorm_f32(const void* x, const void* gamma, const void* beta,
|
||||
void launch_layernorm_bf16(const void* x, const void* gamma, const void* beta,
|
||||
void* out, int rows, int hidden_size, float eps, void* stream) {
|
||||
int block = (hidden_size < 1024) ? hidden_size : 1024;
|
||||
if (block < 32) block = 32;
|
||||
layernorm_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
|
||||
(const __nv_bfloat16*)x, (const __nv_bfloat16*)gamma, (const __nv_bfloat16*)beta,
|
||||
(__nv_bfloat16*)out, hidden_size, eps);
|
||||
|
||||
@@ -108,6 +108,7 @@ extern "C" {
|
||||
void launch_rmsnorm_f32(const void* x, const void* gamma, void* out,
|
||||
int rows, int hidden_size, float eps, void* stream) {
|
||||
int block = (hidden_size < 1024) ? hidden_size : 1024;
|
||||
if (block < 32) block = 32;
|
||||
rmsnorm_f32<<<rows, block, 0, (cudaStream_t)stream>>>(
|
||||
(const float*)x, (const float*)gamma, (float*)out, hidden_size, eps);
|
||||
}
|
||||
@@ -115,6 +116,7 @@ void launch_rmsnorm_f32(const void* x, const void* gamma, void* out,
|
||||
void launch_rmsnorm_bf16(const void* x, const void* gamma, void* out,
|
||||
int rows, int hidden_size, float eps, void* stream) {
|
||||
int block = (hidden_size < 1024) ? hidden_size : 1024;
|
||||
if (block < 32) block = 32;
|
||||
rmsnorm_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
|
||||
(const __nv_bfloat16*)x, (const __nv_bfloat16*)gamma,
|
||||
(__nv_bfloat16*)out, hidden_size, eps);
|
||||
@@ -124,6 +126,7 @@ void launch_add_rmsnorm_bf16(const void* x, const void* residual, const void* ga
|
||||
void* normed_out, void* sum_out,
|
||||
int rows, int hidden_size, float eps, void* stream) {
|
||||
int block = (hidden_size < 1024) ? hidden_size : 1024;
|
||||
if (block < 32) block = 32;
|
||||
add_rmsnorm_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
|
||||
(const __nv_bfloat16*)x, (const __nv_bfloat16*)residual,
|
||||
(const __nv_bfloat16*)gamma,
|
||||
|
||||
Reference in New Issue
Block a user