fix: 12 bug fixes from comprehensive review — 51 tok/s verified on RTX 5090

P0 fixes (blocking usability):
- FIX-01: thread-local cuBLAS handle (was creating/destroying per matmul)
- FIX-16: EOS token no longer leaks into API responses
- FIX-17: max_seq_len configurable via --max-seq-len (default 2048, was hardcoded 256)
- FIX-18: max_tokens clamped to available seq space, prompt overflow returns 400

P1 fixes (bugs & performance):
- FIX-07: CachingAllocator wired into all hot paths (to_device, embedding, rope, concat)
- FIX-08: CudaDeviceProp buffer increased to 32KB for CUDA 12.9 safety
- FIX-09: tokenizer byte_fallback graceful degradation (was panic)
- FIX-19: causal mask uses -INFINITY instead of -1e9 (BF16 supports inf)
- FIX-20: LayerNorm rewritten to numerically stable two-pass algorithm
- FIX-21: min block size guard (32 threads) for LayerNorm/RMSNorm launches

P2 fixes (improvements):
- FIX-22: Option<GpuKVCache> + take() eliminates dummy KV cache allocations
- FIX-23: RoPE cache no longer artificially capped at 8192 positions

Verified on dash5 (RTX 5090): 51 tok/s batch=1, 74 tok/s 2-concurrent, 1.7-3.3x HF transformers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-23 14:13:43 +08:00
parent a67e724119
commit 986a289616
14 changed files with 285 additions and 292 deletions

View File

@@ -27,8 +27,7 @@ __global__ void causal_mask_bf16(
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (col < cols && col > row + offset) {
// BF16 doesn't have proper -inf literal, use a very large negative
scores[batch_idx * rows * cols + row * cols + col] = __float2bfloat16(-1e9f);
scores[batch_idx * rows * cols + row * cols + col] = __float2bfloat16(-INFINITY);
}
}

View File

@@ -14,27 +14,34 @@ __global__ void layernorm_f32(
const float* x_row = x + row * hidden_size;
float* out_row = out + row * hidden_size;
// Welford online: compute mean and variance in one pass
// Pass 1: compute mean
float local_sum = 0.0f;
float local_sum_sq = 0.0f;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float v = x_row[i];
local_sum += v;
local_sum_sq += v * v;
local_sum += x_row[i];
}
local_sum = block_reduce_sum(local_sum);
local_sum_sq = block_reduce_sum(local_sum_sq);
__shared__ float s_mean, s_inv_std;
if (threadIdx.x == 0) {
float mean = local_sum / hidden_size;
float var = local_sum_sq / hidden_size - mean * mean;
s_mean = mean;
s_inv_std = rsqrtf(var + eps);
s_mean = local_sum / hidden_size;
}
__syncthreads();
float mean = s_mean;
// Pass 2: compute variance = sum((x - mean)^2) / N
float local_var = 0.0f;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float d = x_row[i] - mean;
local_var += d * d;
}
local_var = block_reduce_sum(local_var);
if (threadIdx.x == 0) {
s_inv_std = rsqrtf(local_var / hidden_size + eps);
}
__syncthreads();
float inv_std = s_inv_std;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
out_row[i] = gamma[i] * (x_row[i] - mean) * inv_std + beta[i];
@@ -52,26 +59,34 @@ __global__ void layernorm_bf16(
const __nv_bfloat16* x_row = x + row * hidden_size;
__nv_bfloat16* out_row = out + row * hidden_size;
// Pass 1: compute mean
float local_sum = 0.0f;
float local_sum_sq = 0.0f;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float v = __bfloat162float(x_row[i]);
local_sum += v;
local_sum_sq += v * v;
local_sum += __bfloat162float(x_row[i]);
}
local_sum = block_reduce_sum(local_sum);
local_sum_sq = block_reduce_sum(local_sum_sq);
__shared__ float s_mean, s_inv_std;
if (threadIdx.x == 0) {
float mean = local_sum / hidden_size;
float var = local_sum_sq / hidden_size - mean * mean;
s_mean = mean;
s_inv_std = rsqrtf(var + eps);
s_mean = local_sum / hidden_size;
}
__syncthreads();
float mean = s_mean;
// Pass 2: compute variance = sum((x - mean)^2) / N
float local_var = 0.0f;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float d = __bfloat162float(x_row[i]) - mean;
local_var += d * d;
}
local_var = block_reduce_sum(local_var);
if (threadIdx.x == 0) {
s_inv_std = rsqrtf(local_var / hidden_size + eps);
}
__syncthreads();
float inv_std = s_inv_std;
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float v = __bfloat162float(x_row[i]);
@@ -86,6 +101,7 @@ extern "C" {
void launch_layernorm_f32(const void* x, const void* gamma, const void* beta,
void* out, int rows, int hidden_size, float eps, void* stream) {
int block = (hidden_size < 1024) ? hidden_size : 1024;
if (block < 32) block = 32;
layernorm_f32<<<rows, block, 0, (cudaStream_t)stream>>>(
(const float*)x, (const float*)gamma, (const float*)beta,
(float*)out, hidden_size, eps);
@@ -94,6 +110,7 @@ void launch_layernorm_f32(const void* x, const void* gamma, const void* beta,
void launch_layernorm_bf16(const void* x, const void* gamma, const void* beta,
void* out, int rows, int hidden_size, float eps, void* stream) {
int block = (hidden_size < 1024) ? hidden_size : 1024;
if (block < 32) block = 32;
layernorm_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
(const __nv_bfloat16*)x, (const __nv_bfloat16*)gamma, (const __nv_bfloat16*)beta,
(__nv_bfloat16*)out, hidden_size, eps);

View File

@@ -108,6 +108,7 @@ extern "C" {
// Host-side launcher for the rmsnorm_f32 kernel: one thread block per row.
//
//   x, gamma, out : opaque pointers, cast to float pointers before being
//                   handed to the kernel (presumably device memory --
//                   confirm with callers)
//   rows          : number of thread blocks launched (grid size)
//   hidden_size   : forwarded to the kernel; also used to size the block
//   eps           : forwarded to the kernel unchanged
//   stream        : opaque handle, cast to cudaStream_t for the launch
//
// Block size is hidden_size rounded up to a whole number of warps and kept
// within [32, 1024]. Launch errors are not checked here (the function
// returns void); callers should query cudaGetLastError() if they need them.
void launch_rmsnorm_f32(const void* x, const void* gamma, void* out,
                        int rows, int hidden_size, float eps, void* stream) {
    // Nothing to normalize, and a zero-sized grid is an invalid launch
    // configuration, so skip the launch for empty input.
    if (rows <= 0 || hidden_size <= 0) {
        return;
    }

    const int kWarpSize = 32;
    const int kMaxBlock = 1024;

    int block = (hidden_size < kMaxBlock) ? hidden_size : kMaxBlock;
    // Round up to a multiple of the warp size. This subsumes the previous
    // "at least 32 threads" guard and additionally avoids a partially filled
    // last warp for sizes such as 48 or 100.
    // NOTE(review): assumes the kernel's block reduction is warp based and
    // that threads with index >= hidden_size contribute nothing -- the old
    // 32-thread minimum already relied on the latter; confirm against the
    // rmsnorm_f32 kernel.
    block = ((block + kWarpSize - 1) / kWarpSize) * kWarpSize;

    rmsnorm_f32<<<rows, block, 0, (cudaStream_t)stream>>>(
        (const float*)x, (const float*)gamma, (float*)out, hidden_size, eps);
}
@@ -115,6 +116,7 @@ void launch_rmsnorm_f32(const void* x, const void* gamma, void* out,
void launch_rmsnorm_bf16(const void* x, const void* gamma, void* out,
int rows, int hidden_size, float eps, void* stream) {
int block = (hidden_size < 1024) ? hidden_size : 1024;
if (block < 32) block = 32;
rmsnorm_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
(const __nv_bfloat16*)x, (const __nv_bfloat16*)gamma,
(__nv_bfloat16*)out, hidden_size, eps);
@@ -124,6 +126,7 @@ void launch_add_rmsnorm_bf16(const void* x, const void* residual, const void* ga
void* normed_out, void* sum_out,
int rows, int hidden_size, float eps, void* stream) {
int block = (hidden_size < 1024) ? hidden_size : 1024;
if (block < 32) block = 32;
add_rmsnorm_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
(const __nv_bfloat16*)x, (const __nv_bfloat16*)residual,
(const __nv_bfloat16*)gamma,