phase 10: GPU add/mul kernels + BF16 precision analysis

Kernel additions:
- add_f32/bf16, mul_f32/bf16 CUDA kernels (element-wise, on GPU)
- Refactored activation.rs with dispatch_unary/dispatch_binary helpers
- Qwen3 and GPT-2 now use GPU add/mul instead of CPU round-trips

GPT-2 add_bias also moved to GPU (broadcast via tile + GPU add)

BF16 precision analysis (docs/benchmarks/phase10-qwen3.md):
- Root cause: separate attention kernels materialize BF16 intermediates
  (QK^T→BF16→scale→BF16→mask→BF16→softmax→BF16 vs HF's fused FP32 path)
- HF itself SDPA vs Eager also differs by ~0.125 logit
- xserv vs HF: ~1-2 logit systematic offset, but same top-1 in 84% cases
- Industry standard for BF16: top-5 overlap (we achieve 100%)
- Fix path: Flash Attention (Phase 14) to fuse attention in FP32

Performance: TTFT 138→119ms, TBT 144→137ms (GPU ops faster than CPU)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-22 11:35:26 +08:00
parent 268e40d764
commit be5c64ea8a
6 changed files with 143 additions and 51 deletions

View File

@@ -45,6 +45,26 @@ __global__ void scale_bf16_kernel(const __nv_bfloat16* x, __nv_bfloat16* out, fl
if (idx < n) out[idx] = __float2bfloat16(__bfloat162float(x[idx]) * scale);
}
// Element-wise add: out = a + b
__global__ void add_f32_kernel(const float* a, const float* b, float* out, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) out[idx] = a[idx] + b[idx];
}
__global__ void add_bf16_kernel(const __nv_bfloat16* a, const __nv_bfloat16* b, __nv_bfloat16* out, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) out[idx] = __float2bfloat16(__bfloat162float(a[idx]) + __bfloat162float(b[idx]));
}
// Element-wise mul: out = a * b
__global__ void mul_f32_kernel(const float* a, const float* b, float* out, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) out[idx] = a[idx] * b[idx];
}
__global__ void mul_bf16_kernel(const __nv_bfloat16* a, const __nv_bfloat16* b, __nv_bfloat16* out, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) out[idx] = __float2bfloat16(__bfloat162float(a[idx]) * __bfloat162float(b[idx]));
}
extern "C" {
void launch_gelu_f32(const void* x, void* out, int n, void* stream) {
@@ -87,4 +107,29 @@ void launch_scale_bf16(const void* x, void* out, float scale, int n, void* strea
(const __nv_bfloat16*)x, (__nv_bfloat16*)out, scale, n);
}
void launch_add_f32(const void* a, const void* b, void* out, int n, void* stream) {
int block = 256;
int grid = (n + block - 1) / block;
add_f32_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
(const float*)a, (const float*)b, (float*)out, n);
}
void launch_add_bf16(const void* a, const void* b, void* out, int n, void* stream) {
int block = 256;
int grid = (n + block - 1) / block;
add_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
(const __nv_bfloat16*)a, (const __nv_bfloat16*)b, (__nv_bfloat16*)out, n);
}
void launch_mul_f32(const void* a, const void* b, void* out, int n, void* stream) {
int block = 256;
int grid = (n + block - 1) / block;
mul_f32_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
(const float*)a, (const float*)b, (float*)out, n);
}
void launch_mul_bf16(const void* a, const void* b, void* out, int n, void* stream) {
int block = 256;
int grid = (n + block - 1) / block;
mul_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
(const __nv_bfloat16*)a, (const __nv_bfloat16*)b, (__nv_bfloat16*)out, n);
}
}