fix: comprehensive review + 14 bug fixes + Phase 12/14 overhaul

Strict code review identified 30+ issues across correctness, performance, and architecture. This commit addresses 14 of them with verified fixes, restructures Phase 12 for honest continuous batching, and updates Phase 14 to target FA2 (RTX 5090 SM120 lacks TMEM required by FA4). Bug fixes: - FIX-01: Global cuBLAS handle (thread-local singleton, was per-call) - FIX-02: Remove 19 unnecessary cudaDeviceSynchronize calls from kernels - FIX-03: Qwen3 ChatML template (was plain text concatenation) - FIX-04: EOS token from tokenizer (was hardcoded 151645) - FIX-05: Storage tracks actual GPU device ordinal (was always Cuda(0)) - FIX-06: unsqueeze stride preserves contiguous layout - FIX-08: CudaDeviceProp replaced with heap buffer (was UB-prone padding) - FIX-09: Tokenizer byte_fallback to <0xNN> tokens (was panic) Feature additions: - FIX-10: SSE streaming (/v1/chat/completions, OpenAI-compatible) - FIX-11: Correct usage statistics (prompt/completion/total tokens) - FIX-13: Temperature / top-k / top-p sampling with SamplingParams Performance improvements: - FIX-07: Caching allocator wired up (thread-local pool, pooled flag) - FIX-12: KV cache staging buffers (zero-alloc get_kv_len via borrow_raw) - FIX-14: GPU strided copy kernel (eliminates contiguous() CPU round-trip) Architecture: - Phase 12 engine restructured: prefill/decode separation, honest TODO for batched GPU forward (requires Flash Attention) - Phase 14 updated: FA2 for SM120 (FA4 requires TMEM, absent on 5090) - Qwen3-7B → Qwen3-8B typo fixed across all docs (36 layers, hidden 4096) Validated on dash5 (8x RTX 5090): - 52/52 API prompts pass (EN/CN/code), SSE streaming verified - Logits match HF transformers 9/10 top-1, 4.0/5 avg top-5 overlap - 8 concurrent requests: 5.99x scheduling speedup (batch_size=4) - Throughput: 10.3 tok/s (serial), 30% of HF baseline Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 17:53:28 +08:00
parent d8493bd70f
commit ee68d3565d
38 changed files with 3012 additions and 259 deletions
--- a/csrc/embedding/transpose.cu
+++ b/csrc/embedding/transpose.cu
@@ -111,6 +111,55 @@ __global__ void repeat_kv_bf16(
    out[idx] = in[in_idx];
 }

+// ---- Generic strided copy (up to 4D) ----
+// Each thread copies one element. Maps flat contiguous output index to strided input index.
+// Unused dimensions are padded with shape=1, stride=0.
+
+// BF16 strided gather: out[idx] = in[in_offset + i0*stride0 + i1*stride1 + i2*stride2 + i3*stride3],
+// one thread per output element on a 1D grid. `ndim` is accepted but not read. The source offset is
+// accumulated in 64-bit so index * stride products cannot wrap a 32-bit int.
+__global__ void strided_copy_bf16(
+    const __nv_bfloat16* __restrict__ in,
+    __nv_bfloat16* __restrict__ out,
+    int numel, int ndim,
+    int shape0, int shape1, int shape2, int shape3,
+    int in_stride0, int in_stride1, int in_stride2, int in_stride3,
+    int in_offset) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numel) return;
+    // Decompose flat output index into multi-dim indices (rightmost = fastest)
+    int remaining = idx;
+    int i3 = remaining % shape3; remaining /= shape3;
+    int i2 = remaining % shape2; remaining /= shape2;
+    int i1 = remaining % shape1; remaining /= shape1;
+    int i0 = remaining;
+    long long in_idx = (long long)in_offset + (long long)i0 * in_stride0 + (long long)i1 * in_stride1
+        + (long long)i2 * in_stride2 + (long long)i3 * in_stride3;
+    out[idx] = in[in_idx];
+}
+
+// FP32 strided gather: out[idx] = in[in_offset + i0*stride0 + i1*stride1 + i2*stride2 + i3*stride3],
+// one thread per output element; `ndim` is not read. Source offset is 64-bit to avoid int wraparound.
+__global__ void strided_copy_f32(
+    const float* __restrict__ in,
+    float* __restrict__ out,
+    int numel, int ndim,
+    int shape0, int shape1, int shape2, int shape3,
+    int in_stride0, int in_stride1, int in_stride2, int in_stride3,
+    int in_offset) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numel) return;
+    // Peel per-dimension indices off the flat output index, last (fastest-varying) dimension first.
+    int remaining = idx;
+    int i3 = remaining % shape3; remaining /= shape3;
+    int i2 = remaining % shape2; remaining /= shape2;
+    int i1 = remaining % shape1; remaining /= shape1;
+    int i0 = remaining;
+    long long in_idx = (long long)in_offset + (long long)i0 * in_stride0 + (long long)i1 * in_stride1
+        + (long long)i2 * in_stride2 + (long long)i3 * in_stride3;
+    out[idx] = in[in_idx];
+}
+
 extern "C" {

 void launch_reshape_heads_bf16(const void* in, void* out,
@@ -158,4 +207,28 @@ void launch_repeat_kv_bf16(const void* in, void* out,
        (const __nv_bfloat16*)in, (__nv_bfloat16*)out, kv_heads, n_rep, seq_len, head_dim);
 }

+// Launches strided_copy_bf16 on `stream` with one thread per output element; no-op if numel <= 0.
+void launch_strided_copy_bf16(const void* in, void* out, int numel, int ndim,
+                              int shape0, int shape1, int shape2, int shape3,
+                              int in_stride0, int in_stride1, int in_stride2, int in_stride3,
+                              int in_offset, void* stream) {
+    if (numel <= 0) return;  // a zero-block grid is an invalid launch configuration
+    const int block = 256, grid = (numel - 1) / block + 1;  // ceil-div, no int overflow
+    strided_copy_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
+        (const __nv_bfloat16*)in, (__nv_bfloat16*)out, numel, ndim, shape0, shape1, shape2, shape3,
+        in_stride0, in_stride1, in_stride2, in_stride3, in_offset);
+}
+
+// Launches strided_copy_f32 on `stream` with one thread per output element; no-op if numel <= 0.
+void launch_strided_copy_f32(const void* in, void* out, int numel, int ndim,
+                             int shape0, int shape1, int shape2, int shape3,
+                             int in_stride0, int in_stride1, int in_stride2, int in_stride3,
+                             int in_offset, void* stream) {
+    if (numel <= 0) return;  // a zero-block grid is an invalid launch configuration
+    const int block = 256, grid = (numel - 1) / block + 1;  // ceil-div, no int overflow
+    strided_copy_f32<<<grid, block, 0, (cudaStream_t)stream>>>(
+        (const float*)in, (float*)out, numel, ndim, shape0, shape1, shape2, shape3,
+        in_stride0, in_stride1, in_stride2, in_stride3, in_offset);
+}
+
 }