fix: comprehensive review + 14 bug fixes + Phase 12/14 overhaul

Strict code review identified 30+ issues across correctness, performance, and architecture. This commit addresses 14 of them with verified fixes, restructures Phase 12 for honest continuous batching, and updates Phase 14 to target FA2 (RTX 5090 SM120 lacks TMEM required by FA4). Bug fixes: - FIX-01: Global cuBLAS handle (thread-local singleton, was per-call) - FIX-02: Remove 19 unnecessary cudaDeviceSynchronize calls from kernels - FIX-03: Qwen3 ChatML template (was plain text concatenation) - FIX-04: EOS token from tokenizer (was hardcoded 151645) - FIX-05: Storage tracks actual GPU device ordinal (was always Cuda(0)) - FIX-06: unsqueeze stride preserves contiguous layout - FIX-08: CudaDeviceProp replaced with heap buffer (was UB-prone padding) - FIX-09: Tokenizer byte_fallback to <0xNN> tokens (was panic) Feature additions: - FIX-10: SSE streaming (/v1/chat/completions, OpenAI-compatible) - FIX-11: Correct usage statistics (prompt/completion/total tokens) - FIX-13: Temperature / top-k / top-p sampling with SamplingParams Performance improvements: - FIX-07: Caching allocator wired up (thread-local pool, pooled flag) - FIX-12: KV cache staging buffers (zero-alloc get_kv_len via borrow_raw) - FIX-14: GPU strided copy kernel (eliminates contiguous() CPU round-trip) Architecture: - Phase 12 engine restructured: prefill/decode separation, honest TODO for batched GPU forward (requires Flash Attention) - Phase 14 updated: FA2 for SM120 (FA4 requires TMEM, absent on 5090) - Qwen3-7B → Qwen3-8B typo fixed across all docs (36 layers, hidden 4096) Validated on dash5 (8x RTX 5090): - 52/52 API prompts pass (EN/CN/code), SSE streaming verified - Logits match HF transformers 9/10 top-1, 4.0/5 avg top-5 overlap - 8 concurrent requests: 5.99x scheduling speedup (batch_size=4) - Throughput: 10.3 tok/s (serial), 30% of HF baseline Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 17:53:28 +08:00
parent d8493bd70f
commit ee68d3565d
38 changed files with 3012 additions and 259 deletions
--- a/crates/xserv-cuda/src/allocator.rs
+++ b/crates/xserv-cuda/src/allocator.rs
@@ -1,6 +1,7 @@
 use crate::error::Result;
 use crate::ffi;
 use crate::memory::GpuBuffer;
+use std::cell::RefCell;
 use std::collections::HashMap;

 /// Caching allocator that reuses freed GPU buffers instead of calling
@@ -84,6 +85,33 @@ impl Drop for CachingAllocator {
    }
 }

+thread_local! {
+    static ALLOCATOR: RefCell<CachingAllocator> = RefCell::new(CachingAllocator::new());
+}
+
+/// Allocate a GPU buffer through the caching allocator.
+/// The returned buffer has `pooled = true` so it will be returned
+/// to the pool on drop instead of calling cudaFree.
+pub fn cached_alloc(size: usize) -> Result<GpuBuffer> {
+    ALLOCATOR.with(|cell| {
+        let mut buf = cell.borrow_mut().alloc(size)?;
+        buf.set_pooled(true);
+        Ok(buf)
+    })
+}
+
+/// Return a raw GPU pointer to the caching allocator's free list.
+/// Called from `GpuBuffer::Drop` for pooled buffers. Takes raw pointer
+/// and size to avoid re-triggering Drop.
+pub fn return_to_pool(ptr: *mut u8, len: usize) {
+    ALLOCATOR.with(|cell| {
+        let mut alloc = cell.borrow_mut();
+        let bucket = bucket_size(len);
+        alloc.stats.current_allocated = alloc.stats.current_allocated.saturating_sub(len);
+        alloc.free_lists.entry(bucket).or_default().push((ptr, len));
+    });
+}
+
 /// Round up to next power-of-2, minimum 512 bytes.
 fn bucket_size(size: usize) -> usize {
    let min = 512;
--- a/crates/xserv-cuda/src/device.rs
+++ b/crates/xserv-cuda/src/device.rs
@@ -1,6 +1,7 @@
 use crate::error::{self, Result};
 use crate::ffi;
 use std::ffi::CStr;
+use std::os::raw::c_char;

 #[derive(Debug, Clone)]
 pub struct DeviceInfo {
@@ -44,10 +45,13 @@ pub fn current_device() -> Result<u32> {
 }

 pub fn device_info(device: u32) -> Result<DeviceInfo> {
-    // Get device name from cudaGetDeviceProperties (only use the name field).
-    let mut prop = unsafe { std::mem::zeroed::<ffi::CudaDeviceProp>() };
-    error::check(unsafe { ffi::cudaGetDeviceProperties(&mut prop, device as i32) })?;
-    let name = unsafe { CStr::from_ptr(prop.name.as_ptr()) }
+    // Heap-allocate oversized buffer for cudaDeviceProp (layout varies by CUDA version).
+    let mut prop_buf = vec![0u8; 16384];
+    error::check(unsafe {
+        ffi::cudaGetDeviceProperties(prop_buf.as_mut_ptr(), device as i32)
+    })?;
+    // Name is always the first field: char[256].
+    let name = unsafe { CStr::from_ptr(prop_buf.as_ptr() as *const c_char) }
        .to_string_lossy()
        .into_owned();

--- a/crates/xserv-cuda/src/ffi.rs
+++ b/crates/xserv-cuda/src/ffi.rs
@@ -11,31 +11,13 @@ pub const CUDA_MEMCPY_D2D: i32 = 3;
 pub const CUDA_SUCCESS: i32 = 0;
 pub const CUDA_ERROR_OUT_OF_MEMORY: i32 = 2;

-#[repr(C)]
-pub struct CudaDeviceProp {
-    pub name: [c_char; 256],
-    pub total_global_mem: usize,
-    pub shared_mem_per_block: usize,
-    pub regs_per_block: i32,
-    pub warp_size: i32,
-    pub max_threads_per_block: i32,
-    pub max_threads_dim: [i32; 3],
-    pub max_grid_size: [i32; 3],
-    pub clock_rate: i32,
-    pub total_const_mem: usize,
-    pub major: i32,
-    pub minor: i32,
-    // There are many more fields; we only read up to what we need.
-    // cudaDeviceProp is a large struct (~1KB). We pad the rest.
-    _pad: [u8; 4096],
-}
-
 unsafe extern "C" {
    // --- Device ---
    pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
    pub fn cudaSetDevice(device: i32) -> i32;
    pub fn cudaGetDevice(device: *mut i32) -> i32;
-    pub fn cudaGetDeviceProperties(prop: *mut CudaDeviceProp, device: i32) -> i32;
+    /// Takes a raw pointer; caller provides a heap buffer large enough for any CUDA version.
+    pub fn cudaGetDeviceProperties(prop: *mut u8, device: i32) -> i32;
    pub fn cudaDeviceSynchronize() -> i32;

    // --- Memory ---
--- a/crates/xserv-cuda/src/memory.rs
+++ b/crates/xserv-cuda/src/memory.rs
@@ -3,9 +3,18 @@ use crate::ffi;
 use crate::stream::CudaStream;

 /// RAII wrapper around a GPU memory allocation.
+///
+/// When `owned` is true (the default), dropping frees the GPU memory.
+/// A borrowed buffer (`owned = false`) does NOT free on drop — the
+/// caller must ensure the backing allocation outlives all borrows.
+///
+/// When `pooled` is true, dropping returns the buffer to the caching
+/// allocator's free list instead of calling cudaFree.
 pub struct GpuBuffer {
    ptr: *mut u8,
    len: usize,
+    owned: bool,
+    pooled: bool,
 }

 impl GpuBuffer {
@@ -13,7 +22,13 @@ impl GpuBuffer {
        assert!(len > 0, "cannot allocate 0 bytes on GPU");
        let mut ptr = std::ptr::null_mut();
        error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
-        Ok(Self { ptr, len })
+        Ok(Self { ptr, len, owned: true, pooled: false })
+    }
+
+    /// Mark this buffer as pooled (returned to caching allocator on drop)
+    /// or not. Called by `cached_alloc` after obtaining a buffer.
+    pub fn set_pooled(&mut self, pooled: bool) {
+        self.pooled = pooled;
    }

    pub fn len(&self) -> usize {
@@ -113,14 +128,29 @@ impl GpuBuffer {
    /// Reconstruct a GpuBuffer from a raw pointer + length.
    /// Safety: ptr must have been allocated with cudaMalloc, len must be correct.
    pub unsafe fn from_raw(ptr: *mut u8, len: usize) -> Self {
-        Self { ptr, len }
+        Self { ptr, len, owned: true, pooled: false }
+    }
+
+    /// Create a non-owning view of GPU memory. Dropping this buffer does NOT
+    /// call `cudaFree`. The caller must ensure the underlying allocation
+    /// outlives this borrow.
+    ///
+    /// # Safety
+    /// `ptr` must point to a valid GPU allocation of at least `len` bytes that
+    /// will remain live for the lifetime of the returned `GpuBuffer`.
+    pub unsafe fn borrow_raw(ptr: *mut u8, len: usize) -> Self {
+        Self { ptr, len, owned: false, pooled: false }
    }
 }

 impl Drop for GpuBuffer {
    fn drop(&mut self) {
-        if !self.ptr.is_null() {
-            unsafe { ffi::cudaFree(self.ptr) };
+        if self.owned && !self.ptr.is_null() {
+            if self.pooled {
+                crate::allocator::return_to_pool(self.ptr, self.len);
+            } else {
+                unsafe { ffi::cudaFree(self.ptr) };
+            }
        }
    }
 }