cuda: device caching allocator (pool GpuBuffer alloc)

Every tape op allocates its output via Tensor::zeros -> GpuBuffer::alloc -> cudaMalloc, a synchronous process-serialized driver call. Under the single- process thread-per-GPU DDP model the rank threads' hundreds of per-step allocs serialize through the driver (KI-5 root cause); it costs single-GPU too. Add a per-device, size-classed caching pool: GpuBuffer::alloc serves from a free-list (request rounded up to a size class so repeating training shapes reuse buffers), only cudaMalloc on a miss; Drop returns the buffer to the pool instead of cudaFree. Thread-safe via a global registry keyed by device id with each device's free-list behind its own Mutex (registry lock held only to clone out the per-device Arc<Mutex<_>>, so rank threads don't contend across devices). The buffer records its alloc-time device so Drop returns to the right pool. Transparent: physical capacity may be rounded up, but len()/memset/copy bounds all use the requested length, so the rounded tail is never read and numerics are unchanged. zeros() still memsets (reused buffers hold stale bytes). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 11:04:02 +08:00
parent d422c68704
commit 28801fbfe5
4 changed files with 153 additions and 7 deletions
--- a/crates/xtrain-cuda/src/ffi.rs
+++ b/crates/xtrain-cuda/src/ffi.rs
@@ -13,6 +13,7 @@ unsafe extern "C" {
    // --- Device ---
    pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
    pub fn cudaSetDevice(device: i32) -> i32;
+    pub fn cudaGetDevice(device: *mut i32) -> i32;
    pub fn cudaDeviceSynchronize() -> i32;

    // --- Memory ---
--- a/crates/xtrain-cuda/src/lib.rs
+++ b/crates/xtrain-cuda/src/lib.rs
@@ -4,6 +4,7 @@ pub mod device;
 pub mod error;
 pub mod ffi;
 pub mod memory;
+mod pool;

 pub use error::{CudaError, Result};
 pub use memory::GpuBuffer;
--- a/crates/xtrain-cuda/src/memory.rs
+++ b/crates/xtrain-cuda/src/memory.rs
@@ -1,18 +1,37 @@
 use crate::error::{self, Result};
 use crate::ffi;
+use crate::pool;

-/// RAII wrapper around a GPU memory allocation. Dropping frees the memory.
+/// RAII wrapper around a GPU memory allocation. Dropping returns the buffer to
+/// the per-device caching pool (see [`crate::pool`]) for reuse instead of
+/// calling `cudaFree`.
+///
+/// `len` is the logical (requested) length used for all copy/memset bounds and
+/// exposed via [`GpuBuffer::len`]; `cap` is the physical size class the pool
+/// rounded up to (>= `len`), used only to bucket the buffer for reuse. The
+/// extra `cap - len` bytes are never exposed to callers, so pooling is
+/// numerically transparent. `device` records which device pool to return to.
 pub struct GpuBuffer {
    ptr: *mut u8,
    len: usize,
+    cap: usize,
+    device: i32,
 }

 impl GpuBuffer {
+    /// Allocate at least `len` bytes on the calling thread's current device,
+    /// reusing a pooled buffer when one of the matching size class is free.
+    /// The contents are **uninitialized** (a reused buffer holds stale bytes);
+    /// callers that need zeros must memset (see [`crate::Storage::zeros`]).
    pub fn alloc(len: usize) -> Result<Self> {
        assert!(len > 0, "cannot allocate 0 bytes on GPU");
-        let mut ptr = std::ptr::null_mut();
-        error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
-        Ok(Self { ptr, len })
+        let a = pool::acquire(len)?;
+        Ok(Self {
+            ptr: a.ptr,
+            len,
+            cap: a.cap,
+            device: a.device,
+        })
    }

    pub fn len(&self) -> usize {
@@ -56,9 +75,10 @@ impl GpuBuffer {

 impl Drop for GpuBuffer {
    fn drop(&mut self) {
-        if !self.ptr.is_null() {
-            unsafe { ffi::cudaFree(self.ptr) };
-        }
+        // Return to the device pool for reuse (no cudaFree). The pool retains
+        // the raw pointer for the process lifetime; on process exit the OS
+        // reclaims the device context, so this is not a leak.
+        pool::release(self.ptr, self.device, self.cap);
    }
 }

--- a/crates/xtrain-cuda/src/pool.rs
+++ b/crates/xtrain-cuda/src/pool.rs
@@ -0,0 +1,124 @@
+//! Device caching / pool allocator (Phase T11, KI-5).
+//!
+//! Every tape op allocates its output buffer via [`crate::GpuBuffer::alloc`],
+//! which used to call `cudaMalloc` + (for `zeros`) `cudaMemset` on *every* op.
+//! `cudaMalloc`/`cudaFree` are synchronous, process-serialized driver calls; in
+//! the single-process thread-per-GPU DDP model the rank threads' hundreds of
+//! per-step allocations queue through the driver and serialize (KI-5). The cost
+//! hurts single-GPU too.
+//!
+//! Fix: cache freed device buffers in a per-device, size-classed free list and
+//! reuse them. Training has repeating shapes, so after warm-up the steady-state
+//! `cudaMalloc` count per step is ~0. The pool is **transparent**: a `GpuBuffer`
+//! handed out from the pool exposes exactly the bytes the caller requested (the
+//! physical allocation may be rounded up to its size class, but `len()` and all
+//! copy/memset bounds use the requested length), so numerics are unchanged.
+//!
+//! Thread-safety: DDP runs thread-per-GPU in one process. The pool is a global
+//! registry keyed by device id; each device's free list lives behind its own
+//! `Mutex`. A buffer remembers which device it was allocated on (the thread's
+//! current CUDA device at `alloc` time) so `Drop` returns it to the right pool.
+
+use crate::error::{self, Result};
+use crate::ffi;
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex, OnceLock};
+
+/// Allocation granularity. Requests are rounded *up* to a size class so that
+/// op outputs of the same shape (the common case in training) land in the same
+/// free list and are reused across steps.
+///
+/// Small allocations round up to a multiple of `MIN_CLASS`; larger ones round
+/// up to the next power of two. Powers of two keep the number of distinct
+/// classes bounded (so the free lists stay shallow) while wasting at most ~2×
+/// per buffer — fine for fixed-shape training, and freed memory is reused, not
+/// leaked.
+const MIN_CLASS: usize = 512;
+/// Below this threshold, round up to a multiple of `MIN_CLASS` (fine-grained);
+/// at or above it, round up to the next power of two.
+const POW2_THRESHOLD: usize = 1 << 20; // 1 MiB
+
+/// Round a byte length up to its size class (the physical allocation size).
+fn size_class(len: usize) -> usize {
+    debug_assert!(len > 0);
+    if len <= POW2_THRESHOLD {
+        len.div_ceil(MIN_CLASS) * MIN_CLASS
+    } else {
+        len.next_power_of_two()
+    }
+}
+
+/// Per-device free list: size class -> stack of cached raw device pointers.
+#[derive(Default)]
+struct DevicePool {
+    free: HashMap<usize, Vec<*mut u8>>,
+}
+
+// The raw pointers are device addresses, only ever dereferenced by the GPU.
+// They are guarded by a `Mutex` and moved between threads as plain handles.
+unsafe impl Send for DevicePool {}
+
+type SharedPool = Arc<Mutex<DevicePool>>;
+
+fn registry() -> &'static Mutex<HashMap<i32, SharedPool>> {
+    static REGISTRY: OnceLock<Mutex<HashMap<i32, SharedPool>>> = OnceLock::new();
+    REGISTRY.get_or_init(|| Mutex::new(HashMap::new()))
+}
+
+/// The CUDA device the calling thread is currently set to. DDP sets this once
+/// per rank-thread, so it identifies which pool to use.
+fn current_device() -> Result<i32> {
+    let mut dev = 0i32;
+    error::check(unsafe { ffi::cudaGetDevice(&mut dev) })?;
+    Ok(dev)
+}
+
+/// Run `f` with the (locked) pool for `device`, creating it on first use. The
+/// registry mutex is held only long enough to clone out this device's
+/// `Arc<Mutex<DevicePool>>`, so different devices' threads don't contend on the
+/// per-device free list — true per-rank concurrency.
+fn with_device_pool<R>(device: i32, f: impl FnOnce(&mut DevicePool) -> R) -> R {
+    let pool = {
+        let mut reg = registry().lock().unwrap();
+        reg.entry(device).or_default().clone()
+    };
+    let mut guard = pool.lock().unwrap();
+    f(&mut guard)
+}
+
+/// Allocation served by the pool: a raw device pointer plus the device it lives
+/// on and the size class (capacity) of the physical buffer.
+pub(crate) struct PoolAlloc {
+    pub ptr: *mut u8,
+    pub device: i32,
+    pub cap: usize,
+}
+
+/// Acquire a buffer of at least `len` bytes for the calling thread's current
+/// device. Reuses a cached buffer of the matching size class if one is free,
+/// otherwise `cudaMalloc`s a fresh one of the size-class capacity.
+pub(crate) fn acquire(len: usize) -> Result<PoolAlloc> {
+    let cap = size_class(len);
+    let device = current_device()?;
+
+    let cached = with_device_pool(device, |pool| {
+        pool.free.get_mut(&cap).and_then(|stack| stack.pop())
+    });
+    if let Some(ptr) = cached {
+        return Ok(PoolAlloc { ptr, device, cap });
+    }
+
+    let mut ptr = std::ptr::null_mut();
+    error::check(unsafe { ffi::cudaMalloc(&mut ptr, cap) })?;
+    Ok(PoolAlloc { ptr, device, cap })
+}
+
+/// Return a buffer to its device's free list for reuse. Does NOT `cudaFree`.
+pub(crate) fn release(ptr: *mut u8, device: i32, cap: usize) {
+    if ptr.is_null() {
+        return;
+    }
+    with_device_pool(device, |pool| {
+        pool.free.entry(cap).or_default().push(ptr);
+    });
+}