cuda: device caching allocator (pool GpuBuffer alloc)
Every tape op allocates its output via Tensor::zeros -> GpuBuffer::alloc -> cudaMalloc, a synchronous process-serialized driver call. Under the single- process thread-per-GPU DDP model the rank threads' hundreds of per-step allocs serialize through the driver (KI-5 root cause); it costs single-GPU too. Add a per-device, size-classed caching pool: GpuBuffer::alloc serves from a free-list (request rounded up to a size class so repeating training shapes reuse buffers), only cudaMalloc on a miss; Drop returns the buffer to the pool instead of cudaFree. Thread-safe via a global registry keyed by device id with each device's free-list behind its own Mutex (registry lock held only to clone out the per-device Arc<Mutex<_>>, so rank threads don't contend across devices). The buffer records its alloc-time device so Drop returns to the right pool. Transparent: physical capacity may be rounded up, but len()/memset/copy bounds all use the requested length, so the rounded tail is never read and numerics are unchanged. zeros() still memsets (reused buffers hold stale bytes). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -13,6 +13,7 @@ unsafe extern "C" {
|
||||
// --- Device ---
|
||||
pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
|
||||
pub fn cudaSetDevice(device: i32) -> i32;
|
||||
pub fn cudaGetDevice(device: *mut i32) -> i32;
|
||||
pub fn cudaDeviceSynchronize() -> i32;
|
||||
|
||||
// --- Memory ---
|
||||
|
||||
@@ -4,6 +4,7 @@ pub mod device;
|
||||
pub mod error;
|
||||
pub mod ffi;
|
||||
pub mod memory;
|
||||
mod pool;
|
||||
|
||||
pub use error::{CudaError, Result};
|
||||
pub use memory::GpuBuffer;
|
||||
|
||||
@@ -1,18 +1,37 @@
|
||||
use crate::error::{self, Result};
|
||||
use crate::ffi;
|
||||
use crate::pool;
|
||||
|
||||
/// RAII wrapper around a GPU memory allocation. Dropping frees the memory.
|
||||
/// RAII wrapper around a GPU memory allocation. Dropping returns the buffer to
|
||||
/// the per-device caching pool (see [`crate::pool`]) for reuse instead of
|
||||
/// calling `cudaFree`.
|
||||
///
|
||||
/// `len` is the logical (requested) length used for all copy/memset bounds and
|
||||
/// exposed via [`GpuBuffer::len`]; `cap` is the physical size class the pool
|
||||
/// rounded up to (>= `len`), used only to bucket the buffer for reuse. The
|
||||
/// extra `cap - len` bytes are never exposed to callers, so pooling is
|
||||
/// numerically transparent. `device` records which device pool to return to.
|
||||
pub struct GpuBuffer {
|
||||
ptr: *mut u8,
|
||||
len: usize,
|
||||
cap: usize,
|
||||
device: i32,
|
||||
}
|
||||
|
||||
impl GpuBuffer {
|
||||
/// Allocate at least `len` bytes on the calling thread's current device,
|
||||
/// reusing a pooled buffer when one of the matching size class is free.
|
||||
/// The contents are **uninitialized** (a reused buffer holds stale bytes);
|
||||
/// callers that need zeros must memset (see [`crate::Storage::zeros`]).
|
||||
pub fn alloc(len: usize) -> Result<Self> {
|
||||
assert!(len > 0, "cannot allocate 0 bytes on GPU");
|
||||
let mut ptr = std::ptr::null_mut();
|
||||
error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
|
||||
Ok(Self { ptr, len })
|
||||
let a = pool::acquire(len)?;
|
||||
Ok(Self {
|
||||
ptr: a.ptr,
|
||||
len,
|
||||
cap: a.cap,
|
||||
device: a.device,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
@@ -56,9 +75,10 @@ impl GpuBuffer {
|
||||
|
||||
impl Drop for GpuBuffer {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
unsafe { ffi::cudaFree(self.ptr) };
|
||||
}
|
||||
// Return to the device pool for reuse (no cudaFree). The pool retains
|
||||
// the raw pointer for the process lifetime; on process exit the OS
|
||||
// reclaims the device context, so this is not a leak.
|
||||
pool::release(self.ptr, self.device, self.cap);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
124
crates/xtrain-cuda/src/pool.rs
Normal file
124
crates/xtrain-cuda/src/pool.rs
Normal file
@@ -0,0 +1,124 @@
|
||||
//! Device caching / pool allocator (Phase T11, KI-5).
|
||||
//!
|
||||
//! Every tape op allocates its output buffer via [`crate::GpuBuffer::alloc`],
|
||||
//! which used to call `cudaMalloc` + (for `zeros`) `cudaMemset` on *every* op.
|
||||
//! `cudaMalloc`/`cudaFree` are synchronous, process-serialized driver calls; in
|
||||
//! the single-process thread-per-GPU DDP model the rank threads' hundreds of
|
||||
//! per-step allocations queue through the driver and serialize (KI-5). The cost
|
||||
//! hurts single-GPU too.
|
||||
//!
|
||||
//! Fix: cache freed device buffers in a per-device, size-classed free list and
|
||||
//! reuse them. Training has repeating shapes, so after warm-up the steady-state
|
||||
//! `cudaMalloc` count per step is ~0. The pool is **transparent**: a `GpuBuffer`
|
||||
//! handed out from the pool exposes exactly the bytes the caller requested (the
|
||||
//! physical allocation may be rounded up to its size class, but `len()` and all
|
||||
//! copy/memset bounds use the requested length), so numerics are unchanged.
|
||||
//!
|
||||
//! Thread-safety: DDP runs thread-per-GPU in one process. The pool is a global
|
||||
//! registry keyed by device id; each device's free list lives behind its own
|
||||
//! `Mutex`. A buffer remembers which device it was allocated on (the thread's
|
||||
//! current CUDA device at `alloc` time) so `Drop` returns it to the right pool.
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::ffi;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex, OnceLock};
|
||||
|
||||
/// Allocation granularity. Requests are rounded *up* to a size class so that
|
||||
/// op outputs of the same shape (the common case in training) land in the same
|
||||
/// free list and are reused across steps.
|
||||
///
|
||||
/// Small allocations round up to a multiple of `MIN_CLASS`; larger ones round
|
||||
/// up to the next power of two. Powers of two keep the number of distinct
|
||||
/// classes bounded (so the free lists stay shallow) while wasting at most ~2×
|
||||
/// per buffer — fine for fixed-shape training, and freed memory is reused, not
|
||||
/// leaked.
|
||||
const MIN_CLASS: usize = 512;
|
||||
/// Below this threshold, round up to a multiple of `MIN_CLASS` (fine-grained);
|
||||
/// at or above it, round up to the next power of two.
|
||||
const POW2_THRESHOLD: usize = 1 << 20; // 1 MiB
|
||||
|
||||
/// Round a byte length up to its size class (the physical allocation size).
|
||||
fn size_class(len: usize) -> usize {
|
||||
debug_assert!(len > 0);
|
||||
if len <= POW2_THRESHOLD {
|
||||
len.div_ceil(MIN_CLASS) * MIN_CLASS
|
||||
} else {
|
||||
len.next_power_of_two()
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-device free list: size class -> stack of cached raw device pointers.
|
||||
#[derive(Default)]
|
||||
struct DevicePool {
|
||||
free: HashMap<usize, Vec<*mut u8>>,
|
||||
}
|
||||
|
||||
// The raw pointers are device addresses, only ever dereferenced by the GPU.
|
||||
// They are guarded by a `Mutex` and moved between threads as plain handles.
|
||||
unsafe impl Send for DevicePool {}
|
||||
|
||||
type SharedPool = Arc<Mutex<DevicePool>>;
|
||||
|
||||
fn registry() -> &'static Mutex<HashMap<i32, SharedPool>> {
|
||||
static REGISTRY: OnceLock<Mutex<HashMap<i32, SharedPool>>> = OnceLock::new();
|
||||
REGISTRY.get_or_init(|| Mutex::new(HashMap::new()))
|
||||
}
|
||||
|
||||
/// The CUDA device the calling thread is currently set to. DDP sets this once
|
||||
/// per rank-thread, so it identifies which pool to use.
|
||||
fn current_device() -> Result<i32> {
|
||||
let mut dev = 0i32;
|
||||
error::check(unsafe { ffi::cudaGetDevice(&mut dev) })?;
|
||||
Ok(dev)
|
||||
}
|
||||
|
||||
/// Run `f` with the (locked) pool for `device`, creating it on first use. The
|
||||
/// registry mutex is held only long enough to clone out this device's
|
||||
/// `Arc<Mutex<DevicePool>>`, so different devices' threads don't contend on the
|
||||
/// per-device free list — true per-rank concurrency.
|
||||
fn with_device_pool<R>(device: i32, f: impl FnOnce(&mut DevicePool) -> R) -> R {
|
||||
let pool = {
|
||||
let mut reg = registry().lock().unwrap();
|
||||
reg.entry(device).or_default().clone()
|
||||
};
|
||||
let mut guard = pool.lock().unwrap();
|
||||
f(&mut guard)
|
||||
}
|
||||
|
||||
/// Allocation served by the pool: a raw device pointer plus the device it lives
|
||||
/// on and the size class (capacity) of the physical buffer.
|
||||
pub(crate) struct PoolAlloc {
|
||||
pub ptr: *mut u8,
|
||||
pub device: i32,
|
||||
pub cap: usize,
|
||||
}
|
||||
|
||||
/// Acquire a buffer of at least `len` bytes for the calling thread's current
|
||||
/// device. Reuses a cached buffer of the matching size class if one is free,
|
||||
/// otherwise `cudaMalloc`s a fresh one of the size-class capacity.
|
||||
pub(crate) fn acquire(len: usize) -> Result<PoolAlloc> {
|
||||
let cap = size_class(len);
|
||||
let device = current_device()?;
|
||||
|
||||
let cached = with_device_pool(device, |pool| {
|
||||
pool.free.get_mut(&cap).and_then(|stack| stack.pop())
|
||||
});
|
||||
if let Some(ptr) = cached {
|
||||
return Ok(PoolAlloc { ptr, device, cap });
|
||||
}
|
||||
|
||||
let mut ptr = std::ptr::null_mut();
|
||||
error::check(unsafe { ffi::cudaMalloc(&mut ptr, cap) })?;
|
||||
Ok(PoolAlloc { ptr, device, cap })
|
||||
}
|
||||
|
||||
/// Return a buffer to its device's free list for reuse. Does NOT `cudaFree`.
|
||||
pub(crate) fn release(ptr: *mut u8, device: i32, cap: usize) {
|
||||
if ptr.is_null() {
|
||||
return;
|
||||
}
|
||||
with_device_pool(device, |pool| {
|
||||
pool.free.entry(cap).or_default().push(ptr);
|
||||
});
|
||||
}
|
||||
Reference in New Issue
Block a user