Files
xserv/crates/xserv-cuda/src/memory.rs
Gahow Wang 4c3f914459 kernels/cuda: paged-attention kernel, dispatch, pinned host memory
CUDA layer for the paged-KV + swap work:
- csrc: new paged_attention.cu plus updates across attention/gemm/norm/
  activation/embedding/reduce kernels and common.cuh.
- xserv-kernels: new dispatch module and kernel-binding updates.
- xserv-cuda: cudaMallocHost/FreeHost bindings + PinnedBuffer (host swap
  pool backing) and offset-aware D2H/H2D copies used to move KV blocks
  between the GPU pool and pinned host memory.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 19:58:36 +08:00

245 lines
8.0 KiB
Rust

use crate::error::{self, Result};
use crate::ffi;
use crate::stream::CudaStream;
/// Handle to a region of GPU memory whose release policy is decided by two flags.
///
/// * `owned == true`, `pooled == false`: dropping the handle calls `cudaFree`.
/// * `owned == true`, `pooled == true`: dropping the handle gives the region
///   back to the caching allocator's free list instead of calling `cudaFree`.
/// * `owned == false`: the handle is a borrowed view and dropping it releases
///   nothing; whoever holds the backing allocation must keep it alive for as
///   long as any view exists.
pub struct GpuBuffer {
    /// Address of the first byte of the region, as handed to the CUDA calls.
    ptr: *mut u8,
    /// Size of the region in bytes.
    len: usize,
    /// Whether dropping this handle releases the region at all.
    owned: bool,
    /// Whether an owned region goes back to the caching allocator on drop.
    pooled: bool,
}
impl GpuBuffer {
    /// Returns `true` when the byte range `offset .. offset + count` lies
    /// inside a region of `len` bytes.
    ///
    /// The sum is computed with `checked_add`: a plain `offset + count` wraps
    /// around when overflow checks are disabled, which would let a huge
    /// offset slip past the bounds check and reach the CUDA copy routines.
    fn range_fits(offset: usize, count: usize, len: usize) -> bool {
        matches!(offset.checked_add(count), Some(end) if end <= len)
    }

    /// Allocates `len` bytes with `cudaMalloc` and returns an owned,
    /// non-pooled buffer.
    ///
    /// # Panics
    /// Panics if `len` is 0.
    ///
    /// # Errors
    /// Propagates whatever `error::check` reports for the `cudaMalloc` status.
    pub fn alloc(len: usize) -> Result<Self> {
        assert!(len > 0, "cannot allocate 0 bytes on GPU");
        let mut ptr = std::ptr::null_mut();
        // SAFETY: `ptr` is a live local the call writes the new address into.
        error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
        Ok(Self { ptr, len, owned: true, pooled: false })
    }

    /// Mark this buffer as pooled (returned to the caching allocator on drop)
    /// or not. Called by `cached_alloc` after obtaining a buffer.
    pub fn set_pooled(&mut self, pooled: bool) {
        self.pooled = pooled;
    }

    /// Size of the buffer in bytes.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` when the buffer covers zero bytes.
    ///
    /// `alloc` never produces such a buffer, but `from_raw` and `borrow_raw`
    /// accept any length.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Raw pointer to the start of the buffer, for read-only use.
    pub fn as_ptr(&self) -> *const u8 {
        self.ptr
    }

    /// Raw pointer to the start of the buffer, for mutation.
    pub fn as_mut_ptr(&mut self) -> *mut u8 {
        self.ptr
    }

    /// Copies the whole host slice `src` to the start of this buffer with a
    /// blocking `cudaMemcpy` (H2D).
    ///
    /// # Panics
    /// Panics if `src` is longer than the buffer.
    pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
        assert!(src.len() <= self.len, "source larger than buffer");
        // SAFETY: the assert above keeps the write inside this buffer and
        // `src` is a live slice of exactly `src.len()` bytes.
        error::check(unsafe {
            ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
        })
    }

    /// Copies the first `dst.len()` bytes of this buffer into the host slice
    /// `dst` with a blocking `cudaMemcpy` (D2H).
    ///
    /// # Panics
    /// Panics if `dst` is longer than the buffer.
    pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
        assert!(dst.len() <= self.len, "destination larger than buffer");
        // SAFETY: the assert above keeps the read inside this buffer and
        // `dst` is a live, exclusively borrowed slice of `dst.len()` bytes.
        error::check(unsafe {
            ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
        })
    }

    /// Async copy from host to device on the given stream.
    ///
    /// # Safety
    /// `src` must remain valid until the stream operation completes.
    ///
    /// # Panics
    /// Panics if `src` is longer than the buffer.
    pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
        assert!(src.len() <= self.len, "source larger than buffer");
        // SAFETY: the range is checked above; the caller promises `src`
        // outlives the enqueued copy.
        error::check(unsafe {
            ffi::cudaMemcpyAsync(
                self.ptr,
                src.as_ptr(),
                src.len(),
                ffi::CUDA_MEMCPY_H2D,
                stream.as_raw(),
            )
        })
    }

    /// Async copy from device to host on the given stream.
    ///
    /// # Safety
    /// `dst` must remain valid until the stream operation completes.
    ///
    /// # Panics
    /// Panics if `dst` is longer than the buffer.
    pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
        assert!(dst.len() <= self.len, "destination larger than buffer");
        // SAFETY: the range is checked above; the caller promises `dst`
        // outlives the enqueued copy.
        error::check(unsafe {
            ffi::cudaMemcpyAsync(
                dst.as_mut_ptr(),
                self.ptr,
                dst.len(),
                ffi::CUDA_MEMCPY_D2H,
                stream.as_raw(),
            )
        })
    }

    /// Copy from another GPU buffer (D2D).
    ///
    /// Copies `min(src.len(), self.len())` bytes starting at offset 0 of both
    /// buffers. A size mismatch is not reported: the copy is silently
    /// truncated to the shorter of the two.
    pub fn copy_from_device(&mut self, src: &GpuBuffer) -> Result<()> {
        let n = src.len.min(self.len);
        // SAFETY: `n` does not exceed the length of either buffer.
        error::check(unsafe { ffi::cudaMemcpy(self.ptr, src.ptr, n, ffi::CUDA_MEMCPY_D2D) })
    }

    /// Fill buffer with zeros.
    pub fn zero(&mut self) -> Result<()> {
        // SAFETY: the fill covers exactly the `len` bytes this buffer describes.
        error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
    }

    /// Copy `count` bytes from `src` buffer at `src_offset` to this buffer at `dst_offset`.
    ///
    /// # Panics
    /// Panics if either byte range falls outside its buffer, including when
    /// `offset + count` overflows `usize`.
    pub fn copy_from_device_at(
        &mut self,
        src: &GpuBuffer,
        src_offset: usize,
        dst_offset: usize,
        count: usize,
    ) -> Result<()> {
        assert!(Self::range_fits(src_offset, count, src.len), "src range out of bounds");
        assert!(Self::range_fits(dst_offset, count, self.len), "dst range out of bounds");
        // SAFETY: both ranges were just checked against their buffers, so the
        // offset pointers stay inside the respective regions.
        error::check(unsafe {
            ffi::cudaMemcpy(
                self.ptr.add(dst_offset),
                src.ptr.add(src_offset),
                count,
                ffi::CUDA_MEMCPY_D2D,
            )
        })
    }

    /// Async copy `count` bytes from `src` at `src_offset` to `self` at `dst_offset` on `stream`.
    ///
    /// NOTE(review): this is a safe fn although the copy is only enqueued;
    /// presumably both buffers must stay allocated until `stream` has run
    /// it. Confirm how pooled buffers returned early interact with this.
    ///
    /// # Panics
    /// Panics if either byte range falls outside its buffer, including when
    /// `offset + count` overflows `usize`.
    pub fn copy_from_device_at_async(
        &mut self,
        src: &GpuBuffer,
        src_offset: usize,
        dst_offset: usize,
        count: usize,
        stream: &CudaStream,
    ) -> Result<()> {
        assert!(Self::range_fits(src_offset, count, src.len), "src range out of bounds");
        assert!(Self::range_fits(dst_offset, count, self.len), "dst range out of bounds");
        // SAFETY: both ranges were just checked against their buffers.
        error::check(unsafe {
            ffi::cudaMemcpyAsync(
                self.ptr.add(dst_offset),
                src.ptr.add(src_offset),
                count,
                ffi::CUDA_MEMCPY_D2D,
                stream.as_raw(),
            )
        })
    }

    /// Copy `count` bytes from this GPU buffer at `src_offset` to a host slice (D2H).
    ///
    /// The bytes land at the start of `dst`.
    ///
    /// # Panics
    /// Panics if `src_offset .. src_offset + count` is outside this buffer
    /// (overflow included) or if `dst` holds fewer than `count` bytes.
    pub fn copy_to_host_at(&self, dst: &mut [u8], src_offset: usize, count: usize) -> Result<()> {
        assert!(Self::range_fits(src_offset, count, self.len), "src range out of bounds");
        assert!(count <= dst.len(), "host dst too small");
        // SAFETY: the device range and the host capacity were checked above.
        error::check(unsafe {
            ffi::cudaMemcpy(
                dst.as_mut_ptr(),
                self.ptr.add(src_offset),
                count,
                ffi::CUDA_MEMCPY_D2H,
            )
        })
    }

    /// Copy `count` bytes from a host slice to this GPU buffer at `dst_offset` (H2D).
    ///
    /// The bytes are taken from the start of `src`.
    ///
    /// # Panics
    /// Panics if `dst_offset .. dst_offset + count` is outside this buffer
    /// (overflow included) or if `src` holds fewer than `count` bytes.
    pub fn copy_from_host_at(&mut self, src: &[u8], dst_offset: usize, count: usize) -> Result<()> {
        assert!(Self::range_fits(dst_offset, count, self.len), "dst range out of bounds");
        assert!(count <= src.len(), "host src too small");
        // SAFETY: the device range and the host length were checked above.
        error::check(unsafe {
            ffi::cudaMemcpy(
                self.ptr.add(dst_offset),
                src.as_ptr(),
                count,
                ffi::CUDA_MEMCPY_H2D,
            )
        })
    }

    /// Async zero fill on stream.
    pub fn zero_async(&mut self, stream: &CudaStream) -> Result<()> {
        // SAFETY: the fill covers exactly the `len` bytes this buffer describes.
        error::check(unsafe { ffi::cudaMemsetAsync(self.ptr, 0, self.len, stream.as_raw()) })
    }

    /// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
    /// Caller is responsible for eventually calling cudaFree.
    ///
    /// The `owned` and `pooled` flags are discarded. `from_raw` always builds
    /// an owned, non-pooled buffer, so sending a borrowed or pooled buffer
    /// through `into_raw` and then `from_raw` changes how the memory is
    /// released on drop.
    pub fn into_raw(self) -> (*mut u8, usize) {
        let ptr = self.ptr;
        let len = self.len;
        // Skip `Drop` so the allocation is neither freed nor returned to the pool.
        std::mem::forget(self);
        (ptr, len)
    }

    /// Reconstruct a GpuBuffer from a raw pointer + length.
    ///
    /// The result is owned and not pooled, so dropping it calls `cudaFree`
    /// on `ptr` unless `ptr` is null.
    ///
    /// # Safety
    /// `ptr` must have been allocated with cudaMalloc, `len` must be correct,
    /// and nothing else may free the same allocation.
    pub unsafe fn from_raw(ptr: *mut u8, len: usize) -> Self {
        Self { ptr, len, owned: true, pooled: false }
    }

    /// Create a non-owning view of GPU memory. Dropping this buffer does NOT
    /// call `cudaFree`. The caller must ensure the underlying allocation
    /// outlives this borrow.
    ///
    /// # Safety
    /// `ptr` must point to a valid GPU allocation of at least `len` bytes that
    /// will remain live for the lifetime of the returned `GpuBuffer`.
    pub unsafe fn borrow_raw(ptr: *mut u8, len: usize) -> Self {
        Self { ptr, len, owned: false, pooled: false }
    }
}
impl Drop for GpuBuffer {
    /// Releases the region when this handle is responsible for it.
    ///
    /// Borrowed views (`owned == false`) and null pointers are left alone.
    /// A pooled buffer is handed to `crate::allocator::return_to_pool`; any
    /// other owned buffer goes to `cudaFree`, whose status code is ignored.
    fn drop(&mut self) {
        if !self.owned || self.ptr.is_null() {
            return;
        }
        if !self.pooled {
            unsafe { ffi::cudaFree(self.ptr) };
            return;
        }
        crate::allocator::return_to_pool(self.ptr, self.len);
    }
}
// Opts `GpuBuffer` into `Send`. The raw `ptr` field makes the type `!Send` by
// default, so moving a buffer to another thread requires this impl.
// NOTE(review): soundness assumes the pointer may be used and released from
// any thread — confirm for `ffi::cudaFree` and `crate::allocator::return_to_pool`.
unsafe impl Send for GpuBuffer {}
/// Host-side buffer of pinned (page-locked) memory, meant to make H2D/D2H
/// transfers faster.
///
/// The memory is obtained from `cudaMallocHost` and released with
/// `cudaFreeHost` when the buffer is dropped.
pub struct PinnedBuffer {
    /// Address of the first byte of the host allocation.
    ptr: *mut u8,
    /// Size of the allocation in bytes.
    len: usize,
}
impl PinnedBuffer {
    /// Allocates `len` bytes of host memory with `cudaMallocHost`.
    ///
    /// NOTE(review): nothing here initializes the bytes. Unless
    /// `cudaMallocHost` zeroes them, reading through `as_slice` before
    /// writing observes uninitialized memory — confirm, or zero the
    /// allocation here.
    ///
    /// # Errors
    /// Propagates whatever `error::check` reports for the `cudaMallocHost` status.
    pub fn alloc(len: usize) -> Result<Self> {
        let mut ptr = std::ptr::null_mut();
        // SAFETY: `ptr` is a live local the call writes the new address into.
        error::check(unsafe { ffi::cudaMallocHost(&mut ptr, len) })?;
        Ok(Self { ptr, len })
    }

    /// Borrows the whole buffer as a byte slice.
    ///
    /// Returns an empty slice when the stored pointer is null.
    /// `slice::from_raw_parts` requires a non-null pointer even for
    /// zero-length slices, and `alloc` does not reject `len == 0`, so a null
    /// pointer cannot be ruled out here.
    pub fn as_slice(&self) -> &[u8] {
        if self.ptr.is_null() {
            return &[];
        }
        // SAFETY: `ptr` is non-null and, per `alloc`, refers to `len` bytes
        // that stay allocated until `self` is dropped.
        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
    }

    /// Borrows the whole buffer as a mutable byte slice.
    ///
    /// Returns an empty slice when the stored pointer is null, for the same
    /// reason as [`PinnedBuffer::as_slice`].
    pub fn as_mut_slice(&mut self) -> &mut [u8] {
        if self.ptr.is_null() {
            return &mut [];
        }
        // SAFETY: as in `as_slice`; `&mut self` guarantees exclusive access.
        unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
    }

    /// Size of the buffer in bytes.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` when the buffer covers zero bytes.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }
}
impl Drop for PinnedBuffer {
    /// Hands the host allocation back through `cudaFreeHost`.
    ///
    /// A null pointer is skipped, and the status code of the call is ignored.
    fn drop(&mut self) {
        if self.ptr.is_null() {
            return;
        }
        unsafe { ffi::cudaFreeHost(self.ptr) };
    }
}
// Opts `PinnedBuffer` into `Send`. The raw `ptr` field makes the type `!Send`
// by default, so moving a buffer to another thread requires this impl.
// NOTE(review): soundness assumes the allocation may be accessed and passed to
// `ffi::cudaFreeHost` from a thread other than the one that allocated it — confirm.
unsafe impl Send for PinnedBuffer {}