269 lines
8.2 KiB
Rust
269 lines
8.2 KiB
Rust
use crate::error::{self, Result};
|
|
use crate::ffi;
|
|
use crate::stream::CudaStream;
|
|
|
|
/// RAII wrapper around a GPU memory allocation.
///
/// What happens when the value is dropped depends on two flags:
///
/// * `owned == true` (the default): the allocation is released on drop.
/// * `owned == false` (a borrowed view): drop does nothing, so the caller
///   must ensure the backing allocation outlives every borrow.
/// * `pooled == true`: an owned buffer is handed back to the caching
///   allocator's free list instead of being passed to `cudaFree`.
#[derive(Debug)]
pub struct GpuBuffer {
    /// Device address of the first byte of the allocation.
    ptr: *mut u8,
    /// Size of the allocation in bytes.
    len: usize,
    /// Whether dropping this value releases the allocation.
    owned: bool,
    /// Whether release goes to the caching allocator rather than `cudaFree`.
    pooled: bool,
}
|
|
|
|
impl GpuBuffer {
|
|
pub fn alloc(len: usize) -> Result<Self> {
|
|
assert!(len > 0, "cannot allocate 0 bytes on GPU");
|
|
let mut ptr = std::ptr::null_mut();
|
|
error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
|
|
Ok(Self {
|
|
ptr,
|
|
len,
|
|
owned: true,
|
|
pooled: false,
|
|
})
|
|
}
|
|
|
|
/// Mark this buffer as pooled (returned to caching allocator on drop)
|
|
/// or not. Called by `cached_alloc` after obtaining a buffer.
|
|
pub fn set_pooled(&mut self, pooled: bool) {
|
|
self.pooled = pooled;
|
|
}
|
|
|
|
pub fn len(&self) -> usize {
|
|
self.len
|
|
}
|
|
|
|
pub fn as_ptr(&self) -> *const u8 {
|
|
self.ptr
|
|
}
|
|
|
|
pub fn as_mut_ptr(&mut self) -> *mut u8 {
|
|
self.ptr
|
|
}
|
|
|
|
/// Copy data from host (CPU) slice to this GPU buffer.
|
|
pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
|
|
assert!(src.len() <= self.len, "source larger than buffer");
|
|
error::check(unsafe {
|
|
ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
|
|
})
|
|
}
|
|
|
|
/// Copy data from this GPU buffer to a host (CPU) slice.
|
|
pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
|
|
assert!(dst.len() <= self.len, "destination larger than buffer");
|
|
error::check(unsafe {
|
|
ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
|
|
})
|
|
}
|
|
|
|
/// Async copy from host to device on the given stream.
|
|
/// Safety: `src` must remain valid until the stream operation completes.
|
|
pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
|
|
assert!(src.len() <= self.len);
|
|
unsafe {
|
|
error::check(ffi::cudaMemcpyAsync(
|
|
self.ptr,
|
|
src.as_ptr(),
|
|
src.len(),
|
|
ffi::CUDA_MEMCPY_H2D,
|
|
stream.as_raw(),
|
|
))
|
|
}
|
|
}
|
|
|
|
/// Async copy from device to host on the given stream.
|
|
/// Safety: `dst` must remain valid until the stream operation completes.
|
|
pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
|
|
assert!(dst.len() <= self.len);
|
|
unsafe {
|
|
error::check(ffi::cudaMemcpyAsync(
|
|
dst.as_mut_ptr(),
|
|
self.ptr,
|
|
dst.len(),
|
|
ffi::CUDA_MEMCPY_D2H,
|
|
stream.as_raw(),
|
|
))
|
|
}
|
|
}
|
|
|
|
/// Copy from another GPU buffer (D2D).
|
|
pub fn copy_from_device(&mut self, src: &GpuBuffer) -> Result<()> {
|
|
let n = src.len.min(self.len);
|
|
error::check(unsafe { ffi::cudaMemcpy(self.ptr, src.ptr, n, ffi::CUDA_MEMCPY_D2D) })
|
|
}
|
|
|
|
/// Fill buffer with zeros.
|
|
pub fn zero(&mut self) -> Result<()> {
|
|
error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
|
|
}
|
|
|
|
/// Copy `count` bytes from `src` buffer at `src_offset` to this buffer at `dst_offset`.
|
|
pub fn copy_from_device_at(
|
|
&mut self,
|
|
src: &GpuBuffer,
|
|
src_offset: usize,
|
|
dst_offset: usize,
|
|
count: usize,
|
|
) -> Result<()> {
|
|
assert!(src_offset + count <= src.len);
|
|
assert!(dst_offset + count <= self.len);
|
|
error::check(unsafe {
|
|
ffi::cudaMemcpy(
|
|
self.ptr.add(dst_offset),
|
|
src.ptr.add(src_offset),
|
|
count,
|
|
ffi::CUDA_MEMCPY_D2D,
|
|
)
|
|
})
|
|
}
|
|
|
|
/// Async copy `count` bytes from `src` at `src_offset` to `self` at `dst_offset` on `stream`.
|
|
pub fn copy_from_device_at_async(
|
|
&mut self,
|
|
src: &GpuBuffer,
|
|
src_offset: usize,
|
|
dst_offset: usize,
|
|
count: usize,
|
|
stream: &CudaStream,
|
|
) -> Result<()> {
|
|
assert!(src_offset + count <= src.len);
|
|
assert!(dst_offset + count <= self.len);
|
|
error::check(unsafe {
|
|
ffi::cudaMemcpyAsync(
|
|
self.ptr.add(dst_offset),
|
|
src.ptr.add(src_offset),
|
|
count,
|
|
ffi::CUDA_MEMCPY_D2D,
|
|
stream.as_raw(),
|
|
)
|
|
})
|
|
}
|
|
|
|
/// Copy `count` bytes from this GPU buffer at `src_offset` to a host slice (D2H).
|
|
pub fn copy_to_host_at(&self, dst: &mut [u8], src_offset: usize, count: usize) -> Result<()> {
|
|
assert!(src_offset + count <= self.len, "src range out of bounds");
|
|
assert!(count <= dst.len(), "host dst too small");
|
|
error::check(unsafe {
|
|
ffi::cudaMemcpy(
|
|
dst.as_mut_ptr(),
|
|
self.ptr.add(src_offset),
|
|
count,
|
|
ffi::CUDA_MEMCPY_D2H,
|
|
)
|
|
})
|
|
}
|
|
|
|
/// Copy `count` bytes from a host slice to this GPU buffer at `dst_offset` (H2D).
|
|
pub fn copy_from_host_at(&mut self, src: &[u8], dst_offset: usize, count: usize) -> Result<()> {
|
|
assert!(dst_offset + count <= self.len, "dst range out of bounds");
|
|
assert!(count <= src.len(), "host src too small");
|
|
error::check(unsafe {
|
|
ffi::cudaMemcpy(
|
|
self.ptr.add(dst_offset),
|
|
src.as_ptr(),
|
|
count,
|
|
ffi::CUDA_MEMCPY_H2D,
|
|
)
|
|
})
|
|
}
|
|
|
|
/// Async zero fill on stream.
|
|
pub fn zero_async(&mut self, stream: &CudaStream) -> Result<()> {
|
|
error::check(unsafe { ffi::cudaMemsetAsync(self.ptr, 0, self.len, stream.as_raw()) })
|
|
}
|
|
|
|
/// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
|
|
/// Caller is responsible for eventually calling cudaFree.
|
|
pub fn into_raw(self) -> (*mut u8, usize) {
|
|
let ptr = self.ptr;
|
|
let len = self.len;
|
|
std::mem::forget(self);
|
|
(ptr, len)
|
|
}
|
|
|
|
/// Reconstruct a GpuBuffer from a raw pointer + length.
|
|
/// Safety: ptr must have been allocated with cudaMalloc, len must be correct.
|
|
pub unsafe fn from_raw(ptr: *mut u8, len: usize) -> Self {
|
|
Self {
|
|
ptr,
|
|
len,
|
|
owned: true,
|
|
pooled: false,
|
|
}
|
|
}
|
|
|
|
/// Create a non-owning view of GPU memory. Dropping this buffer does NOT
|
|
/// call `cudaFree`. The caller must ensure the underlying allocation
|
|
/// outlives this borrow.
|
|
///
|
|
/// # Safety
|
|
/// `ptr` must point to a valid GPU allocation of at least `len` bytes that
|
|
/// will remain live for the lifetime of the returned `GpuBuffer`.
|
|
pub unsafe fn borrow_raw(ptr: *mut u8, len: usize) -> Self {
|
|
Self {
|
|
ptr,
|
|
len,
|
|
owned: false,
|
|
pooled: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Drop for GpuBuffer {
|
|
fn drop(&mut self) {
|
|
if self.owned && !self.ptr.is_null() {
|
|
if self.pooled {
|
|
crate::allocator::return_to_pool(self.ptr, self.len);
|
|
} else {
|
|
unsafe { ffi::cudaFree(self.ptr) };
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// SAFETY: `GpuBuffer` holds a raw pointer, which makes it `!Send` by default;
// this impl asserts that moving the wrapper to another thread is sound.
// NOTE(review): presumably this relies on the device pointer being usable from
// any host thread — confirm against the CUDA context setup used by the crate.
unsafe impl Send for GpuBuffer {}
|
|
|
|
/// Pinned (page-locked) host memory for faster H2D/D2H transfers.
#[derive(Debug)]
pub struct PinnedBuffer {
    /// Host address of the first byte of the allocation.
    ptr: *mut u8,
    /// Size of the allocation in bytes.
    len: usize,
}
|
|
|
|
impl PinnedBuffer {
|
|
pub fn alloc(len: usize) -> Result<Self> {
|
|
let mut ptr = std::ptr::null_mut();
|
|
error::check(unsafe { ffi::cudaMallocHost(&mut ptr, len) })?;
|
|
Ok(Self { ptr, len })
|
|
}
|
|
|
|
pub fn as_slice(&self) -> &[u8] {
|
|
unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
|
|
}
|
|
|
|
pub fn as_mut_slice(&mut self) -> &mut [u8] {
|
|
unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
|
|
}
|
|
|
|
pub fn len(&self) -> usize {
|
|
self.len
|
|
}
|
|
}
|
|
|
|
impl Drop for PinnedBuffer {
|
|
fn drop(&mut self) {
|
|
if !self.ptr.is_null() {
|
|
unsafe { ffi::cudaFreeHost(self.ptr) };
|
|
}
|
|
}
|
|
}
|
|
|
|
// SAFETY: `PinnedBuffer` holds a raw pointer, which makes it `!Send` by
// default; this impl asserts that moving the wrapper to another thread is sound.
// NOTE(review): presumably pinned host memory may be accessed and freed from
// any host thread — confirm against the CUDA runtime documentation.
unsafe impl Send for PinnedBuffer {}
|