// xtrain/crates/xtrain-cuda/src/memory.rs

use crate::error::{self, Result};
use crate::ffi;

/// RAII wrapper around a GPU memory allocation. Dropping frees the memory.
///
/// Values are created through [`GpuBuffer::alloc`]; the `Drop` impl passes the
/// stored pointer to `cudaFree`. Both fields are private, so the pointer/length
/// pair can only be set by code in this module.
pub struct GpuBuffer {
    /// Pointer filled in by `cudaMalloc`; handed to `cudaFree` on drop unless null.
    ptr: *mut u8,
    /// Size of the allocation in bytes, exactly as requested from `cudaMalloc`.
    len: usize,
}

impl GpuBuffer {
    pub fn alloc(len: usize) -> Result<Self> {
        assert!(len > 0, "cannot allocate 0 bytes on GPU");
        let mut ptr = std::ptr::null_mut();
        error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
        Ok(Self { ptr, len })
    }

    pub fn len(&self) -> usize {
        self.len
    }

    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    pub fn as_ptr(&self) -> *const u8 {
        self.ptr
    }

    pub fn as_mut_ptr(&mut self) -> *mut u8 {
        self.ptr
    }

    /// Copy data from a host (CPU) slice to this GPU buffer.
    pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
        assert!(src.len() <= self.len, "source larger than buffer");
        error::check(unsafe {
            ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
        })
    }

    /// Copy data from this GPU buffer to a host (CPU) slice.
    pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
        assert!(dst.len() <= self.len, "destination larger than buffer");
        error::check(unsafe {
            ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
        })
    }

    /// Set every byte of the buffer to `value` on the device (no host copy).
    /// Used to zero op-output buffers without a blocking H2D memcpy of zeros.
    pub fn memset(&mut self, value: u8) -> Result<()> {
        error::check(unsafe { ffi::cudaMemset(self.ptr, value as i32, self.len) })
    }
}

impl Drop for GpuBuffer {
    /// Hands the stored pointer to `cudaFree`; a null pointer is left alone.
    fn drop(&mut self) {
        if self.ptr.is_null() {
            return;
        }
        // The status from `cudaFree` is deliberately discarded: `drop` has no
        // way to report a failure to the caller.
        // SAFETY: the pointer is non-null, and `drop` runs at most once per
        // value, so it is passed to `cudaFree` a single time.
        let _ = unsafe { ffi::cudaFree(self.ptr) };
    }
}

// The `*mut u8` field suppresses the automatic `Send` impl, so it is asserted
// manually here. No `Sync` impl is declared alongside it.
// SAFETY (NOTE(review)): presumably this relies on a device pointer staying
// valid when the owning `GpuBuffer` is moved to another host thread, with the
// buffer being the sole owner of the allocation — confirm against the CUDA
// runtime's threading rules and how callers use `as_ptr`/`as_mut_ptr`.
unsafe impl Send for GpuBuffer {}