// xserv/crates/xserv-cuda/src/memory.rs

use crate::error::{self, Result};
use crate::ffi;
use crate::stream::CudaStream;

/// RAII wrapper around a GPU memory allocation.
///
/// When `owned` is true (the default), dropping frees the GPU memory.
/// A borrowed buffer (`owned = false`) does NOT free on drop — the
/// caller must ensure the backing allocation outlives all borrows.
///
/// When `pooled` is true, dropping returns the buffer to the caching
/// allocator's free list instead of calling cudaFree.
///
/// The flag combinations produced by the constructors in this file are:
/// `alloc` / `from_raw` → owned, not pooled; `borrow_raw` → not owned, not
/// pooled. `set_pooled` flips `pooled` afterwards; it has no effect on drop
/// for a non-owned buffer because `Drop` checks `owned` first.
pub struct GpuBuffer {
    // Raw device address of the first byte; `Drop` skips the release when
    // this is null.
    ptr: *mut u8,
    // Size of the allocation (or borrowed view) in bytes; used as the upper
    // bound for every copy/fill issued through this handle.
    len: usize,
    // True when this handle is responsible for releasing `ptr` on drop.
    owned: bool,
    // True when an owned buffer should be handed back via
    // `crate::allocator::return_to_pool` instead of `cudaFree`.
    pooled: bool,
}

impl GpuBuffer {
    /// Returns `true` when the byte range `offset..offset + count` fits in a
    /// region of `len` bytes.
    ///
    /// Uses checked addition: a plain `offset + count` wraps in release
    /// builds, which would let an out-of-range offset slip past the bounds
    /// assertions and reach the raw pointer arithmetic below.
    fn range_in_bounds(offset: usize, count: usize, len: usize) -> bool {
        matches!(offset.checked_add(count), Some(end) if end <= len)
    }

    /// Allocate `len` bytes of device memory via `cudaMalloc`.
    ///
    /// The returned buffer is owned and not pooled, so dropping it calls
    /// `cudaFree`.
    ///
    /// # Panics
    /// Panics if `len` is zero.
    ///
    /// # Errors
    /// Propagates whatever `error::check` reports for the `cudaMalloc` status.
    pub fn alloc(len: usize) -> Result<Self> {
        assert!(len > 0, "cannot allocate 0 bytes on GPU");
        let mut ptr = std::ptr::null_mut();
        // SAFETY: `ptr` is a live local used only as the out-parameter.
        error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
        Ok(Self { ptr, len, owned: true, pooled: false })
    }

    /// Mark this buffer as pooled (returned to caching allocator on drop)
    /// or not. Called by `cached_alloc` after obtaining a buffer.
    pub fn set_pooled(&mut self, pooled: bool) {
        self.pooled = pooled;
    }

    /// Size of the buffer in bytes.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` when the buffer spans zero bytes.
    ///
    /// `alloc` never produces such a buffer, but `from_raw` / `borrow_raw`
    /// accept any length.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Raw device pointer to the start of the buffer (read-only view).
    pub fn as_ptr(&self) -> *const u8 {
        self.ptr
    }

    /// Raw device pointer to the start of the buffer (mutable view).
    pub fn as_mut_ptr(&mut self) -> *mut u8 {
        self.ptr
    }

    /// Copy data from host (CPU) slice to this GPU buffer.
    ///
    /// Copies `src.len()` bytes to the start of the buffer.
    ///
    /// # Panics
    /// Panics if `src` is longer than the buffer.
    pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
        assert!(src.len() <= self.len, "source larger than buffer");
        // SAFETY: the byte count is `src.len()`, which is within `src` by
        // construction and within this buffer per the assertion above.
        error::check(unsafe {
            ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
        })
    }

    /// Copy data from this GPU buffer to a host (CPU) slice.
    ///
    /// Copies `dst.len()` bytes from the start of the buffer.
    ///
    /// # Panics
    /// Panics if `dst` is longer than the buffer.
    pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
        assert!(dst.len() <= self.len, "destination larger than buffer");
        // SAFETY: the byte count is `dst.len()`, which is within `dst` by
        // construction and within this buffer per the assertion above.
        error::check(unsafe {
            ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
        })
    }

    /// Async copy from host to device on the given stream.
    ///
    /// # Safety
    /// `src` must remain valid (and unmodified) until the stream operation
    /// completes; the borrow checker does not enforce this because the call
    /// only enqueues the copy.
    ///
    /// # Panics
    /// Panics if `src` is longer than the buffer.
    pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
        assert!(src.len() <= self.len, "source larger than buffer");
        // SAFETY: sizes are checked above; the caller guarantees `src`
        // outlives the enqueued copy.
        unsafe {
            error::check(ffi::cudaMemcpyAsync(
                self.ptr,
                src.as_ptr(),
                src.len(),
                ffi::CUDA_MEMCPY_H2D,
                stream.as_raw(),
            ))
        }
    }

    /// Async copy from device to host on the given stream.
    ///
    /// # Safety
    /// `dst` must remain valid until the stream operation completes, and must
    /// not be read or written by the host before then.
    ///
    /// # Panics
    /// Panics if `dst` is longer than the buffer.
    pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
        assert!(dst.len() <= self.len, "destination larger than buffer");
        // SAFETY: sizes are checked above; the caller guarantees `dst`
        // outlives the enqueued copy.
        unsafe {
            error::check(ffi::cudaMemcpyAsync(
                dst.as_mut_ptr(),
                self.ptr,
                dst.len(),
                ffi::CUDA_MEMCPY_D2H,
                stream.as_raw(),
            ))
        }
    }

    /// Copy from another GPU buffer (D2D).
    ///
    /// Copies `min(src.len, self.len)` bytes starting at offset 0 of both
    /// buffers; a size mismatch is truncated silently rather than reported.
    pub fn copy_from_device(&mut self, src: &GpuBuffer) -> Result<()> {
        let n = src.len.min(self.len);
        // SAFETY: `n` does not exceed the length of either buffer.
        error::check(unsafe {
            ffi::cudaMemcpy(self.ptr, src.ptr, n, ffi::CUDA_MEMCPY_D2D)
        })
    }

    /// Fill buffer with zeros.
    pub fn zero(&mut self) -> Result<()> {
        // SAFETY: the fill covers exactly `self.len` bytes from `self.ptr`.
        error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
    }

    /// Copy `count` bytes from `src` buffer at `src_offset` to this buffer at `dst_offset`.
    ///
    /// # Panics
    /// Panics if either byte range falls outside its buffer, including when
    /// `offset + count` would overflow `usize`.
    pub fn copy_from_device_at(
        &mut self,
        src: &GpuBuffer,
        src_offset: usize,
        dst_offset: usize,
        count: usize,
    ) -> Result<()> {
        assert!(
            Self::range_in_bounds(src_offset, count, src.len),
            "src range out of bounds"
        );
        assert!(
            Self::range_in_bounds(dst_offset, count, self.len),
            "dst range out of bounds"
        );
        // SAFETY: both offsets and the byte count were validated against the
        // respective buffer lengths without overflow.
        error::check(unsafe {
            ffi::cudaMemcpy(
                self.ptr.add(dst_offset),
                src.ptr.add(src_offset),
                count,
                ffi::CUDA_MEMCPY_D2D,
            )
        })
    }

    /// Async copy `count` bytes from `src` at `src_offset` to `self` at `dst_offset` on `stream`.
    ///
    /// NOTE(review): this is a safe function, yet it only enqueues the copy
    /// on `stream`; nothing here keeps `src` or `self` alive until the stream
    /// has processed it. Callers should make sure both buffers outlive the
    /// enqueued work — confirm against the stream/allocator contract.
    ///
    /// # Panics
    /// Panics if either byte range falls outside its buffer, including when
    /// `offset + count` would overflow `usize`.
    pub fn copy_from_device_at_async(
        &mut self,
        src: &GpuBuffer,
        src_offset: usize,
        dst_offset: usize,
        count: usize,
        stream: &CudaStream,
    ) -> Result<()> {
        assert!(
            Self::range_in_bounds(src_offset, count, src.len),
            "src range out of bounds"
        );
        assert!(
            Self::range_in_bounds(dst_offset, count, self.len),
            "dst range out of bounds"
        );
        // SAFETY: both offsets and the byte count were validated against the
        // respective buffer lengths without overflow.
        error::check(unsafe {
            ffi::cudaMemcpyAsync(
                self.ptr.add(dst_offset),
                src.ptr.add(src_offset),
                count,
                ffi::CUDA_MEMCPY_D2D,
                stream.as_raw(),
            )
        })
    }

    /// Copy `count` bytes from this GPU buffer at `src_offset` to a host slice (D2H).
    ///
    /// The bytes land at the start of `dst`.
    ///
    /// # Panics
    /// Panics if the source range falls outside the buffer (including on
    /// `usize` overflow) or if `dst` holds fewer than `count` bytes.
    pub fn copy_to_host_at(&self, dst: &mut [u8], src_offset: usize, count: usize) -> Result<()> {
        assert!(
            Self::range_in_bounds(src_offset, count, self.len),
            "src range out of bounds"
        );
        assert!(count <= dst.len(), "host dst too small");
        // SAFETY: the device range and the host capacity were both checked.
        error::check(unsafe {
            ffi::cudaMemcpy(
                dst.as_mut_ptr(),
                self.ptr.add(src_offset),
                count,
                ffi::CUDA_MEMCPY_D2H,
            )
        })
    }

    /// Copy `count` bytes from a host slice to this GPU buffer at `dst_offset` (H2D).
    ///
    /// The bytes are read from the start of `src`.
    ///
    /// # Panics
    /// Panics if the destination range falls outside the buffer (including on
    /// `usize` overflow) or if `src` holds fewer than `count` bytes.
    pub fn copy_from_host_at(&mut self, src: &[u8], dst_offset: usize, count: usize) -> Result<()> {
        assert!(
            Self::range_in_bounds(dst_offset, count, self.len),
            "dst range out of bounds"
        );
        assert!(count <= src.len(), "host src too small");
        // SAFETY: the device range and the host length were both checked.
        error::check(unsafe {
            ffi::cudaMemcpy(
                self.ptr.add(dst_offset),
                src.as_ptr(),
                count,
                ffi::CUDA_MEMCPY_H2D,
            )
        })
    }

    /// Async zero fill on stream.
    ///
    /// NOTE(review): like the other async helpers this only enqueues work on
    /// `stream`; the buffer should stay alive until the stream has run it.
    pub fn zero_async(&mut self, stream: &CudaStream) -> Result<()> {
        // SAFETY: the fill covers exactly `self.len` bytes from `self.ptr`.
        error::check(unsafe {
            ffi::cudaMemsetAsync(self.ptr, 0, self.len, stream.as_raw())
        })
    }

    /// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
    /// Caller is responsible for eventually calling cudaFree.
    ///
    /// The `owned` and `pooled` flags are discarded. For a buffer created by
    /// `borrow_raw` the pointer still belongs to someone else and must not be
    /// freed by the caller; for a pooled buffer the memory is never handed
    /// back to the pool by this type once converted.
    pub fn into_raw(self) -> (*mut u8, usize) {
        let ptr = self.ptr;
        let len = self.len;
        // Skip `Drop` so the allocation is neither freed nor returned to the pool.
        std::mem::forget(self);
        (ptr, len)
    }

    /// Reconstruct a GpuBuffer from a raw pointer + length.
    ///
    /// The result is owned and not pooled, so dropping it calls `cudaFree`
    /// on `ptr` (unless `ptr` is null).
    ///
    /// # Safety
    /// `ptr` must have been allocated with cudaMalloc, `len` must be correct,
    /// and no other handle may free the same allocation.
    pub unsafe fn from_raw(ptr: *mut u8, len: usize) -> Self {
        Self { ptr, len, owned: true, pooled: false }
    }

    /// Create a non-owning view of GPU memory. Dropping this buffer does NOT
    /// call `cudaFree`. The caller must ensure the underlying allocation
    /// outlives this borrow.
    ///
    /// # Safety
    /// `ptr` must point to a valid GPU allocation of at least `len` bytes that
    /// will remain live for the lifetime of the returned `GpuBuffer`.
    pub unsafe fn borrow_raw(ptr: *mut u8, len: usize) -> Self {
        Self { ptr, len, owned: false, pooled: false }
    }
}

impl Drop for GpuBuffer {
    /// Release the allocation according to the `owned` / `pooled` flags.
    ///
    /// Non-owned or null handles are left untouched; pooled buffers go back
    /// through `crate::allocator::return_to_pool`; everything else is handed
    /// to `cudaFree`, whose status is not inspected here.
    fn drop(&mut self) {
        // Borrowed views and null handles have nothing to release.
        if !self.owned || self.ptr.is_null() {
            return;
        }

        if self.pooled {
            crate::allocator::return_to_pool(self.ptr, self.len);
            return;
        }

        unsafe { ffi::cudaFree(self.ptr) };
    }
}

// SAFETY(review): `GpuBuffer` contains a raw pointer, so the compiler does not
// derive `Send`; this impl asserts that moving the handle to another thread is
// sound. Presumably this relies on the device allocation not being tied to the
// host thread that created it — TODO confirm against the CUDA context setup
// used by this crate. `Sync` is not implemented in this file.
unsafe impl Send for GpuBuffer {}

/// Pinned (page-locked) host memory for faster H2D/D2H transfers.
///
/// The memory is obtained with `cudaMallocHost` in `alloc` and released with
/// `cudaFreeHost` when the value is dropped (skipped if the pointer is null).
pub struct PinnedBuffer {
    // Host address returned by `cudaMallocHost`.
    ptr: *mut u8,
    // Size of the allocation in bytes, as requested from `alloc`.
    len: usize,
}

impl PinnedBuffer {
    /// Allocate `len` bytes of pinned host memory via `cudaMallocHost`.
    ///
    /// Unlike `GpuBuffer::alloc`, a zero `len` is forwarded to the runtime
    /// rather than rejected up front.
    ///
    /// # Errors
    /// Propagates whatever `error::check` reports for the `cudaMallocHost`
    /// status.
    pub fn alloc(len: usize) -> Result<Self> {
        let mut ptr = std::ptr::null_mut();
        // SAFETY: `ptr` is a live local used only as the out-parameter.
        error::check(unsafe { ffi::cudaMallocHost(&mut ptr, len) })?;
        Ok(Self { ptr, len })
    }

    /// View the pinned memory as a byte slice.
    ///
    /// Returns an empty slice when the underlying pointer is null (which the
    /// `Drop` impl also treats as possible), because
    /// `slice::from_raw_parts` requires a non-null pointer even for a
    /// zero-length slice.
    pub fn as_slice(&self) -> &[u8] {
        if self.ptr.is_null() {
            return &[];
        }
        // SAFETY: `ptr` is non-null and `alloc` requested `len` bytes for it;
        // the slice borrows `self`, so the memory is not freed while it lives.
        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
    }

    /// View the pinned memory as a mutable byte slice.
    ///
    /// Returns an empty slice when the underlying pointer is null, for the
    /// same reason as `as_slice`.
    pub fn as_mut_slice(&mut self) -> &mut [u8] {
        if self.ptr.is_null() {
            return &mut [];
        }
        // SAFETY: `ptr` is non-null and `alloc` requested `len` bytes for it;
        // the exclusive borrow of `self` makes the returned slice unique.
        unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
    }

    /// Size of the buffer in bytes.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` when the buffer spans zero bytes.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }
}

impl Drop for PinnedBuffer {
    /// Hand the pinned allocation back through `cudaFreeHost`.
    ///
    /// A null pointer is skipped; the status returned by `cudaFreeHost` is
    /// not inspected here.
    fn drop(&mut self) {
        if self.ptr.is_null() {
            return;
        }
        unsafe { ffi::cudaFreeHost(self.ptr) };
    }
}

// SAFETY(review): `PinnedBuffer` contains a raw pointer, so the compiler does
// not derive `Send`; this impl asserts that the handle may be moved to another
// thread. Presumably the pinned host allocation may be accessed and freed from
// any host thread — TODO confirm. `Sync` is not implemented in this file.
unsafe impl Send for PinnedBuffer {}