Files
xtrain/crates/xtrain-cuda/src/memory.rs
Gahow Wang a842e432b5 perf: streams / drop per-op sync
Default-stream kernels run in order and every host read goes through a
stream-ordered cudaMemcpy (to_device), so the per-op cudaDeviceSynchronize
after each kernel was pure overhead — remove all 21 in tensor.rs. Host
data is still correctly ordered by the D2H memcpy that reads it.

Also zero op-output buffers with cudaMemset (device-side, async) instead of
a blocking H2D memcpy of a host zero buffer on every allocation — that
copy was itself a hidden per-op sync point.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 16:56:17 +08:00

66 lines
1.8 KiB
Rust

use crate::error::{self, Result};
use crate::ffi;
/// RAII wrapper around a GPU memory allocation. Dropping frees the memory.
///
/// A value is created by [`GpuBuffer::alloc`], which obtains the pointer from
/// `cudaMalloc`; the `Drop` impl in this file hands the same pointer to `cudaFree`.
pub struct GpuBuffer {
/// Pointer filled in by `cudaMalloc`. Null is treated as "nothing to free" on drop.
ptr: *mut u8,
/// Size of the allocation in bytes, exactly as requested from `cudaMalloc`.
len: usize,
}
impl GpuBuffer {
pub fn alloc(len: usize) -> Result<Self> {
assert!(len > 0, "cannot allocate 0 bytes on GPU");
let mut ptr = std::ptr::null_mut();
error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
Ok(Self { ptr, len })
}
pub fn len(&self) -> usize {
self.len
}
pub fn is_empty(&self) -> bool {
self.len == 0
}
pub fn as_ptr(&self) -> *const u8 {
self.ptr
}
pub fn as_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
/// Copy data from a host (CPU) slice to this GPU buffer.
pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
assert!(src.len() <= self.len, "source larger than buffer");
error::check(unsafe {
ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
})
}
/// Copy data from this GPU buffer to a host (CPU) slice.
pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
assert!(dst.len() <= self.len, "destination larger than buffer");
error::check(unsafe {
ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
})
}
/// Set every byte of the buffer to `value` on the device (no host copy).
/// Used to zero op-output buffers without a blocking H2D memcpy of zeros.
pub fn memset(&mut self, value: u8) -> Result<()> {
error::check(unsafe { ffi::cudaMemset(self.ptr, value as i32, self.len) })
}
}
impl Drop for GpuBuffer {
    /// Releases the allocation with `cudaFree`.
    ///
    /// A null pointer is skipped. The status returned by `cudaFree` is discarded
    /// because `drop` has no channel through which to report a failure.
    fn drop(&mut self) {
        if self.ptr.is_null() {
            return;
        }

        // SAFETY: `ptr` is non-null here, and the only constructor visible in this
        // file obtains it from `cudaMalloc`. `drop` runs at most once per value,
        // so the pointer is not freed twice.
        let _ = unsafe { ffi::cudaFree(self.ptr) };
    }
}
// Raw pointers are not `Send` by default, so this impl opts the wrapper in manually.
// NOTE(review): nothing in this file proves soundness. It presumably relies on the
// buffer being the sole owner of `ptr` and on the CUDA runtime accepting the pointer
// from any host thread — TODO confirm against how the crate manages devices/contexts.
unsafe impl Send for GpuBuffer {}