Default-stream kernels run in order and every host read goes through a stream-ordered cudaMemcpy (to_device), so the per-op cudaDeviceSynchronize after each kernel was pure overhead — remove all 21 in tensor.rs. Host data is still correctly ordered by the D2H memcpy that reads it. Also zero op-output buffers with cudaMemset (device-side, async) instead of a blocking H2D memcpy of a host zero buffer on every allocation — that copy was itself a hidden per-op sync point. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
66 lines
1.8 KiB
Rust
66 lines
1.8 KiB
Rust
use crate::error::{self, Result};
|
|
use crate::ffi;
|
|
|
|
/// RAII wrapper around a GPU memory allocation. Dropping frees the memory.
///
/// The buffer records the address obtained from `cudaMalloc` together with the
/// number of bytes that were requested for it. `Debug` is derived so the
/// buffer can be logged and embedded in other `Debug` types; the output shows
/// the raw address and the byte length, not the device contents.
#[derive(Debug)]
pub struct GpuBuffer {
    /// Address of the allocation as returned by `cudaMalloc`.
    ptr: *mut u8,
    /// Size of the allocation in bytes.
    len: usize,
}
|
|
|
|
impl GpuBuffer {
|
|
pub fn alloc(len: usize) -> Result<Self> {
|
|
assert!(len > 0, "cannot allocate 0 bytes on GPU");
|
|
let mut ptr = std::ptr::null_mut();
|
|
error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
|
|
Ok(Self { ptr, len })
|
|
}
|
|
|
|
pub fn len(&self) -> usize {
|
|
self.len
|
|
}
|
|
|
|
pub fn is_empty(&self) -> bool {
|
|
self.len == 0
|
|
}
|
|
|
|
pub fn as_ptr(&self) -> *const u8 {
|
|
self.ptr
|
|
}
|
|
|
|
pub fn as_mut_ptr(&mut self) -> *mut u8 {
|
|
self.ptr
|
|
}
|
|
|
|
/// Copy data from a host (CPU) slice to this GPU buffer.
|
|
pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
|
|
assert!(src.len() <= self.len, "source larger than buffer");
|
|
error::check(unsafe {
|
|
ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
|
|
})
|
|
}
|
|
|
|
/// Copy data from this GPU buffer to a host (CPU) slice.
|
|
pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
|
|
assert!(dst.len() <= self.len, "destination larger than buffer");
|
|
error::check(unsafe {
|
|
ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
|
|
})
|
|
}
|
|
|
|
/// Set every byte of the buffer to `value` on the device (no host copy).
|
|
/// Used to zero op-output buffers without a blocking H2D memcpy of zeros.
|
|
pub fn memset(&mut self, value: u8) -> Result<()> {
|
|
error::check(unsafe { ffi::cudaMemset(self.ptr, value as i32, self.len) })
|
|
}
|
|
}
|
|
|
|
impl Drop for GpuBuffer {
|
|
fn drop(&mut self) {
|
|
if !self.ptr.is_null() {
|
|
unsafe { ffi::cudaFree(self.ptr) };
|
|
}
|
|
}
|
|
}
|
|
|
|
// Raw pointers are `!Send`, so `GpuBuffer` needs this explicit opt-in before it
// can be moved to another thread.
// NOTE(review): this assumes the allocation behind `ptr` may be used and freed
// from a thread other than the one that created it — confirm against the CUDA
// runtime's threading rules for the contexts this crate uses.
unsafe impl Send for GpuBuffer {}
|