phase 0+1: project scaffold + xserv-cuda crate
- Cargo workspace with xserv-cuda crate - CUDA FFI bindings (cudart: memory, stream, device, error) - GpuBuffer RAII wrapper with H2D/D2H/D2D copy - CudaStream wrapper with RAII Drop - CachingAllocator with size-bucketed free lists - PinnedBuffer for page-locked host memory - Device info query via cudaDeviceGetAttribute - Vector-add CUDA kernel smoke test - Integration test suite (11 tests) - build.rs: cc crate compiles .cu for SM 12.0 - sync-and-build.sh for remote build on dash5 - Roadmap doc (docs/00-roadmap.md) and Phase 0+1 design doc Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
10
crates/xserv-cuda/Cargo.toml
Normal file
10
crates/xserv-cuda/Cargo.toml
Normal file
@@ -0,0 +1,10 @@
|
||||
[package]
|
||||
name = "xserv-cuda"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
cc = { version = "1", features = ["cuda"] }
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9"
|
||||
20
crates/xserv-cuda/build.rs
Normal file
20
crates/xserv-cuda/build.rs
Normal file
@@ -0,0 +1,20 @@
|
||||
use std::env;
|
||||
|
||||
fn main() {
|
||||
let cuda_path = env::var("CUDA_HOME")
|
||||
.or_else(|_| env::var("CUDA_PATH"))
|
||||
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
|
||||
|
||||
println!("cargo:rustc-link-search=native={cuda_path}/lib64");
|
||||
println!("cargo:rustc-link-lib=dylib=cudart");
|
||||
println!("cargo:rustc-link-lib=dylib=cuda");
|
||||
|
||||
cc::Build::new()
|
||||
.cuda(true)
|
||||
.cudart("shared")
|
||||
.flag("-gencode=arch=compute_120,code=sm_120")
|
||||
.file("../../csrc/test/vecadd.cu")
|
||||
.compile("xserv_cuda_kernels");
|
||||
|
||||
println!("cargo:rerun-if-changed=../../csrc/");
|
||||
}
|
||||
109
crates/xserv-cuda/src/allocator.rs
Normal file
109
crates/xserv-cuda/src/allocator.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use crate::error::Result;
|
||||
use crate::ffi;
|
||||
use crate::memory::GpuBuffer;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Caching allocator that reuses freed GPU buffers instead of calling
|
||||
/// cudaMalloc/cudaFree on every allocation.
|
||||
///
|
||||
/// Freed buffers are kept in a per-size-bucket free list. On allocation,
|
||||
/// we first check the free list for a buffer of matching (rounded) size.
|
||||
pub struct CachingAllocator {
|
||||
free_lists: HashMap<usize, Vec<(*mut u8, usize)>>,
|
||||
stats: AllocStats,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct AllocStats {
|
||||
pub alloc_count: u64,
|
||||
pub cache_hit_count: u64,
|
||||
pub cuda_malloc_count: u64,
|
||||
pub cuda_free_count: u64,
|
||||
pub current_allocated: usize,
|
||||
pub peak_allocated: usize,
|
||||
}
|
||||
|
||||
impl CachingAllocator {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
free_lists: HashMap::new(),
|
||||
stats: AllocStats::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn alloc(&mut self, size: usize) -> Result<GpuBuffer> {
|
||||
let bucket = bucket_size(size);
|
||||
self.stats.alloc_count += 1;
|
||||
|
||||
if let Some(list) = self.free_lists.get_mut(&bucket) {
|
||||
if let Some((ptr, actual_len)) = list.pop() {
|
||||
self.stats.cache_hit_count += 1;
|
||||
self.stats.current_allocated += actual_len;
|
||||
if self.stats.current_allocated > self.stats.peak_allocated {
|
||||
self.stats.peak_allocated = self.stats.current_allocated;
|
||||
}
|
||||
return Ok(unsafe { GpuBuffer::from_raw(ptr, actual_len) });
|
||||
}
|
||||
}
|
||||
|
||||
self.stats.cuda_malloc_count += 1;
|
||||
let buf = GpuBuffer::alloc(bucket)?;
|
||||
self.stats.current_allocated += bucket;
|
||||
if self.stats.current_allocated > self.stats.peak_allocated {
|
||||
self.stats.peak_allocated = self.stats.current_allocated;
|
||||
}
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
/// Return a buffer to the cache instead of freeing it.
|
||||
pub fn dealloc(&mut self, buf: GpuBuffer) {
|
||||
let (ptr, len) = buf.into_raw();
|
||||
let bucket = bucket_size(len);
|
||||
self.stats.current_allocated = self.stats.current_allocated.saturating_sub(len);
|
||||
self.free_lists.entry(bucket).or_default().push((ptr, len));
|
||||
}
|
||||
|
||||
/// Actually free all cached buffers.
|
||||
pub fn trim(&mut self) {
|
||||
for (_bucket, list) in self.free_lists.drain() {
|
||||
for (ptr, _len) in list {
|
||||
unsafe { ffi::cudaFree(ptr) };
|
||||
self.stats.cuda_free_count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stats(&self) -> &AllocStats {
|
||||
&self.stats
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for CachingAllocator {
|
||||
fn drop(&mut self) {
|
||||
self.trim();
|
||||
}
|
||||
}
|
||||
|
||||
/// Round up to next power-of-2, minimum 512 bytes.
|
||||
fn bucket_size(size: usize) -> usize {
|
||||
let min = 512;
|
||||
if size <= min {
|
||||
return min;
|
||||
}
|
||||
size.next_power_of_two()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_bucket_size() {
|
||||
assert_eq!(bucket_size(1), 512);
|
||||
assert_eq!(bucket_size(512), 512);
|
||||
assert_eq!(bucket_size(513), 1024);
|
||||
assert_eq!(bucket_size(1024), 1024);
|
||||
assert_eq!(bucket_size(1025), 2048);
|
||||
assert_eq!(bucket_size(1 << 20), 1 << 20);
|
||||
}
|
||||
}
|
||||
77
crates/xserv-cuda/src/device.rs
Normal file
77
crates/xserv-cuda/src/device.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
use crate::error::{self, Result};
|
||||
use crate::ffi;
|
||||
use std::ffi::CStr;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DeviceInfo {
|
||||
pub index: u32,
|
||||
pub name: String,
|
||||
pub total_memory: usize,
|
||||
pub compute_major: i32,
|
||||
pub compute_minor: i32,
|
||||
pub sm_count: i32,
|
||||
pub shared_mem_per_block: usize,
|
||||
pub warp_size: i32,
|
||||
pub max_threads_per_block: i32,
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
fn cudaDeviceGetAttribute(value: *mut i32, attr: i32, device: i32) -> i32;
|
||||
}
|
||||
|
||||
fn get_attr(attr: i32, device: u32) -> Result<i32> {
|
||||
let mut value = 0;
|
||||
error::check(unsafe { cudaDeviceGetAttribute(&mut value, attr, device as i32) })?;
|
||||
Ok(value)
|
||||
}
|
||||
|
||||
pub fn device_count() -> Result<i32> {
|
||||
let mut count = 0;
|
||||
error::check(unsafe { ffi::cudaGetDeviceCount(&mut count) })?;
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
pub fn set_device(device: u32) -> Result<()> {
|
||||
error::check(unsafe { ffi::cudaSetDevice(device as i32) })
|
||||
}
|
||||
|
||||
pub fn current_device() -> Result<u32> {
|
||||
let mut dev = 0;
|
||||
error::check(unsafe { ffi::cudaGetDevice(&mut dev) })?;
|
||||
Ok(dev as u32)
|
||||
}
|
||||
|
||||
pub fn device_info(device: u32) -> Result<DeviceInfo> {
|
||||
// Use cudaGetDeviceProperties only for the name (first field, always stable).
|
||||
let mut prop = unsafe { std::mem::zeroed::<ffi::CudaDeviceProp>() };
|
||||
error::check(unsafe { ffi::cudaGetDeviceProperties(&mut prop, device as i32) })?;
|
||||
let name = unsafe { CStr::from_ptr(prop.name.as_ptr()) }
|
||||
.to_string_lossy()
|
||||
.into_owned();
|
||||
|
||||
// Use cudaDeviceGetAttribute for everything else (layout-independent).
|
||||
// Attribute IDs from cuda_runtime_api.h:
|
||||
const TOTAL_GLOBAL_MEM: i32 = 0; // not available via attribute, use prop
|
||||
const SHARED_MEM_PER_BLOCK: i32 = 8;
|
||||
const WARP_SIZE: i32 = 10;
|
||||
const MAX_THREADS_PER_BLOCK: i32 = 1;
|
||||
const MULTI_PROCESSOR_COUNT: i32 = 16;
|
||||
const COMPUTE_MAJOR: i32 = 75;
|
||||
const COMPUTE_MINOR: i32 = 76;
|
||||
|
||||
Ok(DeviceInfo {
|
||||
index: device,
|
||||
name,
|
||||
total_memory: prop.total_global_mem,
|
||||
compute_major: get_attr(COMPUTE_MAJOR, device)?,
|
||||
compute_minor: get_attr(COMPUTE_MINOR, device)?,
|
||||
sm_count: get_attr(MULTI_PROCESSOR_COUNT, device)?,
|
||||
shared_mem_per_block: get_attr(SHARED_MEM_PER_BLOCK, device)? as usize,
|
||||
warp_size: get_attr(WARP_SIZE, device)?,
|
||||
max_threads_per_block: get_attr(MAX_THREADS_PER_BLOCK, device)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn synchronize() -> Result<()> {
|
||||
error::check(unsafe { ffi::cudaDeviceSynchronize() })
|
||||
}
|
||||
43
crates/xserv-cuda/src/error.rs
Normal file
43
crates/xserv-cuda/src/error.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
use crate::ffi;
|
||||
use std::ffi::CStr;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum CudaError {
|
||||
OutOfMemory,
|
||||
InvalidDevice,
|
||||
Raw { code: i32, message: String },
|
||||
}
|
||||
|
||||
impl fmt::Display for CudaError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
CudaError::OutOfMemory => write!(f, "CUDA out of memory"),
|
||||
CudaError::InvalidDevice => write!(f, "CUDA invalid device"),
|
||||
CudaError::Raw { code, message } => write!(f, "CUDA error {code}: {message}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for CudaError {}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, CudaError>;
|
||||
|
||||
pub(crate) fn check(code: i32) -> Result<()> {
|
||||
if code == ffi::CUDA_SUCCESS {
|
||||
return Ok(());
|
||||
}
|
||||
let message = unsafe {
|
||||
let ptr = ffi::cudaGetErrorString(code);
|
||||
if ptr.is_null() {
|
||||
"unknown error".to_string()
|
||||
} else {
|
||||
CStr::from_ptr(ptr).to_string_lossy().into_owned()
|
||||
}
|
||||
};
|
||||
Err(match code {
|
||||
ffi::CUDA_ERROR_OUT_OF_MEMORY => CudaError::OutOfMemory,
|
||||
101 => CudaError::InvalidDevice,
|
||||
_ => CudaError::Raw { code, message },
|
||||
})
|
||||
}
|
||||
73
crates/xserv-cuda/src/ffi.rs
Normal file
73
crates/xserv-cuda/src/ffi.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::ffi::c_void;
|
||||
use std::os::raw::c_char;
|
||||
|
||||
pub type CudaStream = *mut c_void;
|
||||
pub type CudaEvent = *mut c_void;
|
||||
|
||||
pub const CUDA_MEMCPY_H2D: i32 = 1;
|
||||
pub const CUDA_MEMCPY_D2H: i32 = 2;
|
||||
pub const CUDA_MEMCPY_D2D: i32 = 3;
|
||||
|
||||
pub const CUDA_SUCCESS: i32 = 0;
|
||||
pub const CUDA_ERROR_OUT_OF_MEMORY: i32 = 2;
|
||||
|
||||
#[repr(C)]
|
||||
pub struct CudaDeviceProp {
|
||||
pub name: [c_char; 256],
|
||||
pub total_global_mem: usize,
|
||||
pub shared_mem_per_block: usize,
|
||||
pub regs_per_block: i32,
|
||||
pub warp_size: i32,
|
||||
pub max_threads_per_block: i32,
|
||||
pub max_threads_dim: [i32; 3],
|
||||
pub max_grid_size: [i32; 3],
|
||||
pub clock_rate: i32,
|
||||
pub total_const_mem: usize,
|
||||
pub major: i32,
|
||||
pub minor: i32,
|
||||
// There are many more fields; we only read up to what we need.
|
||||
// cudaDeviceProp is a large struct (~1KB). We pad the rest.
|
||||
_pad: [u8; 4096],
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
// --- Device ---
|
||||
pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
|
||||
pub fn cudaSetDevice(device: i32) -> i32;
|
||||
pub fn cudaGetDevice(device: *mut i32) -> i32;
|
||||
pub fn cudaGetDeviceProperties(prop: *mut CudaDeviceProp, device: i32) -> i32;
|
||||
pub fn cudaDeviceSynchronize() -> i32;
|
||||
|
||||
// --- Memory ---
|
||||
pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
|
||||
pub fn cudaFree(devptr: *mut u8) -> i32;
|
||||
pub fn cudaMallocHost(ptr: *mut *mut u8, size: usize) -> i32;
|
||||
pub fn cudaFreeHost(ptr: *mut u8) -> i32;
|
||||
pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
|
||||
pub fn cudaMemcpyAsync(
|
||||
dst: *mut u8,
|
||||
src: *const u8,
|
||||
count: usize,
|
||||
kind: i32,
|
||||
stream: CudaStream,
|
||||
) -> i32;
|
||||
pub fn cudaMemset(devptr: *mut u8, value: i32, count: usize) -> i32;
|
||||
|
||||
// --- Stream ---
|
||||
pub fn cudaStreamCreate(stream: *mut CudaStream) -> i32;
|
||||
pub fn cudaStreamDestroy(stream: CudaStream) -> i32;
|
||||
pub fn cudaStreamSynchronize(stream: CudaStream) -> i32;
|
||||
|
||||
// --- Error ---
|
||||
pub fn cudaGetLastError() -> i32;
|
||||
pub fn cudaGetErrorString(error: i32) -> *const c_char;
|
||||
|
||||
// --- Our test kernel ---
|
||||
pub fn launch_vecadd_f32(
|
||||
a: *const f32,
|
||||
b: *const f32,
|
||||
c: *mut f32,
|
||||
n: i32,
|
||||
stream: CudaStream,
|
||||
);
|
||||
}
|
||||
12
crates/xserv-cuda/src/lib.rs
Normal file
12
crates/xserv-cuda/src/lib.rs
Normal file
@@ -0,0 +1,12 @@
|
||||
pub mod allocator;
|
||||
pub mod device;
|
||||
pub mod error;
|
||||
pub mod ffi;
|
||||
pub mod memory;
|
||||
pub mod stream;
|
||||
|
||||
pub use allocator::CachingAllocator;
|
||||
pub use device::DeviceInfo;
|
||||
pub use error::{CudaError, Result};
|
||||
pub use memory::{GpuBuffer, PinnedBuffer};
|
||||
pub use stream::CudaStream;
|
||||
146
crates/xserv-cuda/src/memory.rs
Normal file
146
crates/xserv-cuda/src/memory.rs
Normal file
@@ -0,0 +1,146 @@
|
||||
use crate::error::{self, Result};
|
||||
use crate::ffi;
|
||||
use crate::stream::CudaStream;
|
||||
|
||||
/// RAII wrapper around a GPU memory allocation.
|
||||
pub struct GpuBuffer {
|
||||
ptr: *mut u8,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl GpuBuffer {
|
||||
pub fn alloc(len: usize) -> Result<Self> {
|
||||
assert!(len > 0, "cannot allocate 0 bytes on GPU");
|
||||
let mut ptr = std::ptr::null_mut();
|
||||
error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
|
||||
Ok(Self { ptr, len })
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
pub fn as_ptr(&self) -> *const u8 {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
pub fn as_mut_ptr(&mut self) -> *mut u8 {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
/// Copy data from host (CPU) slice to this GPU buffer.
|
||||
pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
|
||||
assert!(src.len() <= self.len, "source larger than buffer");
|
||||
error::check(unsafe {
|
||||
ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
|
||||
})
|
||||
}
|
||||
|
||||
/// Copy data from this GPU buffer to a host (CPU) slice.
|
||||
pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
|
||||
assert!(dst.len() <= self.len, "destination larger than buffer");
|
||||
error::check(unsafe {
|
||||
ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
|
||||
})
|
||||
}
|
||||
|
||||
/// Async copy from host to device on the given stream.
|
||||
/// Safety: `src` must remain valid until the stream operation completes.
|
||||
pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
|
||||
assert!(src.len() <= self.len);
|
||||
error::check(ffi::cudaMemcpyAsync(
|
||||
self.ptr,
|
||||
src.as_ptr(),
|
||||
src.len(),
|
||||
ffi::CUDA_MEMCPY_H2D,
|
||||
stream.as_raw(),
|
||||
))
|
||||
}
|
||||
|
||||
/// Async copy from device to host on the given stream.
|
||||
/// Safety: `dst` must remain valid until the stream operation completes.
|
||||
pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
|
||||
assert!(dst.len() <= self.len);
|
||||
error::check(ffi::cudaMemcpyAsync(
|
||||
dst.as_mut_ptr(),
|
||||
self.ptr,
|
||||
dst.len(),
|
||||
ffi::CUDA_MEMCPY_D2H,
|
||||
stream.as_raw(),
|
||||
))
|
||||
}
|
||||
|
||||
/// Copy from another GPU buffer (D2D).
|
||||
pub fn copy_from_device(&mut self, src: &GpuBuffer) -> Result<()> {
|
||||
let n = src.len.min(self.len);
|
||||
error::check(unsafe {
|
||||
ffi::cudaMemcpy(self.ptr, src.ptr, n, ffi::CUDA_MEMCPY_D2D)
|
||||
})
|
||||
}
|
||||
|
||||
/// Fill buffer with zeros.
|
||||
pub fn zero(&mut self) -> Result<()> {
|
||||
error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
|
||||
}
|
||||
|
||||
/// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
|
||||
/// Caller is responsible for eventually calling cudaFree.
|
||||
pub fn into_raw(self) -> (*mut u8, usize) {
|
||||
let ptr = self.ptr;
|
||||
let len = self.len;
|
||||
std::mem::forget(self);
|
||||
(ptr, len)
|
||||
}
|
||||
|
||||
/// Reconstruct a GpuBuffer from a raw pointer + length.
|
||||
/// Safety: ptr must have been allocated with cudaMalloc, len must be correct.
|
||||
pub unsafe fn from_raw(ptr: *mut u8, len: usize) -> Self {
|
||||
Self { ptr, len }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for GpuBuffer {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
unsafe { ffi::cudaFree(self.ptr) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl Send for GpuBuffer {}
|
||||
|
||||
/// Pinned (page-locked) host memory for faster H2D/D2H transfers.
|
||||
pub struct PinnedBuffer {
|
||||
ptr: *mut u8,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl PinnedBuffer {
|
||||
pub fn alloc(len: usize) -> Result<Self> {
|
||||
let mut ptr = std::ptr::null_mut();
|
||||
error::check(unsafe { ffi::cudaMallocHost(&mut ptr, len) })?;
|
||||
Ok(Self { ptr, len })
|
||||
}
|
||||
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
|
||||
}
|
||||
|
||||
pub fn as_mut_slice(&mut self) -> &mut [u8] {
|
||||
unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PinnedBuffer {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
unsafe { ffi::cudaFreeHost(self.ptr) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl Send for PinnedBuffer {}
|
||||
33
crates/xserv-cuda/src/stream.rs
Normal file
33
crates/xserv-cuda/src/stream.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
use crate::error::{self, Result};
|
||||
use crate::ffi;
|
||||
|
||||
pub struct CudaStream {
|
||||
raw: ffi::CudaStream,
|
||||
}
|
||||
|
||||
impl CudaStream {
|
||||
pub fn new() -> Result<Self> {
|
||||
let mut raw = std::ptr::null_mut();
|
||||
error::check(unsafe { ffi::cudaStreamCreate(&mut raw) })?;
|
||||
Ok(Self { raw })
|
||||
}
|
||||
|
||||
pub fn synchronize(&self) -> Result<()> {
|
||||
error::check(unsafe { ffi::cudaStreamSynchronize(self.raw) })
|
||||
}
|
||||
|
||||
pub fn as_raw(&self) -> ffi::CudaStream {
|
||||
self.raw
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for CudaStream {
|
||||
fn drop(&mut self) {
|
||||
if !self.raw.is_null() {
|
||||
unsafe { ffi::cudaStreamDestroy(self.raw) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Can move across threads, but not shared without synchronization
|
||||
unsafe impl Send for CudaStream {}
|
||||
208
crates/xserv-cuda/tests/integration.rs
Normal file
208
crates/xserv-cuda/tests/integration.rs
Normal file
@@ -0,0 +1,208 @@
|
||||
use xserv_cuda::*;
|
||||
|
||||
#[test]
|
||||
fn test_device_info() {
|
||||
let count = device::device_count().expect("failed to get device count");
|
||||
assert!(count > 0, "no CUDA devices found");
|
||||
|
||||
let info = device::device_info(0).expect("failed to get device info");
|
||||
println!("GPU 0: {}", info.name);
|
||||
println!(" Memory: {} MB", info.total_memory / (1024 * 1024));
|
||||
println!(
|
||||
" Compute Capability: {}.{}",
|
||||
info.compute_major, info.compute_minor
|
||||
);
|
||||
println!(" SM Count: {}", info.sm_count);
|
||||
println!(" Shared Mem/Block: {} KB", info.shared_mem_per_block / 1024);
|
||||
println!(" Warp Size: {}", info.warp_size);
|
||||
println!(" Max Threads/Block: {}", info.max_threads_per_block);
|
||||
|
||||
assert!(info.total_memory > 0);
|
||||
assert!(info.sm_count > 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gpu_buffer_h2d_d2h() {
|
||||
device::set_device(0).unwrap();
|
||||
|
||||
let data: Vec<u8> = (0..256).map(|i| (i % 256) as u8).collect();
|
||||
let mut buf = GpuBuffer::alloc(data.len()).unwrap();
|
||||
buf.copy_from_host(&data).unwrap();
|
||||
|
||||
let mut out = vec![0u8; data.len()];
|
||||
buf.copy_to_host(&mut out).unwrap();
|
||||
|
||||
assert_eq!(data, out, "H2D → D2H roundtrip mismatch");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gpu_buffer_large() {
|
||||
device::set_device(0).unwrap();
|
||||
|
||||
let size = 64 * 1024 * 1024; // 64 MB
|
||||
let data: Vec<u8> = (0..size).map(|i| (i % 251) as u8).collect();
|
||||
let mut buf = GpuBuffer::alloc(size).unwrap();
|
||||
buf.copy_from_host(&data).unwrap();
|
||||
|
||||
let mut out = vec![0u8; size];
|
||||
buf.copy_to_host(&mut out).unwrap();
|
||||
|
||||
assert_eq!(data, out, "64MB roundtrip mismatch");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gpu_buffer_d2d() {
|
||||
device::set_device(0).unwrap();
|
||||
|
||||
let data: Vec<u8> = (0..1024).map(|i| (i % 256) as u8).collect();
|
||||
let mut src = GpuBuffer::alloc(data.len()).unwrap();
|
||||
src.copy_from_host(&data).unwrap();
|
||||
|
||||
let mut dst = GpuBuffer::alloc(data.len()).unwrap();
|
||||
dst.copy_from_device(&src).unwrap();
|
||||
|
||||
let mut out = vec![0u8; data.len()];
|
||||
dst.copy_to_host(&mut out).unwrap();
|
||||
|
||||
assert_eq!(data, out, "D2D copy mismatch");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gpu_buffer_zero() {
|
||||
device::set_device(0).unwrap();
|
||||
|
||||
let mut buf = GpuBuffer::alloc(1024).unwrap();
|
||||
buf.zero().unwrap();
|
||||
|
||||
let mut out = vec![0xFFu8; 1024];
|
||||
buf.copy_to_host(&mut out).unwrap();
|
||||
|
||||
assert!(out.iter().all(|&b| b == 0), "zero fill failed");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stream() {
|
||||
device::set_device(0).unwrap();
|
||||
|
||||
let stream = CudaStream::new().unwrap();
|
||||
stream.synchronize().unwrap();
|
||||
// stream drops here, should destroy cleanly
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vecadd_kernel() {
|
||||
device::set_device(0).unwrap();
|
||||
|
||||
let n = 1024;
|
||||
let a: Vec<f32> = (0..n).map(|i| i as f32).collect();
|
||||
let b: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
|
||||
let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();
|
||||
|
||||
let byte_len = n * std::mem::size_of::<f32>();
|
||||
|
||||
let mut d_a = GpuBuffer::alloc(byte_len).unwrap();
|
||||
let mut d_b = GpuBuffer::alloc(byte_len).unwrap();
|
||||
let mut d_c = GpuBuffer::alloc(byte_len).unwrap();
|
||||
|
||||
let a_bytes = unsafe { std::slice::from_raw_parts(a.as_ptr() as *const u8, byte_len) };
|
||||
let b_bytes = unsafe { std::slice::from_raw_parts(b.as_ptr() as *const u8, byte_len) };
|
||||
d_a.copy_from_host(a_bytes).unwrap();
|
||||
d_b.copy_from_host(b_bytes).unwrap();
|
||||
|
||||
unsafe {
|
||||
ffi::launch_vecadd_f32(
|
||||
d_a.as_ptr() as *const f32,
|
||||
d_b.as_ptr() as *const f32,
|
||||
d_c.as_mut_ptr() as *mut f32,
|
||||
n as i32,
|
||||
std::ptr::null_mut(), // default stream
|
||||
);
|
||||
}
|
||||
device::synchronize().unwrap();
|
||||
|
||||
let mut result = vec![0.0f32; n];
|
||||
let result_bytes =
|
||||
unsafe { std::slice::from_raw_parts_mut(result.as_mut_ptr() as *mut u8, byte_len) };
|
||||
d_c.copy_to_host(result_bytes).unwrap();
|
||||
|
||||
assert_eq!(result, expected, "vecadd kernel output mismatch");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_caching_allocator() {
|
||||
device::set_device(0).unwrap();
|
||||
|
||||
let mut alloc = CachingAllocator::new();
|
||||
|
||||
// First allocation: should trigger cudaMalloc
|
||||
let buf1 = alloc.alloc(1024).unwrap();
|
||||
assert_eq!(alloc.stats().cuda_malloc_count, 1);
|
||||
assert_eq!(alloc.stats().cache_hit_count, 0);
|
||||
|
||||
// Return to cache
|
||||
alloc.dealloc(buf1);
|
||||
|
||||
// Second allocation of same size: should hit cache
|
||||
let _buf2 = alloc.alloc(1024).unwrap();
|
||||
assert_eq!(alloc.stats().cuda_malloc_count, 1, "should reuse cached buffer");
|
||||
assert_eq!(alloc.stats().cache_hit_count, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_caching_allocator_different_sizes() {
|
||||
device::set_device(0).unwrap();
|
||||
|
||||
let mut alloc = CachingAllocator::new();
|
||||
|
||||
let buf1 = alloc.alloc(512).unwrap();
|
||||
let buf2 = alloc.alloc(2048).unwrap();
|
||||
|
||||
alloc.dealloc(buf1);
|
||||
alloc.dealloc(buf2);
|
||||
|
||||
// Re-alloc different sizes: each should hit its own bucket
|
||||
let _buf3 = alloc.alloc(512).unwrap();
|
||||
let _buf4 = alloc.alloc(2048).unwrap();
|
||||
|
||||
assert_eq!(alloc.stats().cuda_malloc_count, 2);
|
||||
assert_eq!(alloc.stats().cache_hit_count, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pinned_memory() {
|
||||
let mut pinned = PinnedBuffer::alloc(4096).unwrap();
|
||||
let slice = pinned.as_mut_slice();
|
||||
for (i, byte) in slice.iter_mut().enumerate() {
|
||||
*byte = (i % 256) as u8;
|
||||
}
|
||||
|
||||
device::set_device(0).unwrap();
|
||||
let mut gpu = GpuBuffer::alloc(4096).unwrap();
|
||||
gpu.copy_from_host(pinned.as_slice()).unwrap();
|
||||
|
||||
let mut out = vec![0u8; 4096];
|
||||
gpu.copy_to_host(&mut out).unwrap();
|
||||
|
||||
assert_eq!(pinned.as_slice(), &out[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_async_copy() {
|
||||
device::set_device(0).unwrap();
|
||||
let stream = CudaStream::new().unwrap();
|
||||
|
||||
let mut pinned = PinnedBuffer::alloc(4096).unwrap();
|
||||
for (i, byte) in pinned.as_mut_slice().iter_mut().enumerate() {
|
||||
*byte = (i % 256) as u8;
|
||||
}
|
||||
|
||||
let mut gpu = GpuBuffer::alloc(4096).unwrap();
|
||||
unsafe { gpu.copy_from_host_async(pinned.as_slice(), &stream).unwrap() };
|
||||
stream.synchronize().unwrap();
|
||||
|
||||
let mut out_pinned = PinnedBuffer::alloc(4096).unwrap();
|
||||
unsafe { gpu.copy_to_host_async(out_pinned.as_mut_slice(), &stream).unwrap() };
|
||||
stream.synchronize().unwrap();
|
||||
|
||||
assert_eq!(pinned.as_slice(), out_pinned.as_slice());
|
||||
}
|
||||
Reference in New Issue
Block a user