- unsafe extern "C" blocks (Rust 2024 requirement)
- unsafe blocks inside unsafe fn bodies
- Use cudaMemGetInfo for accurate GPU memory reporting
- Remove cc "cuda" feature (doesn't exist, built-in)
- All 12 tests pass on RTX 5090 (CC 12.0, 170 SMs, 32GB)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
210 lines
6.2 KiB
Rust
210 lines
6.2 KiB
Rust
use xserv_cuda::*;
|
|
|
|
/// Smoke-tests device enumeration and the properties reported for GPU 0.
///
/// The properties are printed for manual inspection (visible with
/// `cargo test -- --nocapture`). The assertions only check invariants that
/// must hold on any CUDA device, so the test is not tied to one GPU model.
#[test]
fn test_device_info() {
    let count = device::device_count().expect("failed to get device count");
    assert!(count > 0, "no CUDA devices found");

    let info = device::device_info(0).expect("failed to get device info");
    println!("GPU 0: {}", info.name);
    println!(" Total Memory: {} MB", info.total_memory / (1024 * 1024));
    println!(" Free Memory: {} MB", info.free_memory / (1024 * 1024));
    println!(
        " Compute Capability: {}.{}",
        info.compute_major, info.compute_minor
    );
    println!(" SM Count: {}", info.sm_count);
    println!(" Shared Mem/Block: {} KB", info.shared_mem_per_block / 1024);
    println!(" Warp Size: {}", info.warp_size);
    println!(" Max Threads/Block: {}", info.max_threads_per_block);

    // Hardware-independent sanity checks. The earlier `> 30 GiB` threshold was
    // only satisfiable on a 32 GB card and made the test fail on smaller GPUs.
    assert!(info.total_memory > 0, "device reported zero total memory");
    assert!(info.sm_count > 0, "device reported zero SMs");
}
|
|
|
|
/// Uploads a 256-byte pattern into a device buffer, downloads it again and
/// expects the downloaded bytes to equal the uploaded ones.
#[test]
fn test_gpu_buffer_h2d_d2h() {
    device::set_device(0).unwrap();

    // Every byte value exactly once, in order: 0, 1, ..., 255.
    let host_src: Vec<u8> = (0..=u8::MAX).collect();

    let mut device_buf = GpuBuffer::alloc(host_src.len()).unwrap();
    device_buf.copy_from_host(&host_src).unwrap();

    // Read back into a zeroed buffer so stale contents cannot mask a failure.
    let mut host_dst = vec![0u8; host_src.len()];
    device_buf.copy_to_host(&mut host_dst).unwrap();

    assert_eq!(host_src, host_dst, "H2D → D2H roundtrip mismatch");
}
|
|
|
|
/// Same host -> device -> host round trip as the small-buffer test, but with
/// a 64 MB payload.
#[test]
fn test_gpu_buffer_large() {
    device::set_device(0).unwrap();

    const LEN: usize = 64 * 1024 * 1024; // 64 MB

    // Repeating 0..=250 ramp, i.e. byte `i` holds `i % 251`.
    let host_src: Vec<u8> = (0..251u8).cycle().take(LEN).collect();

    let mut device_buf = GpuBuffer::alloc(LEN).unwrap();
    device_buf.copy_from_host(&host_src).unwrap();

    let mut host_dst = vec![0u8; LEN];
    device_buf.copy_to_host(&mut host_dst).unwrap();

    assert_eq!(host_src, host_dst, "64MB roundtrip mismatch");
}
|
|
|
|
/// Copies a pattern host -> device A -> device B -> host and expects the bytes
/// that come back from B to equal the bytes uploaded to A.
#[test]
fn test_gpu_buffer_d2d() {
    device::set_device(0).unwrap();

    // 1024 bytes of a repeating 0..=255 ramp, i.e. byte `i` holds `i % 256`.
    let pattern: Vec<u8> = (0..=u8::MAX).cycle().take(1024).collect();

    let mut source = GpuBuffer::alloc(pattern.len()).unwrap();
    source.copy_from_host(&pattern).unwrap();

    let mut target = GpuBuffer::alloc(pattern.len()).unwrap();
    target.copy_from_device(&source).unwrap();

    // Only the target buffer is read back, so the comparison exercises the
    // device-to-device copy rather than the original upload alone.
    let mut readback = vec![0u8; pattern.len()];
    target.copy_to_host(&mut readback).unwrap();

    assert_eq!(pattern, readback, "D2D copy mismatch");
}
|
|
|
|
/// Calls `zero()` on a 1024-byte device buffer and expects every byte read
/// back from it to be 0.
#[test]
fn test_gpu_buffer_zero() {
    device::set_device(0).unwrap();

    const LEN: usize = 1024;

    let mut device_buf = GpuBuffer::alloc(LEN).unwrap();
    device_buf.zero().unwrap();

    // Pre-fill the host side with 0xFF so the check cannot pass unless the
    // download really overwrote it with zeros.
    let mut host = vec![0xFFu8; LEN];
    device_buf.copy_to_host(&mut host).unwrap();

    assert!(!host.iter().any(|&byte| byte != 0), "zero fill failed");
}
|
|
|
|
/// Creates a stream, synchronizes it while no work has been queued by this
/// test, and then drops it.
#[test]
fn test_stream() {
    device::set_device(0).unwrap();

    let stream = CudaStream::new().unwrap();
    stream.synchronize().unwrap();

    // Dropping the stream is part of what is being exercised: it is expected
    // to be destroyed cleanly.
    drop(stream);
}
|
|
|
|
/// Runs `ffi::launch_vecadd_f32` on two 1024-element `f32` vectors and
/// compares the downloaded output with the element-wise sums computed on the
/// host.
#[test]
fn test_vecadd_kernel() {
    device::set_device(0).unwrap();

    let n: usize = 1024;
    let nbytes = n * std::mem::size_of::<f32>();

    // Host inputs: lhs[i] = i and rhs[i] = 2 * i.
    let lhs: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let rhs: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
    let expected: Vec<f32> = lhs.iter().zip(rhs.iter()).map(|(&x, &y)| x + y).collect();

    let mut d_lhs = GpuBuffer::alloc(nbytes).unwrap();
    let mut d_rhs = GpuBuffer::alloc(nbytes).unwrap();
    let mut d_out = GpuBuffer::alloc(nbytes).unwrap();

    // SAFETY: `lhs` and `rhs` are live `Vec<f32>`s with `n` elements, so each
    // allocation is valid for reads of `nbytes` bytes; `u8` has alignment 1 and
    // every byte of an `f32` is initialized.
    let lhs_bytes = unsafe { std::slice::from_raw_parts(lhs.as_ptr().cast::<u8>(), nbytes) };
    let rhs_bytes = unsafe { std::slice::from_raw_parts(rhs.as_ptr().cast::<u8>(), nbytes) };
    d_lhs.copy_from_host(lhs_bytes).unwrap();
    d_rhs.copy_from_host(rhs_bytes).unwrap();

    // SAFETY: all three device buffers were allocated with `nbytes` bytes (room
    // for `n` `f32`s) and stay alive past the `device::synchronize()` below.
    // NOTE(review): the launcher's full contract is not visible from this file,
    // and whatever it returns is discarded here — confirm that is intended.
    unsafe {
        ffi::launch_vecadd_f32(
            d_lhs.as_ptr() as *const f32,
            d_rhs.as_ptr() as *const f32,
            d_out.as_mut_ptr() as *mut f32,
            n as i32, // 1024 fits in i32, so this cast cannot truncate
            std::ptr::null_mut(), // default stream
        );
    }
    // Wait before reading the output back.
    device::synchronize().unwrap();

    let mut result = vec![0.0f32; n];
    // SAFETY: `result` owns `n` `f32`s (`nbytes` bytes) and is only accessed
    // through `result_bytes` until the download completes; any byte pattern is
    // a valid `f32`.
    let result_bytes =
        unsafe { std::slice::from_raw_parts_mut(result.as_mut_ptr().cast::<u8>(), nbytes) };
    d_out.copy_to_host(result_bytes).unwrap();

    assert_eq!(result, expected, "vecadd kernel output mismatch");
}
|
|
|
|
/// Checks the allocator's counters across an alloc / dealloc / alloc sequence
/// of one size: the first request is expected to count as a `cudaMalloc`, the
/// second as a cache hit.
#[test]
fn test_caching_allocator() {
    device::set_device(0).unwrap();

    let size = 1024;
    let mut allocator = CachingAllocator::new();

    // Nothing has been returned to the allocator yet, so this request is
    // expected to go to cudaMalloc.
    let first = allocator.alloc(size).unwrap();
    assert_eq!(allocator.stats().cuda_malloc_count, 1);
    assert_eq!(allocator.stats().cache_hit_count, 0);

    // Hand the buffer back so it can be reused.
    allocator.dealloc(first);

    // Same size again: expected to be served from the cache, leaving the
    // cudaMalloc counter unchanged.
    let _second = allocator.alloc(size).unwrap();
    assert_eq!(allocator.stats().cuda_malloc_count, 1, "should reuse cached buffer");
    assert_eq!(allocator.stats().cache_hit_count, 1);
}
|
|
|
|
/// Allocates and releases two buffers of different sizes, then requests both
/// sizes again and expects two `cudaMalloc` calls and two cache hits overall.
#[test]
fn test_caching_allocator_different_sizes() {
    device::set_device(0).unwrap();

    let small = 512;
    let large = 2048;
    let mut allocator = CachingAllocator::new();

    // Two first-time requests of distinct sizes.
    let small_buf = allocator.alloc(small).unwrap();
    let large_buf = allocator.alloc(large).unwrap();

    // Return both so each size has a cached buffer available.
    allocator.dealloc(small_buf);
    allocator.dealloc(large_buf);

    // Requesting the same two sizes again should hit the matching bucket each
    // time instead of allocating.
    let _small_again = allocator.alloc(small).unwrap();
    let _large_again = allocator.alloc(large).unwrap();

    assert_eq!(allocator.stats().cuda_malloc_count, 2);
    assert_eq!(allocator.stats().cache_hit_count, 2);
}
|
|
|
|
/// Fills a `PinnedBuffer` with a byte pattern, uploads it through a
/// `GpuBuffer`, downloads into a plain `Vec` and expects identical bytes.
#[test]
fn test_pinned_memory() {
    let mut staging = PinnedBuffer::alloc(4096).unwrap();

    // Write a repeating 0..=255 ramp, i.e. byte `i` holds `i % 256`.
    staging
        .as_mut_slice()
        .iter_mut()
        .zip((0..=u8::MAX).cycle())
        .for_each(|(slot, value)| *slot = value);

    // The device is selected only after the pinned allocation, exactly as in
    // the original ordering of this test.
    device::set_device(0).unwrap();
    let mut device_buf = GpuBuffer::alloc(4096).unwrap();
    device_buf.copy_from_host(staging.as_slice()).unwrap();

    let mut readback = vec![0u8; 4096];
    device_buf.copy_to_host(&mut readback).unwrap();

    assert_eq!(staging.as_slice(), readback.as_slice());
}
|
|
|
|
/// Round-trips 4096 bytes between two `PinnedBuffer`s through a `GpuBuffer`
/// using the `*_async` copy methods on an explicit stream, synchronizing the
/// stream after each copy.
#[test]
fn test_async_copy() {
    device::set_device(0).unwrap();
    let stream = CudaStream::new().unwrap();

    // Source buffer holding a repeating 0..=255 ramp (byte `i` is `i % 256`).
    let mut upload = PinnedBuffer::alloc(4096).unwrap();
    upload
        .as_mut_slice()
        .iter_mut()
        .zip((0..=u8::MAX).cycle())
        .for_each(|(slot, value)| *slot = value);

    let mut device_buf = GpuBuffer::alloc(4096).unwrap();

    // SAFETY: `upload` and `device_buf` both outlive the `synchronize()` that
    // follows, and neither is touched in between.
    // NOTE(review): the method's documented safety contract is not visible
    // from this file — confirm these are the conditions it requires.
    let queued_upload = unsafe { device_buf.copy_from_host_async(upload.as_slice(), &stream) };
    queued_upload.unwrap();
    stream.synchronize().unwrap();

    let mut download = PinnedBuffer::alloc(4096).unwrap();

    // SAFETY: as above — `download` is not read until the stream has been
    // synchronized, and both buffers stay alive until then.
    let queued_download =
        unsafe { device_buf.copy_to_host_async(download.as_mut_slice(), &stream) };
    queued_download.unwrap();
    stream.synchronize().unwrap();

    assert_eq!(upload.as_slice(), download.as_slice());
}
|