Files
xserv/crates/xserv-cuda/tests/integration.rs
Gahow Wang c8f7bc0c3c phase 0+1: fix Rust 2024 edition compat + memory query
- unsafe extern "C" blocks (Rust 2024 requirement)
- unsafe blocks inside unsafe fn bodies
- Use cudaMemGetInfo for accurate GPU memory reporting
- Remove cc "cuda" feature (doesn't exist, built-in)
- All 12 tests pass on RTX 5090 (CC 12.0, 170 SMs, 32GB)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-21 19:40:49 +08:00

210 lines
6.2 KiB
Rust

use xserv_cuda::*;
/// Smoke-tests device enumeration and the property query for GPU 0.
///
/// The queried properties are printed (run with `--nocapture` to see them) and
/// a few sanity checks are applied. Note that the 30 GiB memory floor matches a
/// 32 GB card (RTX 5090); it will fail on GPUs with less memory.
#[test]
fn test_device_info() {
    let device_total = device::device_count().expect("failed to get device count");
    assert!(device_total > 0, "no CUDA devices found");

    let info = device::device_info(0).expect("failed to get device info");

    // Convert the raw byte counts once so the report below stays readable.
    let total_mb = info.total_memory / (1024 * 1024);
    let free_mb = info.free_memory / (1024 * 1024);
    let shared_kb = info.shared_mem_per_block / 1024;

    println!("GPU 0: {}", info.name);
    println!(" Total Memory: {} MB", total_mb);
    println!(" Free Memory: {} MB", free_mb);
    println!(
        " Compute Capability: {}.{}",
        info.compute_major, info.compute_minor
    );
    println!(" SM Count: {}", info.sm_count);
    println!(" Shared Mem/Block: {} KB", shared_kb);
    println!(" Warp Size: {}", info.warp_size);
    println!(" Max Threads/Block: {}", info.max_threads_per_block);

    // Hardware-specific floor: a 32 GB card must report more than 30 GiB.
    assert!(info.total_memory > 30 * 1024 * 1024 * 1024);
    assert!(info.sm_count > 0);
}
/// Uploads 256 bytes (every `u8` value once) to the GPU and reads them back,
/// expecting a byte-exact host → device → host round trip.
#[test]
fn test_gpu_buffer_h2d_d2h() {
    device::set_device(0).unwrap();

    // 0, 1, ..., 255: each possible byte value exactly once.
    let payload: Vec<u8> = (0..=u8::MAX).collect();

    let mut device_buf = GpuBuffer::alloc(payload.len()).unwrap();
    device_buf.copy_from_host(&payload).unwrap();

    let mut readback = vec![0u8; payload.len()];
    device_buf.copy_to_host(&mut readback).unwrap();

    assert_eq!(payload, readback, "H2D → D2H roundtrip mismatch");
}
/// Round-trips a 64 MB buffer through the GPU and checks it comes back intact.
#[test]
fn test_gpu_buffer_large() {
    // 64 MB
    const SIZE: usize = 64 * 1024 * 1024;

    device::set_device(0).unwrap();

    // Repeating 0..=250 pattern (period 251), i.e. byte `i` holds `i % 251`.
    let payload: Vec<u8> = (0..251u8).cycle().take(SIZE).collect();

    let mut device_buf = GpuBuffer::alloc(SIZE).unwrap();
    device_buf.copy_from_host(&payload).unwrap();

    let mut readback = vec![0u8; SIZE];
    device_buf.copy_to_host(&mut readback).unwrap();

    assert_eq!(payload, readback, "64MB roundtrip mismatch");
}
/// Copies 1 KiB from one device buffer to another and verifies the destination
/// holds the bytes that were originally uploaded to the source.
#[test]
fn test_gpu_buffer_d2d() {
    device::set_device(0).unwrap();

    // Four repetitions of 0..=255, i.e. byte `i` holds `i % 256`.
    let payload: Vec<u8> = (0..=u8::MAX).cycle().take(1024).collect();

    // Stage the data on the device.
    let mut source = GpuBuffer::alloc(payload.len()).unwrap();
    source.copy_from_host(&payload).unwrap();

    // Device-to-device copy into a second allocation of the same size.
    let mut target = GpuBuffer::alloc(payload.len()).unwrap();
    target.copy_from_device(&source).unwrap();

    // Read the destination back and compare against the host original.
    let mut readback = vec![0u8; payload.len()];
    target.copy_to_host(&mut readback).unwrap();

    assert_eq!(payload, readback, "D2D copy mismatch");
}
/// Zero-fills a 1 KiB device buffer and confirms every byte reads back as 0.
#[test]
fn test_gpu_buffer_zero() {
    const LEN: usize = 1024;

    device::set_device(0).unwrap();

    let mut device_buf = GpuBuffer::alloc(LEN).unwrap();
    device_buf.zero().unwrap();

    // Pre-fill the host side with 0xFF so stale host data cannot pass the check.
    let mut readback = vec![0xFFu8; LEN];
    device_buf.copy_to_host(&mut readback).unwrap();

    let has_nonzero = readback.iter().any(|&byte| byte != 0);
    assert!(!has_nonzero, "zero fill failed");
}
/// Creates a stream, synchronizes it while idle, and lets it be destroyed.
#[test]
fn test_stream() {
    device::set_device(0).unwrap();

    let stream = CudaStream::new().unwrap();
    stream.synchronize().unwrap();

    // Release the stream explicitly; teardown is expected to complete cleanly.
    drop(stream);
}
/// Launches the f32 vector-add kernel on 1024 elements and compares the device
/// result with a sum computed on the host.
#[test]
fn test_vecadd_kernel() {
    device::set_device(0).unwrap();

    let n = 1024;
    let lhs: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let rhs: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
    // Host-side reference result.
    let expected: Vec<f32> = lhs.iter().zip(rhs.iter()).map(|(x, y)| *x + *y).collect();
    let byte_len = n * std::mem::size_of::<f32>();

    let mut d_lhs = GpuBuffer::alloc(byte_len).unwrap();
    let mut d_rhs = GpuBuffer::alloc(byte_len).unwrap();
    let mut d_out = GpuBuffer::alloc(byte_len).unwrap();

    // SAFETY: `lhs` holds `n` initialized f32 values, which is exactly
    // `byte_len` bytes; `u8` has no alignment requirement and the view is only
    // read while `lhs` is alive.
    let lhs_bytes = unsafe { std::slice::from_raw_parts(lhs.as_ptr().cast::<u8>(), byte_len) };
    // SAFETY: same reasoning as above, applied to `rhs`.
    let rhs_bytes = unsafe { std::slice::from_raw_parts(rhs.as_ptr().cast::<u8>(), byte_len) };

    d_lhs.copy_from_host(lhs_bytes).unwrap();
    d_rhs.copy_from_host(rhs_bytes).unwrap();

    // SAFETY: all three device buffers were allocated with `byte_len` bytes,
    // i.e. room for `n` f32 values, and stay alive past the synchronize below.
    // NOTE(review): the exact contract of the FFI launcher is not visible
    // here — confirm against its declaration.
    unsafe {
        ffi::launch_vecadd_f32(
            d_lhs.as_ptr() as *const f32,
            d_rhs.as_ptr() as *const f32,
            d_out.as_mut_ptr() as *mut f32,
            n as i32,
            std::ptr::null_mut(), // default stream
        );
    }
    // Wait for the kernel before reading the output buffer.
    device::synchronize().unwrap();

    let mut result = vec![0.0f32; n];
    // SAFETY: `result` owns `n` f32 values (`byte_len` bytes); every bit
    // pattern is a valid f32, so overwriting the storage byte-wise is sound.
    // The mutable view is the only access to `result` until the copy is done.
    let result_bytes =
        unsafe { std::slice::from_raw_parts_mut(result.as_mut_ptr().cast::<u8>(), byte_len) };
    d_out.copy_to_host(result_bytes).unwrap();

    assert_eq!(result, expected, "vecadd kernel output mismatch");
}
/// Checks that a freed block is served from the allocator's cache when the
/// same size is requested again, as reported by the allocator statistics.
#[test]
fn test_caching_allocator() {
    device::set_device(0).unwrap();
    let mut allocator = CachingAllocator::new();

    // Cold request: expected to be counted as one real device allocation.
    let first = allocator.alloc(1024).unwrap();
    let stats = allocator.stats();
    assert_eq!(stats.cuda_malloc_count, 1);
    assert_eq!(stats.cache_hit_count, 0);

    // Hand the block back so it becomes available for reuse.
    allocator.dealloc(first);

    // Same size again: the counters should show a cache hit and no new malloc.
    let _second = allocator.alloc(1024).unwrap();
    let stats = allocator.stats();
    assert_eq!(stats.cuda_malloc_count, 1, "should reuse cached buffer");
    assert_eq!(stats.cache_hit_count, 1);
}
/// Checks that blocks of two different sizes are cached independently: after
/// freeing both, re-requesting each size must not trigger a new allocation.
#[test]
fn test_caching_allocator_different_sizes() {
    device::set_device(0).unwrap();
    let mut allocator = CachingAllocator::new();

    let small = allocator.alloc(512).unwrap();
    let large = allocator.alloc(2048).unwrap();

    // Return both blocks, in the same order they were obtained.
    allocator.dealloc(small);
    allocator.dealloc(large);

    // Each request should be satisfied by the cached block of matching size.
    let _small_again = allocator.alloc(512).unwrap();
    let _large_again = allocator.alloc(2048).unwrap();

    let stats = allocator.stats();
    assert_eq!(stats.cuda_malloc_count, 2);
    assert_eq!(stats.cache_hit_count, 2);
}
/// Fills a pinned host buffer with a byte pattern, uploads it to the GPU with a
/// synchronous copy, and verifies the data read back matches the pinned source.
#[test]
fn test_pinned_memory() {
    let mut pinned = PinnedBuffer::alloc(4096).unwrap();

    // Byte `i` receives `i % 256`.
    for (slot, value) in pinned.as_mut_slice().iter_mut().zip((0..=u8::MAX).cycle()) {
        *slot = value;
    }

    device::set_device(0).unwrap();

    let mut device_buf = GpuBuffer::alloc(4096).unwrap();
    device_buf.copy_from_host(pinned.as_slice()).unwrap();

    let mut readback = vec![0u8; 4096];
    device_buf.copy_to_host(&mut readback).unwrap();

    assert_eq!(pinned.as_slice(), readback.as_slice());
}
/// Round-trips 4 KiB between pinned host buffers and the GPU using the
/// asynchronous copy calls on an explicit stream.
#[test]
fn test_async_copy() {
    device::set_device(0).unwrap();
    let stream = CudaStream::new().unwrap();

    let mut source = PinnedBuffer::alloc(4096).unwrap();
    // Byte `i` receives `i % 256`.
    for (slot, value) in source.as_mut_slice().iter_mut().zip((0..=u8::MAX).cycle()) {
        *slot = value;
    }

    let mut device_buf = GpuBuffer::alloc(4096).unwrap();

    // SAFETY: `source` and `device_buf` are neither modified nor dropped before
    // the stream is synchronized on the next line.
    // NOTE(review): presumably that is what the async API requires — confirm
    // against its `# Safety` docs.
    unsafe { device_buf.copy_from_host_async(source.as_slice(), &stream) }.unwrap();
    stream.synchronize().unwrap();

    let mut sink = PinnedBuffer::alloc(4096).unwrap();

    // SAFETY: `sink` is not read until the stream has been synchronized below,
    // and both buffers stay alive until then.
    unsafe { device_buf.copy_to_host_async(sink.as_mut_slice(), &stream) }.unwrap();
    stream.synchronize().unwrap();

    assert_eq!(source.as_slice(), sink.as_slice());
}