// xserv/crates/xserv-cuda/tests/integration.rs

use xserv_cuda::*;

/// Smoke-tests device enumeration and the property query for GPU 0.
///
/// The reported properties are printed (run with `cargo test -- --nocapture`
/// to see them). Only hardware-independent invariants are asserted, so the
/// test passes on any machine with a CUDA device instead of only on a card
/// with more than 30 GiB of memory.
#[test]
fn test_device_info() {
    let count = device::device_count().expect("failed to get device count");
    assert!(count > 0, "no CUDA devices found");

    let info = device::device_info(0).expect("failed to get device info");
    println!("GPU 0: {}", info.name);
    println!("  Total Memory: {} MB", info.total_memory / (1024 * 1024));
    println!("  Free Memory:  {} MB", info.free_memory / (1024 * 1024));
    println!(
        "  Compute Capability: {}.{}",
        info.compute_major, info.compute_minor
    );
    println!("  SM Count: {}", info.sm_count);
    println!("  Shared Mem/Block: {} KB", info.shared_mem_per_block / 1024);
    println!("  Warp Size: {}", info.warp_size);
    println!("  Max Threads/Block: {}", info.max_threads_per_block);

    // The previous check required > 30 GiB (true for a 32 GB RTX 5090 only),
    // which made the whole suite fail on any other GPU. A zero value is still
    // caught, which is what indicates a broken query.
    assert!(info.total_memory > 0, "device 0 reports zero total memory");
    assert!(info.sm_count > 0, "device 0 reports zero SMs");
}

/// Round-trips 256 bytes host → device → host and checks that they come back
/// unchanged.
#[test]
fn test_gpu_buffer_h2d_d2h() {
    device::set_device(0).unwrap();

    // Every byte value exactly once, in order: 0, 1, ..., 255.
    let host_src: Vec<u8> = (0..=u8::MAX).collect();
    let len = host_src.len();

    let mut device_buf = GpuBuffer::alloc(len).unwrap();
    device_buf.copy_from_host(&host_src).unwrap();

    // Read back into a separate, zero-initialised host buffer.
    let mut host_dst = vec![0u8; len];
    device_buf.copy_to_host(&mut host_dst).unwrap();

    assert_eq!(host_src, host_dst, "H2D → D2H roundtrip mismatch");
}

/// Round-trips a 64 MB buffer host → device → host and checks that every byte
/// survives.
///
/// On failure the test reports the first differing byte instead of using
/// `assert_eq!` on the two vectors, which would Debug-print both 64 MB
/// buffers into the test log.
#[test]
fn test_gpu_buffer_large() {
    device::set_device(0).unwrap();

    let size = 64 * 1024 * 1024; // 64 MB
    // 251 is prime, so the pattern's period never lines up with power-of-two
    // chunk boundaries (presumably why it was chosen over 256).
    let data: Vec<u8> = (0..size).map(|i| (i % 251) as u8).collect();
    let mut buf = GpuBuffer::alloc(size).unwrap();
    buf.copy_from_host(&data).unwrap();

    let mut out = vec![0u8; size];
    buf.copy_to_host(&mut out).unwrap();

    // `data` and `out` have the same length by construction, so `zip` covers
    // every byte of both.
    let first_mismatch = data.iter().zip(&out).position(|(want, got)| want != got);
    if let Some(idx) = first_mismatch {
        panic!(
            "64MB roundtrip mismatch at byte {}: expected {}, got {}",
            idx, data[idx], out[idx]
        );
    }
}

/// Uploads 1024 bytes into one device buffer, copies them into a second device
/// buffer, and verifies the second buffer's contents on the host.
#[test]
fn test_gpu_buffer_d2d() {
    device::set_device(0).unwrap();

    const LEN: usize = 1024;

    // Repeating 0..=255 ramp: the same bytes as `i % 256` for `i` in 0..1024.
    let pattern: Vec<u8> = (0..=u8::MAX).cycle().take(LEN).collect();

    let mut src = GpuBuffer::alloc(LEN).unwrap();
    src.copy_from_host(&pattern).unwrap();

    let mut dst = GpuBuffer::alloc(LEN).unwrap();
    dst.copy_from_device(&src).unwrap();

    // Only `dst` is read back, so a match shows the device-side copy worked.
    let mut readback = vec![0u8; LEN];
    dst.copy_to_host(&mut readback).unwrap();

    assert_eq!(pattern, readback, "D2D copy mismatch");
}

/// Zero-fills a 1024-byte device buffer and checks that the host reads back
/// nothing but zeros.
#[test]
fn test_gpu_buffer_zero() {
    device::set_device(0).unwrap();

    const LEN: usize = 1024;

    let mut device_buf = GpuBuffer::alloc(LEN).unwrap();
    device_buf.zero().unwrap();

    // Start the host buffer at 0xFF so a copy that writes nothing cannot be
    // mistaken for a successful zero fill.
    let mut readback = vec![0xFFu8; LEN];
    device_buf.copy_to_host(&mut readback).unwrap();

    let has_nonzero = readback.iter().any(|&byte| byte != 0);
    assert!(!has_nonzero, "zero fill failed");
}

/// Creates a stream, synchronizes it with no work queued, and drops it.
#[test]
fn test_stream() {
    device::set_device(0).unwrap();

    let stream = CudaStream::new().unwrap();
    stream.synchronize().unwrap();

    // Dropping the stream is part of what is under test: it should be
    // destroyed cleanly. The explicit `drop` just makes that step visible.
    drop(stream);
}

/// Launches `launch_vecadd_f32` on two 1024-element inputs and compares the
/// device result with the sum computed on the host.
#[test]
fn test_vecadd_kernel() {
    // Views an `f32` slice as the raw bytes that back it.
    fn as_bytes(values: &[f32]) -> &[u8] {
        let len = values.len() * std::mem::size_of::<f32>();
        // SAFETY: pointer and length cover exactly the memory of `values`,
        // which stays borrowed for the returned lifetime. `u8` has alignment 1
        // and `f32` has no padding bytes, so every byte is initialised.
        unsafe { std::slice::from_raw_parts(values.as_ptr() as *const u8, len) }
    }

    // Mutable counterpart of `as_bytes`.
    fn as_bytes_mut(values: &mut [f32]) -> &mut [u8] {
        let len = values.len() * std::mem::size_of::<f32>();
        // SAFETY: same region as in `as_bytes`, exclusively borrowed through
        // `values`. Any bytes written through the view form valid `f32`s,
        // since every bit pattern is a valid `f32`.
        unsafe { std::slice::from_raw_parts_mut(values.as_mut_ptr() as *mut u8, len) }
    }

    device::set_device(0).unwrap();

    let n = 1024;
    let lhs: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let rhs: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
    let expected: Vec<f32> = lhs.iter().zip(rhs.iter()).map(|(x, y)| x + y).collect();

    let byte_len = n * std::mem::size_of::<f32>();

    let mut d_a = GpuBuffer::alloc(byte_len).unwrap();
    let mut d_b = GpuBuffer::alloc(byte_len).unwrap();
    let mut d_c = GpuBuffer::alloc(byte_len).unwrap();

    d_a.copy_from_host(as_bytes(&lhs)).unwrap();
    d_b.copy_from_host(as_bytes(&rhs)).unwrap();

    // SAFETY: all three device buffers were allocated with room for `n` f32
    // values and outlive the `synchronize` below.
    // NOTE(review): assumes the kernel touches exactly `n` elements of each
    // pointer — confirm against the kernel source.
    unsafe {
        ffi::launch_vecadd_f32(
            d_a.as_ptr() as *const f32,
            d_b.as_ptr() as *const f32,
            d_c.as_mut_ptr() as *mut f32,
            n as i32,
            std::ptr::null_mut(), // default stream
        );
    }
    // Wait for the device before reading the output buffer.
    device::synchronize().unwrap();

    let mut result = vec![0.0f32; n];
    d_c.copy_to_host(as_bytes_mut(&mut result)).unwrap();

    assert_eq!(result, expected, "vecadd kernel output mismatch");
}

/// Checks that a buffer handed back to a `CachingAllocator` is reused for the
/// next request of the same size, as shown by the allocator's counters.
#[test]
fn test_caching_allocator() {
    device::set_device(0).unwrap();

    let mut allocator = CachingAllocator::new();

    // Cold cache: the first request is expected to be a real allocation.
    let first = allocator.alloc(1024).unwrap();
    {
        let stats = allocator.stats();
        assert_eq!(stats.cuda_malloc_count, 1);
        assert_eq!(stats.cache_hit_count, 0);
    }

    // Hand the buffer back so it can be cached.
    allocator.dealloc(first);

    // Same size again: the malloc counter must not move, the hit counter must.
    let _second = allocator.alloc(1024).unwrap();
    {
        let stats = allocator.stats();
        assert_eq!(stats.cuda_malloc_count, 1, "should reuse cached buffer");
        assert_eq!(stats.cache_hit_count, 1);
    }
}

/// Checks the caching allocator with two different request sizes: after both
/// buffers are returned, re-requesting each size should be a cache hit.
#[test]
fn test_caching_allocator_different_sizes() {
    device::set_device(0).unwrap();

    let mut allocator = CachingAllocator::new();

    let small = allocator.alloc(512).unwrap();
    let large = allocator.alloc(2048).unwrap();

    allocator.dealloc(small);
    allocator.dealloc(large);

    // Ask for the same two sizes again; each is expected to come out of the
    // cache entry for its own size rather than trigger a new allocation.
    let _small_again = allocator.alloc(512).unwrap();
    let _large_again = allocator.alloc(2048).unwrap();

    // Two real allocations (the initial pair) and two hits (the second pair).
    let stats = allocator.stats();
    assert_eq!(stats.cuda_malloc_count, 2);
    assert_eq!(stats.cache_hit_count, 2);
}

/// Fills a `PinnedBuffer` on the host, uploads it to a device buffer, and
/// checks that a read-back into ordinary host memory matches.
#[test]
fn test_pinned_memory() {
    let mut pinned = PinnedBuffer::alloc(4096).unwrap();

    // Repeating 0..=255 ramp: byte `i` receives `i % 256`.
    pinned
        .as_mut_slice()
        .iter_mut()
        .zip((0..=u8::MAX).cycle())
        .for_each(|(slot, value)| *slot = value);

    device::set_device(0).unwrap();
    let mut device_buf = GpuBuffer::alloc(4096).unwrap();
    device_buf.copy_from_host(pinned.as_slice()).unwrap();

    let mut readback = vec![0u8; 4096];
    device_buf.copy_to_host(&mut readback).unwrap();

    assert_eq!(pinned.as_slice(), readback.as_slice());
}

/// Round-trips 4096 bytes through a device buffer using the asynchronous copy
/// methods on an explicit stream, with `PinnedBuffer`s on the host side.
#[test]
fn test_async_copy() {
    device::set_device(0).unwrap();
    let stream = CudaStream::new().unwrap();

    let mut host_src = PinnedBuffer::alloc(4096).unwrap();
    // Repeating 0..=255 ramp: byte `i` receives `i % 256`.
    host_src
        .as_mut_slice()
        .iter_mut()
        .zip((0..=u8::MAX).cycle())
        .for_each(|(slot, value)| *slot = value);

    let mut device_buf = GpuBuffer::alloc(4096).unwrap();

    // SAFETY: `host_src` and `device_buf` are alive and untouched until the
    // `synchronize` just below returns.
    // NOTE(review): presumably that is the contract of `copy_from_host_async`
    // — confirm against its `# Safety` docs.
    let upload = unsafe { device_buf.copy_from_host_async(host_src.as_slice(), &stream) };
    upload.unwrap();
    stream.synchronize().unwrap();

    let mut host_dst = PinnedBuffer::alloc(4096).unwrap();

    // SAFETY: as above — `host_dst` is not read until after the stream has
    // been synchronized.
    let download = unsafe { device_buf.copy_to_host_async(host_dst.as_mut_slice(), &stream) };
    download.unwrap();
    stream.synchronize().unwrap();

    assert_eq!(host_src.as_slice(), host_dst.as_slice());
}