// xtrain/crates/xtrain-cuda/tests/integration.rs

// Smoke test for the Rust↔CUDA build chain: allocate two host vectors, run the
// vector-add kernel on the GPU, copy back, and assert the result is correct.
//
// Requires nvcc + a GPU, so it is gated behind `not(no_cuda)`. On the local
// (GPU-less) machine build.rs sets the `no_cuda` cfg and this test is skipped,
// keeping host-side `cargo check`/`cargo test --no-run` meaningful.
#![cfg(not(no_cuda))]

use xtrain_cuda::{GpuBuffer, device, ffi};

/// End-to-end smoke test for the vector-add kernel.
///
/// Builds two host vectors, uploads them into device buffers, launches
/// `ffi::launch_vecadd_f32`, copies the output buffer back to the host and
/// checks it element-for-element against a sum computed on the CPU.
///
/// # Panics
///
/// Panics (failing the test) if no CUDA device is reported, if any device,
/// allocation or copy call returns an error, or if the kernel output differs
/// from the host reference.
#[test]
fn test_vecadd_kernel() {
    let count = device::device_count().expect("failed to get device count");
    assert!(count > 0, "no CUDA devices found");
    device::set_device(0).expect("failed to select device 0");

    let n: usize = 1024;
    // The launcher takes the element count as an `i32`. Convert once, with a
    // check, so a larger `n` fails loudly here instead of being silently
    // truncated by an `as` cast at the FFI boundary.
    let n_i32 = i32::try_from(n).expect("element count must fit in i32 for the kernel launch");

    // Inputs are small integers, so every value (and every sum) is exactly
    // representable in f32 and the final comparison can be exact.
    let a: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let b: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
    let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();

    let byte_len = n * std::mem::size_of::<f32>();

    let mut d_a = GpuBuffer::alloc(byte_len).expect("failed to allocate device buffer for a");
    let mut d_b = GpuBuffer::alloc(byte_len).expect("failed to allocate device buffer for b");
    let mut d_c = GpuBuffer::alloc(byte_len).expect("failed to allocate device buffer for c");

    // SAFETY: `a` is a live `Vec<f32>` holding exactly `n` initialized
    // elements, so its storage spans `byte_len` readable bytes. `u8` has
    // alignment 1 and every byte of an initialized `f32` is a valid `u8`.
    // `a` is not mutated while `a_bytes` is in use.
    let a_bytes = unsafe { std::slice::from_raw_parts(a.as_ptr() as *const u8, byte_len) };
    // SAFETY: same reasoning as for `a_bytes`, applied to `b`.
    let b_bytes = unsafe { std::slice::from_raw_parts(b.as_ptr() as *const u8, byte_len) };
    d_a.copy_from_host(a_bytes).expect("failed to upload a");
    d_b.copy_from_host(b_bytes).expect("failed to upload b");

    // SAFETY: all three device buffers were allocated above with `byte_len`
    // (= n * size_of::<f32>()) bytes and outlive the launch plus the
    // synchronize below. This relies on the kernel touching at most `n`
    // f32 elements through each pointer — NOTE(review): confirm against the
    // kernel source, which is not visible from this file.
    unsafe {
        ffi::launch_vecadd_f32(
            d_a.as_ptr() as *const f32,
            d_b.as_ptr() as *const f32,
            d_c.as_mut_ptr() as *mut f32,
            n_i32,
            std::ptr::null_mut(), // default stream
        );
    }
    // Wait for the device before reading the output back.
    device::synchronize().expect("device synchronize failed after kernel launch");

    let mut result = vec![0.0f32; n];
    // SAFETY: `result` owns exactly `n` f32 elements (`byte_len` bytes), `u8`
    // has alignment 1, and any byte pattern written through this view is
    // still a valid `f32`. `result` is not accessed directly until after the
    // last use of `result_bytes`.
    let result_bytes =
        unsafe { std::slice::from_raw_parts_mut(result.as_mut_ptr() as *mut u8, byte_len) };
    d_c.copy_to_host(result_bytes).expect("failed to download c");

    assert_eq!(result, expected, "vecadd kernel output mismatch");
    println!(
        "vecadd OK: first={} mid={} last={} ({} elems)",
        result[0],
        result[n / 2],
        result[n - 1],
        n
    );
}