Files
xtrain/crates/xtrain-cuda/tests/integration.rs
Gahow Wang 92acf9f413 T1: scaffold repo + Rust/CUDA build chain (vecadd smoke test)
Stand up the xtrain project skeleton: a Cargo workspace mirroring xserv's
csrc/ + crates/ layout, with a single xtrain-cuda crate that wraps the CUDA
Runtime over hand-written extern "C" FFI. build.rs compiles csrc/test/vecadd.cu
via the cc crate targeting sm_120 (RTX 5090) and links cudart.

A gated integration test runs the vector-add kernel on the GPU and asserts the
result. When nvcc is absent (local GPU-less machine), build.rs skips CUDA
compilation and sets a `no_cuda` cfg so host-side cargo check still works.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 14:42:43 +08:00

58 lines
2.0 KiB
Rust

// Smoke test for the Rust↔CUDA build chain: allocate two host vectors, run the
// vector-add kernel on the GPU, copy back, and assert the result is correct.
//
// Requires nvcc + a GPU, so it is gated behind `not(no_cuda)`. On the local
// (GPU-less) machine build.rs sets the `no_cuda` cfg and this test is skipped,
// keeping host-side `cargo check`/`cargo test --no-run` meaningful.
#![cfg(not(no_cuda))]
use xtrain_cuda::{GpuBuffer, device, ffi};
// Views a slice of f32 values as the raw bytes that back it.
fn f32s_as_bytes(values: &[f32]) -> &[u8] {
    // SAFETY: the pointer and length both come from `values`, so the byte range
    // covers exactly the memory the slice already borrows; u8 has no alignment
    // requirement and every bit pattern is a valid u8.
    unsafe {
        std::slice::from_raw_parts(values.as_ptr().cast::<u8>(), std::mem::size_of_val(values))
    }
}

// Mutable counterpart of `f32s_as_bytes`, used as the destination of a
// device-to-host copy.
fn f32s_as_bytes_mut(values: &mut [f32]) -> &mut [u8] {
    let len = std::mem::size_of_val(values);
    // SAFETY: same range argument as `f32s_as_bytes`; the exclusive borrow of
    // `values` is held for the lifetime of the returned slice, and any byte
    // pattern written through it is a valid f32 bit pattern.
    unsafe { std::slice::from_raw_parts_mut(values.as_mut_ptr().cast::<u8>(), len) }
}

// Runs the vecadd kernel on device 0 with a[i] = i and b[i] = 2i, then checks
// the copied-back output against the sum computed on the host.
#[test]
fn test_vecadd_kernel() {
    // Fail with a clear message when no device is visible.
    let device_total = device::device_count().expect("failed to get device count");
    assert!(device_total > 0, "no CUDA devices found");
    device::set_device(0).unwrap();

    let n = 1024;

    // Host-side inputs and the CPU reference result.
    let lhs: Vec<f32> = (0..n).map(|idx| idx as f32).collect();
    let rhs: Vec<f32> = (0..n).map(|idx| (idx * 2) as f32).collect();
    let expected: Vec<f32> = lhs.iter().zip(rhs.iter()).map(|(p, q)| p + q).collect();

    // Every buffer (two inputs, one output) is sized for n f32 values.
    let byte_len = n * std::mem::size_of::<f32>();
    let mut dev_lhs = GpuBuffer::alloc(byte_len).unwrap();
    let mut dev_rhs = GpuBuffer::alloc(byte_len).unwrap();
    let mut dev_out = GpuBuffer::alloc(byte_len).unwrap();

    // Upload both operands.
    dev_lhs.copy_from_host(f32s_as_bytes(&lhs)).unwrap();
    dev_rhs.copy_from_host(f32s_as_bytes(&rhs)).unwrap();

    let in_a = dev_lhs.as_ptr() as *const f32;
    let in_b = dev_rhs.as_ptr() as *const f32;
    let out_c = dev_out.as_mut_ptr() as *mut f32;
    // SAFETY: all three pointers come from buffers requested with `byte_len`
    // bytes (n f32 values each) that stay alive until the end of this test,
    // and the element count passed is that same n.
    unsafe {
        ffi::launch_vecadd_f32(
            in_a,
            in_b,
            out_c,
            n as i32,
            std::ptr::null_mut(), // default stream
        );
    }
    // Wait for the device before reading the output buffer.
    device::synchronize().unwrap();

    // Download the result and compare element-for-element with the reference.
    let mut result = vec![0.0f32; n];
    dev_out.copy_to_host(f32s_as_bytes_mut(&mut result)).unwrap();
    assert_eq!(result, expected, "vecadd kernel output mismatch");

    // A few sampled values, visible with `--nocapture`.
    let (first, mid, last) = (result[0], result[n / 2], result[n - 1]);
    println!(
        "vecadd OK: first={} mid={} last={} ({} elems)",
        first, mid, last, n
    );
}