Stand up the xtrain project skeleton: a Cargo workspace mirroring xserv's csrc/ + crates/ layout, with a single xtrain-cuda crate that wraps the CUDA Runtime over hand-written extern "C" FFI. build.rs compiles csrc/test/vecadd.cu via the cc crate targeting sm_120 (RTX 5090) and links cudart. A gated integration test runs the vector-add kernel on the GPU and asserts the result. When nvcc is absent (local GPU-less machine), build.rs skips CUDA compilation and sets a `no_cuda` cfg so host-side cargo check still works. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
58 lines
2.0 KiB
Rust
58 lines
2.0 KiB
Rust
// Smoke test for the Rust↔CUDA build chain: allocate two host vectors, run the
|
|
// vector-add kernel on the GPU, copy back, and assert the result is correct.
|
|
//
|
|
// Requires nvcc + a GPU, so it is gated behind `not(no_cuda)`. On the local
|
|
// (GPU-less) machine build.rs sets the `no_cuda` cfg and this test is skipped,
|
|
// keeping host-side `cargo check`/`cargo test --no-run` meaningful.
|
|
#![cfg(not(no_cuda))]
|
|
|
|
use xtrain_cuda::{GpuBuffer, device, ffi};
|
|
|
|
/// Views a slice of `f32` values as the raw bytes backing it.
///
/// The byte length is derived from the slice itself, so the view can never
/// be longer than the data it borrows.
fn f32s_as_bytes(values: &[f32]) -> &[u8] {
    let len = values.len() * std::mem::size_of::<f32>();
    // SAFETY: `values` is a valid, initialized slice covering exactly `len`
    // bytes; `u8` has alignment 1 and no invalid bit patterns; the returned
    // borrow shares the lifetime of `values`.
    unsafe { std::slice::from_raw_parts(values.as_ptr().cast::<u8>(), len) }
}

/// Views a mutable slice of `f32` values as the raw bytes backing it.
fn f32s_as_bytes_mut(values: &mut [f32]) -> &mut [u8] {
    let len = values.len() * std::mem::size_of::<f32>();
    // SAFETY: same layout argument as `f32s_as_bytes`. Every bit pattern is a
    // valid `f32`, so arbitrary bytes written through the view cannot produce
    // an invalid value, and the exclusive borrow of `values` is carried over.
    unsafe { std::slice::from_raw_parts_mut(values.as_mut_ptr().cast::<u8>(), len) }
}

/// End-to-end check of the vector-add kernel: builds two host vectors, copies
/// them into GPU buffers, launches `ffi::launch_vecadd_f32`, copies the output
/// back and compares it element-for-element with the sum computed on the host.
///
/// Panics (failing the test) if no CUDA device is reported, if any device
/// call returns an error, or if the kernel output differs from the expected
/// values.
#[test]
fn test_vecadd_kernel() {
    let count = device::device_count().expect("failed to get device count");
    assert!(count > 0, "no CUDA devices found");
    device::set_device(0).expect("failed to select CUDA device 0");

    let n: usize = 1024;
    // The launcher receives the element count as an i32; convert with a check
    // instead of a silently truncating `as` cast.
    let kernel_len = i32::try_from(n).expect("element count must fit in i32");

    // Inputs are small integers, so the f32 sums are exact and the final
    // comparison can use strict equality.
    let a: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let b: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
    let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();

    let byte_len = n * std::mem::size_of::<f32>();

    let mut d_a = GpuBuffer::alloc(byte_len).expect("failed to allocate device buffer for a");
    let mut d_b = GpuBuffer::alloc(byte_len).expect("failed to allocate device buffer for b");
    let mut d_c = GpuBuffer::alloc(byte_len).expect("failed to allocate device buffer for c");

    d_a.copy_from_host(f32s_as_bytes(&a))
        .expect("failed to copy a to the device");
    d_b.copy_from_host(f32s_as_bytes(&b))
        .expect("failed to copy b to the device");

    // SAFETY: all three buffers were allocated above with `byte_len` bytes,
    // i.e. room for exactly `n` f32 values, and stay alive until the end of
    // this function; `kernel_len` equals `n`. This presumes the launcher
    // touches at most `kernel_len` elements per pointer -- confirm against
    // the kernel source.
    unsafe {
        ffi::launch_vecadd_f32(
            d_a.as_ptr() as *const f32,
            d_b.as_ptr() as *const f32,
            d_c.as_mut_ptr() as *mut f32,
            kernel_len,
            std::ptr::null_mut(), // default stream
        );
    }
    // Wait for the device before reading the output back.
    device::synchronize().expect("device synchronize failed after kernel launch");

    let mut result = vec![0.0f32; n];
    d_c.copy_to_host(f32s_as_bytes_mut(&mut result))
        .expect("failed to copy the result back to the host");

    assert_eq!(result, expected, "vecadd kernel output mismatch");
    println!(
        "vecadd OK: first={} mid={} last={} ({} elems)",
        result[0],
        result[n / 2],
        result[n - 1],
        n
    );
}
|