T1: scaffold repo + Rust/CUDA build chain (vecadd smoke test)

Stand up the xtrain project skeleton: a Cargo workspace mirroring xserv's
csrc/ + crates/ layout, with a single xtrain-cuda crate that wraps the CUDA
Runtime over hand-written extern "C" FFI. build.rs compiles csrc/test/vecadd.cu
via the cc crate targeting sm_120 (RTX 5090) and links cudart.

A gated integration test runs the vector-add kernel on the GPU and asserts the
result. When nvcc is absent (local GPU-less machine), build.rs skips CUDA
compilation and sets a `no_cuda` cfg so host-side cargo check still works.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 14:42:43 +08:00
commit 92acf9f413
13 changed files with 376 additions and 0 deletions

11
.gitignore vendored Normal file
View File

@@ -0,0 +1,11 @@
/target
*.o
*.so
*.a
*.ptx
*.cubin
**/*.rs.bk
.env
# Claude Code runtime state
/.claude/

32
Cargo.lock generated Normal file
View File

@@ -0,0 +1,32 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "cc"
version = "1.2.64"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f"
dependencies = [
"find-msvc-tools",
"shlex",
]
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
[[package]]
name = "shlex"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba"
[[package]]
name = "xtrain-cuda"
version = "0.1.0"
dependencies = [
"cc",
]

10
Cargo.toml Normal file
View File

@@ -0,0 +1,10 @@
[workspace]
resolver = "2"
members = [
"crates/xtrain-cuda",
]
[workspace.package]
version = "0.1.0"
edition = "2024"
license = "MIT"

50
README.md Normal file
View File

@@ -0,0 +1,50 @@
# xtrain
A from-scratch **Rust + CUDA** LLM **training** engine — the sibling of
[xserv](https://github.com/) (the inference side). GPU-first.
The goal is to learn the full training-systems stack by hand: autograd / backward
passes / optimizers (AdamW) / the training loop / distributed logic. Heavy lifting
is borrowed where it makes sense (GEMM → cuBLAS after a hand-written version,
multi-GPU comms → NCCL, tokenizer → reused from xserv), but the core is written
from scratch. The target architecture is a tiny modern transformer
(RoPE + RMSNorm + SwiGLU, ~130M params) whose forward aligns with xserv's Qwen3,
so the backward passes map one-to-one onto xserv's existing forward kernels and
trained weights can flow back into xserv.
## Status
Bootstrapping (P0). This repo currently contains only the project skeleton and a
working Rust↔CUDA build chain, verified by a trivial vector-add CUDA kernel.
## Layout
```
xtrain/
├── Cargo.toml # workspace
├── csrc/ # CUDA sources (.cu)
│ └── test/vecadd.cu # trivial element-wise vector-add (smoke test)
└── crates/
└── xtrain-cuda/ # CUDA Runtime FFI + build.rs (nvcc → sm_120)
├── build.rs # compiles csrc/*.cu via the `cc` crate, links cudart
├── src/ # ffi / error / device / memory
└── tests/ # vecadd smoke test
```
The build mirrors xserv's approach: `build.rs` invokes `nvcc` (via the `cc` crate)
to compile `csrc/*.cu` targeting `sm_120` (RTX 5090) and links them into the Rust
crate over hand-written `extern "C"` FFI.
## Building & testing
CUDA compilation and execution happen on a GPU box (dash5, 8× RTX 5090, sm_120):
```sh
export PATH=/usr/local/cuda/bin:$HOME/.cargo/bin:$PATH
cargo build
cargo test -p xtrain-cuda -- --nocapture # runs the vecadd smoke test
```
On a machine without `nvcc`/GPU, `build.rs` detects the missing toolchain, skips
CUDA compilation, and sets a `no_cuda` cfg — so host-side `cargo check` still
works (the GPU smoke test is compiled out).

View File

@@ -0,0 +1,7 @@
[package]
name = "xtrain-cuda"
version.workspace = true
edition.workspace = true
[build-dependencies]
cc = "1"

View File

@@ -0,0 +1,39 @@
use std::env;
use std::path::Path;
use std::process::Command;
fn main() {
println!("cargo:rustc-check-cfg=cfg(no_cuda)");
println!("cargo:rerun-if-changed=../../csrc/");
let cuda_path = env::var("CUDA_HOME")
.or_else(|_| env::var("CUDA_PATH"))
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
// Locally there is no nvcc / GPU. Detect that and skip the CUDA build so
// `cargo check`/`cargo build` of host-side Rust still works. The `no_cuda`
// cfg makes the FFI bindings + smoke test compile (but not run) without nvcc.
if !nvcc_available(&cuda_path) {
println!("cargo:warning=nvcc not found — skipping CUDA compilation (host-only build).");
println!("cargo:rustc-cfg=no_cuda");
return;
}
println!("cargo:rustc-link-search=native={cuda_path}/lib64");
println!("cargo:rustc-link-lib=dylib=cudart");
println!("cargo:rustc-link-lib=dylib=cuda");
cc::Build::new()
.cuda(true)
.cudart("shared")
.flag("-gencode=arch=compute_120,code=sm_120")
.file("../../csrc/test/vecadd.cu")
.compile("xtrain_cuda_kernels");
}
fn nvcc_available(cuda_path: &str) -> bool {
if Command::new("nvcc").arg("--version").output().is_ok() {
return true;
}
Path::new(&format!("{cuda_path}/bin/nvcc")).exists()
}

View File

@@ -0,0 +1,16 @@
use crate::error::{self, Result};
use crate::ffi;
pub fn device_count() -> Result<i32> {
let mut count = 0;
error::check(unsafe { ffi::cudaGetDeviceCount(&mut count) })?;
Ok(count)
}
pub fn set_device(device: u32) -> Result<()> {
error::check(unsafe { ffi::cudaSetDevice(device as i32) })
}
pub fn synchronize() -> Result<()> {
error::check(unsafe { ffi::cudaDeviceSynchronize() })
}

View File

@@ -0,0 +1,40 @@
use crate::ffi;
use std::ffi::CStr;
use std::fmt;
#[derive(Debug)]
pub enum CudaError {
OutOfMemory,
Raw { code: i32, message: String },
}
impl fmt::Display for CudaError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
CudaError::OutOfMemory => write!(f, "CUDA out of memory"),
CudaError::Raw { code, message } => write!(f, "CUDA error {code}: {message}"),
}
}
}
impl std::error::Error for CudaError {}
pub type Result<T> = std::result::Result<T, CudaError>;
pub fn check(code: i32) -> Result<()> {
if code == ffi::CUDA_SUCCESS {
return Ok(());
}
let message = unsafe {
let ptr = ffi::cudaGetErrorString(code);
if ptr.is_null() {
"unknown error".to_string()
} else {
CStr::from_ptr(ptr).to_string_lossy().into_owned()
}
};
Err(match code {
ffi::CUDA_ERROR_OUT_OF_MEMORY => CudaError::OutOfMemory,
_ => CudaError::Raw { code, message },
})
}

View File

@@ -0,0 +1,32 @@
use std::ffi::c_void;
use std::os::raw::c_char;
pub type CudaStream = *mut c_void;
pub const CUDA_MEMCPY_H2D: i32 = 1;
pub const CUDA_MEMCPY_D2H: i32 = 2;
pub const CUDA_SUCCESS: i32 = 0;
pub const CUDA_ERROR_OUT_OF_MEMORY: i32 = 2;
unsafe extern "C" {
// --- Device ---
pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
pub fn cudaSetDevice(device: i32) -> i32;
pub fn cudaDeviceSynchronize() -> i32;
// --- Memory ---
pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
pub fn cudaFree(devptr: *mut u8) -> i32;
pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
// --- Error ---
pub fn cudaGetErrorString(error: i32) -> *const c_char;
}
// The vector-add smoke-test kernel, compiled from csrc/test/vecadd.cu by build.rs.
// Only linked when CUDA is actually compiled (i.e. nvcc was present).
#[cfg(not(no_cuda))]
unsafe extern "C" {
pub fn launch_vecadd_f32(a: *const f32, b: *const f32, c: *mut f32, n: i32, stream: CudaStream);
}

View File

@@ -0,0 +1,7 @@
pub mod device;
pub mod error;
pub mod ffi;
pub mod memory;
pub use error::{CudaError, Result};
pub use memory::GpuBuffer;

View File

@@ -0,0 +1,59 @@
use crate::error::{self, Result};
use crate::ffi;
/// RAII wrapper around a GPU memory allocation. Dropping frees the memory.
pub struct GpuBuffer {
ptr: *mut u8,
len: usize,
}
impl GpuBuffer {
pub fn alloc(len: usize) -> Result<Self> {
assert!(len > 0, "cannot allocate 0 bytes on GPU");
let mut ptr = std::ptr::null_mut();
error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
Ok(Self { ptr, len })
}
pub fn len(&self) -> usize {
self.len
}
pub fn is_empty(&self) -> bool {
self.len == 0
}
pub fn as_ptr(&self) -> *const u8 {
self.ptr
}
pub fn as_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
/// Copy data from a host (CPU) slice to this GPU buffer.
pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
assert!(src.len() <= self.len, "source larger than buffer");
error::check(unsafe {
ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
})
}
/// Copy data from this GPU buffer to a host (CPU) slice.
pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
assert!(dst.len() <= self.len, "destination larger than buffer");
error::check(unsafe {
ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
})
}
}
impl Drop for GpuBuffer {
fn drop(&mut self) {
if !self.ptr.is_null() {
unsafe { ffi::cudaFree(self.ptr) };
}
}
}
unsafe impl Send for GpuBuffer {}

View File

@@ -0,0 +1,57 @@
// Smoke test for the Rust↔CUDA build chain: allocate two host vectors, run the
// vector-add kernel on the GPU, copy back, and assert the result is correct.
//
// Requires nvcc + a GPU, so it is gated behind `not(no_cuda)`. On the local
// (GPU-less) machine build.rs sets the `no_cuda` cfg and this test is skipped,
// keeping host-side `cargo check`/`cargo test --no-run` meaningful.
#![cfg(not(no_cuda))]
use xtrain_cuda::{GpuBuffer, device, ffi};
#[test]
fn test_vecadd_kernel() {
let count = device::device_count().expect("failed to get device count");
assert!(count > 0, "no CUDA devices found");
device::set_device(0).unwrap();
let n = 1024;
let a: Vec<f32> = (0..n).map(|i| i as f32).collect();
let b: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();
let byte_len = n * std::mem::size_of::<f32>();
let mut d_a = GpuBuffer::alloc(byte_len).unwrap();
let mut d_b = GpuBuffer::alloc(byte_len).unwrap();
let mut d_c = GpuBuffer::alloc(byte_len).unwrap();
let a_bytes = unsafe { std::slice::from_raw_parts(a.as_ptr() as *const u8, byte_len) };
let b_bytes = unsafe { std::slice::from_raw_parts(b.as_ptr() as *const u8, byte_len) };
d_a.copy_from_host(a_bytes).unwrap();
d_b.copy_from_host(b_bytes).unwrap();
unsafe {
ffi::launch_vecadd_f32(
d_a.as_ptr() as *const f32,
d_b.as_ptr() as *const f32,
d_c.as_mut_ptr() as *mut f32,
n as i32,
std::ptr::null_mut(), // default stream
);
}
device::synchronize().unwrap();
let mut result = vec![0.0f32; n];
let result_bytes =
unsafe { std::slice::from_raw_parts_mut(result.as_mut_ptr() as *mut u8, byte_len) };
d_c.copy_to_host(result_bytes).unwrap();
assert_eq!(result, expected, "vecadd kernel output mismatch");
println!(
"vecadd OK: first={} mid={} last={} ({} elems)",
result[0],
result[n / 2],
result[n - 1],
n
);
}

16
csrc/test/vecadd.cu Normal file
View File

@@ -0,0 +1,16 @@
extern "C" {
__global__ void vecadd_f32(const float* a, const float* b, float* c, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
void launch_vecadd_f32(const float* a, const float* b, float* c, int n, void* stream) {
int block = 256;
int grid = (n + block - 1) / block;
vecadd_f32<<<grid, block, 0, (cudaStream_t)stream>>>(a, b, c, n);
}
}