T1: scaffold repo + Rust/CUDA build chain (vecadd smoke test)

Stand up the xtrain project skeleton: a Cargo workspace mirroring xserv's csrc/ + crates/ layout, with a single xtrain-cuda crate that wraps the CUDA Runtime over hand-written extern "C" FFI. build.rs compiles csrc/test/vecadd.cu via the cc crate targeting sm_120 (RTX 5090) and links cudart. A gated integration test runs the vector-add kernel on the GPU and asserts the result. When nvcc is absent (local GPU-less machine), build.rs skips CUDA compilation and sets a `no_cuda` cfg so host-side cargo check still works. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 14:42:43 +08:00
commit 92acf9f413
13 changed files with 376 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,11 @@
+/target
+*.o
+*.so
+*.a
+*.ptx
+*.cubin
+**/*.rs.bk
+.env
+
+# Claude Code runtime state
+/.claude/
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -0,0 +1,32 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "cc"
+version = "1.2.64"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f"
+dependencies = [
+ "find-msvc-tools",
+ "shlex",
+]
+
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
+
+[[package]]
+name = "shlex"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba"
+
+[[package]]
+name = "xtrain-cuda"
+version = "0.1.0"
+dependencies = [
+ "cc",
+]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
+[workspace]
+resolver = "2"
+members = [
+    "crates/xtrain-cuda",
+]
+
+[workspace.package]
+version = "0.1.0"
+edition = "2024"
+license = "MIT"
--- a/README.md
+++ b/README.md
@@ -0,0 +1,50 @@
+# xtrain
+
+A from-scratch **Rust + CUDA** LLM **training** engine — the sibling of
+[xserv](https://github.com/) (the inference side). GPU-first.
+
+The goal is to learn the full training-systems stack by hand: autograd / backward
+passes / optimizers (AdamW) / the training loop / distributed logic. Heavy lifting
+is borrowed where it makes sense (GEMM → cuBLAS after a hand-written version,
+multi-GPU comms → NCCL, tokenizer → reused from xserv), but the core is written
+from scratch. The target architecture is a tiny modern transformer
+(RoPE + RMSNorm + SwiGLU, ~1–30M params) whose forward aligns with xserv's Qwen3,
+so the backward passes map one-to-one onto xserv's existing forward kernels and
+trained weights can flow back into xserv.
+
+## Status
+
+Bootstrapping (P0). This repo currently contains only the project skeleton and a
+working Rust↔CUDA build chain, verified by a trivial vector-add CUDA kernel.
+
+## Layout
+
+```
+xtrain/
+├── Cargo.toml              # workspace
+├── csrc/                   # CUDA sources (.cu)
+│   └── test/vecadd.cu      # trivial element-wise vector-add (smoke test)
+└── crates/
+    └── xtrain-cuda/        # CUDA Runtime FFI + build.rs (nvcc → sm_120)
+        ├── build.rs        # compiles csrc/*.cu via the `cc` crate, links cudart
+        ├── src/            # ffi / error / device / memory
+        └── tests/          # vecadd smoke test
+```
+
+The build mirrors xserv's approach: `build.rs` invokes `nvcc` (via the `cc` crate) to
+compile the CUDA sources under `csrc/` (currently only `csrc/test/vecadd.cu`) for `sm_120`
+(RTX 5090) and links the result into the Rust crate, which calls it over hand-written `extern "C"` FFI.
+
+## Building & testing
+
+CUDA compilation and execution happen on a GPU box (dash5, 8× RTX 5090, sm_120):
+
+```sh
+export PATH=/usr/local/cuda/bin:$HOME/.cargo/bin:$PATH
+cargo build
+cargo test -p xtrain-cuda -- --nocapture   # runs the vecadd smoke test
+```
+
+On a machine without `nvcc`/GPU, `build.rs` detects the missing toolchain, skips
+CUDA compilation, and sets a `no_cuda` cfg — so host-side `cargo check` still
+works (the GPU smoke test is compiled out).
--- a/crates/xtrain-cuda/Cargo.toml
+++ b/crates/xtrain-cuda/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "xtrain-cuda"
+version.workspace = true
+edition.workspace = true
+
+[build-dependencies]
+cc = "1"
--- a/crates/xtrain-cuda/build.rs
+++ b/crates/xtrain-cuda/build.rs
@@ -0,0 +1,67 @@
+use std::env;
+use std::path::{Path, PathBuf};
+use std::process::Command;
+
+/// Where a usable `nvcc` was found.
+enum Nvcc {
+    /// `nvcc` can be spawned by its bare name, i.e. it resolves through `PATH`.
+    OnPath,
+    /// `nvcc` could not be spawned by name but exists at this path under the CUDA dir.
+    Explicit(PathBuf),
+}
+
+/// Build script: compiles the CUDA sources with `nvcc` (through the `cc` crate) and
+/// links the CUDA libraries, or emits the `no_cuda` cfg when no `nvcc` can be found.
+fn main() {
+    println!("cargo:rustc-check-cfg=cfg(no_cuda)");
+    println!("cargo:rerun-if-changed=../../csrc/");
+    // The CUDA install directory is taken from these variables below, so a change to
+    // either one has to re-run this script.
+    println!("cargo:rerun-if-env-changed=CUDA_HOME");
+    println!("cargo:rerun-if-env-changed=CUDA_PATH");
+
+    let cuda_path = env::var("CUDA_HOME")
+        .or_else(|_| env::var("CUDA_PATH"))
+        .unwrap_or_else(|_| "/usr/local/cuda".to_string());
+
+    // Locally there is no nvcc / GPU. Detect that and skip the CUDA build so
+    // `cargo check`/`cargo build` of host-side Rust still works. Code that needs
+    // the compiled kernels is expected to gate itself on `cfg(not(no_cuda))`.
+    let Some(nvcc) = locate_nvcc(&cuda_path) else {
+        println!("cargo:warning=nvcc not found — skipping CUDA compilation (host-only build).");
+        println!("cargo:rustc-cfg=no_cuda");
+        return;
+    };
+
+    // When `nvcc` was only found under the CUDA directory, spawning it by bare name
+    // would fail. `cc` reads the compiler location from `NVCC`, so point it at the
+    // binary we found — unless the user already chose one.
+    if let Nvcc::Explicit(path) = &nvcc {
+        if env::var_os("NVCC").is_none() {
+            // SAFETY: this script has not spawned any threads, so nothing else can be
+            // reading or writing the process environment concurrently.
+            unsafe { env::set_var("NVCC", path) };
+        }
+    }
+
+    println!("cargo:rustc-link-search=native={cuda_path}/lib64");
+    println!("cargo:rustc-link-lib=dylib=cudart");
+    println!("cargo:rustc-link-lib=dylib=cuda");
+
+    cc::Build::new()
+        .cuda(true)
+        .cudart("shared")
+        .flag("-gencode=arch=compute_120,code=sm_120")
+        .file("../../csrc/test/vecadd.cu")
+        .compile("xtrain_cuda_kernels");
+}
+
+/// Looks for `nvcc`, first by trying to run `nvcc --version` through `PATH`, then by
+/// checking for `<cuda_path>/bin/nvcc`. Returns `None` when neither is found.
+fn locate_nvcc(cuda_path: &str) -> Option<Nvcc> {
+    if Command::new("nvcc").arg("--version").output().is_ok() {
+        return Some(Nvcc::OnPath);
+    }
+    let candidate = Path::new(cuda_path).join("bin").join("nvcc");
+    candidate.exists().then_some(Nvcc::Explicit(candidate))
+}
--- a/crates/xtrain-cuda/src/device.rs
+++ b/crates/xtrain-cuda/src/device.rs
@@ -0,0 +1,37 @@
+use crate::error::{self, Result};
+use crate::ffi;
+
+/// Returns the number of CUDA devices reported by `cudaGetDeviceCount`.
+///
+/// # Errors
+/// Returns an error when `cudaGetDeviceCount` reports a non-success status.
+pub fn device_count() -> Result<i32> {
+    let mut n_devices: i32 = 0;
+    // SAFETY: the pointer refers to a live local `i32` for the whole call.
+    let status = unsafe { ffi::cudaGetDeviceCount(&mut n_devices) };
+    error::check(status).map(|()| n_devices)
+}
+
+/// Selects `device` through `cudaSetDevice`.
+///
+/// The index is converted with `as i32`, so values above `i32::MAX` wrap to a
+/// negative number before being handed to CUDA.
+///
+/// # Errors
+/// Returns an error when `cudaSetDevice` reports a non-success status.
+pub fn set_device(device: u32) -> Result<()> {
+    let ordinal = device as i32;
+    // SAFETY: the call takes a plain integer; no pointers are involved.
+    let status = unsafe { ffi::cudaSetDevice(ordinal) };
+    error::check(status)
+}
+
+/// Calls `cudaDeviceSynchronize` and converts its status into a `Result`.
+///
+/// # Errors
+/// Returns an error when the call reports a non-success status.
+pub fn synchronize() -> Result<()> {
+    // SAFETY: the call takes no arguments.
+    let status = unsafe { ffi::cudaDeviceSynchronize() };
+    error::check(status)
+}
--- a/crates/xtrain-cuda/src/error.rs
+++ b/crates/xtrain-cuda/src/error.rs
@@ -0,0 +1,57 @@
+use crate::ffi;
+use std::ffi::CStr;
+use std::fmt;
+
+/// Error produced when a CUDA runtime call returns a non-success status code.
+#[derive(Debug)]
+pub enum CudaError {
+    /// The status code was `ffi::CUDA_ERROR_OUT_OF_MEMORY`.
+    OutOfMemory,
+    /// Any other failing status, with the text obtained from `cudaGetErrorString`.
+    Raw { code: i32, message: String },
+}
+
+impl fmt::Display for CudaError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            CudaError::OutOfMemory => write!(f, "CUDA out of memory"),
+            CudaError::Raw { code, message } => write!(f, "CUDA error {code}: {message}"),
+        }
+    }
+}
+
+impl std::error::Error for CudaError {}
+
+/// Result alias used by this crate's CUDA wrappers.
+pub type Result<T> = std::result::Result<T, CudaError>;
+
+/// Converts a CUDA status code into a `Result`.
+///
+/// `ffi::CUDA_SUCCESS` maps to `Ok(())` and `ffi::CUDA_ERROR_OUT_OF_MEMORY` to
+/// [`CudaError::OutOfMemory`]. Every other code becomes [`CudaError::Raw`]; only
+/// that case looks up the error text, so the out-of-memory path performs no extra
+/// FFI call or string allocation.
+pub fn check(code: i32) -> Result<()> {
+    match code {
+        ffi::CUDA_SUCCESS => Ok(()),
+        ffi::CUDA_ERROR_OUT_OF_MEMORY => Err(CudaError::OutOfMemory),
+        _ => Err(CudaError::Raw {
+            code,
+            message: error_message(code),
+        }),
+    }
+}
+
+/// Fetches the text for `code` from `cudaGetErrorString`, or `"unknown error"`
+/// when a null pointer comes back.
+fn error_message(code: i32) -> String {
+    // SAFETY: the call takes a plain integer and returns a pointer we only read.
+    let ptr = unsafe { ffi::cudaGetErrorString(code) };
+    if ptr.is_null() {
+        return "unknown error".to_string();
+    }
+    // SAFETY: `ptr` is non-null; it is assumed to point at a NUL-terminated string
+    // owned by the CUDA runtime that stays valid while it is copied here.
+    let text = unsafe { CStr::from_ptr(ptr) };
+    text.to_string_lossy().into_owned()
+}
--- a/crates/xtrain-cuda/src/ffi.rs
+++ b/crates/xtrain-cuda/src/ffi.rs
@@ -0,0 +1,32 @@
+use std::ffi::c_void;
+use std::os::raw::c_char;
+
+pub type CudaStream = *mut c_void; // opaque stream handle handed to kernel launchers
+
+pub const CUDA_MEMCPY_H2D: i32 = 1; // copy kind for host -> device transfers
+pub const CUDA_MEMCPY_D2H: i32 = 2; // copy kind for device -> host transfers
+
+pub const CUDA_SUCCESS: i32 = 0; // status code meaning the call succeeded
+pub const CUDA_ERROR_OUT_OF_MEMORY: i32 = 2; // status code surfaced as CudaError::OutOfMemory
+
+unsafe extern "C" {
+    // --- Device --- (each call returns a status code to be passed to error::check)
+    pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
+    pub fn cudaSetDevice(device: i32) -> i32;
+    pub fn cudaDeviceSynchronize() -> i32;
+
+    // --- Memory --- (sizes and counts are in bytes; `kind` is one of CUDA_MEMCPY_*)
+    pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
+    pub fn cudaFree(devptr: *mut u8) -> i32;
+    pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
+
+    // --- Error --- (returns a C string for a status code; callers check for null)
+    pub fn cudaGetErrorString(error: i32) -> *const c_char;
+}
+
+// Launcher for the vector-add smoke-test kernel, compiled from csrc/test/vecadd.cu by build.rs.
+// Declared only when build.rs did not set `no_cuda`, i.e. when nvcc was found and the kernel was built.
+#[cfg(not(no_cuda))]
+unsafe extern "C" {
+    pub fn launch_vecadd_f32(a: *const f32, b: *const f32, c: *mut f32, n: i32, stream: CudaStream);
+}
--- a/crates/xtrain-cuda/src/lib.rs
+++ b/crates/xtrain-cuda/src/lib.rs
@@ -0,0 +1,7 @@
+pub mod device;
+pub mod error;
+pub mod ffi;
+pub mod memory;
+
+pub use error::{CudaError, Result};
+pub use memory::GpuBuffer;
--- a/crates/xtrain-cuda/src/memory.rs
+++ b/crates/xtrain-cuda/src/memory.rs
@@ -0,0 +1,99 @@
+use crate::error::{self, Result};
+use crate::ffi;
+
+/// RAII wrapper around a GPU memory allocation: created through `cudaMalloc` in
+/// [`GpuBuffer::alloc`], released through `cudaFree` when the value is dropped.
+pub struct GpuBuffer {
+    /// Address produced by `cudaMalloc`.
+    ptr: *mut u8,
+    /// Size of the allocation in bytes.
+    len: usize,
+}
+
+impl GpuBuffer {
+    /// Allocates `len` bytes with `cudaMalloc`.
+    ///
+    /// # Panics
+    /// Panics when `len` is zero.
+    ///
+    /// # Errors
+    /// Returns an error when `cudaMalloc` reports a non-success status.
+    pub fn alloc(len: usize) -> Result<Self> {
+        assert!(len > 0, "cannot allocate 0 bytes on GPU");
+        let mut device_ptr: *mut u8 = std::ptr::null_mut();
+        // SAFETY: the out-pointer refers to a live local for the whole call.
+        let status = unsafe { ffi::cudaMalloc(&mut device_ptr, len) };
+        error::check(status).map(|()| Self {
+            ptr: device_ptr,
+            len,
+        })
+    }
+
+    /// Size of the allocation in bytes.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Whether the recorded size is zero. `alloc` rejects zero-byte requests, so a
+    /// buffer obtained from it never reports `true`.
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    /// Raw read-only pointer to the start of the allocation.
+    pub fn as_ptr(&self) -> *const u8 {
+        self.ptr
+    }
+
+    /// Raw mutable pointer to the start of the allocation.
+    pub fn as_mut_ptr(&mut self) -> *mut u8 {
+        self.ptr
+    }
+
+    /// Copy data from a host (CPU) slice to this GPU buffer.
+    ///
+    /// Exactly `src.len()` bytes are transferred to the start of the buffer.
+    ///
+    /// # Panics
+    /// Panics when `src` is longer than the buffer.
+    pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
+        let n_bytes = src.len();
+        assert!(n_bytes <= self.len, "source larger than buffer");
+        // SAFETY: `src` is readable for `n_bytes`, and the assertion keeps the copy
+        // within the `self.len` bytes recorded for this allocation.
+        let status =
+            unsafe { ffi::cudaMemcpy(self.ptr, src.as_ptr(), n_bytes, ffi::CUDA_MEMCPY_H2D) };
+        error::check(status)
+    }
+
+    /// Copy data from this GPU buffer to a host (CPU) slice.
+    ///
+    /// Exactly `dst.len()` bytes are read from the start of the buffer.
+    ///
+    /// # Panics
+    /// Panics when `dst` is longer than the buffer.
+    pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
+        let n_bytes = dst.len();
+        assert!(n_bytes <= self.len, "destination larger than buffer");
+        // SAFETY: `dst` is writable for `n_bytes`, and the assertion keeps the copy
+        // within the `self.len` bytes recorded for this allocation.
+        let status =
+            unsafe { ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, n_bytes, ffi::CUDA_MEMCPY_D2H) };
+        error::check(status)
+    }
+}
+
+impl Drop for GpuBuffer {
+    fn drop(&mut self) {
+        if self.ptr.is_null() {
+            return;
+        }
+        // `drop` has no way to report failure, so the status from `cudaFree` is discarded.
+        // SAFETY: `ptr` is non-null and is the address stored when the buffer was created.
+        let _status = unsafe { ffi::cudaFree(self.ptr) };
+    }
+}
+
+// NOTE(review): this asserts that moving the raw device pointer to another thread is
+// sound; nothing in this file enforces it — confirm against how callers use devices.
+unsafe impl Send for GpuBuffer {}
--- a/crates/xtrain-cuda/tests/integration.rs
+++ b/crates/xtrain-cuda/tests/integration.rs
@@ -0,0 +1,74 @@
+// Smoke test for the Rust↔CUDA build chain: fill two host vectors, add them with
+// the vecadd kernel on the GPU, copy the sum back, and compare it against the sum
+// computed on the host.
+//
+// Requires nvcc + a GPU, so it is gated behind `not(no_cuda)`. On the local
+// (GPU-less) machine build.rs sets the `no_cuda` cfg and this test is skipped,
+// keeping host-side `cargo check`/`cargo test --no-run` meaningful.
+#![cfg(not(no_cuda))]
+
+use xtrain_cuda::{GpuBuffer, device, ffi};
+
+/// Number of `f32` elements in each test vector.
+const N: usize = 1024;
+
+/// Views an `f32` slice as its raw bytes.
+fn as_bytes(values: &[f32]) -> &[u8] {
+    let n_bytes = values.len() * std::mem::size_of::<f32>();
+    // SAFETY: the pointer and length describe exactly the memory of `values`, which
+    // stays borrowed for the returned lifetime; initialized memory may be read as `u8`.
+    unsafe { std::slice::from_raw_parts(values.as_ptr().cast::<u8>(), n_bytes) }
+}
+
+/// Views a mutable `f32` slice as its raw bytes.
+fn as_bytes_mut(values: &mut [f32]) -> &mut [u8] {
+    let n_bytes = values.len() * std::mem::size_of::<f32>();
+    // SAFETY: the pointer and length describe exactly the memory of `values`, borrowed
+    // exclusively for the returned lifetime; any bytes written form valid `f32` values.
+    unsafe { std::slice::from_raw_parts_mut(values.as_mut_ptr().cast::<u8>(), n_bytes) }
+}
+
+#[test]
+fn test_vecadd_kernel() {
+    let count = device::device_count().expect("failed to get device count");
+    assert!(count > 0, "no CUDA devices found");
+    device::set_device(0).unwrap();
+
+    // Host inputs and the reference sum computed on the CPU.
+    let a: Vec<f32> = (0..N).map(|i| i as f32).collect();
+    let b: Vec<f32> = (0..N).map(|i| (i * 2) as f32).collect();
+    let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();
+
+    let byte_len = N * std::mem::size_of::<f32>();
+    let mut d_a = GpuBuffer::alloc(byte_len).unwrap();
+    let mut d_b = GpuBuffer::alloc(byte_len).unwrap();
+    let mut d_c = GpuBuffer::alloc(byte_len).unwrap();
+
+    d_a.copy_from_host(as_bytes(&a)).unwrap();
+    d_b.copy_from_host(as_bytes(&b)).unwrap();
+
+    // SAFETY: all three buffers were allocated with `byte_len` bytes, which is room
+    // for exactly the N floats the launcher is told to process.
+    unsafe {
+        ffi::launch_vecadd_f32(
+            d_a.as_ptr().cast::<f32>(),
+            d_b.as_ptr().cast::<f32>(),
+            d_c.as_mut_ptr().cast::<f32>(),
+            N as i32,
+            std::ptr::null_mut(), // default stream
+        );
+    }
+    device::synchronize().unwrap();
+
+    let mut result = vec![0.0f32; N];
+    d_c.copy_to_host(as_bytes_mut(&mut result)).unwrap();
+
+    assert_eq!(result, expected, "vecadd kernel output mismatch");
+    println!(
+        "vecadd OK: first={} mid={} last={} ({} elems)",
+        result[0],
+        result[N / 2],
+        result[N - 1],
+        N
+    );
+}
--- a/csrc/test/vecadd.cu
+++ b/csrc/test/vecadd.cu
@@ -0,0 +1,28 @@
+extern "C" {
+
+// Element-wise add: c[i] = a[i] + b[i] for every i in [0, n). Each thread handles
+// the single index derived from its block/thread coordinates; threads whose index
+// falls at or beyond n do nothing.
+__global__ void vecadd_f32(const float* a, const float* b, float* c, int n) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        c[idx] = a[idx] + b[idx];
+    }
+}
+
+// Host entry point called from Rust: launches vecadd_f32 over n elements on `stream`
+// (cast to cudaStream_t) with 256 threads per block. It does not synchronize or
+// check for launch errors; callers do that themselves. A non-positive n is a no-op.
+void launch_vecadd_f32(const float* a, const float* b, float* c, int n, void* stream) {
+    // A grid of zero blocks is an invalid launch configuration, so bail out early.
+    if (n <= 0) {
+        return;
+    }
+    const int block = 256;
+    // Round up without computing n + block - 1, which overflows int when n is
+    // within block - 1 of INT_MAX.
+    const int grid = n / block + (n % block != 0 ? 1 : 0);
+    vecadd_f32<<<grid, block, 0, (cudaStream_t)stream>>>(a, b, c, n);
+}
+
+}