T1: scaffold repo + Rust/CUDA build chain (vecadd smoke test)
Stand up the xtrain project skeleton: a Cargo workspace mirroring xserv's csrc/ + crates/ layout, with a single xtrain-cuda crate that wraps the CUDA Runtime over hand-written extern "C" FFI. build.rs compiles csrc/test/vecadd.cu via the cc crate targeting sm_120 (RTX 5090) and links cudart. A gated integration test runs the vector-add kernel on the GPU and asserts the result. When nvcc is absent (local GPU-less machine), build.rs skips CUDA compilation and sets a `no_cuda` cfg so host-side cargo check still works. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
11
.gitignore
vendored
Normal file
11
.gitignore
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
/target
|
||||
*.o
|
||||
*.so
|
||||
*.a
|
||||
*.ptx
|
||||
*.cubin
|
||||
**/*.rs.bk
|
||||
.env
|
||||
|
||||
# Claude Code runtime state
|
||||
/.claude/
|
||||
32
Cargo.lock
generated
Normal file
32
Cargo.lock
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.64"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f"
|
||||
dependencies = [
|
||||
"find-msvc-tools",
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba"
|
||||
|
||||
[[package]]
|
||||
name = "xtrain-cuda"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
10
Cargo.toml
Normal file
10
Cargo.toml
Normal file
@@ -0,0 +1,10 @@
|
||||
[workspace]
|
||||
resolver = "2"
|
||||
members = [
|
||||
"crates/xtrain-cuda",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
license = "MIT"
|
||||
50
README.md
Normal file
50
README.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# xtrain
|
||||
|
||||
A from-scratch **Rust + CUDA** LLM **training** engine — the sibling of
|
||||
[xserv](https://github.com/) (the inference side). GPU-first.
|
||||
|
||||
The goal is to learn the full training-systems stack by hand: autograd / backward
|
||||
passes / optimizers (AdamW) / the training loop / distributed logic. Heavy lifting
|
||||
is borrowed where it makes sense (GEMM → cuBLAS after a hand-written version,
|
||||
multi-GPU comms → NCCL, tokenizer → reused from xserv), but the core is written
|
||||
from scratch. The target architecture is a tiny modern transformer
|
||||
(RoPE + RMSNorm + SwiGLU, ~1–30M params) whose forward aligns with xserv's Qwen3,
|
||||
so the backward passes map one-to-one onto xserv's existing forward kernels and
|
||||
trained weights can flow back into xserv.
|
||||
|
||||
## Status
|
||||
|
||||
Bootstrapping (P0). This repo currently contains only the project skeleton and a
|
||||
working Rust↔CUDA build chain, verified by a trivial vector-add CUDA kernel.
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
xtrain/
|
||||
├── Cargo.toml # workspace
|
||||
├── csrc/ # CUDA sources (.cu)
|
||||
│ └── test/vecadd.cu # trivial element-wise vector-add (smoke test)
|
||||
└── crates/
|
||||
└── xtrain-cuda/ # CUDA Runtime FFI + build.rs (nvcc → sm_120)
|
||||
├── build.rs # compiles csrc/*.cu via the `cc` crate, links cudart
|
||||
├── src/ # ffi / error / device / memory
|
||||
└── tests/ # vecadd smoke test
|
||||
```
|
||||
|
||||
The build mirrors xserv's approach: `build.rs` invokes `nvcc` (via the `cc` crate)
|
||||
to compile `csrc/*.cu` targeting `sm_120` (RTX 5090) and links them into the Rust
|
||||
crate over hand-written `extern "C"` FFI.
|
||||
|
||||
## Building & testing
|
||||
|
||||
CUDA compilation and execution happen on a GPU box (dash5, 8× RTX 5090, sm_120):
|
||||
|
||||
```sh
|
||||
export PATH=/usr/local/cuda/bin:$HOME/.cargo/bin:$PATH
|
||||
cargo build
|
||||
cargo test -p xtrain-cuda -- --nocapture # runs the vecadd smoke test
|
||||
```
|
||||
|
||||
On a machine without `nvcc`/GPU, `build.rs` detects the missing toolchain, skips
|
||||
CUDA compilation, and sets a `no_cuda` cfg — so host-side `cargo check` still
|
||||
works (the GPU smoke test is compiled out).
|
||||
7
crates/xtrain-cuda/Cargo.toml
Normal file
7
crates/xtrain-cuda/Cargo.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[package]
|
||||
name = "xtrain-cuda"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
cc = "1"
|
||||
39
crates/xtrain-cuda/build.rs
Normal file
39
crates/xtrain-cuda/build.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use std::env;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
fn main() {
|
||||
println!("cargo:rustc-check-cfg=cfg(no_cuda)");
|
||||
println!("cargo:rerun-if-changed=../../csrc/");
|
||||
|
||||
let cuda_path = env::var("CUDA_HOME")
|
||||
.or_else(|_| env::var("CUDA_PATH"))
|
||||
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
|
||||
|
||||
// Locally there is no nvcc / GPU. Detect that and skip the CUDA build so
|
||||
// `cargo check`/`cargo build` of host-side Rust still works. The `no_cuda`
|
||||
// cfg makes the FFI bindings + smoke test compile (but not run) without nvcc.
|
||||
if !nvcc_available(&cuda_path) {
|
||||
println!("cargo:warning=nvcc not found — skipping CUDA compilation (host-only build).");
|
||||
println!("cargo:rustc-cfg=no_cuda");
|
||||
return;
|
||||
}
|
||||
|
||||
println!("cargo:rustc-link-search=native={cuda_path}/lib64");
|
||||
println!("cargo:rustc-link-lib=dylib=cudart");
|
||||
println!("cargo:rustc-link-lib=dylib=cuda");
|
||||
|
||||
cc::Build::new()
|
||||
.cuda(true)
|
||||
.cudart("shared")
|
||||
.flag("-gencode=arch=compute_120,code=sm_120")
|
||||
.file("../../csrc/test/vecadd.cu")
|
||||
.compile("xtrain_cuda_kernels");
|
||||
}
|
||||
|
||||
fn nvcc_available(cuda_path: &str) -> bool {
|
||||
if Command::new("nvcc").arg("--version").output().is_ok() {
|
||||
return true;
|
||||
}
|
||||
Path::new(&format!("{cuda_path}/bin/nvcc")).exists()
|
||||
}
|
||||
16
crates/xtrain-cuda/src/device.rs
Normal file
16
crates/xtrain-cuda/src/device.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
use crate::error::{self, Result};
|
||||
use crate::ffi;
|
||||
|
||||
pub fn device_count() -> Result<i32> {
|
||||
let mut count = 0;
|
||||
error::check(unsafe { ffi::cudaGetDeviceCount(&mut count) })?;
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
pub fn set_device(device: u32) -> Result<()> {
|
||||
error::check(unsafe { ffi::cudaSetDevice(device as i32) })
|
||||
}
|
||||
|
||||
pub fn synchronize() -> Result<()> {
|
||||
error::check(unsafe { ffi::cudaDeviceSynchronize() })
|
||||
}
|
||||
40
crates/xtrain-cuda/src/error.rs
Normal file
40
crates/xtrain-cuda/src/error.rs
Normal file
@@ -0,0 +1,40 @@
|
||||
use crate::ffi;
|
||||
use std::ffi::CStr;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum CudaError {
|
||||
OutOfMemory,
|
||||
Raw { code: i32, message: String },
|
||||
}
|
||||
|
||||
impl fmt::Display for CudaError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
CudaError::OutOfMemory => write!(f, "CUDA out of memory"),
|
||||
CudaError::Raw { code, message } => write!(f, "CUDA error {code}: {message}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for CudaError {}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, CudaError>;
|
||||
|
||||
pub fn check(code: i32) -> Result<()> {
|
||||
if code == ffi::CUDA_SUCCESS {
|
||||
return Ok(());
|
||||
}
|
||||
let message = unsafe {
|
||||
let ptr = ffi::cudaGetErrorString(code);
|
||||
if ptr.is_null() {
|
||||
"unknown error".to_string()
|
||||
} else {
|
||||
CStr::from_ptr(ptr).to_string_lossy().into_owned()
|
||||
}
|
||||
};
|
||||
Err(match code {
|
||||
ffi::CUDA_ERROR_OUT_OF_MEMORY => CudaError::OutOfMemory,
|
||||
_ => CudaError::Raw { code, message },
|
||||
})
|
||||
}
|
||||
32
crates/xtrain-cuda/src/ffi.rs
Normal file
32
crates/xtrain-cuda/src/ffi.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
use std::ffi::c_void;
|
||||
use std::os::raw::c_char;
|
||||
|
||||
pub type CudaStream = *mut c_void;
|
||||
|
||||
pub const CUDA_MEMCPY_H2D: i32 = 1;
|
||||
pub const CUDA_MEMCPY_D2H: i32 = 2;
|
||||
|
||||
pub const CUDA_SUCCESS: i32 = 0;
|
||||
pub const CUDA_ERROR_OUT_OF_MEMORY: i32 = 2;
|
||||
|
||||
unsafe extern "C" {
|
||||
// --- Device ---
|
||||
pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
|
||||
pub fn cudaSetDevice(device: i32) -> i32;
|
||||
pub fn cudaDeviceSynchronize() -> i32;
|
||||
|
||||
// --- Memory ---
|
||||
pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
|
||||
pub fn cudaFree(devptr: *mut u8) -> i32;
|
||||
pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
|
||||
|
||||
// --- Error ---
|
||||
pub fn cudaGetErrorString(error: i32) -> *const c_char;
|
||||
}
|
||||
|
||||
// The vector-add smoke-test kernel, compiled from csrc/test/vecadd.cu by build.rs.
|
||||
// Only linked when CUDA is actually compiled (i.e. nvcc was present).
|
||||
#[cfg(not(no_cuda))]
|
||||
unsafe extern "C" {
|
||||
pub fn launch_vecadd_f32(a: *const f32, b: *const f32, c: *mut f32, n: i32, stream: CudaStream);
|
||||
}
|
||||
7
crates/xtrain-cuda/src/lib.rs
Normal file
7
crates/xtrain-cuda/src/lib.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
pub mod device;
|
||||
pub mod error;
|
||||
pub mod ffi;
|
||||
pub mod memory;
|
||||
|
||||
pub use error::{CudaError, Result};
|
||||
pub use memory::GpuBuffer;
|
||||
59
crates/xtrain-cuda/src/memory.rs
Normal file
59
crates/xtrain-cuda/src/memory.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
use crate::error::{self, Result};
|
||||
use crate::ffi;
|
||||
|
||||
/// RAII wrapper around a GPU memory allocation. Dropping frees the memory.
|
||||
pub struct GpuBuffer {
|
||||
ptr: *mut u8,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl GpuBuffer {
|
||||
pub fn alloc(len: usize) -> Result<Self> {
|
||||
assert!(len > 0, "cannot allocate 0 bytes on GPU");
|
||||
let mut ptr = std::ptr::null_mut();
|
||||
error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
|
||||
Ok(Self { ptr, len })
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len == 0
|
||||
}
|
||||
|
||||
pub fn as_ptr(&self) -> *const u8 {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
pub fn as_mut_ptr(&mut self) -> *mut u8 {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
/// Copy data from a host (CPU) slice to this GPU buffer.
|
||||
pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
|
||||
assert!(src.len() <= self.len, "source larger than buffer");
|
||||
error::check(unsafe {
|
||||
ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
|
||||
})
|
||||
}
|
||||
|
||||
/// Copy data from this GPU buffer to a host (CPU) slice.
|
||||
pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
|
||||
assert!(dst.len() <= self.len, "destination larger than buffer");
|
||||
error::check(unsafe {
|
||||
ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for GpuBuffer {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
unsafe { ffi::cudaFree(self.ptr) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl Send for GpuBuffer {}
|
||||
57
crates/xtrain-cuda/tests/integration.rs
Normal file
57
crates/xtrain-cuda/tests/integration.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
// Smoke test for the Rust↔CUDA build chain: allocate two host vectors, run the
|
||||
// vector-add kernel on the GPU, copy back, and assert the result is correct.
|
||||
//
|
||||
// Requires nvcc + a GPU, so it is gated behind `not(no_cuda)`. On the local
|
||||
// (GPU-less) machine build.rs sets the `no_cuda` cfg and this test is skipped,
|
||||
// keeping host-side `cargo check`/`cargo test --no-run` meaningful.
|
||||
#![cfg(not(no_cuda))]
|
||||
|
||||
use xtrain_cuda::{GpuBuffer, device, ffi};
|
||||
|
||||
#[test]
|
||||
fn test_vecadd_kernel() {
|
||||
let count = device::device_count().expect("failed to get device count");
|
||||
assert!(count > 0, "no CUDA devices found");
|
||||
device::set_device(0).unwrap();
|
||||
|
||||
let n = 1024;
|
||||
let a: Vec<f32> = (0..n).map(|i| i as f32).collect();
|
||||
let b: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
|
||||
let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();
|
||||
|
||||
let byte_len = n * std::mem::size_of::<f32>();
|
||||
|
||||
let mut d_a = GpuBuffer::alloc(byte_len).unwrap();
|
||||
let mut d_b = GpuBuffer::alloc(byte_len).unwrap();
|
||||
let mut d_c = GpuBuffer::alloc(byte_len).unwrap();
|
||||
|
||||
let a_bytes = unsafe { std::slice::from_raw_parts(a.as_ptr() as *const u8, byte_len) };
|
||||
let b_bytes = unsafe { std::slice::from_raw_parts(b.as_ptr() as *const u8, byte_len) };
|
||||
d_a.copy_from_host(a_bytes).unwrap();
|
||||
d_b.copy_from_host(b_bytes).unwrap();
|
||||
|
||||
unsafe {
|
||||
ffi::launch_vecadd_f32(
|
||||
d_a.as_ptr() as *const f32,
|
||||
d_b.as_ptr() as *const f32,
|
||||
d_c.as_mut_ptr() as *mut f32,
|
||||
n as i32,
|
||||
std::ptr::null_mut(), // default stream
|
||||
);
|
||||
}
|
||||
device::synchronize().unwrap();
|
||||
|
||||
let mut result = vec![0.0f32; n];
|
||||
let result_bytes =
|
||||
unsafe { std::slice::from_raw_parts_mut(result.as_mut_ptr() as *mut u8, byte_len) };
|
||||
d_c.copy_to_host(result_bytes).unwrap();
|
||||
|
||||
assert_eq!(result, expected, "vecadd kernel output mismatch");
|
||||
println!(
|
||||
"vecadd OK: first={} mid={} last={} ({} elems)",
|
||||
result[0],
|
||||
result[n / 2],
|
||||
result[n - 1],
|
||||
n
|
||||
);
|
||||
}
|
||||
16
csrc/test/vecadd.cu
Normal file
16
csrc/test/vecadd.cu
Normal file
@@ -0,0 +1,16 @@
|
||||
extern "C" {
|
||||
|
||||
__global__ void vecadd_f32(const float* a, const float* b, float* c, int n) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx < n) {
|
||||
c[idx] = a[idx] + b[idx];
|
||||
}
|
||||
}
|
||||
|
||||
void launch_vecadd_f32(const float* a, const float* b, float* c, int n, void* stream) {
|
||||
int block = 256;
|
||||
int grid = (n + block - 1) / block;
|
||||
vecadd_f32<<<grid, block, 0, (cudaStream_t)stream>>>(a, b, c, n);
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user