phase 0+1: project scaffold + xserv-cuda crate

- Cargo workspace with xserv-cuda crate
- CUDA FFI bindings (cudart: memory, stream, device, error)
- GpuBuffer RAII wrapper with H2D/D2H/D2D copy
- CudaStream wrapper with RAII Drop
- CachingAllocator with size-bucketed free lists
- PinnedBuffer for page-locked host memory
- Device info query via cudaDeviceGetAttribute
- Vector-add CUDA kernel smoke test
- Integration test suite (11 tests)
- build.rs: cc crate compiles .cu for SM 12.0
- sync-and-build.sh for remote build on dash5
- Roadmap doc (docs/00-roadmap.md) and Phase 0+1 design doc

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-21 18:40:22 +08:00
commit 9806b4db35
16 changed files with 2629 additions and 0 deletions

9
.gitignore vendored Normal file
View File

@@ -0,0 +1,9 @@
/target
*.o
*.so
*.a
*.ptx
*.cubin
**/*.rs.bk
.env
*.npy

14
Cargo.toml Normal file
View File

@@ -0,0 +1,14 @@
[workspace]
resolver = "2"
members = [
"crates/xserv-cuda",
]
[workspace.package]
version = "0.1.0"
edition = "2024"
license = "MIT"
[workspace.dependencies]
half = "2"
smallvec = "1"

View File

@@ -0,0 +1,10 @@
[package]
name = "xserv-cuda"
version.workspace = true
edition.workspace = true
[build-dependencies]
cc = { version = "1", features = ["cuda"] }
[dev-dependencies]
rand = "0.9"

View File

@@ -0,0 +1,20 @@
use std::env;
fn main() {
let cuda_path = env::var("CUDA_HOME")
.or_else(|_| env::var("CUDA_PATH"))
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
println!("cargo:rustc-link-search=native={cuda_path}/lib64");
println!("cargo:rustc-link-lib=dylib=cudart");
println!("cargo:rustc-link-lib=dylib=cuda");
cc::Build::new()
.cuda(true)
.cudart("shared")
.flag("-gencode=arch=compute_120,code=sm_120")
.file("../../csrc/test/vecadd.cu")
.compile("xserv_cuda_kernels");
println!("cargo:rerun-if-changed=../../csrc/");
}

View File

@@ -0,0 +1,109 @@
use crate::error::Result;
use crate::ffi;
use crate::memory::GpuBuffer;
use std::collections::HashMap;
/// Caching allocator that reuses freed GPU buffers instead of calling
/// cudaMalloc/cudaFree on every allocation.
///
/// Freed buffers are kept in a per-size-bucket free list. On allocation,
/// we first check the free list for a buffer of matching (rounded) size.
pub struct CachingAllocator {
free_lists: HashMap<usize, Vec<(*mut u8, usize)>>,
stats: AllocStats,
}
#[derive(Debug, Default, Clone)]
pub struct AllocStats {
pub alloc_count: u64,
pub cache_hit_count: u64,
pub cuda_malloc_count: u64,
pub cuda_free_count: u64,
pub current_allocated: usize,
pub peak_allocated: usize,
}
impl CachingAllocator {
pub fn new() -> Self {
Self {
free_lists: HashMap::new(),
stats: AllocStats::default(),
}
}
pub fn alloc(&mut self, size: usize) -> Result<GpuBuffer> {
let bucket = bucket_size(size);
self.stats.alloc_count += 1;
if let Some(list) = self.free_lists.get_mut(&bucket) {
if let Some((ptr, actual_len)) = list.pop() {
self.stats.cache_hit_count += 1;
self.stats.current_allocated += actual_len;
if self.stats.current_allocated > self.stats.peak_allocated {
self.stats.peak_allocated = self.stats.current_allocated;
}
return Ok(unsafe { GpuBuffer::from_raw(ptr, actual_len) });
}
}
self.stats.cuda_malloc_count += 1;
let buf = GpuBuffer::alloc(bucket)?;
self.stats.current_allocated += bucket;
if self.stats.current_allocated > self.stats.peak_allocated {
self.stats.peak_allocated = self.stats.current_allocated;
}
Ok(buf)
}
/// Return a buffer to the cache instead of freeing it.
pub fn dealloc(&mut self, buf: GpuBuffer) {
let (ptr, len) = buf.into_raw();
let bucket = bucket_size(len);
self.stats.current_allocated = self.stats.current_allocated.saturating_sub(len);
self.free_lists.entry(bucket).or_default().push((ptr, len));
}
/// Actually free all cached buffers.
pub fn trim(&mut self) {
for (_bucket, list) in self.free_lists.drain() {
for (ptr, _len) in list {
unsafe { ffi::cudaFree(ptr) };
self.stats.cuda_free_count += 1;
}
}
}
pub fn stats(&self) -> &AllocStats {
&self.stats
}
}
impl Drop for CachingAllocator {
fn drop(&mut self) {
self.trim();
}
}
/// Round up to next power-of-2, minimum 512 bytes.
fn bucket_size(size: usize) -> usize {
let min = 512;
if size <= min {
return min;
}
size.next_power_of_two()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bucket_size() {
assert_eq!(bucket_size(1), 512);
assert_eq!(bucket_size(512), 512);
assert_eq!(bucket_size(513), 1024);
assert_eq!(bucket_size(1024), 1024);
assert_eq!(bucket_size(1025), 2048);
assert_eq!(bucket_size(1 << 20), 1 << 20);
}
}

View File

@@ -0,0 +1,77 @@
use crate::error::{self, Result};
use crate::ffi;
use std::ffi::CStr;
#[derive(Debug, Clone)]
pub struct DeviceInfo {
pub index: u32,
pub name: String,
pub total_memory: usize,
pub compute_major: i32,
pub compute_minor: i32,
pub sm_count: i32,
pub shared_mem_per_block: usize,
pub warp_size: i32,
pub max_threads_per_block: i32,
}
extern "C" {
fn cudaDeviceGetAttribute(value: *mut i32, attr: i32, device: i32) -> i32;
}
fn get_attr(attr: i32, device: u32) -> Result<i32> {
let mut value = 0;
error::check(unsafe { cudaDeviceGetAttribute(&mut value, attr, device as i32) })?;
Ok(value)
}
pub fn device_count() -> Result<i32> {
let mut count = 0;
error::check(unsafe { ffi::cudaGetDeviceCount(&mut count) })?;
Ok(count)
}
pub fn set_device(device: u32) -> Result<()> {
error::check(unsafe { ffi::cudaSetDevice(device as i32) })
}
pub fn current_device() -> Result<u32> {
let mut dev = 0;
error::check(unsafe { ffi::cudaGetDevice(&mut dev) })?;
Ok(dev as u32)
}
pub fn device_info(device: u32) -> Result<DeviceInfo> {
// Use cudaGetDeviceProperties only for the name (first field, always stable).
let mut prop = unsafe { std::mem::zeroed::<ffi::CudaDeviceProp>() };
error::check(unsafe { ffi::cudaGetDeviceProperties(&mut prop, device as i32) })?;
let name = unsafe { CStr::from_ptr(prop.name.as_ptr()) }
.to_string_lossy()
.into_owned();
// Use cudaDeviceGetAttribute for everything else (layout-independent).
// Attribute IDs from cuda_runtime_api.h:
const TOTAL_GLOBAL_MEM: i32 = 0; // not available via attribute, use prop
const SHARED_MEM_PER_BLOCK: i32 = 8;
const WARP_SIZE: i32 = 10;
const MAX_THREADS_PER_BLOCK: i32 = 1;
const MULTI_PROCESSOR_COUNT: i32 = 16;
const COMPUTE_MAJOR: i32 = 75;
const COMPUTE_MINOR: i32 = 76;
Ok(DeviceInfo {
index: device,
name,
total_memory: prop.total_global_mem,
compute_major: get_attr(COMPUTE_MAJOR, device)?,
compute_minor: get_attr(COMPUTE_MINOR, device)?,
sm_count: get_attr(MULTI_PROCESSOR_COUNT, device)?,
shared_mem_per_block: get_attr(SHARED_MEM_PER_BLOCK, device)? as usize,
warp_size: get_attr(WARP_SIZE, device)?,
max_threads_per_block: get_attr(MAX_THREADS_PER_BLOCK, device)?,
})
}
pub fn synchronize() -> Result<()> {
error::check(unsafe { ffi::cudaDeviceSynchronize() })
}

View File

@@ -0,0 +1,43 @@
use crate::ffi;
use std::ffi::CStr;
use std::fmt;
#[derive(Debug)]
pub enum CudaError {
OutOfMemory,
InvalidDevice,
Raw { code: i32, message: String },
}
impl fmt::Display for CudaError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
CudaError::OutOfMemory => write!(f, "CUDA out of memory"),
CudaError::InvalidDevice => write!(f, "CUDA invalid device"),
CudaError::Raw { code, message } => write!(f, "CUDA error {code}: {message}"),
}
}
}
impl std::error::Error for CudaError {}
pub type Result<T> = std::result::Result<T, CudaError>;
pub(crate) fn check(code: i32) -> Result<()> {
if code == ffi::CUDA_SUCCESS {
return Ok(());
}
let message = unsafe {
let ptr = ffi::cudaGetErrorString(code);
if ptr.is_null() {
"unknown error".to_string()
} else {
CStr::from_ptr(ptr).to_string_lossy().into_owned()
}
};
Err(match code {
ffi::CUDA_ERROR_OUT_OF_MEMORY => CudaError::OutOfMemory,
101 => CudaError::InvalidDevice,
_ => CudaError::Raw { code, message },
})
}

View File

@@ -0,0 +1,73 @@
use std::ffi::c_void;
use std::os::raw::c_char;
pub type CudaStream = *mut c_void;
pub type CudaEvent = *mut c_void;
pub const CUDA_MEMCPY_H2D: i32 = 1;
pub const CUDA_MEMCPY_D2H: i32 = 2;
pub const CUDA_MEMCPY_D2D: i32 = 3;
pub const CUDA_SUCCESS: i32 = 0;
pub const CUDA_ERROR_OUT_OF_MEMORY: i32 = 2;
#[repr(C)]
pub struct CudaDeviceProp {
pub name: [c_char; 256],
pub total_global_mem: usize,
pub shared_mem_per_block: usize,
pub regs_per_block: i32,
pub warp_size: i32,
pub max_threads_per_block: i32,
pub max_threads_dim: [i32; 3],
pub max_grid_size: [i32; 3],
pub clock_rate: i32,
pub total_const_mem: usize,
pub major: i32,
pub minor: i32,
// There are many more fields; we only read up to what we need.
// cudaDeviceProp is a large struct (~1KB). We pad the rest.
_pad: [u8; 4096],
}
extern "C" {
// --- Device ---
pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
pub fn cudaSetDevice(device: i32) -> i32;
pub fn cudaGetDevice(device: *mut i32) -> i32;
pub fn cudaGetDeviceProperties(prop: *mut CudaDeviceProp, device: i32) -> i32;
pub fn cudaDeviceSynchronize() -> i32;
// --- Memory ---
pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
pub fn cudaFree(devptr: *mut u8) -> i32;
pub fn cudaMallocHost(ptr: *mut *mut u8, size: usize) -> i32;
pub fn cudaFreeHost(ptr: *mut u8) -> i32;
pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
pub fn cudaMemcpyAsync(
dst: *mut u8,
src: *const u8,
count: usize,
kind: i32,
stream: CudaStream,
) -> i32;
pub fn cudaMemset(devptr: *mut u8, value: i32, count: usize) -> i32;
// --- Stream ---
pub fn cudaStreamCreate(stream: *mut CudaStream) -> i32;
pub fn cudaStreamDestroy(stream: CudaStream) -> i32;
pub fn cudaStreamSynchronize(stream: CudaStream) -> i32;
// --- Error ---
pub fn cudaGetLastError() -> i32;
pub fn cudaGetErrorString(error: i32) -> *const c_char;
// --- Our test kernel ---
pub fn launch_vecadd_f32(
a: *const f32,
b: *const f32,
c: *mut f32,
n: i32,
stream: CudaStream,
);
}

View File

@@ -0,0 +1,12 @@
pub mod allocator;
pub mod device;
pub mod error;
pub mod ffi;
pub mod memory;
pub mod stream;
pub use allocator::CachingAllocator;
pub use device::DeviceInfo;
pub use error::{CudaError, Result};
pub use memory::{GpuBuffer, PinnedBuffer};
pub use stream::CudaStream;

View File

@@ -0,0 +1,146 @@
use crate::error::{self, Result};
use crate::ffi;
use crate::stream::CudaStream;
/// RAII wrapper around a GPU memory allocation.
pub struct GpuBuffer {
ptr: *mut u8,
len: usize,
}
impl GpuBuffer {
pub fn alloc(len: usize) -> Result<Self> {
assert!(len > 0, "cannot allocate 0 bytes on GPU");
let mut ptr = std::ptr::null_mut();
error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
Ok(Self { ptr, len })
}
pub fn len(&self) -> usize {
self.len
}
pub fn as_ptr(&self) -> *const u8 {
self.ptr
}
pub fn as_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
/// Copy data from host (CPU) slice to this GPU buffer.
pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
assert!(src.len() <= self.len, "source larger than buffer");
error::check(unsafe {
ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
})
}
/// Copy data from this GPU buffer to a host (CPU) slice.
pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
assert!(dst.len() <= self.len, "destination larger than buffer");
error::check(unsafe {
ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
})
}
/// Async copy from host to device on the given stream.
/// Safety: `src` must remain valid until the stream operation completes.
pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
assert!(src.len() <= self.len);
error::check(ffi::cudaMemcpyAsync(
self.ptr,
src.as_ptr(),
src.len(),
ffi::CUDA_MEMCPY_H2D,
stream.as_raw(),
))
}
/// Async copy from device to host on the given stream.
/// Safety: `dst` must remain valid until the stream operation completes.
pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
assert!(dst.len() <= self.len);
error::check(ffi::cudaMemcpyAsync(
dst.as_mut_ptr(),
self.ptr,
dst.len(),
ffi::CUDA_MEMCPY_D2H,
stream.as_raw(),
))
}
/// Copy from another GPU buffer (D2D).
pub fn copy_from_device(&mut self, src: &GpuBuffer) -> Result<()> {
let n = src.len.min(self.len);
error::check(unsafe {
ffi::cudaMemcpy(self.ptr, src.ptr, n, ffi::CUDA_MEMCPY_D2D)
})
}
/// Fill buffer with zeros.
pub fn zero(&mut self) -> Result<()> {
error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
}
/// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
/// Caller is responsible for eventually calling cudaFree.
pub fn into_raw(self) -> (*mut u8, usize) {
let ptr = self.ptr;
let len = self.len;
std::mem::forget(self);
(ptr, len)
}
/// Reconstruct a GpuBuffer from a raw pointer + length.
/// Safety: ptr must have been allocated with cudaMalloc, len must be correct.
pub unsafe fn from_raw(ptr: *mut u8, len: usize) -> Self {
Self { ptr, len }
}
}
impl Drop for GpuBuffer {
fn drop(&mut self) {
if !self.ptr.is_null() {
unsafe { ffi::cudaFree(self.ptr) };
}
}
}
unsafe impl Send for GpuBuffer {}
/// Pinned (page-locked) host memory for faster H2D/D2H transfers.
pub struct PinnedBuffer {
ptr: *mut u8,
len: usize,
}
impl PinnedBuffer {
pub fn alloc(len: usize) -> Result<Self> {
let mut ptr = std::ptr::null_mut();
error::check(unsafe { ffi::cudaMallocHost(&mut ptr, len) })?;
Ok(Self { ptr, len })
}
pub fn as_slice(&self) -> &[u8] {
unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
}
pub fn as_mut_slice(&mut self) -> &mut [u8] {
unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
}
pub fn len(&self) -> usize {
self.len
}
}
impl Drop for PinnedBuffer {
fn drop(&mut self) {
if !self.ptr.is_null() {
unsafe { ffi::cudaFreeHost(self.ptr) };
}
}
}
unsafe impl Send for PinnedBuffer {}

View File

@@ -0,0 +1,33 @@
use crate::error::{self, Result};
use crate::ffi;
pub struct CudaStream {
raw: ffi::CudaStream,
}
impl CudaStream {
pub fn new() -> Result<Self> {
let mut raw = std::ptr::null_mut();
error::check(unsafe { ffi::cudaStreamCreate(&mut raw) })?;
Ok(Self { raw })
}
pub fn synchronize(&self) -> Result<()> {
error::check(unsafe { ffi::cudaStreamSynchronize(self.raw) })
}
pub fn as_raw(&self) -> ffi::CudaStream {
self.raw
}
}
impl Drop for CudaStream {
fn drop(&mut self) {
if !self.raw.is_null() {
unsafe { ffi::cudaStreamDestroy(self.raw) };
}
}
}
// Can move across threads, but not shared without synchronization
unsafe impl Send for CudaStream {}

View File

@@ -0,0 +1,208 @@
use xserv_cuda::*;
#[test]
fn test_device_info() {
let count = device::device_count().expect("failed to get device count");
assert!(count > 0, "no CUDA devices found");
let info = device::device_info(0).expect("failed to get device info");
println!("GPU 0: {}", info.name);
println!(" Memory: {} MB", info.total_memory / (1024 * 1024));
println!(
" Compute Capability: {}.{}",
info.compute_major, info.compute_minor
);
println!(" SM Count: {}", info.sm_count);
println!(" Shared Mem/Block: {} KB", info.shared_mem_per_block / 1024);
println!(" Warp Size: {}", info.warp_size);
println!(" Max Threads/Block: {}", info.max_threads_per_block);
assert!(info.total_memory > 0);
assert!(info.sm_count > 0);
}
#[test]
fn test_gpu_buffer_h2d_d2h() {
device::set_device(0).unwrap();
let data: Vec<u8> = (0..256).map(|i| (i % 256) as u8).collect();
let mut buf = GpuBuffer::alloc(data.len()).unwrap();
buf.copy_from_host(&data).unwrap();
let mut out = vec![0u8; data.len()];
buf.copy_to_host(&mut out).unwrap();
assert_eq!(data, out, "H2D → D2H roundtrip mismatch");
}
#[test]
fn test_gpu_buffer_large() {
device::set_device(0).unwrap();
let size = 64 * 1024 * 1024; // 64 MB
let data: Vec<u8> = (0..size).map(|i| (i % 251) as u8).collect();
let mut buf = GpuBuffer::alloc(size).unwrap();
buf.copy_from_host(&data).unwrap();
let mut out = vec![0u8; size];
buf.copy_to_host(&mut out).unwrap();
assert_eq!(data, out, "64MB roundtrip mismatch");
}
#[test]
fn test_gpu_buffer_d2d() {
device::set_device(0).unwrap();
let data: Vec<u8> = (0..1024).map(|i| (i % 256) as u8).collect();
let mut src = GpuBuffer::alloc(data.len()).unwrap();
src.copy_from_host(&data).unwrap();
let mut dst = GpuBuffer::alloc(data.len()).unwrap();
dst.copy_from_device(&src).unwrap();
let mut out = vec![0u8; data.len()];
dst.copy_to_host(&mut out).unwrap();
assert_eq!(data, out, "D2D copy mismatch");
}
#[test]
fn test_gpu_buffer_zero() {
device::set_device(0).unwrap();
let mut buf = GpuBuffer::alloc(1024).unwrap();
buf.zero().unwrap();
let mut out = vec![0xFFu8; 1024];
buf.copy_to_host(&mut out).unwrap();
assert!(out.iter().all(|&b| b == 0), "zero fill failed");
}
#[test]
fn test_stream() {
device::set_device(0).unwrap();
let stream = CudaStream::new().unwrap();
stream.synchronize().unwrap();
// stream drops here, should destroy cleanly
}
#[test]
fn test_vecadd_kernel() {
device::set_device(0).unwrap();
let n = 1024;
let a: Vec<f32> = (0..n).map(|i| i as f32).collect();
let b: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();
let byte_len = n * std::mem::size_of::<f32>();
let mut d_a = GpuBuffer::alloc(byte_len).unwrap();
let mut d_b = GpuBuffer::alloc(byte_len).unwrap();
let mut d_c = GpuBuffer::alloc(byte_len).unwrap();
let a_bytes = unsafe { std::slice::from_raw_parts(a.as_ptr() as *const u8, byte_len) };
let b_bytes = unsafe { std::slice::from_raw_parts(b.as_ptr() as *const u8, byte_len) };
d_a.copy_from_host(a_bytes).unwrap();
d_b.copy_from_host(b_bytes).unwrap();
unsafe {
ffi::launch_vecadd_f32(
d_a.as_ptr() as *const f32,
d_b.as_ptr() as *const f32,
d_c.as_mut_ptr() as *mut f32,
n as i32,
std::ptr::null_mut(), // default stream
);
}
device::synchronize().unwrap();
let mut result = vec![0.0f32; n];
let result_bytes =
unsafe { std::slice::from_raw_parts_mut(result.as_mut_ptr() as *mut u8, byte_len) };
d_c.copy_to_host(result_bytes).unwrap();
assert_eq!(result, expected, "vecadd kernel output mismatch");
}
#[test]
fn test_caching_allocator() {
device::set_device(0).unwrap();
let mut alloc = CachingAllocator::new();
// First allocation: should trigger cudaMalloc
let buf1 = alloc.alloc(1024).unwrap();
assert_eq!(alloc.stats().cuda_malloc_count, 1);
assert_eq!(alloc.stats().cache_hit_count, 0);
// Return to cache
alloc.dealloc(buf1);
// Second allocation of same size: should hit cache
let _buf2 = alloc.alloc(1024).unwrap();
assert_eq!(alloc.stats().cuda_malloc_count, 1, "should reuse cached buffer");
assert_eq!(alloc.stats().cache_hit_count, 1);
}
#[test]
fn test_caching_allocator_different_sizes() {
device::set_device(0).unwrap();
let mut alloc = CachingAllocator::new();
let buf1 = alloc.alloc(512).unwrap();
let buf2 = alloc.alloc(2048).unwrap();
alloc.dealloc(buf1);
alloc.dealloc(buf2);
// Re-alloc different sizes: each should hit its own bucket
let _buf3 = alloc.alloc(512).unwrap();
let _buf4 = alloc.alloc(2048).unwrap();
assert_eq!(alloc.stats().cuda_malloc_count, 2);
assert_eq!(alloc.stats().cache_hit_count, 2);
}
#[test]
fn test_pinned_memory() {
let mut pinned = PinnedBuffer::alloc(4096).unwrap();
let slice = pinned.as_mut_slice();
for (i, byte) in slice.iter_mut().enumerate() {
*byte = (i % 256) as u8;
}
device::set_device(0).unwrap();
let mut gpu = GpuBuffer::alloc(4096).unwrap();
gpu.copy_from_host(pinned.as_slice()).unwrap();
let mut out = vec![0u8; 4096];
gpu.copy_to_host(&mut out).unwrap();
assert_eq!(pinned.as_slice(), &out[..]);
}
#[test]
fn test_async_copy() {
device::set_device(0).unwrap();
let stream = CudaStream::new().unwrap();
let mut pinned = PinnedBuffer::alloc(4096).unwrap();
for (i, byte) in pinned.as_mut_slice().iter_mut().enumerate() {
*byte = (i % 256) as u8;
}
let mut gpu = GpuBuffer::alloc(4096).unwrap();
unsafe { gpu.copy_from_host_async(pinned.as_slice(), &stream).unwrap() };
stream.synchronize().unwrap();
let mut out_pinned = PinnedBuffer::alloc(4096).unwrap();
unsafe { gpu.copy_to_host_async(out_pinned.as_mut_slice(), &stream).unwrap() };
stream.synchronize().unwrap();
assert_eq!(pinned.as_slice(), out_pinned.as_slice());
}

16
csrc/test/vecadd.cu Normal file
View File

@@ -0,0 +1,16 @@
extern "C" {
__global__ void vecadd_f32(const float* a, const float* b, float* c, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
void launch_vecadd_f32(const float* a, const float* b, float* c, int n, void* stream) {
int block = 256;
int grid = (n + block - 1) / block;
vecadd_f32<<<grid, block, 0, (cudaStream_t)stream>>>(a, b, c, n);
}
}

1754
docs/00-roadmap.md Normal file

File diff suppressed because it is too large Load Diff

80
docs/01-cuda-ffi.md Normal file
View File

@@ -0,0 +1,80 @@
# Phase 0+1: CUDA FFI Infrastructure — Design Document
## Goal
Build `xserv-cuda`, a Rust crate that wraps CUDA Runtime API with safe abstractions:
- Device query and selection
- GPU memory allocation with RAII (GpuBuffer)
- Caching allocator (avoid repeated cudaMalloc/cudaFree)
- CUDA streams for async operations
- Host↔Device memory transfers
- Error handling wrapping all CUDA calls
## Module Layout
```
crates/xserv-cuda/
├── Cargo.toml
├── build.rs # compiles csrc/*.cu via cc crate
└── src/
├── lib.rs # re-exports
├── ffi.rs # raw extern "C" bindings to CUDA runtime
├── error.rs # CudaError type
├── device.rs # device query, DeviceInfo
├── stream.rs # CudaStream wrapper
├── memory.rs # GpuBuffer, H2D/D2H/D2D copy
└── allocator.rs # CachingAllocator
```
## Key Design Decisions
### FFI Bindings (ffi.rs)
Hand-written extern "C" bindings (~25 functions). No bindgen — keeps things explicit and readable.
Core functions needed:
- Memory: cudaMalloc, cudaFree, cudaMemcpy, cudaMemcpyAsync, cudaMallocHost, cudaFreeHost
- Stream: cudaStreamCreate, cudaStreamDestroy, cudaStreamSynchronize
- Device: cudaGetDeviceCount, cudaSetDevice, cudaGetDevice, cudaGetDeviceProperties
- Sync: cudaDeviceSynchronize
- Error: cudaGetLastError, cudaGetErrorString
### Error Handling (error.rs)
Every CUDA call returns cudaError_t. We wrap all calls:
```rust
pub(crate) fn check(code: i32) -> Result<(), CudaError>
```
### GpuBuffer (memory.rs)
RAII wrapper around a GPU pointer. Drop frees memory.
```rust
pub struct GpuBuffer {
ptr: *mut u8,
len: usize, // in bytes
device: u32,
}
```
- No Clone (explicit copy_from instead)
- Send + !Sync (can move across threads, but not shared)
### CachingAllocator (allocator.rs)
Avoids cudaMalloc/cudaFree per allocation. Maintains a free-list keyed by size bucket.
Bucket boundaries: round up to next power of 2, minimum 512 bytes.
- alloc(size) → find bucket, pop from free list or cudaMalloc
- dealloc(ptr, size) → push to free list (don't cudaFree)
- trim() → actually cudaFree everything in free lists
### CudaStream (stream.rs)
Wraps cudaStream_t. RAII with Drop calling cudaStreamDestroy.
## Build Pipeline
- `csrc/test/vecadd.cu`: minimal vector-add kernel for smoke test
- `build.rs` uses `cc` crate to compile .cu files, link CUDA runtime
## Test Plan
1. Device info: print GPU name, memory, compute capability, SM count
2. GpuBuffer: alloc 1GB, H2D copy, D2H copy, verify data
3. Vector add kernel: launch from Rust, verify output
4. CachingAllocator: alloc→free→realloc same size uses cache (no new cudaMalloc)
5. Multi-stream: two concurrent memcpy on different streams
6. Benchmark: caching allocator vs raw cudaMalloc (100 cycles)

25
tools/sync-and-build.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/bash
# Sync local project to dash5 and build/test there.
# Usage: ./tools/sync-and-build.sh [test|build|run]
set -e
REMOTE="dash5"
REMOTE_DIR="/opt/wjh/projects/xserv"
LOCAL_DIR="$(cd "$(dirname "$0")/.." && pwd)"
ACTION="${1:-build}"
echo "=== Syncing to $REMOTE:$REMOTE_DIR ==="
ssh "$REMOTE" "mkdir -p $REMOTE_DIR"
rsync -az --delete \
--exclude target \
--exclude .git \
"$LOCAL_DIR/" "$REMOTE:$REMOTE_DIR/"
echo "=== Running: cargo $ACTION ==="
ssh "$REMOTE" "source \$HOME/.cargo/env && \
export PATH=/usr/local/cuda/bin:\$PATH && \
export CUDA_HOME=/usr/local/cuda && \
cd $REMOTE_DIR && \
cargo $ACTION --release 2>&1"