phase 0+1: project scaffold + xserv-cuda crate

- Cargo workspace with xserv-cuda crate - CUDA FFI bindings (cudart: memory, stream, device, error) - GpuBuffer RAII wrapper with H2D/D2H/D2D copy - CudaStream wrapper with RAII Drop - CachingAllocator with size-bucketed free lists - PinnedBuffer for page-locked host memory - Device info query via cudaDeviceGetAttribute - Vector-add CUDA kernel smoke test - Integration test suite (11 tests) - build.rs: cc crate compiles .cu for SM 12.0 - sync-and-build.sh for remote build on dash5 - Roadmap doc (docs/00-roadmap.md) and Phase 0+1 design doc Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-21 18:40:22 +08:00
commit 9806b4db35
16 changed files with 2629 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,9 @@
+/target
+*.o
+*.so
+*.a
+*.ptx
+*.cubin
+**/*.rs.bk
+.env
+*.npy
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -0,0 +1,14 @@
+[workspace]
+resolver = "2"
+members = [
+    "crates/xserv-cuda",
+]
+
+[workspace.package]
+version = "0.1.0"
+edition = "2024"
+license = "MIT"
+
+[workspace.dependencies]
+half = "2"
+smallvec = "1"
--- a/crates/xserv-cuda/Cargo.toml
+++ b/crates/xserv-cuda/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "xserv-cuda"
+version.workspace = true
+edition.workspace = true
+
+[build-dependencies]
+cc = { version = "1", features = ["cuda"] }
+
+[dev-dependencies]
+rand = "0.9"
--- a/crates/xserv-cuda/build.rs
+++ b/crates/xserv-cuda/build.rs
@@ -0,0 +1,20 @@
+use std::env;
+
+fn main() {
+    let cuda_path = env::var("CUDA_HOME")
+        .or_else(|_| env::var("CUDA_PATH"))
+        .unwrap_or_else(|_| "/usr/local/cuda".to_string());
+
+    println!("cargo:rustc-link-search=native={cuda_path}/lib64");
+    println!("cargo:rustc-link-lib=dylib=cudart");
+    println!("cargo:rustc-link-lib=dylib=cuda");
+
+    cc::Build::new()
+        .cuda(true)
+        .cudart("shared")
+        .flag("-gencode=arch=compute_120,code=sm_120")
+        .file("../../csrc/test/vecadd.cu")
+        .compile("xserv_cuda_kernels");
+
+    println!("cargo:rerun-if-changed=../../csrc/");
+}
--- a/crates/xserv-cuda/src/allocator.rs
+++ b/crates/xserv-cuda/src/allocator.rs
@@ -0,0 +1,109 @@
+use crate::error::Result;
+use crate::ffi;
+use crate::memory::GpuBuffer;
+use std::collections::HashMap;
+
+/// Caching allocator that reuses freed GPU buffers instead of calling
+/// cudaMalloc/cudaFree on every allocation.
+///
+/// Freed buffers are kept in a per-size-bucket free list. On allocation,
+/// we first check the free list for a buffer of matching (rounded) size.
+pub struct CachingAllocator {
+    free_lists: HashMap<usize, Vec<(*mut u8, usize)>>,
+    stats: AllocStats,
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct AllocStats {
+    pub alloc_count: u64,
+    pub cache_hit_count: u64,
+    pub cuda_malloc_count: u64,
+    pub cuda_free_count: u64,
+    pub current_allocated: usize,
+    pub peak_allocated: usize,
+}
+
+impl CachingAllocator {
+    pub fn new() -> Self {
+        Self {
+            free_lists: HashMap::new(),
+            stats: AllocStats::default(),
+        }
+    }
+
+    pub fn alloc(&mut self, size: usize) -> Result<GpuBuffer> {
+        let bucket = bucket_size(size);
+        self.stats.alloc_count += 1;
+
+        if let Some(list) = self.free_lists.get_mut(&bucket) {
+            if let Some((ptr, actual_len)) = list.pop() {
+                self.stats.cache_hit_count += 1;
+                self.stats.current_allocated += actual_len;
+                if self.stats.current_allocated > self.stats.peak_allocated {
+                    self.stats.peak_allocated = self.stats.current_allocated;
+                }
+                return Ok(unsafe { GpuBuffer::from_raw(ptr, actual_len) });
+            }
+        }
+
+        self.stats.cuda_malloc_count += 1;
+        let buf = GpuBuffer::alloc(bucket)?;
+        self.stats.current_allocated += bucket;
+        if self.stats.current_allocated > self.stats.peak_allocated {
+            self.stats.peak_allocated = self.stats.current_allocated;
+        }
+        Ok(buf)
+    }
+
+    /// Return a buffer to the cache instead of freeing it.
+    pub fn dealloc(&mut self, buf: GpuBuffer) {
+        let (ptr, len) = buf.into_raw();
+        let bucket = bucket_size(len);
+        self.stats.current_allocated = self.stats.current_allocated.saturating_sub(len);
+        self.free_lists.entry(bucket).or_default().push((ptr, len));
+    }
+
+    /// Actually free all cached buffers.
+    pub fn trim(&mut self) {
+        for (_bucket, list) in self.free_lists.drain() {
+            for (ptr, _len) in list {
+                unsafe { ffi::cudaFree(ptr) };
+                self.stats.cuda_free_count += 1;
+            }
+        }
+    }
+
+    pub fn stats(&self) -> &AllocStats {
+        &self.stats
+    }
+}
+
+impl Drop for CachingAllocator {
+    fn drop(&mut self) {
+        self.trim();
+    }
+}
+
+/// Round up to next power-of-2, minimum 512 bytes.
+fn bucket_size(size: usize) -> usize {
+    let min = 512;
+    if size <= min {
+        return min;
+    }
+    size.next_power_of_two()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_bucket_size() {
+        assert_eq!(bucket_size(1), 512);
+        assert_eq!(bucket_size(512), 512);
+        assert_eq!(bucket_size(513), 1024);
+        assert_eq!(bucket_size(1024), 1024);
+        assert_eq!(bucket_size(1025), 2048);
+        assert_eq!(bucket_size(1 << 20), 1 << 20);
+    }
+}
--- a/crates/xserv-cuda/src/device.rs
+++ b/crates/xserv-cuda/src/device.rs
@@ -0,0 +1,77 @@
+use crate::error::{self, Result};
+use crate::ffi;
+use std::ffi::CStr;
+
+#[derive(Debug, Clone)]
+pub struct DeviceInfo {
+    pub index: u32,
+    pub name: String,
+    pub total_memory: usize,
+    pub compute_major: i32,
+    pub compute_minor: i32,
+    pub sm_count: i32,
+    pub shared_mem_per_block: usize,
+    pub warp_size: i32,
+    pub max_threads_per_block: i32,
+}
+
+extern "C" {
+    fn cudaDeviceGetAttribute(value: *mut i32, attr: i32, device: i32) -> i32;
+}
+
+fn get_attr(attr: i32, device: u32) -> Result<i32> {
+    let mut value = 0;
+    error::check(unsafe { cudaDeviceGetAttribute(&mut value, attr, device as i32) })?;
+    Ok(value)
+}
+
+pub fn device_count() -> Result<i32> {
+    let mut count = 0;
+    error::check(unsafe { ffi::cudaGetDeviceCount(&mut count) })?;
+    Ok(count)
+}
+
+pub fn set_device(device: u32) -> Result<()> {
+    error::check(unsafe { ffi::cudaSetDevice(device as i32) })
+}
+
+pub fn current_device() -> Result<u32> {
+    let mut dev = 0;
+    error::check(unsafe { ffi::cudaGetDevice(&mut dev) })?;
+    Ok(dev as u32)
+}
+
+pub fn device_info(device: u32) -> Result<DeviceInfo> {
+    // Use cudaGetDeviceProperties only for the name (first field, always stable).
+    let mut prop = unsafe { std::mem::zeroed::<ffi::CudaDeviceProp>() };
+    error::check(unsafe { ffi::cudaGetDeviceProperties(&mut prop, device as i32) })?;
+    let name = unsafe { CStr::from_ptr(prop.name.as_ptr()) }
+        .to_string_lossy()
+        .into_owned();
+
+    // Use cudaDeviceGetAttribute for everything else (layout-independent).
+    // Attribute IDs from cuda_runtime_api.h:
+    const TOTAL_GLOBAL_MEM: i32 = 0; // not available via attribute, use prop
+    const SHARED_MEM_PER_BLOCK: i32 = 8;
+    const WARP_SIZE: i32 = 10;
+    const MAX_THREADS_PER_BLOCK: i32 = 1;
+    const MULTI_PROCESSOR_COUNT: i32 = 16;
+    const COMPUTE_MAJOR: i32 = 75;
+    const COMPUTE_MINOR: i32 = 76;
+
+    Ok(DeviceInfo {
+        index: device,
+        name,
+        total_memory: prop.total_global_mem,
+        compute_major: get_attr(COMPUTE_MAJOR, device)?,
+        compute_minor: get_attr(COMPUTE_MINOR, device)?,
+        sm_count: get_attr(MULTI_PROCESSOR_COUNT, device)?,
+        shared_mem_per_block: get_attr(SHARED_MEM_PER_BLOCK, device)? as usize,
+        warp_size: get_attr(WARP_SIZE, device)?,
+        max_threads_per_block: get_attr(MAX_THREADS_PER_BLOCK, device)?,
+    })
+}
+
+pub fn synchronize() -> Result<()> {
+    error::check(unsafe { ffi::cudaDeviceSynchronize() })
+}
--- a/crates/xserv-cuda/src/error.rs
+++ b/crates/xserv-cuda/src/error.rs
@@ -0,0 +1,43 @@
+use crate::ffi;
+use std::ffi::CStr;
+use std::fmt;
+
+#[derive(Debug)]
+pub enum CudaError {
+    OutOfMemory,
+    InvalidDevice,
+    Raw { code: i32, message: String },
+}
+
+impl fmt::Display for CudaError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            CudaError::OutOfMemory => write!(f, "CUDA out of memory"),
+            CudaError::InvalidDevice => write!(f, "CUDA invalid device"),
+            CudaError::Raw { code, message } => write!(f, "CUDA error {code}: {message}"),
+        }
+    }
+}
+
+impl std::error::Error for CudaError {}
+
+pub type Result<T> = std::result::Result<T, CudaError>;
+
+pub(crate) fn check(code: i32) -> Result<()> {
+    if code == ffi::CUDA_SUCCESS {
+        return Ok(());
+    }
+    let message = unsafe {
+        let ptr = ffi::cudaGetErrorString(code);
+        if ptr.is_null() {
+            "unknown error".to_string()
+        } else {
+            CStr::from_ptr(ptr).to_string_lossy().into_owned()
+        }
+    };
+    Err(match code {
+        ffi::CUDA_ERROR_OUT_OF_MEMORY => CudaError::OutOfMemory,
+        101 => CudaError::InvalidDevice,
+        _ => CudaError::Raw { code, message },
+    })
+}
--- a/crates/xserv-cuda/src/ffi.rs
+++ b/crates/xserv-cuda/src/ffi.rs
@@ -0,0 +1,73 @@
+use std::ffi::c_void;
+use std::os::raw::c_char;
+
+pub type CudaStream = *mut c_void;
+pub type CudaEvent = *mut c_void;
+
+pub const CUDA_MEMCPY_H2D: i32 = 1;
+pub const CUDA_MEMCPY_D2H: i32 = 2;
+pub const CUDA_MEMCPY_D2D: i32 = 3;
+
+pub const CUDA_SUCCESS: i32 = 0;
+pub const CUDA_ERROR_OUT_OF_MEMORY: i32 = 2;
+
+#[repr(C)]
+pub struct CudaDeviceProp {
+    pub name: [c_char; 256],
+    pub total_global_mem: usize,
+    pub shared_mem_per_block: usize,
+    pub regs_per_block: i32,
+    pub warp_size: i32,
+    pub max_threads_per_block: i32,
+    pub max_threads_dim: [i32; 3],
+    pub max_grid_size: [i32; 3],
+    pub clock_rate: i32,
+    pub total_const_mem: usize,
+    pub major: i32,
+    pub minor: i32,
+    // There are many more fields; we only read up to what we need.
+    // cudaDeviceProp is a large struct (~1KB). We pad the rest.
+    _pad: [u8; 4096],
+}
+
+extern "C" {
+    // --- Device ---
+    pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
+    pub fn cudaSetDevice(device: i32) -> i32;
+    pub fn cudaGetDevice(device: *mut i32) -> i32;
+    pub fn cudaGetDeviceProperties(prop: *mut CudaDeviceProp, device: i32) -> i32;
+    pub fn cudaDeviceSynchronize() -> i32;
+
+    // --- Memory ---
+    pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
+    pub fn cudaFree(devptr: *mut u8) -> i32;
+    pub fn cudaMallocHost(ptr: *mut *mut u8, size: usize) -> i32;
+    pub fn cudaFreeHost(ptr: *mut u8) -> i32;
+    pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
+    pub fn cudaMemcpyAsync(
+        dst: *mut u8,
+        src: *const u8,
+        count: usize,
+        kind: i32,
+        stream: CudaStream,
+    ) -> i32;
+    pub fn cudaMemset(devptr: *mut u8, value: i32, count: usize) -> i32;
+
+    // --- Stream ---
+    pub fn cudaStreamCreate(stream: *mut CudaStream) -> i32;
+    pub fn cudaStreamDestroy(stream: CudaStream) -> i32;
+    pub fn cudaStreamSynchronize(stream: CudaStream) -> i32;
+
+    // --- Error ---
+    pub fn cudaGetLastError() -> i32;
+    pub fn cudaGetErrorString(error: i32) -> *const c_char;
+
+    // --- Our test kernel ---
+    pub fn launch_vecadd_f32(
+        a: *const f32,
+        b: *const f32,
+        c: *mut f32,
+        n: i32,
+        stream: CudaStream,
+    );
+}
--- a/crates/xserv-cuda/src/lib.rs
+++ b/crates/xserv-cuda/src/lib.rs
@@ -0,0 +1,12 @@
+pub mod allocator;
+pub mod device;
+pub mod error;
+pub mod ffi;
+pub mod memory;
+pub mod stream;
+
+pub use allocator::CachingAllocator;
+pub use device::DeviceInfo;
+pub use error::{CudaError, Result};
+pub use memory::{GpuBuffer, PinnedBuffer};
+pub use stream::CudaStream;
--- a/crates/xserv-cuda/src/memory.rs
+++ b/crates/xserv-cuda/src/memory.rs
@@ -0,0 +1,146 @@
+use crate::error::{self, Result};
+use crate::ffi;
+use crate::stream::CudaStream;
+
+/// RAII wrapper around a GPU memory allocation.
+pub struct GpuBuffer {
+    ptr: *mut u8,
+    len: usize,
+}
+
+impl GpuBuffer {
+    pub fn alloc(len: usize) -> Result<Self> {
+        assert!(len > 0, "cannot allocate 0 bytes on GPU");
+        let mut ptr = std::ptr::null_mut();
+        error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
+        Ok(Self { ptr, len })
+    }
+
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    pub fn as_ptr(&self) -> *const u8 {
+        self.ptr
+    }
+
+    pub fn as_mut_ptr(&mut self) -> *mut u8 {
+        self.ptr
+    }
+
+    /// Copy data from host (CPU) slice to this GPU buffer.
+    pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
+        assert!(src.len() <= self.len, "source larger than buffer");
+        error::check(unsafe {
+            ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
+        })
+    }
+
+    /// Copy data from this GPU buffer to a host (CPU) slice.
+    pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
+        assert!(dst.len() <= self.len, "destination larger than buffer");
+        error::check(unsafe {
+            ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
+        })
+    }
+
+    /// Async copy from host to device on the given stream.
+    /// Safety: `src` must remain valid until the stream operation completes.
+    pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
+        assert!(src.len() <= self.len);
+        error::check(ffi::cudaMemcpyAsync(
+            self.ptr,
+            src.as_ptr(),
+            src.len(),
+            ffi::CUDA_MEMCPY_H2D,
+            stream.as_raw(),
+        ))
+    }
+
+    /// Async copy from device to host on the given stream.
+    /// Safety: `dst` must remain valid until the stream operation completes.
+    pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
+        assert!(dst.len() <= self.len);
+        error::check(ffi::cudaMemcpyAsync(
+            dst.as_mut_ptr(),
+            self.ptr,
+            dst.len(),
+            ffi::CUDA_MEMCPY_D2H,
+            stream.as_raw(),
+        ))
+    }
+
+    /// Copy from another GPU buffer (D2D).
+    pub fn copy_from_device(&mut self, src: &GpuBuffer) -> Result<()> {
+        let n = src.len.min(self.len);
+        error::check(unsafe {
+            ffi::cudaMemcpy(self.ptr, src.ptr, n, ffi::CUDA_MEMCPY_D2D)
+        })
+    }
+
+    /// Fill buffer with zeros.
+    pub fn zero(&mut self) -> Result<()> {
+        error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
+    }
+
+    /// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
+    /// Caller is responsible for eventually calling cudaFree.
+    pub fn into_raw(self) -> (*mut u8, usize) {
+        let ptr = self.ptr;
+        let len = self.len;
+        std::mem::forget(self);
+        (ptr, len)
+    }
+
+    /// Reconstruct a GpuBuffer from a raw pointer + length.
+    /// Safety: ptr must have been allocated with cudaMalloc, len must be correct.
+    pub unsafe fn from_raw(ptr: *mut u8, len: usize) -> Self {
+        Self { ptr, len }
+    }
+}
+
+impl Drop for GpuBuffer {
+    fn drop(&mut self) {
+        if !self.ptr.is_null() {
+            unsafe { ffi::cudaFree(self.ptr) };
+        }
+    }
+}
+
+unsafe impl Send for GpuBuffer {}
+
+/// Pinned (page-locked) host memory for faster H2D/D2H transfers.
+pub struct PinnedBuffer {
+    ptr: *mut u8,
+    len: usize,
+}
+
+impl PinnedBuffer {
+    pub fn alloc(len: usize) -> Result<Self> {
+        let mut ptr = std::ptr::null_mut();
+        error::check(unsafe { ffi::cudaMallocHost(&mut ptr, len) })?;
+        Ok(Self { ptr, len })
+    }
+
+    pub fn as_slice(&self) -> &[u8] {
+        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
+    }
+
+    pub fn as_mut_slice(&mut self) -> &mut [u8] {
+        unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
+    }
+
+    pub fn len(&self) -> usize {
+        self.len
+    }
+}
+
+impl Drop for PinnedBuffer {
+    fn drop(&mut self) {
+        if !self.ptr.is_null() {
+            unsafe { ffi::cudaFreeHost(self.ptr) };
+        }
+    }
+}
+
+unsafe impl Send for PinnedBuffer {}
--- a/crates/xserv-cuda/src/stream.rs
+++ b/crates/xserv-cuda/src/stream.rs
@@ -0,0 +1,33 @@
+use crate::error::{self, Result};
+use crate::ffi;
+
+pub struct CudaStream {
+    raw: ffi::CudaStream,
+}
+
+impl CudaStream {
+    pub fn new() -> Result<Self> {
+        let mut raw = std::ptr::null_mut();
+        error::check(unsafe { ffi::cudaStreamCreate(&mut raw) })?;
+        Ok(Self { raw })
+    }
+
+    pub fn synchronize(&self) -> Result<()> {
+        error::check(unsafe { ffi::cudaStreamSynchronize(self.raw) })
+    }
+
+    pub fn as_raw(&self) -> ffi::CudaStream {
+        self.raw
+    }
+}
+
+impl Drop for CudaStream {
+    fn drop(&mut self) {
+        if !self.raw.is_null() {
+            unsafe { ffi::cudaStreamDestroy(self.raw) };
+        }
+    }
+}
+
+// Can move across threads, but not shared without synchronization
+unsafe impl Send for CudaStream {}
--- a/crates/xserv-cuda/tests/integration.rs
+++ b/crates/xserv-cuda/tests/integration.rs
@@ -0,0 +1,208 @@
+use xserv_cuda::*;
+
+#[test]
+fn test_device_info() {
+    let count = device::device_count().expect("failed to get device count");
+    assert!(count > 0, "no CUDA devices found");
+
+    let info = device::device_info(0).expect("failed to get device info");
+    println!("GPU 0: {}", info.name);
+    println!("  Memory: {} MB", info.total_memory / (1024 * 1024));
+    println!(
+        "  Compute Capability: {}.{}",
+        info.compute_major, info.compute_minor
+    );
+    println!("  SM Count: {}", info.sm_count);
+    println!("  Shared Mem/Block: {} KB", info.shared_mem_per_block / 1024);
+    println!("  Warp Size: {}", info.warp_size);
+    println!("  Max Threads/Block: {}", info.max_threads_per_block);
+
+    assert!(info.total_memory > 0);
+    assert!(info.sm_count > 0);
+}
+
+#[test]
+fn test_gpu_buffer_h2d_d2h() {
+    device::set_device(0).unwrap();
+
+    let data: Vec<u8> = (0..256).map(|i| (i % 256) as u8).collect();
+    let mut buf = GpuBuffer::alloc(data.len()).unwrap();
+    buf.copy_from_host(&data).unwrap();
+
+    let mut out = vec![0u8; data.len()];
+    buf.copy_to_host(&mut out).unwrap();
+
+    assert_eq!(data, out, "H2D → D2H roundtrip mismatch");
+}
+
+#[test]
+fn test_gpu_buffer_large() {
+    device::set_device(0).unwrap();
+
+    let size = 64 * 1024 * 1024; // 64 MB
+    let data: Vec<u8> = (0..size).map(|i| (i % 251) as u8).collect();
+    let mut buf = GpuBuffer::alloc(size).unwrap();
+    buf.copy_from_host(&data).unwrap();
+
+    let mut out = vec![0u8; size];
+    buf.copy_to_host(&mut out).unwrap();
+
+    assert_eq!(data, out, "64MB roundtrip mismatch");
+}
+
+#[test]
+fn test_gpu_buffer_d2d() {
+    device::set_device(0).unwrap();
+
+    let data: Vec<u8> = (0..1024).map(|i| (i % 256) as u8).collect();
+    let mut src = GpuBuffer::alloc(data.len()).unwrap();
+    src.copy_from_host(&data).unwrap();
+
+    let mut dst = GpuBuffer::alloc(data.len()).unwrap();
+    dst.copy_from_device(&src).unwrap();
+
+    let mut out = vec![0u8; data.len()];
+    dst.copy_to_host(&mut out).unwrap();
+
+    assert_eq!(data, out, "D2D copy mismatch");
+}
+
+#[test]
+fn test_gpu_buffer_zero() {
+    device::set_device(0).unwrap();
+
+    let mut buf = GpuBuffer::alloc(1024).unwrap();
+    buf.zero().unwrap();
+
+    let mut out = vec![0xFFu8; 1024];
+    buf.copy_to_host(&mut out).unwrap();
+
+    assert!(out.iter().all(|&b| b == 0), "zero fill failed");
+}
+
+#[test]
+fn test_stream() {
+    device::set_device(0).unwrap();
+
+    let stream = CudaStream::new().unwrap();
+    stream.synchronize().unwrap();
+    // stream drops here, should destroy cleanly
+}
+
+#[test]
+fn test_vecadd_kernel() {
+    device::set_device(0).unwrap();
+
+    let n = 1024;
+    let a: Vec<f32> = (0..n).map(|i| i as f32).collect();
+    let b: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
+    let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();
+
+    let byte_len = n * std::mem::size_of::<f32>();
+
+    let mut d_a = GpuBuffer::alloc(byte_len).unwrap();
+    let mut d_b = GpuBuffer::alloc(byte_len).unwrap();
+    let mut d_c = GpuBuffer::alloc(byte_len).unwrap();
+
+    let a_bytes = unsafe { std::slice::from_raw_parts(a.as_ptr() as *const u8, byte_len) };
+    let b_bytes = unsafe { std::slice::from_raw_parts(b.as_ptr() as *const u8, byte_len) };
+    d_a.copy_from_host(a_bytes).unwrap();
+    d_b.copy_from_host(b_bytes).unwrap();
+
+    unsafe {
+        ffi::launch_vecadd_f32(
+            d_a.as_ptr() as *const f32,
+            d_b.as_ptr() as *const f32,
+            d_c.as_mut_ptr() as *mut f32,
+            n as i32,
+            std::ptr::null_mut(), // default stream
+        );
+    }
+    device::synchronize().unwrap();
+
+    let mut result = vec![0.0f32; n];
+    let result_bytes =
+        unsafe { std::slice::from_raw_parts_mut(result.as_mut_ptr() as *mut u8, byte_len) };
+    d_c.copy_to_host(result_bytes).unwrap();
+
+    assert_eq!(result, expected, "vecadd kernel output mismatch");
+}
+
+#[test]
+fn test_caching_allocator() {
+    device::set_device(0).unwrap();
+
+    let mut alloc = CachingAllocator::new();
+
+    // First allocation: should trigger cudaMalloc
+    let buf1 = alloc.alloc(1024).unwrap();
+    assert_eq!(alloc.stats().cuda_malloc_count, 1);
+    assert_eq!(alloc.stats().cache_hit_count, 0);
+
+    // Return to cache
+    alloc.dealloc(buf1);
+
+    // Second allocation of same size: should hit cache
+    let _buf2 = alloc.alloc(1024).unwrap();
+    assert_eq!(alloc.stats().cuda_malloc_count, 1, "should reuse cached buffer");
+    assert_eq!(alloc.stats().cache_hit_count, 1);
+}
+
+#[test]
+fn test_caching_allocator_different_sizes() {
+    device::set_device(0).unwrap();
+
+    let mut alloc = CachingAllocator::new();
+
+    let buf1 = alloc.alloc(512).unwrap();
+    let buf2 = alloc.alloc(2048).unwrap();
+
+    alloc.dealloc(buf1);
+    alloc.dealloc(buf2);
+
+    // Re-alloc different sizes: each should hit its own bucket
+    let _buf3 = alloc.alloc(512).unwrap();
+    let _buf4 = alloc.alloc(2048).unwrap();
+
+    assert_eq!(alloc.stats().cuda_malloc_count, 2);
+    assert_eq!(alloc.stats().cache_hit_count, 2);
+}
+
+#[test]
+fn test_pinned_memory() {
+    let mut pinned = PinnedBuffer::alloc(4096).unwrap();
+    let slice = pinned.as_mut_slice();
+    for (i, byte) in slice.iter_mut().enumerate() {
+        *byte = (i % 256) as u8;
+    }
+
+    device::set_device(0).unwrap();
+    let mut gpu = GpuBuffer::alloc(4096).unwrap();
+    gpu.copy_from_host(pinned.as_slice()).unwrap();
+
+    let mut out = vec![0u8; 4096];
+    gpu.copy_to_host(&mut out).unwrap();
+
+    assert_eq!(pinned.as_slice(), &out[..]);
+}
+
+#[test]
+fn test_async_copy() {
+    device::set_device(0).unwrap();
+    let stream = CudaStream::new().unwrap();
+
+    let mut pinned = PinnedBuffer::alloc(4096).unwrap();
+    for (i, byte) in pinned.as_mut_slice().iter_mut().enumerate() {
+        *byte = (i % 256) as u8;
+    }
+
+    let mut gpu = GpuBuffer::alloc(4096).unwrap();
+    unsafe { gpu.copy_from_host_async(pinned.as_slice(), &stream).unwrap() };
+    stream.synchronize().unwrap();
+
+    let mut out_pinned = PinnedBuffer::alloc(4096).unwrap();
+    unsafe { gpu.copy_to_host_async(out_pinned.as_mut_slice(), &stream).unwrap() };
+    stream.synchronize().unwrap();
+
+    assert_eq!(pinned.as_slice(), out_pinned.as_slice());
+}
--- a/csrc/test/vecadd.cu
+++ b/csrc/test/vecadd.cu
@@ -0,0 +1,16 @@
+extern "C" {
+
+__global__ void vecadd_f32(const float* a, const float* b, float* c, int n) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        c[idx] = a[idx] + b[idx];
+    }
+}
+
+void launch_vecadd_f32(const float* a, const float* b, float* c, int n, void* stream) {
+    int block = 256;
+    int grid = (n + block - 1) / block;
+    vecadd_f32<<<grid, block, 0, (cudaStream_t)stream>>>(a, b, c, n);
+}
+
+}
--- a/docs/00-roadmap.md
+++ b/docs/00-roadmap.md
--- a/docs/01-cuda-ffi.md
+++ b/docs/01-cuda-ffi.md
@@ -0,0 +1,80 @@
+# Phase 0+1: CUDA FFI Infrastructure — Design Document
+
+## Goal
+
+Build `xserv-cuda`, a Rust crate that wraps CUDA Runtime API with safe abstractions:
+- Device query and selection
+- GPU memory allocation with RAII (GpuBuffer)
+- Caching allocator (avoid repeated cudaMalloc/cudaFree)
+- CUDA streams for async operations
+- Host↔Device memory transfers
+- Error handling wrapping all CUDA calls
+
+## Module Layout
+
+```
+crates/xserv-cuda/
+├── Cargo.toml
+├── build.rs          # compiles csrc/*.cu via cc crate
+└── src/
+    ├── lib.rs        # re-exports
+    ├── ffi.rs        # raw extern "C" bindings to CUDA runtime
+    ├── error.rs      # CudaError type
+    ├── device.rs     # device query, DeviceInfo
+    ├── stream.rs     # CudaStream wrapper
+    ├── memory.rs     # GpuBuffer, H2D/D2H/D2D copy
+    └── allocator.rs  # CachingAllocator
+```
+
+## Key Design Decisions
+
+### FFI Bindings (ffi.rs)
+Hand-written extern "C" bindings (~25 functions). No bindgen — keeps things explicit and readable.
+
+Core functions needed:
+- Memory: cudaMalloc, cudaFree, cudaMemcpy, cudaMemcpyAsync, cudaMallocHost, cudaFreeHost
+- Stream: cudaStreamCreate, cudaStreamDestroy, cudaStreamSynchronize
+- Device: cudaGetDeviceCount, cudaSetDevice, cudaGetDevice, cudaGetDeviceProperties
+- Sync: cudaDeviceSynchronize
+- Error: cudaGetLastError, cudaGetErrorString
+
+### Error Handling (error.rs)
+Every CUDA call returns cudaError_t. We wrap all calls:
+```rust
+pub(crate) fn check(code: i32) -> Result<(), CudaError>
+```
+
+### GpuBuffer (memory.rs)
+RAII wrapper around a GPU pointer. Drop frees memory.
+```rust
+pub struct GpuBuffer {
+    ptr: *mut u8,
+    len: usize,       // in bytes
+    device: u32,
+}
+```
+- No Clone (explicit copy_from instead)
+- Send + !Sync (can move across threads, but not shared)
+
+### CachingAllocator (allocator.rs)
+Avoids cudaMalloc/cudaFree per allocation. Maintains a free-list keyed by size bucket.
+
+Bucket boundaries: round up to next power of 2, minimum 512 bytes.
+- alloc(size) → find bucket, pop from free list or cudaMalloc
+- dealloc(ptr, size) → push to free list (don't cudaFree)
+- trim() → actually cudaFree everything in free lists
+
+### CudaStream (stream.rs)
+Wraps cudaStream_t. RAII with Drop calling cudaStreamDestroy.
+
+## Build Pipeline
+- `csrc/test/vecadd.cu`: minimal vector-add kernel for smoke test
+- `build.rs` uses `cc` crate to compile .cu files, link CUDA runtime
+
+## Test Plan
+1. Device info: print GPU name, memory, compute capability, SM count
+2. GpuBuffer: alloc 1GB, H2D copy, D2H copy, verify data
+3. Vector add kernel: launch from Rust, verify output
+4. CachingAllocator: alloc→free→realloc same size uses cache (no new cudaMalloc)
+5. Multi-stream: two concurrent memcpy on different streams
+6. Benchmark: caching allocator vs raw cudaMalloc (100 cycles)
--- a/tools/sync-and-build.sh
+++ b/tools/sync-and-build.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Sync local project to dash5 and build/test there.
+# Usage: ./tools/sync-and-build.sh [test|build|run]
+
+set -e
+
+REMOTE="dash5"
+REMOTE_DIR="/opt/wjh/projects/xserv"
+LOCAL_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+
+ACTION="${1:-build}"
+
+echo "=== Syncing to $REMOTE:$REMOTE_DIR ==="
+ssh "$REMOTE" "mkdir -p $REMOTE_DIR"
+rsync -az --delete \
+    --exclude target \
+    --exclude .git \
+    "$LOCAL_DIR/" "$REMOTE:$REMOTE_DIR/"
+
+echo "=== Running: cargo $ACTION ==="
+ssh "$REMOTE" "source \$HOME/.cargo/env && \
+    export PATH=/usr/local/cuda/bin:\$PATH && \
+    export CUDA_HOME=/usr/local/cuda && \
+    cd $REMOTE_DIR && \
+    cargo $ACTION --release 2>&1"