phase 0+1: project scaffold + xserv-cuda crate

- Cargo workspace with xserv-cuda crate - CUDA FFI bindings (cudart: memory, stream, device, error) - GpuBuffer RAII wrapper with H2D/D2H/D2D copy - CudaStream wrapper with RAII Drop - CachingAllocator with size-bucketed free lists - PinnedBuffer for page-locked host memory - Device info query via cudaDeviceGetAttribute - Vector-add CUDA kernel smoke test - Integration test suite (11 tests) - build.rs: cc crate compiles .cu for SM 12.0 - sync-and-build.sh for remote build on dash5 - Roadmap doc (docs/00-roadmap.md) and Phase 0+1 design doc Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-21 18:40:22 +08:00
commit 9806b4db35
16 changed files with 2629 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,9 @@
 /target
 *.o
 *.so
 *.a
 *.ptx
 *.cubin
 **/*.rs.bk
 .env
 *.npy
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -0,0 +1,14 @@
 [workspace]
 resolver = "2"
 members = [
    "crates/xserv-cuda",
 ]
 [workspace.package]
 version = "0.1.0"
 edition = "2024"
 license = "MIT"
 [workspace.dependencies]
 half = "2"
 smallvec = "1"
--- a/crates/xserv-cuda/Cargo.toml
+++ b/crates/xserv-cuda/Cargo.toml
@@ -0,0 +1,10 @@
 [package]
 name = "xserv-cuda"
 version.workspace = true
 edition.workspace = true
 [build-dependencies]
 cc = { version = "1", features = ["cuda"] }
 [dev-dependencies]
 rand = "0.9"
--- a/crates/xserv-cuda/build.rs
+++ b/crates/xserv-cuda/build.rs
@@ -0,0 +1,20 @@
 use std::env;
 /// Build script entry point.
 ///
 /// Emits the linker directives for the CUDA runtime (`cudart`) and driver
 /// (`cuda`) libraries and compiles `csrc/test/vecadd.cu` into the static
 /// library `xserv_cuda_kernels` through the `cc` crate. The toolkit root is
 /// read from `CUDA_HOME`, then `CUDA_PATH`, defaulting to `/usr/local/cuda`.
 fn main() {
    // The link search path below is derived from these variables, so the
    // script must be re-run when either of them changes.
    for var in ["CUDA_HOME", "CUDA_PATH"] {
        println!("cargo:rerun-if-env-changed={var}");
    }
    let cuda_path = env::var("CUDA_HOME")
        .or_else(|_| env::var("CUDA_PATH"))
        .unwrap_or_else(|_| "/usr/local/cuda".to_string());
    println!("cargo:rustc-link-search=native={cuda_path}/lib64");
    println!("cargo:rustc-link-lib=dylib=cudart");
    println!("cargo:rustc-link-lib=dylib=cuda");
    cc::Build::new()
        .cuda(true)
        .cudart("shared")
        // Device code is generated for compute capability 12.0 only.
        .flag("-gencode=arch=compute_120,code=sm_120")
        .file("../../csrc/test/vecadd.cu")
        .compile("xserv_cuda_kernels");
    println!("cargo:rerun-if-changed=../../csrc/");
 }
--- a/crates/xserv-cuda/src/allocator.rs
+++ b/crates/xserv-cuda/src/allocator.rs
@@ -0,0 +1,109 @@
 use crate::error::Result;
 use crate::ffi;
 use crate::memory::GpuBuffer;
 use std::collections::HashMap;
 /// Caching allocator that reuses freed GPU buffers instead of calling
 /// cudaMalloc/cudaFree on every allocation.
 ///
 /// Freed buffers are kept in a per-size-bucket free list. On allocation,
 /// we first check the free list for a buffer of matching (rounded) size.
 pub struct CachingAllocator {
    /// Cached device allocations keyed by bucket size. Each entry is the raw
    /// device pointer together with the length recorded when the buffer was
    /// handed back through `dealloc`.
    free_lists: HashMap<usize, Vec<(*mut u8, usize)>>,
    /// Running counters, exposed read-only through `stats()`.
    stats: AllocStats,
 }
 /// Counters describing the activity of a `CachingAllocator`.
 #[derive(Debug, Default, Clone)]
 pub struct AllocStats {
    /// Number of calls to `CachingAllocator::alloc`.
    pub alloc_count: u64,
    /// Number of `alloc` calls served from a free list.
    pub cache_hit_count: u64,
    /// Number of `alloc` calls that fell through to `GpuBuffer::alloc`
    /// (counted before the allocation is attempted).
    pub cuda_malloc_count: u64,
    /// Number of device buffers the allocator has released with `cudaFree`.
    pub cuda_free_count: u64,
    /// Bytes currently handed out to callers (buffer lengths, i.e. bucket sizes).
    pub current_allocated: usize,
    /// Highest value `current_allocated` has reached so far.
    pub peak_allocated: usize,
 }
 impl CachingAllocator {
    /// Create an allocator with empty free lists and zeroed statistics.
    pub fn new() -> Self {
        Self {
            free_lists: HashMap::new(),
            stats: AllocStats::default(),
        }
    }

    /// Allocate a buffer of at least `size` bytes.
    ///
    /// The request is rounded up to its bucket size. A cached buffer from that
    /// bucket is reused when one is available; otherwise a new buffer of the
    /// bucket size is obtained from `GpuBuffer::alloc`.
    ///
    /// # Errors
    /// Propagates the error from `GpuBuffer::alloc` when a fresh allocation
    /// is needed and fails.
    pub fn alloc(&mut self, size: usize) -> Result<GpuBuffer> {
        let bucket = bucket_size(size);
        self.stats.alloc_count += 1;
        let cached = self.free_lists.get_mut(&bucket).and_then(Vec::pop);
        let buf = match cached {
            Some((ptr, len)) => {
                self.stats.cache_hit_count += 1;
                // SAFETY: the pair was produced by `GpuBuffer::into_raw` in
                // `dealloc` and has been removed from the free list, so this
                // is the only owner of the allocation.
                unsafe { GpuBuffer::from_raw(ptr, len) }
            }
            None => {
                self.stats.cuda_malloc_count += 1;
                GpuBuffer::alloc(bucket)?
            }
        };
        self.record_allocated(buf.len());
        Ok(buf)
    }

    /// Return a buffer to the cache instead of freeing it.
    ///
    /// Only buffers whose length is exactly a bucket size are cached. Any
    /// other buffer cannot have come from `alloc`; filing it under the
    /// rounded-up bucket would let a later `alloc` hand out a buffer smaller
    /// than requested, so it is freed immediately instead.
    pub fn dealloc(&mut self, buf: GpuBuffer) {
        let len = buf.len();
        self.stats.current_allocated = self.stats.current_allocated.saturating_sub(len);
        if bucket_size(len) != len {
            // Dropping the buffer releases it through `GpuBuffer`'s `Drop`.
            drop(buf);
            self.stats.cuda_free_count += 1;
            return;
        }
        let (ptr, len) = buf.into_raw();
        self.free_lists.entry(len).or_default().push((ptr, len));
    }

    /// Actually free all cached buffers.
    ///
    /// The status returned by `cudaFree` is ignored; every cached pointer is
    /// counted in `cuda_free_count` regardless.
    pub fn trim(&mut self) {
        for (_bucket, list) in self.free_lists.drain() {
            for (ptr, _len) in list {
                // SAFETY: `ptr` came from `GpuBuffer::into_raw` and is removed
                // from the cache by the surrounding `drain`, so it is freed
                // exactly once here.
                unsafe { ffi::cudaFree(ptr) };
                self.stats.cuda_free_count += 1;
            }
        }
    }

    /// Read-only view of the allocator's counters.
    pub fn stats(&self) -> &AllocStats {
        &self.stats
    }

    /// Account for `len` newly handed-out bytes and update the high-water mark.
    fn record_allocated(&mut self, len: usize) {
        self.stats.current_allocated += len;
        self.stats.peak_allocated = self.stats.peak_allocated.max(self.stats.current_allocated);
    }
 }

 impl Default for CachingAllocator {
    /// Equivalent to `CachingAllocator::new()`.
    fn default() -> Self {
        Self::new()
    }
 }
 impl Drop for CachingAllocator {
    /// Releases every buffer still held in the free lists.
    fn drop(&mut self) {
        Self::trim(self);
    }
 }
 /// Round `size` up to its bucket: the next power of two, with a 512-byte floor.
 ///
 /// Requests larger than the greatest power of two representable in `usize`
 /// saturate to `usize::MAX` instead of overflowing (`next_power_of_two`
 /// panics in debug builds and yields 0 in release builds in that case).
 fn bucket_size(size: usize) -> usize {
    const MIN_BUCKET: usize = 512;
    size.max(MIN_BUCKET)
        .checked_next_power_of_two()
        .unwrap_or(usize::MAX)
 }
 #[cfg(test)]
 mod tests {
    use super::*;

    /// Table of (requested size, expected bucket) pairs around the 512-byte
    /// floor and the power-of-two boundaries.
    #[test]
    fn test_bucket_size() {
        let cases = [
            (1, 512),
            (512, 512),
            (513, 1024),
            (1024, 1024),
            (1025, 2048),
            (1 << 20, 1 << 20),
        ];
        for (requested, expected) in cases {
            assert_eq!(bucket_size(requested), expected);
        }
    }
 }
--- a/crates/xserv-cuda/src/device.rs
+++ b/crates/xserv-cuda/src/device.rs
@@ -0,0 +1,77 @@
 use crate::error::{self, Result};
 use crate::ffi;
 use std::ffi::CStr;
 /// Snapshot of the static properties of one CUDA device, as produced by
 /// `device_info`.
 #[derive(Debug, Clone)]
 pub struct DeviceInfo {
    /// Device ordinal this record was queried for.
    pub index: u32,
    /// Device name read from the `cudaGetDeviceProperties` result.
    pub name: String,
    /// Total global memory in bytes, taken from the `total_global_mem`
    /// field of the device properties.
    pub total_memory: usize,
    /// Compute capability major number (device attribute 75).
    pub compute_major: i32,
    /// Compute capability minor number (device attribute 76).
    pub compute_minor: i32,
    /// Multiprocessor count (device attribute 16).
    pub sm_count: i32,
    /// Shared memory per block in bytes (device attribute 8).
    pub shared_mem_per_block: usize,
    /// Warp size (device attribute 10).
    pub warp_size: i32,
    /// Maximum threads per block (device attribute 1).
    pub max_threads_per_block: i32,
 }
 // Edition 2024 (set in the workspace manifest) requires extern blocks to be
 // written `unsafe extern`; items inside remain unsafe to call.
 unsafe extern "C" {
    /// CUDA runtime query for a single integer attribute `attr` of `device`.
    /// The result is written to `value`; the return value is the status code.
    fn cudaDeviceGetAttribute(value: *mut i32, attr: i32, device: i32) -> i32;
 }
 fn get_attr(attr: i32, device: u32) -> Result<i32> {
    let mut value = 0;
    error::check(unsafe { cudaDeviceGetAttribute(&mut value, attr, device as i32) })?;
    Ok(value)
 }
 pub fn device_count() -> Result<i32> {
    let mut count = 0;
    error::check(unsafe { ffi::cudaGetDeviceCount(&mut count) })?;
    Ok(count)
 }
 pub fn set_device(device: u32) -> Result<()> {
    error::check(unsafe { ffi::cudaSetDevice(device as i32) })
 }
 pub fn current_device() -> Result<u32> {
    let mut dev = 0;
    error::check(unsafe { ffi::cudaGetDevice(&mut dev) })?;
    Ok(dev as u32)
 }
 /// Collect the static properties of `device` into a `DeviceInfo`.
 ///
 /// The name and total memory come from `cudaGetDeviceProperties`; every
 /// other field is queried individually with `cudaDeviceGetAttribute`, which
 /// does not depend on the property struct's layout.
 ///
 /// NOTE(review): `total_memory` is read from `prop.total_global_mem`, so it
 /// is only correct if `ffi::CudaDeviceProp` matches the field layout of the
 /// installed toolkit's `cudaDeviceProp` — confirm against the headers.
 ///
 /// # Errors
 /// Returns the first CUDA error reported by any of the queries.
 pub fn device_info(device: u32) -> Result<DeviceInfo> {
    // Attribute IDs (`cudaDeviceAttr`) from cuda_runtime_api.h.
    const MAX_THREADS_PER_BLOCK: i32 = 1;
    const SHARED_MEM_PER_BLOCK: i32 = 8;
    const WARP_SIZE: i32 = 10;
    const MULTI_PROCESSOR_COUNT: i32 = 16;
    const COMPUTE_MAJOR: i32 = 75;
    const COMPUTE_MINOR: i32 = 76;

    // SAFETY: `CudaDeviceProp` is made only of integers and integer arrays,
    // for which the all-zero bit pattern is a valid value.
    let mut prop = unsafe { std::mem::zeroed::<ffi::CudaDeviceProp>() };
    // SAFETY: `prop` is a live, writable local for the duration of the call.
    error::check(unsafe { ffi::cudaGetDeviceProperties(&mut prop, device as i32) })?;

    // Read the name from the 256-byte field only, never past its end.
    let raw_name: Vec<u8> = prop.name.iter().map(|&c| c as u8).collect();
    let name = match CStr::from_bytes_until_nul(&raw_name) {
        Ok(text) => text.to_string_lossy().into_owned(),
        // No terminator inside the field: keep all 256 bytes.
        Err(_) => String::from_utf8_lossy(&raw_name).into_owned(),
    };

    Ok(DeviceInfo {
        index: device,
        name,
        total_memory: prop.total_global_mem,
        compute_major: get_attr(COMPUTE_MAJOR, device)?,
        compute_minor: get_attr(COMPUTE_MINOR, device)?,
        sm_count: get_attr(MULTI_PROCESSOR_COUNT, device)?,
        shared_mem_per_block: get_attr(SHARED_MEM_PER_BLOCK, device)? as usize,
        warp_size: get_attr(WARP_SIZE, device)?,
        max_threads_per_block: get_attr(MAX_THREADS_PER_BLOCK, device)?,
    })
 }
 pub fn synchronize() -> Result<()> {
    error::check(unsafe { ffi::cudaDeviceSynchronize() })
 }
--- a/crates/xserv-cuda/src/error.rs
+++ b/crates/xserv-cuda/src/error.rs
@@ -0,0 +1,43 @@
 use crate::ffi;
 use std::ffi::CStr;
 use std::fmt;
 /// Error produced when a CUDA runtime call returns a non-success status.
 #[derive(Debug)]
 pub enum CudaError {
    /// The call returned `ffi::CUDA_ERROR_OUT_OF_MEMORY`.
    OutOfMemory,
    /// The call returned status code 101.
    InvalidDevice,
    /// Any other status, with the description from `cudaGetErrorString`.
    Raw { code: i32, message: String },
 }
 impl fmt::Display for CudaError {
    /// Writes a one-line human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            CudaError::Raw { code, message } => write!(f, "CUDA error {code}: {message}"),
            CudaError::OutOfMemory => f.write_str("CUDA out of memory"),
            CudaError::InvalidDevice => f.write_str("CUDA invalid device"),
        }
    }
 }
 // No `source()` override: `CudaError` does not wrap another error.
 impl std::error::Error for CudaError {}
 /// Result alias used throughout the crate, with `CudaError` as the error type.
 pub type Result<T> = std::result::Result<T, CudaError>;
 /// Translate a CUDA runtime status code into a `Result`.
 ///
 /// `ffi::CUDA_SUCCESS` maps to `Ok(())`. The out-of-memory and
 /// invalid-device codes map to their dedicated variants; every other code
 /// becomes `CudaError::Raw` carrying the runtime's description. The
 /// description is only looked up for the `Raw` case, since the dedicated
 /// variants do not store it.
 pub(crate) fn check(code: i32) -> Result<()> {
    // `cudaErrorInvalidDevice` in the runtime's `cudaError_t` enumeration.
    const CUDA_ERROR_INVALID_DEVICE: i32 = 101;
    match code {
        ffi::CUDA_SUCCESS => Ok(()),
        ffi::CUDA_ERROR_OUT_OF_MEMORY => Err(CudaError::OutOfMemory),
        CUDA_ERROR_INVALID_DEVICE => Err(CudaError::InvalidDevice),
        _ => Err(CudaError::Raw {
            code,
            message: error_message(code),
        }),
    }
 }

 /// Fetch the runtime's description for `code`, or `"unknown error"` when the
 /// runtime returns a null pointer.
 fn error_message(code: i32) -> String {
    // SAFETY: `cudaGetErrorString` takes the code by value and has no
    // pointer arguments.
    let ptr = unsafe { ffi::cudaGetErrorString(code) };
    if ptr.is_null() {
        return "unknown error".to_string();
    }
    // SAFETY: `ptr` is non-null; it is assumed to point at a NUL-terminated
    // string owned by the runtime, as the CUDA documentation describes.
    unsafe { CStr::from_ptr(ptr) }.to_string_lossy().into_owned()
 }
--- a/crates/xserv-cuda/src/ffi.rs
+++ b/crates/xserv-cuda/src/ffi.rs
@@ -0,0 +1,73 @@
 use std::ffi::c_void;
 use std::os::raw::c_char;
 /// Opaque stream handle as passed across the FFI boundary.
 pub type CudaStream = *mut c_void;
 /// Opaque event handle; not referenced by any binding visible in this file.
 pub type CudaEvent = *mut c_void;
 // Values for the `kind` argument of `cudaMemcpy` / `cudaMemcpyAsync`:
 // host-to-device, device-to-host and device-to-device.
 pub const CUDA_MEMCPY_H2D: i32 = 1;
 pub const CUDA_MEMCPY_D2H: i32 = 2;
 pub const CUDA_MEMCPY_D2D: i32 = 3;
 // Status codes compared against the `i32` returned by the runtime calls.
 pub const CUDA_SUCCESS: i32 = 0;
 pub const CUDA_ERROR_OUT_OF_MEMORY: i32 = 2;
 /// Leading portion of the CUDA runtime's `cudaDeviceProp`, the structure
 /// filled in by `cudaGetDeviceProperties`.
 ///
 /// The field order follows the CUDA 12 `driver_types.h` header: `uuid`,
 /// `luid` and `luidDeviceNodeMask` sit between `name` and `totalGlobalMem`,
 /// and `memPitch` sits between `warpSize` and `maxThreadsPerBlock`. Leaving
 /// them out shifts every later field to the wrong offset.
 ///
 /// NOTE(review): fields after `total_global_mem` have changed between
 /// toolkit releases; prefer `cudaDeviceGetAttribute` for anything beyond
 /// `name` and `total_global_mem`, and confirm against the installed headers.
 #[repr(C)]
 pub struct CudaDeviceProp {
    /// NUL-terminated device name.
    pub name: [c_char; 256],
    /// 16-byte device UUID (`cudaUUID_t`).
    pub uuid: [u8; 16],
    /// 8-byte locally unique identifier.
    pub luid: [c_char; 8],
    /// LUID device node mask.
    pub luid_device_node_mask: u32,
    /// Global memory available on the device, in bytes.
    pub total_global_mem: usize,
    pub shared_mem_per_block: usize,
    pub regs_per_block: i32,
    pub warp_size: i32,
    /// Maximum pitch in bytes allowed by memory copies.
    pub mem_pitch: usize,
    pub max_threads_per_block: i32,
    pub max_threads_dim: [i32; 3],
    pub max_grid_size: [i32; 3],
    pub clock_rate: i32,
    pub total_const_mem: usize,
    pub major: i32,
    pub minor: i32,
    // There are many more fields; we only read up to what we need.
    // The padding keeps the struct larger than the real `cudaDeviceProp`
    // (~1KB) so the runtime never writes past the end of it.
    _pad: [u8; 4096],
 }
 // Raw bindings to the CUDA runtime plus the test kernel launcher compiled by
 // the build script. Every runtime function returns its status code as `i32`,
 // except `cudaGetErrorString`, which returns a C string.
 //
 // Edition 2024 (set in the workspace manifest) requires extern blocks to be
 // written `unsafe extern`; every item inside remains unsafe to call.
 //
 // NOTE(review): these are linked by bare symbol name, which bypasses any
 // header macros that redirect to versioned entry points — confirm the
 // struct layouts match the symbols actually resolved.
 unsafe extern "C" {
    // --- Device ---
    pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
    pub fn cudaSetDevice(device: i32) -> i32;
    pub fn cudaGetDevice(device: *mut i32) -> i32;
    pub fn cudaGetDeviceProperties(prop: *mut CudaDeviceProp, device: i32) -> i32;
    pub fn cudaDeviceSynchronize() -> i32;
    // --- Memory ---
    pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
    pub fn cudaFree(devptr: *mut u8) -> i32;
    pub fn cudaMallocHost(ptr: *mut *mut u8, size: usize) -> i32;
    pub fn cudaFreeHost(ptr: *mut u8) -> i32;
    /// `kind` is one of the `CUDA_MEMCPY_*` constants.
    pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
    /// Stream-ordered variant of `cudaMemcpy`.
    pub fn cudaMemcpyAsync(
        dst: *mut u8,
        src: *const u8,
        count: usize,
        kind: i32,
        stream: CudaStream,
    ) -> i32;
    pub fn cudaMemset(devptr: *mut u8, value: i32, count: usize) -> i32;
    // --- Stream ---
    pub fn cudaStreamCreate(stream: *mut CudaStream) -> i32;
    pub fn cudaStreamDestroy(stream: CudaStream) -> i32;
    pub fn cudaStreamSynchronize(stream: CudaStream) -> i32;
    // --- Error ---
    pub fn cudaGetLastError() -> i32;
    pub fn cudaGetErrorString(error: i32) -> *const c_char;
    // --- Our test kernel ---
    /// Launches the vector-add kernel (`c[i] = a[i] + b[i]` for `i < n`) on
    /// `stream`. Returns nothing, so launch failures are not reported here.
    pub fn launch_vecadd_f32(
        a: *const f32,
        b: *const f32,
        c: *mut f32,
        n: i32,
        stream: CudaStream,
    );
 }
--- a/crates/xserv-cuda/src/lib.rs
+++ b/crates/xserv-cuda/src/lib.rs
@@ -0,0 +1,12 @@
 // Public modules of the crate.
 pub mod allocator;
 pub mod device;
 pub mod error;
 pub mod ffi;
 pub mod memory;
 pub mod stream;
 // Convenience re-exports so the main types are reachable from the crate root.
 pub use allocator::CachingAllocator;
 pub use device::DeviceInfo;
 pub use error::{CudaError, Result};
 pub use memory::{GpuBuffer, PinnedBuffer};
 pub use stream::CudaStream;
--- a/crates/xserv-cuda/src/memory.rs
+++ b/crates/xserv-cuda/src/memory.rs
@@ -0,0 +1,146 @@
 use crate::error::{self, Result};
 use crate::ffi;
 use crate::stream::CudaStream;
 /// RAII wrapper around a GPU memory allocation.
 pub struct GpuBuffer {
    /// Device pointer, as produced by `cudaMalloc` or supplied to `from_raw`.
    ptr: *mut u8,
    /// Size of the allocation in bytes.
    len: usize,
 }
 impl GpuBuffer {
    /// Allocate `len` bytes of device memory with `cudaMalloc`.
    ///
    /// # Panics
    /// Panics if `len` is zero.
    ///
    /// # Errors
    /// Returns the translated CUDA error if `cudaMalloc` fails.
    pub fn alloc(len: usize) -> Result<Self> {
        assert!(len > 0, "cannot allocate 0 bytes on GPU");
        let mut ptr = std::ptr::null_mut();
        // SAFETY: `ptr` is a live local that `cudaMalloc` writes into.
        error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
        Ok(Self { ptr, len })
    }

    /// Size of the allocation in bytes.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` when the buffer length is zero.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Raw device pointer for read access.
    pub fn as_ptr(&self) -> *const u8 {
        self.ptr
    }

    /// Raw device pointer for write access.
    pub fn as_mut_ptr(&mut self) -> *mut u8 {
        self.ptr
    }

    /// Copy data from host (CPU) slice to this GPU buffer.
    ///
    /// # Panics
    /// Panics if `src` is longer than the buffer.
    pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
        assert!(src.len() <= self.len, "source larger than buffer");
        // SAFETY: `src` is valid for `src.len()` bytes and the assertion
        // above keeps the write inside this allocation.
        error::check(unsafe {
            ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
        })
    }

    /// Copy data from this GPU buffer to a host (CPU) slice.
    ///
    /// # Panics
    /// Panics if `dst` is longer than the buffer.
    pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
        assert!(dst.len() <= self.len, "destination larger than buffer");
        // SAFETY: `dst` is valid for `dst.len()` bytes and the assertion
        // above keeps the read inside this allocation.
        error::check(unsafe {
            ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
        })
    }

    /// Async copy from host to device on the given stream.
    ///
    /// # Safety
    /// `src` must remain valid until the stream operation completes.
    ///
    /// # Panics
    /// Panics if `src` is longer than the buffer.
    pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
        assert!(src.len() <= self.len);
        // SAFETY: the length is bounded by the assertion above; the caller
        // guarantees `src` outlives the enqueued copy.
        let status = unsafe {
            ffi::cudaMemcpyAsync(
                self.ptr,
                src.as_ptr(),
                src.len(),
                ffi::CUDA_MEMCPY_H2D,
                stream.as_raw(),
            )
        };
        error::check(status)
    }

    /// Async copy from device to host on the given stream.
    ///
    /// # Safety
    /// `dst` must remain valid until the stream operation completes.
    ///
    /// # Panics
    /// Panics if `dst` is longer than the buffer.
    pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
        assert!(dst.len() <= self.len);
        // SAFETY: the length is bounded by the assertion above; the caller
        // guarantees `dst` outlives the enqueued copy.
        let status = unsafe {
            ffi::cudaMemcpyAsync(
                dst.as_mut_ptr(),
                self.ptr,
                dst.len(),
                ffi::CUDA_MEMCPY_D2H,
                stream.as_raw(),
            )
        };
        error::check(status)
    }

    /// Copy from another GPU buffer (D2D).
    ///
    /// Copies `min(src.len(), self.len())` bytes; a size mismatch is not an
    /// error.
    pub fn copy_from_device(&mut self, src: &GpuBuffer) -> Result<()> {
        let n = src.len.min(self.len);
        // SAFETY: `n` does not exceed the length of either allocation.
        error::check(unsafe {
            ffi::cudaMemcpy(self.ptr, src.ptr, n, ffi::CUDA_MEMCPY_D2D)
        })
    }

    /// Fill buffer with zeros.
    pub fn zero(&mut self) -> Result<()> {
        // SAFETY: the write covers exactly this allocation.
        error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
    }

    /// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
    /// Caller is responsible for eventually calling cudaFree.
    pub fn into_raw(self) -> (*mut u8, usize) {
        let ptr = self.ptr;
        let len = self.len;
        // Skip `Drop`, which would otherwise free the allocation.
        std::mem::forget(self);
        (ptr, len)
    }

    /// Reconstruct a GpuBuffer from a raw pointer + length.
    ///
    /// # Safety
    /// `ptr` must have been allocated with cudaMalloc and `len` must be its
    /// correct size. The returned buffer frees `ptr` on drop, so no other
    /// owner may free it.
    pub unsafe fn from_raw(ptr: *mut u8, len: usize) -> Self {
        Self { ptr, len }
    }
 }
 impl Drop for GpuBuffer {
    /// Frees the device allocation unless the pointer is null; the status
    /// returned by `cudaFree` is discarded.
    fn drop(&mut self) {
        if self.ptr.is_null() {
            return;
        }
        unsafe { ffi::cudaFree(self.ptr) };
    }
 }
 // Only `Send` is implemented; there is no `Sync` impl for this type.
 unsafe impl Send for GpuBuffer {}
 /// Pinned (page-locked) host memory for faster H2D/D2H transfers.
 pub struct PinnedBuffer {
    /// Host pointer written by `cudaMallocHost`.
    ptr: *mut u8,
    /// Size of the allocation in bytes.
    len: usize,
 }
 impl PinnedBuffer {
    /// Allocate `len` bytes of host memory with `cudaMallocHost`.
    ///
    /// # Errors
    /// Returns the translated CUDA error if the allocation fails.
    pub fn alloc(len: usize) -> Result<Self> {
        let mut ptr = std::ptr::null_mut();
        // SAFETY: `ptr` is a live local that `cudaMallocHost` writes into.
        error::check(unsafe { ffi::cudaMallocHost(&mut ptr, len) })?;
        Ok(Self { ptr, len })
    }

    /// View the buffer as a byte slice.
    ///
    /// An empty or null buffer yields an empty slice:
    /// `slice::from_raw_parts` requires a non-null pointer even for length 0,
    /// and `alloc` does not reject zero-length requests.
    pub fn as_slice(&self) -> &[u8] {
        if self.ptr.is_null() || self.len == 0 {
            return &[];
        }
        // SAFETY: `ptr` is non-null and was allocated with `len` bytes; the
        // borrow of `self` keeps the allocation alive.
        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
    }

    /// View the buffer as a mutable byte slice; empty for an empty or null
    /// buffer, for the same reason as `as_slice`.
    pub fn as_mut_slice(&mut self) -> &mut [u8] {
        if self.ptr.is_null() || self.len == 0 {
            return &mut [];
        }
        // SAFETY: as in `as_slice`, and `&mut self` guarantees exclusivity.
        unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
    }

    /// Size of the allocation in bytes.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` when the buffer length is zero.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }
 }
 impl Drop for PinnedBuffer {
    /// Releases the host allocation unless the pointer is null; the status
    /// returned by `cudaFreeHost` is discarded.
    fn drop(&mut self) {
        if self.ptr.is_null() {
            return;
        }
        unsafe { ffi::cudaFreeHost(self.ptr) };
    }
 }
 // Only `Send` is implemented; there is no `Sync` impl for this type.
 unsafe impl Send for PinnedBuffer {}
--- a/crates/xserv-cuda/src/stream.rs
+++ b/crates/xserv-cuda/src/stream.rs
@@ -0,0 +1,33 @@
 use crate::error::{self, Result};
 use crate::ffi;
 /// Owning wrapper around a raw CUDA stream handle.
 pub struct CudaStream {
    /// Handle written by `cudaStreamCreate`.
    raw: ffi::CudaStream,
 }
 impl CudaStream {
    pub fn new() -> Result<Self> {
        let mut raw = std::ptr::null_mut();
        error::check(unsafe { ffi::cudaStreamCreate(&mut raw) })?;
        Ok(Self { raw })
    }
    pub fn synchronize(&self) -> Result<()> {
        error::check(unsafe { ffi::cudaStreamSynchronize(self.raw) })
    }
    pub fn as_raw(&self) -> ffi::CudaStream {
        self.raw
    }
 }
 impl Drop for CudaStream {
    /// Destroys the stream unless the handle is null; the status returned by
    /// `cudaStreamDestroy` is discarded.
    fn drop(&mut self) {
        if self.raw.is_null() {
            return;
        }
        unsafe { ffi::cudaStreamDestroy(self.raw) };
    }
 }
 // Movable between threads; sharing across threads needs external
 // synchronization, so only `Send` is implemented.
 unsafe impl Send for CudaStream {}
--- a/crates/xserv-cuda/tests/integration.rs
+++ b/crates/xserv-cuda/tests/integration.rs
@@ -0,0 +1,208 @@
 use xserv_cuda::*;
 /// Queries device 0, prints its properties and checks the basic invariants.
 #[test]
 fn test_device_info() {
    let n_devices = device::device_count().expect("failed to get device count");
    assert!(n_devices > 0, "no CUDA devices found");

    let gpu = device::device_info(0).expect("failed to get device info");
    let memory_mb = gpu.total_memory / (1024 * 1024);
    let shared_kb = gpu.shared_mem_per_block / 1024;

    println!("GPU 0: {}", gpu.name);
    println!("  Memory: {} MB", memory_mb);
    println!(
        "  Compute Capability: {}.{}",
        gpu.compute_major, gpu.compute_minor
    );
    println!("  SM Count: {}", gpu.sm_count);
    println!("  Shared Mem/Block: {} KB", shared_kb);
    println!("  Warp Size: {}", gpu.warp_size);
    println!("  Max Threads/Block: {}", gpu.max_threads_per_block);

    assert!(gpu.total_memory > 0);
    assert!(gpu.sm_count > 0);
 }
 /// Round-trips 256 bytes (values 0..=255) host → device → host.
 #[test]
 fn test_gpu_buffer_h2d_d2h() {
    device::set_device(0).unwrap();
    let host_in: Vec<u8> = (0..=u8::MAX).collect();
    let mut device_buf = GpuBuffer::alloc(host_in.len()).unwrap();
    device_buf.copy_from_host(&host_in).unwrap();
    let mut host_out = vec![0u8; host_in.len()];
    device_buf.copy_to_host(&mut host_out).unwrap();
    assert_eq!(host_in, host_out, "H2D → D2H roundtrip mismatch");
 }
 /// Round-trips a 64 MB pattern (index modulo 251) through device memory.
 #[test]
 fn test_gpu_buffer_large() {
    const SIZE: usize = 64 * 1024 * 1024; // 64 MB
    device::set_device(0).unwrap();
    let host_in: Vec<u8> = (0..SIZE).map(|i| (i % 251) as u8).collect();
    let mut device_buf = GpuBuffer::alloc(SIZE).unwrap();
    device_buf.copy_from_host(&host_in).unwrap();
    let mut host_out = vec![0u8; SIZE];
    device_buf.copy_to_host(&mut host_out).unwrap();
    assert_eq!(host_in, host_out, "64MB roundtrip mismatch");
 }
 /// Uploads 1024 bytes, copies them device-to-device and reads the copy back.
 #[test]
 fn test_gpu_buffer_d2d() {
    device::set_device(0).unwrap();
    let pattern: Vec<u8> = (0..=u8::MAX).cycle().take(1024).collect();

    let mut origin = GpuBuffer::alloc(pattern.len()).unwrap();
    origin.copy_from_host(&pattern).unwrap();

    let mut copy = GpuBuffer::alloc(pattern.len()).unwrap();
    copy.copy_from_device(&origin).unwrap();

    let mut readback = vec![0u8; pattern.len()];
    copy.copy_to_host(&mut readback).unwrap();
    assert_eq!(pattern, readback, "D2D copy mismatch");
 }
 /// Zero-fills a 1 KB buffer and verifies that every byte reads back as 0.
 #[test]
 fn test_gpu_buffer_zero() {
    const LEN: usize = 1024;
    device::set_device(0).unwrap();
    let mut device_buf = GpuBuffer::alloc(LEN).unwrap();
    device_buf.zero().unwrap();
    // Start from a non-zero host pattern so a skipped copy is detected.
    let mut readback = vec![0xFFu8; LEN];
    device_buf.copy_to_host(&mut readback).unwrap();
    let all_zero = !readback.iter().any(|&byte| byte != 0);
    assert!(all_zero, "zero fill failed");
 }
 /// Creates a stream, synchronizes it and lets it drop.
 #[test]
 fn test_stream() {
    device::set_device(0).unwrap();
    let s = CudaStream::new().unwrap();
    s.synchronize().unwrap();
    // `s` goes out of scope here and should be destroyed cleanly
 }
 /// Launches the vector-add kernel on 1024 floats and compares the result
 /// against the sum computed on the host.
 #[test]
 fn test_vecadd_kernel() {
    const N: usize = 1024;
    const BYTES: usize = N * std::mem::size_of::<f32>();

    device::set_device(0).unwrap();
    let lhs: Vec<f32> = (0..N).map(|i| i as f32).collect();
    let rhs: Vec<f32> = (0..N).map(|i| (i * 2) as f32).collect();
    let want: Vec<f32> = lhs.iter().zip(rhs.iter()).map(|(x, y)| x + y).collect();

    let mut d_lhs = GpuBuffer::alloc(BYTES).unwrap();
    let mut d_rhs = GpuBuffer::alloc(BYTES).unwrap();
    let mut d_out = GpuBuffer::alloc(BYTES).unwrap();

    // View the f32 inputs as raw bytes for the byte-oriented copy API.
    let lhs_bytes = unsafe { std::slice::from_raw_parts(lhs.as_ptr().cast::<u8>(), BYTES) };
    let rhs_bytes = unsafe { std::slice::from_raw_parts(rhs.as_ptr().cast::<u8>(), BYTES) };
    d_lhs.copy_from_host(lhs_bytes).unwrap();
    d_rhs.copy_from_host(rhs_bytes).unwrap();

    let default_stream = std::ptr::null_mut(); // default stream
    unsafe {
        ffi::launch_vecadd_f32(
            d_lhs.as_ptr().cast::<f32>(),
            d_rhs.as_ptr().cast::<f32>(),
            d_out.as_mut_ptr().cast::<f32>(),
            N as i32,
            default_stream,
        );
    }
    device::synchronize().unwrap();

    let mut got = vec![0.0f32; N];
    let got_bytes =
        unsafe { std::slice::from_raw_parts_mut(got.as_mut_ptr().cast::<u8>(), BYTES) };
    d_out.copy_to_host(got_bytes).unwrap();
    assert_eq!(got, want, "vecadd kernel output mismatch");
 }
 /// alloc → dealloc → alloc of the same size must be served from the cache.
 #[test]
 fn test_caching_allocator() {
    device::set_device(0).unwrap();
    let mut cache = CachingAllocator::new();

    // Nothing cached yet, so this goes to cudaMalloc.
    let first = cache.alloc(1024).unwrap();
    {
        let stats = cache.stats();
        assert_eq!(stats.cuda_malloc_count, 1);
        assert_eq!(stats.cache_hit_count, 0);
    }

    // Hand the buffer back, then ask for the same size again.
    cache.dealloc(first);
    let _second = cache.alloc(1024).unwrap();

    let stats = cache.stats();
    assert_eq!(stats.cuda_malloc_count, 1, "should reuse cached buffer");
    assert_eq!(stats.cache_hit_count, 1);
 }
 /// Buffers of two different sizes are cached and reused independently.
 #[test]
 fn test_caching_allocator_different_sizes() {
    device::set_device(0).unwrap();
    let mut cache = CachingAllocator::new();

    let small = cache.alloc(512).unwrap();
    let large = cache.alloc(2048).unwrap();
    cache.dealloc(small);
    cache.dealloc(large);

    // Each request should be satisfied from its own bucket.
    let _small_again = cache.alloc(512).unwrap();
    let _large_again = cache.alloc(2048).unwrap();

    let stats = cache.stats();
    assert_eq!(stats.cuda_malloc_count, 2);
    assert_eq!(stats.cache_hit_count, 2);
 }
 /// Fills a pinned host buffer, uploads it and checks the data read back.
 #[test]
 fn test_pinned_memory() {
    let mut staging = PinnedBuffer::alloc(4096).unwrap();
    staging
        .as_mut_slice()
        .iter_mut()
        .enumerate()
        .for_each(|(i, byte)| *byte = (i % 256) as u8);

    device::set_device(0).unwrap();
    let mut device_buf = GpuBuffer::alloc(4096).unwrap();
    device_buf.copy_from_host(staging.as_slice()).unwrap();

    let mut readback = vec![0u8; 4096];
    device_buf.copy_to_host(&mut readback).unwrap();
    assert_eq!(staging.as_slice(), &readback[..]);
 }
 /// Round-trips 4 KB through device memory with stream-ordered copies between
 /// pinned host buffers, synchronizing the stream after each copy.
 #[test]
 fn test_async_copy() {
    device::set_device(0).unwrap();
    let stream = CudaStream::new().unwrap();

    let mut src = PinnedBuffer::alloc(4096).unwrap();
    src.as_mut_slice()
        .iter_mut()
        .enumerate()
        .for_each(|(i, byte)| *byte = (i % 256) as u8);

    let mut device_buf = GpuBuffer::alloc(4096).unwrap();
    let upload = unsafe { device_buf.copy_from_host_async(src.as_slice(), &stream) };
    upload.unwrap();
    stream.synchronize().unwrap();

    let mut dst = PinnedBuffer::alloc(4096).unwrap();
    let download = unsafe { device_buf.copy_to_host_async(dst.as_mut_slice(), &stream) };
    download.unwrap();
    stream.synchronize().unwrap();

    assert_eq!(src.as_slice(), dst.as_slice());
 }
--- a/csrc/test/vecadd.cu
+++ b/csrc/test/vecadd.cu
@@ -0,0 +1,16 @@
 extern "C" {
 // Element-wise c = a + b over n floats; one thread per element, with
 // threads past the end of the arrays doing nothing.
 __global__ void vecadd_f32(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
 }
 // Host-side launcher called from Rust. `stream` is a cudaStream_t passed as
 // void*; null selects the default stream. Launch errors are not reported
 // here; callers can query cudaGetLastError afterwards.
 void launch_vecadd_f32(const float* a, const float* b, float* c, int n, void* stream) {
    // Nothing to do for an empty vector; a grid of zero blocks would be an
    // invalid launch configuration.
    if (n <= 0) {
        return;
    }
    const int block = 256;
    // Ceiling division written so that it cannot overflow int for large n.
    const int grid = n / block + (n % block != 0 ? 1 : 0);
    vecadd_f32<<<grid, block, 0, (cudaStream_t)stream>>>(a, b, c, n);
 }
 }
--- a/docs/00-roadmap.md
+++ b/docs/00-roadmap.md
--- a/docs/01-cuda-ffi.md
+++ b/docs/01-cuda-ffi.md
@@ -0,0 +1,80 @@
 # Phase 0+1: CUDA FFI Infrastructure — Design Document
 ## Goal
 Build `xserv-cuda`, a Rust crate that wraps CUDA Runtime API with safe abstractions:
 - Device query and selection
 - GPU memory allocation with RAII (GpuBuffer)
 - Caching allocator (avoid repeated cudaMalloc/cudaFree)
 - CUDA streams for async operations
 - Host↔Device memory transfers
 - Error handling wrapping all CUDA calls
 ## Module Layout
 ```
 crates/xserv-cuda/
 ├── Cargo.toml
 ├── build.rs          # compiles csrc/*.cu via cc crate
 └── src/
    ├── lib.rs        # re-exports
    ├── ffi.rs        # raw extern "C" bindings to CUDA runtime
    ├── error.rs      # CudaError type
    ├── device.rs     # device query, DeviceInfo
    ├── stream.rs     # CudaStream wrapper
    ├── memory.rs     # GpuBuffer, H2D/D2H/D2D copy
    └── allocator.rs  # CachingAllocator
 ```
 ## Key Design Decisions
 ### FFI Bindings (ffi.rs)
 Hand-written extern "C" bindings (~25 functions). No bindgen — keeps things explicit and readable.
 Core functions needed:
 - Memory: cudaMalloc, cudaFree, cudaMemcpy, cudaMemcpyAsync, cudaMallocHost, cudaFreeHost
 - Stream: cudaStreamCreate, cudaStreamDestroy, cudaStreamSynchronize
 - Device: cudaGetDeviceCount, cudaSetDevice, cudaGetDevice, cudaGetDeviceProperties
 - Sync: cudaDeviceSynchronize
 - Error: cudaGetLastError, cudaGetErrorString
 ### Error Handling (error.rs)
 Every CUDA call returns cudaError_t. We wrap all calls:
 ```rust
 pub(crate) fn check(code: i32) -> Result<(), CudaError>
 ```
 ### GpuBuffer (memory.rs)
 RAII wrapper around a GPU pointer. Drop frees memory.
 ```rust
 pub struct GpuBuffer {
    ptr: *mut u8,
    len: usize,       // in bytes
    device: u32,
 }
 ```
 - No Clone (explicit copy_from instead)
 - Send + !Sync (can move across threads, but not shared)
 ### CachingAllocator (allocator.rs)
 Avoids cudaMalloc/cudaFree per allocation. Maintains a free-list keyed by size bucket.
 Bucket boundaries: round up to next power of 2, minimum 512 bytes.
 - alloc(size) → find bucket, pop from free list or cudaMalloc
 - dealloc(buf) → take ownership of the GpuBuffer and push it onto its bucket's free list (don't cudaFree)
 - trim() → actually cudaFree everything in free lists
 ### CudaStream (stream.rs)
 Wraps cudaStream_t. RAII with Drop calling cudaStreamDestroy.
 ## Build Pipeline
 - `csrc/test/vecadd.cu`: minimal vector-add kernel for smoke test
 - `build.rs` uses `cc` crate to compile .cu files, link CUDA runtime
 ## Test Plan
 1. Device info: print GPU name, memory, compute capability, SM count
 2. GpuBuffer: alloc 1GB, H2D copy, D2H copy, verify data
 3. Vector add kernel: launch from Rust, verify output
 4. CachingAllocator: alloc→free→realloc same size uses cache (no new cudaMalloc)
 5. Multi-stream: two concurrent memcpy on different streams
 6. Benchmark: caching allocator vs raw cudaMalloc (100 cycles)
--- a/tools/sync-and-build.sh
+++ b/tools/sync-and-build.sh
@@ -0,0 +1,25 @@
 #!/bin/bash
 # Sync local project to dash5 and build/test there.
 # Usage: ./tools/sync-and-build.sh [test|build|run]
 # Abort on the first failing command.
 set -e
 # SSH host alias of the build machine and the project path on it.
 REMOTE="dash5"
 REMOTE_DIR="/opt/wjh/projects/xserv"
 # Repository root: the parent of the directory containing this script.
 LOCAL_DIR="$(cd "$(dirname "$0")/.." && pwd)"
 # Cargo subcommand to run remotely; defaults to "build".
 ACTION="${1:-build}"
 echo "=== Syncing to $REMOTE:$REMOTE_DIR ==="
 ssh "$REMOTE" "mkdir -p $REMOTE_DIR"
 # Mirror the working tree; --delete removes remote files that no longer exist
 # locally. Build output and git metadata are not transferred.
 rsync -az --delete \
    --exclude target \
    --exclude .git \
    "$LOCAL_DIR/" "$REMOTE:$REMOTE_DIR/"
 echo "=== Running: cargo $ACTION ==="
 # \$HOME and \$PATH are escaped so they expand on the remote host, while
 # $REMOTE_DIR and $ACTION are expanded locally before the command is sent.
 ssh "$REMOTE" "source \$HOME/.cargo/env && \
    export PATH=/usr/local/cuda/bin:\$PATH && \
    export CUDA_HOME=/usr/local/cuda && \
    cd $REMOTE_DIR && \
    cargo $ACTION --release 2>&1"