phase 0+1: project scaffold + xserv-cuda crate
- Cargo workspace with xserv-cuda crate - CUDA FFI bindings (cudart: memory, stream, device, error) - GpuBuffer RAII wrapper with H2D/D2H/D2D copy - CudaStream wrapper with RAII Drop - CachingAllocator with size-bucketed free lists - PinnedBuffer for page-locked host memory - Device info query via cudaDeviceGetAttribute - Vector-add CUDA kernel smoke test - Integration test suite (11 tests) - build.rs: cc crate compiles .cu for SM 12.0 - sync-and-build.sh for remote build on dash5 - Roadmap doc (docs/00-roadmap.md) and Phase 0+1 design doc Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
9
.gitignore
vendored
Normal file
9
.gitignore
vendored
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
/target
|
||||||
|
*.o
|
||||||
|
*.so
|
||||||
|
*.a
|
||||||
|
*.ptx
|
||||||
|
*.cubin
|
||||||
|
**/*.rs.bk
|
||||||
|
.env
|
||||||
|
*.npy
|
||||||
14
Cargo.toml
Normal file
14
Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
[workspace]
|
||||||
|
resolver = "2"
|
||||||
|
members = [
|
||||||
|
"crates/xserv-cuda",
|
||||||
|
]
|
||||||
|
|
||||||
|
[workspace.package]
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
license = "MIT"
|
||||||
|
|
||||||
|
[workspace.dependencies]
|
||||||
|
half = "2"
|
||||||
|
smallvec = "1"
|
||||||
10
crates/xserv-cuda/Cargo.toml
Normal file
10
crates/xserv-cuda/Cargo.toml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
[package]
|
||||||
|
name = "xserv-cuda"
|
||||||
|
version.workspace = true
|
||||||
|
edition.workspace = true
|
||||||
|
|
||||||
|
[build-dependencies]
|
||||||
|
cc = { version = "1", features = ["cuda"] }
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
rand = "0.9"
|
||||||
20
crates/xserv-cuda/build.rs
Normal file
20
crates/xserv-cuda/build.rs
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
use std::env;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let cuda_path = env::var("CUDA_HOME")
|
||||||
|
.or_else(|_| env::var("CUDA_PATH"))
|
||||||
|
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
|
||||||
|
|
||||||
|
println!("cargo:rustc-link-search=native={cuda_path}/lib64");
|
||||||
|
println!("cargo:rustc-link-lib=dylib=cudart");
|
||||||
|
println!("cargo:rustc-link-lib=dylib=cuda");
|
||||||
|
|
||||||
|
cc::Build::new()
|
||||||
|
.cuda(true)
|
||||||
|
.cudart("shared")
|
||||||
|
.flag("-gencode=arch=compute_120,code=sm_120")
|
||||||
|
.file("../../csrc/test/vecadd.cu")
|
||||||
|
.compile("xserv_cuda_kernels");
|
||||||
|
|
||||||
|
println!("cargo:rerun-if-changed=../../csrc/");
|
||||||
|
}
|
||||||
109
crates/xserv-cuda/src/allocator.rs
Normal file
109
crates/xserv-cuda/src/allocator.rs
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
use crate::error::Result;
|
||||||
|
use crate::ffi;
|
||||||
|
use crate::memory::GpuBuffer;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
/// Caching allocator that reuses freed GPU buffers instead of calling
|
||||||
|
/// cudaMalloc/cudaFree on every allocation.
|
||||||
|
///
|
||||||
|
/// Freed buffers are kept in a per-size-bucket free list. On allocation,
|
||||||
|
/// we first check the free list for a buffer of matching (rounded) size.
|
||||||
|
pub struct CachingAllocator {
|
||||||
|
free_lists: HashMap<usize, Vec<(*mut u8, usize)>>,
|
||||||
|
stats: AllocStats,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default, Clone)]
|
||||||
|
pub struct AllocStats {
|
||||||
|
pub alloc_count: u64,
|
||||||
|
pub cache_hit_count: u64,
|
||||||
|
pub cuda_malloc_count: u64,
|
||||||
|
pub cuda_free_count: u64,
|
||||||
|
pub current_allocated: usize,
|
||||||
|
pub peak_allocated: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CachingAllocator {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
free_lists: HashMap::new(),
|
||||||
|
stats: AllocStats::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn alloc(&mut self, size: usize) -> Result<GpuBuffer> {
|
||||||
|
let bucket = bucket_size(size);
|
||||||
|
self.stats.alloc_count += 1;
|
||||||
|
|
||||||
|
if let Some(list) = self.free_lists.get_mut(&bucket) {
|
||||||
|
if let Some((ptr, actual_len)) = list.pop() {
|
||||||
|
self.stats.cache_hit_count += 1;
|
||||||
|
self.stats.current_allocated += actual_len;
|
||||||
|
if self.stats.current_allocated > self.stats.peak_allocated {
|
||||||
|
self.stats.peak_allocated = self.stats.current_allocated;
|
||||||
|
}
|
||||||
|
return Ok(unsafe { GpuBuffer::from_raw(ptr, actual_len) });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
self.stats.cuda_malloc_count += 1;
|
||||||
|
let buf = GpuBuffer::alloc(bucket)?;
|
||||||
|
self.stats.current_allocated += bucket;
|
||||||
|
if self.stats.current_allocated > self.stats.peak_allocated {
|
||||||
|
self.stats.peak_allocated = self.stats.current_allocated;
|
||||||
|
}
|
||||||
|
Ok(buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a buffer to the cache instead of freeing it.
|
||||||
|
pub fn dealloc(&mut self, buf: GpuBuffer) {
|
||||||
|
let (ptr, len) = buf.into_raw();
|
||||||
|
let bucket = bucket_size(len);
|
||||||
|
self.stats.current_allocated = self.stats.current_allocated.saturating_sub(len);
|
||||||
|
self.free_lists.entry(bucket).or_default().push((ptr, len));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Actually free all cached buffers.
|
||||||
|
pub fn trim(&mut self) {
|
||||||
|
for (_bucket, list) in self.free_lists.drain() {
|
||||||
|
for (ptr, _len) in list {
|
||||||
|
unsafe { ffi::cudaFree(ptr) };
|
||||||
|
self.stats.cuda_free_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn stats(&self) -> &AllocStats {
|
||||||
|
&self.stats
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for CachingAllocator {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
self.trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Round up to next power-of-2, minimum 512 bytes.
|
||||||
|
fn bucket_size(size: usize) -> usize {
|
||||||
|
let min = 512;
|
||||||
|
if size <= min {
|
||||||
|
return min;
|
||||||
|
}
|
||||||
|
size.next_power_of_two()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_bucket_size() {
|
||||||
|
assert_eq!(bucket_size(1), 512);
|
||||||
|
assert_eq!(bucket_size(512), 512);
|
||||||
|
assert_eq!(bucket_size(513), 1024);
|
||||||
|
assert_eq!(bucket_size(1024), 1024);
|
||||||
|
assert_eq!(bucket_size(1025), 2048);
|
||||||
|
assert_eq!(bucket_size(1 << 20), 1 << 20);
|
||||||
|
}
|
||||||
|
}
|
||||||
77
crates/xserv-cuda/src/device.rs
Normal file
77
crates/xserv-cuda/src/device.rs
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
use crate::error::{self, Result};
|
||||||
|
use crate::ffi;
|
||||||
|
use std::ffi::CStr;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct DeviceInfo {
|
||||||
|
pub index: u32,
|
||||||
|
pub name: String,
|
||||||
|
pub total_memory: usize,
|
||||||
|
pub compute_major: i32,
|
||||||
|
pub compute_minor: i32,
|
||||||
|
pub sm_count: i32,
|
||||||
|
pub shared_mem_per_block: usize,
|
||||||
|
pub warp_size: i32,
|
||||||
|
pub max_threads_per_block: i32,
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
fn cudaDeviceGetAttribute(value: *mut i32, attr: i32, device: i32) -> i32;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_attr(attr: i32, device: u32) -> Result<i32> {
|
||||||
|
let mut value = 0;
|
||||||
|
error::check(unsafe { cudaDeviceGetAttribute(&mut value, attr, device as i32) })?;
|
||||||
|
Ok(value)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn device_count() -> Result<i32> {
|
||||||
|
let mut count = 0;
|
||||||
|
error::check(unsafe { ffi::cudaGetDeviceCount(&mut count) })?;
|
||||||
|
Ok(count)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_device(device: u32) -> Result<()> {
|
||||||
|
error::check(unsafe { ffi::cudaSetDevice(device as i32) })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn current_device() -> Result<u32> {
|
||||||
|
let mut dev = 0;
|
||||||
|
error::check(unsafe { ffi::cudaGetDevice(&mut dev) })?;
|
||||||
|
Ok(dev as u32)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn device_info(device: u32) -> Result<DeviceInfo> {
|
||||||
|
// Use cudaGetDeviceProperties only for the name (first field, always stable).
|
||||||
|
let mut prop = unsafe { std::mem::zeroed::<ffi::CudaDeviceProp>() };
|
||||||
|
error::check(unsafe { ffi::cudaGetDeviceProperties(&mut prop, device as i32) })?;
|
||||||
|
let name = unsafe { CStr::from_ptr(prop.name.as_ptr()) }
|
||||||
|
.to_string_lossy()
|
||||||
|
.into_owned();
|
||||||
|
|
||||||
|
// Use cudaDeviceGetAttribute for everything else (layout-independent).
|
||||||
|
// Attribute IDs from cuda_runtime_api.h:
|
||||||
|
const TOTAL_GLOBAL_MEM: i32 = 0; // not available via attribute, use prop
|
||||||
|
const SHARED_MEM_PER_BLOCK: i32 = 8;
|
||||||
|
const WARP_SIZE: i32 = 10;
|
||||||
|
const MAX_THREADS_PER_BLOCK: i32 = 1;
|
||||||
|
const MULTI_PROCESSOR_COUNT: i32 = 16;
|
||||||
|
const COMPUTE_MAJOR: i32 = 75;
|
||||||
|
const COMPUTE_MINOR: i32 = 76;
|
||||||
|
|
||||||
|
Ok(DeviceInfo {
|
||||||
|
index: device,
|
||||||
|
name,
|
||||||
|
total_memory: prop.total_global_mem,
|
||||||
|
compute_major: get_attr(COMPUTE_MAJOR, device)?,
|
||||||
|
compute_minor: get_attr(COMPUTE_MINOR, device)?,
|
||||||
|
sm_count: get_attr(MULTI_PROCESSOR_COUNT, device)?,
|
||||||
|
shared_mem_per_block: get_attr(SHARED_MEM_PER_BLOCK, device)? as usize,
|
||||||
|
warp_size: get_attr(WARP_SIZE, device)?,
|
||||||
|
max_threads_per_block: get_attr(MAX_THREADS_PER_BLOCK, device)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn synchronize() -> Result<()> {
|
||||||
|
error::check(unsafe { ffi::cudaDeviceSynchronize() })
|
||||||
|
}
|
||||||
43
crates/xserv-cuda/src/error.rs
Normal file
43
crates/xserv-cuda/src/error.rs
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
use crate::ffi;
|
||||||
|
use std::ffi::CStr;
|
||||||
|
use std::fmt;
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum CudaError {
|
||||||
|
OutOfMemory,
|
||||||
|
InvalidDevice,
|
||||||
|
Raw { code: i32, message: String },
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for CudaError {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
CudaError::OutOfMemory => write!(f, "CUDA out of memory"),
|
||||||
|
CudaError::InvalidDevice => write!(f, "CUDA invalid device"),
|
||||||
|
CudaError::Raw { code, message } => write!(f, "CUDA error {code}: {message}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::error::Error for CudaError {}
|
||||||
|
|
||||||
|
pub type Result<T> = std::result::Result<T, CudaError>;
|
||||||
|
|
||||||
|
pub(crate) fn check(code: i32) -> Result<()> {
|
||||||
|
if code == ffi::CUDA_SUCCESS {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let message = unsafe {
|
||||||
|
let ptr = ffi::cudaGetErrorString(code);
|
||||||
|
if ptr.is_null() {
|
||||||
|
"unknown error".to_string()
|
||||||
|
} else {
|
||||||
|
CStr::from_ptr(ptr).to_string_lossy().into_owned()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Err(match code {
|
||||||
|
ffi::CUDA_ERROR_OUT_OF_MEMORY => CudaError::OutOfMemory,
|
||||||
|
101 => CudaError::InvalidDevice,
|
||||||
|
_ => CudaError::Raw { code, message },
|
||||||
|
})
|
||||||
|
}
|
||||||
73
crates/xserv-cuda/src/ffi.rs
Normal file
73
crates/xserv-cuda/src/ffi.rs
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
use std::ffi::c_void;
|
||||||
|
use std::os::raw::c_char;
|
||||||
|
|
||||||
|
pub type CudaStream = *mut c_void;
|
||||||
|
pub type CudaEvent = *mut c_void;
|
||||||
|
|
||||||
|
pub const CUDA_MEMCPY_H2D: i32 = 1;
|
||||||
|
pub const CUDA_MEMCPY_D2H: i32 = 2;
|
||||||
|
pub const CUDA_MEMCPY_D2D: i32 = 3;
|
||||||
|
|
||||||
|
pub const CUDA_SUCCESS: i32 = 0;
|
||||||
|
pub const CUDA_ERROR_OUT_OF_MEMORY: i32 = 2;
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
pub struct CudaDeviceProp {
|
||||||
|
pub name: [c_char; 256],
|
||||||
|
pub total_global_mem: usize,
|
||||||
|
pub shared_mem_per_block: usize,
|
||||||
|
pub regs_per_block: i32,
|
||||||
|
pub warp_size: i32,
|
||||||
|
pub max_threads_per_block: i32,
|
||||||
|
pub max_threads_dim: [i32; 3],
|
||||||
|
pub max_grid_size: [i32; 3],
|
||||||
|
pub clock_rate: i32,
|
||||||
|
pub total_const_mem: usize,
|
||||||
|
pub major: i32,
|
||||||
|
pub minor: i32,
|
||||||
|
// There are many more fields; we only read up to what we need.
|
||||||
|
// cudaDeviceProp is a large struct (~1KB). We pad the rest.
|
||||||
|
_pad: [u8; 4096],
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
// --- Device ---
|
||||||
|
pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
|
||||||
|
pub fn cudaSetDevice(device: i32) -> i32;
|
||||||
|
pub fn cudaGetDevice(device: *mut i32) -> i32;
|
||||||
|
pub fn cudaGetDeviceProperties(prop: *mut CudaDeviceProp, device: i32) -> i32;
|
||||||
|
pub fn cudaDeviceSynchronize() -> i32;
|
||||||
|
|
||||||
|
// --- Memory ---
|
||||||
|
pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
|
||||||
|
pub fn cudaFree(devptr: *mut u8) -> i32;
|
||||||
|
pub fn cudaMallocHost(ptr: *mut *mut u8, size: usize) -> i32;
|
||||||
|
pub fn cudaFreeHost(ptr: *mut u8) -> i32;
|
||||||
|
pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
|
||||||
|
pub fn cudaMemcpyAsync(
|
||||||
|
dst: *mut u8,
|
||||||
|
src: *const u8,
|
||||||
|
count: usize,
|
||||||
|
kind: i32,
|
||||||
|
stream: CudaStream,
|
||||||
|
) -> i32;
|
||||||
|
pub fn cudaMemset(devptr: *mut u8, value: i32, count: usize) -> i32;
|
||||||
|
|
||||||
|
// --- Stream ---
|
||||||
|
pub fn cudaStreamCreate(stream: *mut CudaStream) -> i32;
|
||||||
|
pub fn cudaStreamDestroy(stream: CudaStream) -> i32;
|
||||||
|
pub fn cudaStreamSynchronize(stream: CudaStream) -> i32;
|
||||||
|
|
||||||
|
// --- Error ---
|
||||||
|
pub fn cudaGetLastError() -> i32;
|
||||||
|
pub fn cudaGetErrorString(error: i32) -> *const c_char;
|
||||||
|
|
||||||
|
// --- Our test kernel ---
|
||||||
|
pub fn launch_vecadd_f32(
|
||||||
|
a: *const f32,
|
||||||
|
b: *const f32,
|
||||||
|
c: *mut f32,
|
||||||
|
n: i32,
|
||||||
|
stream: CudaStream,
|
||||||
|
);
|
||||||
|
}
|
||||||
12
crates/xserv-cuda/src/lib.rs
Normal file
12
crates/xserv-cuda/src/lib.rs
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
pub mod allocator;
|
||||||
|
pub mod device;
|
||||||
|
pub mod error;
|
||||||
|
pub mod ffi;
|
||||||
|
pub mod memory;
|
||||||
|
pub mod stream;
|
||||||
|
|
||||||
|
pub use allocator::CachingAllocator;
|
||||||
|
pub use device::DeviceInfo;
|
||||||
|
pub use error::{CudaError, Result};
|
||||||
|
pub use memory::{GpuBuffer, PinnedBuffer};
|
||||||
|
pub use stream::CudaStream;
|
||||||
146
crates/xserv-cuda/src/memory.rs
Normal file
146
crates/xserv-cuda/src/memory.rs
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
use crate::error::{self, Result};
|
||||||
|
use crate::ffi;
|
||||||
|
use crate::stream::CudaStream;
|
||||||
|
|
||||||
|
/// RAII wrapper around a GPU memory allocation.
|
||||||
|
pub struct GpuBuffer {
|
||||||
|
ptr: *mut u8,
|
||||||
|
len: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl GpuBuffer {
|
||||||
|
pub fn alloc(len: usize) -> Result<Self> {
|
||||||
|
assert!(len > 0, "cannot allocate 0 bytes on GPU");
|
||||||
|
let mut ptr = std::ptr::null_mut();
|
||||||
|
error::check(unsafe { ffi::cudaMalloc(&mut ptr, len) })?;
|
||||||
|
Ok(Self { ptr, len })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.len
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_ptr(&self) -> *const u8 {
|
||||||
|
self.ptr
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_mut_ptr(&mut self) -> *mut u8 {
|
||||||
|
self.ptr
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Copy data from host (CPU) slice to this GPU buffer.
|
||||||
|
pub fn copy_from_host(&mut self, src: &[u8]) -> Result<()> {
|
||||||
|
assert!(src.len() <= self.len, "source larger than buffer");
|
||||||
|
error::check(unsafe {
|
||||||
|
ffi::cudaMemcpy(self.ptr, src.as_ptr(), src.len(), ffi::CUDA_MEMCPY_H2D)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Copy data from this GPU buffer to a host (CPU) slice.
|
||||||
|
pub fn copy_to_host(&self, dst: &mut [u8]) -> Result<()> {
|
||||||
|
assert!(dst.len() <= self.len, "destination larger than buffer");
|
||||||
|
error::check(unsafe {
|
||||||
|
ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Async copy from host to device on the given stream.
|
||||||
|
/// Safety: `src` must remain valid until the stream operation completes.
|
||||||
|
pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
|
||||||
|
assert!(src.len() <= self.len);
|
||||||
|
error::check(ffi::cudaMemcpyAsync(
|
||||||
|
self.ptr,
|
||||||
|
src.as_ptr(),
|
||||||
|
src.len(),
|
||||||
|
ffi::CUDA_MEMCPY_H2D,
|
||||||
|
stream.as_raw(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Async copy from device to host on the given stream.
|
||||||
|
/// Safety: `dst` must remain valid until the stream operation completes.
|
||||||
|
pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
|
||||||
|
assert!(dst.len() <= self.len);
|
||||||
|
error::check(ffi::cudaMemcpyAsync(
|
||||||
|
dst.as_mut_ptr(),
|
||||||
|
self.ptr,
|
||||||
|
dst.len(),
|
||||||
|
ffi::CUDA_MEMCPY_D2H,
|
||||||
|
stream.as_raw(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Copy from another GPU buffer (D2D).
|
||||||
|
pub fn copy_from_device(&mut self, src: &GpuBuffer) -> Result<()> {
|
||||||
|
let n = src.len.min(self.len);
|
||||||
|
error::check(unsafe {
|
||||||
|
ffi::cudaMemcpy(self.ptr, src.ptr, n, ffi::CUDA_MEMCPY_D2D)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fill buffer with zeros.
|
||||||
|
pub fn zero(&mut self) -> Result<()> {
|
||||||
|
error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
|
||||||
|
/// Caller is responsible for eventually calling cudaFree.
|
||||||
|
pub fn into_raw(self) -> (*mut u8, usize) {
|
||||||
|
let ptr = self.ptr;
|
||||||
|
let len = self.len;
|
||||||
|
std::mem::forget(self);
|
||||||
|
(ptr, len)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reconstruct a GpuBuffer from a raw pointer + length.
|
||||||
|
/// Safety: ptr must have been allocated with cudaMalloc, len must be correct.
|
||||||
|
pub unsafe fn from_raw(ptr: *mut u8, len: usize) -> Self {
|
||||||
|
Self { ptr, len }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for GpuBuffer {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if !self.ptr.is_null() {
|
||||||
|
unsafe { ffi::cudaFree(self.ptr) };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe impl Send for GpuBuffer {}
|
||||||
|
|
||||||
|
/// Pinned (page-locked) host memory for faster H2D/D2H transfers.
|
||||||
|
pub struct PinnedBuffer {
|
||||||
|
ptr: *mut u8,
|
||||||
|
len: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PinnedBuffer {
|
||||||
|
pub fn alloc(len: usize) -> Result<Self> {
|
||||||
|
let mut ptr = std::ptr::null_mut();
|
||||||
|
error::check(unsafe { ffi::cudaMallocHost(&mut ptr, len) })?;
|
||||||
|
Ok(Self { ptr, len })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_slice(&self) -> &[u8] {
|
||||||
|
unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_mut_slice(&mut self) -> &mut [u8] {
|
||||||
|
unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.len
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for PinnedBuffer {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if !self.ptr.is_null() {
|
||||||
|
unsafe { ffi::cudaFreeHost(self.ptr) };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe impl Send for PinnedBuffer {}
|
||||||
33
crates/xserv-cuda/src/stream.rs
Normal file
33
crates/xserv-cuda/src/stream.rs
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
use crate::error::{self, Result};
|
||||||
|
use crate::ffi;
|
||||||
|
|
||||||
|
pub struct CudaStream {
|
||||||
|
raw: ffi::CudaStream,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CudaStream {
|
||||||
|
pub fn new() -> Result<Self> {
|
||||||
|
let mut raw = std::ptr::null_mut();
|
||||||
|
error::check(unsafe { ffi::cudaStreamCreate(&mut raw) })?;
|
||||||
|
Ok(Self { raw })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn synchronize(&self) -> Result<()> {
|
||||||
|
error::check(unsafe { ffi::cudaStreamSynchronize(self.raw) })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_raw(&self) -> ffi::CudaStream {
|
||||||
|
self.raw
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for CudaStream {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if !self.raw.is_null() {
|
||||||
|
unsafe { ffi::cudaStreamDestroy(self.raw) };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Can move across threads, but not shared without synchronization
|
||||||
|
unsafe impl Send for CudaStream {}
|
||||||
208
crates/xserv-cuda/tests/integration.rs
Normal file
208
crates/xserv-cuda/tests/integration.rs
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
use xserv_cuda::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_device_info() {
|
||||||
|
let count = device::device_count().expect("failed to get device count");
|
||||||
|
assert!(count > 0, "no CUDA devices found");
|
||||||
|
|
||||||
|
let info = device::device_info(0).expect("failed to get device info");
|
||||||
|
println!("GPU 0: {}", info.name);
|
||||||
|
println!(" Memory: {} MB", info.total_memory / (1024 * 1024));
|
||||||
|
println!(
|
||||||
|
" Compute Capability: {}.{}",
|
||||||
|
info.compute_major, info.compute_minor
|
||||||
|
);
|
||||||
|
println!(" SM Count: {}", info.sm_count);
|
||||||
|
println!(" Shared Mem/Block: {} KB", info.shared_mem_per_block / 1024);
|
||||||
|
println!(" Warp Size: {}", info.warp_size);
|
||||||
|
println!(" Max Threads/Block: {}", info.max_threads_per_block);
|
||||||
|
|
||||||
|
assert!(info.total_memory > 0);
|
||||||
|
assert!(info.sm_count > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_gpu_buffer_h2d_d2h() {
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
|
||||||
|
let data: Vec<u8> = (0..256).map(|i| (i % 256) as u8).collect();
|
||||||
|
let mut buf = GpuBuffer::alloc(data.len()).unwrap();
|
||||||
|
buf.copy_from_host(&data).unwrap();
|
||||||
|
|
||||||
|
let mut out = vec![0u8; data.len()];
|
||||||
|
buf.copy_to_host(&mut out).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(data, out, "H2D → D2H roundtrip mismatch");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_gpu_buffer_large() {
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
|
||||||
|
let size = 64 * 1024 * 1024; // 64 MB
|
||||||
|
let data: Vec<u8> = (0..size).map(|i| (i % 251) as u8).collect();
|
||||||
|
let mut buf = GpuBuffer::alloc(size).unwrap();
|
||||||
|
buf.copy_from_host(&data).unwrap();
|
||||||
|
|
||||||
|
let mut out = vec![0u8; size];
|
||||||
|
buf.copy_to_host(&mut out).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(data, out, "64MB roundtrip mismatch");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_gpu_buffer_d2d() {
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
|
||||||
|
let data: Vec<u8> = (0..1024).map(|i| (i % 256) as u8).collect();
|
||||||
|
let mut src = GpuBuffer::alloc(data.len()).unwrap();
|
||||||
|
src.copy_from_host(&data).unwrap();
|
||||||
|
|
||||||
|
let mut dst = GpuBuffer::alloc(data.len()).unwrap();
|
||||||
|
dst.copy_from_device(&src).unwrap();
|
||||||
|
|
||||||
|
let mut out = vec![0u8; data.len()];
|
||||||
|
dst.copy_to_host(&mut out).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(data, out, "D2D copy mismatch");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_gpu_buffer_zero() {
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
|
||||||
|
let mut buf = GpuBuffer::alloc(1024).unwrap();
|
||||||
|
buf.zero().unwrap();
|
||||||
|
|
||||||
|
let mut out = vec![0xFFu8; 1024];
|
||||||
|
buf.copy_to_host(&mut out).unwrap();
|
||||||
|
|
||||||
|
assert!(out.iter().all(|&b| b == 0), "zero fill failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_stream() {
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
|
||||||
|
let stream = CudaStream::new().unwrap();
|
||||||
|
stream.synchronize().unwrap();
|
||||||
|
// stream drops here, should destroy cleanly
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_vecadd_kernel() {
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
|
||||||
|
let n = 1024;
|
||||||
|
let a: Vec<f32> = (0..n).map(|i| i as f32).collect();
|
||||||
|
let b: Vec<f32> = (0..n).map(|i| (i * 2) as f32).collect();
|
||||||
|
let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();
|
||||||
|
|
||||||
|
let byte_len = n * std::mem::size_of::<f32>();
|
||||||
|
|
||||||
|
let mut d_a = GpuBuffer::alloc(byte_len).unwrap();
|
||||||
|
let mut d_b = GpuBuffer::alloc(byte_len).unwrap();
|
||||||
|
let mut d_c = GpuBuffer::alloc(byte_len).unwrap();
|
||||||
|
|
||||||
|
let a_bytes = unsafe { std::slice::from_raw_parts(a.as_ptr() as *const u8, byte_len) };
|
||||||
|
let b_bytes = unsafe { std::slice::from_raw_parts(b.as_ptr() as *const u8, byte_len) };
|
||||||
|
d_a.copy_from_host(a_bytes).unwrap();
|
||||||
|
d_b.copy_from_host(b_bytes).unwrap();
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
ffi::launch_vecadd_f32(
|
||||||
|
d_a.as_ptr() as *const f32,
|
||||||
|
d_b.as_ptr() as *const f32,
|
||||||
|
d_c.as_mut_ptr() as *mut f32,
|
||||||
|
n as i32,
|
||||||
|
std::ptr::null_mut(), // default stream
|
||||||
|
);
|
||||||
|
}
|
||||||
|
device::synchronize().unwrap();
|
||||||
|
|
||||||
|
let mut result = vec![0.0f32; n];
|
||||||
|
let result_bytes =
|
||||||
|
unsafe { std::slice::from_raw_parts_mut(result.as_mut_ptr() as *mut u8, byte_len) };
|
||||||
|
d_c.copy_to_host(result_bytes).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(result, expected, "vecadd kernel output mismatch");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_caching_allocator() {
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
|
||||||
|
let mut alloc = CachingAllocator::new();
|
||||||
|
|
||||||
|
// First allocation: should trigger cudaMalloc
|
||||||
|
let buf1 = alloc.alloc(1024).unwrap();
|
||||||
|
assert_eq!(alloc.stats().cuda_malloc_count, 1);
|
||||||
|
assert_eq!(alloc.stats().cache_hit_count, 0);
|
||||||
|
|
||||||
|
// Return to cache
|
||||||
|
alloc.dealloc(buf1);
|
||||||
|
|
||||||
|
// Second allocation of same size: should hit cache
|
||||||
|
let _buf2 = alloc.alloc(1024).unwrap();
|
||||||
|
assert_eq!(alloc.stats().cuda_malloc_count, 1, "should reuse cached buffer");
|
||||||
|
assert_eq!(alloc.stats().cache_hit_count, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_caching_allocator_different_sizes() {
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
|
||||||
|
let mut alloc = CachingAllocator::new();
|
||||||
|
|
||||||
|
let buf1 = alloc.alloc(512).unwrap();
|
||||||
|
let buf2 = alloc.alloc(2048).unwrap();
|
||||||
|
|
||||||
|
alloc.dealloc(buf1);
|
||||||
|
alloc.dealloc(buf2);
|
||||||
|
|
||||||
|
// Re-alloc different sizes: each should hit its own bucket
|
||||||
|
let _buf3 = alloc.alloc(512).unwrap();
|
||||||
|
let _buf4 = alloc.alloc(2048).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(alloc.stats().cuda_malloc_count, 2);
|
||||||
|
assert_eq!(alloc.stats().cache_hit_count, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_pinned_memory() {
|
||||||
|
let mut pinned = PinnedBuffer::alloc(4096).unwrap();
|
||||||
|
let slice = pinned.as_mut_slice();
|
||||||
|
for (i, byte) in slice.iter_mut().enumerate() {
|
||||||
|
*byte = (i % 256) as u8;
|
||||||
|
}
|
||||||
|
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
let mut gpu = GpuBuffer::alloc(4096).unwrap();
|
||||||
|
gpu.copy_from_host(pinned.as_slice()).unwrap();
|
||||||
|
|
||||||
|
let mut out = vec![0u8; 4096];
|
||||||
|
gpu.copy_to_host(&mut out).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(pinned.as_slice(), &out[..]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_async_copy() {
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
let stream = CudaStream::new().unwrap();
|
||||||
|
|
||||||
|
let mut pinned = PinnedBuffer::alloc(4096).unwrap();
|
||||||
|
for (i, byte) in pinned.as_mut_slice().iter_mut().enumerate() {
|
||||||
|
*byte = (i % 256) as u8;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut gpu = GpuBuffer::alloc(4096).unwrap();
|
||||||
|
unsafe { gpu.copy_from_host_async(pinned.as_slice(), &stream).unwrap() };
|
||||||
|
stream.synchronize().unwrap();
|
||||||
|
|
||||||
|
let mut out_pinned = PinnedBuffer::alloc(4096).unwrap();
|
||||||
|
unsafe { gpu.copy_to_host_async(out_pinned.as_mut_slice(), &stream).unwrap() };
|
||||||
|
stream.synchronize().unwrap();
|
||||||
|
|
||||||
|
assert_eq!(pinned.as_slice(), out_pinned.as_slice());
|
||||||
|
}
|
||||||
16
csrc/test/vecadd.cu
Normal file
16
csrc/test/vecadd.cu
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
extern "C" {
|
||||||
|
|
||||||
|
__global__ void vecadd_f32(const float* a, const float* b, float* c, int n) {
|
||||||
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (idx < n) {
|
||||||
|
c[idx] = a[idx] + b[idx];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void launch_vecadd_f32(const float* a, const float* b, float* c, int n, void* stream) {
|
||||||
|
int block = 256;
|
||||||
|
int grid = (n + block - 1) / block;
|
||||||
|
vecadd_f32<<<grid, block, 0, (cudaStream_t)stream>>>(a, b, c, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
1754
docs/00-roadmap.md
Normal file
1754
docs/00-roadmap.md
Normal file
File diff suppressed because it is too large
Load Diff
80
docs/01-cuda-ffi.md
Normal file
80
docs/01-cuda-ffi.md
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
# Phase 0+1: CUDA FFI Infrastructure — Design Document
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Build `xserv-cuda`, a Rust crate that wraps CUDA Runtime API with safe abstractions:
|
||||||
|
- Device query and selection
|
||||||
|
- GPU memory allocation with RAII (GpuBuffer)
|
||||||
|
- Caching allocator (avoid repeated cudaMalloc/cudaFree)
|
||||||
|
- CUDA streams for async operations
|
||||||
|
- Host↔Device memory transfers
|
||||||
|
- Error handling wrapping all CUDA calls
|
||||||
|
|
||||||
|
## Module Layout
|
||||||
|
|
||||||
|
```
|
||||||
|
crates/xserv-cuda/
|
||||||
|
├── Cargo.toml
|
||||||
|
├── build.rs # compiles csrc/*.cu via cc crate
|
||||||
|
└── src/
|
||||||
|
├── lib.rs # re-exports
|
||||||
|
├── ffi.rs # raw extern "C" bindings to CUDA runtime
|
||||||
|
├── error.rs # CudaError type
|
||||||
|
├── device.rs # device query, DeviceInfo
|
||||||
|
├── stream.rs # CudaStream wrapper
|
||||||
|
├── memory.rs # GpuBuffer, H2D/D2H/D2D copy
|
||||||
|
└── allocator.rs # CachingAllocator
|
||||||
|
```
|
||||||
|
|
||||||
|
## Key Design Decisions
|
||||||
|
|
||||||
|
### FFI Bindings (ffi.rs)
|
||||||
|
Hand-written extern "C" bindings (~25 functions). No bindgen — keeps things explicit and readable.
|
||||||
|
|
||||||
|
Core functions needed:
|
||||||
|
- Memory: cudaMalloc, cudaFree, cudaMemcpy, cudaMemcpyAsync, cudaMallocHost, cudaFreeHost
|
||||||
|
- Stream: cudaStreamCreate, cudaStreamDestroy, cudaStreamSynchronize
|
||||||
|
- Device: cudaGetDeviceCount, cudaSetDevice, cudaGetDevice, cudaGetDeviceProperties
|
||||||
|
- Sync: cudaDeviceSynchronize
|
||||||
|
- Error: cudaGetLastError, cudaGetErrorString
|
||||||
|
|
||||||
|
### Error Handling (error.rs)
|
||||||
|
Every CUDA call returns cudaError_t. We wrap all calls:
|
||||||
|
```rust
|
||||||
|
pub(crate) fn check(code: i32) -> Result<(), CudaError>
|
||||||
|
```
|
||||||
|
|
||||||
|
### GpuBuffer (memory.rs)
|
||||||
|
RAII wrapper around a GPU pointer. Drop frees memory.
|
||||||
|
```rust
|
||||||
|
pub struct GpuBuffer {
|
||||||
|
ptr: *mut u8,
|
||||||
|
len: usize, // in bytes
|
||||||
|
device: u32,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
- No Clone (explicit copy_from instead)
|
||||||
|
- Send + !Sync (can move across threads, but not shared)
|
||||||
|
|
||||||
|
### CachingAllocator (allocator.rs)
|
||||||
|
Avoids cudaMalloc/cudaFree per allocation. Maintains a free-list keyed by size bucket.
|
||||||
|
|
||||||
|
Bucket boundaries: round up to next power of 2, minimum 512 bytes.
|
||||||
|
- alloc(size) → find bucket, pop from free list or cudaMalloc
|
||||||
|
- dealloc(ptr, size) → push to free list (don't cudaFree)
|
||||||
|
- trim() → actually cudaFree everything in free lists
|
||||||
|
|
||||||
|
### CudaStream (stream.rs)
|
||||||
|
Wraps cudaStream_t. RAII with Drop calling cudaStreamDestroy.
|
||||||
|
|
||||||
|
## Build Pipeline
|
||||||
|
- `csrc/test/vecadd.cu`: minimal vector-add kernel for smoke test
|
||||||
|
- `build.rs` uses `cc` crate to compile .cu files, link CUDA runtime
|
||||||
|
|
||||||
|
## Test Plan
|
||||||
|
1. Device info: print GPU name, memory, compute capability, SM count
|
||||||
|
2. GpuBuffer: alloc 1GB, H2D copy, D2H copy, verify data
|
||||||
|
3. Vector add kernel: launch from Rust, verify output
|
||||||
|
4. CachingAllocator: alloc→free→realloc same size uses cache (no new cudaMalloc)
|
||||||
|
5. Multi-stream: two concurrent memcpy on different streams
|
||||||
|
6. Benchmark: caching allocator vs raw cudaMalloc (100 cycles)
|
||||||
25
tools/sync-and-build.sh
Executable file
25
tools/sync-and-build.sh
Executable file
@@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Sync local project to dash5 and build/test there.
|
||||||
|
# Usage: ./tools/sync-and-build.sh [test|build|run]
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
REMOTE="dash5"
|
||||||
|
REMOTE_DIR="/opt/wjh/projects/xserv"
|
||||||
|
LOCAL_DIR="$(cd "$(dirname "$0")/.." && pwd)"
|
||||||
|
|
||||||
|
ACTION="${1:-build}"
|
||||||
|
|
||||||
|
echo "=== Syncing to $REMOTE:$REMOTE_DIR ==="
|
||||||
|
ssh "$REMOTE" "mkdir -p $REMOTE_DIR"
|
||||||
|
rsync -az --delete \
|
||||||
|
--exclude target \
|
||||||
|
--exclude .git \
|
||||||
|
"$LOCAL_DIR/" "$REMOTE:$REMOTE_DIR/"
|
||||||
|
|
||||||
|
echo "=== Running: cargo $ACTION ==="
|
||||||
|
ssh "$REMOTE" "source \$HOME/.cargo/env && \
|
||||||
|
export PATH=/usr/local/cuda/bin:\$PATH && \
|
||||||
|
export CUDA_HOME=/usr/local/cuda && \
|
||||||
|
cd $REMOTE_DIR && \
|
||||||
|
cargo $ACTION --release 2>&1"
|
||||||
Reference in New Issue
Block a user