diff --git a/Cargo.toml b/Cargo.toml
index c7b46dc..34e3b83 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,6 +2,7 @@
 resolver = "2"
 members = [
     "crates/xserv-cuda",
+    "crates/xserv-tensor",
 ]
 
 [workspace.package]
diff --git a/crates/xserv-tensor/Cargo.toml b/crates/xserv-tensor/Cargo.toml
new file mode 100644
index 0000000..ab69203
--- /dev/null
+++ b/crates/xserv-tensor/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "xserv-tensor"
+version.workspace = true
+edition.workspace = true
+
+[dependencies]
+xserv-cuda = { path = "../xserv-cuda" }
+half.workspace = true
+smallvec.workspace = true
diff --git a/crates/xserv-tensor/src/dtype.rs b/crates/xserv-tensor/src/dtype.rs
new file mode 100644
index 0000000..058f81b
--- /dev/null
+++ b/crates/xserv-tensor/src/dtype.rs
@@ -0,0 +1,57 @@
+use half::{bf16, f16};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum DType {
+    F32, // stored as `f32`
+    F16, // stored as `half::f16`
+    BF16, // stored as `half::bf16`
+}
+
+impl DType {
+    /// Width of a single element in bytes.
+    pub fn size_bytes(self) -> usize {
+        match self {
+            Self::F32 => 4,
+            Self::F16 | Self::BF16 => 2,
+        }
+    }
+
+    pub fn name(self) -> &'static str {
+        match self {
+            Self::F32 => "f32",
+            Self::F16 => "f16",
+            Self::BF16 => "bf16",
+        }
+    }
+}
+
+impl std::fmt::Display for DType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.name())
+    }
+}
+
+/// Trait for types that can be stored in a Tensor.
+pub trait TensorDType: Copy + Send + Sync + 'static {
+    const DTYPE: DType; // runtime tag that corresponds to `Self`
+    fn to_f64(self) -> f64;
+    fn from_f64(v: f64) -> Self;
+}
+
+impl TensorDType for f32 {
+    const DTYPE: DType = DType::F32;
+    fn to_f64(self) -> f64 { f64::from(self) }
+    fn from_f64(v: f64) -> Self { v as f32 }
+}
+
+impl TensorDType for f16 {
+    const DTYPE: DType = DType::F16;
+    fn to_f64(self) -> f64 { f64::from(self.to_f32()) }
+    fn from_f64(v: f64) -> Self { Self::from_f32(v as f32) }
+}
+
+impl TensorDType for bf16 {
+    const DTYPE: DType = DType::BF16;
+    fn to_f64(self) -> f64 { f64::from(self.to_f32()) }
+    fn from_f64(v: f64) -> Self { Self::from_f32(v as f32) }
+}
diff --git a/crates/xserv-tensor/src/lib.rs b/crates/xserv-tensor/src/lib.rs
new file mode 100644
index 0000000..753ee40
--- /dev/null
+++ b/crates/xserv-tensor/src/lib.rs
@@ -0,0 +1,8 @@
+pub mod dtype;
+pub mod shape;
+pub mod storage;
+pub mod tensor;
+
+pub use dtype::{DType, TensorDType};
+pub use storage::Device;
+pub use tensor::Tensor;
diff --git a/crates/xserv-tensor/src/shape.rs b/crates/xserv-tensor/src/shape.rs
new file mode 100644
index 0000000..5f70dc6
--- /dev/null
+++ b/crates/xserv-tensor/src/shape.rs
@@ -0,0 +1,105 @@
+use smallvec::SmallVec;
+
+pub type Dims = SmallVec<[usize; 4]>;
+
+/// Compute contiguous strides for a given shape (row-major / C order).
+///
+/// The last axis has stride 1 and every earlier axis has the product of all
+/// later extents as its stride. An empty shape yields empty strides.
+///
+/// Example: shape [2, 3, 4] => strides [12, 4, 1]
+pub fn contiguous_strides(shape: &[usize]) -> Dims {
+    // Start from all ones so the innermost stride is already in place.
+    let mut strides = Dims::from_elem(1, shape.len());
+    for axis in (1..shape.len()).rev() {
+        strides[axis - 1] = strides[axis] * shape[axis];
+    }
+    strides
+}
+
+/// Check if the given strides represent contiguous (row-major) layout for the shape.
+///
+/// An empty shape (a scalar) is always contiguous; otherwise `strides` must equal
+/// [`contiguous_strides`] of `shape` exactly.
+pub fn is_contiguous(shape: &[usize], strides: &[usize]) -> bool {
+    shape.is_empty() || strides == contiguous_strides(shape).as_slice()
+}
+
+/// Total number of elements given a shape.
+///
+/// An empty shape yields 1 (the empty product); any zero extent yields 0.
+pub fn num_elements(shape: &[usize]) -> usize {
+    shape.iter().product()
+}
+
+/// Compute the shape after broadcasting two shapes together (NumPy rules).
+///
+/// The shapes are aligned at their trailing axes and missing leading axes are
+/// treated as extent 1. Two extents are compatible when they are equal or when
+/// one of them is 1, in which case the other extent is used.
+///
+/// Returns `None` if any pair of extents is incompatible.
+pub fn broadcast_shape(a: &[usize], b: &[usize]) -> Option<Dims> {
+    let ndim = a.len().max(b.len());
+    // Extent of `s` at output axis `i` once `s` is right-aligned to `ndim` axes.
+    let extent = |s: &[usize], i: usize| (i + s.len()).checked_sub(ndim).map_or(1, |j| s[j]);
+    (0..ndim)
+        .map(|i| match (extent(a, i), extent(b, i)) {
+            (da, db) if da == db || db == 1 => Some(da),
+            (1, db) => Some(db),
+            _ => None,
+        })
+        .collect()
+}
+
+/// Compute broadcast strides: for dimensions where size is 1 but output is >1, stride becomes 0.
+///
+/// Axes that exist only in `target_shape` (prepended on the left) also get stride 0.
+///
+/// # Panics
+/// If `shape` has more axes than `target_shape`, or `strides` is shorter than `shape`.
+pub fn broadcast_strides(shape: &[usize], strides: &[usize], target_shape: &[usize]) -> Dims {
+    assert!(shape.len() <= target_shape.len(), "cannot broadcast to a lower rank");
+    let offset = target_shape.len() - shape.len();
+    // Prepended axes repeat the same data, hence stride 0.
+    let mut result = Dims::from_elem(0, offset);
+    for (i, &dim) in shape.iter().enumerate() {
+        let stretched = dim == 1 && target_shape[offset + i] > 1;
+        result.push(if stretched { 0 } else { strides[i] });
+    }
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_contiguous_strides() {
+        assert_eq!(contiguous_strides(&[2, 3, 4]).as_slice(), &[12, 4, 1]);
+        assert_eq!(contiguous_strides(&[5]).as_slice(), &[1]);
+        assert_eq!(contiguous_strides(&[2, 3]).as_slice(), &[3, 1]);
+    }
+
+    #[test]
+    fn test_is_contiguous() {
+        assert!(is_contiguous(&[2, 3], &[3, 1]));
+        assert!(!is_contiguous(&[3, 2], &[1, 3])); // transposed
+    }
+
+    #[test]
+    fn test_broadcast_shape() {
+        assert_eq!(broadcast_shape(&[3, 1], &[1, 4]).unwrap().as_slice(), &[3, 4]);
+        assert_eq!(broadcast_shape(&[2, 3, 4], &[4]).unwrap().as_slice(), &[2, 3, 4]);
+        assert_eq!(broadcast_shape(&[1], &[5, 3]).unwrap().as_slice(), &[5, 3]);
+        assert!(broadcast_shape(&[3], &[4]).is_none());
+    }
+
+    #[test]
+    fn test_broadcast_strides() {
+        // [3,1] with strides [1,1] broadcast to [3,4]
+        assert_eq!(broadcast_strides(&[3, 1], &[1, 1], &[3, 4]).as_slice(), &[1, 0]);
+        // [4] with strides [1] broadcast to [2,3,4]: prepended axes get stride 0
+        assert_eq!(broadcast_strides(&[4], &[1], &[2, 3, 4]).as_slice(), &[0, 0, 1]);
+    }
+}
diff --git a/crates/xserv-tensor/src/storage.rs b/crates/xserv-tensor/src/storage.rs
new file mode 100644
index 0000000..ea4ade7
--- /dev/null
+++ b/crates/xserv-tensor/src/storage.rs
@@ -0,0 +1,119 @@
+use std::sync::Arc;
+use xserv_cuda::{GpuBuffer, Result as CudaResult};
+
+enum StorageInner {
+    Cpu { data: Vec<u8> },
+    Cuda { buffer: GpuBuffer },
+}
+
+/// Reference-counted storage for tensor data. Multiple tensors can share
+/// the same storage (e.g., after transpose or slice — view semantics).
+#[derive(Clone)]
+pub struct Storage(Arc<StorageInner>);
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Device {
+    Cpu,
+    Cuda(u32),
+}
+
+impl std::fmt::Display for Device {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Device::Cpu => write!(f, "cpu"),
+            Device::Cuda(i) => write!(f, "cuda:{i}"),
+        }
+    }
+}
+
+impl Storage {
+    pub fn cpu(data: Vec<u8>) -> Self {
+        Self(Arc::new(StorageInner::Cpu { data }))
+    }
+
+    pub fn cuda(buffer: GpuBuffer) -> Self {
+        Self(Arc::new(StorageInner::Cuda { buffer }))
+    }
+
+    pub fn device(&self) -> Device {
+        match self.0.as_ref() {
+            StorageInner::Cpu { .. } => Device::Cpu,
+            StorageInner::Cuda { .. } => Device::Cuda(0), // ordinal is not tracked; always 0
+        }
+    }
+
+    pub fn len_bytes(&self) -> usize {
+        match self.0.as_ref() {
+            StorageInner::Cpu { data } => data.len(),
+            StorageInner::Cuda { buffer } => buffer.len(),
+        }
+    }
+
+    /// Borrow the bytes of CPU storage. Panics if the storage is on a GPU.
+    pub fn as_cpu_bytes(&self) -> &[u8] {
+        match self.0.as_ref() {
+            StorageInner::Cpu { data } => data,
+            StorageInner::Cuda { .. } => panic!("cannot access GPU storage as CPU bytes"),
+        }
+    }
+
+    pub fn gpu_buffer(&self) -> &GpuBuffer {
+        match self.0.as_ref() {
+            StorageInner::Cuda { buffer } => buffer,
+            StorageInner::Cpu { .. } => panic!("cannot access CPU storage as GPU buffer"),
+        }
+    }
+
+    /// Copy to a different device. If already on the target device, clones the Arc (no copy).
+    ///
+    /// # Errors
+    /// Propagates any allocation or copy error reported by `xserv_cuda`.
+    pub fn to_device(&self, target: Device) -> CudaResult<Self> {
+        if self.device() == target {
+            return Ok(self.clone());
+        }
+        match (self.0.as_ref(), target) {
+            // NOTE(review): the target ordinal is not used; the buffer is presumably
+            // allocated on the current CUDA device — confirm.
+            (StorageInner::Cpu { data }, Device::Cuda(_)) => {
+                let mut buf = GpuBuffer::alloc(data.len())?;
+                buf.copy_from_host(data.as_slice())?;
+                Ok(Self::cuda(buf))
+            }
+            (StorageInner::Cuda { buffer }, Device::Cpu) => {
+                let mut data = vec![0u8; buffer.len()];
+                buffer.copy_to_host(&mut data)?;
+                Ok(Self::cpu(data))
+            }
+            (StorageInner::Cuda { .. }, Device::Cuda(_)) => self.deep_copy(),
+            // Covered by the early return above; listed so the match stays exhaustive.
+            (StorageInner::Cpu { .. }, Device::Cpu) => Ok(self.clone()),
+        }
+    }
+
+    /// Create a new owned copy of the storage on the same device.
+    /// Unlike `clone`, the result does not share its buffer with `self`.
+    pub fn deep_copy(&self) -> CudaResult<Self> {
+        match self.0.as_ref() {
+            StorageInner::Cpu { data } => Ok(Self::cpu(data.clone())),
+            StorageInner::Cuda { buffer } => {
+                let mut dst = GpuBuffer::alloc(buffer.len())?;
+                dst.copy_from_device(buffer)?;
+                Ok(Self::cuda(dst))
+            }
+        }
+    }
+
+    /// Allocate zeroed storage on the given device.
+    /// NOTE(review): the CUDA ordinal is not consulted here either — confirm intended.
+    pub fn zeros(len_bytes: usize, device: Device) -> CudaResult<Self> {
+        match device {
+            Device::Cpu => Ok(Self::cpu(vec![0u8; len_bytes])),
+            Device::Cuda(_) => {
+                let mut buf = GpuBuffer::alloc(len_bytes)?;
+                buf.zero()?;
+                Ok(Self::cuda(buf))
+            }
+        }
+    }
+}
diff --git a/crates/xserv-tensor/src/tensor.rs b/crates/xserv-tensor/src/tensor.rs
new file mode 100644
index 0000000..e94b9f4
--- /dev/null
+++ b/crates/xserv-tensor/src/tensor.rs
@@ -0,0 +1,228 @@
+use crate::dtype::{DType, TensorDType};
+use crate::shape::{self, Dims};
+use crate::storage::{Device, Storage};
+
+/// Multi-dimensional array with CPU or GPU storage.
+///
+/// Tensors support view semantics: transpose, slice, etc. share
+/// the underlying storage and only change shape/strides/offset.
+#[derive(Clone)]
+pub struct Tensor {
+    storage: Storage,
+    shape: Dims,
+    strides: Dims, // per-axis step, in elements (not bytes)
+    offset: usize, // index of the first element within `storage`, in elements
+    dtype: DType,
+}
+
+impl Tensor {
+    // --- Creation ---
+
+    /// Wrap `storage` as a row-major tensor starting at element 0.
+    fn from_storage(storage: Storage, shape: &[usize], dtype: DType) -> Self {
+        Self {
+            storage,
+            shape: Dims::from_slice(shape),
+            strides: shape::contiguous_strides(shape),
+            offset: 0,
+            dtype,
+        }
+    }
+
+    /// Copy `data` into a new contiguous CPU tensor of the given `shape`.
+    ///
+    /// # Panics
+    /// If `data.len()` is not the product of `shape`, or if `size_of::<T>()` differs
+    /// from `T::DTYPE.size_bytes()`.
+    pub fn from_slice<T: TensorDType>(data: &[T], shape: &[usize]) -> Self {
+        assert_eq!(data.len(), shape::num_elements(shape), "data length mismatch with shape");
+        assert_eq!(std::mem::size_of::<T>(), T::DTYPE.size_bytes(), "TensorDType size mismatch");
+        // SAFETY: `data` is a valid slice, so its `size_of_val(data)` bytes are readable and
+        // `u8` has alignment 1. Assumes `T` has no padding bytes (true for f32/f16/bf16).
+        let bytes = unsafe {
+            std::slice::from_raw_parts(data.as_ptr().cast::<u8>(), std::mem::size_of_val(data))
+        };
+        Self::from_storage(Storage::cpu(bytes.to_vec()), shape, T::DTYPE)
+    }
+
+    /// Allocate a zero-filled contiguous tensor on `device`. Panics if allocation fails.
+    pub fn zeros(shape: &[usize], dtype: DType, device: Device) -> Self {
+        let len_bytes = shape::num_elements(shape) * dtype.size_bytes();
+        let storage = Storage::zeros(len_bytes, device).expect("alloc failed");
+        Self::from_storage(storage, shape, dtype)
+    }
+
+    /// Build a CPU tensor whose elements are all one.
+    pub fn ones(shape: &[usize], dtype: DType) -> Self {
+        let numel = shape::num_elements(shape);
+        match dtype {
+            DType::F32 => Self::from_slice(&vec![1.0f32; numel], shape),
+            DType::F16 => Self::from_slice(&vec![half::f16::from_f32(1.0); numel], shape),
+            DType::BF16 => Self::from_slice(&vec![half::bf16::from_f32(1.0); numel], shape),
+        }
+    }
+
+    // --- Properties ---
+
+    pub fn shape(&self) -> &[usize] { &self.shape }
+    pub fn strides(&self) -> &[usize] { &self.strides }
+    pub fn dtype(&self) -> DType { self.dtype }
+    pub fn ndim(&self) -> usize { self.shape.len() }
+    pub fn numel(&self) -> usize { shape::num_elements(&self.shape) }
+    pub fn offset(&self) -> usize { self.offset }
+
+    pub fn device(&self) -> Device { self.storage.device() }
+
+    /// True when the strides equal the row-major strides for the current shape.
+    pub fn is_contiguous(&self) -> bool {
+        shape::is_contiguous(&self.shape, &self.strides)
+    }
+
+    // --- Shape operations (view, no copy) ---
+
+    /// New view over the same storage, offset and dtype with a different layout.
+    fn view(&self, shape: Dims, strides: Dims) -> Self {
+        Self {
+            storage: self.storage.clone(),
+            shape,
+            strides,
+            offset: self.offset,
+            dtype: self.dtype,
+        }
+    }
+
+    /// Reinterpret the elements under `new_shape` without copying.
+    ///
+    /// # Panics
+    /// If the tensor is not contiguous or the element counts differ.
+    pub fn reshape(&self, new_shape: &[usize]) -> Self {
+        assert!(self.is_contiguous(), "reshape requires contiguous tensor");
+        assert_eq!(shape::num_elements(new_shape), self.numel(), "reshape numel mismatch");
+        self.view(Dims::from_slice(new_shape), shape::contiguous_strides(new_shape))
+    }
+
+    /// Swap axes `dim0` and `dim1`. Panics if either axis is out of range.
+    pub fn transpose(&self, dim0: usize, dim1: usize) -> Self {
+        assert!(dim0 < self.ndim() && dim1 < self.ndim());
+        let (mut new_shape, mut new_strides) = (self.shape.clone(), self.strides.clone());
+        new_shape.swap(dim0, dim1);
+        new_strides.swap(dim0, dim1);
+        self.view(new_shape, new_strides)
+    }
+
+    /// Drop axis `dim`. Panics unless `dim` is in range and has extent 1.
+    pub fn squeeze(&self, dim: usize) -> Self {
+        assert!(dim < self.ndim() && self.shape[dim] == 1);
+        let (mut new_shape, mut new_strides) = (self.shape.clone(), self.strides.clone());
+        new_shape.remove(dim);
+        new_strides.remove(dim);
+        self.view(new_shape, new_strides)
+    }
+
+    /// Insert an axis of extent 1 at position `dim`. Panics if `dim > ndim()`.
+    pub fn unsqueeze(&self, dim: usize) -> Self {
+        assert!(dim <= self.ndim());
+        // Give the new axis the stride a row-major layout has at this position, so that
+        // unsqueezing a contiguous tensor yields a tensor that is still contiguous.
+        let stride = if dim < self.ndim() { self.shape[dim] * self.strides[dim] } else { 1 };
+        let (mut new_shape, mut new_strides) = (self.shape.clone(), self.strides.clone());
+        new_shape.insert(dim, 1);
+        new_strides.insert(dim, stride);
+        self.view(new_shape, new_strides)
+    }
+
+    /// Make contiguous: if already contiguous, return clone (shared storage).
+    /// Otherwise, copy data into a new contiguous buffer.
+    ///
+    /// # Panics
+    /// If the tensor is non-contiguous and its storage is on a GPU.
+    pub fn contiguous(&self) -> Self {
+        if self.is_contiguous() {
+            return self.clone();
+        }
+        // Copy to contiguous layout on CPU
+        assert_eq!(self.device(), Device::Cpu, "contiguous() on GPU not yet supported");
+        let elem_size = self.dtype.size_bytes();
+        let src_bytes = self.storage.as_cpu_bytes();
+        let mut dst = Vec::with_capacity(self.numel() * elem_size);
+        // Visit logical indices in row-major order, like an odometer over `shape`.
+        let mut idx = vec![0usize; self.ndim()];
+        for _ in 0..self.numel() {
+            let dot: usize = idx.iter().zip(&self.strides).map(|(i, s)| i * s).sum();
+            let start = (self.offset + dot) * elem_size;
+            dst.extend_from_slice(&src_bytes[start..start + elem_size]);
+            // Bump the rightmost axis; on wrap-around reset it and carry to the left.
+            for (i, &extent) in idx.iter_mut().zip(&self.shape).rev() {
+                *i += 1;
+                if *i < extent {
+                    break;
+                }
+                *i = 0;
+            }
+        }
+        Self {
+            storage: Storage::cpu(dst),
+            shape: self.shape.clone(),
+            strides: shape::contiguous_strides(&self.shape),
+            offset: 0,
+            dtype: self.dtype,
+        }
+    }
+
+    // --- Device transfer ---
+
+    /// Return this tensor on `device`, made contiguous first.
+    ///
+    /// # Panics
+    /// If the transfer fails, or if the tensor is non-contiguous and on a GPU
+    /// (see `contiguous`).
+    pub fn to_device(&self, device: Device) -> Self {
+        // `contiguous()` is just a cheap clone when the layout is already row-major.
+        let t = self.contiguous();
+        if t.device() == device {
+            return t;
+        }
+        let new_storage = t.storage.to_device(device).expect("device transfer failed");
+        // The whole buffer is transferred, so `t.offset` still addresses the same element.
+        Self { storage: new_storage, ..t }
+    }
+
+    // --- Data access (CPU only) ---
+
+    /// Read tensor data as a typed slice. Requires contiguous CPU tensor.
+    pub fn as_slice<T: TensorDType>(&self) -> &[T] {
+        assert_eq!(T::DTYPE, self.dtype, "dtype mismatch");
+        assert!(self.is_contiguous(), "as_slice requires contiguous");
+        assert_eq!(self.device(), Device::Cpu, "as_slice requires CPU");
+        let (len, start) = (self.numel(), self.offset * self.dtype.size_bytes());
+        if len == 0 {
+            return &[]; // never build a typed slice from a possibly dangling pointer
+        }
+        // Indexing with an explicit end bounds-checks every byte that will be exposed.
+        let bytes = &self.storage.as_cpu_bytes()[start..start + len * std::mem::size_of::<T>()];
+        assert_eq!(bytes.as_ptr() as usize % std::mem::align_of::<T>(), 0, "misaligned storage");
+        // SAFETY: in bounds and aligned (both checked above); floats accept any bit pattern.
+        unsafe { std::slice::from_raw_parts(bytes.as_ptr().cast::<T>(), len) }
+    }
+
+    /// Raw pointer to the first element (storage base plus offset), for GPU kernel launch.
+    pub fn data_ptr(&self) -> *const u8 {
+        let base = match self.device() {
+            Device::Cpu => self.storage.as_cpu_bytes().as_ptr(),
+            Device::Cuda(_) => self.storage.gpu_buffer().as_ptr(),
+        };
+        // SAFETY: relies on `offset` staying inside the allocation that `base` points into.
+        unsafe { base.add(self.offset * self.dtype.size_bytes()) }
+    }
+
+    pub fn storage(&self) -> &Storage { &self.storage }
+}
+
+impl std::fmt::Debug for Tensor {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f, "Tensor(shape={:?}, dtype={}, device={}, contiguous={})",
+            self.shape.as_slice(), self.dtype, self.device(), self.is_contiguous()
+        )
+    }
+}
diff --git a/crates/xserv-tensor/tests/integration.rs b/crates/xserv-tensor/tests/integration.rs
new file mode 100644
index 0000000..99b7bdd
--- /dev/null
+++ b/crates/xserv-tensor/tests/integration.rs
@@ -0,0 +1,127 @@
+use half::bf16;
+use xserv_tensor::*;
+
+#[test]
+fn test_from_slice_and_shape() {
+    let data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
+    let t = Tensor::from_slice(&data, &[2, 3]);
+    assert_eq!(t.shape(), &[2, 3]);
+    assert_eq!(t.strides(), &[3, 1]);
+    assert_eq!(t.numel(), 6);
+    assert_eq!(t.ndim(), 2);
+    assert!(t.is_contiguous());
+    assert_eq!(t.dtype(), DType::F32);
+    assert_eq!(t.device(), Device::Cpu);
+}
+
+#[test]
+fn test_as_slice() {
+    let data = vec![1.0f32, 2.0, 3.0, 4.0];
+    let t = Tensor::from_slice(&data, &[4]);
+    assert_eq!(t.as_slice::<f32>(), &[1.0, 2.0, 3.0, 4.0]);
+}
+
+#[test]
+fn test_zeros_and_ones() {
+    let z = Tensor::zeros(&[2, 3], DType::F32, Device::Cpu);
+    assert_eq!(z.as_slice::<f32>(), &[0.0; 6]);
+
+    let o = Tensor::ones(&[3], DType::F32);
+    assert_eq!(o.as_slice::<f32>(), &[1.0, 1.0, 1.0]);
+}
+
+#[test]
+fn test_bf16_tensor() {
+    let data: Vec<bf16> = vec![bf16::from_f32(1.0), bf16::from_f32(2.5), bf16::from_f32(-3.0)];
+    let t = Tensor::from_slice(&data, &[3]);
+    assert_eq!(t.dtype(), DType::BF16);
+    let out = t.as_slice::<bf16>();
+    assert_eq!(out[0].to_f32(), 1.0);
+    assert!((out[1].to_f32() - 2.5).abs() < 0.01);
+    assert_eq!(out[2].to_f32(), -3.0);
+}
+
+#[test]
+fn test_reshape() {
+    let data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
+    let t = Tensor::from_slice(&data, &[2, 3]);
+    let t2 = t.reshape(&[3, 2]);
+    assert_eq!(t2.shape(), &[3, 2]);
+    assert_eq!(t2.as_slice::<f32>(), &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
+
+    let t3 = t.reshape(&[6]);
+    assert_eq!(t3.shape(), &[6]);
+}
+
+#[test]
+fn test_transpose() {
+    let data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
+    let t = Tensor::from_slice(&data, &[2, 3]);
+    let tt = t.transpose(0, 1);
+    assert_eq!(tt.shape(), &[3, 2]);
+    assert_eq!(tt.strides(), &[1, 3]);
+    assert!(!tt.is_contiguous());
+}
+
+#[test]
+fn test_contiguous_from_transpose() {
+    let data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
+    // Original [2,3]: [[1,2,3],[4,5,6]]
+    let t = Tensor::from_slice(&data, &[2, 3]);
+    // Transpose to [3,2]: [[1,4],[2,5],[3,6]]
+    let tt = t.transpose(0, 1);
+    let tc = tt.contiguous();
+    assert!(tc.is_contiguous());
+    assert_eq!(tc.shape(), &[3, 2]);
+    assert_eq!(tc.as_slice::<f32>(), &[1.0, 4.0, 2.0, 5.0, 3.0, 6.0]);
+}
+
+#[test]
+fn test_squeeze_unsqueeze() {
+    let data = vec![1.0f32, 2.0, 3.0];
+    let t = Tensor::from_slice(&data, &[1, 3]);
+    let squeezed = t.squeeze(0);
+    assert_eq!(squeezed.shape(), &[3]);
+
+    let unsqueezed = squeezed.unsqueeze(0);
+    assert_eq!(unsqueezed.shape(), &[1, 3]);
+
+    let unsqueezed2 = squeezed.unsqueeze(1);
+    assert_eq!(unsqueezed2.shape(), &[3, 1]);
+}
+
+#[test]
+fn test_cpu_to_gpu_roundtrip() {
+    xserv_cuda::device::set_device(0).unwrap(); // needs CUDA device 0 to be present
+
+    let data = vec![1.0f32, 2.0, 3.0, 4.0];
+    let cpu_t = Tensor::from_slice(&data, &[2, 2]);
+    let gpu_t = cpu_t.to_device(Device::Cuda(0));
+    assert_eq!(gpu_t.device(), Device::Cuda(0));
+    assert_eq!(gpu_t.shape(), &[2, 2]);
+
+    let back = gpu_t.to_device(Device::Cpu);
+    assert_eq!(back.device(), Device::Cpu);
+    assert_eq!(back.as_slice::<f32>(), &[1.0, 2.0, 3.0, 4.0]);
+}
+
+#[test]
+fn test_zeros_gpu() {
+    xserv_cuda::device::set_device(0).unwrap(); // needs CUDA device 0 to be present
+
+    let t = Tensor::zeros(&[4, 4], DType::F32, Device::Cuda(0));
+    assert_eq!(t.device(), Device::Cuda(0));
+    assert_eq!(t.shape(), &[4, 4]);
+
+    let cpu = t.to_device(Device::Cpu);
+    assert_eq!(cpu.as_slice::<f32>(), &[0.0f32; 16]);
+}
+
+#[test]
+fn test_debug_format() {
+    let t = Tensor::from_slice(&[1.0f32], &[1]);
+    let dbg = format!("{:?}", t);
+    assert!(dbg.contains("shape=[1]"));
+    assert!(dbg.contains("f32"));
+    assert!(dbg.contains("cpu"));
+}