phase 2: tensor abstraction layer

- DType enum (F32, F16, BF16) with TensorDType trait - Shape utilities: contiguous_strides, broadcast_shape, broadcast_strides - Storage with Arc reference counting (CPU Vec<u8> or GPU GpuBuffer) - Device enum (Cpu, Cuda(id)) with to_device transfer - Tensor type with strided layout: reshape, transpose, squeeze, unsqueeze - contiguous() copies non-contiguous views to contiguous layout - from_slice, zeros, ones constructors - as_slice<T> for typed CPU read access, data_ptr for GPU kernel launch - CPU↔GPU roundtrip verified - All 27 tests pass (12 cuda + 4 shape + 11 tensor) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-21 19:45:22 +08:00
parent c8f7bc0c3c
commit a83971fa25
8 changed files with 654 additions and 0 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,6 +2,7 @@
 resolver = "2"
 members = [
    "crates/xserv-cuda",
    "crates/xserv-tensor",
 ]
 [workspace.package]
--- a/crates/xserv-tensor/Cargo.toml
+++ b/crates/xserv-tensor/Cargo.toml
@@ -0,0 +1,9 @@
 [package]
 name = "xserv-tensor"
 version.workspace = true
 edition.workspace = true
 [dependencies]
 xserv-cuda = { path = "../xserv-cuda" }
 half.workspace = true
 smallvec.workspace = true
--- a/crates/xserv-tensor/src/dtype.rs
+++ b/crates/xserv-tensor/src/dtype.rs
@@ -0,0 +1,57 @@
 use half::{bf16, f16};
 /// Element type tag stored alongside a tensor's data.
 ///
 /// The byte width of each variant is reported by `DType::size_bytes`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum DType {
    /// 32-bit float (`f32`), 4 bytes per element.
    F32,
    /// Half-precision float (`half::f16`), 2 bytes per element.
    F16,
    /// bfloat16 (`half::bf16`), 2 bytes per element.
    BF16,
 }
 impl DType {
    pub fn size_bytes(self) -> usize {
        match self {
            DType::F32 => 4,
            DType::F16 => 2,
            DType::BF16 => 2,
        }
    }
    pub fn name(self) -> &'static str {
        match self {
            DType::F32 => "f32",
            DType::F16 => "f16",
            DType::BF16 => "bf16",
        }
    }
 }
 impl std::fmt::Display for DType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.name())
    }
 }
 /// Trait for types that can be stored in a Tensor.
 ///
 /// Implementors must be `Copy + Send + Sync + 'static` and name the
 /// `DType` tag that describes them.
 pub trait TensorDType: Copy + Send + Sync + 'static {
    /// Runtime tag corresponding to the implementing Rust type.
    const DTYPE: DType;
    /// Convert this element to `f64`.
    fn to_f64(self) -> f64;
    /// Build an element from an `f64`; implementations may lose precision
    /// when the target type is narrower.
    fn from_f64(v: f64) -> Self;
 }
 impl TensorDType for f32 {
    const DTYPE: DType = DType::F32;

    /// Widen to `f64`; every `f32` value is exactly representable.
    fn to_f64(self) -> f64 {
        f64::from(self)
    }

    /// Narrow with an `as` cast (rounds to the nearest `f32`).
    fn from_f64(v: f64) -> Self {
        v as Self
    }
 }
 impl TensorDType for f16 {
    const DTYPE: DType = DType::F16;

    /// Widen via `f32`, then to `f64`.
    fn to_f64(self) -> f64 {
        f64::from(self.to_f32())
    }

    /// Narrow to `f32` first, then convert that value to half precision.
    fn from_f64(v: f64) -> Self {
        let narrowed = v as f32;
        Self::from_f32(narrowed)
    }
 }
 impl TensorDType for bf16 {
    const DTYPE: DType = DType::BF16;

    /// Widen via `f32`, then to `f64`.
    fn to_f64(self) -> f64 {
        f64::from(self.to_f32())
    }

    /// Narrow to `f32` first, then convert that value to bfloat16.
    fn from_f64(v: f64) -> Self {
        let narrowed = v as f32;
        Self::from_f32(narrowed)
    }
 }
--- a/crates/xserv-tensor/src/lib.rs
+++ b/crates/xserv-tensor/src/lib.rs
@@ -0,0 +1,8 @@
 pub mod dtype;
 pub mod shape;
 pub mod storage;
 pub mod tensor;
 pub use dtype::{DType, TensorDType};
 pub use storage::Device;
 pub use tensor::Tensor;
--- a/crates/xserv-tensor/src/shape.rs
+++ b/crates/xserv-tensor/src/shape.rs
@@ -0,0 +1,105 @@
 use smallvec::SmallVec;
 /// Per-axis list of sizes or strides. Backed by a `SmallVec` with inline
 /// room for 4 entries, so tensors of rank <= 4 need no heap allocation here.
 pub type Dims = SmallVec<[usize; 4]>;
 /// Row-major (C order) strides, in elements, for `shape`.
 ///
 /// The innermost axis gets stride 1 and each outer axis gets the product of
 /// all extents to its right, e.g. shape `[2, 3, 4]` => strides `[12, 4, 1]`.
 /// An empty shape yields an empty stride list.
 pub fn contiguous_strides(shape: &[usize]) -> Dims {
    let rank = shape.len();
    let mut strides: Dims = SmallVec::with_capacity(rank);
    strides.resize(rank, 0);
    if let Some(innermost) = rank.checked_sub(1) {
        strides[innermost] = 1;
        // Fill outward: each stride is the inner stride times the inner extent.
        for axis in (0..innermost).rev() {
            let inner = axis + 1;
            strides[axis] = strides[inner] * shape[inner];
        }
    }
    strides
 }
 /// Check if the given strides describe a densely packed row-major layout for the shape.
 ///
 /// Returns `true` when walking the tensor in row-major order visits
 /// consecutive element positions. Two cases are treated as contiguous even
 /// though the strides need not equal the canonical row-major strides:
 ///
 /// * axes of extent 1 — such an axis is only ever indexed at 0, so its stride
 ///   never contributes to an element address (e.g. shape `[1, 3]` with
 ///   strides `[1, 1]`);
 /// * shapes containing a 0 extent — there are no elements to lay out.
 ///
 /// An empty shape is always contiguous. A rank mismatch between a non-empty
 /// `shape` and `strides` is never contiguous.
 pub fn is_contiguous(shape: &[usize], strides: &[usize]) -> bool {
    if shape.is_empty() {
        return true;
    }
    if shape.len() != strides.len() {
        return false;
    }
    if shape.contains(&0) {
        return true;
    }
    // Walk from the innermost axis outward, tracking the stride a dense
    // layout would need at each axis that is actually stepped along.
    let mut expected = 1usize;
    for (&extent, &stride) in shape.iter().zip(strides).rev() {
        if extent == 1 {
            continue;
        }
        if stride != expected {
            return false;
        }
        expected *= extent;
    }
    true
 }
 /// Element count of a tensor with the given shape: the product of all
 /// extents, which is 1 for an empty shape.
 pub fn num_elements(shape: &[usize]) -> usize {
    let mut count = 1usize;
    for &extent in shape {
        count *= extent;
    }
    count
 }
 /// Shape that results from broadcasting `a` against `b` (NumPy rules).
 ///
 /// The shorter shape is treated as if it were left-padded with 1s. Axis by
 /// axis, the extents must either match or one of them must be 1; otherwise
 /// the shapes are incompatible and `None` is returned.
 pub fn broadcast_shape(a: &[usize], b: &[usize]) -> Option<Dims> {
    let rank = a.len().max(b.len());
    // Number of implicit leading 1-axes each operand needs to reach `rank`.
    let pad_a = rank - a.len();
    let pad_b = rank - b.len();
    let mut merged = SmallVec::with_capacity(rank);
    for axis in 0..rank {
        let lhs = axis.checked_sub(pad_a).map_or(1, |i| a[i]);
        let rhs = axis.checked_sub(pad_b).map_or(1, |i| b[i]);
        let extent = match (lhs, rhs) {
            (x, y) if x == y => x,
            (1, y) => y,
            (x, 1) => x,
            _ => return None,
        };
        merged.push(extent);
    }
    Some(merged)
 }
 /// Compute broadcast strides: for dimensions where size is 1 but output is >1, stride becomes 0.
 ///
 /// `shape`/`strides` describe the source view and `target_shape` the
 /// broadcast result. The source is right-aligned against the target; target
 /// axes that have no source counterpart (the implicit leading axes) also get
 /// stride 0. All other axes keep their source stride.
 ///
 /// This function does not verify that `shape` is actually broadcastable to
 /// `target_shape`; use `broadcast_shape` for that.
 ///
 /// # Panics
 /// Panics if `shape` has more axes than `target_shape`, or if `strides` is
 /// shorter than `shape`.
 pub fn broadcast_strides(shape: &[usize], strides: &[usize], target_shape: &[usize]) -> Dims {
    let ndim = target_shape.len();
    // Without this check the subtraction below underflows: an opaque overflow
    // panic in debug builds and silently all-zero strides in release builds.
    assert!(
        shape.len() <= ndim,
        "broadcast_strides: source rank {} exceeds target rank {}",
        shape.len(),
        ndim
    );
    let offset = ndim - shape.len();
    let mut result: Dims = SmallVec::with_capacity(ndim);
    for (i, &target_extent) in target_shape.iter().enumerate() {
        let stride = match i.checked_sub(offset) {
            // Axis exists in the source and is not being stretched: keep its stride.
            Some(src) if !(shape[src] == 1 && target_extent > 1) => strides[src],
            // Implicit leading axis, or a size-1 axis stretched to `target_extent`.
            _ => 0,
        };
        result.push(stride);
    }
    result
 }
 #[cfg(test)]
 mod tests {
    use super::*;

    /// Row-major strides for rank-3, rank-1 and rank-2 shapes.
    #[test]
    fn test_contiguous_strides() {
        let rank3 = contiguous_strides(&[2, 3, 4]);
        assert_eq!(rank3.as_slice(), &[12, 4, 1]);
        let rank1 = contiguous_strides(&[5]);
        assert_eq!(rank1.as_slice(), &[1]);
        let rank2 = contiguous_strides(&[2, 3]);
        assert_eq!(rank2.as_slice(), &[3, 1]);
    }

    /// Canonical strides are contiguous; a transposed layout is not.
    #[test]
    fn test_is_contiguous() {
        let row_major = is_contiguous(&[2, 3], &[3, 1]);
        assert!(row_major);
        let transposed = is_contiguous(&[3, 2], &[1, 3]);
        assert!(!transposed);
    }

    /// Compatible shapes merge axis by axis; mismatched extents yield `None`.
    #[test]
    fn test_broadcast_shape() {
        let outer = broadcast_shape(&[3, 1], &[1, 4]).unwrap();
        assert_eq!(outer.as_slice(), &[3, 4]);
        let trailing = broadcast_shape(&[2, 3, 4], &[4]).unwrap();
        assert_eq!(trailing.as_slice(), &[2, 3, 4]);
        let scalar_like = broadcast_shape(&[1], &[5, 3]).unwrap();
        assert_eq!(scalar_like.as_slice(), &[5, 3]);
        assert!(broadcast_shape(&[3], &[4]).is_none());
    }

    /// A size-1 axis stretched to 4 gets stride 0; the other axis keeps its stride.
    #[test]
    fn test_broadcast_strides() {
        let stretched = broadcast_strides(&[3, 1], &[1, 1], &[3, 4]);
        assert_eq!(stretched.as_slice(), &[1, 0]);
    }
 }
--- a/crates/xserv-tensor/src/storage.rs
+++ b/crates/xserv-tensor/src/storage.rs
@@ -0,0 +1,119 @@
 use std::sync::Arc;
 use xserv_cuda::{GpuBuffer, Result as CudaResult};
 /// Backing allocation behind a `Storage`: either host bytes or a GPU buffer.
 enum StorageInner {
    /// Host memory, held as untyped bytes.
    Cpu { data: Vec<u8> },
    /// Device memory owned by an `xserv_cuda::GpuBuffer`.
    Cuda { buffer: GpuBuffer },
 }
 /// Reference-counted storage for tensor data. Multiple tensors can share
 /// the same storage (e.g., after transpose or slice — view semantics).
 ///
 /// `Clone` only clones the inner `Arc`, so clones alias the same bytes;
 /// use `deep_copy` for an independent buffer.
 #[derive(Clone)]
 pub struct Storage(Arc<StorageInner>);
 /// Where a storage's bytes live.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum Device {
    /// Host memory.
    Cpu,
    /// GPU memory; the payload is printed as `cuda:<n>` and is presumably the
    /// CUDA device ordinal — TODO confirm against `xserv_cuda`.
    Cuda(u32),
 }
 impl std::fmt::Display for Device {
    /// Renders `cpu` for the host and `cuda:<n>` for a GPU.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            Device::Cpu => f.write_str("cpu"),
            Device::Cuda(index) => write!(f, "cuda:{index}"),
        }
    }
 }
 impl Storage {
    pub fn cpu(data: Vec<u8>) -> Self {
        Self(Arc::new(StorageInner::Cpu { data }))
    }
    pub fn cuda(buffer: GpuBuffer) -> Self {
        Self(Arc::new(StorageInner::Cuda { buffer }))
    }
    pub fn device(&self) -> Device {
        match self.0.as_ref() {
            StorageInner::Cpu { .. } => Device::Cpu,
            StorageInner::Cuda { .. } => Device::Cuda(0),
        }
    }
    pub fn len_bytes(&self) -> usize {
        match self.0.as_ref() {
            StorageInner::Cpu { data } => data.len(),
            StorageInner::Cuda { buffer } => buffer.len(),
        }
    }
    /// Get a read-only view of CPU data. Panics if storage is on GPU.
    pub fn as_cpu_bytes(&self) -> &[u8] {
        match self.0.as_ref() {
            StorageInner::Cpu { data } => data,
            StorageInner::Cuda { .. } => panic!("cannot access GPU storage as CPU bytes"),
        }
    }
    pub fn gpu_buffer(&self) -> &GpuBuffer {
        match self.0.as_ref() {
            StorageInner::Cuda { buffer } => buffer,
            StorageInner::Cpu { .. } => panic!("cannot access CPU storage as GPU buffer"),
        }
    }
    /// Copy to a different device. If already on the target device, clones the Arc (no copy).
    pub fn to_device(&self, target: Device) -> CudaResult<Self> {
        let current = self.device();
        if current == target {
            return Ok(self.clone());
        }
        match (current, target) {
            (Device::Cpu, Device::Cuda(_dev)) => {
                let cpu_data = self.as_cpu_bytes();
                let mut buf = GpuBuffer::alloc(cpu_data.len())?;
                buf.copy_from_host(cpu_data)?;
                Ok(Storage::cuda(buf))
            }
            (Device::Cuda(_), Device::Cpu) => {
                let gpu_buf = self.gpu_buffer();
                let mut data = vec![0u8; gpu_buf.len()];
                gpu_buf.copy_to_host(&mut data)?;
                Ok(Storage::cpu(data))
            }
            (Device::Cuda(_), Device::Cuda(_)) => {
                let src = self.gpu_buffer();
                let mut dst = GpuBuffer::alloc(src.len())?;
                dst.copy_from_device(src)?;
                Ok(Storage::cuda(dst))
            }
            _ => unreachable!(),
        }
    }
    /// Create a new owned copy of the storage on the same device.
    pub fn deep_copy(&self) -> CudaResult<Self> {
        match self.0.as_ref() {
            StorageInner::Cpu { data } => Ok(Storage::cpu(data.clone())),
            StorageInner::Cuda { buffer } => {
                let mut dst = GpuBuffer::alloc(buffer.len())?;
                dst.copy_from_device(buffer)?;
                Ok(Storage::cuda(dst))
            }
        }
    }
    /// Allocate zeroed storage on the given device.
    pub fn zeros(len_bytes: usize, device: Device) -> CudaResult<Self> {
        match device {
            Device::Cpu => Ok(Storage::cpu(vec![0u8; len_bytes])),
            Device::Cuda(_) => {
                let mut buf = GpuBuffer::alloc(len_bytes)?;
                buf.zero()?;
                Ok(Storage::cuda(buf))
            }
        }
    }
 }
--- a/crates/xserv-tensor/src/tensor.rs
+++ b/crates/xserv-tensor/src/tensor.rs
@@ -0,0 +1,228 @@
 use crate::dtype::{DType, TensorDType};
 use crate::shape::{self, Dims};
 use crate::storage::{Device, Storage};
 /// Multi-dimensional array with CPU or GPU storage.
 ///
 /// Tensors support view semantics: transpose, slice, etc. share
 /// the underlying storage and only change shape/strides/offset.
 #[derive(Clone)]
 pub struct Tensor {
    /// Reference-counted backing buffer; cloning a tensor shares it.
    storage: Storage,
    /// Extent of each axis.
    shape: Dims,
    /// Step between consecutive indices along each axis, in elements (not bytes).
    strides: Dims,
    /// Position of the first element inside `storage`, in elements (not bytes).
    offset: usize,
    /// Element type of the stored data.
    dtype: DType,
 }
 impl Tensor {
    // --- Creation ---
    pub fn from_slice<T: TensorDType>(data: &[T], shape: &[usize]) -> Self {
        let numel: usize = shape.iter().product();
        assert_eq!(data.len(), numel, "data length mismatch with shape");
        let bytes = unsafe {
            std::slice::from_raw_parts(data.as_ptr() as *const u8, numel * T::DTYPE.size_bytes())
        };
        Self {
            storage: Storage::cpu(bytes.to_vec()),
            shape: Dims::from_slice(shape),
            strides: shape::contiguous_strides(shape),
            offset: 0,
            dtype: T::DTYPE,
        }
    }
    pub fn zeros(shape: &[usize], dtype: DType, device: Device) -> Self {
        let numel = shape::num_elements(shape);
        let len_bytes = numel * dtype.size_bytes();
        let storage = Storage::zeros(len_bytes, device).expect("alloc failed");
        Self {
            storage,
            shape: Dims::from_slice(shape),
            strides: shape::contiguous_strides(shape),
            offset: 0,
            dtype,
        }
    }
    pub fn ones(shape: &[usize], dtype: DType) -> Self {
        let numel = shape::num_elements(shape);
        match dtype {
            DType::F32 => Self::from_slice(&vec![1.0f32; numel], shape),
            DType::F16 => Self::from_slice(&vec![half::f16::from_f32(1.0); numel], shape),
            DType::BF16 => Self::from_slice(&vec![half::bf16::from_f32(1.0); numel], shape),
        }
    }
    // --- Properties ---
    pub fn shape(&self) -> &[usize] { &self.shape }
    pub fn strides(&self) -> &[usize] { &self.strides }
    pub fn dtype(&self) -> DType { self.dtype }
    pub fn ndim(&self) -> usize { self.shape.len() }
    pub fn numel(&self) -> usize { shape::num_elements(&self.shape) }
    pub fn offset(&self) -> usize { self.offset }
    pub fn device(&self) -> Device { self.storage.device() }
    pub fn is_contiguous(&self) -> bool {
        shape::is_contiguous(&self.shape, &self.strides)
    }
    // --- Shape operations (view, no copy) ---
    pub fn reshape(&self, new_shape: &[usize]) -> Self {
        assert!(self.is_contiguous(), "reshape requires contiguous tensor");
        let new_numel: usize = new_shape.iter().product();
        assert_eq!(new_numel, self.numel(), "reshape numel mismatch");
        Self {
            storage: self.storage.clone(),
            shape: Dims::from_slice(new_shape),
            strides: shape::contiguous_strides(new_shape),
            offset: self.offset,
            dtype: self.dtype,
        }
    }
    pub fn transpose(&self, dim0: usize, dim1: usize) -> Self {
        assert!(dim0 < self.ndim() && dim1 < self.ndim());
        let mut new_shape = self.shape.clone();
        let mut new_strides = self.strides.clone();
        new_shape.swap(dim0, dim1);
        new_strides.swap(dim0, dim1);
        Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
            dtype: self.dtype,
        }
    }
    pub fn squeeze(&self, dim: usize) -> Self {
        assert!(dim < self.ndim() && self.shape[dim] == 1);
        let mut new_shape = self.shape.clone();
        let mut new_strides = self.strides.clone();
        new_shape.remove(dim);
        new_strides.remove(dim);
        Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
            dtype: self.dtype,
        }
    }
    pub fn unsqueeze(&self, dim: usize) -> Self {
        assert!(dim <= self.ndim());
        let mut new_shape = self.shape.clone();
        let mut new_strides = self.strides.clone();
        new_shape.insert(dim, 1);
        let stride_val = if dim < self.strides.len() { self.strides[dim] } else { 1 };
        new_strides.insert(dim, stride_val);
        Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
            dtype: self.dtype,
        }
    }
    /// Make contiguous: if already contiguous, return clone (shared storage).
    /// Otherwise, copy data into a new contiguous buffer.
    pub fn contiguous(&self) -> Self {
        if self.is_contiguous() {
            return self.clone();
        }
        // Copy to contiguous layout on CPU
        assert_eq!(self.device(), Device::Cpu, "contiguous() on GPU not yet supported");
        let numel = self.numel();
        let elem_size = self.dtype.size_bytes();
        let src_bytes = self.storage.as_cpu_bytes();
        let mut dst = vec![0u8; numel * elem_size];
        // Iterate all elements using strides
        let ndim = self.ndim();
        let mut idx = vec![0usize; ndim];
        for flat in 0..numel {
            let src_offset = self.offset + idx.iter().zip(self.strides.iter()).map(|(i, s)| i * s).sum::<usize>();
            let src_byte_offset = src_offset * elem_size;
            let dst_byte_offset = flat * elem_size;
            dst[dst_byte_offset..dst_byte_offset + elem_size]
                .copy_from_slice(&src_bytes[src_byte_offset..src_byte_offset + elem_size]);
            // Increment index (rightmost first)
            for d in (0..ndim).rev() {
                idx[d] += 1;
                if idx[d] < self.shape[d] {
                    break;
                }
                idx[d] = 0;
            }
        }
        Self {
            storage: Storage::cpu(dst),
            shape: self.shape.clone(),
            strides: shape::contiguous_strides(&self.shape),
            offset: 0,
            dtype: self.dtype,
        }
    }
    // --- Device transfer ---
    pub fn to_device(&self, device: Device) -> Self {
        let t = if self.is_contiguous() { self.clone() } else { self.contiguous() };
        if t.device() == device {
            return t;
        }
        let new_storage = t.storage.to_device(device).expect("device transfer failed");
        Self {
            storage: new_storage,
            shape: t.shape,
            strides: t.strides,
            offset: 0,
            dtype: t.dtype,
        }
    }
    // --- Data access (CPU only) ---
    /// Read tensor data as a typed slice. Requires contiguous CPU tensor.
    pub fn as_slice<T: TensorDType>(&self) -> &[T] {
        assert_eq!(T::DTYPE, self.dtype, "dtype mismatch");
        assert!(self.is_contiguous(), "as_slice requires contiguous");
        assert_eq!(self.device(), Device::Cpu, "as_slice requires CPU");
        let bytes = self.storage.as_cpu_bytes();
        let elem_size = self.dtype.size_bytes();
        let start = self.offset * elem_size;
        let len = self.numel();
        unsafe { std::slice::from_raw_parts(bytes[start..].as_ptr() as *const T, len) }
    }
    /// Raw pointer to storage start (for GPU kernel launch).
    pub fn data_ptr(&self) -> *const u8 {
        match self.device() {
            Device::Cpu => {
                let bytes = self.storage.as_cpu_bytes();
                unsafe { bytes.as_ptr().add(self.offset * self.dtype.size_bytes()) }
            }
            Device::Cuda(_) => {
                let buf = self.storage.gpu_buffer();
                unsafe { buf.as_ptr().add(self.offset * self.dtype.size_bytes()) }
            }
        }
    }
    pub fn storage(&self) -> &Storage { &self.storage }
 }
 impl std::fmt::Debug for Tensor {
    /// One-line summary of the tensor's metadata; element values are not printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let shape = self.shape.as_slice();
        let dtype = self.dtype;
        let device = self.device();
        let contiguous = self.is_contiguous();
        write!(
            f, "Tensor(shape={:?}, dtype={}, device={}, contiguous={})",
            shape, dtype, device, contiguous
        )
    }
 }
--- a/crates/xserv-tensor/tests/integration.rs
+++ b/crates/xserv-tensor/tests/integration.rs
@@ -0,0 +1,127 @@
 use half::bf16;
 use xserv_tensor::*;
 #[test]
 fn test_from_slice_and_shape() {
    // Six values arranged as a 2x3 row-major tensor.
    let values = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
    let tensor = Tensor::from_slice(&values, &[2, 3]);

    // Layout metadata.
    assert_eq!(tensor.shape(), &[2, 3]);
    assert_eq!(tensor.strides(), &[3, 1]);
    assert_eq!(tensor.numel(), 6);
    assert_eq!(tensor.ndim(), 2);
    assert!(tensor.is_contiguous());

    // Type and placement.
    assert_eq!(tensor.dtype(), DType::F32);
    assert_eq!(tensor.device(), Device::Cpu);
 }
 #[test]
 fn test_as_slice() {
    // Data written through from_slice reads back unchanged.
    let values = [1.0f32, 2.0, 3.0, 4.0];
    let tensor = Tensor::from_slice(&values, &[4]);
    let read_back = tensor.as_slice::<f32>();
    assert_eq!(read_back, &[1.0, 2.0, 3.0, 4.0]);
 }
 #[test]
 fn test_zeros_and_ones() {
    // zeros: all six elements of a 2x3 CPU tensor read back as 0.0.
    let zero_tensor = Tensor::zeros(&[2, 3], DType::F32, Device::Cpu);
    assert_eq!(zero_tensor.as_slice::<f32>(), &[0.0; 6]);

    // ones: all three elements read back as 1.0.
    let one_tensor = Tensor::ones(&[3], DType::F32);
    assert_eq!(one_tensor.as_slice::<f32>(), &[1.0, 1.0, 1.0]);
 }
 #[test]
 fn test_bf16_tensor() {
    // Build the bf16 inputs from their f32 counterparts.
    let values: Vec<bf16> = [1.0f32, 2.5, -3.0]
        .iter()
        .map(|&v| bf16::from_f32(v))
        .collect();
    let tensor = Tensor::from_slice(&values, &[3]);
    assert_eq!(tensor.dtype(), DType::BF16);

    let read_back = tensor.as_slice::<bf16>();
    assert_eq!(read_back[0].to_f32(), 1.0);
    assert!((read_back[1].to_f32() - 2.5).abs() < 0.01);
    assert_eq!(read_back[2].to_f32(), -3.0);
 }
 #[test]
 fn test_reshape() {
    let values = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
    let original = Tensor::from_slice(&values, &[2, 3]);

    // 2x3 -> 3x2 keeps the element order.
    let as_3x2 = original.reshape(&[3, 2]);
    assert_eq!(as_3x2.shape(), &[3, 2]);
    assert_eq!(as_3x2.as_slice::<f32>(), &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);

    // 2x3 -> flat vector.
    let flat = original.reshape(&[6]);
    assert_eq!(flat.shape(), &[6]);
 }
 #[test]
 fn test_transpose() {
    let values = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
    let original = Tensor::from_slice(&values, &[2, 3]);

    // Swapping the axes swaps both shape and strides and breaks contiguity.
    let swapped = original.transpose(0, 1);
    assert_eq!(swapped.shape(), &[3, 2]);
    assert_eq!(swapped.strides(), &[1, 3]);
    assert!(!swapped.is_contiguous());
 }
 #[test]
 fn test_contiguous_from_transpose() {
    // Source matrix, 2x3: [[1,2,3],[4,5,6]].
    let values = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
    let original = Tensor::from_slice(&values, &[2, 3]);

    // Its transpose, 3x2: [[1,4],[2,5],[3,6]], materialised densely.
    let packed = original.transpose(0, 1).contiguous();
    assert!(packed.is_contiguous());
    assert_eq!(packed.shape(), &[3, 2]);
    assert_eq!(packed.as_slice::<f32>(), &[1.0, 4.0, 2.0, 5.0, 3.0, 6.0]);
 }
 #[test]
 fn test_squeeze_unsqueeze() {
    let values = [1.0f32, 2.0, 3.0];
    let row = Tensor::from_slice(&values, &[1, 3]);

    // Dropping the leading unit axis leaves a plain vector.
    let vector = row.squeeze(0);
    assert_eq!(vector.shape(), &[3]);

    // Re-inserting a unit axis in front, or appending one at the end.
    let leading = vector.unsqueeze(0);
    assert_eq!(leading.shape(), &[1, 3]);
    let trailing = vector.unsqueeze(1);
    assert_eq!(trailing.shape(), &[3, 1]);
 }
 #[test]
 fn test_cpu_to_gpu_roundtrip() {
    xserv_cuda::device::set_device(0).unwrap();
    let values = [1.0f32, 2.0, 3.0, 4.0];
    let host = Tensor::from_slice(&values, &[2, 2]);

    // Upload: placement changes, shape is preserved.
    let on_gpu = host.to_device(Device::Cuda(0));
    assert_eq!(on_gpu.device(), Device::Cuda(0));
    assert_eq!(on_gpu.shape(), &[2, 2]);

    // Download: the data survives the round trip.
    let returned = on_gpu.to_device(Device::Cpu);
    assert_eq!(returned.device(), Device::Cpu);
    assert_eq!(returned.as_slice::<f32>(), &[1.0, 2.0, 3.0, 4.0]);
 }
 #[test]
 fn test_zeros_gpu() {
    xserv_cuda::device::set_device(0).unwrap();
    let on_gpu = Tensor::zeros(&[4, 4], DType::F32, Device::Cuda(0));
    assert_eq!(on_gpu.device(), Device::Cuda(0));
    assert_eq!(on_gpu.shape(), &[4, 4]);

    // Bring the buffer back to the host to inspect its contents.
    let on_host = on_gpu.to_device(Device::Cpu);
    assert_eq!(on_host.as_slice::<f32>(), &[0.0f32; 16]);
 }
 #[test]
 fn test_debug_format() {
    let tensor = Tensor::from_slice(&[1.0f32], &[1]);
    let rendered = format!("{:?}", tensor);
    // The summary must mention shape, dtype and device.
    for fragment in ["shape=[1]", "f32", "cpu"] {
        assert!(rendered.contains(fragment));
    }
 }