Strict code review identified 30+ issues across correctness, performance, and architecture. This commit addresses 14 of them with verified fixes, restructures Phase 12 for honest continuous batching, and updates Phase 14 to target FA2 (RTX 5090 SM120 lacks the TMEM required by FA4).

Bug fixes:
- FIX-01: Global cuBLAS handle (thread-local singleton, was per-call)
- FIX-02: Remove 19 unnecessary cudaDeviceSynchronize calls from kernels
- FIX-03: Qwen3 ChatML template (was plain text concatenation)
- FIX-04: EOS token from tokenizer (was hardcoded 151645)
- FIX-05: Storage tracks actual GPU device ordinal (was always Cuda(0))
- FIX-06: unsqueeze stride preserves contiguous layout
- FIX-08: CudaDeviceProp replaced with heap buffer (was UB-prone padding)
- FIX-09: Tokenizer byte_fallback to <0xNN> tokens (was panic)

Feature additions:
- FIX-10: SSE streaming (/v1/chat/completions, OpenAI-compatible)
- FIX-11: Correct usage statistics (prompt/completion/total tokens)
- FIX-13: Temperature / top-k / top-p sampling with SamplingParams

Performance improvements:
- FIX-07: Caching allocator wired up (thread-local pool, pooled flag)
- FIX-12: KV cache staging buffers (zero-alloc get_kv_len via borrow_raw)
- FIX-14: GPU strided copy kernel (eliminates contiguous() CPU round-trip)

Architecture:
- Phase 12 engine restructured: prefill/decode separation, honest TODO for batched GPU forward (requires Flash Attention)
- Phase 14 updated: FA2 for SM120 (FA4 requires TMEM, absent on 5090)
- Qwen3-7B → Qwen3-8B typo fixed across all docs (36 layers, hidden 4096)

Validated on dash5 (8x RTX 5090):
- 52/52 API prompts pass (EN/CN/code), SSE streaming verified
- Logits match HF transformers 9/10 top-1, 4.0/5 avg top-5 overlap
- 8 concurrent requests: 5.99x scheduling speedup (batch_size=4)
- Throughput: 10.3 tok/s (serial), 30% of HF baseline

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
317 lines
11 KiB
Rust
317 lines
11 KiB
Rust
use std::sync::OnceLock;
|
|
|
|
use crate::dtype::{DType, TensorDType};
|
|
use crate::shape::{self, Dims};
|
|
use crate::storage::{Device, Storage};
|
|
|
|
/// Global hook for GPU strided-to-contiguous copy.
|
|
/// Set by `xserv-kernels` (or any crate that provides a GPU kernel) via
|
|
/// `register_gpu_contiguous`. When set, `contiguous()` on a non-contiguous
|
|
/// GPU tensor calls this instead of doing a CPU round-trip.
|
|
static GPU_CONTIGUOUS_FN: OnceLock<fn(&Tensor) -> Tensor> = OnceLock::new();
|
|
|
|
/// Register a function that makes a non-contiguous GPU tensor contiguous.
|
|
/// Intended to be called once by the kernel crate at startup.
|
|
pub fn register_gpu_contiguous(f: fn(&Tensor) -> Tensor) {
|
|
let _ = GPU_CONTIGUOUS_FN.set(f);
|
|
}
|
|
|
|
/// Multi-dimensional array with CPU or GPU storage.
|
|
///
|
|
/// Tensors support view semantics: transpose, slice, etc. share
|
|
/// the underlying storage and only change shape/strides/offset.
|
|
#[derive(Clone)]
|
|
pub struct Tensor {
|
|
storage: Storage,
|
|
shape: Dims,
|
|
strides: Dims,
|
|
offset: usize,
|
|
dtype: DType,
|
|
}
|
|
|
|
impl Tensor {
|
|
// --- Creation ---
|
|
|
|
/// Create a tensor from raw components (for advanced use like GPU KV cache).
|
|
pub fn from_storage(storage: Storage, shape: Dims, strides: Dims, offset: usize, dtype: DType) -> Self {
|
|
Self { storage, shape, strides, offset, dtype }
|
|
}
|
|
|
|
pub fn from_slice<T: TensorDType>(data: &[T], shape: &[usize]) -> Self {
|
|
let numel: usize = shape.iter().product();
|
|
assert_eq!(data.len(), numel, "data length mismatch with shape");
|
|
let bytes = unsafe {
|
|
std::slice::from_raw_parts(data.as_ptr() as *const u8, numel * T::DTYPE.size_bytes())
|
|
};
|
|
Self {
|
|
storage: Storage::cpu(bytes.to_vec()),
|
|
shape: Dims::from_slice(shape),
|
|
strides: shape::contiguous_strides(shape),
|
|
offset: 0,
|
|
dtype: T::DTYPE,
|
|
}
|
|
}
|
|
|
|
pub fn zeros(shape: &[usize], dtype: DType, device: Device) -> Self {
|
|
let numel = shape::num_elements(shape);
|
|
let len_bytes = numel * dtype.size_bytes();
|
|
let storage = Storage::zeros(len_bytes, device).expect("alloc failed");
|
|
Self {
|
|
storage,
|
|
shape: Dims::from_slice(shape),
|
|
strides: shape::contiguous_strides(shape),
|
|
offset: 0,
|
|
dtype,
|
|
}
|
|
}
|
|
|
|
pub fn ones(shape: &[usize], dtype: DType) -> Self {
|
|
let numel = shape::num_elements(shape);
|
|
match dtype {
|
|
DType::F32 => Self::from_slice(&vec![1.0f32; numel], shape),
|
|
DType::F16 => Self::from_slice(&vec![half::f16::from_f32(1.0); numel], shape),
|
|
DType::BF16 => Self::from_slice(&vec![half::bf16::from_f32(1.0); numel], shape),
|
|
}
|
|
}
|
|
|
|
// --- Properties ---
|
|
|
|
pub fn shape(&self) -> &[usize] { &self.shape }
|
|
pub fn strides(&self) -> &[usize] { &self.strides }
|
|
pub fn dtype(&self) -> DType { self.dtype }
|
|
pub fn ndim(&self) -> usize { self.shape.len() }
|
|
pub fn numel(&self) -> usize { shape::num_elements(&self.shape) }
|
|
pub fn offset(&self) -> usize { self.offset }
|
|
|
|
pub fn device(&self) -> Device { self.storage.device() }
|
|
|
|
pub fn is_contiguous(&self) -> bool {
|
|
shape::is_contiguous(&self.shape, &self.strides)
|
|
}
|
|
|
|
// --- Shape operations (view, no copy) ---
|
|
|
|
pub fn reshape(&self, new_shape: &[usize]) -> Self {
|
|
assert!(self.is_contiguous(), "reshape requires contiguous tensor");
|
|
let new_numel: usize = new_shape.iter().product();
|
|
assert_eq!(new_numel, self.numel(), "reshape numel mismatch");
|
|
Self {
|
|
storage: self.storage.clone(),
|
|
shape: Dims::from_slice(new_shape),
|
|
strides: shape::contiguous_strides(new_shape),
|
|
offset: self.offset,
|
|
dtype: self.dtype,
|
|
}
|
|
}
|
|
|
|
/// Swap two dimensions, returning a view over the same storage.
///
/// Only the shape and stride entries of `dim0` and `dim1` are exchanged; no
/// data is moved, so the result is generally not contiguous.
///
/// # Panics
///
/// Panics if either dimension index is out of range.
pub fn transpose(&self, dim0: usize, dim1: usize) -> Self {
    let rank = self.ndim();
    assert!(dim0 < rank && dim1 < rank);

    // Start from a copy of the metadata and exchange the two axes in place.
    let mut view = self.clone();
    view.shape.swap(dim0, dim1);
    view.strides.swap(dim0, dim1);
    view
}
|
|
|
|
/// Drop dimension `dim`, which must have size 1, returning a view over the
/// same storage.
///
/// # Panics
///
/// Panics if `dim` is out of range or the dimension's size is not 1.
pub fn squeeze(&self, dim: usize) -> Self {
    assert!(dim < self.ndim() && self.shape[dim] == 1);

    // Copy the metadata, then delete the axis from both shape and strides.
    let mut view = self.clone();
    view.shape.remove(dim);
    view.strides.remove(dim);
    view
}
|
|
|
|
pub fn unsqueeze(&self, dim: usize) -> Self {
|
|
assert!(dim <= self.ndim());
|
|
let mut new_shape = self.shape.clone();
|
|
new_shape.insert(dim, 1);
|
|
let new_strides = if self.is_contiguous() {
|
|
shape::contiguous_strides(&new_shape)
|
|
} else {
|
|
let mut s = self.strides.clone();
|
|
let stride_val = if dim < self.strides.len() { self.strides[dim] } else { 1 };
|
|
s.insert(dim, stride_val);
|
|
s
|
|
};
|
|
Self {
|
|
storage: self.storage.clone(),
|
|
shape: new_shape,
|
|
strides: new_strides,
|
|
offset: self.offset,
|
|
dtype: self.dtype,
|
|
}
|
|
}
|
|
|
|
/// Make contiguous: if already contiguous, return clone (shared storage).
|
|
/// Otherwise, copy data into a new contiguous buffer.
|
|
pub fn contiguous(&self) -> Self {
|
|
if self.is_contiguous() {
|
|
return self.clone();
|
|
}
|
|
// For GPU tensors: use the registered GPU kernel if available,
|
|
// otherwise fall back to CPU round-trip.
|
|
if matches!(self.device(), Device::Cuda(_)) {
|
|
if let Some(gpu_fn) = GPU_CONTIGUOUS_FN.get() {
|
|
return gpu_fn(self);
|
|
}
|
|
let cpu = self.to_device(Device::Cpu);
|
|
let contig = cpu.contiguous();
|
|
return contig.to_device(self.device());
|
|
}
|
|
let numel = self.numel();
|
|
let elem_size = self.dtype.size_bytes();
|
|
let src_bytes = self.storage.as_cpu_bytes();
|
|
let mut dst = vec![0u8; numel * elem_size];
|
|
// Iterate all elements using strides
|
|
let ndim = self.ndim();
|
|
let mut idx = vec![0usize; ndim];
|
|
for flat in 0..numel {
|
|
let src_offset = self.offset + idx.iter().zip(self.strides.iter()).map(|(i, s)| i * s).sum::<usize>();
|
|
let src_byte_offset = src_offset * elem_size;
|
|
let dst_byte_offset = flat * elem_size;
|
|
dst[dst_byte_offset..dst_byte_offset + elem_size]
|
|
.copy_from_slice(&src_bytes[src_byte_offset..src_byte_offset + elem_size]);
|
|
// Increment index (rightmost first)
|
|
for d in (0..ndim).rev() {
|
|
idx[d] += 1;
|
|
if idx[d] < self.shape[d] {
|
|
break;
|
|
}
|
|
idx[d] = 0;
|
|
}
|
|
}
|
|
Self {
|
|
storage: Storage::cpu(dst),
|
|
shape: self.shape.clone(),
|
|
strides: shape::contiguous_strides(&self.shape),
|
|
offset: 0,
|
|
dtype: self.dtype,
|
|
}
|
|
}
|
|
|
|
// --- Device transfer ---
|
|
|
|
pub fn to_device(&self, device: Device) -> Self {
|
|
if self.device() == device {
|
|
return self.clone();
|
|
}
|
|
// Transfer the raw storage (preserving strides/offset).
|
|
// Non-contiguous layout is preserved — the user can call contiguous() after.
|
|
let new_storage = self.storage.to_device(device).expect("device transfer failed");
|
|
Self {
|
|
storage: new_storage,
|
|
shape: self.shape.clone(),
|
|
strides: self.strides.clone(),
|
|
offset: self.offset,
|
|
dtype: self.dtype,
|
|
}
|
|
}
|
|
|
|
// --- Data access (CPU only) ---
|
|
|
|
/// Read tensor data as a typed slice. Requires contiguous CPU tensor.
|
|
pub fn as_slice<T: TensorDType>(&self) -> &[T] {
|
|
assert_eq!(T::DTYPE, self.dtype, "dtype mismatch");
|
|
assert!(self.is_contiguous(), "as_slice requires contiguous");
|
|
assert_eq!(self.device(), Device::Cpu, "as_slice requires CPU");
|
|
let bytes = self.storage.as_cpu_bytes();
|
|
let elem_size = self.dtype.size_bytes();
|
|
let start = self.offset * elem_size;
|
|
let len = self.numel();
|
|
unsafe { std::slice::from_raw_parts(bytes[start..].as_ptr() as *const T, len) }
|
|
}
|
|
|
|
/// Raw pointer to storage start (for GPU kernel launch).
|
|
pub fn data_ptr(&self) -> *const u8 {
|
|
match self.device() {
|
|
Device::Cpu => {
|
|
let bytes = self.storage.as_cpu_bytes();
|
|
unsafe { bytes.as_ptr().add(self.offset * self.dtype.size_bytes()) }
|
|
}
|
|
Device::Cuda(_) => {
|
|
let buf = self.storage.gpu_buffer();
|
|
unsafe { buf.as_ptr().add(self.offset * self.dtype.size_bytes()) }
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Borrow the underlying storage that backs this tensor view.
pub fn storage(&self) -> &Storage {
    &self.storage
}
|
|
}
|
|
|
|
impl std::fmt::Debug for Tensor {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
write!(
|
|
f, "Tensor(shape={:?}, dtype={}, device={}, contiguous={})",
|
|
self.shape.as_slice(), self.dtype, self.device(), self.is_contiguous()
|
|
)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A contiguous 3x4 `f32` tensor (every element is 1.0).
    fn contiguous_2d() -> Tensor {
        let data = [1.0f32; 12];
        Tensor::from_slice(&data, &[3, 4])
    }

    /// Assert that `t` has exactly the given shape and strides.
    fn assert_layout(t: &Tensor, expected_shape: &[usize], expected_strides: &[usize]) {
        assert_eq!(t.shape(), expected_shape);
        assert_eq!(t.strides(), expected_strides);
    }

    #[test]
    fn unsqueeze_dim0_contiguous() {
        let expanded = contiguous_2d().unsqueeze(0);
        assert!(expanded.is_contiguous());
        assert_layout(&expanded, &[1, 3, 4], &[12, 4, 1]);
    }

    #[test]
    fn unsqueeze_dim1_contiguous() {
        let expanded = contiguous_2d().unsqueeze(1);
        assert!(expanded.is_contiguous());
        assert_layout(&expanded, &[3, 1, 4], &[4, 4, 1]);
    }

    #[test]
    fn unsqueeze_dim2_contiguous() {
        let expanded = contiguous_2d().unsqueeze(2);
        assert!(expanded.is_contiguous());
        assert_layout(&expanded, &[3, 4, 1], &[4, 1, 1]);
    }

    #[test]
    fn unsqueeze_noncontiguous() {
        // Transposing the [3,4] tensor yields shape [4,3] with strides [1,4],
        // which is not a contiguous layout.
        let transposed = contiguous_2d().transpose(0, 1);
        assert!(!transposed.is_contiguous());

        // On the non-contiguous path the new leading axis copies the stride
        // found at position 0 of the input, i.e. 1.
        let expanded = transposed.unsqueeze(0);
        assert_layout(&expanded, &[1, 4, 3], &[1, 1, 4]);
    }

    #[test]
    fn unsqueeze_squeeze_roundtrip() {
        let original = contiguous_2d();
        let restored = original.unsqueeze(1).squeeze(1);
        assert_eq!(restored.shape(), original.shape());
        assert!(restored.is_contiguous());
    }
}
|