Files
xserv/crates/xserv-kernels/src/transpose.rs
Gahow Wang ee68d3565d fix: comprehensive review + 14 bug fixes + Phase 12/14 overhaul
Strict code review identified 30+ issues across correctness, performance,
and architecture. This commit addresses 14 of them with verified fixes,
restructures Phase 12 for honest continuous batching, and updates Phase 14
to target FA2 (RTX 5090 SM120 lacks TMEM required by FA4).

Bug fixes:
- FIX-01: Global cuBLAS handle (thread-local singleton, was per-call)
- FIX-02: Remove 19 unnecessary cudaDeviceSynchronize calls from kernels
- FIX-03: Qwen3 ChatML template (was plain text concatenation)
- FIX-04: EOS token from tokenizer (was hardcoded 151645)
- FIX-05: Storage tracks actual GPU device ordinal (was always Cuda(0))
- FIX-06: unsqueeze stride preserves contiguous layout
- FIX-08: CudaDeviceProp replaced with heap buffer (was UB-prone padding)
- FIX-09: Tokenizer byte_fallback to <0xNN> tokens (was panic)

Feature additions:
- FIX-10: SSE streaming (/v1/chat/completions, OpenAI-compatible)
- FIX-11: Correct usage statistics (prompt/completion/total tokens)
- FIX-13: Temperature / top-k / top-p sampling with SamplingParams

Performance improvements:
- FIX-07: Caching allocator wired up (thread-local pool, pooled flag)
- FIX-12: KV cache staging buffers (zero-alloc get_kv_len via borrow_raw)
- FIX-14: GPU strided copy kernel (eliminates contiguous() CPU round-trip)

Architecture:
- Phase 12 engine restructured: prefill/decode separation, honest TODO
  for batched GPU forward (requires Flash Attention)
- Phase 14 updated: FA2 for SM120 (FA4 requires TMEM, absent on 5090)
- Qwen3-7B → Qwen3-8B typo fixed across all docs (36 layers, hidden 4096)

Validated on dash5 (8x RTX 5090):
- 52/52 API prompts pass (EN/CN/code), SSE streaming verified
- Logits match HF transformers 9/10 top-1, 4.0/5 avg top-5 overlap
- 8 concurrent requests: 5.99x scheduling speedup (batch_size=4)
- Throughput: 10.3 tok/s (serial), 30% of HF baseline

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 17:53:28 +08:00

143 lines
6.2 KiB
Rust

use std::ffi::c_void;
use xserv_tensor::{DType, Device, Tensor};
// Foreign launcher entry points used by the wrappers in this file. Their
// implementations are not visible here (presumably the crate's CUDA sources —
// TODO confirm), so every signature below must stay in sync with the C-ABI
// definition it binds to. All pointers are passed as untyped `c_void`; every
// wrapper in this file passes a null pointer for `stream`.
unsafe extern "C" {
/// Called by `reshape_heads_gpu`, which documents the transform as `[S, H*D]` → `[1, H, S, D]` (BF16).
fn launch_reshape_heads_bf16(inp: *const c_void, out: *mut c_void, seq_len: i32, num_heads: i32, head_dim: i32, stream: *mut c_void);
/// Called by `merge_heads_gpu`, which documents the transform as `[1, H, S, D]` → `[S, H*D]` (BF16).
fn launch_merge_heads_bf16(inp: *const c_void, out: *mut c_void, seq_len: i32, num_heads: i32, head_dim: i32, stream: *mut c_void);
/// Called by `transpose_for_rope_gpu`, which documents the transform as `[1, H, S, D]` → `[S, H, D]` (BF16).
fn launch_transpose_hsd_to_shd_bf16(inp: *const c_void, out: *mut c_void, seq_len: i32, num_heads: i32, head_dim: i32, stream: *mut c_void);
/// Called by `transpose_from_rope_gpu`, which documents the transform as `[S, H, D]` → `[1, H, S, D]` (BF16).
fn launch_transpose_shd_to_hsd_bf16(inp: *const c_void, out: *mut c_void, seq_len: i32, num_heads: i32, head_dim: i32, stream: *mut c_void);
/// Called by `repeat_kv_gpu`, which documents the transform as `[1, KV_H, S, D]` → `[1, KV_H*n_rep, S, D]` (BF16).
fn launch_repeat_kv_bf16(inp: *const c_void, out: *mut c_void, kv_heads: i32, n_rep: i32, seq_len: i32, head_dim: i32, stream: *mut c_void);
/// BF16 strided copy, called by `strided_to_contiguous_gpu`. That caller passes
/// the storage base pointer as `inp`, a shape/stride pair padded to four
/// dimensions (leading 1s for shape, leading 0s for strides), and `in_offset`
/// as the view's element offset from the storage base.
fn launch_strided_copy_bf16(inp: *const c_void, out: *mut c_void, numel: i32, ndim: i32,
shape0: i32, shape1: i32, shape2: i32, shape3: i32,
in_stride0: i32, in_stride1: i32, in_stride2: i32, in_stride3: i32,
in_offset: i32, stream: *mut c_void);
/// F32 counterpart of `launch_strided_copy_bf16`; `strided_to_contiguous_gpu`
/// calls it with the same argument layout.
fn launch_strided_copy_f32(inp: *const c_void, out: *mut c_void, numel: i32, ndim: i32,
shape0: i32, shape1: i32, shape2: i32, shape3: i32,
in_stride0: i32, in_stride1: i32, in_stride2: i32, in_stride3: i32,
in_offset: i32, stream: *mut c_void);
}
/// [S, H*D] → [1, H, S, D] on GPU (BF16)
pub fn reshape_heads_gpu(x: &Tensor, seq_len: usize, num_heads: usize, head_dim: usize) -> Tensor {
assert_eq!(x.dtype(), DType::BF16);
assert!(x.is_contiguous() && matches!(x.device(), Device::Cuda(_)));
let out = Tensor::zeros(&[1, num_heads, seq_len, head_dim], DType::BF16, x.device());
unsafe {
launch_reshape_heads_bf16(
x.data_ptr() as _, out.data_ptr() as *mut c_void,
seq_len as i32, num_heads as i32, head_dim as i32, std::ptr::null_mut(),
);
}
out
}
/// [1, H, S, D] → [S, H*D] on GPU (BF16)
pub fn merge_heads_gpu(x: &Tensor, seq_len: usize, num_heads: usize, head_dim: usize) -> Tensor {
assert_eq!(x.dtype(), DType::BF16);
assert!(x.is_contiguous() && matches!(x.device(), Device::Cuda(_)));
let hidden = num_heads * head_dim;
let out = Tensor::zeros(&[seq_len, hidden], DType::BF16, x.device());
unsafe {
launch_merge_heads_bf16(
x.data_ptr() as _, out.data_ptr() as *mut c_void,
seq_len as i32, num_heads as i32, head_dim as i32, std::ptr::null_mut(),
);
}
out
}
/// [1, H, S, D] → [S, H, D] for RoPE on GPU (BF16)
pub fn transpose_for_rope_gpu(x: &Tensor, seq_len: usize, num_heads: usize, head_dim: usize) -> Tensor {
assert_eq!(x.dtype(), DType::BF16);
assert!(x.is_contiguous() && matches!(x.device(), Device::Cuda(_)));
let out = Tensor::zeros(&[seq_len, num_heads, head_dim], DType::BF16, x.device());
unsafe {
launch_transpose_hsd_to_shd_bf16(
x.data_ptr() as _, out.data_ptr() as *mut c_void,
seq_len as i32, num_heads as i32, head_dim as i32, std::ptr::null_mut(),
);
}
out
}
/// [S, H, D] → [1, H, S, D] after RoPE on GPU (BF16)
pub fn transpose_from_rope_gpu(x: &Tensor, seq_len: usize, num_heads: usize, head_dim: usize) -> Tensor {
assert_eq!(x.dtype(), DType::BF16);
assert!(x.is_contiguous() && matches!(x.device(), Device::Cuda(_)));
let out = Tensor::zeros(&[1, num_heads, seq_len, head_dim], DType::BF16, x.device());
unsafe {
launch_transpose_shd_to_hsd_bf16(
x.data_ptr() as _, out.data_ptr() as *mut c_void,
seq_len as i32, num_heads as i32, head_dim as i32, std::ptr::null_mut(),
);
}
out
}
/// [1, KV_H, S, D] → [1, KV_H*n_rep, S, D] on GPU (BF16)
pub fn repeat_kv_gpu(x: &Tensor, n_rep: usize) -> Tensor {
if n_rep == 1 { return x.clone(); }
assert_eq!(x.dtype(), DType::BF16);
assert!(x.is_contiguous() && matches!(x.device(), Device::Cuda(_)));
let kv_heads = x.shape()[1];
let seq_len = x.shape()[2];
let head_dim = x.shape()[3];
let new_heads = kv_heads * n_rep;
let out = Tensor::zeros(&[1, new_heads, seq_len, head_dim], DType::BF16, x.device());
unsafe {
launch_repeat_kv_bf16(
x.data_ptr() as _, out.data_ptr() as *mut c_void,
kv_heads as i32, n_rep as i32, seq_len as i32, head_dim as i32, std::ptr::null_mut(),
);
}
out
}
/// Make a non-contiguous GPU tensor contiguous via a strided copy kernel.
/// Supports BF16 and F32, up to 4D tensors (padded to 4D internally).
pub fn strided_to_contiguous_gpu(x: &Tensor) -> Tensor {
assert!(matches!(x.device(), Device::Cuda(_)), "expected GPU tensor");
assert!(!x.is_contiguous(), "tensor is already contiguous");
assert!(x.ndim() <= 4, "strided_to_contiguous_gpu supports up to 4D");
let ndim = x.ndim();
let numel = x.numel();
// Pad shape and strides to 4D (prepend 1s for shape, 0s for strides)
let mut shape4 = [1i32; 4];
let mut strides4 = [0i32; 4];
let pad = 4 - ndim;
for i in 0..ndim {
shape4[pad + i] = x.shape()[i] as i32;
strides4[pad + i] = x.strides()[i] as i32;
}
let out = Tensor::zeros(x.shape(), x.dtype(), x.device());
// Use storage base pointer + element offset, because strides are relative to
// element 0 of the storage, not the data_ptr() (which already adds byte offset).
let storage_ptr = x.storage().gpu_buffer().as_ptr();
let in_offset = x.offset() as i32;
unsafe {
match x.dtype() {
DType::BF16 => launch_strided_copy_bf16(
storage_ptr as _, out.data_ptr() as *mut c_void,
numel as i32, ndim as i32,
shape4[0], shape4[1], shape4[2], shape4[3],
strides4[0], strides4[1], strides4[2], strides4[3],
in_offset, std::ptr::null_mut(),
),
DType::F32 => launch_strided_copy_f32(
storage_ptr as _, out.data_ptr() as *mut c_void,
numel as i32, ndim as i32,
shape4[0], shape4[1], shape4[2], shape4[3],
strides4[0], strides4[1], strides4[2], strides4[3],
in_offset, std::ptr::null_mut(),
),
_ => panic!("strided_to_contiguous_gpu: unsupported dtype {:?}", x.dtype()),
}
}
out
}