Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e1e75fc7f6 | |||
| 6035ffdc0b | |||
| c8e8153702 | |||
| 51a0f2eb14 | |||
| d77f921a12 | |||
| a83971fa25 |
@@ -2,6 +2,10 @@
|
||||
resolver = "2"
|
||||
members = [
|
||||
"crates/xserv-cuda",
|
||||
"crates/xserv-tensor",
|
||||
"crates/xserv-kernels",
|
||||
"crates/xserv-model",
|
||||
"crates/xserv-tokenizer",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -12,3 +16,7 @@ license = "MIT"
|
||||
[workspace.dependencies]
|
||||
half = "2"
|
||||
smallvec = "1"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
safetensors = "0.5"
|
||||
regex = "1"
|
||||
|
||||
@@ -23,7 +23,7 @@ impl std::error::Error for CudaError {}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, CudaError>;
|
||||
|
||||
pub(crate) fn check(code: i32) -> Result<()> {
|
||||
pub fn check(code: i32) -> Result<()> {
|
||||
if code == ffi::CUDA_SUCCESS {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
12
crates/xserv-kernels/Cargo.toml
Normal file
12
crates/xserv-kernels/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "xserv-kernels"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
cc = "1"
|
||||
|
||||
[dependencies]
|
||||
xserv-cuda = { path = "../xserv-cuda" }
|
||||
xserv-tensor = { path = "../xserv-tensor" }
|
||||
half.workspace = true
|
||||
29
crates/xserv-kernels/build.rs
Normal file
29
crates/xserv-kernels/build.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use std::env;
|
||||
|
||||
fn main() {
|
||||
let cuda_path = env::var("CUDA_HOME")
|
||||
.or_else(|_| env::var("CUDA_PATH"))
|
||||
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
|
||||
|
||||
println!("cargo:rustc-link-search=native={cuda_path}/lib64");
|
||||
println!("cargo:rustc-link-lib=dylib=cudart");
|
||||
println!("cargo:rustc-link-lib=dylib=cublas");
|
||||
|
||||
cc::Build::new()
|
||||
.cuda(true)
|
||||
.cudart("shared")
|
||||
.flag("-gencode=arch=compute_120,code=sm_120")
|
||||
.include("../../csrc")
|
||||
.file("../../csrc/gemm/naive.cu")
|
||||
.file("../../csrc/gemm/tiled.cu")
|
||||
.file("../../csrc/normalization/rmsnorm.cu")
|
||||
.file("../../csrc/normalization/layernorm.cu")
|
||||
.file("../../csrc/activation/activations.cu")
|
||||
.file("../../csrc/reduce/softmax.cu")
|
||||
.file("../../csrc/embedding/embedding.cu")
|
||||
.file("../../csrc/embedding/rope.cu")
|
||||
.file("../../csrc/attention/causal_mask.cu")
|
||||
.compile("xserv_kernels");
|
||||
|
||||
println!("cargo:rerun-if-changed=../../csrc/");
|
||||
}
|
||||
59
crates/xserv-kernels/src/activation.rs
Normal file
59
crates/xserv-kernels/src/activation.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
use std::ffi::c_void;
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
// Launchers for the element-wise CUDA kernels. Each receives an input pointer,
// an output pointer, an element count and a stream handle; the call sites in
// this file always pass a null stream.
// NOTE(review): presumably implemented in csrc/activation/activations.cu — confirm.
unsafe extern "C" {
    fn launch_gelu_f32(
        x: *const c_void,
        out: *mut c_void,
        n: i32,
        stream: *mut c_void,
    );
    fn launch_gelu_bf16(
        x: *const c_void,
        out: *mut c_void,
        n: i32,
        stream: *mut c_void,
    );
    fn launch_silu_f32(
        x: *const c_void,
        out: *mut c_void,
        n: i32,
        stream: *mut c_void,
    );
    fn launch_silu_bf16(
        x: *const c_void,
        out: *mut c_void,
        n: i32,
        stream: *mut c_void,
    );
    fn launch_scale_f32(
        x: *const c_void,
        out: *mut c_void,
        scale: f32,
        n: i32,
        stream: *mut c_void,
    );
    fn launch_scale_bf16(
        x: *const c_void,
        out: *mut c_void,
        scale: f32,
        n: i32,
        stream: *mut c_void,
    );
}
|
||||
|
||||
pub fn gelu(x: &Tensor) -> Tensor {
|
||||
assert!(x.is_contiguous());
|
||||
assert!(matches!(x.device(), Device::Cuda(_)));
|
||||
let out = Tensor::zeros(x.shape(), x.dtype(), x.device());
|
||||
let n = x.numel() as i32;
|
||||
unsafe {
|
||||
match x.dtype() {
|
||||
DType::F32 => launch_gelu_f32(x.data_ptr() as _, out.data_ptr() as *mut c_void, n, std::ptr::null_mut()),
|
||||
DType::BF16 => launch_gelu_bf16(x.data_ptr() as _, out.data_ptr() as *mut c_void, n, std::ptr::null_mut()),
|
||||
_ => panic!("unsupported dtype for gelu"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
out
|
||||
}
|
||||
|
||||
pub fn silu(x: &Tensor) -> Tensor {
|
||||
assert!(x.is_contiguous());
|
||||
assert!(matches!(x.device(), Device::Cuda(_)));
|
||||
let out = Tensor::zeros(x.shape(), x.dtype(), x.device());
|
||||
let n = x.numel() as i32;
|
||||
unsafe {
|
||||
match x.dtype() {
|
||||
DType::F32 => launch_silu_f32(x.data_ptr() as _, out.data_ptr() as *mut c_void, n, std::ptr::null_mut()),
|
||||
DType::BF16 => launch_silu_bf16(x.data_ptr() as _, out.data_ptr() as *mut c_void, n, std::ptr::null_mut()),
|
||||
_ => panic!("unsupported dtype for silu"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
out
|
||||
}
|
||||
|
||||
pub fn scale(x: &Tensor, scale_val: f32) -> Tensor {
|
||||
assert!(x.is_contiguous());
|
||||
assert!(matches!(x.device(), Device::Cuda(_)));
|
||||
let out = Tensor::zeros(x.shape(), x.dtype(), x.device());
|
||||
let n = x.numel() as i32;
|
||||
unsafe {
|
||||
match x.dtype() {
|
||||
DType::F32 => launch_scale_f32(x.data_ptr() as _, out.data_ptr() as *mut c_void, scale_val, n, std::ptr::null_mut()),
|
||||
DType::BF16 => launch_scale_bf16(x.data_ptr() as _, out.data_ptr() as *mut c_void, scale_val, n, std::ptr::null_mut()),
|
||||
_ => panic!("unsupported dtype for scale"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
out
|
||||
}
|
||||
77
crates/xserv-kernels/src/attention.rs
Normal file
77
crates/xserv-kernels/src/attention.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
use std::ffi::c_void;
|
||||
use xserv_tensor::{DType, Tensor};
|
||||
|
||||
use crate::activation::scale;
|
||||
use crate::gemm::batched_matmul;
|
||||
use crate::softmax::softmax;
|
||||
|
||||
// Launchers for the in-place causal-mask kernels. `scores` is modified by the
// kernel; `batch`, `rows`, `cols` and `offset` are forwarded from
// `apply_causal_mask`.
// NOTE(review): presumably implemented in csrc/attention/causal_mask.cu — confirm.
unsafe extern "C" {
    fn launch_causal_mask_f32(
        scores: *mut c_void,
        batch: i32,
        rows: i32,
        cols: i32,
        offset: i32,
        stream: *mut c_void,
    );
    fn launch_causal_mask_bf16(
        scores: *mut c_void,
        batch: i32,
        rows: i32,
        cols: i32,
        offset: i32,
        stream: *mut c_void,
    );
}
|
||||
|
||||
fn apply_causal_mask(scores: &Tensor, offset: usize) {
|
||||
let ndim = scores.ndim();
|
||||
let rows = scores.shape()[ndim - 2];
|
||||
let cols = scores.shape()[ndim - 1];
|
||||
let batch: usize = scores.shape()[..ndim - 2].iter().product();
|
||||
|
||||
unsafe {
|
||||
match scores.dtype() {
|
||||
DType::F32 => launch_causal_mask_f32(
|
||||
scores.data_ptr() as *mut c_void,
|
||||
batch as i32, rows as i32, cols as i32, offset as i32,
|
||||
std::ptr::null_mut(),
|
||||
),
|
||||
DType::BF16 => launch_causal_mask_bf16(
|
||||
scores.data_ptr() as *mut c_void,
|
||||
batch as i32, rows as i32, cols as i32, offset as i32,
|
||||
std::ptr::null_mut(),
|
||||
),
|
||||
_ => panic!("unsupported dtype for causal mask"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
}
|
||||
|
||||
/// Multi-head attention (naive, materializes S×S score matrix).
|
||||
///
|
||||
/// q, k, v: [batch, num_heads, seq_len, head_dim] — contiguous, on GPU
|
||||
/// Returns: [batch, num_heads, seq_len, head_dim]
|
||||
pub fn attention(q: &Tensor, k: &Tensor, v: &Tensor, causal: bool) -> Tensor {
|
||||
assert_eq!(q.ndim(), 4);
|
||||
assert_eq!(k.ndim(), 4);
|
||||
assert_eq!(v.ndim(), 4);
|
||||
assert!(q.is_contiguous() && k.is_contiguous() && v.is_contiguous());
|
||||
|
||||
let batch = q.shape()[0];
|
||||
let num_heads = q.shape()[1];
|
||||
let q_len = q.shape()[2];
|
||||
let head_dim = q.shape()[3];
|
||||
let kv_len = k.shape()[2];
|
||||
|
||||
assert_eq!(k.shape(), &[batch, num_heads, kv_len, head_dim]);
|
||||
assert_eq!(v.shape(), &[batch, num_heads, kv_len, head_dim]);
|
||||
|
||||
// scores = Q @ K^T → [B, H, q_len, kv_len]
|
||||
let k_t = k.transpose(2, 3).contiguous();
|
||||
let scores = batched_matmul(q, &k_t);
|
||||
|
||||
// Scale by 1/sqrt(head_dim)
|
||||
let scale_factor = 1.0 / (head_dim as f32).sqrt();
|
||||
let scaled_scores = scale(&scores, scale_factor);
|
||||
|
||||
// Causal mask
|
||||
if causal {
|
||||
let offset = kv_len - q_len;
|
||||
apply_causal_mask(&scaled_scores, offset);
|
||||
}
|
||||
|
||||
// Softmax
|
||||
let weights = softmax(&scaled_scores);
|
||||
|
||||
// output = weights @ V → [B, H, q_len, head_dim]
|
||||
batched_matmul(&weights, v)
|
||||
}
|
||||
51
crates/xserv-kernels/src/embedding.rs
Normal file
51
crates/xserv-kernels/src/embedding.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
use std::ffi::c_void;
|
||||
use xserv_cuda::GpuBuffer;
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
// Launchers for the embedding gather kernels. `token_ids` is a device pointer
// to the ids uploaded by `embedding`.
// NOTE(review): presumably implemented in csrc/embedding/embedding.cu — confirm.
unsafe extern "C" {
    fn launch_embedding_f32(
        table: *const c_void,
        token_ids: *const c_void,
        out: *mut c_void,
        num_tokens: i32,
        hidden_size: i32,
        stream: *mut c_void,
    );
    fn launch_embedding_bf16(
        table: *const c_void,
        token_ids: *const c_void,
        out: *mut c_void,
        num_tokens: i32,
        hidden_size: i32,
        stream: *mut c_void,
    );
}
|
||||
|
||||
/// Embedding lookup: table[token_ids[i]] for each i.
|
||||
/// table: [vocab_size, hidden_size], token_ids: [num_tokens] (i32 on CPU)
|
||||
pub fn embedding(table: &Tensor, token_ids: &[u32]) -> Tensor {
|
||||
assert_eq!(table.ndim(), 2);
|
||||
assert!(table.is_contiguous());
|
||||
assert!(matches!(table.device(), Device::Cuda(_)));
|
||||
|
||||
let hidden_size = table.shape()[1];
|
||||
let num_tokens = token_ids.len();
|
||||
|
||||
// Upload token_ids to GPU
|
||||
let ids_bytes = unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
token_ids.as_ptr() as *const u8,
|
||||
num_tokens * std::mem::size_of::<u32>(),
|
||||
)
|
||||
};
|
||||
let mut ids_gpu = GpuBuffer::alloc(ids_bytes.len()).expect("alloc token_ids");
|
||||
ids_gpu.copy_from_host(ids_bytes).unwrap();
|
||||
|
||||
let out = Tensor::zeros(&[num_tokens, hidden_size], table.dtype(), table.device());
|
||||
|
||||
unsafe {
|
||||
match table.dtype() {
|
||||
DType::F32 => launch_embedding_f32(
|
||||
table.data_ptr() as _, ids_gpu.as_ptr() as _,
|
||||
out.data_ptr() as *mut c_void,
|
||||
num_tokens as i32, hidden_size as i32, std::ptr::null_mut(),
|
||||
),
|
||||
DType::BF16 => launch_embedding_bf16(
|
||||
table.data_ptr() as _, ids_gpu.as_ptr() as _,
|
||||
out.data_ptr() as *mut c_void,
|
||||
num_tokens as i32, hidden_size as i32, std::ptr::null_mut(),
|
||||
),
|
||||
_ => panic!("unsupported dtype for embedding"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
out
|
||||
}
|
||||
229
crates/xserv-kernels/src/gemm.rs
Normal file
229
crates/xserv-kernels/src/gemm.rs
Normal file
@@ -0,0 +1,229 @@
|
||||
use std::ffi::c_void;
|
||||
use xserv_cuda::error::{self, Result};
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
/// Selects which GEMM implementation `matmul` dispatches to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GemmBackend {
    /// Custom kernel reached through `launch_gemm_naive_*`.
    Naive,
    /// Custom kernel reached through `launch_gemm_tiled_*`.
    Tiled,
    /// cuBLAS, via `cublasGemmEx`.
    CuBlas,
}
|
||||
|
||||
// --- FFI: custom CUDA kernels ---
|
||||
// Launchers for the hand-written GEMM kernels. `matmul` calls them with
// row-major A [m, k], B [k, n] and an output C [m, n].
// NOTE(review): presumably implemented in csrc/gemm/{naive,tiled}.cu — confirm.
unsafe extern "C" {
    fn launch_gemm_naive_f32(
        a: *const c_void,
        b: *const c_void,
        c: *mut c_void,
        m: i32,
        n: i32,
        k: i32,
        stream: *mut c_void,
    );
    fn launch_gemm_naive_bf16(
        a: *const c_void,
        b: *const c_void,
        c: *mut c_void,
        m: i32,
        n: i32,
        k: i32,
        stream: *mut c_void,
    );
    fn launch_gemm_tiled_f32(
        a: *const c_void,
        b: *const c_void,
        c: *mut c_void,
        m: i32,
        n: i32,
        k: i32,
        stream: *mut c_void,
    );
    fn launch_gemm_tiled_bf16(
        a: *const c_void,
        b: *const c_void,
        c: *mut c_void,
        m: i32,
        n: i32,
        k: i32,
        stream: *mut c_void,
    );
}
|
||||
|
||||
// --- FFI: cuBLAS ---
|
||||
/// Opaque cuBLAS handle as seen across the FFI boundary.
type CublasHandle = *mut c_void;

/// `cublasOperation_t`: use the operand as-is (no transpose).
const CUBLAS_OP_N: i32 = 0;

/// `cudaDataType`: 32-bit IEEE float.
const CUDA_R_32F: i32 = 0;
/// `cudaDataType`: bfloat16.
const CUDA_R_16BF: i32 = 14;

/// `cublasComputeType_t`: accumulate in 32-bit float.
const CUBLAS_COMPUTE_32F: i32 = 68;
|
||||
|
||||
unsafe extern "C" {
|
||||
fn cublasCreate_v2(handle: *mut CublasHandle) -> i32;
|
||||
fn cublasDestroy_v2(handle: CublasHandle) -> i32;
|
||||
fn cublasSetStream_v2(handle: CublasHandle, stream: *mut c_void) -> i32;
|
||||
fn cublasGemmEx(
|
||||
handle: CublasHandle,
|
||||
transa: i32, transb: i32,
|
||||
m: i32, n: i32, k: i32,
|
||||
alpha: *const c_void,
|
||||
a: *const c_void, a_type: i32, lda: i32,
|
||||
b: *const c_void, b_type: i32, ldb: i32,
|
||||
beta: *const c_void,
|
||||
c: *mut c_void, c_type: i32, ldc: i32,
|
||||
compute_type: i32,
|
||||
algo: i32,
|
||||
) -> i32;
|
||||
fn cublasGemmStridedBatchedEx(
|
||||
handle: CublasHandle,
|
||||
transa: i32, transb: i32,
|
||||
m: i32, n: i32, k: i32,
|
||||
alpha: *const c_void,
|
||||
a: *const c_void, a_type: i32, lda: i32, stride_a: i64,
|
||||
b: *const c_void, b_type: i32, ldb: i32, stride_b: i64,
|
||||
beta: *const c_void,
|
||||
c: *mut c_void, c_type: i32, ldc: i32, stride_c: i64,
|
||||
batch_count: i32,
|
||||
compute_type: i32,
|
||||
algo: i32,
|
||||
) -> i32;
|
||||
}
|
||||
|
||||
pub struct CublasContext {
|
||||
handle: CublasHandle,
|
||||
}
|
||||
|
||||
impl CublasContext {
|
||||
pub fn new() -> Result<Self> {
|
||||
let mut handle = std::ptr::null_mut();
|
||||
error::check(unsafe { cublasCreate_v2(&mut handle) })?;
|
||||
Ok(Self { handle })
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for CublasContext {
|
||||
fn drop(&mut self) {
|
||||
if !self.handle.is_null() {
|
||||
unsafe { cublasDestroy_v2(self.handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Matrix multiplication: C = A @ B
|
||||
/// A: [M, K], B: [K, N], C: [M, N]
|
||||
/// All tensors must be contiguous and on the same GPU.
|
||||
pub fn matmul(a: &Tensor, b: &Tensor, backend: GemmBackend) -> Tensor {
|
||||
assert_eq!(a.ndim(), 2);
|
||||
assert_eq!(b.ndim(), 2);
|
||||
assert_eq!(a.shape()[1], b.shape()[0], "inner dimension mismatch");
|
||||
assert_eq!(a.dtype(), b.dtype(), "dtype mismatch");
|
||||
assert!(a.is_contiguous() && b.is_contiguous(), "matmul requires contiguous tensors");
|
||||
assert!(matches!(a.device(), Device::Cuda(_)), "matmul requires GPU tensors");
|
||||
|
||||
let m = a.shape()[0];
|
||||
let k = a.shape()[1];
|
||||
let n = b.shape()[1];
|
||||
let dtype = a.dtype();
|
||||
|
||||
let c = Tensor::zeros(&[m, n], dtype, a.device());
|
||||
|
||||
let a_ptr = a.data_ptr() as *const c_void;
|
||||
let b_ptr = b.data_ptr() as *const c_void;
|
||||
let c_ptr = c.data_ptr() as *mut c_void;
|
||||
let null_stream = std::ptr::null_mut();
|
||||
|
||||
match backend {
|
||||
GemmBackend::Naive => {
|
||||
unsafe {
|
||||
match dtype {
|
||||
DType::F32 => launch_gemm_naive_f32(a_ptr, b_ptr, c_ptr, m as i32, n as i32, k as i32, null_stream),
|
||||
DType::BF16 => launch_gemm_naive_bf16(a_ptr, b_ptr, c_ptr, m as i32, n as i32, k as i32, null_stream),
|
||||
_ => panic!("unsupported dtype for naive GEMM"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
}
|
||||
GemmBackend::Tiled => {
|
||||
unsafe {
|
||||
match dtype {
|
||||
DType::F32 => launch_gemm_tiled_f32(a_ptr, b_ptr, c_ptr, m as i32, n as i32, k as i32, null_stream),
|
||||
DType::BF16 => launch_gemm_tiled_bf16(a_ptr, b_ptr, c_ptr, m as i32, n as i32, k as i32, null_stream),
|
||||
_ => panic!("unsupported dtype for tiled GEMM"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
}
|
||||
GemmBackend::CuBlas => {
|
||||
// cuBLAS uses column-major, but we have row-major tensors.
|
||||
// Trick: compute C^T = B^T @ A^T, which gives us C in row-major.
|
||||
// cuBLAS sees our row-major data as column-major transposed.
|
||||
let ctx = CublasContext::new().unwrap();
|
||||
let alpha = 1.0f32;
|
||||
let beta = 0.0f32;
|
||||
|
||||
let (a_type, b_type, c_type) = match dtype {
|
||||
DType::F32 => (CUDA_R_32F, CUDA_R_32F, CUDA_R_32F),
|
||||
DType::BF16 => (CUDA_R_16BF, CUDA_R_16BF, CUDA_R_16BF),
|
||||
_ => panic!("unsupported dtype for cuBLAS GEMM"),
|
||||
};
|
||||
|
||||
unsafe {
|
||||
cublasSetStream_v2(ctx.handle, null_stream);
|
||||
// Row-major trick: swap A/B and transpose flags
|
||||
// C(row-major) = A @ B <=> C^T(col-major) = B^T @ A^T
|
||||
error::check(cublasGemmEx(
|
||||
ctx.handle,
|
||||
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||
n as i32, m as i32, k as i32,
|
||||
&alpha as *const f32 as *const c_void,
|
||||
b_ptr, b_type, n as i32, // B as col-major = B^T
|
||||
a_ptr, a_type, k as i32, // A as col-major = A^T
|
||||
&beta as *const f32 as *const c_void,
|
||||
c_ptr, c_type, n as i32, // C as col-major = C^T
|
||||
CUBLAS_COMPUTE_32F,
|
||||
-1, // default algo
|
||||
)).expect("cuBLAS GEMM failed");
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
c
|
||||
}
|
||||
|
||||
/// Batched matrix multiplication via cuBLAS: C[b] = A[b] @ B[b]
|
||||
/// a: [..., M, K], b: [..., K, N] → [..., M, N]
|
||||
/// Leading dimensions must match and tensors must be contiguous.
|
||||
pub fn batched_matmul(a: &Tensor, b: &Tensor) -> Tensor {
|
||||
assert!(a.ndim() >= 2 && b.ndim() >= 2);
|
||||
assert_eq!(a.ndim(), b.ndim());
|
||||
assert!(a.is_contiguous() && b.is_contiguous());
|
||||
assert!(matches!(a.device(), Device::Cuda(_)));
|
||||
assert_eq!(a.dtype(), b.dtype());
|
||||
|
||||
let ndim = a.ndim();
|
||||
let m = a.shape()[ndim - 2];
|
||||
let k = a.shape()[ndim - 1];
|
||||
let n = b.shape()[ndim - 1];
|
||||
assert_eq!(b.shape()[ndim - 2], k, "inner dimension mismatch");
|
||||
|
||||
// Compute batch count from leading dimensions
|
||||
let batch: usize = a.shape()[..ndim - 2].iter().product();
|
||||
assert_eq!(
|
||||
b.shape()[..ndim - 2].iter().product::<usize>(),
|
||||
batch,
|
||||
"batch dimensions mismatch"
|
||||
);
|
||||
|
||||
let mut out_shape: Vec<usize> = a.shape()[..ndim - 2].to_vec();
|
||||
out_shape.push(m);
|
||||
out_shape.push(n);
|
||||
let c = Tensor::zeros(&out_shape, a.dtype(), a.device());
|
||||
|
||||
let dtype = a.dtype();
|
||||
let (a_type, b_type, c_type) = match dtype {
|
||||
DType::F32 => (CUDA_R_32F, CUDA_R_32F, CUDA_R_32F),
|
||||
DType::BF16 => (CUDA_R_16BF, CUDA_R_16BF, CUDA_R_16BF),
|
||||
_ => panic!("unsupported dtype for batched matmul"),
|
||||
};
|
||||
|
||||
let alpha = 1.0f32;
|
||||
let beta = 0.0f32;
|
||||
// cuBLAS strides are in elements (not bytes)
|
||||
let stride_a = (m * k) as i64;
|
||||
let stride_b = (k * n) as i64;
|
||||
let stride_c = (m * n) as i64;
|
||||
|
||||
let ctx = CublasContext::new().unwrap();
|
||||
unsafe {
|
||||
cublasSetStream_v2(ctx.handle, std::ptr::null_mut());
|
||||
// Row-major trick: C = A @ B ⟺ C^T = B^T @ A^T (col-major)
|
||||
error::check(cublasGemmStridedBatchedEx(
|
||||
ctx.handle,
|
||||
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||
n as i32, m as i32, k as i32,
|
||||
&alpha as *const f32 as *const c_void,
|
||||
b.data_ptr() as _, b_type, n as i32, stride_b,
|
||||
a.data_ptr() as _, a_type, k as i32, stride_a,
|
||||
&beta as *const f32 as *const c_void,
|
||||
c.data_ptr() as *mut c_void, c_type, n as i32, stride_c,
|
||||
batch as i32,
|
||||
CUBLAS_COMPUTE_32F,
|
||||
-1,
|
||||
)).expect("cuBLAS batched GEMM failed");
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
c
|
||||
}
|
||||
39
crates/xserv-kernels/src/layernorm.rs
Normal file
39
crates/xserv-kernels/src/layernorm.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use std::ffi::c_void;
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
// Launchers for the layer-norm kernels. `rows` and `hidden_size` describe the
// input as a [rows, hidden_size] matrix, as computed by `layernorm`.
// NOTE(review): presumably implemented in csrc/normalization/layernorm.cu — confirm.
unsafe extern "C" {
    fn launch_layernorm_f32(
        x: *const c_void,
        gamma: *const c_void,
        beta: *const c_void,
        out: *mut c_void,
        rows: i32,
        hidden_size: i32,
        eps: f32,
        stream: *mut c_void,
    );
    fn launch_layernorm_bf16(
        x: *const c_void,
        gamma: *const c_void,
        beta: *const c_void,
        out: *mut c_void,
        rows: i32,
        hidden_size: i32,
        eps: f32,
        stream: *mut c_void,
    );
}
|
||||
|
||||
pub fn layernorm(x: &Tensor, gamma: &Tensor, beta: &Tensor, eps: f32) -> Tensor {
|
||||
assert!(x.ndim() >= 1);
|
||||
assert!(x.is_contiguous() && gamma.is_contiguous() && beta.is_contiguous());
|
||||
assert!(matches!(x.device(), Device::Cuda(_)));
|
||||
let hidden_size = *x.shape().last().unwrap();
|
||||
assert_eq!(gamma.shape(), &[hidden_size]);
|
||||
assert_eq!(beta.shape(), &[hidden_size]);
|
||||
|
||||
let rows = x.numel() / hidden_size;
|
||||
let out = Tensor::zeros(x.shape(), x.dtype(), x.device());
|
||||
|
||||
unsafe {
|
||||
match x.dtype() {
|
||||
DType::F32 => launch_layernorm_f32(
|
||||
x.data_ptr() as _, gamma.data_ptr() as _, beta.data_ptr() as _,
|
||||
out.data_ptr() as *mut c_void,
|
||||
rows as i32, hidden_size as i32, eps, std::ptr::null_mut(),
|
||||
),
|
||||
DType::BF16 => launch_layernorm_bf16(
|
||||
x.data_ptr() as _, gamma.data_ptr() as _, beta.data_ptr() as _,
|
||||
out.data_ptr() as *mut c_void,
|
||||
rows as i32, hidden_size as i32, eps, std::ptr::null_mut(),
|
||||
),
|
||||
_ => panic!("unsupported dtype for layernorm"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
out
|
||||
}
|
||||
17
crates/xserv-kernels/src/lib.rs
Normal file
17
crates/xserv-kernels/src/lib.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
pub mod activation;
|
||||
pub mod attention;
|
||||
pub mod embedding;
|
||||
pub mod gemm;
|
||||
pub mod layernorm;
|
||||
pub mod rmsnorm;
|
||||
pub mod rope;
|
||||
pub mod softmax;
|
||||
|
||||
pub use activation::{gelu, scale, silu};
|
||||
pub use attention::attention;
|
||||
pub use embedding::embedding;
|
||||
pub use gemm::{batched_matmul, matmul, GemmBackend};
|
||||
pub use layernorm::layernorm;
|
||||
pub use rmsnorm::rmsnorm;
|
||||
pub use rope::{rope_inplace, RopeCache};
|
||||
pub use softmax::softmax;
|
||||
37
crates/xserv-kernels/src/rmsnorm.rs
Normal file
37
crates/xserv-kernels/src/rmsnorm.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
use std::ffi::c_void;
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
// Launchers for the RMS-norm kernels. `rows` and `hidden_size` describe the
// input as a [rows, hidden_size] matrix, as computed by `rmsnorm`.
// NOTE(review): presumably implemented in csrc/normalization/rmsnorm.cu — confirm.
unsafe extern "C" {
    fn launch_rmsnorm_f32(
        x: *const c_void,
        gamma: *const c_void,
        out: *mut c_void,
        rows: i32,
        hidden_size: i32,
        eps: f32,
        stream: *mut c_void,
    );
    fn launch_rmsnorm_bf16(
        x: *const c_void,
        gamma: *const c_void,
        out: *mut c_void,
        rows: i32,
        hidden_size: i32,
        eps: f32,
        stream: *mut c_void,
    );
}
|
||||
|
||||
pub fn rmsnorm(x: &Tensor, gamma: &Tensor, eps: f32) -> Tensor {
|
||||
assert!(x.ndim() >= 1);
|
||||
assert!(x.is_contiguous() && gamma.is_contiguous());
|
||||
assert!(matches!(x.device(), Device::Cuda(_)));
|
||||
let hidden_size = *x.shape().last().unwrap();
|
||||
assert_eq!(gamma.shape(), &[hidden_size]);
|
||||
assert_eq!(x.dtype(), gamma.dtype());
|
||||
|
||||
let rows = x.numel() / hidden_size;
|
||||
let out = Tensor::zeros(x.shape(), x.dtype(), x.device());
|
||||
|
||||
unsafe {
|
||||
match x.dtype() {
|
||||
DType::F32 => launch_rmsnorm_f32(
|
||||
x.data_ptr() as _, gamma.data_ptr() as _, out.data_ptr() as *mut c_void,
|
||||
rows as i32, hidden_size as i32, eps, std::ptr::null_mut(),
|
||||
),
|
||||
DType::BF16 => launch_rmsnorm_bf16(
|
||||
x.data_ptr() as _, gamma.data_ptr() as _, out.data_ptr() as *mut c_void,
|
||||
rows as i32, hidden_size as i32, eps, std::ptr::null_mut(),
|
||||
),
|
||||
_ => panic!("unsupported dtype for rmsnorm"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
out
|
||||
}
|
||||
85
crates/xserv-kernels/src/rope.rs
Normal file
85
crates/xserv-kernels/src/rope.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use std::ffi::c_void;
|
||||
use xserv_cuda::GpuBuffer;
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
// Launchers for the rotary-embedding kernels. The `rope` launchers modify `x`
// in place; `launch_compute_rope_cache` fills the two cache buffers.
// NOTE(review): presumably implemented in csrc/embedding/rope.cu — confirm.
unsafe extern "C" {
    fn launch_rope_f32(
        x: *mut c_void,
        cos_cache: *const c_void,
        sin_cache: *const c_void,
        positions: *const c_void,
        num_tokens: i32,
        num_heads: i32,
        head_dim: i32,
        stream: *mut c_void,
    );
    fn launch_rope_bf16(
        x: *mut c_void,
        cos_cache: *const c_void,
        sin_cache: *const c_void,
        positions: *const c_void,
        num_tokens: i32,
        num_heads: i32,
        head_dim: i32,
        stream: *mut c_void,
    );
    fn launch_compute_rope_cache(
        cos_cache: *mut c_void,
        sin_cache: *mut c_void,
        max_seq_len: i32,
        half_dim: i32,
        theta: f32,
        stream: *mut c_void,
    );
}
|
||||
|
||||
pub struct RopeCache {
|
||||
pub cos: GpuBuffer,
|
||||
pub sin: GpuBuffer,
|
||||
pub max_seq_len: usize,
|
||||
pub half_dim: usize,
|
||||
}
|
||||
|
||||
impl RopeCache {
|
||||
pub fn new(max_seq_len: usize, head_dim: usize, theta: f32) -> Self {
|
||||
let half_dim = head_dim / 2;
|
||||
let nbytes = max_seq_len * half_dim * std::mem::size_of::<f32>();
|
||||
let mut cos = GpuBuffer::alloc(nbytes).expect("alloc cos_cache");
|
||||
let mut sin = GpuBuffer::alloc(nbytes).expect("alloc sin_cache");
|
||||
|
||||
unsafe {
|
||||
launch_compute_rope_cache(
|
||||
cos.as_mut_ptr() as _, sin.as_mut_ptr() as _,
|
||||
max_seq_len as i32, half_dim as i32, theta, std::ptr::null_mut(),
|
||||
);
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
|
||||
Self { cos, sin, max_seq_len, half_dim }
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply RoPE in-place to x.
|
||||
/// x: [num_tokens, num_heads, head_dim] on GPU
|
||||
/// positions: [num_tokens] (u32 on CPU, will be uploaded)
|
||||
pub fn rope_inplace(x: &Tensor, cache: &RopeCache, positions: &[u32]) {
|
||||
assert_eq!(x.ndim(), 3);
|
||||
assert!(x.is_contiguous());
|
||||
assert!(matches!(x.device(), Device::Cuda(_)));
|
||||
let num_tokens = x.shape()[0];
|
||||
let num_heads = x.shape()[1];
|
||||
let head_dim = x.shape()[2];
|
||||
assert_eq!(head_dim / 2, cache.half_dim);
|
||||
assert_eq!(positions.len(), num_tokens);
|
||||
|
||||
let pos_bytes = unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
positions.as_ptr() as *const u8,
|
||||
num_tokens * std::mem::size_of::<u32>(),
|
||||
)
|
||||
};
|
||||
let mut pos_gpu = GpuBuffer::alloc(pos_bytes.len()).expect("alloc positions");
|
||||
pos_gpu.copy_from_host(pos_bytes).unwrap();
|
||||
|
||||
unsafe {
|
||||
match x.dtype() {
|
||||
DType::F32 => launch_rope_f32(
|
||||
x.data_ptr() as *mut c_void,
|
||||
cache.cos.as_ptr() as _, cache.sin.as_ptr() as _,
|
||||
pos_gpu.as_ptr() as _,
|
||||
num_tokens as i32, num_heads as i32, head_dim as i32,
|
||||
std::ptr::null_mut(),
|
||||
),
|
||||
DType::BF16 => launch_rope_bf16(
|
||||
x.data_ptr() as *mut c_void,
|
||||
cache.cos.as_ptr() as _, cache.sin.as_ptr() as _,
|
||||
pos_gpu.as_ptr() as _,
|
||||
num_tokens as i32, num_heads as i32, head_dim as i32,
|
||||
std::ptr::null_mut(),
|
||||
),
|
||||
_ => panic!("unsupported dtype for rope"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
}
|
||||
34
crates/xserv-kernels/src/softmax.rs
Normal file
34
crates/xserv-kernels/src/softmax.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
use std::ffi::c_void;
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
// Launchers for the row-wise softmax kernels. `rows` and `cols` describe the
// input as a [rows, cols] matrix, as computed by `softmax`.
// NOTE(review): presumably implemented in csrc/reduce/softmax.cu — confirm.
unsafe extern "C" {
    fn launch_softmax_f32(
        x: *const c_void,
        out: *mut c_void,
        rows: i32,
        cols: i32,
        stream: *mut c_void,
    );
    fn launch_softmax_bf16(
        x: *const c_void,
        out: *mut c_void,
        rows: i32,
        cols: i32,
        stream: *mut c_void,
    );
}
|
||||
|
||||
/// Softmax along the last dimension.
|
||||
pub fn softmax(x: &Tensor) -> Tensor {
|
||||
assert!(x.ndim() >= 1);
|
||||
assert!(x.is_contiguous());
|
||||
assert!(matches!(x.device(), Device::Cuda(_)));
|
||||
|
||||
let cols = *x.shape().last().unwrap();
|
||||
let rows = x.numel() / cols;
|
||||
let out = Tensor::zeros(x.shape(), x.dtype(), x.device());
|
||||
|
||||
unsafe {
|
||||
match x.dtype() {
|
||||
DType::F32 => launch_softmax_f32(
|
||||
x.data_ptr() as _, out.data_ptr() as *mut c_void,
|
||||
rows as i32, cols as i32, std::ptr::null_mut(),
|
||||
),
|
||||
DType::BF16 => launch_softmax_bf16(
|
||||
x.data_ptr() as _, out.data_ptr() as *mut c_void,
|
||||
rows as i32, cols as i32, std::ptr::null_mut(),
|
||||
),
|
||||
_ => panic!("unsupported dtype for softmax"),
|
||||
}
|
||||
}
|
||||
xserv_cuda::device::synchronize().unwrap();
|
||||
out
|
||||
}
|
||||
187
crates/xserv-kernels/tests/attention_test.rs
Normal file
187
crates/xserv-kernels/tests/attention_test.rs
Normal file
@@ -0,0 +1,187 @@
|
||||
use xserv_kernels::*;
|
||||
use xserv_tensor::{Device, Tensor};
|
||||
|
||||
/// Selects CUDA device 0 for the calling test; panics if that fails.
fn init() {
    xserv_cuda::device::set_device(0).unwrap();
}
|
||||
|
||||
/// CPU reference for scaled dot-product attention, used to validate the GPU path.
///
/// `q` is laid out as `[batch, heads, q_len, head_dim]` and `k` / `v` as
/// `[batch, heads, kv_len, head_dim]`, all row-major. With `causal` set,
/// score `(i, j)` is replaced by negative infinity whenever
/// `j > i + (kv_len - q_len)`. Returns `[batch, heads, q_len, head_dim]`.
fn cpu_attention(q: &[f32], k: &[f32], v: &[f32],
                 batch: usize, heads: usize, q_len: usize, kv_len: usize, head_dim: usize,
                 causal: bool) -> Vec<f32> {
    let inv_sqrt_dim = 1.0 / (head_dim as f32).sqrt();
    let mut result = vec![0.0f32; batch * heads * q_len * head_dim];

    // One pass per (batch, head) pair; `bh` is the flattened pair index.
    for bh in 0..batch * heads {
        let q_base = bh * q_len * head_dim;
        let kv_base = bh * kv_len * head_dim;

        // Scaled scores: Q @ K^T * 1/sqrt(head_dim), accumulated in `d` order.
        let mut weights = vec![0.0f32; q_len * kv_len];
        for i in 0..q_len {
            for j in 0..kv_len {
                let dot = (0..head_dim).fold(0.0f32, |acc, d| {
                    acc + q[q_base + i * head_dim + d] * k[kv_base + j * head_dim + d]
                });
                weights[i * kv_len + j] = dot * inv_sqrt_dim;
            }
        }

        // Causal mask: hide keys that lie after the query's aligned position.
        if causal {
            let offset = kv_len - q_len;
            for i in 0..q_len {
                for j in (0..kv_len).filter(|&j| j > i + offset) {
                    weights[i * kv_len + j] = f32::NEG_INFINITY;
                }
            }
        }

        // Row-wise softmax, subtracting the row maximum before exponentiating.
        for i in 0..q_len {
            let row = &mut weights[i * kv_len..(i + 1) * kv_len];
            let row_max = row.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let mut denom = 0.0f32;
            for w in row.iter_mut() {
                let e = (*w - row_max).exp();
                *w = e;
                denom += e;
            }
            row.iter_mut().for_each(|w| *w /= denom);
        }

        // Weighted sum of the values, accumulated in key order.
        for i in 0..q_len {
            for d in 0..head_dim {
                let acc = (0..kv_len).fold(0.0f32, |acc, j| {
                    acc + weights[i * kv_len + j] * v[kv_base + j * head_dim + d]
                });
                result[q_base + i * head_dim + d] = acc;
            }
        }
    }
    result
}
|
||||
|
||||
/// Asserts that `a` and `b` have equal length and agree element-wise within
/// the absolute tolerance `atol`, then prints the largest error observed.
///
/// `name` prefixes every message. Panics on the first element whose error is
/// not `<= atol`.
fn check_close(a: &[f32], b: &[f32], atol: f32, name: &str) {
    assert_eq!(a.len(), b.len(), "{name}: length mismatch");

    let max_err = a
        .iter()
        .zip(b)
        .enumerate()
        .fold(0.0f32, |worst, (i, (x, y))| {
            let err = (x - y).abs();
            assert!(err <= atol, "{name}: mismatch at [{i}]: got {x}, expected {y}, err {err}");
            if err > worst { err } else { worst }
        });

    println!("{name}: max_err = {max_err:.6e}");
}
|
||||
|
||||
/// Builds `n` deterministic test values: element `i` is `((i % 17) - 8) * 0.05`,
/// a pattern that repeats every 17 elements.
fn make_data(n: usize) -> Vec<f32> {
    let mut data = Vec::with_capacity(n);
    for i in 0..n {
        let step = (i % 17) as f32;
        data.push((step - 8.0) * 0.05);
    }
    data
}
|
||||
|
||||
/// Checks `batched_matmul` against a CPU triple loop for every batch element.
///
/// Verifying only the first element would not notice wrong batch strides: a
/// kernel that always reads matrix 0 would still pass. The input pattern has
/// period 17, which does not divide the per-matrix size, so the batches differ.
#[test]
fn test_batched_matmul() {
    init();
    let batch = 4;
    let heads = 8;
    let m = 32;
    let k = 64;
    let n = 32;

    let a_data = make_data(batch * heads * m * k);
    let b_data = make_data(batch * heads * k * n);

    let a = Tensor::from_slice(&a_data, &[batch, heads, m, k]).to_device(Device::Cuda(0));
    let b = Tensor::from_slice(&b_data, &[batch, heads, k, n]).to_device(Device::Cuda(0));
    let c = batched_matmul(&a, &b).to_device(Device::Cpu);

    assert_eq!(c.shape(), &[batch, heads, m, n]);

    let result = c.as_slice::<f32>();
    for bi in 0..batch * heads {
        let a_cpu = &a_data[bi * m * k..(bi + 1) * m * k];
        let b_cpu = &b_data[bi * k * n..(bi + 1) * k * n];

        let mut expected = vec![0.0f32; m * n];
        for i in 0..m {
            for j in 0..n {
                let mut s = 0.0f32;
                for kk in 0..k {
                    s += a_cpu[i * k + kk] * b_cpu[kk * n + j];
                }
                expected[i * n + j] = s;
            }
        }

        check_close(
            &result[bi * m * n..(bi + 1) * m * n],
            &expected,
            1e-3,
            &format!("batched_matmul[{bi}]"),
        );
    }
}
|
||||
|
||||
/// Compares unmasked GPU attention with the CPU reference on a
/// `[1, 2, 8, 16]` problem (tolerance 1e-4).
#[test]
fn test_attention_no_causal() {
    init();
    let (b, h, s, d) = (1, 2, 8, 16);
    let numel = b * h * s * d;
    let q_data = make_data(numel);
    let k_data = make_data(numel);
    let v_data = make_data(numel);
    let expected = cpu_attention(&q_data, &k_data, &v_data, b, h, s, s, d, false);

    // Copies a host buffer to GPU 0 as a [b, h, s, d] tensor.
    let upload =
        |host: &[f32]| Tensor::from_slice(host, &[b, h, s, d]).to_device(Device::Cuda(0));
    let q = upload(&q_data);
    let k = upload(&k_data);
    let v = upload(&v_data);

    let out = attention(&q, &k, &v, false).to_device(Device::Cpu);
    check_close(out.as_slice::<f32>(), &expected, 1e-4, "attention_no_causal");
}
|
||||
|
||||
/// Compares causally masked GPU attention with the CPU reference on a
/// `[1, 2, 16, 32]` problem (tolerance 1e-3).
#[test]
fn test_attention_causal() {
    init();
    let (b, h, s, d) = (1, 2, 16, 32);
    let numel = b * h * s * d;
    let q_data = make_data(numel);
    let k_data = make_data(numel);
    let v_data = make_data(numel);
    let expected = cpu_attention(&q_data, &k_data, &v_data, b, h, s, s, d, true);

    // Copies a host buffer to GPU 0 as a [b, h, s, d] tensor.
    let upload =
        |host: &[f32]| Tensor::from_slice(host, &[b, h, s, d]).to_device(Device::Cuda(0));
    let q = upload(&q_data);
    let k = upload(&k_data);
    let v = upload(&v_data);

    let out = attention(&q, &k, &v, true).to_device(Device::Cpu);
    check_close(out.as_slice::<f32>(), &expected, 1e-3, "attention_causal");
}
|
||||
|
||||
/// Compares causally masked GPU attention with the CPU reference on a larger
/// `[2, 4, 64, 64]` problem (tolerance 1e-2).
#[test]
fn test_attention_causal_larger() {
    init();
    let (b, h, s, d) = (2, 4, 64, 64);
    let numel = b * h * s * d;
    let q_data = make_data(numel);
    let k_data = make_data(numel);
    let v_data = make_data(numel);
    let expected = cpu_attention(&q_data, &k_data, &v_data, b, h, s, s, d, true);

    // Copies a host buffer to GPU 0 as a [b, h, s, d] tensor.
    let upload =
        |host: &[f32]| Tensor::from_slice(host, &[b, h, s, d]).to_device(Device::Cuda(0));
    let q = upload(&q_data);
    let k = upload(&k_data);
    let v = upload(&v_data);

    let out = attention(&q, &k, &v, true).to_device(Device::Cpu);
    check_close(out.as_slice::<f32>(), &expected, 1e-2, "attention_causal_larger");
}
|
||||
|
||||
/// With a causal mask, query position 0 may attend only to key position 0, so
/// the first output row must reproduce the first row of V.
#[test]
fn test_attention_causal_first_row_sees_only_first_token() {
    init();
    let (b, h, s, d) = (1, 1, 4, 8);
    let q_data = make_data(b * h * s * d);
    let k_data = make_data(b * h * s * d);

    // V is all zeros except its first row, which is all ones.
    let mut v_data = vec![0.0f32; s * d];
    v_data[..d].fill(1.0);

    // Copies a host buffer to GPU 0 as a [b, h, s, d] tensor.
    let upload =
        |host: &[f32]| Tensor::from_slice(host, &[b, h, s, d]).to_device(Device::Cuda(0));
    let q = upload(&q_data);
    let k = upload(&k_data);
    let v = upload(&v_data);
    let out = attention(&q, &k, &v, true).to_device(Device::Cpu);

    // Row 0 has a single visible key, so its softmax weight is 1.0 and the
    // output row should be V[0] = [1, 1, ..., 1].
    let result = out.as_slice::<f32>();
    for (i, &val) in result[..d].iter().enumerate() {
        assert!(
            (val - 1.0).abs() < 1e-5,
            "first row should equal V[0], got {} at dim {}",
            val,
            i
        );
    }
}
|
||||
152
crates/xserv-kernels/tests/gemm_test.rs
Normal file
152
crates/xserv-kernels/tests/gemm_test.rs
Normal file
@@ -0,0 +1,152 @@
|
||||
use half::bf16;
|
||||
use xserv_kernels::{matmul, GemmBackend};
|
||||
use xserv_tensor::{Device, Tensor};
|
||||
|
||||
/// Naive row-major reference GEMM: returns `a (m x k) * b (k x n)` as an
/// `m x n` row-major vector, accumulating each dot product in f32 in
/// increasing `k` order.
fn cpu_matmul_f32(a: &[f32], b: &[f32], m: usize, n: usize, k: usize) -> Vec<f32> {
    let mut c = Vec::with_capacity(m * n);
    for row in 0..m {
        for col in 0..n {
            let dot = (0..k).fold(0.0f32, |acc, kk| acc + a[row * k + kk] * b[kk * n + col]);
            c.push(dot);
        }
    }
    c
}
|
||||
|
||||
/// Asserts that `result` and `expected` have equal length and that each pair
/// of elements differs by at most `atol` in absolute value.
fn check_close_f32(result: &[f32], expected: &[f32], atol: f32) {
    assert_eq!(result.len(), expected.len());
    for (i, (r, e)) in result.iter().zip(expected.iter()).enumerate() {
        let diff = (r - e).abs();
        assert!(
            diff <= atol,
            "mismatch at index {i}: got {r}, expected {e}, diff {}",
            diff
        );
    }
}
|
||||
|
||||
fn check_close_bf16(result: &[bf16], expected: &[f32], atol: f32) {
|
||||
assert_eq!(result.len(), expected.len());
|
||||
for (i, (r, e)) in result.iter().zip(expected).enumerate() {
|
||||
let rv = r.to_f32();
|
||||
assert!(
|
||||
(rv - e).abs() <= atol,
|
||||
"mismatch at index {i}: got {rv}, expected {e}, diff {}",
|
||||
(rv - e).abs()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn run_gemm_test_f32(backend: GemmBackend, m: usize, n: usize, k: usize) {
|
||||
xserv_cuda::device::set_device(0).unwrap();
|
||||
|
||||
let a_data: Vec<f32> = (0..m * k).map(|i| ((i % 7) as f32 - 3.0) * 0.1).collect();
|
||||
let b_data: Vec<f32> = (0..k * n).map(|i| ((i % 11) as f32 - 5.0) * 0.1).collect();
|
||||
let expected = cpu_matmul_f32(&a_data, &b_data, m, n, k);
|
||||
|
||||
let a = Tensor::from_slice(&a_data, &[m, k]).to_device(Device::Cuda(0));
|
||||
let b = Tensor::from_slice(&b_data, &[k, n]).to_device(Device::Cuda(0));
|
||||
let c = matmul(&a, &b, backend);
|
||||
|
||||
let c_cpu = c.to_device(Device::Cpu);
|
||||
check_close_f32(c_cpu.as_slice::<f32>(), &expected, 1e-4);
|
||||
}
|
||||
|
||||
fn run_gemm_test_bf16(backend: GemmBackend, m: usize, n: usize, k: usize) {
|
||||
xserv_cuda::device::set_device(0).unwrap();
|
||||
|
||||
let a_f32: Vec<f32> = (0..m * k).map(|i| ((i % 7) as f32 - 3.0) * 0.1).collect();
|
||||
let b_f32: Vec<f32> = (0..k * n).map(|i| ((i % 11) as f32 - 5.0) * 0.1).collect();
|
||||
let expected = cpu_matmul_f32(&a_f32, &b_f32, m, n, k);
|
||||
|
||||
let a_data: Vec<bf16> = a_f32.iter().map(|&v| bf16::from_f32(v)).collect();
|
||||
let b_data: Vec<bf16> = b_f32.iter().map(|&v| bf16::from_f32(v)).collect();
|
||||
|
||||
let a = Tensor::from_slice(&a_data, &[m, k]).to_device(Device::Cuda(0));
|
||||
let b = Tensor::from_slice(&b_data, &[k, n]).to_device(Device::Cuda(0));
|
||||
let c = matmul(&a, &b, backend);
|
||||
|
||||
let c_cpu = c.to_device(Device::Cpu);
|
||||
check_close_bf16(c_cpu.as_slice::<bf16>(), &expected, 0.1);
|
||||
}
|
||||
|
||||
// --- F32 tests ---
|
||||
|
||||
/// Naive backend, 4x4x4.
#[test]
fn test_gemm_naive_f32_small() {
    run_gemm_test_f32(GemmBackend::Naive, 4, 4, 4);
}

/// Naive backend, 64x64x64.
#[test]
fn test_gemm_naive_f32_medium() {
    run_gemm_test_f32(GemmBackend::Naive, 64, 64, 64);
}

/// Naive backend, rectangular m=32, n=64, k=48.
#[test]
fn test_gemm_naive_f32_rect() {
    run_gemm_test_f32(GemmBackend::Naive, 32, 64, 48);
}

/// Tiled backend, 4x4x4.
#[test]
fn test_gemm_tiled_f32_small() {
    run_gemm_test_f32(GemmBackend::Tiled, 4, 4, 4);
}

/// Tiled backend, 128x128x128.
#[test]
fn test_gemm_tiled_f32_medium() {
    run_gemm_test_f32(GemmBackend::Tiled, 128, 128, 128);
}

/// Tiled backend, odd rectangular sizes m=65, n=33, k=97.
#[test]
fn test_gemm_tiled_f32_rect() {
    run_gemm_test_f32(GemmBackend::Tiled, 65, 33, 97);
}

/// cuBLAS backend, 4x4x4.
#[test]
fn test_gemm_cublas_f32_small() {
    run_gemm_test_f32(GemmBackend::CuBlas, 4, 4, 4);
}

/// cuBLAS backend, 256x256x256.
#[test]
fn test_gemm_cublas_f32_medium() {
    run_gemm_test_f32(GemmBackend::CuBlas, 256, 256, 256);
}

/// cuBLAS backend, odd rectangular sizes m=65, n=33, k=97.
#[test]
fn test_gemm_cublas_f32_rect() {
    run_gemm_test_f32(GemmBackend::CuBlas, 65, 33, 97);
}
|
||||
|
||||
// --- BF16 tests ---
|
||||
|
||||
/// Naive backend in bf16, 4x4x4.
#[test]
fn test_gemm_naive_bf16_small() {
    run_gemm_test_bf16(GemmBackend::Naive, 4, 4, 4);
}

/// Naive backend in bf16, 64x64x64.
#[test]
fn test_gemm_naive_bf16_medium() {
    run_gemm_test_bf16(GemmBackend::Naive, 64, 64, 64);
}

/// Tiled backend in bf16, 4x4x4.
#[test]
fn test_gemm_tiled_bf16_small() {
    run_gemm_test_bf16(GemmBackend::Tiled, 4, 4, 4);
}

/// Tiled backend in bf16, 128x128x128.
#[test]
fn test_gemm_tiled_bf16_medium() {
    run_gemm_test_bf16(GemmBackend::Tiled, 128, 128, 128);
}

/// cuBLAS backend in bf16, 4x4x4.
#[test]
fn test_gemm_cublas_bf16_small() {
    run_gemm_test_bf16(GemmBackend::CuBlas, 4, 4, 4);
}

/// cuBLAS backend in bf16, 256x256x256.
#[test]
fn test_gemm_cublas_bf16_medium() {
    run_gemm_test_bf16(GemmBackend::CuBlas, 256, 256, 256);
}
|
||||
|
||||
// --- Larger benchmark-style tests ---
|
||||
|
||||
/// cuBLAS backend in f32 on a 1024x1024x1024 problem.
#[test]
fn test_gemm_cublas_f32_1024() {
    let size = 1024;
    run_gemm_test_f32(GemmBackend::CuBlas, size, size, size);
}
|
||||
|
||||
/// Runs the same 64x64x64 f32 product through all three backends and checks
/// that the naive and tiled results each match cuBLAS within 1e-4.
#[test]
fn test_gemm_consistency_all_backends() {
    xserv_cuda::device::set_device(0).unwrap();

    let (m, n, k) = (64, 64, 64);
    let lhs_host: Vec<f32> = (0..m * k).map(|i| ((i % 7) as f32 - 3.0) * 0.1).collect();
    let rhs_host: Vec<f32> = (0..k * n).map(|i| ((i % 11) as f32 - 5.0) * 0.1).collect();

    let lhs = Tensor::from_slice(&lhs_host, &[m, k]).to_device(Device::Cuda(0));
    let rhs = Tensor::from_slice(&rhs_host, &[k, n]).to_device(Device::Cuda(0));

    // Same launch order as before: naive, tiled, then cuBLAS.
    let from_naive = matmul(&lhs, &rhs, GemmBackend::Naive).to_device(Device::Cpu);
    let from_tiled = matmul(&lhs, &rhs, GemmBackend::Tiled).to_device(Device::Cpu);
    let from_cublas = matmul(&lhs, &rhs, GemmBackend::CuBlas).to_device(Device::Cpu);

    let baseline = from_cublas.as_slice::<f32>();
    check_close_f32(from_naive.as_slice::<f32>(), baseline, 1e-4);
    check_close_f32(from_tiled.as_slice::<f32>(), baseline, 1e-4);
}
|
||||
302
crates/xserv-kernels/tests/ops_test.rs
Normal file
302
crates/xserv-kernels/tests/ops_test.rs
Normal file
@@ -0,0 +1,302 @@
|
||||
use half::bf16;
|
||||
use xserv_kernels::*;
|
||||
use xserv_tensor::{Device, Tensor};
|
||||
|
||||
/// Selects CUDA device 0 for the calling test; panics if that fails.
fn init() {
    xserv_cuda::device::set_device(0).unwrap();
}
|
||||
|
||||
// --- CPU reference implementations ---
|
||||
|
||||
/// CPU reference RMSNorm over rows of width `hidden`:
/// `out = x / sqrt(mean(x^2) + eps) * gamma`.
///
/// Only complete rows are processed; trailing elements that do not fill a row
/// stay at 0.0 in the output.
fn cpu_rmsnorm(x: &[f32], gamma: &[f32], eps: f32, hidden: usize) -> Vec<f32> {
    let mut out = vec![0.0f32; x.len()];
    for (src, dst) in x.chunks_exact(hidden).zip(out.chunks_exact_mut(hidden)) {
        let sum_sq: f32 = src.iter().map(|v| v * v).sum();
        let rms_inv = 1.0 / (sum_sq / hidden as f32 + eps).sqrt();
        // Slicing gamma keeps the out-of-bounds panic for a too-short gamma.
        for ((o, &v), &g) in dst.iter_mut().zip(src).zip(&gamma[..hidden]) {
            *o = v * rms_inv * g;
        }
    }
    out
}
|
||||
|
||||
/// CPU reference LayerNorm over rows of width `hidden`:
/// `out = gamma * (x - mean) / sqrt(var + eps) + beta`, using the biased
/// (divide-by-`hidden`) variance.
///
/// Only complete rows are processed; trailing elements that do not fill a row
/// stay at 0.0 in the output.
fn cpu_layernorm(x: &[f32], gamma: &[f32], beta: &[f32], eps: f32, hidden: usize) -> Vec<f32> {
    let mut out = vec![0.0f32; x.len()];
    let width = hidden as f32;
    for (src, dst) in x.chunks_exact(hidden).zip(out.chunks_exact_mut(hidden)) {
        let mean: f32 = src.iter().sum::<f32>() / width;
        let var: f32 = src.iter().map(|v| (v - mean) * (v - mean)).sum::<f32>() / width;
        let inv_std = 1.0 / (var + eps).sqrt();
        for (i, o) in dst.iter_mut().enumerate() {
            *o = gamma[i] * (src[i] - mean) * inv_std + beta[i];
        }
    }
    out
}
|
||||
|
||||
/// CPU reference for the tanh approximation of GELU:
/// `0.5 * v * (1 + tanh(sqrt(2/pi) * (v + 0.044715 * v^3)))`.
fn cpu_gelu(x: &[f32]) -> Vec<f32> {
    const SQRT_2_OVER_PI: f32 = 0.7978845608f32;

    let mut out = Vec::with_capacity(x.len());
    for &v in x {
        let inner = SQRT_2_OVER_PI * (v + 0.044715 * v * v * v);
        out.push(0.5 * v * (1.0 + inner.tanh()));
    }
    out
}
|
||||
|
||||
/// CPU reference SiLU: `v / (1 + exp(-v))` for each element.
fn cpu_silu(x: &[f32]) -> Vec<f32> {
    let mut out = Vec::with_capacity(x.len());
    for &v in x {
        let denom = 1.0 + (-v).exp();
        out.push(v / denom);
    }
    out
}
|
||||
|
||||
/// CPU reference row-wise softmax over rows of width `cols`, subtracting the
/// row maximum before exponentiating.
///
/// Only complete rows are processed; trailing elements that do not fill a row
/// stay at 0.0 in the output.
fn cpu_softmax(x: &[f32], cols: usize) -> Vec<f32> {
    let mut out = vec![0.0f32; x.len()];
    for (src, dst) in x.chunks_exact(cols).zip(out.chunks_exact_mut(cols)) {
        let max = src.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
        // First pass stores the shifted exponentials, second pass normalizes them.
        for (o, v) in dst.iter_mut().zip(src) {
            *o = (v - max).exp();
        }
        let sum: f32 = dst.iter().sum();
        for o in dst.iter_mut() {
            *o /= sum;
        }
    }
    out
}
|
||||
|
||||
/// CPU reference rotary position embedding, applied in place.
///
/// `x` is indexed as `[token, head, head_dim]`. Within each head, adjacent
/// pairs `(2i, 2i + 1)` are rotated by `pos * theta^(-2i / head_dim)`.
fn cpu_rope(x: &mut [f32], positions: &[u32], num_heads: usize, head_dim: usize, theta: f32) {
    let half_dim = head_dim / 2;
    for (t, &p) in positions.iter().enumerate() {
        let pos = p as f32;
        for h in 0..num_heads {
            let base = (t * num_heads + h) * head_dim;
            for i in 0..half_dim {
                let freq = 1.0 / theta.powf(2.0 * i as f32 / head_dim as f32);
                let angle = pos * freq;
                let cos_val = angle.cos();
                let sin_val = angle.sin();

                let even = base + 2 * i;
                let odd = even + 1;
                let x0 = x[even];
                let x1 = x[odd];
                x[even] = x0 * cos_val - x1 * sin_val;
                x[odd] = x0 * sin_val + x1 * cos_val;
            }
        }
    }
}
|
||||
|
||||
/// Asserts that `result` and `expected` have the same length and agree
/// element-wise within `atol`.
///
/// Panics on the first element whose absolute difference exceeds `atol` (a NaN
/// difference also fails). When every element passes, prints the largest
/// absolute error seen, labelled with `name`.
fn check_close(result: &[f32], expected: &[f32], atol: f32, name: &str) {
    assert_eq!(result.len(), expected.len(), "{name}: length mismatch");

    let mut max_err = 0.0f32;
    for i in 0..result.len() {
        let (r, e) = (result[i], expected[i]);
        let err = (r - e).abs();
        // `f32::max` ignores a NaN operand, matching a plain `>` comparison.
        max_err = max_err.max(err);
        assert!(err <= atol, "{name}: mismatch at [{i}]: got {r}, expected {e}, err {err}");
    }

    println!("{name}: max_err = {max_err:.6e}");
}
|
||||
|
||||
/// Builds `n` deterministic values that cycle with period 17 over
/// `-0.8, -0.7, ..., 0.8` (i.e. `((i % 17) - 8) * 0.1`).
fn make_data(n: usize) -> Vec<f32> {
    let mut data = Vec::with_capacity(n);
    for i in 0..n {
        let centered = (i % 17) as f32 - 8.0;
        data.push(centered * 0.1);
    }
    data
}
|
||||
|
||||
// === RMSNorm ===
|
||||
|
||||
/// f32 RMSNorm on a `[4, 768]` input against the CPU reference (tolerance 1e-4).
#[test]
fn test_rmsnorm_f32() {
    init();
    let (rows, hidden) = (4, 768);
    let eps = 1e-5;

    let x_host = make_data(rows * hidden);
    let gamma_host: Vec<f32> = (0..hidden).map(|i| 0.5 + (i % 3) as f32 * 0.2).collect();
    let reference = cpu_rmsnorm(&x_host, &gamma_host, eps, hidden);

    let x = Tensor::from_slice(&x_host, &[rows, hidden]).to_device(Device::Cuda(0));
    let gamma = Tensor::from_slice(&gamma_host, &[hidden]).to_device(Device::Cuda(0));
    let out = rmsnorm(&x, &gamma, eps).to_device(Device::Cpu);

    check_close(out.as_slice::<f32>(), &reference, 1e-4, "rmsnorm_f32");
}
|
||||
|
||||
/// bf16 RMSNorm on a `[4, 768]` input against the f32 CPU reference
/// (tolerance 0.05).
#[test]
fn test_rmsnorm_bf16() {
    init();
    let (rows, hidden) = (4, 768);
    let eps = 1e-5;

    let x_f32 = make_data(rows * hidden);
    let gamma_f32: Vec<f32> = (0..hidden).map(|i| 0.5 + (i % 3) as f32 * 0.2).collect();
    let reference = cpu_rmsnorm(&x_f32, &gamma_f32, eps, hidden);

    // Narrow the host data to bf16 before uploading.
    let to_bf16 = |src: &[f32]| -> Vec<bf16> { src.iter().map(|&v| bf16::from_f32(v)).collect() };
    let x_bf16 = to_bf16(&x_f32);
    let gamma_bf16 = to_bf16(&gamma_f32);

    let x = Tensor::from_slice(&x_bf16, &[rows, hidden]).to_device(Device::Cuda(0));
    let gamma = Tensor::from_slice(&gamma_bf16, &[hidden]).to_device(Device::Cuda(0));
    let out = rmsnorm(&x, &gamma, eps).to_device(Device::Cpu);

    let widened: Vec<f32> = out.as_slice::<bf16>().iter().map(|h| h.to_f32()).collect();
    check_close(&widened, &reference, 0.05, "rmsnorm_bf16");
}
|
||||
|
||||
// === LayerNorm ===
|
||||
|
||||
/// f32 LayerNorm on a `[4, 768]` input against the CPU reference (tolerance 1e-4).
#[test]
fn test_layernorm_f32() {
    init();
    let (rows, hidden) = (4, 768);
    let eps = 1e-5;

    let x_host = make_data(rows * hidden);
    let gamma_host: Vec<f32> = (0..hidden).map(|i| 0.8 + (i % 5) as f32 * 0.1).collect();
    let beta_host: Vec<f32> = (0..hidden).map(|i| ((i % 7) as f32 - 3.0) * 0.01).collect();
    let reference = cpu_layernorm(&x_host, &gamma_host, &beta_host, eps, hidden);

    let x = Tensor::from_slice(&x_host, &[rows, hidden]).to_device(Device::Cuda(0));
    let gamma = Tensor::from_slice(&gamma_host, &[hidden]).to_device(Device::Cuda(0));
    let beta = Tensor::from_slice(&beta_host, &[hidden]).to_device(Device::Cuda(0));
    let out = layernorm(&x, &gamma, &beta, eps).to_device(Device::Cpu);

    check_close(out.as_slice::<f32>(), &reference, 1e-4, "layernorm_f32");
}
|
||||
|
||||
// === GELU ===
|
||||
|
||||
/// f32 GELU on 10000 elements against the CPU reference (tolerance 1e-5).
#[test]
fn test_gelu_f32() {
    init();
    let len = 10000;
    let host = make_data(len);
    let reference = cpu_gelu(&host);

    let input = Tensor::from_slice(&host, &[len]).to_device(Device::Cuda(0));
    let out = gelu(&input).to_device(Device::Cpu);

    check_close(out.as_slice::<f32>(), &reference, 1e-5, "gelu_f32");
}
|
||||
|
||||
/// bf16 GELU on 10000 elements against the f32 CPU reference (tolerance 0.02).
#[test]
fn test_gelu_bf16() {
    init();
    let len = 10000;
    let host_f32 = make_data(len);
    let reference = cpu_gelu(&host_f32);

    let host_bf16: Vec<bf16> = host_f32.iter().map(|&v| bf16::from_f32(v)).collect();
    let input = Tensor::from_slice(&host_bf16, &[len]).to_device(Device::Cuda(0));
    let out = gelu(&input).to_device(Device::Cpu);

    let widened: Vec<f32> = out.as_slice::<bf16>().iter().map(|h| h.to_f32()).collect();
    check_close(&widened, &reference, 0.02, "gelu_bf16");
}
|
||||
|
||||
// === SiLU ===
|
||||
|
||||
/// f32 SiLU on 10000 elements against the CPU reference (tolerance 1e-5).
#[test]
fn test_silu_f32() {
    init();
    let len = 10000;
    let host = make_data(len);
    let reference = cpu_silu(&host);

    let input = Tensor::from_slice(&host, &[len]).to_device(Device::Cuda(0));
    let out = silu(&input).to_device(Device::Cpu);

    check_close(out.as_slice::<f32>(), &reference, 1e-5, "silu_f32");
}
|
||||
|
||||
// === Softmax ===
|
||||
|
||||
/// f32 row-wise softmax on an `[8, 256]` input against the CPU reference
/// (tolerance 1e-5).
#[test]
fn test_softmax_f32() {
    init();
    let (rows, cols) = (8, 256);
    let host = make_data(rows * cols);
    let reference = cpu_softmax(&host, cols);

    let input = Tensor::from_slice(&host, &[rows, cols]).to_device(Device::Cuda(0));
    let out = softmax(&input).to_device(Device::Cpu);

    check_close(out.as_slice::<f32>(), &reference, 1e-5, "softmax_f32");
}
|
||||
|
||||
/// Each row of a `[4, 2048]` softmax output must sum to 1 within 1e-5.
#[test]
fn test_softmax_sum_to_one() {
    init();
    let (rows, cols) = (4, 2048);
    let host: Vec<f32> = (0..rows * cols)
        .map(|i| ((i % 31) as f32 - 15.0) * 0.5)
        .collect();

    let input = Tensor::from_slice(&host, &[rows, cols]).to_device(Device::Cuda(0));
    let out = softmax(&input).to_device(Device::Cpu);
    let result = out.as_slice::<f32>();

    for r in 0..rows {
        let start = r * cols;
        let row_sum: f32 = result[start..start + cols].iter().sum();
        assert!((row_sum - 1.0).abs() < 1e-5, "softmax row {r} sum = {row_sum}");
    }
}
|
||||
|
||||
/// Softmax over inputs near 1000, where an unshifted `exp` would overflow f32,
/// must still match the CPU reference (tolerance 1e-5).
#[test]
fn test_softmax_large_values() {
    init();
    let host = vec![1000.0f32, 1001.0, 999.0, 1000.5];
    let reference = cpu_softmax(&host, 4);

    let input = Tensor::from_slice(&host, &[1, 4]).to_device(Device::Cuda(0));
    let out = softmax(&input).to_device(Device::Cpu);

    check_close(out.as_slice::<f32>(), &reference, 1e-5, "softmax_large");
}
|
||||
|
||||
// === Embedding ===
|
||||
|
||||
/// Looks up five token ids in a `[100, 64]` table and checks that each output
/// row equals the corresponding table row.
#[test]
fn test_embedding_f32() {
    init();
    let (vocab_size, hidden) = (100, 64);
    let table_data: Vec<f32> = (0..vocab_size * hidden).map(|i| i as f32 * 0.01).collect();
    let token_ids: Vec<u32> = vec![0, 5, 99, 42, 1];

    let table = Tensor::from_slice(&table_data, &[vocab_size, hidden]).to_device(Device::Cuda(0));
    let out = embedding(&table, &token_ids).to_device(Device::Cpu);
    assert_eq!(out.shape(), &[5, hidden]);

    let result = out.as_slice::<f32>();
    for (seq_idx, &tid) in token_ids.iter().enumerate() {
        let want_row = &table_data[tid as usize * hidden..][..hidden];
        let got_row = &result[seq_idx * hidden..][..hidden];
        for (i, (&got, &expected)) in got_row.iter().zip(want_row).enumerate() {
            assert!(
                (got - expected).abs() < 1e-6,
                "embedding mismatch at [{seq_idx},{i}]: got {got}, expected {expected}"
            );
        }
    }
}
|
||||
|
||||
// === RoPE ===
|
||||
|
||||
/// In-place GPU RoPE on a `[4, 2, 8]` tensor at positions 0..3 against the CPU
/// reference (tolerance 1e-4).
#[test]
fn test_rope_f32() {
    init();
    let (num_tokens, num_heads, head_dim) = (4, 2, 8);
    let theta = 10000.0f32;
    let positions: Vec<u32> = vec![0, 1, 2, 3];

    let x_host: Vec<f32> = (0..num_tokens * num_heads * head_dim)
        .map(|i| ((i % 13) as f32 - 6.0) * 0.1)
        .collect();

    // CPU reference works on a copy so the GPU input stays untouched.
    let mut reference = x_host.clone();
    cpu_rope(&mut reference, &positions, num_heads, head_dim, theta);

    let x = Tensor::from_slice(&x_host, &[num_tokens, num_heads, head_dim])
        .to_device(Device::Cuda(0));
    let cache = RopeCache::new(64, head_dim, theta);
    rope_inplace(&x, &cache, &positions);

    let rotated = x.to_device(Device::Cpu);
    check_close(rotated.as_slice::<f32>(), &reference, 1e-4, "rope_f32");
}
|
||||
|
||||
/// At position 0 every rotation angle is 0 (cos = 1, sin = 0), so RoPE must
/// leave the input unchanged (tolerance 1e-6).
#[test]
fn test_rope_position_0_identity() {
    init();
    let (num_tokens, num_heads, head_dim) = (1, 2, 8);
    let positions: Vec<u32> = vec![0];

    let x_host: Vec<f32> = (0..num_tokens * num_heads * head_dim)
        .map(|i| (i as f32 + 1.0) * 0.1)
        .collect();

    let x = Tensor::from_slice(&x_host, &[num_tokens, num_heads, head_dim])
        .to_device(Device::Cuda(0));
    let cache = RopeCache::new(64, head_dim, 10000.0);
    rope_inplace(&x, &cache, &positions);

    let rotated = x.to_device(Device::Cpu);
    check_close(rotated.as_slice::<f32>(), &x_host, 1e-6, "rope_pos0");
}
|
||||
14
crates/xserv-model/Cargo.toml
Normal file
14
crates/xserv-model/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "xserv-model"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[dependencies]
|
||||
xserv-cuda = { path = "../xserv-cuda" }
|
||||
xserv-tensor = { path = "../xserv-tensor" }
|
||||
xserv-kernels = { path = "../xserv-kernels" }
|
||||
xserv-tokenizer = { path = "../xserv-tokenizer" }
|
||||
half.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
safetensors.workspace = true
|
||||
78
crates/xserv-model/src/bin/xserv-cli.rs
Normal file
78
crates/xserv-model/src/bin/xserv-cli.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
use std::io::{self, Write};
|
||||
use std::path::PathBuf;
|
||||
use xserv_model::{GPT2, ModelConfig};
|
||||
use xserv_model::loader;
|
||||
use xserv_model::gpt2::sample_greedy;
|
||||
use xserv_tokenizer::Tokenizer;
|
||||
use xserv_tensor::Device;
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = std::env::args().collect();
|
||||
if args.len() < 2 {
|
||||
eprintln!("Usage: xserv-cli <model-dir> [--max-tokens N]");
|
||||
eprintln!(" model-dir: path to HF model directory (containing model.safetensors, config.json, tokenizer.json)");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
let model_dir = PathBuf::from(&args[1]);
|
||||
let max_tokens: usize = args.iter()
|
||||
.position(|a| a == "--max-tokens")
|
||||
.and_then(|i| args.get(i + 1))
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(100);
|
||||
|
||||
xserv_cuda::device::set_device(0).unwrap();
|
||||
let info = xserv_cuda::device::device_info(0).unwrap();
|
||||
eprintln!("GPU: {} ({} MB free)", info.name, info.free_memory / 1024 / 1024);
|
||||
|
||||
// Load config
|
||||
let config = ModelConfig::from_file(&model_dir.join("config.json"));
|
||||
eprintln!("Model: {:?}, layers={}, hidden={}, heads={}, vocab={}",
|
||||
config.model_type, config.num_layers(), config.hidden(),
|
||||
config.num_heads(), config.vocab_size);
|
||||
|
||||
// Load weights
|
||||
eprintln!("Loading weights...");
|
||||
let weights = loader::load_model_dir(&model_dir, Device::Cuda(0));
|
||||
eprintln!("Loaded {} tensors", weights.len());
|
||||
|
||||
// GPT-2 uses weight names without "model." prefix
|
||||
let model = GPT2::from_weights(config, weights);
|
||||
|
||||
// Load tokenizer
|
||||
let tokenizer = Tokenizer::from_file(&model_dir.join("tokenizer.json"));
|
||||
eprintln!("Tokenizer loaded (vocab_size={})", tokenizer.vocab_size());
|
||||
eprintln!("Ready.\n");
|
||||
|
||||
// Interactive loop
|
||||
loop {
|
||||
print!("xserv> ");
|
||||
io::stdout().flush().unwrap();
|
||||
let mut input = String::new();
|
||||
if io::stdin().read_line(&mut input).unwrap() == 0 {
|
||||
break;
|
||||
}
|
||||
let input = input.trim();
|
||||
if input.is_empty() { continue; }
|
||||
if input == "quit" || input == "exit" { break; }
|
||||
|
||||
let mut token_ids = tokenizer.encode(input);
|
||||
print!("{input}");
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
for _ in 0..max_tokens {
|
||||
let logits = model.forward(&token_ids);
|
||||
let next = sample_greedy(&logits);
|
||||
token_ids.push(next);
|
||||
|
||||
let text = tokenizer.decode(&[next]);
|
||||
print!("{text}");
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
if tokenizer.eos_token_id() == Some(next) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
println!();
|
||||
}
|
||||
}
|
||||
96
crates/xserv-model/src/config.rs
Normal file
96
crates/xserv-model/src/config.rs
Normal file
@@ -0,0 +1,96 @@
|
||||
use serde::Deserialize;
|
||||
use std::path::Path;
|
||||
|
||||
/// Model hyper-parameters deserialized from a `config.json` file.
///
/// Both the modern Hugging Face field names and the GPT-2 field names are
/// accepted. Apart from `vocab_size`, every field is optional; the accessor
/// methods on this type pick whichever spelling is present.
#[derive(Debug, Clone, Deserialize)]
pub struct ModelConfig {
    pub architectures: Option<Vec<String>>,
    pub model_type: Option<String>,

    // Modern HF naming
    #[serde(default)]
    pub hidden_size: Option<usize>,
    #[serde(default)]
    pub intermediate_size: Option<usize>,
    #[serde(default)]
    pub num_attention_heads: Option<usize>,
    #[serde(default)]
    pub num_key_value_heads: Option<usize>,
    #[serde(default)]
    pub num_hidden_layers: Option<usize>,
    /// The only field that must be present in the file.
    pub vocab_size: usize,
    #[serde(default)]
    pub max_position_embeddings: Option<usize>,

    // GPT-2 naming
    #[serde(default)]
    pub n_embd: Option<usize>,
    #[serde(default)]
    pub n_head: Option<usize>,
    #[serde(default)]
    pub n_layer: Option<usize>,
    #[serde(default)]
    pub n_positions: Option<usize>,
    #[serde(default)]
    pub n_inner: Option<usize>,

    // Normalization
    #[serde(default)]
    pub layer_norm_eps: Option<f64>,
    #[serde(default)]
    pub layer_norm_epsilon: Option<f64>,
    #[serde(default)]
    pub rms_norm_eps: Option<f64>,

    // Other
    #[serde(default)]
    pub rope_theta: Option<f64>,
    #[serde(default)]
    pub tie_word_embeddings: Option<bool>,
}
|
||||
|
||||
impl ModelConfig {
|
||||
pub fn from_file(path: &Path) -> Self {
|
||||
let data = std::fs::read_to_string(path)
|
||||
.unwrap_or_else(|e| panic!("failed to read {}: {e}", path.display()));
|
||||
serde_json::from_str(&data)
|
||||
.unwrap_or_else(|e| panic!("failed to parse {}: {e}", path.display()))
|
||||
}
|
||||
|
||||
pub fn hidden(&self) -> usize {
|
||||
self.hidden_size.or(self.n_embd).expect("hidden_size or n_embd required")
|
||||
}
|
||||
|
||||
pub fn num_heads(&self) -> usize {
|
||||
self.num_attention_heads.or(self.n_head).expect("num_attention_heads or n_head required")
|
||||
}
|
||||
|
||||
pub fn num_layers(&self) -> usize {
|
||||
self.num_hidden_layers.or(self.n_layer).expect("num_hidden_layers or n_layer required")
|
||||
}
|
||||
|
||||
pub fn max_seq_len(&self) -> usize {
|
||||
self.max_position_embeddings.or(self.n_positions).unwrap_or(2048)
|
||||
}
|
||||
|
||||
pub fn ffn_hidden(&self) -> usize {
|
||||
self.intermediate_size.or(self.n_inner).unwrap_or(self.hidden() * 4)
|
||||
}
|
||||
|
||||
pub fn num_kv_heads(&self) -> usize {
|
||||
self.num_key_value_heads.unwrap_or(self.num_heads())
|
||||
}
|
||||
|
||||
pub fn head_dim(&self) -> usize {
|
||||
self.hidden() / self.num_heads()
|
||||
}
|
||||
|
||||
pub fn ln_eps(&self) -> f32 {
|
||||
self.layer_norm_eps
|
||||
.or(self.layer_norm_epsilon)
|
||||
.unwrap_or(1e-5) as f32
|
||||
}
|
||||
|
||||
pub fn tied_embeddings(&self) -> bool {
|
||||
self.tie_word_embeddings.unwrap_or(true)
|
||||
}
|
||||
}
|
||||
224
crates/xserv-model/src/gpt2.rs
Normal file
224
crates/xserv-model/src/gpt2.rs
Normal file
@@ -0,0 +1,224 @@
|
||||
use std::collections::HashMap;
|
||||
use xserv_kernels::*;
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
use crate::config::ModelConfig;
|
||||
|
||||
/// GPT-2 model weights plus the configuration they were loaded with.
///
/// Built by `GPT2::from_weights`; `forward` runs the full network over a token
/// sequence and returns logits.
pub struct GPT2 {
    pub config: ModelConfig,
    wte: Tensor,           // [vocab_size, hidden]
    wpe: Tensor,           // [max_pos, hidden]
    layers: Vec<GPT2Block>,
    ln_f_g: Tensor,        // [hidden]
    ln_f_b: Tensor,        // [hidden]
}
|
||||
|
||||
/// Weights of one transformer block: two LayerNorms (`ln_1`, `ln_2`), the
/// fused QKV and output projections of attention, and the two MLP projections.
struct GPT2Block {
    ln_1_g: Tensor,
    ln_1_b: Tensor,
    // Attention: combined QKV weight + bias, output weight + bias
    attn_qkv_w: Tensor,  // [hidden, 3*hidden]
    attn_qkv_b: Tensor,  // [3*hidden]
    attn_out_w: Tensor,  // [hidden, hidden]
    attn_out_b: Tensor,  // [hidden]
    ln_2_g: Tensor,
    ln_2_b: Tensor,
    mlp_fc_w: Tensor,    // [hidden, 4*hidden]
    mlp_fc_b: Tensor,    // [4*hidden]
    mlp_proj_w: Tensor,  // [4*hidden, hidden]
    mlp_proj_b: Tensor,  // [hidden]
}
|
||||
|
||||
impl GPT2 {
|
||||
pub fn from_weights(config: ModelConfig, mut w: HashMap<String, Tensor>) -> Self {
|
||||
let take = |w: &mut HashMap<String, Tensor>, name: &str| -> Tensor {
|
||||
w.remove(name).unwrap_or_else(|| panic!("missing weight: {name}"))
|
||||
};
|
||||
|
||||
let wte = take(&mut w, "wte.weight");
|
||||
let wpe = take(&mut w, "wpe.weight");
|
||||
let ln_f_g = take(&mut w, "ln_f.weight");
|
||||
let ln_f_b = take(&mut w, "ln_f.bias");
|
||||
|
||||
let num_layers = config.num_layers();
|
||||
let mut layers = Vec::with_capacity(num_layers);
|
||||
for i in 0..num_layers {
|
||||
let p = format!("h.{i}");
|
||||
layers.push(GPT2Block {
|
||||
ln_1_g: take(&mut w, &format!("{p}.ln_1.weight")),
|
||||
ln_1_b: take(&mut w, &format!("{p}.ln_1.bias")),
|
||||
attn_qkv_w: take(&mut w, &format!("{p}.attn.c_attn.weight")),
|
||||
attn_qkv_b: take(&mut w, &format!("{p}.attn.c_attn.bias")),
|
||||
attn_out_w: take(&mut w, &format!("{p}.attn.c_proj.weight")),
|
||||
attn_out_b: take(&mut w, &format!("{p}.attn.c_proj.bias")),
|
||||
ln_2_g: take(&mut w, &format!("{p}.ln_2.weight")),
|
||||
ln_2_b: take(&mut w, &format!("{p}.ln_2.bias")),
|
||||
mlp_fc_w: take(&mut w, &format!("{p}.mlp.c_fc.weight")),
|
||||
mlp_fc_b: take(&mut w, &format!("{p}.mlp.c_fc.bias")),
|
||||
mlp_proj_w: take(&mut w, &format!("{p}.mlp.c_proj.weight")),
|
||||
mlp_proj_b: take(&mut w, &format!("{p}.mlp.c_proj.bias")),
|
||||
});
|
||||
}
|
||||
|
||||
Self { config, wte, wpe, layers, ln_f_g, ln_f_b }
|
||||
}
|
||||
|
||||
/// Full forward pass, returns logits [seq_len, vocab_size].
|
||||
pub fn forward(&self, token_ids: &[u32]) -> Tensor {
|
||||
let seq_len = token_ids.len();
|
||||
let hidden = self.config.hidden();
|
||||
let num_heads = self.config.num_heads();
|
||||
let head_dim = self.config.head_dim();
|
||||
|
||||
// Token + position embedding
|
||||
let tok_emb = embedding(&self.wte, token_ids);
|
||||
let pos_ids: Vec<u32> = (0..seq_len as u32).collect();
|
||||
let pos_emb = embedding(&self.wpe, &pos_ids);
|
||||
let mut x = add_tensors(&tok_emb, &pos_emb);
|
||||
|
||||
// Transformer layers
|
||||
for layer in &self.layers {
|
||||
// Pre-LN attention
|
||||
let residual = x.clone();
|
||||
let normed = layernorm(&x, &layer.ln_1_g, &layer.ln_1_b, self.config.ln_eps());
|
||||
|
||||
// QKV projection: [S, H] @ [H, 3H] + [3H] → [S, 3H]
|
||||
let qkv = linear(&normed, &layer.attn_qkv_w, Some(&layer.attn_qkv_b));
|
||||
// Split into Q, K, V and reshape for multi-head
|
||||
let (q, k, v) = split_qkv(&qkv, num_heads, head_dim, seq_len);
|
||||
// Attention: [1, H, S, D]
|
||||
let attn_out = attention(&q, &k, &v, true);
|
||||
// Merge heads: [1, H, S, D] → [S, hidden]
|
||||
let attn_out = merge_heads(&attn_out, seq_len, hidden);
|
||||
// Output projection
|
||||
let attn_out = linear(&attn_out, &layer.attn_out_w, Some(&layer.attn_out_b));
|
||||
x = add_tensors(&residual, &attn_out);
|
||||
|
||||
// Pre-LN MLP
|
||||
let residual = x.clone();
|
||||
let normed = layernorm(&x, &layer.ln_2_g, &layer.ln_2_b, self.config.ln_eps());
|
||||
let fc = linear(&normed, &layer.mlp_fc_w, Some(&layer.mlp_fc_b));
|
||||
let activated = gelu(&fc);
|
||||
let proj = linear(&activated, &layer.mlp_proj_w, Some(&layer.mlp_proj_b));
|
||||
x = add_tensors(&residual, &proj);
|
||||
}
|
||||
|
||||
// Final layer norm
|
||||
let x = layernorm(&x, &self.ln_f_g, &self.ln_f_b, self.config.ln_eps());
|
||||
|
||||
// LM head (tied with wte): [S, H] @ [H, V] → [S, V]
|
||||
// wte is [V, H], so we need wte^T
|
||||
let lm_head = self.wte.transpose(0, 1).contiguous();
|
||||
matmul_2d(&x, &lm_head)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Helper ops ---
|
||||
|
||||
fn linear(x: &Tensor, weight: &Tensor, bias: Option<&Tensor>) -> Tensor {
|
||||
// GPT-2 stores weights as [in, out] (not transposed), so x @ w
|
||||
let out = matmul_2d(x, weight);
|
||||
if let Some(b) = bias {
|
||||
add_bias(&out, b)
|
||||
} else {
|
||||
out
|
||||
}
|
||||
}
|
||||
|
||||
fn matmul_2d(a: &Tensor, b: &Tensor) -> Tensor {
|
||||
// a: [S, K], b: [K, N] → [S, N]
|
||||
assert_eq!(a.ndim(), 2);
|
||||
assert_eq!(b.ndim(), 2);
|
||||
matmul(a, b, GemmBackend::CuBlas)
|
||||
}
|
||||
|
||||
fn add_tensors(a: &Tensor, b: &Tensor) -> Tensor {
|
||||
// Element-wise add on GPU via a simple approach: scale(a, 1.0) + scale(b, 1.0)
|
||||
// TODO: proper add kernel. For now, go through CPU.
|
||||
assert_eq!(a.shape(), b.shape());
|
||||
assert_eq!(a.dtype(), DType::F32);
|
||||
let a_cpu = a.to_device(Device::Cpu);
|
||||
let b_cpu = b.to_device(Device::Cpu);
|
||||
let a_data = a_cpu.as_slice::<f32>();
|
||||
let b_data = b_cpu.as_slice::<f32>();
|
||||
let sum: Vec<f32> = a_data.iter().zip(b_data).map(|(x, y)| x + y).collect();
|
||||
Tensor::from_slice(&sum, a.shape()).to_device(a.device())
|
||||
}
|
||||
|
||||
fn add_bias(x: &Tensor, bias: &Tensor) -> Tensor {
|
||||
// x: [S, N], bias: [N] → broadcast add
|
||||
assert_eq!(x.ndim(), 2);
|
||||
assert_eq!(bias.ndim(), 1);
|
||||
assert_eq!(x.shape()[1], bias.shape()[0]);
|
||||
let x_cpu = x.to_device(Device::Cpu);
|
||||
let b_cpu = bias.to_device(Device::Cpu);
|
||||
let x_data = x_cpu.as_slice::<f32>();
|
||||
let b_data = b_cpu.as_slice::<f32>();
|
||||
let n = bias.shape()[0];
|
||||
let result: Vec<f32> = x_data.iter().enumerate().map(|(i, &v)| v + b_data[i % n]).collect();
|
||||
Tensor::from_slice(&result, x.shape()).to_device(x.device())
|
||||
}
|
||||
|
||||
fn split_qkv(qkv: &Tensor, num_heads: usize, head_dim: usize, seq_len: usize) -> (Tensor, Tensor, Tensor) {
|
||||
// qkv: [S, 3*H] → Q, K, V each [1, num_heads, S, head_dim]
|
||||
let hidden = num_heads * head_dim;
|
||||
let qkv_cpu = qkv.to_device(Device::Cpu);
|
||||
let data = qkv_cpu.as_slice::<f32>();
|
||||
|
||||
// Split into Q, K, V and directly write in [1, num_heads, S, head_dim] layout
|
||||
let mut q_data = vec![0.0f32; num_heads * seq_len * head_dim];
|
||||
let mut k_data = vec![0.0f32; num_heads * seq_len * head_dim];
|
||||
let mut v_data = vec![0.0f32; num_heads * seq_len * head_dim];
|
||||
|
||||
for s in 0..seq_len {
|
||||
let row = &data[s * 3 * hidden..(s + 1) * 3 * hidden];
|
||||
for h in 0..num_heads {
|
||||
let src_off = h * head_dim;
|
||||
let dst_off = (h * seq_len + s) * head_dim;
|
||||
q_data[dst_off..dst_off + head_dim].copy_from_slice(&row[src_off..src_off + head_dim]);
|
||||
k_data[dst_off..dst_off + head_dim].copy_from_slice(&row[hidden + src_off..hidden + src_off + head_dim]);
|
||||
v_data[dst_off..dst_off + head_dim].copy_from_slice(&row[2 * hidden + src_off..2 * hidden + src_off + head_dim]);
|
||||
}
|
||||
}
|
||||
|
||||
let device = qkv.device();
|
||||
let q = Tensor::from_slice(&q_data, &[1, num_heads, seq_len, head_dim]).to_device(device);
|
||||
let k = Tensor::from_slice(&k_data, &[1, num_heads, seq_len, head_dim]).to_device(device);
|
||||
let v = Tensor::from_slice(&v_data, &[1, num_heads, seq_len, head_dim]).to_device(device);
|
||||
(q, k, v)
|
||||
}
|
||||
|
||||
fn merge_heads(x: &Tensor, seq_len: usize, hidden: usize) -> Tensor {
|
||||
// [1, num_heads, S, head_dim] → [S, hidden]
|
||||
let num_heads = x.shape()[1];
|
||||
let head_dim = x.shape()[3];
|
||||
let x_cpu = x.to_device(Device::Cpu);
|
||||
let src = x_cpu.as_slice::<f32>();
|
||||
|
||||
// src layout: [1][num_heads][seq_len][head_dim]
|
||||
// dst layout: [seq_len][hidden] where hidden = num_heads * head_dim
|
||||
let mut out = vec![0.0f32; seq_len * hidden];
|
||||
for s in 0..seq_len {
|
||||
for h in 0..num_heads {
|
||||
let src_off = (h * seq_len + s) * head_dim;
|
||||
let dst_off = s * hidden + h * head_dim;
|
||||
out[dst_off..dst_off + head_dim].copy_from_slice(&src[src_off..src_off + head_dim]);
|
||||
}
|
||||
}
|
||||
Tensor::from_slice(&out, &[seq_len, hidden]).to_device(x.device())
|
||||
}
|
||||
|
||||
/// Greedy sampling: return the argmax token ID from the last position's logits.
|
||||
pub fn sample_greedy(logits: &Tensor) -> u32 {
|
||||
assert_eq!(logits.ndim(), 2); // [S, V]
|
||||
let logits_cpu = logits.to_device(Device::Cpu);
|
||||
let data = logits_cpu.as_slice::<f32>();
|
||||
let vocab_size = logits.shape()[1];
|
||||
let seq_len = logits.shape()[0];
|
||||
let last_row = &data[(seq_len - 1) * vocab_size..seq_len * vocab_size];
|
||||
last_row.iter()
|
||||
.enumerate()
|
||||
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
|
||||
.map(|(idx, _)| idx as u32)
|
||||
.unwrap()
|
||||
}
|
||||
6
crates/xserv-model/src/lib.rs
Normal file
6
crates/xserv-model/src/lib.rs
Normal file
@@ -0,0 +1,6 @@
|
||||
pub mod config;
|
||||
pub mod gpt2;
|
||||
pub mod loader;
|
||||
|
||||
pub use config::ModelConfig;
|
||||
pub use gpt2::GPT2;
|
||||
87
crates/xserv-model/src/loader.rs
Normal file
87
crates/xserv-model/src/loader.rs
Normal file
@@ -0,0 +1,87 @@
|
||||
use half::{bf16, f16};
|
||||
use safetensors::SafeTensors;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
pub fn load_safetensors(path: &Path, device: Device) -> HashMap<String, Tensor> {
|
||||
let data = std::fs::read(path)
|
||||
.unwrap_or_else(|e| panic!("failed to read {}: {e}", path.display()));
|
||||
let st = SafeTensors::deserialize(&data)
|
||||
.unwrap_or_else(|e| panic!("failed to parse safetensors {}: {e}", path.display()));
|
||||
|
||||
let mut tensors = HashMap::new();
|
||||
|
||||
for (name, view) in st.tensors() {
|
||||
let shape: Vec<usize> = view.shape().to_vec();
|
||||
let raw_bytes = view.data();
|
||||
let dtype = match view.dtype() {
|
||||
safetensors::Dtype::F32 => DType::F32,
|
||||
safetensors::Dtype::F16 => DType::F16,
|
||||
safetensors::Dtype::BF16 => DType::BF16,
|
||||
other => {
|
||||
eprintln!("skipping tensor {name}: unsupported dtype {other:?}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let tensor = make_tensor(raw_bytes, &shape, dtype);
|
||||
let tensor = tensor.to_device(device);
|
||||
tensors.insert(name.to_string(), tensor);
|
||||
}
|
||||
|
||||
tensors
|
||||
}
|
||||
|
||||
/// Load from a directory containing model.safetensors (or sharded files) + config.json.
|
||||
pub fn load_model_dir(dir: &Path, device: Device) -> HashMap<String, Tensor> {
|
||||
let single = dir.join("model.safetensors");
|
||||
if single.exists() {
|
||||
return load_safetensors(&single, device);
|
||||
}
|
||||
|
||||
// Try sharded: model-00001-of-NNNNN.safetensors
|
||||
let mut all_tensors = HashMap::new();
|
||||
let mut entries: Vec<_> = std::fs::read_dir(dir)
|
||||
.unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
.filter(|e| {
|
||||
e.path()
|
||||
.file_name()
|
||||
.map(|f| f.to_string_lossy().ends_with(".safetensors"))
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.collect();
|
||||
entries.sort_by_key(|e| e.file_name());
|
||||
|
||||
for entry in entries {
|
||||
let tensors = load_safetensors(&entry.path(), device);
|
||||
all_tensors.extend(tensors);
|
||||
}
|
||||
|
||||
assert!(!all_tensors.is_empty(), "no safetensors files found in {}", dir.display());
|
||||
all_tensors
|
||||
}
|
||||
|
||||
fn make_tensor(raw_bytes: &[u8], shape: &[usize], dtype: DType) -> Tensor {
|
||||
match dtype {
|
||||
DType::F32 => {
|
||||
let floats: &[f32] = unsafe {
|
||||
std::slice::from_raw_parts(raw_bytes.as_ptr() as *const f32, raw_bytes.len() / 4)
|
||||
};
|
||||
Tensor::from_slice(floats, shape)
|
||||
}
|
||||
DType::F16 => {
|
||||
let halfs: &[f16] = unsafe {
|
||||
std::slice::from_raw_parts(raw_bytes.as_ptr() as *const f16, raw_bytes.len() / 2)
|
||||
};
|
||||
Tensor::from_slice(halfs, shape)
|
||||
}
|
||||
DType::BF16 => {
|
||||
let bfs: &[bf16] = unsafe {
|
||||
std::slice::from_raw_parts(raw_bytes.as_ptr() as *const bf16, raw_bytes.len() / 2)
|
||||
};
|
||||
Tensor::from_slice(bfs, shape)
|
||||
}
|
||||
}
|
||||
}
|
||||
9
crates/xserv-tensor/Cargo.toml
Normal file
9
crates/xserv-tensor/Cargo.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
[package]
|
||||
name = "xserv-tensor"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[dependencies]
|
||||
xserv-cuda = { path = "../xserv-cuda" }
|
||||
half.workspace = true
|
||||
smallvec.workspace = true
|
||||
57
crates/xserv-tensor/src/dtype.rs
Normal file
57
crates/xserv-tensor/src/dtype.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use half::{bf16, f16};
|
||||
|
||||
/// Element types a tensor can hold.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DType {
    /// 32-bit float.
    F32,
    /// 16-bit float (`half::f16`).
    F16,
    /// 16-bit bfloat (`half::bf16`).
    BF16,
}

impl DType {
    /// Width of a single element in bytes.
    pub fn size_bytes(self) -> usize {
        match self {
            DType::F32 => 4,
            DType::F16 | DType::BF16 => 2,
        }
    }

    /// Lower-case name, as printed by `Display`.
    pub fn name(self) -> &'static str {
        match self {
            DType::BF16 => "bf16",
            DType::F16 => "f16",
            DType::F32 => "f32",
        }
    }
}

impl std::fmt::Display for DType {
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = self.name();
        out.write_str(label)
    }
}
|
||||
|
||||
/// Trait for types that can be stored in a Tensor.
|
||||
pub trait TensorDType: Copy + Send + Sync + 'static {
|
||||
const DTYPE: DType;
|
||||
fn to_f64(self) -> f64;
|
||||
fn from_f64(v: f64) -> Self;
|
||||
}
|
||||
|
||||
impl TensorDType for f32 {
|
||||
const DTYPE: DType = DType::F32;
|
||||
fn to_f64(self) -> f64 { self as f64 }
|
||||
fn from_f64(v: f64) -> Self { v as f32 }
|
||||
}
|
||||
|
||||
impl TensorDType for f16 {
|
||||
const DTYPE: DType = DType::F16;
|
||||
fn to_f64(self) -> f64 { self.to_f32() as f64 }
|
||||
fn from_f64(v: f64) -> Self { f16::from_f32(v as f32) }
|
||||
}
|
||||
|
||||
impl TensorDType for bf16 {
|
||||
const DTYPE: DType = DType::BF16;
|
||||
fn to_f64(self) -> f64 { self.to_f32() as f64 }
|
||||
fn from_f64(v: f64) -> Self { bf16::from_f32(v as f32) }
|
||||
}
|
||||
8
crates/xserv-tensor/src/lib.rs
Normal file
8
crates/xserv-tensor/src/lib.rs
Normal file
@@ -0,0 +1,8 @@
|
||||
pub mod dtype;
|
||||
pub mod shape;
|
||||
pub mod storage;
|
||||
pub mod tensor;
|
||||
|
||||
pub use dtype::{DType, TensorDType};
|
||||
pub use storage::Device;
|
||||
pub use tensor::Tensor;
|
||||
105
crates/xserv-tensor/src/shape.rs
Normal file
105
crates/xserv-tensor/src/shape.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
use smallvec::SmallVec;
|
||||
|
||||
pub type Dims = SmallVec<[usize; 4]>;
|
||||
|
||||
/// Compute contiguous strides for a given shape (row-major / C order).
|
||||
/// Example: shape [2, 3, 4] => strides [12, 4, 1]
|
||||
pub fn contiguous_strides(shape: &[usize]) -> Dims {
|
||||
let mut strides = SmallVec::with_capacity(shape.len());
|
||||
strides.resize(shape.len(), 0);
|
||||
if shape.is_empty() {
|
||||
return strides;
|
||||
}
|
||||
strides[shape.len() - 1] = 1;
|
||||
for i in (0..shape.len() - 1).rev() {
|
||||
strides[i] = strides[i + 1] * shape[i + 1];
|
||||
}
|
||||
strides
|
||||
}
|
||||
|
||||
/// Whether `strides` are exactly the row-major strides for `shape`.
///
/// The comparison is strict: every axis, including axes of extent 1, must
/// carry its canonical stride. An empty shape is always contiguous.
pub fn is_contiguous(shape: &[usize], strides: &[usize]) -> bool {
    if shape.is_empty() {
        return true;
    }
    if strides.len() != shape.len() {
        return false;
    }
    // Walk from the innermost axis outwards, tracking the stride a row-major
    // layout would have at each axis.
    let mut canonical = 1usize;
    for axis in (0..shape.len()).rev() {
        if strides[axis] != canonical {
            return false;
        }
        if axis > 0 {
            canonical *= shape[axis];
        }
    }
    true
}
|
||||
|
||||
/// Number of elements described by `shape`: the product of all extents
/// (1 for an empty shape).
pub fn num_elements(shape: &[usize]) -> usize {
    shape.iter().fold(1, |count, &extent| count * extent)
}
|
||||
|
||||
/// Compute the shape after broadcasting two shapes together (NumPy rules).
|
||||
/// Returns None if shapes are not broadcastable.
|
||||
pub fn broadcast_shape(a: &[usize], b: &[usize]) -> Option<Dims> {
|
||||
let ndim = a.len().max(b.len());
|
||||
let mut result = SmallVec::with_capacity(ndim);
|
||||
for i in 0..ndim {
|
||||
let da = if i < ndim - a.len() { 1 } else { a[i - (ndim - a.len())] };
|
||||
let db = if i < ndim - b.len() { 1 } else { b[i - (ndim - b.len())] };
|
||||
if da == db {
|
||||
result.push(da);
|
||||
} else if da == 1 {
|
||||
result.push(db);
|
||||
} else if db == 1 {
|
||||
result.push(da);
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
Some(result)
|
||||
}
|
||||
|
||||
/// Compute broadcast strides: for dimensions where size is 1 but output is >1, stride becomes 0.
|
||||
pub fn broadcast_strides(shape: &[usize], strides: &[usize], target_shape: &[usize]) -> Dims {
|
||||
let ndim = target_shape.len();
|
||||
let offset = ndim - shape.len();
|
||||
let mut result = SmallVec::with_capacity(ndim);
|
||||
for i in 0..ndim {
|
||||
if i < offset {
|
||||
result.push(0);
|
||||
} else {
|
||||
let orig_idx = i - offset;
|
||||
if shape[orig_idx] == 1 && target_shape[i] > 1 {
|
||||
result.push(0);
|
||||
} else {
|
||||
result.push(strides[orig_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Row-major strides for a few small shapes.
    #[test]
    fn test_contiguous_strides() {
        let rank3 = contiguous_strides(&[2, 3, 4]);
        let rank1 = contiguous_strides(&[5]);
        let rank2 = contiguous_strides(&[2, 3]);
        assert_eq!(rank3.as_slice(), &[12, 4, 1]);
        assert_eq!(rank1.as_slice(), &[1]);
        assert_eq!(rank2.as_slice(), &[3, 1]);
    }

    /// Canonical strides are contiguous; transposed strides are not.
    #[test]
    fn test_is_contiguous() {
        let row_major = is_contiguous(&[2, 3], &[3, 1]);
        let transposed = is_contiguous(&[3, 2], &[1, 3]);
        assert!(row_major);
        assert!(!transposed);
    }

    /// NumPy-style broadcasting, including one incompatible pair.
    #[test]
    fn test_broadcast_shape() {
        let both_expand = broadcast_shape(&[3, 1], &[1, 4]).unwrap();
        assert_eq!(both_expand.as_slice(), &[3, 4]);

        let trailing = broadcast_shape(&[2, 3, 4], &[4]).unwrap();
        assert_eq!(trailing.as_slice(), &[2, 3, 4]);

        let scalar_like = broadcast_shape(&[1], &[5, 3]).unwrap();
        assert_eq!(scalar_like.as_slice(), &[5, 3]);

        assert!(broadcast_shape(&[3], &[4]).is_none());
    }

    /// [3,1] with strides [1,1] broadcast to [3,4]: the expanded axis gets stride 0.
    #[test]
    fn test_broadcast_strides() {
        let expanded = broadcast_strides(&[3, 1], &[1, 1], &[3, 4]);
        assert_eq!(expanded.as_slice(), &[1, 0]);
    }
}
|
||||
119
crates/xserv-tensor/src/storage.rs
Normal file
119
crates/xserv-tensor/src/storage.rs
Normal file
@@ -0,0 +1,119 @@
|
||||
use std::sync::Arc;
|
||||
use xserv_cuda::{GpuBuffer, Result as CudaResult};
|
||||
|
||||
enum StorageInner {
|
||||
Cpu { data: Vec<u8> },
|
||||
Cuda { buffer: GpuBuffer },
|
||||
}
|
||||
|
||||
/// Reference-counted storage for tensor data. Multiple tensors can share
|
||||
/// the same storage (e.g., after transpose or slice — view semantics).
|
||||
#[derive(Clone)]
|
||||
pub struct Storage(Arc<StorageInner>);
|
||||
|
||||
/// Where a buffer lives: host memory or a CUDA device identified by index.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Device {
    Cpu,
    Cuda(u32),
}

/// Formats as `cpu` or `cuda:<index>`.
impl std::fmt::Display for Device {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            Device::Cuda(index) => write!(f, "cuda:{index}"),
            Device::Cpu => f.write_str("cpu"),
        }
    }
}
|
||||
|
||||
impl Storage {
|
||||
pub fn cpu(data: Vec<u8>) -> Self {
|
||||
Self(Arc::new(StorageInner::Cpu { data }))
|
||||
}
|
||||
|
||||
pub fn cuda(buffer: GpuBuffer) -> Self {
|
||||
Self(Arc::new(StorageInner::Cuda { buffer }))
|
||||
}
|
||||
|
||||
pub fn device(&self) -> Device {
|
||||
match self.0.as_ref() {
|
||||
StorageInner::Cpu { .. } => Device::Cpu,
|
||||
StorageInner::Cuda { .. } => Device::Cuda(0),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len_bytes(&self) -> usize {
|
||||
match self.0.as_ref() {
|
||||
StorageInner::Cpu { data } => data.len(),
|
||||
StorageInner::Cuda { buffer } => buffer.len(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a read-only view of CPU data. Panics if storage is on GPU.
|
||||
pub fn as_cpu_bytes(&self) -> &[u8] {
|
||||
match self.0.as_ref() {
|
||||
StorageInner::Cpu { data } => data,
|
||||
StorageInner::Cuda { .. } => panic!("cannot access GPU storage as CPU bytes"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn gpu_buffer(&self) -> &GpuBuffer {
|
||||
match self.0.as_ref() {
|
||||
StorageInner::Cuda { buffer } => buffer,
|
||||
StorageInner::Cpu { .. } => panic!("cannot access CPU storage as GPU buffer"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Copy to a different device. If already on the target device, clones the Arc (no copy).
|
||||
pub fn to_device(&self, target: Device) -> CudaResult<Self> {
|
||||
let current = self.device();
|
||||
if current == target {
|
||||
return Ok(self.clone());
|
||||
}
|
||||
match (current, target) {
|
||||
(Device::Cpu, Device::Cuda(_dev)) => {
|
||||
let cpu_data = self.as_cpu_bytes();
|
||||
let mut buf = GpuBuffer::alloc(cpu_data.len())?;
|
||||
buf.copy_from_host(cpu_data)?;
|
||||
Ok(Storage::cuda(buf))
|
||||
}
|
||||
(Device::Cuda(_), Device::Cpu) => {
|
||||
let gpu_buf = self.gpu_buffer();
|
||||
let mut data = vec![0u8; gpu_buf.len()];
|
||||
gpu_buf.copy_to_host(&mut data)?;
|
||||
Ok(Storage::cpu(data))
|
||||
}
|
||||
(Device::Cuda(_), Device::Cuda(_)) => {
|
||||
let src = self.gpu_buffer();
|
||||
let mut dst = GpuBuffer::alloc(src.len())?;
|
||||
dst.copy_from_device(src)?;
|
||||
Ok(Storage::cuda(dst))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new owned copy of the storage on the same device.
|
||||
pub fn deep_copy(&self) -> CudaResult<Self> {
|
||||
match self.0.as_ref() {
|
||||
StorageInner::Cpu { data } => Ok(Storage::cpu(data.clone())),
|
||||
StorageInner::Cuda { buffer } => {
|
||||
let mut dst = GpuBuffer::alloc(buffer.len())?;
|
||||
dst.copy_from_device(buffer)?;
|
||||
Ok(Storage::cuda(dst))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Allocate zeroed storage on the given device.
|
||||
pub fn zeros(len_bytes: usize, device: Device) -> CudaResult<Self> {
|
||||
match device {
|
||||
Device::Cpu => Ok(Storage::cpu(vec![0u8; len_bytes])),
|
||||
Device::Cuda(_) => {
|
||||
let mut buf = GpuBuffer::alloc(len_bytes)?;
|
||||
buf.zero()?;
|
||||
Ok(Storage::cuda(buf))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
234
crates/xserv-tensor/src/tensor.rs
Normal file
234
crates/xserv-tensor/src/tensor.rs
Normal file
@@ -0,0 +1,234 @@
|
||||
use crate::dtype::{DType, TensorDType};
|
||||
use crate::shape::{self, Dims};
|
||||
use crate::storage::{Device, Storage};
|
||||
|
||||
/// Multi-dimensional array with CPU or GPU storage.
|
||||
///
|
||||
/// Tensors support view semantics: transpose, slice, etc. share
|
||||
/// the underlying storage and only change shape/strides/offset.
|
||||
#[derive(Clone)]
|
||||
pub struct Tensor {
|
||||
storage: Storage,
|
||||
shape: Dims,
|
||||
strides: Dims,
|
||||
offset: usize,
|
||||
dtype: DType,
|
||||
}
|
||||
|
||||
impl Tensor {
|
||||
// --- Creation ---
|
||||
|
||||
pub fn from_slice<T: TensorDType>(data: &[T], shape: &[usize]) -> Self {
|
||||
let numel: usize = shape.iter().product();
|
||||
assert_eq!(data.len(), numel, "data length mismatch with shape");
|
||||
let bytes = unsafe {
|
||||
std::slice::from_raw_parts(data.as_ptr() as *const u8, numel * T::DTYPE.size_bytes())
|
||||
};
|
||||
Self {
|
||||
storage: Storage::cpu(bytes.to_vec()),
|
||||
shape: Dims::from_slice(shape),
|
||||
strides: shape::contiguous_strides(shape),
|
||||
offset: 0,
|
||||
dtype: T::DTYPE,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn zeros(shape: &[usize], dtype: DType, device: Device) -> Self {
|
||||
let numel = shape::num_elements(shape);
|
||||
let len_bytes = numel * dtype.size_bytes();
|
||||
let storage = Storage::zeros(len_bytes, device).expect("alloc failed");
|
||||
Self {
|
||||
storage,
|
||||
shape: Dims::from_slice(shape),
|
||||
strides: shape::contiguous_strides(shape),
|
||||
offset: 0,
|
||||
dtype,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ones(shape: &[usize], dtype: DType) -> Self {
|
||||
let numel = shape::num_elements(shape);
|
||||
match dtype {
|
||||
DType::F32 => Self::from_slice(&vec![1.0f32; numel], shape),
|
||||
DType::F16 => Self::from_slice(&vec![half::f16::from_f32(1.0); numel], shape),
|
||||
DType::BF16 => Self::from_slice(&vec![half::bf16::from_f32(1.0); numel], shape),
|
||||
}
|
||||
}
|
||||
|
||||
// --- Properties ---
|
||||
|
||||
pub fn shape(&self) -> &[usize] { &self.shape }
|
||||
pub fn strides(&self) -> &[usize] { &self.strides }
|
||||
pub fn dtype(&self) -> DType { self.dtype }
|
||||
pub fn ndim(&self) -> usize { self.shape.len() }
|
||||
pub fn numel(&self) -> usize { shape::num_elements(&self.shape) }
|
||||
pub fn offset(&self) -> usize { self.offset }
|
||||
|
||||
pub fn device(&self) -> Device { self.storage.device() }
|
||||
|
||||
pub fn is_contiguous(&self) -> bool {
|
||||
shape::is_contiguous(&self.shape, &self.strides)
|
||||
}
|
||||
|
||||
// --- Shape operations (view, no copy) ---
|
||||
|
||||
pub fn reshape(&self, new_shape: &[usize]) -> Self {
|
||||
assert!(self.is_contiguous(), "reshape requires contiguous tensor");
|
||||
let new_numel: usize = new_shape.iter().product();
|
||||
assert_eq!(new_numel, self.numel(), "reshape numel mismatch");
|
||||
Self {
|
||||
storage: self.storage.clone(),
|
||||
shape: Dims::from_slice(new_shape),
|
||||
strides: shape::contiguous_strides(new_shape),
|
||||
offset: self.offset,
|
||||
dtype: self.dtype,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn transpose(&self, dim0: usize, dim1: usize) -> Self {
|
||||
assert!(dim0 < self.ndim() && dim1 < self.ndim());
|
||||
let mut new_shape = self.shape.clone();
|
||||
let mut new_strides = self.strides.clone();
|
||||
new_shape.swap(dim0, dim1);
|
||||
new_strides.swap(dim0, dim1);
|
||||
Self {
|
||||
storage: self.storage.clone(),
|
||||
shape: new_shape,
|
||||
strides: new_strides,
|
||||
offset: self.offset,
|
||||
dtype: self.dtype,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn squeeze(&self, dim: usize) -> Self {
|
||||
assert!(dim < self.ndim() && self.shape[dim] == 1);
|
||||
let mut new_shape = self.shape.clone();
|
||||
let mut new_strides = self.strides.clone();
|
||||
new_shape.remove(dim);
|
||||
new_strides.remove(dim);
|
||||
Self {
|
||||
storage: self.storage.clone(),
|
||||
shape: new_shape,
|
||||
strides: new_strides,
|
||||
offset: self.offset,
|
||||
dtype: self.dtype,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn unsqueeze(&self, dim: usize) -> Self {
|
||||
assert!(dim <= self.ndim());
|
||||
let mut new_shape = self.shape.clone();
|
||||
let mut new_strides = self.strides.clone();
|
||||
new_shape.insert(dim, 1);
|
||||
let stride_val = if dim < self.strides.len() { self.strides[dim] } else { 1 };
|
||||
new_strides.insert(dim, stride_val);
|
||||
Self {
|
||||
storage: self.storage.clone(),
|
||||
shape: new_shape,
|
||||
strides: new_strides,
|
||||
offset: self.offset,
|
||||
dtype: self.dtype,
|
||||
}
|
||||
}
|
||||
|
||||
/// Make contiguous: if already contiguous, return clone (shared storage).
|
||||
/// Otherwise, copy data into a new contiguous buffer.
|
||||
pub fn contiguous(&self) -> Self {
|
||||
if self.is_contiguous() {
|
||||
return self.clone();
|
||||
}
|
||||
// For GPU tensors: round-trip through CPU (correct but slow).
|
||||
// TODO: write a GPU contiguous-copy kernel for performance.
|
||||
if matches!(self.device(), Device::Cuda(_)) {
|
||||
let cpu = self.to_device(Device::Cpu);
|
||||
let contig = cpu.contiguous();
|
||||
return contig.to_device(self.device());
|
||||
}
|
||||
let numel = self.numel();
|
||||
let elem_size = self.dtype.size_bytes();
|
||||
let src_bytes = self.storage.as_cpu_bytes();
|
||||
let mut dst = vec![0u8; numel * elem_size];
|
||||
// Iterate all elements using strides
|
||||
let ndim = self.ndim();
|
||||
let mut idx = vec![0usize; ndim];
|
||||
for flat in 0..numel {
|
||||
let src_offset = self.offset + idx.iter().zip(self.strides.iter()).map(|(i, s)| i * s).sum::<usize>();
|
||||
let src_byte_offset = src_offset * elem_size;
|
||||
let dst_byte_offset = flat * elem_size;
|
||||
dst[dst_byte_offset..dst_byte_offset + elem_size]
|
||||
.copy_from_slice(&src_bytes[src_byte_offset..src_byte_offset + elem_size]);
|
||||
// Increment index (rightmost first)
|
||||
for d in (0..ndim).rev() {
|
||||
idx[d] += 1;
|
||||
if idx[d] < self.shape[d] {
|
||||
break;
|
||||
}
|
||||
idx[d] = 0;
|
||||
}
|
||||
}
|
||||
Self {
|
||||
storage: Storage::cpu(dst),
|
||||
shape: self.shape.clone(),
|
||||
strides: shape::contiguous_strides(&self.shape),
|
||||
offset: 0,
|
||||
dtype: self.dtype,
|
||||
}
|
||||
}
|
||||
|
||||
// --- Device transfer ---
|
||||
|
||||
pub fn to_device(&self, device: Device) -> Self {
|
||||
if self.device() == device {
|
||||
return self.clone();
|
||||
}
|
||||
// Transfer the raw storage (preserving strides/offset).
|
||||
// Non-contiguous layout is preserved — the user can call contiguous() after.
|
||||
let new_storage = self.storage.to_device(device).expect("device transfer failed");
|
||||
Self {
|
||||
storage: new_storage,
|
||||
shape: self.shape.clone(),
|
||||
strides: self.strides.clone(),
|
||||
offset: self.offset,
|
||||
dtype: self.dtype,
|
||||
}
|
||||
}
|
||||
|
||||
// --- Data access (CPU only) ---
|
||||
|
||||
/// Read tensor data as a typed slice. Requires contiguous CPU tensor.
|
||||
pub fn as_slice<T: TensorDType>(&self) -> &[T] {
|
||||
assert_eq!(T::DTYPE, self.dtype, "dtype mismatch");
|
||||
assert!(self.is_contiguous(), "as_slice requires contiguous");
|
||||
assert_eq!(self.device(), Device::Cpu, "as_slice requires CPU");
|
||||
let bytes = self.storage.as_cpu_bytes();
|
||||
let elem_size = self.dtype.size_bytes();
|
||||
let start = self.offset * elem_size;
|
||||
let len = self.numel();
|
||||
unsafe { std::slice::from_raw_parts(bytes[start..].as_ptr() as *const T, len) }
|
||||
}
|
||||
|
||||
/// Raw pointer to storage start (for GPU kernel launch).
|
||||
pub fn data_ptr(&self) -> *const u8 {
|
||||
match self.device() {
|
||||
Device::Cpu => {
|
||||
let bytes = self.storage.as_cpu_bytes();
|
||||
unsafe { bytes.as_ptr().add(self.offset * self.dtype.size_bytes()) }
|
||||
}
|
||||
Device::Cuda(_) => {
|
||||
let buf = self.storage.gpu_buffer();
|
||||
unsafe { buf.as_ptr().add(self.offset * self.dtype.size_bytes()) }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn storage(&self) -> &Storage { &self.storage }
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Tensor {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f, "Tensor(shape={:?}, dtype={}, device={}, contiguous={})",
|
||||
self.shape.as_slice(), self.dtype, self.device(), self.is_contiguous()
|
||||
)
|
||||
}
|
||||
}
|
||||
127
crates/xserv-tensor/tests/integration.rs
Normal file
127
crates/xserv-tensor/tests/integration.rs
Normal file
@@ -0,0 +1,127 @@
|
||||
use half::bf16;
|
||||
use xserv_tensor::*;
|
||||
|
||||
/// A tensor built from a slice is a contiguous F32 CPU tensor with row-major strides.
#[test]
fn test_from_slice_and_shape() {
    let t = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]);

    // Layout
    assert_eq!(t.shape(), &[2, 3]);
    assert_eq!(t.strides(), &[3, 1]);
    assert_eq!(t.numel(), 6);
    assert_eq!(t.ndim(), 2);
    assert!(t.is_contiguous());

    // Type and placement
    assert_eq!(t.dtype(), DType::F32);
    assert_eq!(t.device(), Device::Cpu);
}
|
||||
|
||||
/// `as_slice` returns the values the tensor was built from.
#[test]
fn test_as_slice() {
    let values = [1.0f32, 2.0, 3.0, 4.0];
    let t = Tensor::from_slice(&values, &[4]);
    let read_back = t.as_slice::<f32>();
    assert_eq!(read_back, &[1.0, 2.0, 3.0, 4.0]);
}
|
||||
|
||||
/// `zeros` and `ones` fill every element with 0.0 and 1.0 respectively.
#[test]
fn test_zeros_and_ones() {
    let zeros = Tensor::zeros(&[2, 3], DType::F32, Device::Cpu);
    let ones = Tensor::ones(&[3], DType::F32);

    assert_eq!(zeros.as_slice::<f32>(), &[0.0; 6]);
    assert_eq!(ones.as_slice::<f32>(), &[1.0, 1.0, 1.0]);
}
|
||||
|
||||
/// bf16 data round-trips through a tensor with the BF16 dtype tag.
#[test]
fn test_bf16_tensor() {
    let data: Vec<bf16> = [1.0f32, 2.5, -3.0].iter().map(|&v| bf16::from_f32(v)).collect();
    let t = Tensor::from_slice(&data, &[3]);
    assert_eq!(t.dtype(), DType::BF16);

    let stored = t.as_slice::<bf16>();
    assert_eq!(stored[0].to_f32(), 1.0);
    assert!((stored[1].to_f32() - 2.5).abs() < 0.01);
    assert_eq!(stored[2].to_f32(), -3.0);
}
|
||||
|
||||
/// Reshape changes the shape but leaves the element order untouched.
#[test]
fn test_reshape() {
    let base = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]);

    let tall = base.reshape(&[3, 2]);
    assert_eq!(tall.shape(), &[3, 2]);
    assert_eq!(tall.as_slice::<f32>(), &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);

    let flat = base.reshape(&[6]);
    assert_eq!(flat.shape(), &[6]);
}
|
||||
|
||||
/// Transpose swaps shape and strides and yields a non-contiguous view.
#[test]
fn test_transpose() {
    let base = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]);
    let flipped = base.transpose(0, 1);

    assert_eq!(flipped.shape(), &[3, 2]);
    assert_eq!(flipped.strides(), &[1, 3]);
    assert!(!flipped.is_contiguous());
}
|
||||
|
||||
/// `contiguous()` on a transposed view materialises the transposed order.
#[test]
fn test_contiguous_from_transpose() {
    // [2,3]: [[1,2,3],[4,5,6]]  ->  transposed [3,2]: [[1,4],[2,5],[3,6]]
    let base = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]);
    let packed = base.transpose(0, 1).contiguous();

    assert!(packed.is_contiguous());
    assert_eq!(packed.shape(), &[3, 2]);
    assert_eq!(packed.as_slice::<f32>(), &[1.0, 4.0, 2.0, 5.0, 3.0, 6.0]);
}
|
||||
|
||||
/// Squeeze removes a unit axis; unsqueeze re-inserts one at the front or back.
#[test]
fn test_squeeze_unsqueeze() {
    let row = Tensor::from_slice(&[1.0f32, 2.0, 3.0], &[1, 3]);

    let flat = row.squeeze(0);
    assert_eq!(flat.shape(), &[3]);

    let leading = flat.unsqueeze(0);
    let trailing = flat.unsqueeze(1);
    assert_eq!(leading.shape(), &[1, 3]);
    assert_eq!(trailing.shape(), &[3, 1]);
}
|
||||
|
||||
/// Data survives a CPU -> GPU -> CPU round trip (uses CUDA device 0).
#[test]
fn test_cpu_to_gpu_roundtrip() {
    xserv_cuda::device::set_device(0).unwrap();

    let host = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0], &[2, 2]);

    let on_gpu = host.to_device(Device::Cuda(0));
    assert_eq!(on_gpu.device(), Device::Cuda(0));
    assert_eq!(on_gpu.shape(), &[2, 2]);

    let returned = on_gpu.to_device(Device::Cpu);
    assert_eq!(returned.device(), Device::Cpu);
    assert_eq!(returned.as_slice::<f32>(), &[1.0, 2.0, 3.0, 4.0]);
}
|
||||
|
||||
/// A zero tensor allocated on the GPU reads back as all zeros (uses CUDA device 0).
#[test]
fn test_zeros_gpu() {
    xserv_cuda::device::set_device(0).unwrap();

    let on_gpu = Tensor::zeros(&[4, 4], DType::F32, Device::Cuda(0));
    assert_eq!(on_gpu.device(), Device::Cuda(0));
    assert_eq!(on_gpu.shape(), &[4, 4]);

    let host = on_gpu.to_device(Device::Cpu);
    assert_eq!(host.as_slice::<f32>(), &[0.0f32; 16]);
}
|
||||
|
||||
/// The Debug output mentions shape, dtype and device.
#[test]
fn test_debug_format() {
    let t = Tensor::from_slice(&[1.0f32], &[1]);
    let rendered = format!("{:?}", t);
    for fragment in ["shape=[1]", "f32", "cpu"] {
        assert!(rendered.contains(fragment));
    }
}
|
||||
9
crates/xserv-tokenizer/Cargo.toml
Normal file
9
crates/xserv-tokenizer/Cargo.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
[package]
|
||||
name = "xserv-tokenizer"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
[dependencies]
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
regex.workspace = true
|
||||
251
crates/xserv-tokenizer/src/bpe.rs
Normal file
251
crates/xserv-tokenizer/src/bpe.rs
Normal file
@@ -0,0 +1,251 @@
|
||||
use regex::Regex;
|
||||
use serde::Deserialize;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
pub struct Tokenizer {
|
||||
encoder: HashMap<Vec<u8>, u32>,
|
||||
decoder: Vec<Vec<u8>>,
|
||||
merge_ranks: HashMap<(u32, u32), usize>,
|
||||
special_tokens: HashMap<String, u32>,
|
||||
special_token_ids: HashMap<u32, String>,
|
||||
pre_tokenize_re: Regex,
|
||||
eos_token_id: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct TokenizerJson {
|
||||
model: ModelSection,
|
||||
#[serde(default)]
|
||||
added_tokens: Vec<AddedToken>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ModelSection {
|
||||
vocab: HashMap<String, u32>,
|
||||
merges: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct AddedToken {
|
||||
id: u32,
|
||||
content: String,
|
||||
special: bool,
|
||||
}
|
||||
|
||||
impl Tokenizer {
|
||||
pub fn from_file(path: &Path) -> Self {
|
||||
let data = std::fs::read_to_string(path)
|
||||
.unwrap_or_else(|e| panic!("failed to read {}: {e}", path.display()));
|
||||
let tj: TokenizerJson = serde_json::from_str(&data)
|
||||
.unwrap_or_else(|e| panic!("failed to parse tokenizer.json: {e}"));
|
||||
|
||||
// Build encoder: token bytes → ID
|
||||
let mut encoder = HashMap::new();
|
||||
for (token_str, &id) in &tj.model.vocab {
|
||||
let bytes = token_str_to_bytes(token_str);
|
||||
encoder.insert(bytes, id);
|
||||
}
|
||||
|
||||
// Build decoder: ID → token bytes
|
||||
let max_id = tj.model.vocab.values().copied().max().unwrap_or(0);
|
||||
let added_max = tj.added_tokens.iter().map(|t| t.id).max().unwrap_or(0);
|
||||
let vocab_size = (max_id.max(added_max) + 1) as usize;
|
||||
let mut decoder = vec![vec![]; vocab_size];
|
||||
for (token_str, &id) in &tj.model.vocab {
|
||||
decoder[id as usize] = token_str_to_bytes(token_str);
|
||||
}
|
||||
|
||||
// Parse merges
|
||||
let mut merge_ranks = HashMap::new();
|
||||
for (rank, merge_line) in tj.model.merges.iter().enumerate() {
|
||||
let parts: Vec<&str> = merge_line.splitn(2, ' ').collect();
|
||||
if parts.len() != 2 { continue; }
|
||||
let a_bytes = token_str_to_bytes(parts[0]);
|
||||
let b_bytes = token_str_to_bytes(parts[1]);
|
||||
if let (Some(&a_id), Some(&b_id)) = (encoder.get(&a_bytes), encoder.get(&b_bytes)) {
|
||||
merge_ranks.insert((a_id, b_id), rank);
|
||||
}
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
let mut special_tokens = HashMap::new();
|
||||
let mut special_token_ids = HashMap::new();
|
||||
let mut eos_token_id = None;
|
||||
for at in &tj.added_tokens {
|
||||
if at.special {
|
||||
special_tokens.insert(at.content.clone(), at.id);
|
||||
special_token_ids.insert(at.id, at.content.clone());
|
||||
decoder.resize(decoder.len().max(at.id as usize + 1), vec![]);
|
||||
decoder[at.id as usize] = at.content.as_bytes().to_vec();
|
||||
if at.content == "<|endoftext|>" || at.content == "<|end_of_text|>" {
|
||||
eos_token_id = Some(at.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GPT-2 pre-tokenization regex.
|
||||
// The original uses (?!\S) lookahead which Rust regex doesn't support.
|
||||
// Simplified: collapse trailing whitespace into one match. Functionally equivalent
|
||||
// for BPE since each whitespace chunk gets encoded independently anyway.
|
||||
let pre_tokenize_re = Regex::new(
|
||||
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+"
|
||||
).unwrap();
|
||||
|
||||
Self {
|
||||
encoder,
|
||||
decoder,
|
||||
merge_ranks,
|
||||
special_tokens,
|
||||
special_token_ids,
|
||||
pre_tokenize_re,
|
||||
eos_token_id,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self, text: &str) -> Vec<u32> {
|
||||
let mut tokens = Vec::new();
|
||||
|
||||
// Check for special tokens first (split around them)
|
||||
let mut remaining = text;
|
||||
while !remaining.is_empty() {
|
||||
// Find earliest special token
|
||||
let mut earliest: Option<(usize, &str, u32)> = None;
|
||||
for (st, &id) in &self.special_tokens {
|
||||
if let Some(pos) = remaining.find(st.as_str()) {
|
||||
if earliest.is_none() || pos < earliest.unwrap().0 {
|
||||
earliest = Some((pos, st, id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((pos, st, id)) = earliest {
|
||||
if pos > 0 {
|
||||
self.encode_ordinary(&remaining[..pos], &mut tokens);
|
||||
}
|
||||
tokens.push(id);
|
||||
remaining = &remaining[pos + st.len()..];
|
||||
} else {
|
||||
self.encode_ordinary(remaining, &mut tokens);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
tokens
|
||||
}
|
||||
|
||||
fn encode_ordinary(&self, text: &str, out: &mut Vec<u32>) {
|
||||
for mat in self.pre_tokenize_re.find_iter(text) {
|
||||
let word = mat.as_str();
|
||||
let word_bytes: Vec<u8> = word.bytes().collect();
|
||||
let mut token_ids: Vec<u32> = word_bytes.iter().map(|&b| {
|
||||
*self.encoder.get(&vec![b]).unwrap_or_else(|| {
|
||||
panic!("byte {b} not in vocab")
|
||||
})
|
||||
}).collect();
|
||||
|
||||
// BPE merges
|
||||
loop {
|
||||
if token_ids.len() < 2 { break; }
|
||||
let mut best_rank = usize::MAX;
|
||||
let mut best_idx = 0;
|
||||
for i in 0..token_ids.len() - 1 {
|
||||
if let Some(&rank) = self.merge_ranks.get(&(token_ids[i], token_ids[i + 1])) {
|
||||
if rank < best_rank {
|
||||
best_rank = rank;
|
||||
best_idx = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if best_rank == usize::MAX { break; }
|
||||
|
||||
let merged_bytes = [
|
||||
self.decoder[token_ids[best_idx] as usize].as_slice(),
|
||||
self.decoder[token_ids[best_idx + 1] as usize].as_slice(),
|
||||
].concat();
|
||||
let merged_id = *self.encoder.get(&merged_bytes).unwrap_or_else(|| {
|
||||
panic!("merged token not in vocab");
|
||||
});
|
||||
token_ids[best_idx] = merged_id;
|
||||
token_ids.remove(best_idx + 1);
|
||||
}
|
||||
|
||||
out.extend_from_slice(&token_ids);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode(&self, token_ids: &[u32]) -> String {
|
||||
let mut bytes = Vec::new();
|
||||
for &id in token_ids {
|
||||
if let Some(b) = self.decoder.get(id as usize) {
|
||||
bytes.extend_from_slice(b);
|
||||
}
|
||||
}
|
||||
String::from_utf8_lossy(&bytes).into_owned()
|
||||
}
|
||||
|
||||
pub fn eos_token_id(&self) -> Option<u32> {
|
||||
self.eos_token_id
|
||||
}
|
||||
|
||||
pub fn vocab_size(&self) -> usize {
|
||||
self.decoder.len()
|
||||
}
|
||||
|
||||
pub fn special_token_id(&self, name: &str) -> Option<u32> {
|
||||
self.special_tokens.get(name).copied()
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a token string from HF vocab (which uses Unicode replacements for bytes)
|
||||
/// back to raw bytes. GPT-2 uses a byte-to-unicode mapping where e.g. byte 0x20 (space)
|
||||
/// is represented as 'Ġ' (U+0120).
|
||||
fn token_str_to_bytes(s: &str) -> Vec<u8> {
|
||||
s.chars().map(|c| unicode_to_byte(c)).collect()
|
||||
}
|
||||
|
||||
/// Maps one character of the GPT-2 byte-to-unicode alphabet back to its byte.
///
/// Thin wrapper over [`byte_from_unicode_gpt2`], which holds the full inverse
/// table.
fn unicode_to_byte(c: char) -> u8 {
    byte_from_unicode_gpt2(c)
}

/// Inverse of GPT-2's `bytes_to_unicode` table.
///
/// The forward table keeps the 188 "printable" bytes as themselves
/// (`!`..=`~`, `¡`..=`¬`, `®`..=`ÿ`) and assigns the remaining 68 bytes, in
/// ascending order, to the code points U+0100..=U+0143:
///
/// * bytes 0..=32    -> U+0100..=U+0120
/// * bytes 127..=160 -> U+0121..=U+0142
/// * byte 173        -> U+0143
///
/// Characters outside the alphabet should not occur in a valid vocabulary;
/// they are truncated to their low byte.
fn byte_from_unicode_gpt2(c: char) -> u8 {
    let u = c as u32;
    match u {
        // Printable bytes map to themselves.
        0x21..=0x7E | 0xA1..=0xAC | 0xAE..=0xFF => u as u8,
        // Shifted block 1: control characters and space.
        0x100..=0x120 => (u - 0x100) as u8,
        // Shifted block 2: DEL, the C1 controls and NBSP (bytes 127..=160).
        0x121..=0x142 => (u - 0x121 + 0x7F) as u8,
        // Shifted block 3: soft hyphen (byte 173).
        0x143 => 0xAD,
        // Not part of the alphabet.
        _ => c as u8,
    }
}
|
||||
3
crates/xserv-tokenizer/src/lib.rs
Normal file
3
crates/xserv-tokenizer/src/lib.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
pub mod bpe;
|
||||
|
||||
pub use bpe::Tokenizer;
|
||||
90
csrc/activation/activations.cu
Normal file
90
csrc/activation/activations.cu
Normal file
@@ -0,0 +1,90 @@
|
||||
#include <cuda_bf16.h>
|
||||
#include <math.h>
|
||||
|
||||
// Tanh-based GELU approximation, evaluated in fp32:
//   gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
__device__ __forceinline__ float gelu_f(float x) {
    const float kSqrt2OverPi = 0.7978845608f;
    const float x_cubed = x * x * x;
    const float tanh_arg = kSqrt2OverPi * (x + 0.044715f * x_cubed);
    return 0.5f * x * (1.0f + tanhf(tanh_arg));
}
|
||||
|
||||
// SiLU / Swish in fp32: silu(x) = x * sigmoid(x) = x / (1 + exp(-x)).
__device__ __forceinline__ float silu_f(float x) {
    const float denom = 1.0f + expf(-x);
    return x / denom;
}
|
||||
|
||||
// Elementwise GELU over n fp32 values; each thread handles at most one element.
__global__ void gelu_f32(const float* x, float* out, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    out[i] = gelu_f(x[i]);
}
|
||||
|
||||
// Elementwise GELU over n bf16 values. The math runs in fp32 and the result is
// converted back to bf16.
__global__ void gelu_bf16(const __nv_bfloat16* x, __nv_bfloat16* out, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    const float v = __bfloat162float(x[i]);
    out[i] = __float2bfloat16(gelu_f(v));
}
|
||||
|
||||
// Elementwise SiLU over n fp32 values; each thread handles at most one element.
__global__ void silu_f32(const float* x, float* out, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    out[i] = silu_f(x[i]);
}
|
||||
|
||||
// Elementwise SiLU over n bf16 values. The math runs in fp32 and the result is
// converted back to bf16.
__global__ void silu_bf16(const __nv_bfloat16* x, __nv_bfloat16* out, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    const float v = __bfloat162float(x[i]);
    out[i] = __float2bfloat16(silu_f(v));
}
|
||||
|
||||
// out[i] = x[i] * scale for n fp32 values; each thread handles at most one element.
__global__ void scale_f32_kernel(const float* x, float* out, float scale, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    out[i] = x[i] * scale;
}
|
||||
|
||||
// out[i] = x[i] * scale for n bf16 values. The multiply runs in fp32 and the
// product is converted back to bf16.
__global__ void scale_bf16_kernel(const __nv_bfloat16* x, __nv_bfloat16* out, float scale, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    const float v = __bfloat162float(x[i]);
    out[i] = __float2bfloat16(v * scale);
}
|
||||
|
||||
extern "C" {

// C-ABI launchers for the elementwise kernels above.
//
// Every launcher takes untyped device pointers, the element count `n`, and a
// CUDA stream handle passed as `void*`. Launches are asynchronous and no error
// is reported from here. A non-positive `n` is a no-op: launching with a grid
// of zero blocks is an invalid configuration and would leave a CUDA error
// behind for a later, unrelated call to trip over.
//
// The grid size is computed as (n - 1) / block + 1, which equals
// ceil(n / block) for n > 0 without the overflow risk of n + block - 1.

void launch_gelu_f32(const void* x, void* out, int n, void* stream) {
    if (n <= 0) return;
    const int block = 256;
    const int grid = (n - 1) / block + 1;
    gelu_f32<<<grid, block, 0, (cudaStream_t)stream>>>((const float*)x, (float*)out, n);
}

void launch_gelu_bf16(const void* x, void* out, int n, void* stream) {
    if (n <= 0) return;
    const int block = 256;
    const int grid = (n - 1) / block + 1;
    gelu_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)x, (__nv_bfloat16*)out, n);
}

void launch_silu_f32(const void* x, void* out, int n, void* stream) {
    if (n <= 0) return;
    const int block = 256;
    const int grid = (n - 1) / block + 1;
    silu_f32<<<grid, block, 0, (cudaStream_t)stream>>>((const float*)x, (float*)out, n);
}

void launch_silu_bf16(const void* x, void* out, int n, void* stream) {
    if (n <= 0) return;
    const int block = 256;
    const int grid = (n - 1) / block + 1;
    silu_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)x, (__nv_bfloat16*)out, n);
}

void launch_scale_f32(const void* x, void* out, float scale, int n, void* stream) {
    if (n <= 0) return;
    const int block = 256;
    const int grid = (n - 1) / block + 1;
    scale_f32_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const float*)x, (float*)out, scale, n);
}

void launch_scale_bf16(const void* x, void* out, float scale, int n, void* stream) {
    if (n <= 0) return;
    const int block = 256;
    const int grid = (n - 1) / block + 1;
    scale_bf16_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)x, (__nv_bfloat16*)out, scale, n);
}

}
|
||||
53
csrc/attention/causal_mask.cu
Normal file
53
csrc/attention/causal_mask.cu
Normal file
@@ -0,0 +1,53 @@
|
||||
#include <cuda_bf16.h>
|
||||
|
||||
// Apply causal mask: set scores[row][col] = -inf where col > row + offset.
|
||||
// offset is used for KV cache: when query starts at position `offset`,
|
||||
// we allow attending to positions [0, offset + row].
|
||||
// scores: [batch, rows, cols] (flattened batch×heads)
|
||||
|
||||
// Writes -inf into every score whose column lies in the "future" of its row,
// i.e. where col > row + offset.
//
// Launch layout (see the launcher): blockIdx.z selects the batch entry,
// blockIdx.y the row, and the x dimension covers columns.
__global__ void causal_mask_f32(
    float* __restrict__ scores,
    int rows, int cols, int offset
) {
    const int batch_idx = blockIdx.z;
    const int row = blockIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (col < cols && col > row + offset) {
        // 64-bit index: batch * rows * cols exceeds INT_MAX for large score
        // tensors (e.g. 32 x 8192 x 8192).
        const size_t idx = ((size_t)batch_idx * rows + row) * cols + col;
        scores[idx] = -INFINITY;
    }
}
|
||||
|
||||
// bf16 variant of the causal mask: every score with col > row + offset is
// overwritten with a large negative value.
//
// Launch layout (see the launcher): blockIdx.z selects the batch entry,
// blockIdx.y the row, and the x dimension covers columns.
__global__ void causal_mask_bf16(
    __nv_bfloat16* __restrict__ scores,
    int rows, int cols, int offset
) {
    const int batch_idx = blockIdx.z;
    const int row = blockIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (col < cols && col > row + offset) {
        // 64-bit index: batch * rows * cols exceeds INT_MAX for large score
        // tensors (e.g. 32 x 8192 x 8192).
        const size_t idx = ((size_t)batch_idx * rows + row) * cols + col;
        // A finite, very negative fill value is used here instead of -inf.
        scores[idx] = __float2bfloat16(-1e9f);
    }
}
|
||||
|
||||
extern "C" {

// C-ABI launchers for the causal-mask kernels.
//
// `scores` is a device pointer to a [batch, rows, cols] buffer that is masked
// in place; `stream` is a CUDA stream handle passed as `void*`. One block row
// is launched per (batch, row) pair with 256 threads covering the columns.
// An empty tensor is a no-op, because a grid with a zero dimension is an
// invalid launch configuration.

void launch_causal_mask_f32(void* scores, int batch, int rows, int cols,
                            int offset, void* stream) {
    if (batch <= 0 || rows <= 0 || cols <= 0) return;
    const int block = 256;
    dim3 grid((cols - 1) / block + 1, rows, batch);
    causal_mask_f32<<<grid, block, 0, (cudaStream_t)stream>>>(
        (float*)scores, rows, cols, offset);
}

void launch_causal_mask_bf16(void* scores, int batch, int rows, int cols,
                             int offset, void* stream) {
    if (batch <= 0 || rows <= 0 || cols <= 0) return;
    const int block = 256;
    dim3 grid((cols - 1) / block + 1, rows, batch);
    causal_mask_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
        (__nv_bfloat16*)scores, rows, cols, offset);
}

}
|
||||
50
csrc/common.cuh
Normal file
50
csrc/common.cuh
Normal file
@@ -0,0 +1,50 @@
|
||||
#pragma once
|
||||
#include <cuda_bf16.h>
|
||||
|
||||
// --- Warp-level reductions (no shared memory needed) ---
|
||||
|
||||
// Sums `val` across the 32 lanes of a warp with shuffle-down steps of
// 16, 8, 4, 2, 1. Lane 0 ends up with the total; the other lanes hold partial
// sums. Uses the full-warp mask.
__device__ __forceinline__ float warp_reduce_sum(float val) {
    #pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, delta);
    }
    return val;
}
|
||||
|
||||
// Takes the maximum of `val` across the 32 lanes of a warp with shuffle-down
// steps of 16, 8, 4, 2, 1. Lane 0 ends up with the warp maximum. Uses the
// full-warp mask.
__device__ __forceinline__ float warp_reduce_max(float val) {
    #pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1) {
        const float other = __shfl_down_sync(0xffffffff, val, delta);
        val = fmaxf(val, other);
    }
    return val;
}
|
||||
|
||||
// --- Block-level reductions ---
|
||||
|
||||
// Sums `val` over all threads of a 1-D block.
//
// Each warp reduces its own values, lane 0 of every warp stores the partial
// sum in shared memory, and warp 0 then reduces those partials. The complete
// sum is only guaranteed in thread 0. Every thread of the block must call
// this function (it contains block-wide barriers).
__device__ __forceinline__ float block_reduce_sum(float val) {
    __shared__ float shared[32];
    int lane = threadIdx.x & 31;
    int warp_id = threadIdx.x >> 5;
    int num_warps = (blockDim.x + 31) >> 5;

    val = warp_reduce_sum(val);

    // `shared` is a single buffer reused by every call. Without this barrier a
    // warp entering a second, back-to-back reduction could overwrite its slot
    // while warp 0 is still reading the partials of the previous call.
    __syncthreads();
    if (lane == 0) shared[warp_id] = val;
    __syncthreads();

    // Only the first `num_warps` threads (all in warp 0) pick up a partial.
    val = (threadIdx.x < num_warps) ? shared[threadIdx.x] : 0.0f;
    if (warp_id == 0) val = warp_reduce_sum(val);
    return val;
}
|
||||
|
||||
// Takes the maximum of `val` over all threads of a 1-D block.
//
// Each warp reduces its own values, lane 0 of every warp stores the partial
// maximum in shared memory, and warp 0 then reduces those partials. The
// complete maximum is only guaranteed in thread 0. Every thread of the block
// must call this function (it contains block-wide barriers).
__device__ __forceinline__ float block_reduce_max(float val) {
    __shared__ float shared[32];
    int lane = threadIdx.x & 31;
    int warp_id = threadIdx.x >> 5;
    int num_warps = (blockDim.x + 31) >> 5;

    val = warp_reduce_max(val);

    // `shared` is a single buffer reused by every call. Without this barrier a
    // warp entering a second, back-to-back reduction could overwrite its slot
    // while warp 0 is still reading the partials of the previous call.
    __syncthreads();
    if (lane == 0) shared[warp_id] = val;
    __syncthreads();

    // Only the first `num_warps` threads (all in warp 0) pick up a partial;
    // the rest contribute the identity element of max.
    val = (threadIdx.x < num_warps) ? shared[threadIdx.x] : -INFINITY;
    if (warp_id == 0) val = warp_reduce_max(val);
    return val;
}
|
||||
55
csrc/embedding/embedding.cu
Normal file
55
csrc/embedding/embedding.cu
Normal file
@@ -0,0 +1,55 @@
|
||||
#include <cuda_bf16.h>
|
||||
|
||||
// Embedding lookup: out[seq_idx] = table[token_ids[seq_idx]]
|
||||
// Grid: num_tokens, Block: handles hidden_size elements per token.
|
||||
|
||||
// Embedding gather (fp32): copies row `token_ids[b]` of `table` into row `b`
// of `out`, where b = blockIdx.x. The threads of the block stride over the
// `hidden_size` elements of the row.
//
// NOTE(review): token IDs are not range-checked here (the kernel does not
// receive the vocabulary size); callers must pass valid IDs.
__global__ void embedding_f32(
    const float* __restrict__ table,      // [vocab_size, hidden_size]
    const int* __restrict__ token_ids,    // [num_tokens]
    float* __restrict__ out,              // [num_tokens, hidden_size]
    int hidden_size
) {
    const int token_idx = blockIdx.x;
    const int tid = token_ids[token_idx];

    // 64-bit offsets: tid * hidden_size overflows int for large tables
    // (e.g. 262144 x 8192 = 2^31).
    const float* row = table + (long long)tid * hidden_size;
    float* dst = out + (long long)token_idx * hidden_size;

    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        dst[i] = row[i];
    }
}
|
||||
|
||||
// Embedding gather (bf16): copies row `token_ids[b]` of `table` into row `b`
// of `out`, where b = blockIdx.x. The threads of the block stride over the
// `hidden_size` elements of the row.
//
// NOTE(review): token IDs are not range-checked here (the kernel does not
// receive the vocabulary size); callers must pass valid IDs.
__global__ void embedding_bf16(
    const __nv_bfloat16* __restrict__ table,
    const int* __restrict__ token_ids,
    __nv_bfloat16* __restrict__ out,
    int hidden_size
) {
    const int token_idx = blockIdx.x;
    const int tid = token_ids[token_idx];

    // 64-bit offsets: tid * hidden_size overflows int for large tables
    // (e.g. 262144 x 8192 = 2^31).
    const __nv_bfloat16* row = table + (long long)tid * hidden_size;
    __nv_bfloat16* dst = out + (long long)token_idx * hidden_size;

    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        dst[i] = row[i];
    }
}
|
||||
|
||||
extern "C" {

// C-ABI launchers for the embedding gather kernels.
//
// One block is launched per token, with min(hidden_size, 256) threads.
// `token_ids` must point to `num_tokens` 32-bit ints on the device; `stream`
// is a CUDA stream handle passed as `void*`. Empty inputs are a no-op: a
// zero-sized grid or block is an invalid launch configuration.

void launch_embedding_f32(const void* table, const void* token_ids, void* out,
                          int num_tokens, int hidden_size, void* stream) {
    if (num_tokens <= 0 || hidden_size <= 0) return;
    const int block = (hidden_size < 256) ? hidden_size : 256;
    embedding_f32<<<num_tokens, block, 0, (cudaStream_t)stream>>>(
        (const float*)table, (const int*)token_ids, (float*)out, hidden_size);
}

void launch_embedding_bf16(const void* table, const void* token_ids, void* out,
                           int num_tokens, int hidden_size, void* stream) {
    if (num_tokens <= 0 || hidden_size <= 0) return;
    const int block = (hidden_size < 256) ? hidden_size : 256;
    embedding_bf16<<<num_tokens, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)table, (const int*)token_ids,
        (__nv_bfloat16*)out, hidden_size);
}

}
|
||||
116
csrc/embedding/rope.cu
Normal file
116
csrc/embedding/rope.cu
Normal file
@@ -0,0 +1,116 @@
|
||||
#include <cuda_bf16.h>
|
||||
#include <math.h>
|
||||
|
||||
// RoPE: Rotary Position Embedding
|
||||
// For each pair (x[2i], x[2i+1]) at position `pos`:
|
||||
// y[2i] = x[2i] * cos - x[2i+1] * sin
|
||||
// y[2i+1] = x[2i] * sin + x[2i+1] * cos
|
||||
// where cos/sin come from precomputed cos_cache/sin_cache.
|
||||
//
|
||||
// cos_cache[pos][i] = cos(pos * freq[i])
|
||||
// sin_cache[pos][i] = sin(pos * freq[i])
|
||||
// freq[i] = 1.0 / (theta ^ (2i / head_dim))
|
||||
|
||||
// Apply RoPE in-place to Q or K tensor.
|
||||
// x shape: [num_tokens, num_heads, head_dim]
|
||||
// cos_cache, sin_cache shape: [max_seq_len, head_dim/2]
|
||||
// positions: [num_tokens] — the position index for each token
|
||||
|
||||
// In-place rotary embedding for fp32 data.
//
// blockIdx.x selects the token, blockIdx.y the head, and threadIdx.x the
// adjacent element pair (2p, 2p+1) inside that head. The pair is rotated by
// the angle whose cos/sin are stored at [positions[token], p] in the caches.
__global__ void rope_f32(
    float* __restrict__ x,                // [num_tokens, num_heads, head_dim]
    const float* __restrict__ cos_cache,  // [max_seq_len, half_dim]
    const float* __restrict__ sin_cache,  // [max_seq_len, half_dim]
    const int* __restrict__ positions,    // [num_tokens]
    int num_heads, int head_dim
) {
    const int half_dim = head_dim / 2;
    const int pair = threadIdx.x;
    if (pair < half_dim) {
        const int token = blockIdx.x;
        const int head = blockIdx.y;

        const int cache_idx = positions[token] * half_dim + pair;
        const float c = cos_cache[cache_idx];
        const float s = sin_cache[cache_idx];

        const int even = (token * num_heads + head) * head_dim + 2 * pair;
        const int odd = even + 1;
        const float a = x[even];
        const float b = x[odd];

        x[even] = a * c - b * s;
        x[odd] = a * s + b * c;
    }
}
|
||||
|
||||
// In-place rotary embedding for bf16 data (cos/sin caches stay fp32).
//
// blockIdx.x selects the token, blockIdx.y the head, and threadIdx.x the
// adjacent element pair (2p, 2p+1) inside that head. The rotation is computed
// in fp32 and the results are converted back to bf16.
__global__ void rope_bf16(
    __nv_bfloat16* __restrict__ x,
    const float* __restrict__ cos_cache,
    const float* __restrict__ sin_cache,
    const int* __restrict__ positions,
    int num_heads, int head_dim
) {
    const int half_dim = head_dim / 2;
    const int pair = threadIdx.x;
    if (pair < half_dim) {
        const int token = blockIdx.x;
        const int head = blockIdx.y;

        const int cache_idx = positions[token] * half_dim + pair;
        const float c = cos_cache[cache_idx];
        const float s = sin_cache[cache_idx];

        const int even = (token * num_heads + head) * head_dim + 2 * pair;
        const int odd = even + 1;
        const float a = __bfloat162float(x[even]);
        const float b = __bfloat162float(x[odd]);

        x[even] = __float2bfloat16(a * c - b * s);
        x[odd] = __float2bfloat16(a * s + b * c);
    }
}
|
||||
|
||||
// Fills the RoPE cos/sin tables on the GPU.
//
// blockIdx.x is the position and threadIdx.x the frequency index i. For each
// (pos, i) the kernel stores cos(pos * freq_i) and sin(pos * freq_i), with
// freq_i = 1 / theta^(2i / (2 * half_dim)).
__global__ void compute_rope_cache(
    float* __restrict__ cos_cache,  // [max_seq_len, half_dim]
    float* __restrict__ sin_cache,
    int max_seq_len, int half_dim, float theta
) {
    const int i = threadIdx.x;
    if (i < half_dim) {
        const int pos = blockIdx.x;
        const float exponent = (float)(2 * i) / (float)(2 * half_dim);
        const float freq = 1.0f / powf(theta, exponent);
        const float angle = (float)pos * freq;

        const int slot = pos * half_dim + i;
        cos_cache[slot] = cosf(angle);
        sin_cache[slot] = sinf(angle);
    }
}
|
||||
|
||||
extern "C" {

// C-ABI launchers for the RoPE kernels.
//
// All pointers are device pointers and `stream` is a CUDA stream handle passed
// as `void*`. The apply kernels run one block per (token, head) with
// head_dim / 2 threads; the cache kernel runs one block per position with
// half_dim threads. Calls that would produce a zero-sized grid or block are
// skipped, since that is an invalid launch configuration.

void launch_rope_f32(void* x, const void* cos_cache, const void* sin_cache,
                     const void* positions, int num_tokens, int num_heads,
                     int head_dim, void* stream) {
    const int block = head_dim / 2;
    if (num_tokens <= 0 || num_heads <= 0 || block <= 0) return;
    dim3 grid(num_tokens, num_heads);
    rope_f32<<<grid, block, 0, (cudaStream_t)stream>>>(
        (float*)x, (const float*)cos_cache, (const float*)sin_cache,
        (const int*)positions, num_heads, head_dim);
}

void launch_rope_bf16(void* x, const void* cos_cache, const void* sin_cache,
                      const void* positions, int num_tokens, int num_heads,
                      int head_dim, void* stream) {
    const int block = head_dim / 2;
    if (num_tokens <= 0 || num_heads <= 0 || block <= 0) return;
    dim3 grid(num_tokens, num_heads);
    rope_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
        (__nv_bfloat16*)x, (const float*)cos_cache, (const float*)sin_cache,
        (const int*)positions, num_heads, head_dim);
}

void launch_compute_rope_cache(void* cos_cache, void* sin_cache,
                               int max_seq_len, int half_dim, float theta,
                               void* stream) {
    if (max_seq_len <= 0 || half_dim <= 0) return;
    compute_rope_cache<<<max_seq_len, half_dim, 0, (cudaStream_t)stream>>>(
        (float*)cos_cache, (float*)sin_cache, max_seq_len, half_dim, theta);
}

}
|
||||
62
csrc/gemm/naive.cu
Normal file
62
csrc/gemm/naive.cu
Normal file
@@ -0,0 +1,62 @@
|
||||
#include <cuda_bf16.h>
|
||||
|
||||
// Naive GEMM: each thread computes one element of C.
|
||||
// C[i][j] = sum_k A[i][k] * B[k][j]
|
||||
// All matrices are row-major.
|
||||
// Naive bf16 GEMM: C[M,N] = A[M,K] * B[K,N], all row-major.
// Each thread produces one output element, accumulating in fp32 and converting
// the final sum to bf16.
__global__ void gemm_naive_bf16(
    const __nv_bfloat16* A, const __nv_bfloat16* B, __nv_bfloat16* C,
    int M, int N, int K
) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) return;

    float acc = 0.0f;
    for (int k = 0; k < K; ++k) {
        acc += __bfloat162float(A[row * K + k]) * __bfloat162float(B[k * N + col]);
    }
    C[row * N + col] = __float2bfloat16(acc);
}
|
||||
|
||||
// Naive fp32 GEMM: C[M,N] = A[M,K] * B[K,N], all row-major.
// Each thread produces one output element.
__global__ void gemm_naive_f32(
    const float* A, const float* B, float* C,
    int M, int N, int K
) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) return;

    float acc = 0.0f;
    for (int k = 0; k < K; ++k) {
        acc += A[row * K + k] * B[k * N + col];
    }
    C[row * N + col] = acc;
}
|
||||
|
||||
extern "C" {

// C-ABI launchers for the naive GEMM kernels.
//
// A, B and C are device pointers to row-major [M,K], [K,N] and [M,N] matrices;
// `stream` is a CUDA stream handle passed as `void*`. The launch uses 16x16
// thread blocks tiling the output. When the output is empty (M or N is not
// positive) nothing is launched, because a zero-sized grid is an invalid
// launch configuration.

void launch_gemm_naive_bf16(
    const void* A, const void* B, void* C,
    int M, int N, int K, void* stream
) {
    if (M <= 0 || N <= 0) return;
    dim3 block(16, 16);
    dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);
    gemm_naive_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)A, (const __nv_bfloat16*)B, (__nv_bfloat16*)C, M, N, K
    );
}

void launch_gemm_naive_f32(
    const void* A, const void* B, void* C,
    int M, int N, int K, void* stream
) {
    if (M <= 0 || N <= 0) return;
    dim3 block(16, 16);
    dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);
    gemm_naive_f32<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const float*)A, (const float*)B, (float*)C, M, N, K
    );
}

} // extern "C"
|
||||
116
csrc/gemm/tiled.cu
Normal file
116
csrc/gemm/tiled.cu
Normal file
@@ -0,0 +1,116 @@
|
||||
#include <cuda_bf16.h>
|
||||
|
||||
// Tiled GEMM using shared memory.
|
||||
// Each thread block loads TILE_SIZE x TILE_SIZE tiles of A and B
|
||||
// into shared memory, then computes a partial dot product.
|
||||
#define TILE_SIZE 32
|
||||
|
||||
// Shared-memory tiled fp32 GEMM: C[M,N] = A[M,K] * B[K,N], row-major.
//
// The block walks along K in steps of TILE_SIZE. At each step every thread
// stages one element of A and one of B into shared memory (zero when outside
// the matrix), the block synchronises, and each thread accumulates the partial
// dot product for its own output element.
__global__ void gemm_tiled_f32(
    const float* A, const float* B, float* C,
    int M, int N, int K
) {
    __shared__ float tile_a[TILE_SIZE][TILE_SIZE];
    __shared__ float tile_b[TILE_SIZE][TILE_SIZE];

    const int ty = threadIdx.y;
    const int tx = threadIdx.x;
    const int row = blockIdx.y * TILE_SIZE + ty;
    const int col = blockIdx.x * TILE_SIZE + tx;
    const int num_tiles = (K + TILE_SIZE - 1) / TILE_SIZE;

    float acc = 0.0f;
    for (int tile = 0; tile < num_tiles; ++tile) {
        const int a_col = tile * TILE_SIZE + tx;
        const int b_row = tile * TILE_SIZE + ty;

        tile_a[ty][tx] = (row < M && a_col < K) ? A[row * K + a_col] : 0.0f;
        tile_b[ty][tx] = (b_row < K && col < N) ? B[b_row * N + col] : 0.0f;

        // Both tiles must be fully staged before anyone reads them.
        __syncthreads();

        for (int k = 0; k < TILE_SIZE; ++k) {
            acc += tile_a[ty][k] * tile_b[k][tx];
        }

        // Nobody may overwrite the tiles until every thread is done with them.
        __syncthreads();
    }

    if (row < M && col < N) {
        C[row * N + col] = acc;
    }
}
|
||||
|
||||
// Shared-memory tiled bf16 GEMM: C[M,N] = A[M,K] * B[K,N], row-major.
//
// Same scheme as the fp32 kernel; inputs are widened to fp32 when staged into
// shared memory, accumulation is fp32, and the result is converted to bf16.
__global__ void gemm_tiled_bf16(
    const __nv_bfloat16* A, const __nv_bfloat16* B, __nv_bfloat16* C,
    int M, int N, int K
) {
    __shared__ float tile_a[TILE_SIZE][TILE_SIZE];
    __shared__ float tile_b[TILE_SIZE][TILE_SIZE];

    const int ty = threadIdx.y;
    const int tx = threadIdx.x;
    const int row = blockIdx.y * TILE_SIZE + ty;
    const int col = blockIdx.x * TILE_SIZE + tx;
    const int num_tiles = (K + TILE_SIZE - 1) / TILE_SIZE;

    float acc = 0.0f;
    for (int tile = 0; tile < num_tiles; ++tile) {
        const int a_col = tile * TILE_SIZE + tx;
        const int b_row = tile * TILE_SIZE + ty;

        tile_a[ty][tx] = (row < M && a_col < K) ? __bfloat162float(A[row * K + a_col]) : 0.0f;
        tile_b[ty][tx] = (b_row < K && col < N) ? __bfloat162float(B[b_row * N + col]) : 0.0f;

        // Both tiles must be fully staged before anyone reads them.
        __syncthreads();

        for (int k = 0; k < TILE_SIZE; ++k) {
            acc += tile_a[ty][k] * tile_b[k][tx];
        }

        // Nobody may overwrite the tiles until every thread is done with them.
        __syncthreads();
    }

    if (row < M && col < N) {
        C[row * N + col] = __float2bfloat16(acc);
    }
}
|
||||
|
||||
extern "C" {

// C-ABI launchers for the tiled GEMM kernels.
//
// A, B and C are device pointers to row-major [M,K], [K,N] and [M,N] matrices;
// `stream` is a CUDA stream handle passed as `void*`. Blocks are
// TILE_SIZE x TILE_SIZE threads, one block per output tile. When the output is
// empty (M or N is not positive) nothing is launched, because a zero-sized
// grid is an invalid launch configuration.

void launch_gemm_tiled_f32(
    const void* A, const void* B, void* C,
    int M, int N, int K, void* stream
) {
    if (M <= 0 || N <= 0) return;
    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
    gemm_tiled_f32<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const float*)A, (const float*)B, (float*)C, M, N, K
    );
}

void launch_gemm_tiled_bf16(
    const void* A, const void* B, void* C,
    int M, int N, int K, void* stream
) {
    if (M <= 0 || N <= 0) return;
    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
    gemm_tiled_bf16<<<grid, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)A, (const __nv_bfloat16*)B, (__nv_bfloat16*)C, M, N, K
    );
}

} // extern "C"
|
||||
102
csrc/normalization/layernorm.cu
Normal file
102
csrc/normalization/layernorm.cu
Normal file
@@ -0,0 +1,102 @@
|
||||
#include "../common.cuh"
|
||||
|
||||
// LayerNorm: y[i] = gamma[i] * (x[i] - mean) / sqrt(var + eps) + beta[i]
|
||||
// Each block processes one row of shape [hidden_size].
|
||||
|
||||
// LayerNorm (fp32): out[i] = gamma[i] * (x[i] - mean) * rsqrt(var + eps) + beta[i].
// One block normalises one row of `hidden_size` elements (row = blockIdx.x).
__global__ void layernorm_f32(
    const float* __restrict__ x,
    const float* __restrict__ gamma,
    const float* __restrict__ beta,
    float* __restrict__ out,
    int hidden_size, float eps
) {
    const int row = blockIdx.x;
    const float* x_row = x + (size_t)row * hidden_size;
    float* out_row = out + (size_t)row * hidden_size;

    // Single pass over the row accumulating sum(x) and sum(x^2);
    // the variance is then E[x^2] - mean^2.
    float local_sum = 0.0f;
    float local_sum_sq = 0.0f;
    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        float v = x_row[i];
        local_sum += v;
        local_sum_sq += v * v;
    }
    local_sum = block_reduce_sum(local_sum);
    // Both reductions go through the same shared scratch buffer inside
    // block_reduce_sum; make sure the first one has been fully consumed
    // before the second one starts writing.
    __syncthreads();
    local_sum_sq = block_reduce_sum(local_sum_sq);

    // The reduced totals are valid in thread 0, which publishes the statistics.
    __shared__ float s_mean, s_inv_std;
    if (threadIdx.x == 0) {
        float mean = local_sum / hidden_size;
        // E[x^2] - mean^2 can come out slightly negative through rounding
        // when the row is (nearly) constant; clamp so rsqrtf never sees a
        // negative argument.
        float var = fmaxf(local_sum_sq / hidden_size - mean * mean, 0.0f);
        s_mean = mean;
        s_inv_std = rsqrtf(var + eps);
    }
    __syncthreads();

    float mean = s_mean;
    float inv_std = s_inv_std;
    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        out_row[i] = gamma[i] * (x_row[i] - mean) * inv_std + beta[i];
    }
}
|
||||
|
||||
// LayerNorm (bf16 I/O, fp32 math):
//   out[i] = gamma[i] * (x[i] - mean) * rsqrt(var + eps) + beta[i].
// One block normalises one row of `hidden_size` elements (row = blockIdx.x).
__global__ void layernorm_bf16(
    const __nv_bfloat16* __restrict__ x,
    const __nv_bfloat16* __restrict__ gamma,
    const __nv_bfloat16* __restrict__ beta,
    __nv_bfloat16* __restrict__ out,
    int hidden_size, float eps
) {
    const int row = blockIdx.x;
    const __nv_bfloat16* x_row = x + (size_t)row * hidden_size;
    __nv_bfloat16* out_row = out + (size_t)row * hidden_size;

    // Single pass over the row accumulating sum(x) and sum(x^2) in fp32;
    // the variance is then E[x^2] - mean^2.
    float local_sum = 0.0f;
    float local_sum_sq = 0.0f;
    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        float v = __bfloat162float(x_row[i]);
        local_sum += v;
        local_sum_sq += v * v;
    }
    local_sum = block_reduce_sum(local_sum);
    // Both reductions go through the same shared scratch buffer inside
    // block_reduce_sum; make sure the first one has been fully consumed
    // before the second one starts writing.
    __syncthreads();
    local_sum_sq = block_reduce_sum(local_sum_sq);

    // The reduced totals are valid in thread 0, which publishes the statistics.
    __shared__ float s_mean, s_inv_std;
    if (threadIdx.x == 0) {
        float mean = local_sum / hidden_size;
        // E[x^2] - mean^2 can come out slightly negative through rounding
        // when the row is (nearly) constant; clamp so rsqrtf never sees a
        // negative argument.
        float var = fmaxf(local_sum_sq / hidden_size - mean * mean, 0.0f);
        s_mean = mean;
        s_inv_std = rsqrtf(var + eps);
    }
    __syncthreads();

    float mean = s_mean;
    float inv_std = s_inv_std;
    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        float v = __bfloat162float(x_row[i]);
        float g = __bfloat162float(gamma[i]);
        float b = __bfloat162float(beta[i]);
        out_row[i] = __float2bfloat16(g * (v - mean) * inv_std + b);
    }
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
// Host-side launcher for layernorm_f32 (C ABI entry point).
//   x, out      : pointers handed to the kernel, rows * hidden_size floats
//   gamma, beta : pointers handed to the kernel, hidden_size floats each
//   rows        : number of rows; one thread block is launched per row
//   eps         : forwarded to the kernel
//   stream      : cudaStream_t passed as an opaque pointer
// The launch is asynchronous and no error status is returned from here.
void launch_layernorm_f32(const void* x, const void* gamma, const void* beta,
                          void* out, int rows, int hidden_size, float eps, void* stream) {
    // Empty input: a zero-sized grid or block is an invalid launch configuration.
    if (rows <= 0 || hidden_size <= 0) return;

    // Up to one thread per element, capped at 1024 threads per block and
    // rounded up to a whole number of warps. The warp-shuffle based block
    // reduction needs complete warps: with a partial warp the full-mask
    // shuffle is undefined and the partial warp's sum is dropped. Surplus
    // threads skip the element loops and contribute 0.
    int block = (hidden_size < 1024) ? hidden_size : 1024;
    block = (block + 31) / 32 * 32;

    layernorm_f32<<<rows, block, 0, (cudaStream_t)stream>>>(
        (const float*)x, (const float*)gamma, (const float*)beta,
        (float*)out, hidden_size, eps);
}
|
||||
|
||||
// Host-side launcher for layernorm_bf16 (C ABI entry point).
//   x, out      : pointers handed to the kernel, rows * hidden_size bf16 values
//   gamma, beta : pointers handed to the kernel, hidden_size bf16 values each
//   rows        : number of rows; one thread block is launched per row
//   eps         : forwarded to the kernel
//   stream      : cudaStream_t passed as an opaque pointer
// The launch is asynchronous and no error status is returned from here.
void launch_layernorm_bf16(const void* x, const void* gamma, const void* beta,
                           void* out, int rows, int hidden_size, float eps, void* stream) {
    // Empty input: a zero-sized grid or block is an invalid launch configuration.
    if (rows <= 0 || hidden_size <= 0) return;

    // Up to one thread per element, capped at 1024 threads per block and
    // rounded up to a whole number of warps. The warp-shuffle based block
    // reduction needs complete warps: with a partial warp the full-mask
    // shuffle is undefined and the partial warp's sum is dropped. Surplus
    // threads skip the element loops and contribute 0.
    int block = (hidden_size < 1024) ? hidden_size : 1024;
    block = (block + 31) / 32 * 32;

    layernorm_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)x, (const __nv_bfloat16*)gamma, (const __nv_bfloat16*)beta,
        (__nv_bfloat16*)out, hidden_size, eps);
}
|
||||
|
||||
}
|
||||
83
csrc/normalization/rmsnorm.cu
Normal file
83
csrc/normalization/rmsnorm.cu
Normal file
@@ -0,0 +1,83 @@
|
||||
#include "../common.cuh"
|
||||
|
||||
// RMSNorm: y[i] = x[i] * rsqrt(mean(x²) + eps) * gamma[i]
|
||||
// Each block processes one row of shape [hidden_size].
|
||||
|
||||
// RMSNorm over the last dimension for fp32 tensors; one thread block per row.
//   out[i] = x[i] * rsqrt(sum(x^2) / hidden_size + eps) * gamma[i]
//
//   x, out      : row-major, hidden_size elements per row; the row is blockIdx.x
//   gamma       : hidden_size elements, shared by every row
//   hidden_size : length of the normalised dimension
//   eps         : added to the mean square before rsqrtf
//
// block_reduce_sum comes from common.cuh.
__global__ void rmsnorm_f32(
    const float* __restrict__ x,
    const float* __restrict__ gamma,
    float* __restrict__ out,
    int hidden_size, float eps
) {
    // 64-bit offset: row * hidden_size overflows int once the tensor holds
    // more than INT_MAX elements.
    const size_t row_offset = (size_t)blockIdx.x * (size_t)hidden_size;
    const float* x_row = x + row_offset;
    float* out_row = out + row_offset;

    // Each thread accumulates x^2 over a block-strided slice of the row.
    float sum_sq = 0.0f;
    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        float v = x_row[i];
        sum_sq += v * v;
    }
    sum_sq = block_reduce_sum(sum_sq);

    // Thread 0 computes 1/rms and publishes it to the block via shared memory.
    __shared__ float s_rms_inv;
    if (threadIdx.x == 0) {
        s_rms_inv = rsqrtf(sum_sq / hidden_size + eps);
    }
    __syncthreads();

    float rms_inv = s_rms_inv;
    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        out_row[i] = x_row[i] * rms_inv * gamma[i];
    }
}
|
||||
|
||||
// RMSNorm over the last dimension for bf16 tensors; one thread block per row.
//   out[i] = x[i] * rsqrt(sum(x^2) / hidden_size + eps) * gamma[i]
// The sum of squares and the scaling are done in fp32; only the final value
// is rounded to bf16.
//
//   x, out      : row-major, hidden_size elements per row; the row is blockIdx.x
//   gamma       : hidden_size elements, shared by every row
//   hidden_size : length of the normalised dimension
//   eps         : added to the mean square before rsqrtf
//
// block_reduce_sum comes from common.cuh.
__global__ void rmsnorm_bf16(
    const __nv_bfloat16* __restrict__ x,
    const __nv_bfloat16* __restrict__ gamma,
    __nv_bfloat16* __restrict__ out,
    int hidden_size, float eps
) {
    // 64-bit offset: row * hidden_size overflows int once the tensor holds
    // more than INT_MAX elements.
    const size_t row_offset = (size_t)blockIdx.x * (size_t)hidden_size;
    const __nv_bfloat16* x_row = x + row_offset;
    __nv_bfloat16* out_row = out + row_offset;

    // Each thread accumulates x^2 (in fp32) over a block-strided slice.
    float sum_sq = 0.0f;
    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        float v = __bfloat162float(x_row[i]);
        sum_sq += v * v;
    }
    sum_sq = block_reduce_sum(sum_sq);

    // Thread 0 computes 1/rms and publishes it to the block via shared memory.
    __shared__ float s_rms_inv;
    if (threadIdx.x == 0) {
        s_rms_inv = rsqrtf(sum_sq / hidden_size + eps);
    }
    __syncthreads();

    float rms_inv = s_rms_inv;
    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
        float v = __bfloat162float(x_row[i]);
        float g = __bfloat162float(gamma[i]);
        out_row[i] = __float2bfloat16(v * rms_inv * g);
    }
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
// Host-side launcher for rmsnorm_f32 (C ABI entry point).
//   x, out : pointers handed to the kernel, rows * hidden_size floats
//   gamma  : pointer handed to the kernel, hidden_size floats
//   rows   : number of rows; one thread block is launched per row
//   eps    : forwarded to the kernel
//   stream : cudaStream_t passed as an opaque pointer
// The launch is asynchronous and no error status is returned from here.
void launch_rmsnorm_f32(const void* x, const void* gamma, void* out,
                        int rows, int hidden_size, float eps, void* stream) {
    // Empty input: a zero-sized grid or block is an invalid launch configuration.
    if (rows <= 0 || hidden_size <= 0) return;

    // Up to one thread per element, capped at 1024 threads per block and
    // rounded up to a whole number of warps. The warp-shuffle based block
    // reduction needs complete warps: with a partial warp the full-mask
    // shuffle is undefined and the partial warp's sum is dropped. Surplus
    // threads skip the element loops and contribute 0.
    int block = (hidden_size < 1024) ? hidden_size : 1024;
    block = (block + 31) / 32 * 32;

    rmsnorm_f32<<<rows, block, 0, (cudaStream_t)stream>>>(
        (const float*)x, (const float*)gamma, (float*)out, hidden_size, eps);
}
|
||||
|
||||
// Host-side launcher for rmsnorm_bf16 (C ABI entry point).
//   x, out : pointers handed to the kernel, rows * hidden_size bf16 values
//   gamma  : pointer handed to the kernel, hidden_size bf16 values
//   rows   : number of rows; one thread block is launched per row
//   eps    : forwarded to the kernel
//   stream : cudaStream_t passed as an opaque pointer
// The launch is asynchronous and no error status is returned from here.
void launch_rmsnorm_bf16(const void* x, const void* gamma, void* out,
                         int rows, int hidden_size, float eps, void* stream) {
    // Empty input: a zero-sized grid or block is an invalid launch configuration.
    if (rows <= 0 || hidden_size <= 0) return;

    // Up to one thread per element, capped at 1024 threads per block and
    // rounded up to a whole number of warps. The warp-shuffle based block
    // reduction needs complete warps: with a partial warp the full-mask
    // shuffle is undefined and the partial warp's sum is dropped. Surplus
    // threads skip the element loops and contribute 0.
    int block = (hidden_size < 1024) ? hidden_size : 1024;
    block = (block + 31) / 32 * 32;

    rmsnorm_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)x, (const __nv_bfloat16*)gamma,
        (__nv_bfloat16*)out, hidden_size, eps);
}
|
||||
|
||||
}
|
||||
106
csrc/reduce/softmax.cu
Normal file
106
csrc/reduce/softmax.cu
Normal file
@@ -0,0 +1,106 @@
|
||||
#include "../common.cuh"
|
||||
|
||||
// Safe softmax along the last dimension.
|
||||
// Each block handles one row of length `cols`.
|
||||
// Three-pass: 1) find max, 2) exp + sum, 3) normalize.
|
||||
|
||||
// Numerically safe softmax along the last dimension for fp32 tensors; one
// thread block per row of length `cols`.
//   out[i] = exp(x[i] - max(x)) / sum_j exp(x[j] - max(x))
//
//   x, out : row-major, cols elements per row; the row is blockIdx.x
//   cols   : length of the softmax dimension
//
// block_reduce_max / block_reduce_sum come from common.cuh.
// NOTE(review): a row consisting only of -inf yields NaN (-inf - -inf);
// confirm callers never produce fully masked rows.
__global__ void softmax_f32(
    const float* __restrict__ x,
    float* __restrict__ out,
    int cols
) {
    // 64-bit offset: row * cols overflows int once the tensor holds more than
    // INT_MAX elements.
    const size_t row_offset = (size_t)blockIdx.x * (size_t)cols;
    const float* x_row = x + row_offset;
    float* out_row = out + row_offset;

    // Pass 1: row maximum, so the largest exponent below is exp(0) = 1.
    float local_max = -INFINITY;
    for (int i = threadIdx.x; i < cols; i += blockDim.x) {
        local_max = fmaxf(local_max, x_row[i]);
    }
    float row_max = block_reduce_max(local_max);

    // Thread 0 publishes the reduced maximum to the whole block.
    __shared__ float s_max;
    if (threadIdx.x == 0) s_max = row_max;
    __syncthreads();
    row_max = s_max;

    // Pass 2: write exp(x - max) to the output row and accumulate the sum.
    float local_sum = 0.0f;
    for (int i = threadIdx.x; i < cols; i += blockDim.x) {
        float e = expf(x_row[i] - row_max);
        out_row[i] = e;
        local_sum += e;
    }
    float row_sum = block_reduce_sum(local_sum);

    // Thread 0 publishes 1 / sum to the whole block.
    __shared__ float s_inv_sum;
    if (threadIdx.x == 0) s_inv_sum = 1.0f / row_sum;
    __syncthreads();
    float inv_sum = s_inv_sum;

    // Pass 3: normalise. Each thread rescales only the elements it wrote in
    // pass 2 (same index pattern).
    for (int i = threadIdx.x; i < cols; i += blockDim.x) {
        out_row[i] *= inv_sum;
    }
}
|
||||
|
||||
// Numerically safe softmax along the last dimension for bf16 tensors; one
// thread block per row of length `cols`.
//   out[i] = exp(x[i] - max(x)) / sum_j exp(x[j] - max(x))
// Max, exp and sum are all computed in fp32. The exponentials are recomputed
// in the final pass instead of being parked in `out` as bf16, so every output
// is rounded to bf16 exactly once and is consistent with the fp32 sum.
//
//   x, out : row-major, cols elements per row; the row is blockIdx.x
//   cols   : length of the softmax dimension
//
// block_reduce_max / block_reduce_sum come from common.cuh.
// NOTE(review): a row consisting only of -inf yields NaN (-inf - -inf);
// confirm callers never produce fully masked rows.
__global__ void softmax_bf16(
    const __nv_bfloat16* __restrict__ x,
    __nv_bfloat16* __restrict__ out,
    int cols
) {
    // 64-bit offset: row * cols overflows int once the tensor holds more than
    // INT_MAX elements.
    const size_t row_offset = (size_t)blockIdx.x * (size_t)cols;
    const __nv_bfloat16* x_row = x + row_offset;
    __nv_bfloat16* out_row = out + row_offset;

    // Pass 1: row maximum, so the largest exponent below is exp(0) = 1.
    float local_max = -INFINITY;
    for (int i = threadIdx.x; i < cols; i += blockDim.x) {
        local_max = fmaxf(local_max, __bfloat162float(x_row[i]));
    }
    float row_max = block_reduce_max(local_max);

    // Thread 0 publishes the reduced maximum to the whole block.
    __shared__ float s_max;
    if (threadIdx.x == 0) s_max = row_max;
    __syncthreads();
    row_max = s_max;

    // Pass 2: accumulate sum of exp(x - max) in fp32. Nothing is written to
    // `out` here; storing the intermediate as bf16 would round it before
    // normalisation.
    float local_sum = 0.0f;
    for (int i = threadIdx.x; i < cols; i += blockDim.x) {
        local_sum += expf(__bfloat162float(x_row[i]) - row_max);
    }
    float row_sum = block_reduce_sum(local_sum);

    // Thread 0 publishes 1 / sum to the whole block.
    __shared__ float s_inv_sum;
    if (threadIdx.x == 0) s_inv_sum = 1.0f / row_sum;
    __syncthreads();
    float inv_sum = s_inv_sum;

    // Pass 3: recompute exp in fp32, normalise, and round to bf16 once.
    for (int i = threadIdx.x; i < cols; i += blockDim.x) {
        float e = expf(__bfloat162float(x_row[i]) - row_max);
        out_row[i] = __float2bfloat16(e * inv_sum);
    }
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
// Host-side launcher for softmax_f32 (C ABI entry point).
//   x, out : pointers handed to the kernel, rows * cols floats
//   rows   : number of rows; one thread block is launched per row
//   cols   : length of the softmax dimension
//   stream : cudaStream_t passed as an opaque pointer
// The launch is asynchronous and no error status is returned from here.
void launch_softmax_f32(const void* x, void* out, int rows, int cols, void* stream) {
    // Empty input: nothing to write, and a zero-sized grid is an invalid
    // launch configuration.
    if (rows <= 0 || cols <= 0) return;

    // Up to one thread per column, capped at 1024 threads per block and
    // rounded up to a whole number of warps (which also gives the 32-thread
    // minimum). The warp-shuffle based block reductions need complete warps;
    // surplus threads skip the element loops and contribute the identity.
    int block = (cols < 1024) ? cols : 1024;
    block = (block + 31) / 32 * 32;

    softmax_f32<<<rows, block, 0, (cudaStream_t)stream>>>(
        (const float*)x, (float*)out, cols);
}
|
||||
|
||||
// Host-side launcher for softmax_bf16 (C ABI entry point).
//   x, out : pointers handed to the kernel, rows * cols bf16 values
//   rows   : number of rows; one thread block is launched per row
//   cols   : length of the softmax dimension
//   stream : cudaStream_t passed as an opaque pointer
// The launch is asynchronous and no error status is returned from here.
void launch_softmax_bf16(const void* x, void* out, int rows, int cols, void* stream) {
    // Empty input: nothing to write, and a zero-sized grid is an invalid
    // launch configuration.
    if (rows <= 0 || cols <= 0) return;

    // Up to one thread per column, capped at 1024 threads per block and
    // rounded up to a whole number of warps (which also gives the 32-thread
    // minimum). The warp-shuffle based block reductions need complete warps;
    // surplus threads skip the element loops and contribute the identity.
    int block = (cols < 1024) ? cols : 1024;
    block = (block + 31) / 32 * 32;

    softmax_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)x, (__nv_bfloat16*)out, cols);
}
|
||||
|
||||
}
|
||||
@@ -72,9 +72,31 @@ Wraps cudaStream_t. RAII with Drop calling cudaStreamDestroy.
|
||||
- `build.rs` uses `cc` crate to compile .cu files, link CUDA runtime
|
||||
|
||||
## Test Plan
|
||||
1. Device info: print GPU name, memory, compute capability, SM count
|
||||
2. GpuBuffer: alloc 1GB, H2D copy, D2H copy, verify data
|
||||
3. Vector add kernel: launch from Rust, verify output
|
||||
4. CachingAllocator: alloc→free→realloc same size uses cache (no new cudaMalloc)
|
||||
5. Multi-stream: two concurrent memcpy on different streams
|
||||
6. Benchmark: caching allocator vs raw cudaMalloc (100 cycles)
|
||||
|
||||
- [x] Device info: print GPU name, memory, compute capability, SM count
|
||||
- [x] GpuBuffer: alloc → H2D copy → D2H copy → verify data (256B, 64MB)
|
||||
- [x] GpuBuffer: D2D copy 验证
|
||||
- [x] GpuBuffer: zero fill 验证
|
||||
- [x] Vector add kernel: launch from Rust, verify output
|
||||
- [x] CachingAllocator: alloc→free→realloc same size uses cache (no new cudaMalloc)
|
||||
- [x] CachingAllocator: 不同 size bucket 独立缓存
|
||||
- [x] CudaStream: 创建、同步、Drop
|
||||
- [x] PinnedBuffer: page-locked host memory
|
||||
- [x] Async copy: H2D async + D2H async via stream
|
||||
|
||||
## Takeaways
|
||||
|
||||
1. **`cudaDeviceProp` struct 布局不可靠**:CUDA 版本之间 `cudaDeviceProp` 的字段偏移会变化。我们最初用 struct 映射读取 `total_global_mem`,得到了垃圾值(12TB)。正确做法:用 `cudaMemGetInfo` 获取显存信息,用 `cudaDeviceGetAttribute` 获取其他属性。只从 `cudaDeviceProp` 读取 `name` 字段(始终在 struct 最前面,布局稳定)。
|
||||
|
||||
2. **Rust 2024 edition 的 unsafe 语义变更**:
|
||||
- `extern "C"` 块必须加 `unsafe` 前缀 → `unsafe extern "C"`
|
||||
- `unsafe fn` 内部的 unsafe 调用也需要显式 `unsafe {}` 块
|
||||
- 这让代码更安全,但初次移植需要注意
|
||||
|
||||
3. **`cc` crate 的 CUDA 支持是内置的**:不需要 `features = ["cuda"]`(这个 feature 不存在)。只需 `.cuda(true).cudart("shared")`。
|
||||
|
||||
4. **Caching Allocator 的 bucket 策略**:round up to next power of 2(最小 512B)。这意味着申请 513B 会分配 1024B,存在内部碎片。但简单且高效——避免了 free list 中的精确匹配问题。PyTorch 的 CUDACachingAllocator 用了更复杂的策略(best-fit with splitting),但对于推理场景,power-of-2 bucket 已经够用。
|
||||
|
||||
5. **`into_raw` + `from_raw` 模式**:GpuBuffer 的 RAII Drop 和 CachingAllocator 的缓存需求冲突——allocator 需要持有裸指针而不触发 Drop。`into_raw()` 消费 self(`mem::forget`),返回裸指针;`from_raw()` 重新封装。这是 Rust 中管理 RAII 生命周期的标准模式。
|
||||
|
||||
6. **dash5 环境**:CUDA 12.9 已安装但 `nvcc` 不在 PATH(需要 `/usr/local/cuda/bin`)。Rust 需要手动安装 rustup。无 rsync,用 `tar | ssh tar` 同步代码。开发工作流:本地写码 → tar sync → 远程 build+test。
|
||||
|
||||
97
docs/02-tensor.md
Normal file
97
docs/02-tensor.md
Normal file
@@ -0,0 +1,97 @@
|
||||
# Phase 2: Tensor Abstraction Layer — Design Document
|
||||
|
||||
## Goal
|
||||
|
||||
实现核心 Tensor 类型,支持 CPU/GPU 存储、多种数据类型、strided view 操作,作为后续所有算子和模型的数据基础。
|
||||
|
||||
## Module Layout
|
||||
|
||||
```
|
||||
crates/xserv-tensor/
|
||||
├── Cargo.toml
|
||||
└── src/
|
||||
├── lib.rs # re-exports
|
||||
├── dtype.rs # DType enum, TensorDType trait
|
||||
├── shape.rs # strides 计算, broadcast 规则
|
||||
├── storage.rs # Storage (Arc引用计数), Device enum
|
||||
└── tensor.rs # Tensor 主体: 创建, 形状操作, 设备迁移
|
||||
```
|
||||
|
||||
## Key Design Decisions
|
||||
|
||||
### DType + TensorDType Trait
|
||||
|
||||
```rust
|
||||
pub enum DType { F32, F16, BF16 }
|
||||
|
||||
pub trait TensorDType: Copy + Send + Sync + 'static {
|
||||
const DTYPE: DType;
|
||||
fn to_f64(self) -> f64;
|
||||
fn from_f64(v: f64) -> Self;
|
||||
}
|
||||
```
|
||||
|
||||
- 用 `half` crate 的 `bf16`/`f16` 表示半精度类型
|
||||
- `TensorDType` trait 让 `from_slice<T>` 和 `as_slice<T>` 有类型安全
|
||||
- GPU kernel 中通过 `DType` dispatch 到对应的 CUDA 类型 (`__nv_bfloat16` / `float`)
|
||||
|
||||
### Storage 引用计数
|
||||
|
||||
```rust
|
||||
pub struct Storage(Arc<StorageInner>);
|
||||
enum StorageInner {
|
||||
Cpu { data: Vec<u8> },
|
||||
Cuda { buffer: GpuBuffer },
|
||||
}
|
||||
```
|
||||
|
||||
- `Arc` 引用计数让 transpose/slice/reshape 能共享底层数据(view 语义)
|
||||
- 不实现 CoW(copy-on-write),view 只能读不能写
|
||||
- `to_device()` 总是创建新的 Storage
|
||||
|
||||
### Strided Tensor
|
||||
|
||||
```rust
|
||||
pub struct Tensor {
|
||||
storage: Storage,
|
||||
shape: SmallVec<[usize; 4]>,
|
||||
strides: SmallVec<[usize; 4]>,
|
||||
offset: usize,
|
||||
dtype: DType,
|
||||
}
|
||||
```
|
||||
|
||||
- `SmallVec<[usize; 4]>` 避免大多数 tensor (≤4D) 的堆分配
|
||||
- `strides` 以元素为单位(不是字节)
|
||||
- `offset` 支持 slice 操作(view 到 storage 的中间位置)
|
||||
- `is_contiguous()` 检查 strides 是否与 shape 匹配
|
||||
- 非 contiguous 的 tensor 调 `contiguous()` 才能送入 CUDA kernel
|
||||
|
||||
### Broadcast 规则
|
||||
|
||||
实现了 NumPy-style broadcasting:
|
||||
- 维度从尾部对齐
|
||||
- 大小为 1 的维度可以广播到任意大小
|
||||
- `broadcast_strides()` 将 size=1 维度的 stride 置为 0(虚拟广播,不复制数据)
|
||||
|
||||
## Test Plan
|
||||
|
||||
- [x] from_slice → shape/strides 正确
|
||||
- [x] reshape, transpose, squeeze, unsqueeze
|
||||
- [x] transpose 后 contiguous() 重排数据
|
||||
- [x] BF16 tensor 的精度验证
|
||||
- [x] CPU↔GPU roundtrip
|
||||
- [x] zeros on GPU → 拷回 CPU 验证全 0
|
||||
- [x] broadcast_shape 单元测试
|
||||
|
||||
## Takeaways
|
||||
|
||||
1. **`SmallVec` 是正确选择**:绝大多数 tensor ≤ 4D,避免了频繁堆分配。LLM 推理中常见的维度是 `[B, S, H]` (3D) 和 `[B, H, S, D]` (4D)。
|
||||
|
||||
2. **View 语义的取舍**:Arc 共享 storage 实现了零拷贝 transpose/reshape,但代价是无法原地修改 view 后的 tensor。对于推理引擎这是可以接受的——推理路径上大部分操作是只读的。
|
||||
|
||||
3. **contiguous() 的隐性开销**:非 contiguous tensor 在送入 kernel 前需要 `contiguous()` 拷贝。这意味着 `transpose → matmul` 会产生一次额外拷贝。后续优化方向:在 kernel 中直接支持 strided input。
|
||||
|
||||
4. **Rust 2024 edition 变化**:`unsafe fn` 内部的 unsafe 调用也需要显式 `unsafe {}` 块,`extern "C"` 块必须加 `unsafe` 前缀。这个 edition 对安全性更严格。
|
||||
|
||||
5. **CPU 实现先行**:先在 CPU 上验证逻辑正确性(如 contiguous 重排),再扩展到 GPU。这个策略在后续 phase 中应该继续沿用。
|
||||
102
docs/03-gemm.md
Normal file
102
docs/03-gemm.md
Normal file
@@ -0,0 +1,102 @@
|
||||
# Phase 3: GEMM — Design Document
|
||||
|
||||
## Goal
|
||||
|
||||
实现矩阵乘法的多个版本(naive → tiled → cuBLAS),建立 benchmark 对比框架,深入理解 GPU 编程中的内存访问模式和优化手段。
|
||||
|
||||
## Module Layout
|
||||
|
||||
```
|
||||
csrc/gemm/
|
||||
├── naive.cu # 每个 thread 算一个输出元素
|
||||
└── tiled.cu # shared memory tiling, 32x32 tiles
|
||||
|
||||
crates/xserv-kernels/
|
||||
├── build.rs # 编译 .cu + 链接 cublas
|
||||
└── src/
|
||||
├── lib.rs
|
||||
└── gemm.rs # FFI 封装, GemmBackend enum, matmul(), CublasContext
|
||||
```
|
||||
|
||||
## Kernel Implementations
|
||||
|
||||
### Version 1: Naive GEMM
|
||||
|
||||
```
|
||||
Grid: (ceil(N/16), ceil(M/16))
|
||||
Block: (16, 16)
|
||||
每个 thread: C[row][col] = sum_k(A[row][k] * B[k][col])
|
||||
```
|
||||
|
||||
- 每个 thread 独立遍历 K 维度做点积
|
||||
- 所有读取走 global memory,无局部性优化
|
||||
- BF16 版本在 FP32 中累加(`__bfloat162float` → 累加 → `__float2bfloat16`)
|
||||
|
||||
### Version 2: Tiled GEMM (Shared Memory)
|
||||
|
||||
```
|
||||
TILE_SIZE = 32
|
||||
Grid: (ceil(N/32), ceil(M/32))
|
||||
Block: (32, 32) = 1024 threads
|
||||
|
||||
每个 tile iteration:
|
||||
1. 协作加载 A[tile] 和 B[tile] 到 shared memory
|
||||
2. __syncthreads()
|
||||
3. 在 shared memory 中做 32 次乘加
|
||||
4. __syncthreads()
|
||||
```
|
||||
|
||||
- 每个 global memory 读取被 TILE_SIZE 个 thread 复用
|
||||
- 理论上减少 global memory 访问 TILE_SIZE 倍
|
||||
- BF16 版本同样在 shared memory 中存 float(FP32 累加)
|
||||
|
||||
### Version 3: cuBLAS
|
||||
|
||||
- `cublasGemmEx` 支持混合精度
|
||||
- **Row-major 适配**:cuBLAS 使用 column-major 布局,我们的 tensor 是 row-major
|
||||
- 利用恒等式:`C = A @ B` (row-major) ⟺ `C^T = B^T @ A^T` (col-major)
|
||||
- 传入 `CUBLAS_OP_N`,让 cuBLAS 把我们的 row-major 数据当作 col-major 的转置
|
||||
- 参数:`m=N, n=M, k=K, lda=N (B), ldb=K (A), ldc=N (C)`
|
||||
|
||||
### Backend Registry
|
||||
|
||||
```rust
|
||||
pub enum GemmBackend { Naive, Tiled, CuBlas }
|
||||
pub fn matmul(a: &Tensor, b: &Tensor, backend: GemmBackend) -> Tensor;
|
||||
```
|
||||
|
||||
运行时可切换 backend,方便 benchmark 对比和逐步替换。
|
||||
|
||||
## CublasContext
|
||||
|
||||
RAII 封装 `cublasHandle_t`,Drop 时调 `cublasDestroy_v2`。
|
||||
目前每次 matmul 创建一个新 handle,后续优化为全局复用。
|
||||
|
||||
## Test Plan
|
||||
|
||||
- [x] F32: naive/tiled/cuBLAS × small(4)/medium(64-256)/rect(65x33x97)
|
||||
- [x] BF16: naive/tiled/cuBLAS × small/medium
|
||||
- [x] 三种 backend 在相同输入上输出一致(cross-backend consistency)
|
||||
- [x] 非方阵测试(M≠N≠K)
|
||||
- [x] 1024x1024 cuBLAS 验证
|
||||
|
||||
## Takeaways
|
||||
|
||||
1. **Row-major vs Column-major 陷阱**:这是 GEMM 实现中最容易出错的地方。cuBLAS 的 column-major 假设与 C/Rust 的 row-major 冲突。理解 `C=AB` ⟺ `C^T=B^T A^T` 这个恒等式是关键。实际做法:不做任何显式转置,只是交换 A/B 的传入顺序和调整 leading dimension 参数。
|
||||
|
||||
2. **BF16 的累加精度**:BF16 只有 ~3 位有效数字(vs FP32 的 ~7 位)。如果在 BF16 中累加 K 次乘法,误差会快速放大。正确做法是**在 FP32 中累加,最后才转回 BF16**。我们的 naive 和 tiled kernel 都遵循了这一点(`float sum = 0.0f`)。cuBLAS 通过 `CUBLAS_COMPUTE_32F` 参数控制。
|
||||
|
||||
3. **Shared memory tiling 的核心思想**:global memory 带宽是 GPU 计算的主要瓶颈。通过 shared memory tiling,每个数据从 global memory 读一次,被 TILE_SIZE 个 thread 复用。对于 TILE_SIZE=32,理论上减少 32 倍 global memory 访问。
|
||||
|
||||
4. **`__syncthreads()` 的位置关键**:tile 加载后必须同步(确保所有 thread 写完 shared memory),计算后也要同步(防止下一轮加载覆盖还在使用的数据)。漏掉任何一个 sync 都会产生 race condition 导致结果错误。
|
||||
|
||||
5. **cuBLAS handle 开销**:每次 matmul 创建/销毁 handle 有~0.1ms 开销。生产环境应全局复用一个 handle。Phase 15(性能优化)时需要修复这个问题。
|
||||
|
||||
6. **`error::check` 需要 pub**:Phase 1 中 `check()` 是 `pub(crate)`,Phase 3 需要跨 crate 调用。反思:基础设施 crate 的错误处理函数应该从一开始就设计为 public API。
|
||||
|
||||
## 后续优化方向(Phase 15)
|
||||
|
||||
- Register tiling(每个 thread 算多个输出元素)
|
||||
- Tensor Core WMMA(利用 5090 的硬件加速)
|
||||
- CublasContext 全局复用
|
||||
- 非 contiguous input 支持(避免 matmul 前的拷贝)
|
||||
213
docs/04-transformer-kernels.md
Normal file
213
docs/04-transformer-kernels.md
Normal file
@@ -0,0 +1,213 @@
|
||||
# Phase 4: Transformer Core Kernels — Design Document
|
||||
|
||||
## Goal
|
||||
|
||||
实现 Transformer 所需的所有非 Attention 算子的 CUDA kernel,每个 kernel 都支持 BF16 和 F32,与 PyTorch 参考实现对比验证。
|
||||
|
||||
## Kernel 清单
|
||||
|
||||
| Kernel | 用于 | 核心计算 | 关键优化点 |
|
||||
|--------|------|---------|-----------|
|
||||
| LayerNorm | GPT-2 | `(x - mean) / sqrt(var + eps) * gamma + beta` | 单遍累加 sum(x) 与 sum(x²)(Welford online 留作后续优化), warp reduce |
|
||||
| RMSNorm | Qwen3 | `x / sqrt(mean(x²) + eps) * gamma` | 无 mean,比 LayerNorm 简单 |
|
||||
| GELU | GPT-2 | `0.5x(1 + tanh(sqrt(2/π)(x + 0.044715x³)))` | tanh 近似,逐元素 |
|
||||
| SiLU | Qwen3 | `x * sigmoid(x)` | 逐元素 |
|
||||
| Softmax | Attention | `exp(x - max) / sum(exp(x - max))` | 三遍 safe softmax(online softmax 留作后续优化), warp reduce |
|
||||
| Embedding | 全部 | `output[i] = table[token_ids[i]]` | Gather, coalesced write |
|
||||
| RoPE | Qwen3 | 对 Q/K 的相邻元素对做旋转 | Precompute freq, in-place |
|
||||
|
||||
## 文件布局
|
||||
|
||||
```
|
||||
csrc/
|
||||
├── normalization/
|
||||
│ ├── layernorm.cu
|
||||
│ └── rmsnorm.cu
|
||||
├── activation/
|
||||
│ ├── gelu.cu
|
||||
│ └── silu.cu
|
||||
├── reduce/
|
||||
│ └── softmax.cu
|
||||
├── embedding/
|
||||
│ ├── embedding.cu
|
||||
│ └── rope.cu
|
||||
|
||||
crates/xserv-kernels/src/
|
||||
├── layernorm.rs
|
||||
├── rmsnorm.rs
|
||||
├── activation.rs # GELU + SiLU
|
||||
├── softmax.rs
|
||||
├── embedding.rs
|
||||
├── rope.rs
|
||||
└── lib.rs # 新增 mod 声明
|
||||
```
|
||||
|
||||
## Kernel 设计细节
|
||||
|
||||
### LayerNorm
|
||||
|
||||
输入 `x: [*, hidden_size]`, 输出 `y: [*, hidden_size]`
|
||||
参数 `gamma, beta: [hidden_size]`
|
||||
|
||||
```
|
||||
y[i] = gamma[i] * (x[i] - mean) / sqrt(var + eps) + beta[i]
|
||||
```
|
||||
|
||||
**GPU 映射**: 每个 thread block 处理一行(一个 hidden_size 向量)。
|
||||
- Phase 1: 并行加载 x,每个 thread 本地累加 sum(x) 和 sum(x²)(当前实现用 `var = E[x²] - mean²` 计算方差;Welford online 算法数值更稳定,留作后续优化)
|
||||
- Phase 2: warp-level reduce (`__shfl_down_sync`) 聚合 mean/var
|
||||
- Phase 3: block-level reduce via shared memory
|
||||
- Phase 4: 每个 thread 对自己负责的元素做 normalize + affine
|
||||
|
||||
**Block 配置**: `block = min(1024, hidden_size)`, `grid = num_rows`
|
||||
|
||||
### RMSNorm
|
||||
|
||||
比 LayerNorm 简单:不减 mean,只做 `x * rsqrt(mean(x²) + eps) * gamma`。
|
||||
|
||||
```
|
||||
rms = sqrt(sum(x²) / hidden_size + eps)
|
||||
y[i] = x[i] / rms * gamma[i]
|
||||
```
|
||||
|
||||
**GPU 映射**: 同 LayerNorm,每个 block 处理一行。
|
||||
- 只需要一次 reduce(求 sum(x²)),不需要两次(mean + var)。
|
||||
|
||||
### GELU
|
||||
|
||||
逐元素操作,用 tanh 近似:
|
||||
```
|
||||
gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))
|
||||
```
|
||||
|
||||
**GPU 映射**: 每个 thread 处理多个元素(向量化),grid 覆盖全部元素。
|
||||
|
||||
### SiLU (Swish)
|
||||
|
||||
逐元素: `silu(x) = x * sigmoid(x) = x / (1 + exp(-x))`
|
||||
|
||||
### Softmax
|
||||
|
||||
输入 `x: [*, seq_len]`, 沿最后一维做 softmax:
|
||||
```
|
||||
1. m = max(x) // 数值稳定
|
||||
2. e[i] = exp(x[i] - m)
|
||||
3. s = sum(e)
|
||||
4. y[i] = e[i] / s
|
||||
```
|
||||
|
||||
**GPU 映射**: 每个 block 处理一行。
|
||||
- 第一遍 reduce: 求 max
|
||||
- 第二遍: exp(x - max) 并 reduce sum
|
||||
- 第三遍: 除以 sum
|
||||
|
||||
**优化**: 可以用 online softmax 合并前两遍(边算 exp 边更新 max),但先实现三遍版本保证正确。
|
||||
|
||||
### Embedding
|
||||
|
||||
```
|
||||
output[seq_idx] = embedding_table[token_ids[seq_idx]]
|
||||
```
|
||||
|
||||
**GPU 映射**: 每个 thread 处理一个 token 的部分维度。
|
||||
- `grid = num_tokens`, `block = hidden_size`(或分多个 thread 处理一个 token)
|
||||
- 写端是 coalesced(连续 thread 写连续地址),读端是 gather(非连续)
|
||||
|
||||
### RoPE (Rotary Position Embedding)
|
||||
|
||||
对 Q/K 的每对相邻元素 `(x0, x1)` 做 2D 旋转:
|
||||
```
|
||||
freq[i] = 1.0 / (theta ^ (2i / dim))
|
||||
cos_val = cos(position * freq[i])
|
||||
sin_val = sin(position * freq[i])
|
||||
y0 = x0 * cos_val - x1 * sin_val
|
||||
y1 = x0 * sin_val + x1 * cos_val
|
||||
```
|
||||
|
||||
**GPU 映射**: 每个 thread 处理一对元素 `(x[2i], x[2i+1])`。
|
||||
- Precompute `cos_cache[max_seq_len][head_dim/2]` 和 `sin_cache` 在初始化时
|
||||
- 运行时 kernel 只做乘加
|
||||
|
||||
**theta**: Qwen3 默认 `rope_theta = 1000000.0`
|
||||
|
||||
## Reduction Pattern(核心学习点)
|
||||
|
||||
所有 Norm 和 Softmax 都涉及 reduction。GPU reduction 的分层结构:
|
||||
|
||||
```
|
||||
Thread-level: 每个 thread 处理多个元素,本地累加
|
||||
↓
|
||||
Warp-level: __shfl_down_sync() 在 32 threads 内规约(无需 shared memory)
|
||||
↓
|
||||
Block-level: shared memory 存各 warp 的结果,warp 0 再规约
|
||||
```
|
||||
|
||||
对于 hidden_size <= 8192(LLM 常见),一个 block 足够,不需要 grid-level reduction。
|
||||
|
||||
### Warp Reduce 模板
|
||||
|
||||
```cuda
|
||||
__device__ float warp_reduce_sum(float val) {
|
||||
for (int offset = 16; offset > 0; offset >>= 1)
|
||||
val += __shfl_down_sync(0xffffffff, val, offset);
|
||||
return val;
|
||||
}
|
||||
```
|
||||
|
||||
### Block Reduce 模板
|
||||
|
||||
```cuda
|
||||
__device__ float block_reduce_sum(float val) {
|
||||
__shared__ float shared[32]; // max 32 warps per block
|
||||
int lane = threadIdx.x % 32;
|
||||
int warp_id = threadIdx.x / 32;
|
||||
|
||||
val = warp_reduce_sum(val);
|
||||
if (lane == 0) shared[warp_id] = val;
|
||||
__syncthreads();
|
||||
|
||||
val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0f;
|
||||
if (warp_id == 0) val = warp_reduce_sum(val);
|
||||
return val;
|
||||
}
|
||||
```
|
||||
|
||||
## Reference 验证策略
|
||||
|
||||
写 `tools/generate_reference.py` 脚本,用 PyTorch 为每个 op 生成 reference input/output:
|
||||
- 保存为 `.npy` 格式
|
||||
- Rust 测试中加载对比
|
||||
- 或者直接在 Rust 测试中用 CPU 实现计算 expected 值(更简单,不依赖 Python)
|
||||
|
||||
**选择**: 先用 Rust CPU 实现作为 reference(简单),关键 op(RoPE)再与 PyTorch 对比。
|
||||
|
||||
## Test Plan
|
||||
|
||||
- [x] RMSNorm F32: hidden_size=768, 4 rows → max_err 7.2e-7
|
||||
- [x] RMSNorm BF16: 同上 → max_err 7.0e-3
|
||||
- [x] LayerNorm F32: hidden_size=768 → max_err 1.7e-6
|
||||
- [x] GELU F32: 10000 elements → max_err 3.0e-8
|
||||
- [x] GELU BF16: 同上 → max_err 2.4e-3
|
||||
- [x] SiLU F32: 10000 elements → max_err 1.5e-8
|
||||
- [x] Softmax F32: 8×256 → max_err 1.4e-9
|
||||
- [x] Softmax sum=1 验证: 4×2048
|
||||
- [x] Softmax 大值 (1000+) 数值稳定性 → max_err 1.5e-8
|
||||
- [x] Embedding F32: vocab=100, hidden=64, 5 tokens → exact match
|
||||
- [x] RoPE F32: 4 tokens × 2 heads × dim=8 → max_err 6.0e-8
|
||||
- [x] RoPE position=0 恒等验证 → max_err 0
|
||||
|
||||
## Takeaways
|
||||
|
||||
1. **`common.cuh` 抽取共用 reduction 是正确的做法**:`warp_reduce_sum/max` 和 `block_reduce_sum/max` 被 RMSNorm, LayerNorm, Softmax 三个 kernel 复用。抽到头文件避免了代码重复,也确保 reduction 逻辑一致。build.rs 中需要 `.include("../../csrc")` 让 nvcc 能找到头文件。
|
||||
|
||||
2. **Shared memory 中广播标量的模式**:Norm 和 Softmax 都需要将 reduce 结果(mean, rms_inv, max, sum)广播给 block 内所有 thread。标准做法:thread 0 写 `__shared__` 变量,`__syncthreads()` 后所有 thread 读。这比让每个 thread 独立做 reduce 高效得多。
|
||||
|
||||
3. **Softmax 三遍 vs 两遍**:我们实现了三遍版本(max → exp+sum → normalize),简单可靠。Online softmax 可以合并前两遍(一遍 pass 内同时跟踪 running max 和 running sum),但需要更复杂的数值更新公式。Flash Attention(Phase 14)会用到 online softmax。
|
||||
|
||||
4. **RoPE 的 position=0 恒等性**:`cos(0)=1, sin(0)=0`,所以 position 0 的旋转是恒等变换。这是一个很好的 sanity check。如果 position=0 时输出不等于输入,说明 kernel 有 bug。
|
||||
|
||||
5. **BF16 Softmax 的精度陷阱**:exp 结果先写成 BF16 再读回做 normalize 会丢精度。理想做法是用 float scratch buffer 暂存 exp 结果。当前实现可接受(误差在 1e-2 量级),但在 attention score 很接近时可能引入可观察的差异。Phase 14 Flash Attention 会解决这个问题(全程 FP32 累加)。
|
||||
|
||||
6. **Embedding 就是 gather 操作**:没有任何计算,纯粹的内存搬运。瓶颈在 global memory 随机读取(token_ids 导致不连续读 table)。写端是 coalesced 的(同一 token 内连续 thread 写连续地址)。优化方向:使用向量化加载(`float4`)一次读 128 bit。
|
||||
|
||||
7. **RoPE in-place 修改 Tensor 的设计考量**:RoPE 在数学上是对 Q/K 的 in-place 旋转。我们通过 `data_ptr() as *mut` 绕过了 Rust 的不可变借用。这在 GPU 上是安全的(kernel 内部互不干扰),但 Rust 侧没有 `&mut` 语义保护。后续如果需要更严格的安全性,可以引入 `Tensor::as_mut_ptr()` 方法并要求 `&mut self`。
|
||||
92
docs/05-attention.md
Normal file
92
docs/05-attention.md
Normal file
@@ -0,0 +1,92 @@
|
||||
# Phase 5: Naive Attention Kernel — Design Document
|
||||
|
||||
## Goal
|
||||
|
||||
实现标准 Multi-Head Attention(不做 Flash/Paged 优化),用组合式方法(GEMM + Softmax)完成。这是理解 attention 计算流程的基础,也是后续 Flash Attention 的 baseline。
|
||||
|
||||
## 计算流程
|
||||
|
||||
```
|
||||
Input: Q [B, H, S, D], K [B, H, S, D], V [B, H, S, D]
|
||||
B=batch, H=num_heads, S=seq_len, D=head_dim
|
||||
|
||||
1. scores = Q @ K^T / sqrt(D) → [B, H, S, S]
|
||||
2. scores += causal_mask → 上三角置为 -inf
|
||||
3. weights = softmax(scores, dim=-1) → [B, H, S, S]
|
||||
4. output = weights @ V → [B, H, S, D]
|
||||
```
|
||||
|
||||
## 设计选择
|
||||
|
||||
### 组合式实现(Phase 3 GEMM + Phase 4 Softmax)
|
||||
|
||||
不写新的 fused CUDA kernel,而是复用已有的 matmul 和 softmax:
|
||||
- `scores = batched_matmul(Q, K^T)` — 需要支持 batched GEMM
|
||||
- `masked_fill(scores, causal_mask, -inf)` — 新的逐元素 kernel
|
||||
- `softmax(scores)` — 复用 Phase 4
|
||||
- `output = batched_matmul(weights, V)` — 复用 batched GEMM
|
||||
|
||||
这意味着需要先扩展 matmul 支持 batched GEMM(cublasGemmStridedBatchedEx)。
|
||||
|
||||
### Causal Mask
|
||||
|
||||
不显式构造 mask 矩阵。写一个 kernel:
|
||||
```
|
||||
if (col > row + offset) score = -infinity
|
||||
```
|
||||
其中 offset 用于支持 KV cache 场景(decode 时 query 的 row 偏移)。
|
||||
|
||||
### Batched GEMM via cuBLAS
|
||||
|
||||
`cublasGemmStridedBatchedEx` 在一个 batch 维度上并行执行多个 GEMM:
|
||||
```
|
||||
C[b] = A[b] @ B[b] for b = 0..batch_count
|
||||
stride_a = M * K, stride_b = K * N, stride_c = M * N
|
||||
```
|
||||
|
||||
Attention 中 batch 维度 = B * H(batch_size × num_heads)。
|
||||
|
||||
## 文件布局
|
||||
|
||||
```
|
||||
csrc/attention/
|
||||
└── causal_mask.cu # causal mask fill kernel
|
||||
|
||||
crates/xserv-kernels/src/
|
||||
├── gemm.rs # 扩展: batched_matmul
|
||||
├── attention.rs # NEW: multi_head_attention()
|
||||
└── causal_mask.rs # NEW: causal mask apply
|
||||
```
|
||||
|
||||
## API 设计
|
||||
|
||||
```rust
|
||||
/// Multi-head attention (naive, materializes S×S scores).
|
||||
/// q, k, v: [batch, num_heads, seq_len, head_dim]
|
||||
/// Returns: [batch, num_heads, seq_len, head_dim]
|
||||
pub fn attention(q: &Tensor, k: &Tensor, v: &Tensor, causal: bool) -> Tensor;
|
||||
|
||||
/// Batched matmul: A[b] @ B[b] for all b.
|
||||
/// a: [..., M, K], b: [..., K, N] → [..., M, N]
|
||||
pub fn batched_matmul(a: &Tensor, b: &Tensor) -> Tensor;
|
||||
```
|
||||
|
||||
## Test Plan
|
||||
|
||||
- [x] batched_matmul: [4,8,32,64]×[4,8,64,32] → max_err 2.7e-7
|
||||
- [x] attention (non-causal): B=1,H=2,S=8,D=16 → max_err 4.5e-8
|
||||
- [x] attention (causal): B=1,H=2,S=16,D=32 → max_err 3.0e-8
|
||||
- [x] attention (causal, larger): B=2,H=4,S=64,D=64 → max_err 6.0e-8
|
||||
- [x] causal mask 语义: position 0 只能看到 token 0,output[0] == V[0] → exact
|
||||
|
||||
## Takeaways
|
||||
|
||||
1. **`to_device` 不应强制 contiguous**:最初 `to_device()` 会先调 `contiguous()`,而 GPU 的 `contiguous()` 又调 `to_device(Cpu)`,导致无限递归栈溢出。修复:`to_device()` 直接传输 raw storage,保留 strides/offset,用户需要时自己调 `contiguous()`。GPU `contiguous()` 现在走「GPU→CPU 拷贝 → 在 CPU 上做 contiguous 重排 → CPU→GPU 拷贝」的路径——正确但低效,Phase 15 需要写 GPU contiguous kernel。
|
||||
|
||||
2. **Batched GEMM via `cublasGemmStridedBatchedEx`**:row-major trick 同 Phase 3,额外参数是 stride(元素数,不是字节)。stride_a = M×K, stride_b = K×N, stride_c = M×N。注意初始版本错误地乘了 `elem_size`,cuBLAS 的 stride 单位是元素。
|
||||
|
||||
3. **Attention 的组合式实现足够验证正确性**:没有写 fused kernel,而是复用 `batched_matmul` + `scale` + `causal_mask` + `softmax`。精度极好(max_err < 1e-7),因为每步都在 FP32 中完成。缺点是 S×S score 矩阵完全 materialize(O(S²) 显存),Flash Attention 会解决。
|
||||
|
||||
4. **Scale kernel 的必要性**:原本想在 CPU 上做 scale(round-trip),但那太慢了。加了 `scale_f32/bf16` 逐元素 CUDA kernel。未来可以把 scale 合进 GEMM 的 alpha 参数,省一次 kernel launch。
|
||||
|
||||
5. **Causal mask 的 offset 设计**:`col > row + offset` 中的 offset 为 KV cache 场景预留。Decode 时 Q 只有 1 行但 KV cache 有前 S 行,offset = kv_len - q_len 确保 decode query 能看到所有 cached tokens。
|
||||
69
docs/06-model-loading.md
Normal file
69
docs/06-model-loading.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# Phase 6: Model Loading — Design Document
|
||||
|
||||
## Goal
|
||||
|
||||
从 HuggingFace safetensors 文件加载模型权重到 GPU Tensor。解析 config.json 获取模型结构参数。
|
||||
|
||||
## Crate: `xserv-model`
|
||||
|
||||
```
|
||||
crates/xserv-model/src/
|
||||
├── lib.rs
|
||||
├── config.rs # ModelConfig from config.json
|
||||
├── loader.rs # safetensors weight loading
|
||||
└── gpt2.rs # (Phase 8) GPT-2 model definition
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `safetensors` crate: parse safetensors format
|
||||
- `serde` + `serde_json`: deserialize config.json
|
||||
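- `memmap2`: mmap for zero-copy file access (the `safetensors` crate does not mmap by itself — it parses a caller-provided `&[u8]`, so the loader maps the file and passes the mapped bytes to it)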
|
||||
|
||||
## Weight Loading Flow
|
||||
|
||||
```
|
||||
safetensors file (disk)
|
||||
→ safetensors crate parses header (tensor names, shapes, dtypes, offsets)
|
||||
→ mmap raw data
|
||||
→ for each tensor:
|
||||
→ read bytes at offset
|
||||
→ create CPU Tensor from raw bytes
|
||||
→ .to_device(Cuda(0)) → GPU Tensor
|
||||
→ return HashMap<String, Tensor>
|
||||
```
|
||||
|
||||
## Config Parsing
|
||||
|
||||
```rust
|
||||
#[derive(Deserialize)]
|
||||
pub struct ModelConfig {
|
||||
pub architectures: Option<Vec<String>>,
|
||||
pub model_type: Option<String>,
|
||||
pub hidden_size: usize,
|
||||
pub intermediate_size: Option<usize>,
|
||||
pub num_attention_heads: usize,
|
||||
pub num_key_value_heads: Option<usize>,
|
||||
pub num_hidden_layers: usize,
|
||||
pub vocab_size: usize,
|
||||
pub max_position_embeddings: Option<usize>,
|
||||
pub layer_norm_eps: Option<f64>,
|
||||
pub rms_norm_eps: Option<f64>,
|
||||
pub rope_theta: Option<f64>,
|
||||
pub tie_word_embeddings: Option<bool>,
|
||||
}
|
||||
```
|
||||
|
||||
## Test Plan
|
||||
|
||||
- [x] Load GPT-2 124M: 160 tensors loaded successfully
|
||||
- [x] Parse GPT-2 config.json: hidden=768, layers=12, heads=12, vocab=50257
|
||||
- [x] Sharded loading path implemented (for larger models)
|
||||
|
||||
## Takeaways
|
||||
|
||||
1. **GPT-2 vs modern HF config naming**:GPT-2 uses `n_embd`/`n_head`/`n_layer`/`n_positions`,而不是 `hidden_size`/`num_attention_heads` 等。ModelConfig 需要支持两套命名并提供统一的 accessor methods(`hidden()`, `num_heads()` 等)。
|
||||
|
||||
2. **safetensors 零拷贝读取**:先用 `memmap2` mmap 文件,再把映射得到的 `&[u8]` 交给 `safetensors` crate(它本身不做 mmap,只在传入的字节切片上解析)。解析 header 得到 tensor 的 offset 和 shape,然后 zero-copy 读取 raw bytes。对于 GPT-2 的 500MB 权重文件,加载速度很快。
|
||||
|
||||
3. **模型下载的网络问题**:HuggingFace 在中国网络下不可达。使用 modelscope.cn 或 hf-mirror.com 作为替代。大文件(>100MB)的 redirect 到 CDN 可能也会失败,modelscope 的 snapshot_download 更可靠。
|
||||
57
docs/07-tokenizer.md
Normal file
57
docs/07-tokenizer.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# Phase 7: BPE Tokenizer — Design Document
|
||||
|
||||
## Goal
|
||||
|
||||
从零实现 Byte-Pair Encoding tokenizer,兼容 HuggingFace `tokenizer.json` 格式。支持 GPT-2 和 Qwen3。
|
||||
|
||||
## Crate: `xserv-tokenizer`
|
||||
|
||||
```
|
||||
crates/xserv-tokenizer/src/
|
||||
├── lib.rs
|
||||
├── bpe.rs # BPE encode/decode core algorithm
|
||||
└── chat.rs # Chat template formatting
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `serde` + `serde_json`: parse tokenizer.json
|
||||
- `regex`: pre-tokenization patterns
|
||||
|
||||
## BPE Algorithm
|
||||
|
||||
### Encode
|
||||
1. Pre-tokenize: split text by regex (GPT-2 pattern)
|
||||
2. Each word → byte sequence → initial token list (one token per byte)
|
||||
3. Repeatedly merge highest-priority pair until no more merges
|
||||
4. Map merged tokens to IDs via vocab
|
||||
|
||||
### Decode
|
||||
Token IDs → lookup vocab → concatenate bytes → UTF-8 decode
|
||||
|
||||
## Key Data Structures
|
||||
|
||||
```rust
|
||||
pub struct Tokenizer {
|
||||
vocab: HashMap<Vec<u8>, u32>, // token bytes → ID
|
||||
vocab_rev: Vec<Vec<u8>>, // ID → token bytes
|
||||
merges: Vec<(Vec<u8>, Vec<u8>)>, // ordered merge rules
|
||||
merge_ranks: HashMap<(u32, u32), usize>, // (id_a, id_b) → priority
|
||||
special_tokens: HashMap<String, u32>,
|
||||
pre_tokenize_regex: Regex,
|
||||
}
|
||||
```
|
||||
|
||||
## Test Plan
|
||||
|
||||
- [x] Encode + decode roundtrip verified (GPT-2 tokenizer, English text)
|
||||
- [x] Special tokens handled (endoftext)
|
||||
- [x] Integrated into GPT-2 inference pipeline, generates coherent text
|
||||
|
||||
## Takeaways
|
||||
|
||||
1. **GPT-2 byte-to-unicode 映射**:GPT-2 的 vocab 中,每个 byte 都映射到一个 Unicode 字符。可打印 ASCII (0x21-0x7E) 以及 Latin-1 的 0xA1-0xAC、0xAE-0xFF 映射到自身,其余字节(空格、控制字符、0x7F-0xA0、0xAD)按顺序映射到从 U+0100 开始的 Unicode 码点(例如空格 0x20 → U+0120 `Ġ`,换行 0x0A → U+010A `Ċ`)。解码时需要反向映射。这个映射表是 BPE tokenizer 正确性的关键。
|
||||
|
||||
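2. **Rust regex 不支持 lookahead**:GPT-2 的 pre-tokenization regex 使用了 `\s+(?!\S)` 这个 negative lookahead,Rust 的 `regex` crate 不支持。当前实现去掉了 lookahead:对常见的单空格分隔文本,分词结果相同,但并非完全等价——例如 `"a  b"`(两个空格),Python 会切成 `"a"`, `" "`, `" b"`,去掉 lookahead 后则是 `"a"`, `"  "`, `"b"`,得到的 token ID 不同。如果需要精确匹配 Python 行为,需要 `fancy-regex` crate。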
|
||||
|
||||
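3. **BPE merge 的 O(n²) 复杂度**:当前实现每次 merge 都扫描整个 token 序列找最高优先级 pair。这里的 n 是单个 pre-tokenized word 的 token 数(merge 在每个 word 内独立进行)。通过 `merge_ranks` 哈希表查 pair 优先级时,每轮扫描是 O(n),最多 n−1 轮,整体 O(n²);如果每轮改为线性遍历 merges 列表,则会退化为 O(n² × |merges|)。对于短文本够用,超长的 word(例如没有空白的长串)需要 priority queue 优化。推理场景中 prompt 通常 < 10K tokens,暂时可接受。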
|
||||
71
docs/08-gpt2.md
Normal file
71
docs/08-gpt2.md
Normal file
@@ -0,0 +1,71 @@
|
||||
# Phase 8: GPT-2 Complete Inference — Design Document (Milestone ①)
|
||||
|
||||
## Goal
|
||||
|
||||
Wire everything together: load GPT-2 124M, tokenize input, run forward pass, sample tokens, decode output. First time seeing the model "speak".
|
||||
|
||||
## Model Architecture (GPT-2 124M)
|
||||
|
||||
```
|
||||
hidden_size = 768
|
||||
num_heads = 12
|
||||
num_layers = 12
|
||||
vocab_size = 50257
|
||||
max_position_embeddings = 1024
|
||||
activation = GELU
|
||||
normalization = LayerNorm (pre-LN)
|
||||
tied embeddings (lm_head == wte)
|
||||
```
|
||||
|
||||
## Forward Pass
|
||||
|
||||
```
|
||||
tokens [S]
|
||||
→ wte[tokens] + wpe[0..S] → [S, 768]
|
||||
→ for each layer:
|
||||
residual = x
|
||||
x = layernorm(x, ln_1)
|
||||
x = attention(x) # Q,K,V from linear, MHA, output linear
|
||||
x = x + residual
|
||||
residual = x
|
||||
x = layernorm(x, ln_2)
|
||||
x = mlp(x) # linear→GELU→linear
|
||||
x = x + residual
|
||||
→ layernorm(x, ln_f)
|
||||
→ logits = x @ wte.T → [S, 50257]
|
||||
→ sample(logits[-1]) → next token
|
||||
```
|
||||
|
||||
## Sampling
|
||||
|
||||
- Greedy: argmax (the only mode implemented in this phase; the other modes listed here are planned)
|
||||
- Temperature: logits / T → softmax → sample
|
||||
- Top-K: keep top-k logits, rest = -inf
|
||||
- Top-P: sort by prob (descending), keep the smallest prefix whose cumulative prob ≥ p (always at least one token), rest = -inf
|
||||
|
||||
## CLI Binary
|
||||
|
||||
```
|
||||
$ cargo run --release --bin xserv-cli -- --model path/to/gpt2
|
||||
|
||||
xserv> The future of AI is
|
||||
GPT-2> ...generated text...
|
||||
```
|
||||
|
||||
## Test Plan
|
||||
|
||||
- [x] Greedy generation produces coherent English text
|
||||
- [x] Interactive CLI works (pipe and interactive mode)
|
||||
- [x] Multiple prompts verified: "The future of AI is", "Once upon a time"
|
||||
|
||||
## Takeaways
|
||||
|
||||
1. **QKV split + head reshape 的 layout 陷阱(最关键的 bug)**:GPT-2 的 `c_attn` 输出 `[S, 3H]` 需要 split 成 Q/K/V 再 reshape 成 `[1, num_heads, S, head_dim]`。关键错误:从 `[S, num_heads, head_dim]` 直接 `reshape` 到 `[1, num_heads, S, head_dim]` 不等于 transpose!Reshape 只是重新解释 flat data 的 shape,不会重排数据。必须手动按 `[batch, head, seq, dim]` 的目标 layout 写入数据。同理 merge_heads 也需要手动重排。
|
||||
|
||||
2. **CPU round-trip 作为 correctness first 策略**:`add_tensors`、`add_bias`、`split_qkv`、`merge_heads` 都通过 CPU round-trip 实现。虽然慢(每次都有 GPU→CPU→GPU 拷贝),但确保了正确性。Phase 15 会写专门的 CUDA kernel 替换这些操作。
|
||||
|
||||
3. **GPT-2 的 Conv1D 权重布局**:GPT-2 用 `Conv1D` 而非 `Linear`,权重存为 `[in, out]`(不是标准 Linear 的 `[out, in]`)。计算方式是 `x @ weight`(不需要转置)。这和 Qwen3/LLaMA 的 `[out, in]` 布局不同——Phase 10 需要注意。
|
||||
|
||||
4. **Greedy decoding 的重复问题**:GPT-2 124M 在 greedy decoding 下极易陷入循环("The world was a place of great danger, and...")。这是已知行为,temperature + top-k/top-p sampling 可以缓解。当前实现只有 greedy,sampling 将在后续添加。
|
||||
|
||||
5. **无 KV Cache 的性能代价**:每生成一个 token 都要重新跑完整 forward pass(O(S²) attention)。50 tokens 的生成需要 50 次 full forward,每次的 attention 复杂度还在增长。Phase 9 的 KV Cache 会将 decode 降到 O(S) per token。
|
||||
Reference in New Issue
Block a user