Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2d48f25e66 |
@@ -87,6 +87,20 @@ impl GpuBuffer {
|
||||
error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
|
||||
}
|
||||
|
||||
/// Copy `count` bytes from `src` buffer at `src_offset` to this buffer at `dst_offset`.
|
||||
pub fn copy_from_device_at(&mut self, src: &GpuBuffer, src_offset: usize, dst_offset: usize, count: usize) -> Result<()> {
|
||||
assert!(src_offset + count <= src.len);
|
||||
assert!(dst_offset + count <= self.len);
|
||||
error::check(unsafe {
|
||||
ffi::cudaMemcpy(
|
||||
self.ptr.add(dst_offset),
|
||||
src.ptr.add(src_offset),
|
||||
count,
|
||||
ffi::CUDA_MEMCPY_D2D,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
|
||||
/// Caller is responsible for eventually calling cudaFree.
|
||||
pub fn into_raw(self) -> (*mut u8, usize) {
|
||||
|
||||
@@ -9,6 +9,7 @@ xserv-tensor = { path = "../xserv-tensor" }
|
||||
xserv-kernels = { path = "../xserv-kernels" }
|
||||
xserv-tokenizer = { path = "../xserv-tokenizer" }
|
||||
half.workspace = true
|
||||
smallvec.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
safetensors.workspace = true
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::path::PathBuf;
|
||||
use std::time::Instant;
|
||||
use xserv_model::qwen3::sample_greedy;
|
||||
use xserv_model::{loader, KVCache, ModelConfig, Qwen3};
|
||||
use xserv_model::{loader, GpuKVCache, ModelConfig, Qwen3};
|
||||
use xserv_tensor::{DType, Device};
|
||||
use xserv_tokenizer::Tokenizer;
|
||||
|
||||
@@ -31,11 +31,8 @@ fn main() {
|
||||
// Warmup
|
||||
{
|
||||
let ids = tokenizer.encode("warmup");
|
||||
let mut cache = KVCache::new(
|
||||
config.num_layers(), config.num_kv_heads(), config.head_dim(),
|
||||
DType::BF16, Device::Cuda(0),
|
||||
);
|
||||
let _ = model.forward_with_cache(&ids, &mut cache);
|
||||
let mut cache = GpuKVCache::new(&config, 256, DType::BF16);
|
||||
let _ = model.forward_gpu_cache(&ids, &mut cache);
|
||||
}
|
||||
eprintln!("Warmup done. Running benchmark...");
|
||||
|
||||
@@ -97,14 +94,11 @@ fn main() {
|
||||
let input_ids = tokenizer.encode(prompt);
|
||||
let input_len = input_ids.len();
|
||||
|
||||
let mut cache = KVCache::new(
|
||||
config.num_layers(), config.num_kv_heads(), config.head_dim(),
|
||||
DType::BF16, Device::Cuda(0),
|
||||
);
|
||||
let mut cache = GpuKVCache::new(&config, 256, DType::BF16);
|
||||
|
||||
// Prefill
|
||||
let t0 = Instant::now();
|
||||
let logits = model.forward_with_cache(&input_ids, &mut cache);
|
||||
let logits = model.forward_gpu_cache(&input_ids, &mut cache);
|
||||
let first_token = sample_greedy(&logits);
|
||||
let ttft_us = t0.elapsed().as_micros();
|
||||
|
||||
@@ -115,7 +109,7 @@ fn main() {
|
||||
for _ in 1..gen_tokens {
|
||||
let last = *generated.last().unwrap();
|
||||
let t_start = Instant::now();
|
||||
let logits = model.forward_with_cache(&[last], &mut cache);
|
||||
let logits = model.forward_gpu_cache(&[last], &mut cache);
|
||||
let next = sample_greedy(&logits);
|
||||
token_times.push(t_start.elapsed().as_micros());
|
||||
generated.push(next);
|
||||
@@ -148,12 +142,14 @@ fn main() {
|
||||
print!("\"tpot_us\": {tpot_us}}}");
|
||||
if i < prompts.len() - 1 { println!(","); } else { println!(); }
|
||||
|
||||
let display_text = generated_text.replace('\n', " ");
|
||||
let truncated: String = display_text.chars().take(60).collect();
|
||||
eprintln!(
|
||||
"[{}/{}] input={input_len}tok gen={num_generated}tok ttft={:.1}ms tbt={:.1}ms | {}",
|
||||
i + 1, prompts.len(),
|
||||
ttft_us as f64 / 1000.0,
|
||||
tbt_us as f64 / 1000.0,
|
||||
&generated_text.replace('\n', " ")[..generated_text.len().min(60)]
|
||||
truncated
|
||||
);
|
||||
}
|
||||
println!("]");
|
||||
|
||||
118
crates/xserv-model/src/kv_cache.rs
Normal file
118
crates/xserv-model/src/kv_cache.rs
Normal file
@@ -0,0 +1,118 @@
|
||||
use xserv_cuda::GpuBuffer;
|
||||
use xserv_tensor::{DType, Device, Tensor};
|
||||
use crate::config::ModelConfig;
|
||||
|
||||
/// GPU-resident KV cache. Pre-allocates max_seq_len on GPU,
|
||||
/// appends new K/V via D2D copy at offset (no CPU round-trip).
|
||||
pub struct GpuKVCache {
|
||||
// Per layer: contiguous GPU buffer for K and V
|
||||
// Layout: [num_kv_heads, max_seq_len, head_dim] — contiguous per head
|
||||
k_bufs: Vec<GpuBuffer>,
|
||||
v_bufs: Vec<GpuBuffer>,
|
||||
seq_len: usize,
|
||||
max_seq_len: usize,
|
||||
num_kv_heads: usize,
|
||||
head_dim: usize,
|
||||
elem_size: usize,
|
||||
dtype: DType,
|
||||
}
|
||||
|
||||
impl GpuKVCache {
|
||||
pub fn new(config: &ModelConfig, max_seq_len: usize, dtype: DType) -> Self {
|
||||
let num_layers = config.num_layers();
|
||||
let num_kv_heads = config.num_kv_heads();
|
||||
let head_dim = config.head_dim();
|
||||
let elem_size = dtype.size_bytes();
|
||||
let buf_size = num_kv_heads * max_seq_len * head_dim * elem_size;
|
||||
|
||||
let mut k_bufs = Vec::with_capacity(num_layers);
|
||||
let mut v_bufs = Vec::with_capacity(num_layers);
|
||||
for _ in 0..num_layers {
|
||||
let mut k = GpuBuffer::alloc(buf_size).expect("alloc KV cache K");
|
||||
let mut v = GpuBuffer::alloc(buf_size).expect("alloc KV cache V");
|
||||
k.zero().unwrap();
|
||||
v.zero().unwrap();
|
||||
k_bufs.push(k);
|
||||
v_bufs.push(v);
|
||||
}
|
||||
|
||||
Self { k_bufs, v_bufs, seq_len: 0, max_seq_len, num_kv_heads, head_dim, elem_size, dtype }
|
||||
}
|
||||
|
||||
pub fn seq_len(&self) -> usize { self.seq_len }
|
||||
pub fn max_seq_len(&self) -> usize { self.max_seq_len }
|
||||
|
||||
/// Append new K/V tensors for a given layer.
|
||||
/// k_new, v_new: [1, num_kv_heads, new_tokens, head_dim] on GPU, contiguous.
|
||||
/// `write_pos` is the sequence position to write at (caller manages this).
|
||||
pub fn append(&mut self, layer: usize, k_new: &Tensor, v_new: &Tensor, new_tokens: usize, write_pos: usize) {
|
||||
assert!(write_pos + new_tokens <= self.max_seq_len, "KV cache overflow");
|
||||
let es = self.elem_size;
|
||||
let hd = self.head_dim;
|
||||
let max_s = self.max_seq_len;
|
||||
let nh = self.num_kv_heads;
|
||||
|
||||
let k_src = k_new.storage().gpu_buffer();
|
||||
let v_src = v_new.storage().gpu_buffer();
|
||||
|
||||
for h in 0..nh {
|
||||
let src_off = h * new_tokens * hd * es;
|
||||
let dst_off = (h * max_s + write_pos) * hd * es;
|
||||
let count = new_tokens * hd * es;
|
||||
self.k_bufs[layer].copy_from_device_at(k_src, src_off, dst_off, count).unwrap();
|
||||
self.v_bufs[layer].copy_from_device_at(v_src, src_off, dst_off, count).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn advance_seq_len(&mut self, new_tokens: usize) {
|
||||
self.seq_len += new_tokens;
|
||||
}
|
||||
|
||||
/// Get K/V cache tensors for a layer up to `seq_len` tokens: [1, num_kv_heads, seq_len, head_dim]
|
||||
pub fn get_kv(&self, layer: usize) -> (Tensor, Tensor) {
|
||||
let sl = self.seq_len;
|
||||
self.get_kv_len(layer, sl)
|
||||
}
|
||||
|
||||
pub fn get_kv_len(&self, layer: usize, sl: usize) -> (Tensor, Tensor) {
|
||||
let hd = self.head_dim;
|
||||
let nh = self.num_kv_heads;
|
||||
let es = self.elem_size;
|
||||
let max_s = self.max_seq_len;
|
||||
|
||||
// Allocate output tensors [1, nh, sl, hd]
|
||||
let out_size = nh * sl * hd * es;
|
||||
let mut k_out = GpuBuffer::alloc(out_size).expect("alloc k_out");
|
||||
let mut v_out = GpuBuffer::alloc(out_size).expect("alloc v_out");
|
||||
|
||||
// Copy each head's valid portion
|
||||
for h in 0..nh {
|
||||
let src_off = (h * max_s) * hd * es;
|
||||
let dst_off = (h * sl) * hd * es;
|
||||
let count = sl * hd * es;
|
||||
k_out.copy_from_device_at(&self.k_bufs[layer], src_off, dst_off, count).unwrap();
|
||||
v_out.copy_from_device_at(&self.v_bufs[layer], src_off, dst_off, count).unwrap();
|
||||
}
|
||||
|
||||
let shape = &[1usize, nh, sl, hd];
|
||||
let k = unsafe { tensor_from_gpu_buffer(k_out, shape, self.dtype) };
|
||||
let v = unsafe { tensor_from_gpu_buffer(v_out, shape, self.dtype) };
|
||||
(k, v)
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a Tensor from a GpuBuffer (takes ownership).
|
||||
unsafe fn tensor_from_gpu_buffer(buf: GpuBuffer, shape: &[usize], dtype: DType) -> Tensor {
|
||||
use xserv_tensor::storage::Storage;
|
||||
use xserv_tensor::shape::contiguous_strides;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
let storage = Storage::cuda(buf);
|
||||
Tensor::from_storage(
|
||||
storage,
|
||||
SmallVec::from_slice(shape),
|
||||
contiguous_strides(shape),
|
||||
0,
|
||||
dtype,
|
||||
)
|
||||
}
|
||||
@@ -1,8 +1,10 @@
|
||||
pub mod config;
|
||||
pub mod gpt2;
|
||||
pub mod kv_cache;
|
||||
pub mod loader;
|
||||
pub mod qwen3;
|
||||
|
||||
pub use config::ModelConfig;
|
||||
pub use gpt2::{GPT2, KVCache};
|
||||
pub use kv_cache::GpuKVCache;
|
||||
pub use qwen3::Qwen3;
|
||||
|
||||
@@ -5,6 +5,7 @@ use xserv_tensor::{DType, Device, Tensor};
|
||||
|
||||
use crate::config::ModelConfig;
|
||||
use crate::gpt2::KVCache;
|
||||
use crate::kv_cache::GpuKVCache;
|
||||
|
||||
pub struct Qwen3 {
|
||||
pub config: ModelConfig,
|
||||
@@ -145,6 +146,69 @@ impl Qwen3 {
|
||||
let x = rmsnorm(&x, &self.norm, eps);
|
||||
matmul_2d(&x, &self.lm_head_t)
|
||||
}
|
||||
|
||||
/// Forward with GPU-resident KV cache (no CPU round-trips for KV).
|
||||
pub fn forward_gpu_cache(&self, token_ids: &[u32], cache: &mut GpuKVCache) -> Tensor {
|
||||
let new_tokens = token_ids.len();
|
||||
let pos_offset = cache.seq_len();
|
||||
let hidden = self.config.hidden();
|
||||
let num_heads = self.config.num_heads();
|
||||
let num_kv_heads = self.config.num_kv_heads();
|
||||
let head_dim = self.config.head_dim();
|
||||
let eps = self.config.rms_norm_eps.unwrap_or(1e-6) as f32;
|
||||
|
||||
let mut x = embedding(&self.embed_tokens, token_ids);
|
||||
let positions: Vec<u32> = (pos_offset..pos_offset + new_tokens).map(|p| p as u32).collect();
|
||||
|
||||
for (layer_idx, layer) in self.layers.iter().enumerate() {
|
||||
let residual = x.clone();
|
||||
let normed = rmsnorm(&x, &layer.input_norm, eps);
|
||||
|
||||
let q = matmul_2d(&normed, &layer.q_proj_wt);
|
||||
let k = matmul_2d(&normed, &layer.k_proj_wt);
|
||||
let v = matmul_2d(&normed, &layer.v_proj_wt);
|
||||
|
||||
let q = reshape_heads(&q, new_tokens, num_heads, head_dim);
|
||||
let k = reshape_heads(&k, new_tokens, num_kv_heads, head_dim);
|
||||
let v = reshape_heads(&v, new_tokens, num_kv_heads, head_dim);
|
||||
|
||||
let q = head_rmsnorm(&q, &layer.q_norm, eps);
|
||||
let k = head_rmsnorm(&k, &layer.k_norm, eps);
|
||||
|
||||
let q = transpose_for_rope(&q, new_tokens, num_heads, head_dim);
|
||||
let k = transpose_for_rope(&k, new_tokens, num_kv_heads, head_dim);
|
||||
rope_inplace(&q, &self.rope_cache, &positions);
|
||||
rope_inplace(&k, &self.rope_cache, &positions);
|
||||
let q = transpose_from_rope(&q, new_tokens, num_heads, head_dim);
|
||||
let k = transpose_from_rope(&k, new_tokens, num_kv_heads, head_dim);
|
||||
|
||||
// GPU KV cache: D2D append, no CPU round-trip
|
||||
cache.append(layer_idx, &k, &v, new_tokens, pos_offset);
|
||||
let (k_full, v_full) = cache.get_kv_len(layer_idx, pos_offset + new_tokens);
|
||||
|
||||
let n_rep = num_heads / num_kv_heads;
|
||||
let k_full = repeat_kv(&k_full, n_rep);
|
||||
let v_full = repeat_kv(&v_full, n_rep);
|
||||
|
||||
let attn_out = attention(&q, &k_full, &v_full, true);
|
||||
let attn_merged = merge_heads_any(&attn_out, new_tokens, hidden);
|
||||
let attn_proj = matmul_2d(&attn_merged, &layer.o_proj_wt);
|
||||
x = add_any(&residual, &attn_proj);
|
||||
|
||||
let residual = x.clone();
|
||||
let normed = rmsnorm(&x, &layer.post_norm, eps);
|
||||
let gate = matmul_2d(&normed, &layer.gate_proj_wt);
|
||||
let up = matmul_2d(&normed, &layer.up_proj_wt);
|
||||
let gate_activated = silu(&gate);
|
||||
let hidden_states = mul_any(&gate_activated, &up);
|
||||
let down = matmul_2d(&hidden_states, &layer.down_proj_wt);
|
||||
x = add_any(&residual, &down);
|
||||
}
|
||||
|
||||
cache.advance_seq_len(new_tokens);
|
||||
let x = rmsnorm(&x, &self.norm, eps);
|
||||
matmul_2d(&x, &self.lm_head_t)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
@@ -4,5 +4,6 @@ pub mod storage;
|
||||
pub mod tensor;
|
||||
|
||||
pub use dtype::{DType, TensorDType};
|
||||
pub use storage::Device;
|
||||
pub use shape::Dims;
|
||||
pub use storage::{Device, Storage};
|
||||
pub use tensor::Tensor;
|
||||
|
||||
@@ -18,6 +18,11 @@ pub struct Tensor {
|
||||
impl Tensor {
|
||||
// --- Creation ---
|
||||
|
||||
/// Create a tensor from raw components (for advanced use like GPU KV cache).
|
||||
pub fn from_storage(storage: Storage, shape: Dims, strides: Dims, offset: usize, dtype: DType) -> Self {
|
||||
Self { storage, shape, strides, offset, dtype }
|
||||
}
|
||||
|
||||
pub fn from_slice<T: TensorDType>(data: &[T], shape: &[usize]) -> Self {
|
||||
let numel: usize = shape.iter().product();
|
||||
assert_eq!(data.len(), numel, "data length mismatch with shape");
|
||||
|
||||
59
docs/11-paged-attention.md
Normal file
59
docs/11-paged-attention.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# Phase 11: Paged Attention + KV Cache Manager — Design Document
|
||||
|
||||
## Goal
|
||||
|
||||
将 KV cache 从 CPU Vec 迁移到 GPU,使用 block-based paging 管理显存。消除每步 decode 的 CPU round-trip(当前 KV cache 最大性能瓶颈之一)。
|
||||
|
||||
## 当前问题
|
||||
|
||||
每步 decode 的 KV cache 路径:
|
||||
```
|
||||
GPU tensor (K_new) → CPU (per-head Vec append) → reconstruct → CPU tensor → GPU tensor
|
||||
```
|
||||
这涉及 2 次 GPU↔CPU 拷贝 × 36 层 × 2(K,V) = 144 次 transfer/token。
|
||||
|
||||
## 目标设计
|
||||
|
||||
KV cache 直接存在 GPU 上,decode 时只做 GPU→GPU append:
|
||||
```
|
||||
GPU tensor (K_new) → GPU KV cache (in-place append, no CPU)
|
||||
```
|
||||
|
||||
## 实现方案
|
||||
|
||||
### GPU KV Cache(简化版,非 paged)
|
||||
|
||||
先实现连续分配的 GPU KV cache(预分配 max_seq_len),消除 CPU round-trip。Paged allocation 留待后续优化。
|
||||
|
||||
```rust
|
||||
pub struct GpuKVCache {
|
||||
// 预分配: [num_layers, 2, num_kv_heads, max_seq_len, head_dim] on GPU
|
||||
k_caches: Vec<Tensor>, // per layer: [1, num_kv_heads, max_seq_len, head_dim]
|
||||
v_caches: Vec<Tensor>,
|
||||
seq_len: usize, // 当前已填充的长度
|
||||
max_seq_len: usize,
|
||||
}
|
||||
```
|
||||
|
||||
### Append 操作
|
||||
|
||||
用 cudaMemcpy D2D 将新 K/V 写入 cache 的正确偏移位置:
|
||||
```
|
||||
k_cache[layer][0, :, seq_len:seq_len+new, :] = k_new[0, :, :, :]
|
||||
```
|
||||
|
||||
### 读取操作
|
||||
|
||||
不需要拷贝——直接用 view/slice 返回 [0, :, 0:seq_len, :] 的 GPU tensor。
|
||||
|
||||
## 需要的新功能
|
||||
|
||||
1. Tensor slice 支持(view into sub-range of a dimension)
|
||||
2. GPU D2D copy at offset(写入 cache 指定位置)
|
||||
3. 去掉 Qwen3/GPT-2 forward 中的 CPU round-trip KV cache 路径
|
||||
|
||||
## Test Plan
|
||||
|
||||
- [ ] GPU KV cache 输出与 CPU KV cache bit-identical
|
||||
- [ ] Benchmark: TBT 应显著降低(消除 144 次 CPU round-trip)
|
||||
- [ ] 50-prompt correctness re-validation
|
||||
Reference in New Issue
Block a user