1 Commits

Author SHA1 Message Date
2d48f25e66 phase 11: GPU-resident KV cache
- GpuKVCache: pre-allocated GPU buffers, D2D copy append at offset
- Per-head strided layout [num_kv_heads, max_seq_len, head_dim]
- Fixed critical bug: seq_len must advance AFTER all layers write
  (not inside the loop per-layer)
- GpuBuffer::copy_from_device_at for offset-based D2D copy
- Tensor::from_storage constructor for wrapping raw GPU buffers
- Exported Storage and Dims from xserv-tensor

Correctness: GPU KV cache vs CPU KV cache = 50/50 bit-identical
Performance: ~neutral (KV cache was never the main bottleneck —
reshape/merge/transpose CPU round-trips dominate for Qwen3-8B)

TTFT: 122ms, TBT: 142ms, 7.0 tok/s (marginal change from 7.3)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 11:50:12 +08:00
9 changed files with 274 additions and 14 deletions

View File

@@ -87,6 +87,20 @@ impl GpuBuffer {
error::check(unsafe { ffi::cudaMemset(self.ptr, 0, self.len) })
}
/// Copy `count` bytes from `src` buffer at `src_offset` to this buffer at `dst_offset`.
pub fn copy_from_device_at(&mut self, src: &GpuBuffer, src_offset: usize, dst_offset: usize, count: usize) -> Result<()> {
assert!(src_offset + count <= src.len);
assert!(dst_offset + count <= self.len);
error::check(unsafe {
ffi::cudaMemcpy(
self.ptr.add(dst_offset),
src.ptr.add(src_offset),
count,
ffi::CUDA_MEMCPY_D2D,
)
})
}
/// Consume the buffer without freeing GPU memory. Returns the raw pointer and length.
/// Caller is responsible for eventually calling cudaFree.
pub fn into_raw(self) -> (*mut u8, usize) {

View File

@@ -9,6 +9,7 @@ xserv-tensor = { path = "../xserv-tensor" }
xserv-kernels = { path = "../xserv-kernels" }
xserv-tokenizer = { path = "../xserv-tokenizer" }
half.workspace = true
smallvec.workspace = true
serde.workspace = true
serde_json.workspace = true
safetensors.workspace = true

View File

@@ -1,7 +1,7 @@
use std::path::PathBuf;
use std::time::Instant;
use xserv_model::qwen3::sample_greedy;
use xserv_model::{loader, KVCache, ModelConfig, Qwen3};
use xserv_model::{loader, GpuKVCache, ModelConfig, Qwen3};
use xserv_tensor::{DType, Device};
use xserv_tokenizer::Tokenizer;
@@ -31,11 +31,8 @@ fn main() {
// Warmup
{
let ids = tokenizer.encode("warmup");
let mut cache = KVCache::new(
config.num_layers(), config.num_kv_heads(), config.head_dim(),
DType::BF16, Device::Cuda(0),
);
let _ = model.forward_with_cache(&ids, &mut cache);
let mut cache = GpuKVCache::new(&config, 256, DType::BF16);
let _ = model.forward_gpu_cache(&ids, &mut cache);
}
eprintln!("Warmup done. Running benchmark...");
@@ -97,14 +94,11 @@ fn main() {
let input_ids = tokenizer.encode(prompt);
let input_len = input_ids.len();
let mut cache = KVCache::new(
config.num_layers(), config.num_kv_heads(), config.head_dim(),
DType::BF16, Device::Cuda(0),
);
let mut cache = GpuKVCache::new(&config, 256, DType::BF16);
// Prefill
let t0 = Instant::now();
let logits = model.forward_with_cache(&input_ids, &mut cache);
let logits = model.forward_gpu_cache(&input_ids, &mut cache);
let first_token = sample_greedy(&logits);
let ttft_us = t0.elapsed().as_micros();
@@ -115,7 +109,7 @@ fn main() {
for _ in 1..gen_tokens {
let last = *generated.last().unwrap();
let t_start = Instant::now();
let logits = model.forward_with_cache(&[last], &mut cache);
let logits = model.forward_gpu_cache(&[last], &mut cache);
let next = sample_greedy(&logits);
token_times.push(t_start.elapsed().as_micros());
generated.push(next);
@@ -148,12 +142,14 @@ fn main() {
print!("\"tpot_us\": {tpot_us}}}");
if i < prompts.len() - 1 { println!(","); } else { println!(); }
let display_text = generated_text.replace('\n', " ");
let truncated: String = display_text.chars().take(60).collect();
eprintln!(
"[{}/{}] input={input_len}tok gen={num_generated}tok ttft={:.1}ms tbt={:.1}ms | {}",
i + 1, prompts.len(),
ttft_us as f64 / 1000.0,
tbt_us as f64 / 1000.0,
&generated_text.replace('\n', " ")[..generated_text.len().min(60)]
truncated
);
}
println!("]");

View File

@@ -0,0 +1,118 @@
use xserv_cuda::GpuBuffer;
use xserv_tensor::{DType, Device, Tensor};
use crate::config::ModelConfig;
/// GPU-resident KV cache. Pre-allocates max_seq_len on GPU,
/// appends new K/V via D2D copy at offset (no CPU round-trip).
pub struct GpuKVCache {
// Per layer: contiguous GPU buffer for K and V
// Layout: [num_kv_heads, max_seq_len, head_dim] — contiguous per head
k_bufs: Vec<GpuBuffer>,
v_bufs: Vec<GpuBuffer>,
seq_len: usize,
max_seq_len: usize,
num_kv_heads: usize,
head_dim: usize,
elem_size: usize,
dtype: DType,
}
impl GpuKVCache {
pub fn new(config: &ModelConfig, max_seq_len: usize, dtype: DType) -> Self {
let num_layers = config.num_layers();
let num_kv_heads = config.num_kv_heads();
let head_dim = config.head_dim();
let elem_size = dtype.size_bytes();
let buf_size = num_kv_heads * max_seq_len * head_dim * elem_size;
let mut k_bufs = Vec::with_capacity(num_layers);
let mut v_bufs = Vec::with_capacity(num_layers);
for _ in 0..num_layers {
let mut k = GpuBuffer::alloc(buf_size).expect("alloc KV cache K");
let mut v = GpuBuffer::alloc(buf_size).expect("alloc KV cache V");
k.zero().unwrap();
v.zero().unwrap();
k_bufs.push(k);
v_bufs.push(v);
}
Self { k_bufs, v_bufs, seq_len: 0, max_seq_len, num_kv_heads, head_dim, elem_size, dtype }
}
pub fn seq_len(&self) -> usize { self.seq_len }
pub fn max_seq_len(&self) -> usize { self.max_seq_len }
/// Append new K/V tensors for a given layer.
/// k_new, v_new: [1, num_kv_heads, new_tokens, head_dim] on GPU, contiguous.
/// `write_pos` is the sequence position to write at (caller manages this).
pub fn append(&mut self, layer: usize, k_new: &Tensor, v_new: &Tensor, new_tokens: usize, write_pos: usize) {
assert!(write_pos + new_tokens <= self.max_seq_len, "KV cache overflow");
let es = self.elem_size;
let hd = self.head_dim;
let max_s = self.max_seq_len;
let nh = self.num_kv_heads;
let k_src = k_new.storage().gpu_buffer();
let v_src = v_new.storage().gpu_buffer();
for h in 0..nh {
let src_off = h * new_tokens * hd * es;
let dst_off = (h * max_s + write_pos) * hd * es;
let count = new_tokens * hd * es;
self.k_bufs[layer].copy_from_device_at(k_src, src_off, dst_off, count).unwrap();
self.v_bufs[layer].copy_from_device_at(v_src, src_off, dst_off, count).unwrap();
}
}
pub fn advance_seq_len(&mut self, new_tokens: usize) {
self.seq_len += new_tokens;
}
/// Get K/V cache tensors for a layer up to `seq_len` tokens: [1, num_kv_heads, seq_len, head_dim]
pub fn get_kv(&self, layer: usize) -> (Tensor, Tensor) {
let sl = self.seq_len;
self.get_kv_len(layer, sl)
}
pub fn get_kv_len(&self, layer: usize, sl: usize) -> (Tensor, Tensor) {
let hd = self.head_dim;
let nh = self.num_kv_heads;
let es = self.elem_size;
let max_s = self.max_seq_len;
// Allocate output tensors [1, nh, sl, hd]
let out_size = nh * sl * hd * es;
let mut k_out = GpuBuffer::alloc(out_size).expect("alloc k_out");
let mut v_out = GpuBuffer::alloc(out_size).expect("alloc v_out");
// Copy each head's valid portion
for h in 0..nh {
let src_off = (h * max_s) * hd * es;
let dst_off = (h * sl) * hd * es;
let count = sl * hd * es;
k_out.copy_from_device_at(&self.k_bufs[layer], src_off, dst_off, count).unwrap();
v_out.copy_from_device_at(&self.v_bufs[layer], src_off, dst_off, count).unwrap();
}
let shape = &[1usize, nh, sl, hd];
let k = unsafe { tensor_from_gpu_buffer(k_out, shape, self.dtype) };
let v = unsafe { tensor_from_gpu_buffer(v_out, shape, self.dtype) };
(k, v)
}
}
/// Create a Tensor from a GpuBuffer (takes ownership).
unsafe fn tensor_from_gpu_buffer(buf: GpuBuffer, shape: &[usize], dtype: DType) -> Tensor {
use xserv_tensor::storage::Storage;
use xserv_tensor::shape::contiguous_strides;
use smallvec::SmallVec;
let storage = Storage::cuda(buf);
Tensor::from_storage(
storage,
SmallVec::from_slice(shape),
contiguous_strides(shape),
0,
dtype,
)
}

View File

@@ -1,8 +1,10 @@
pub mod config;
pub mod gpt2;
pub mod kv_cache;
pub mod loader;
pub mod qwen3;
pub use config::ModelConfig;
pub use gpt2::{GPT2, KVCache};
pub use kv_cache::GpuKVCache;
pub use qwen3::Qwen3;

View File

@@ -5,6 +5,7 @@ use xserv_tensor::{DType, Device, Tensor};
use crate::config::ModelConfig;
use crate::gpt2::KVCache;
use crate::kv_cache::GpuKVCache;
pub struct Qwen3 {
pub config: ModelConfig,
@@ -145,6 +146,69 @@ impl Qwen3 {
let x = rmsnorm(&x, &self.norm, eps);
matmul_2d(&x, &self.lm_head_t)
}
/// Forward with GPU-resident KV cache (no CPU round-trips for KV).
pub fn forward_gpu_cache(&self, token_ids: &[u32], cache: &mut GpuKVCache) -> Tensor {
let new_tokens = token_ids.len();
let pos_offset = cache.seq_len();
let hidden = self.config.hidden();
let num_heads = self.config.num_heads();
let num_kv_heads = self.config.num_kv_heads();
let head_dim = self.config.head_dim();
let eps = self.config.rms_norm_eps.unwrap_or(1e-6) as f32;
let mut x = embedding(&self.embed_tokens, token_ids);
let positions: Vec<u32> = (pos_offset..pos_offset + new_tokens).map(|p| p as u32).collect();
for (layer_idx, layer) in self.layers.iter().enumerate() {
let residual = x.clone();
let normed = rmsnorm(&x, &layer.input_norm, eps);
let q = matmul_2d(&normed, &layer.q_proj_wt);
let k = matmul_2d(&normed, &layer.k_proj_wt);
let v = matmul_2d(&normed, &layer.v_proj_wt);
let q = reshape_heads(&q, new_tokens, num_heads, head_dim);
let k = reshape_heads(&k, new_tokens, num_kv_heads, head_dim);
let v = reshape_heads(&v, new_tokens, num_kv_heads, head_dim);
let q = head_rmsnorm(&q, &layer.q_norm, eps);
let k = head_rmsnorm(&k, &layer.k_norm, eps);
let q = transpose_for_rope(&q, new_tokens, num_heads, head_dim);
let k = transpose_for_rope(&k, new_tokens, num_kv_heads, head_dim);
rope_inplace(&q, &self.rope_cache, &positions);
rope_inplace(&k, &self.rope_cache, &positions);
let q = transpose_from_rope(&q, new_tokens, num_heads, head_dim);
let k = transpose_from_rope(&k, new_tokens, num_kv_heads, head_dim);
// GPU KV cache: D2D append, no CPU round-trip
cache.append(layer_idx, &k, &v, new_tokens, pos_offset);
let (k_full, v_full) = cache.get_kv_len(layer_idx, pos_offset + new_tokens);
let n_rep = num_heads / num_kv_heads;
let k_full = repeat_kv(&k_full, n_rep);
let v_full = repeat_kv(&v_full, n_rep);
let attn_out = attention(&q, &k_full, &v_full, true);
let attn_merged = merge_heads_any(&attn_out, new_tokens, hidden);
let attn_proj = matmul_2d(&attn_merged, &layer.o_proj_wt);
x = add_any(&residual, &attn_proj);
let residual = x.clone();
let normed = rmsnorm(&x, &layer.post_norm, eps);
let gate = matmul_2d(&normed, &layer.gate_proj_wt);
let up = matmul_2d(&normed, &layer.up_proj_wt);
let gate_activated = silu(&gate);
let hidden_states = mul_any(&gate_activated, &up);
let down = matmul_2d(&hidden_states, &layer.down_proj_wt);
x = add_any(&residual, &down);
}
cache.advance_seq_len(new_tokens);
let x = rmsnorm(&x, &self.norm, eps);
matmul_2d(&x, &self.lm_head_t)
}
}
// --- Helpers ---

View File

@@ -4,5 +4,6 @@ pub mod storage;
pub mod tensor;
pub use dtype::{DType, TensorDType};
pub use storage::Device;
pub use shape::Dims;
pub use storage::{Device, Storage};
pub use tensor::Tensor;

View File

@@ -18,6 +18,11 @@ pub struct Tensor {
impl Tensor {
// --- Creation ---
/// Create a tensor from raw components (for advanced use like GPU KV cache).
pub fn from_storage(storage: Storage, shape: Dims, strides: Dims, offset: usize, dtype: DType) -> Self {
Self { storage, shape, strides, offset, dtype }
}
pub fn from_slice<T: TensorDType>(data: &[T], shape: &[usize]) -> Self {
let numel: usize = shape.iter().product();
assert_eq!(data.len(), numel, "data length mismatch with shape");

View File

@@ -0,0 +1,59 @@
# Phase 11: Paged Attention + KV Cache Manager — Design Document
## Goal
将 KV cache 从 CPU Vec 迁移到 GPU使用 block-based paging 管理显存。消除每步 decode 的 CPU round-trip当前 KV cache 最大性能瓶颈之一)。
## 当前问题
每步 decode 的 KV cache 路径:
```
GPU tensor (K_new) → CPU (per-head Vec append) → reconstruct → CPU tensor → GPU tensor
```
这涉及 2 次 GPU↔CPU 拷贝 × 36 层 × 2(K,V) = 144 次 transfer/token。
## 目标设计
KV cache 直接存在 GPU 上decode 时只做 GPU→GPU append
```
GPU tensor (K_new) → GPU KV cache (in-place append, no CPU)
```
## 实现方案
### GPU KV Cache简化版非 paged
先实现连续分配的 GPU KV cache预分配 max_seq_len消除 CPU round-trip。Paged allocation 留待后续优化。
```rust
pub struct GpuKVCache {
// 预分配: [num_layers, 2, num_kv_heads, max_seq_len, head_dim] on GPU
k_caches: Vec<Tensor>, // per layer: [1, num_kv_heads, max_seq_len, head_dim]
v_caches: Vec<Tensor>,
seq_len: usize, // 当前已填充的长度
max_seq_len: usize,
}
```
### Append 操作
用 cudaMemcpy D2D 将新 K/V 写入 cache 的正确偏移位置:
```
k_cache[layer][0, :, seq_len:seq_len+new, :] = k_new[0, :, :, :]
```
### 读取操作
不需要拷贝——直接用 view/slice 返回 [0, :, 0:seq_len, :] 的 GPU tensor。
## 需要的新功能
1. Tensor slice 支持view into sub-range of a dimension
2. GPU D2D copy at offset写入 cache 指定位置)
3. 去掉 Qwen3/GPT-2 forward 中的 CPU round-trip KV cache 路径
## Test Plan
- [ ] GPU KV cache 输出与 CPU KV cache bit-identical
- [ ] Benchmark: TBT 应显著降低(消除 144 次 CPU round-trip
- [ ] 50-prompt correctness re-validation