Files
Gahow Wang 837df6bc9e v2 exp(a): three-tier KV-hit latency microbench (GPU >> CPU >> miss)
Measures TTFT to serve a reused prefix of length L from each KV tier on a
single H20 (Qwen3-Coder-30B-A3B, vLLM 0.18.1): miss (recompute), CPU-tier
hit (native DRAM offload), GPU-tier hit (HBM prefix cache). Each measured
request is bracketed by /metrics scrapes so the tier is verified
(vllm:prefix_cache_hits vs external_prefix_cache_hits), not assumed.

Result: GPU hit is ~flat (42->111 ms over 1k->64k tokens); CPU hit is
transfer-bound (PCIe H2D ~54 GB/s, 57->272 ms); miss grows superlinearly
(78 ms -> 15.2 s). GPU beats CPU 1.4-2.5x (gap grows with context);
miss/CPU up to 56x, miss/GPU up to 137x. pcie_transfer.py is the
independent CPU-hit floor backstop. Evidence for the GPU-hit-first
principle (paper section 2.2).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 11:23:04 +08:00

51 lines
2.2 KiB
Bash

#!/bin/bash
# Exp (a): three-tier hit-latency. Runs on dash0. One H20 (GPU $GPU).
#
# Environment overrides:
#   GPU   value exported as CUDA_VISIBLE_DEVICES for the server and the
#         PCIe benchmark (default: 0)
#   PORT  TCP port the vLLM server listens on (default: 8100)
#
# Note: -e is intentionally not part of the options below, so a failing
# command does not abort the script on its own; critical steps are checked
# explicitly.
set -uo pipefail

# Every relative path used later (.venv/..., v2/...) is resolved against
# the repository root, so continuing after a failed cd would run against
# the wrong tree.
cd /home/admin/cpfs/wjh/agentic-kv || {
  echo "cannot cd to /home/admin/cpfs/wjh/agentic-kv" >&2
  exit 1
}

PY=.venv/bin/python
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
GPU=${GPU:-0}
PORT=${PORT:-8100}
EP=http://127.0.0.1:$PORT
OUT=v2/exp_a_tier_latency/results

# Server logs and result JSON files are all written under $OUT.
mkdir -p "$OUT" || {
  echo "cannot create output directory $OUT" >&2
  exit 1
}

# PID of the currently running vLLM server; empty when none is running.
VLLM_PID=""
#######################################
# Start a vLLM OpenAI-compatible API server in the background and block
# until the health probe succeeds.
# Globals:   PY, MODEL, GPU, PORT, EP (read); VLLM_PID (written)
# Arguments: $1 - extra server flags as a single whitespace-separated string
#            $2 - log file that receives the server's stdout and stderr
# Outputs:   progress messages on stdout; on failure, "LAUNCH FAILED"
#            followed by the last 30 lines of the log file
# Returns:   0 when the probe reports healthy, 1 otherwise
#######################################
launch() {
  local extra_flags=$1 server_log=$2

  printf '>>> launch vllm: %s\n' "$extra_flags"

  # $PY, $PORT and $extra_flags are left unquoted on purpose: the extra
  # flags arrive as one string and must be split into separate arguments.
  # shellcheck disable=SC2206
  local -a server_cmd=(
    $PY -m vllm.entrypoints.openai.api_server --model "$MODEL"
    --host 0.0.0.0 --port $PORT --tensor-parallel-size 1 --trust-remote-code
    --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000
    $extra_flags
  )
  CUDA_VISIBLE_DEVICES=$GPU VLLM_LOGGING_LEVEL=WARNING \
    "${server_cmd[@]}" > "$server_log" 2>&1 &
  VLLM_PID=$!
  printf ' pid=%s waiting for health...\n' "$VLLM_PID"

  # The probe calls wait_healthy(endpoint, 900) from v2/common/util.py and
  # maps its truthiness to the exit status (900 is presumably a timeout in
  # seconds -- confirm against common.util).
  local probe="import sys; sys.path.insert(0,'v2'); from common.util import wait_healthy; sys.exit(0 if wait_healthy('$EP',900) else 1)"
  if ! $PY -c "$probe"; then
    printf 'LAUNCH FAILED\n'
    tail -30 "$server_log"
    return 1
  fi

  printf ' healthy.\n'
}
#######################################
# Stop the vLLM server recorded in VLLM_PID, if any.
# Sends SIGTERM, polls up to 40 times (1 s apart) for the process to
# disappear, and escalates to SIGKILL if it is still alive afterwards.
# Safe to call repeatedly and from the EXIT trap.
# Globals:   VLLM_PID (read; reset to "" once the server has been handled)
# Arguments: none
# Outputs:   a warning on stderr when SIGKILL is needed or fails
# Returns:   0 always
#######################################
teardown() {
  local pid=${VLLM_PID:-}
  local i

  # Nothing launched, or already torn down: skip the polling and the sleep.
  [ -n "$pid" ] || return 0

  kill -TERM "$pid" 2>/dev/null
  for ((i = 0; i < 40; i++)); do
    kill -0 "$pid" 2>/dev/null || break
    sleep 1
  done

  # Still alive after the grace period: SIGTERM was ignored or the process
  # is hung, so force it. A second SIGTERM would change nothing.
  if kill -0 "$pid" 2>/dev/null; then
    echo " pid=$pid still alive after SIGTERM; sending SIGKILL" >&2
    kill -KILL "$pid" 2>/dev/null
    # Bounded wait so a process stuck in the kernel cannot hang the script.
    for ((i = 0; i < 10; i++)); do
      kill -0 "$pid" 2>/dev/null || break
      sleep 1
    done
    kill -0 "$pid" 2>/dev/null \
      && echo " pid=$pid did not exit after SIGKILL" >&2
  fi

  # Short settle delay before the caller continues (presumably to let GPU
  # memory and the listening port be released -- confirm).
  sleep 3
  VLLM_PID=""
  return 0
}
# Make sure a running server is stopped on every exit path.
trap teardown EXIT

# Number of measurement steps that exited non-zero.
failures=0

#######################################
# Run one measurement step. A failure is logged and counted but does not
# stop the script, so the remaining tiers are still measured.
# Globals:   failures (incremented on failure)
# Arguments: $1 - label used in the failure message
#            $2.. - command to run and its arguments
# Returns:   0 always
#######################################
run_step() {
  local label=$1 rc=0
  shift
  "$@" || rc=$?
  if (( rc != 0 )); then
    echo "!!! $label FAILED (exit $rc)" >&2
    failures=$((failures + 1))
  fi
  return 0
}

# ---- Config A1: big GPU pool, NO offload -> measure MISS + GPU hit ----
launch "--gpu-memory-utilization 0.9" "$OUT/vllm_a1.log" || exit 1
run_step "miss" "$PY" v2/exp_a_tier_latency/driver.py \
  --endpoint "$EP" --model "$MODEL" --mode miss --reps 8 --out "$OUT/miss.json"
run_step "gpu" "$PY" v2/exp_a_tier_latency/driver.py \
  --endpoint "$EP" --model "$MODEL" --mode gpu --reps 8 --out "$OUT/gpu.json"
teardown

# ---- Config A2: small GPU pool (80k tok = 5000 blocks) + CPU offload 40GB -> CPU hit ----
launch "--num-gpu-blocks-override 5000 --kv-offloading-size 40 --kv-offloading-backend native" "$OUT/vllm_a2.log" || exit 1
run_step "cpu" "$PY" v2/exp_a_tier_latency/driver.py \
  --endpoint "$EP" --model "$MODEL" --mode cpu --reps 4 \
  --flood-tokens 88000 --flood-chunk 16384 --out "$OUT/cpu.json"
teardown

# ---- PCIe backstop (uses the now-free GPU) ----
run_step "pcie" env CUDA_VISIBLE_DEVICES="$GPU" "$PY" \
  v2/exp_a_tier_latency/pcie_transfer.py --reps 20 --out "$OUT/pcie.json"

# Report failures through the exit status instead of always claiming success.
if (( failures > 0 )); then
  echo "=== exp (a) FINISHED WITH $failures FAILED STEP(S) ===" >&2
  exit 1
fi
echo "=== exp (a) DONE ==="