Measures TTFT to serve a reused prefix of length L from each KV tier on a single H20 (Qwen3-Coder-30B-A3B, vLLM 0.18.1): miss (recompute), CPU-tier hit (native DRAM offload), GPU-tier hit (HBM prefix cache). Each measured request is bracketed by /metrics scrapes so the tier is verified (vllm:prefix_cache_hits vs external_prefix_cache_hits), not assumed. Result: GPU hit is ~flat (42->111 ms over 1k->64k tokens); CPU hit is transfer-bound (PCIe H2D ~54 GB/s, 57->272 ms); miss grows superlinearly (78 ms -> 15.2 s). GPU beats CPU 1.4-2.5x (gap grows with context); miss/CPU up to 56x, miss/GPU up to 137x. pcie_transfer.py is the independent CPU-hit floor backstop. Evidence for the GPU-hit-first principle (paper section 2.2). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
40 lines
1.6 KiB
Bash
40 lines
1.6 KiB
Bash
#!/bin/bash
# Exp (a) CPU-tier + PCIe only (miss/gpu already done). HMA fix applied.
#
# Environment overrides:
#   GPU   value used for CUDA_VISIBLE_DEVICES by the commands below (default 0)
#   PORT  TCP port of the local vLLM API server (default 8100)
#
# NOTE: `-e` is not enabled, so a failing command does not stop the script by
# itself; steps whose failure matters must be checked explicitly.
set -uo pipefail

# PY, OUT and the v2/... script paths are all relative to this directory, so
# running on after a failed cd would operate on whatever directory the caller
# happened to be in. Abort instead.
cd /home/admin/cpfs/wjh/agentic-kv || {
  echo "cannot cd to /home/admin/cpfs/wjh/agentic-kv" >&2
  exit 1
}

PY=.venv/bin/python
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
GPU=${GPU:-0}
PORT=${PORT:-8100}
EP=http://127.0.0.1:$PORT
OUT=v2/exp_a_tier_latency/results

# The server log and both result JSON files are written under $OUT.
mkdir -p "$OUT" || {
  echo "cannot create output directory $OUT" >&2
  exit 1
}

# PID of the background vLLM server; empty while no server is running.
VLLM_PID=""

#######################################
# Stop the background server whose PID is stored in VLLM_PID.
#
# Sends SIGTERM, polls once per second for up to 40 s until the PID is gone,
# escalates to SIGKILL if it is still there, then pauses 3 s before returning
# (presumably to let the GPU be released -- confirm).
#
# When VLLM_PID is empty this is a no-op that returns immediately, so the
# EXIT trap firing after an explicit teardown (or before any launch) does not
# cost a pointless 3 s sleep.
#
# Globals:  VLLM_PID (read; reset to "" on return)
# Returns:  0 always
#######################################
teardown() {
  [ -n "${VLLM_PID:-}" ] || return 0

  kill -TERM "$VLLM_PID" 2>/dev/null

  local i
  for ((i = 0; i < 40; i++)); do
    kill -0 "$VLLM_PID" 2>/dev/null || break
    sleep 1
  done

  # A server that ignored SIGTERM for the whole grace period must not be left
  # running behind our back; force it down.
  if kill -0 "$VLLM_PID" 2>/dev/null; then
    echo "teardown: pid $VLLM_PID still alive after 40s, sending SIGKILL" >&2
    kill -KILL "$VLLM_PID" 2>/dev/null
  fi

  sleep 3
  VLLM_PID=""
  return 0
}

# Run teardown on every exit path, including the launch-failure exit below.
trap teardown EXIT

echo ">>> launch A2: small pool + CPU offload (HMA disabled)"

# Server flags are held in an array so every value is passed as exactly one
# argument regardless of its content.
vllm_args=(
  --model "$MODEL"
  --host 0.0.0.0 --port "$PORT" --tensor-parallel-size 1 --trust-remote-code
  --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000
  --num-gpu-blocks-override 5000 --kv-offloading-size 40 --kv-offloading-backend native
  --disable-hybrid-kv-cache-manager
)

# Start the server in the background; all of its output goes to the log file
# that is tailed if the health wait fails.
CUDA_VISIBLE_DEVICES=$GPU VLLM_LOGGING_LEVEL=WARNING \
  "$PY" -m vllm.entrypoints.openai.api_server "${vllm_args[@]}" \
  > "$OUT/vllm_a2.log" 2>&1 &
VLLM_PID=$!
echo " pid=$VLLM_PID waiting for health..."

# Block until wait_healthy (from v2/common/util) reports success for the
# endpoint; 900 is passed as its second argument (presumably a timeout in
# seconds -- confirm against common.util). The endpoint is handed over via
# argv instead of being spliced into the Python source, so no shell value is
# ever parsed as Python code.
"$PY" -c '
import sys
sys.path.insert(0, "v2")
from common.util import wait_healthy
sys.exit(0 if wait_healthy(sys.argv[1], 900) else 1)
' "$EP" \
  || { echo "LAUNCH FAILED"; tail -25 "$OUT/vllm_a2.log"; exit 1; }
echo " healthy."

# The script does not run under `set -e`, so each measurement's exit status is
# recorded here; otherwise a failed run would still end with the DONE banner
# and exit status 0.
rc=0

# CPU-tier measurement against the running server; results go to cpu.json.
"$PY" v2/exp_a_tier_latency/driver.py --endpoint "$EP" --model "$MODEL" --mode cpu --reps 4 \
  --flood-tokens 88000 --flood-chunk 16384 --out "$OUT/cpu.json" \
  || { rc=$?; echo "driver.py failed (exit $rc)" >&2; }

# Stop the server before the PCIe probe (presumably so the probe does not
# share the GPU with it -- confirm).
teardown

# The PCIe probe does not talk to the server, so it is run even when the
# driver failed; its own failure is recorded as well.
CUDA_VISIBLE_DEVICES=$GPU "$PY" v2/exp_a_tier_latency/pcie_transfer.py --reps 20 --out "$OUT/pcie.json" \
  || { rc=$?; echo "pcie_transfer.py failed (exit $rc)" >&2; }

if (( rc != 0 )); then
  echo "=== exp (a) CPU+PCIe FAILED ===" >&2
  exit "$rc"
fi
echo "=== exp (a) CPU+PCIe DONE ==="