Sweeps GPU KV-cache capacity (--num-gpu-blocks-override) under a closed-loop replay (concurrency 4) of a controlled multi-turn workload (cumulative intra-session prefix, gen_synth_trace.py), measuring realized APC (prefix_cache hits/queries delta) and latency per capacity. Result: a sharp knee at 3.6 GB = exactly the active working set (4 sessions x 0.91 GB). APC rises 7->12->36->80% then saturates at the ~71% intra-session ceiling; TTFT p90 collapses 13.0 s -> 0.53 s at the same point; dead flat to 14.5 GB, 100% completion throughout. So only the active working set needs HBM; capacity beyond it -- and the CPU/storage tier built to chase the reuse tail -- buys ~0. Knee scales linearly with concurrency = cluster GPU count. README.md ties exp(a)+exp(b) into the section-2.2 GPU-hit-first argument with tables, conclusions, and caveats. Raw per-request dumps gitignored; summary/m0/m1 deltas kept. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
55 lines
2.3 KiB
Bash
55 lines
2.3 KiB
Bash
#!/bin/bash
# Exp (b): capacity -> realized-APC -> latency knee. Runs on dash0, one H20.
#
# Environment overrides (all optional; defaults below):
#   GPU       value exported as CUDA_VISIBLE_DEVICES for the server
#   PORT      port the server listens on (EP is derived from it)
#   TRACE     trace file handed to the replayer
#   MAXLEN    --max-model-len for the server
#   REQLIMIT  --request-limit for the replayer
#   INFLIGHT  --max-inflight-sessions for the replayer
#   CAPS      whitespace-separated list of --num-gpu-blocks-override values

# Note: no -e. The sweep loop handles a failed capacity point itself and moves
# on to the next one, so a single failure must not abort the whole script.
set -uo pipefail

# Every relative path below (venv, trace, results) is resolved from the repo
# root, so refuse to run at all if we cannot get there.
cd /home/admin/cpfs/wjh/agentic-kv || {
  echo "cannot cd to /home/admin/cpfs/wjh/agentic-kv" >&2
  exit 1
}

PY=.venv/bin/python
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
GPU=${GPU:-1}
PORT=${PORT:-8200}
EP=http://127.0.0.1:$PORT

# Filtered trace (inputs <= 60k tok) so max-model-len can be 64k and the low
# capacity points still boot; raw trace has p90=89k/max=167k single requests.
TRACE=${TRACE:-traces/sampled_pfx_r0.004_le60k.jsonl}
MAXLEN=${MAXLEN:-65536}
REQLIMIT=${REQLIMIT:-600}
INFLIGHT=${INFLIGHT:-8}

OUT=v2/exp_b_capacity_knee/results
mkdir -p "$OUT" || {
  echo "cannot create results directory $OUT" >&2
  exit 1
}

# GPU KV-block counts to sweep (16 tok/block; 1 GiB ~= 683 blocks).
# floor 4096 blk (6.4GB, holds one 64k req) -> 24000 blk (37.7GB, full instance):
CAPS=${CAPS:-"4096 6144 8192 12288 16384 20480 24000"}

# PID of the currently running server; empty when none is running.
VLLM_PID=""
#######################################
# Start a vLLM OpenAI-compatible server with a fixed GPU KV-block budget and
# block until it is healthy.
# Globals:
#   PY, MODEL, GPU, PORT, MAXLEN, OUT, EP (read)
#   LAUNCH_POLL_S (read, optional): seconds between liveness checks, default 1
#   VLLM_PID (written): PID of the backgrounded server process
# Arguments:
#   $1 - value passed to --num-gpu-blocks-override
# Outputs:
#   Server stdout/stderr go to "$OUT/vllm_blk<$1>.log".
# Returns:
#   0 if wait_healthy reports the endpoint healthy; non-zero if it does not,
#   or if the server process exits before becoming healthy.
#######################################
launch() {
  local blocks=$1
  local poll=${LAUNCH_POLL_S:-1}
  local waiter
  # The endpoint is handed over as argv[1] rather than spliced into the Python
  # source, so quoting inside $EP can never change the code being run.
  local probe="import sys; sys.path.insert(0,'v2'); from common.util import wait_healthy; sys.exit(0 if wait_healthy(sys.argv[1],900) else 1)"

  # NOTE(review): the server binds 0.0.0.0 although this script only talks to
  # 127.0.0.1 -- confirm the wider exposure is wanted on this host.
  CUDA_VISIBLE_DEVICES="$GPU" VLLM_LOGGING_LEVEL=WARNING \
    "$PY" -m vllm.entrypoints.openai.api_server --model "$MODEL" \
    --host 0.0.0.0 --port "$PORT" --tensor-parallel-size 1 --trust-remote-code \
    --enable-prefix-caching --enforce-eager --dtype auto --max-model-len "$MAXLEN" \
    --num-gpu-blocks-override "$blocks" > "$OUT/vllm_blk$blocks.log" 2>&1 &
  VLLM_PID=$!

  # Run the health wait in the background so we can notice a server that has
  # already died (e.g. KV pool too small to boot) instead of sitting out the
  # full 900 s health timeout.
  "$PY" -c "$probe" "$EP" &
  waiter=$!

  while kill -0 "$waiter" 2>/dev/null; do
    if ! kill -0 "$VLLM_PID" 2>/dev/null; then
      echo "launch: server (pid $VLLM_PID) exited before becoming healthy" >&2
      kill "$waiter" 2>/dev/null
      wait "$waiter" 2>/dev/null
      return 1
    fi
    sleep "$poll"
  done

  # The waiter finished on its own; its exit status is the health verdict.
  wait "$waiter"
}
#######################################
# Stop the server started by launch, if any, and clear VLLM_PID.
# Sends SIGTERM, waits up to 40 s for the process to disappear, escalates to
# SIGKILL if it is still there, then reaps it.
# Globals:
#   VLLM_PID (read, then reset to "")
# Returns:
#   0 always, so it is safe to call on any path, including the EXIT trap.
#######################################
teardown() {
  local pid=${VLLM_PID:-}
  local i

  # Nothing launched, or already torn down: return without any waiting.
  [[ -n "$pid" ]] || return 0

  kill -TERM "$pid" 2>/dev/null
  for ((i = 0; i < 40; i++)); do
    kill -0 "$pid" 2>/dev/null || break
    sleep 1
  done

  # A server that outlives the grace period would keep the port (and whatever
  # else it holds) and break the next capacity point, so force it down.
  if kill -0 "$pid" 2>/dev/null; then
    echo "teardown: pid $pid still alive 40s after SIGTERM; sending SIGKILL" >&2
    kill -KILL "$pid" 2>/dev/null
  fi
  # Reap the child; failure here just means it was already collected.
  wait "$pid" 2>/dev/null

  # Short settle pause kept from the original script -- presumably gives the
  # GPU/port time to be released before the next launch; confirm if tuning.
  sleep 3
  VLLM_PID=""
  return 0
}

# Make sure a running server never outlives the script, whatever the exit path.
trap teardown EXIT
#######################################
# Print the server's current prefix-cache metrics as one JSON document.
# Calls common.util.scrape_prefix_cache (from the v2/ directory) on the
# endpoint and JSON-encodes whatever it returns.
# Globals:
#   PY, EP (read)
# Outputs:
#   JSON on stdout.
# Returns:
#   Exit status of the Python helper.
#######################################
scrape() {
  # $EP travels as argv[1] instead of being pasted into the Python source, so
  # a quote or backslash in the endpoint cannot break or alter the code.
  "$PY" -c "import sys,json; sys.path.insert(0,'v2'); from common.util import scrape_prefix_cache; print(json.dumps(scrape_prefix_cache(sys.argv[1])))" "$EP"
}
# Sweep: for each KV-block budget, boot a server, snapshot prefix-cache
# metrics (m0), replay the trace, snapshot again (m1), then stop the server.
# A failing point is reported and skipped so the rest of the sweep still runs.
PROBLEMS=""
# $CAPS is a whitespace-separated list; the unquoted expansion is intentional.
for BLK in $CAPS; do
  echo "==================== blocks=$BLK ===================="

  if ! launch "$BLK"; then
    echo "launch failed at $BLK (pool too small for model?)"
    tail -20 "$OUT/vllm_blk$BLK.log"
    teardown
    PROBLEMS+=" $BLK(launch)"
    continue
  fi

  # Persist the "before" snapshot right away so it survives an interrupted replay.
  if ! M0=$(scrape); then
    echo "warning: pre-replay metrics scrape failed at blocks=$BLK" >&2
    PROBLEMS+=" $BLK(m0)"
  fi
  printf '%s\n' "$M0" > "$OUT/m0_blk$BLK.json"

  if ! "$PY" -m replayer --trace "$TRACE" --output "$OUT/metrics_blk$BLK.jsonl" \
    --endpoint "$EP" --model "$MODEL" --max-inflight-sessions "$INFLIGHT" \
    --request-limit "$REQLIMIT"; then
    # Keep going: partial metrics are still written, but flag the point so it
    # is not mistaken for a clean run.
    echo "warning: replayer exited non-zero at blocks=$BLK" >&2
    PROBLEMS+=" $BLK(replay)"
  fi

  if ! M1=$(scrape); then
    echo "warning: post-replay metrics scrape failed at blocks=$BLK" >&2
    PROBLEMS+=" $BLK(m1)"
  fi
  printf '%s\n' "$M1" > "$OUT/m1_blk$BLK.json"

  teardown
done

if [[ -n "$PROBLEMS" ]]; then
  echo "capacity points with problems:$PROBLEMS" >&2
fi
echo "=== exp (b) sweep DONE ==="