agentic-kvc/v2/exp_b_capacity_knee/run_sweep.sh

#!/bin/bash
# Exp (b): capacity -> realized-APC -> latency knee. Runs on dash0, one H20.
# errexit (-e) is not enabled: the sweep loop handles a failed launch itself
# and moves on to the next capacity point.
set -uo pipefail

# Every relative path below (.venv, traces/, v2/) resolves against the repo
# root, so abort instead of running the sweep from the wrong directory.
cd /home/admin/cpfs/wjh/agentic-kv || {
    echo "cannot cd to /home/admin/cpfs/wjh/agentic-kv" >&2
    exit 1
}
PY=.venv/bin/python
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct

# Overridable from the environment: GPU index and the port the server binds.
GPU=${GPU:-1}
PORT=${PORT:-8200}
EP=http://127.0.0.1:$PORT

# Filtered trace (inputs <= 60k tok) so max-model-len can be 64k and the low
# capacity points still boot; raw trace has p90=89k/max=167k single requests.
TRACE=${TRACE:-traces/sampled_pfx_r0.004_le60k.jsonl}
MAXLEN=${MAXLEN:-65536}
# Passed to the replayer as --request-limit / --max-inflight-sessions.
REQLIMIT=${REQLIMIT:-600}
INFLIGHT=${INFLIGHT:-8}

# Per-capacity server logs, replay metrics and cache snapshots land here.
OUT=v2/exp_b_capacity_knee/results
mkdir -p "$OUT" || {
    echo "cannot create output directory $OUT" >&2
    exit 1
}

# GPU KV-block counts to sweep (16 tok/block; 1 GiB ~= 683 blocks).
# floor 4096 blk (6.4GB, holds one 64k req) -> 24000 blk (37.7GB, full instance):
CAPS=${CAPS:-"4096 6144 8192 12288 16384 20480 24000"}

# PID of the currently running vLLM server; empty when none is running.
VLLM_PID=""

#######################################
# Start a vLLM OpenAI-compatible server in the background with its KV-cache
# pool forced to a fixed number of GPU blocks, then wait until it is healthy.
# Globals:   GPU, PY, MODEL, PORT, MAXLEN, OUT, EP (read); VLLM_PID (written)
# Arguments: $1 - block count passed to --num-gpu-blocks-override
# Outputs:   server stdout/stderr go to "$OUT/vllm_blk$1.log"
# Returns:   0 once common.util.wait_healthy reports success; non-zero if it
#            reports failure or the server process exits first. The caller
#            is responsible for calling teardown in either case.
#######################################
launch() {
    local blocks=$1
    local waiter
    CUDA_VISIBLE_DEVICES=$GPU VLLM_LOGGING_LEVEL=WARNING \
    $PY -m vllm.entrypoints.openai.api_server --model "$MODEL" \
        --host 0.0.0.0 --port "$PORT" --tensor-parallel-size 1 --trust-remote-code \
        --enable-prefix-caching --enforce-eager --dtype auto --max-model-len "$MAXLEN" \
        --num-gpu-blocks-override "$blocks" > "$OUT/vllm_blk$blocks.log" 2>&1 &
    VLLM_PID=$!

    # The health wait runs in the background so the server PID can be watched
    # at the same time: a server that dies during startup would otherwise cost
    # the full 900 s passed to wait_healthy before the failure is reported.
    # The endpoint travels via argv rather than being pasted into the source.
    $PY -c "import sys; sys.path.insert(0,'v2'); from common.util import wait_healthy; \
sys.exit(0 if wait_healthy(sys.argv[1],900) else 1)" "$EP" &
    waiter=$!

    while kill -0 "$waiter" 2>/dev/null; do
        if ! kill -0 "$VLLM_PID" 2>/dev/null; then
            kill "$waiter" 2>/dev/null
            wait "$waiter" 2>/dev/null
            echo "launch: vLLM (pid $VLLM_PID) exited before becoming healthy" >&2
            return 1
        fi
        sleep 0.2
    done
    # The waiter has finished; its exit status is the health verdict.
    wait "$waiter"
}
#######################################
# Stop the server recorded in VLLM_PID, if any, and clear VLLM_PID.
# Sends SIGTERM, polls for the process to disappear, escalates to SIGKILL if
# it is still alive after the grace period, then pauses before returning.
# Globals:   VLLM_PID (read, then cleared)
#            TEARDOWN_GRACE_S  (optional, default 40) seconds to wait after SIGTERM
#            TEARDOWN_SETTLE_S (optional, default 3)  pause after the process is gone
# Returns:   always 0
#######################################
teardown() {
    local pid=${VLLM_PID:-}
    local grace=${TEARDOWN_GRACE_S:-40}
    local settle=${TEARDOWN_SETTLE_S:-3}
    local i

    # Nothing was launched (e.g. the EXIT trap after a clean sweep): do not
    # poll or sleep for a process that does not exist.
    [ -n "$pid" ] || return 0

    kill -TERM "$pid" 2>/dev/null
    # Poll five times per second so a prompt shutdown is noticed quickly.
    for ((i = 0; i < grace * 5; i++)); do
        kill -0 "$pid" 2>/dev/null || break
        sleep 0.2
    done

    # A server that ignored SIGTERM would keep the port and GPU busy for the
    # next capacity point, so force it down.
    if kill -0 "$pid" 2>/dev/null; then
        echo "teardown: pid $pid still alive ${grace}s after SIGTERM; sending SIGKILL" >&2
        kill -KILL "$pid" 2>/dev/null
        for ((i = 0; i < 25; i++)); do
            kill -0 "$pid" 2>/dev/null || break
            sleep 0.2
        done
    fi

    sleep "$settle"
    VLLM_PID=""
    return 0
}
# Make sure a server is never left running when the script exits.
trap teardown EXIT

# Print, as one JSON document on stdout, whatever
# common.util.scrape_prefix_cache returns for the endpoint in $EP.
# Globals: PY, EP (read). Exit status is that of the Python process.
scrape() {
    local snippet="import sys,json; sys.path.insert(0,'v2'); from common.util import scrape_prefix_cache; print(json.dumps(scrape_prefix_cache('$EP')))"
    $PY -c "$snippet"
}

# Sweep: for each KV-block capacity, boot the server, replay the trace, and
# snapshot the scrape output before (m0) and after (m1) the replay.
PROBLEMS=""
# CAPS is a whitespace-separated list, so it is expanded unquoted on purpose.
for BLK in $CAPS; do
    echo "==================== blocks=$BLK ===================="
    if ! launch "$BLK"; then
        echo "launch failed at $BLK (pool too small for model?)"
        tail -20 "$OUT/vllm_blk$BLK.log"
        teardown
        PROBLEMS="$PROBLEMS $BLK:launch"
        continue
    fi

    # Write the baseline right away so it survives an interrupted replay.
    M0=$(scrape)
    printf '%s\n' "$M0" > "$OUT/m0_blk$BLK.json"

    # A failed replay is reported but does not stop the sweep; the m1 snapshot
    # is still taken so partial results can be inspected.
    if ! $PY -m replayer --trace "$TRACE" --output "$OUT/metrics_blk$BLK.jsonl" \
            --endpoint "$EP" --model "$MODEL" \
            --max-inflight-sessions "$INFLIGHT" --request-limit "$REQLIMIT"; then
        echo "WARNING: replayer exited non-zero at blocks=$BLK; metrics may be incomplete" >&2
        PROBLEMS="$PROBLEMS $BLK:replay"
    fi

    M1=$(scrape)
    printf '%s\n' "$M1" > "$OUT/m1_blk$BLK.json"
    teardown
done

# Failures are summarised on stderr; the exit status is left unchanged because
# launch failures at small capacities are an anticipated outcome of the sweep.
if [ -n "$PROBLEMS" ]; then
    echo "capacity points with problems:$PROBLEMS" >&2
fi
echo "=== exp (b) sweep DONE ==="