Files
agentic-kvc/scripts/bench.sh
Gahow Wang bf037594c4 Production-realistic baseline: APC 67.5%, TPOT +139% from interference
Updated methodology:
- Window+thin sampling preserves cross-session sharing (48% vs 16%)
- --max-single-turn-ratio 0.3 boosts multi-turn to 70%
- --window-seconds 600 for 10-min contiguous window
- Trace-driven replay (no session limit, no time compression)
- Daily config: --requests 850 (~13 min, APC~76%)

Key result: TPOT p90=0.175s (vs 0.073s in legacy 1-req/GPU setup),
confirming prefill-decode interference is real at production concurrency.
APC 67.5% (vs 44%) from better KV reuse preservation.

Also fixed KV reuse breakdown: 62% intra-session / 38% cross-session
(was incorrectly reported as 91% / 9%).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-23 15:44:34 +08:00

341 lines
13 KiB
Bash
Executable File

#!/bin/bash
# Standardized single-experiment harness with guaranteed fresh state.
#
# GUARANTEES:
# 1. All GPU processes killed before start (verified via nvidia-smi)
# 2. All GPU processes killed after finish (clean for next experiment)
# 3. Fresh vLLM instances + proxy for every run
# 4. All outputs saved to outputs/<tag>/ with metrics, breakdown, APC, GPU snapshot
#
# Usage:
# bash scripts/bench.sh --tag my_experiment --mode baseline
# bash scripts/bench.sh --tag my_experiment --mode elastic
# bash scripts/bench.sh --tag my_experiment --mode baseline --policy lmetric
# bash scripts/bench.sh --tag my_experiment --mode elastic --requests 1000
# Strict mode: stop on command errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# The project root is the parent of the directory that holds this script.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJECT_DIR=$(dirname "$SCRIPT_DIR")

# Executables come from the virtualenv bin directory (override: VENV_PATH).
VENV="${VENV_PATH:-${PROJECT_DIR}/.venv/bin}"
PYTHON="${VENV}/python"
VLLM="${VENV}/vllm"

# Model to serve (override: MODEL_PATH) and trace to replay (override: TRACE).
MODEL="${MODEL_PATH:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="${TRACE:-${PROJECT_DIR}/traces/sampled_1000req_seed42.jsonl}"

# Defaults — each of the option-backed values can be changed on the command line.
TAG=""                  # required; names the output directory
MODE="baseline"         # baseline | elastic
POLICY="linear"         # linear | lmetric
REQUESTS=""             # empty = all requests in trace
N_INSTANCES=8
HEAVY_THRESHOLD=20000
NO_OFFLOAD=false
OVERLOAD_FACTOR_ARG=""
MAX_BATCHED_TOKENS=""

# Port layout: instance i listens on BASE_PORT + i; the proxy on PROXY_PORT.
BASE_PORT=8000
PROXY_PORT=9090
# Parse args

# Print an argument error on stderr and abort the script with status 1.
_bench_arg_error() {
  echo "[ERROR] $*" >&2
  exit 1
}

#######################################
# Parse bench.sh command-line options into the global settings.
# Globals (written): TAG, MODE, POLICY, N_INSTANCES, REQUESTS, TRACE,
#                    HEAVY_THRESHOLD, NO_OFFLOAD, OVERLOAD_FACTOR_ARG,
#                    MAX_BATCHED_TOKENS
# Arguments: the script's command-line arguments
# Exits:     1 on an unknown option, on a value-taking option that is the
#            last argument, on a --mode other than baseline/elastic, or on a
#            --instances value that is not a positive integer.
#######################################
parse_args() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt=$1
    case "$opt" in
      --no-offload)
        NO_OFFLOAD=true
        shift
        continue
        ;;
      --tag | --mode | --policy | --instances | --requests | --trace | \
        --heavy-threshold | --overload-factor | --max-batched-tokens)
        # Without this check, `set -u` would abort with a bare
        # "$2: unbound variable" that does not name the offending option.
        [[ $# -ge 2 ]] || _bench_arg_error "Option $opt requires a value"
        ;;
      *)
        echo "Unknown: $opt"
        exit 1
        ;;
    esac

    case "$opt" in
      --tag) TAG=$2 ;;
      --mode)
        # Every later check is `MODE = elastic`, so a typo would otherwise run
        # a baseline experiment while being recorded under the mistyped name.
        case "$2" in
          baseline | elastic) MODE=$2 ;;
          *) _bench_arg_error "Invalid --mode '$2' (expected baseline or elastic)" ;;
        esac
        ;;
      --policy) POLICY=$2 ;;
      --instances)
        # Used in arithmetic and written as a bare JSON number.
        [[ "$2" =~ ^[1-9][0-9]*$ ]] ||
          _bench_arg_error "Invalid --instances '$2' (expected a positive integer)"
        N_INSTANCES=$2
        ;;
      --requests) REQUESTS=$2 ;;
      --trace) TRACE=$2 ;;
      --heavy-threshold) HEAVY_THRESHOLD=$2 ;;
      --overload-factor) OVERLOAD_FACTOR_ARG=$2 ;;
      --max-batched-tokens) MAX_BATCHED_TOKENS=$2 ;;
    esac
    shift 2
  done
}

parse_args "$@"
# A tag is mandatory: it names the directory that receives every artifact.
if [[ -z "$TAG" ]]; then
  cat << 'USAGE'
Usage: bench.sh --tag NAME --mode {baseline|elastic} [--instances N] [--policy {linear|lmetric}] [--requests N]
 Trace QPS is controlled by sample_trace.py --sample-ratio, not by bench.sh.
USAGE
  exit 1
fi

OUTDIR="$PROJECT_DIR/outputs/$TAG"

# Never overwrite a previous run: a directory that already holds metrics
# belongs to an earlier experiment with the same tag.
if [[ -d "$OUTDIR" && -f "$OUTDIR/metrics.jsonl" ]]; then
  echo "[ERROR] Output directory $OUTDIR already exists with data. Use a different --tag."
  exit 1
fi

mkdir -p "$OUTDIR"
# Save experiment config
# Records the effective settings of this run in $OUTDIR/config.json so the
# results can be traced back to how they were produced.
# - The heredoc delimiter is unquoted, so every $VAR and $(...) below is
#   expanded by the shell at the moment the file is written.
# - Values are interpolated verbatim (no JSON escaping). n_instances and
#   heavy_threshold are emitted as bare tokens; everything else, including
#   no_offload ("true"/"false"), is emitted as a JSON string.
# - Optional flags that were not given are recorded as "all" (requests),
#   "2.0" (overload_factor) and "default" (max_batched_tokens).
#   NOTE(review): "2.0" is presumably the proxy's built-in overload factor;
#   confirm against cache_aware_proxy.py.
cat > "$OUTDIR/config.json" << CONF
{
"tag": "$TAG",
"mode": "$MODE",
"policy": "$POLICY",
"model": "$MODEL",
"n_instances": $N_INSTANCES,
"requests": "${REQUESTS:-all}",
"heavy_threshold": $HEAVY_THRESHOLD,
"no_offload": "$NO_OFFLOAD",
"overload_factor": "${OVERLOAD_FACTOR_ARG:-2.0}",
"max_batched_tokens": "${MAX_BATCHED_TOKENS:-default}",
"timestamp": "$(date -Iseconds)",
"hostname": "$(hostname)"
}
CONF
# ─── GPU Cleanup (verified) ────────────────────────────────────────────────
#######################################
# Kill every vLLM/proxy process and any other process holding an NVIDIA
# device node, then verify through nvidia-smi that the GPUs are free.
# Globals:   none
# Arguments: none
# Outputs:   progress messages on stdout; an error on stderr when nvidia-smi
#            cannot be queried
# Exits:     1 when nvidia-smi cannot be queried, or when more than 100 MB
#            (summed over all GPUs) is still allocated after the kills
#######################################
cleanup_gpu() {
  echo "[cleanup] Killing all vLLM/proxy processes..."
  local pid
  # pgrep -f matches the full command line and never reports itself, so no
  # `grep -v grep` filtering is needed. SIGKILL is deliberate: the harness
  # promises a clean slate, not a graceful shutdown.
  while IFS= read -r pid; do
    [[ -n "$pid" ]] || continue
    kill -9 "$pid" 2> /dev/null || true
  done < <(pgrep -f 'vllm serve|cache_aware_proxy' 2> /dev/null || true)
  sleep 3

  # Kill any remaining GPU holders
  local gpu_pids
  gpu_pids=$(fuser /dev/nvidia* 2> /dev/null | tr ' ' '\n' | sort -u | grep -v '^$' || true)
  if [[ -n "$gpu_pids" ]]; then
    echo "[cleanup] Killing GPU-holding PIDs: $gpu_pids"
    echo "$gpu_pids" | xargs -r kill -9 2> /dev/null || true
    sleep 5
  fi

  # Verify GPUs are free. A failing nvidia-smi must not be mistaken for
  # "0 MB in use": without a reading nothing has been verified.
  local readings used
  if ! readings=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2> /dev/null); then
    echo "[ERROR] nvidia-smi query failed; cannot verify that GPUs are free. Aborting." >&2
    exit 1
  fi
  # Sum the per-GPU values; non-numeric rows count as 0.
  used=$(awk '{s += $1} END {printf "%d\n", s}' <<< "$readings")
  if [[ "$used" -gt 100 ]]; then
    echo "[ERROR] GPUs still have ${used}MB allocated after cleanup. Aborting."
    nvidia-smi --query-gpu=index,memory.used --format=csv,noheader || true
    exit 1
  fi
  echo "[cleanup] All GPUs verified free."
}
# ─── Launch vLLM instances ─────────────────────────────────────────────────
#######################################
# Start N_INSTANCES `vllm serve` processes (instance i on GPU i, port
# BASE_PORT + i, log $OUTDIR/vllm_inst_<i>.log) and block until each one
# answers /health. In elastic mode each instance additionally gets a
# --kv-transfer-config and a VLLM_MOONCAKE_BOOTSTRAP_PORT of 8998 + i, and
# the function also waits for every bootstrap port to answer /query.
# Globals (read): N_INSTANCES, MODE, MAX_BATCHED_TOKENS, BASE_PORT, OUTDIR,
#                 VLLM, MODEL
# Arguments: none
# Outputs:   progress on stdout; the log tail of a failing instance
# Exits:     1 (after cleanup_gpu) when an instance exits or stays unhealthy
#            for 120 polls, or a bootstrap port stays silent for 60 polls
#######################################
launch_instances() {
  echo "[launch] Starting $N_INSTANCES vLLM instances (mode=$MODE)..."

  # Options shared by every instance. An array keeps each argument intact
  # (the JSON blob, paths with spaces) instead of relying on word splitting.
  local -a serve_opts=(
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching --enforce-eager
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
  )
  if [[ "$MODE" == "elastic" ]]; then
    serve_opts+=(--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}')
  fi
  if [[ -n "$MAX_BATCHED_TOKENS" ]]; then
    serve_opts+=(--max-num-batched-tokens "$MAX_BATCHED_TOKENS")
  fi

  local -a inst_pids=() inst_env=()
  local i port logfile tries bp
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    logfile="$OUTDIR/vllm_inst_${i}.log"
    inst_env=("MASTER_PORT=$((29500 + i))" "CUDA_VISIBLE_DEVICES=$i")
    if [[ "$MODE" == "elastic" ]]; then
      inst_env+=("VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i))")
    fi
    # env execs the server, so $! is the PID of the vllm process itself.
    env "${inst_env[@]}" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      "${serve_opts[@]}" \
      > "$logfile" 2>&1 &
    inst_pids+=("$!")
    echo " inst_$i: GPU=$i port=$port"
    sleep 2 # stagger to avoid port collision
  done

  # Wait for health
  echo "[launch] Waiting for instances to become healthy..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    logfile="$OUTDIR/vllm_inst_${i}.log"
    tries=0
    while ! curl -sf "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
      # Fail fast when the server process is already gone instead of
      # polling a dead port for the full timeout.
      if ! kill -0 "${inst_pids[i]}" 2> /dev/null; then
        echo "[FAIL] Instance $i (port $port) exited before becoming healthy. Log:"
        tail -10 "$logfile" || true
        cleanup_gpu
        exit 1
      fi
      tries=$((tries + 1))
      if [[ $tries -ge 120 ]]; then
        echo "[FAIL] Instance $i (port $port) failed to start. Log:"
        tail -10 "$logfile" || true
        cleanup_gpu
        exit 1
      fi
      sleep 5
    done
    echo " inst_$i healthy"
  done

  # Wait for bootstrap (elastic only)
  if [[ "$MODE" == "elastic" ]]; then
    echo "[launch] Waiting for Mooncake bootstrap servers..."
    for ((i = 0; i < N_INSTANCES; i++)); do
      bp=$((8998 + i))
      tries=0
      while ! curl -sf "http://127.0.0.1:$bp/query" > /dev/null 2>&1; do
        tries=$((tries + 1))
        if [[ $tries -ge 60 ]]; then
          echo "[FAIL] Bootstrap $bp failed"
          cleanup_gpu
          exit 1
        fi
        sleep 2
      done
      echo " bootstrap $bp ready"
    done
  fi
}
# ─── Launch proxy ──────────────────────────────────────────────────────────
#######################################
# Start scripts/cache_aware_proxy.py in front of all instances and block
# until its /stats endpoint answers.
# Globals (read): MODE, POLICY, N_INSTANCES, BASE_PORT, PROXY_PORT,
#                 OVERLOAD_FACTOR_ARG, NO_OFFLOAD, HEAVY_THRESHOLD, PYTHON,
#                 PROJECT_DIR, OUTDIR
# Arguments: none
# Outputs:   progress on stdout; proxy output goes to $OUTDIR/proxy.log
# Exits:     1 (after cleanup_gpu) when the proxy exits or does not answer
#            within 30 polls
#######################################
launch_proxy() {
  echo "[proxy] Starting (mode=$MODE, policy=$POLICY)..."

  # Arrays keep every argument a single word, so the interpreter path and
  # option values survive whitespace.
  local -a backends=() proxy_opts=(--policy "$POLICY")
  local i
  for ((i = 0; i < N_INSTANCES; i++)); do
    backends+=("http://127.0.0.1:$((BASE_PORT + i))")
  done

  if [[ -n "$OVERLOAD_FACTOR_ARG" ]]; then
    proxy_opts+=(--overload-factor "$OVERLOAD_FACTOR_ARG")
  fi

  if [[ "$MODE" == "elastic" ]]; then
    # Comma-separated bootstrap ports, one per instance (8998 + i).
    local bp_list=""
    for ((i = 0; i < N_INSTANCES; i++)); do
      bp_list+="${bp_list:+,}$((8998 + i))"
    done
    if [[ "$NO_OFFLOAD" != "true" ]]; then
      proxy_opts+=(--offload --heavy-threshold "$HEAVY_THRESHOLD")
    fi
    proxy_opts+=(--bootstrap-ports "$bp_list")
  fi

  # The ${arr[@]+...} form keeps an empty backend list safe under `set -u`
  # on older bash releases.
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined ${backends[@]+"${backends[@]}"} \
    --port "$PROXY_PORT" \
    "${proxy_opts[@]}" \
    > "$OUTDIR/proxy.log" 2>&1 &
  local proxy_pid=$!

  # Wait for proxy
  local tries=0
  while ! curl -sf "http://127.0.0.1:$PROXY_PORT/stats" > /dev/null 2>&1; do
    tries=$((tries + 1))
    # Give up at once when the proxy process has already exited.
    if ! kill -0 "$proxy_pid" 2> /dev/null || [[ $tries -ge 30 ]]; then
      echo "[FAIL] Proxy failed to start"
      tail -10 "$OUTDIR/proxy.log" || true
      cleanup_gpu
      exit 1
    fi
    sleep 2
  done
  echo "[proxy] Ready on port $PROXY_PORT"
}
# ─── Run benchmark ─────────────────────────────────────────────────────────
#######################################
# Replay the trace against the proxy while scripts/gpu_monitor.sh samples
# the GPUs in the background.
# Globals (read):    REQUESTS, PROJECT_DIR, OUTDIR, PYTHON, TRACE,
#                    PROXY_PORT, MODEL
# Globals (written): GPU_MON_PID (PID of the background GPU monitor)
# Arguments: none
# Outputs:   replayer output on stdout and in $OUTDIR/replayer.log;
#            metrics in $OUTDIR/metrics.jsonl; samples in $OUTDIR/gpu_util.csv
# Returns:   the status of the replayer pipeline; the GPU monitor is
#            stopped before returning, whether or not the replay succeeded
#######################################
run_benchmark() {
  local -a request_args=()
  if [[ -n "$REQUESTS" ]]; then
    request_args=(--request-limit "$REQUESTS")
    echo "[bench] Running $REQUESTS requests (trace-driven timing)..."
  else
    echo "[bench] Running all requests in trace (trace-driven timing)..."
  fi

  # Start GPU monitor in background
  bash "$PROJECT_DIR/scripts/gpu_monitor.sh" "$OUTDIR/gpu_util.csv" 5 &
  GPU_MON_PID=$!

  # Capture the status instead of letting errexit/pipefail abort here:
  # aborting at this point would leave the monitor running as an orphan.
  local rc=0
  "$PYTHON" -m replayer \
    --trace "$TRACE" \
    --output "$OUTDIR/metrics.jsonl" \
    --endpoint "http://localhost:$PROXY_PORT" \
    --model "$MODEL" \
    ${request_args[@]+"${request_args[@]}"} \
    -v 2>&1 | tee "$OUTDIR/replayer.log" || rc=$?

  # Stop GPU monitor
  kill "$GPU_MON_PID" 2> /dev/null || true
  wait "$GPU_MON_PID" 2> /dev/null || true

  if [[ -f "$OUTDIR/gpu_util.csv" ]]; then
    echo "[bench] GPU util saved: $(wc -l < "$OUTDIR/gpu_util.csv") samples"
  else
    echo "[bench] GPU monitor produced no $OUTDIR/gpu_util.csv" >&2
  fi

  if [[ "$rc" -ne 0 ]]; then
    echo "[bench] replayer exited with status $rc" >&2
    return "$rc"
  fi
}
# ─── Collect artifacts ─────────────────────────────────────────────────────
# Best-effort snapshot of the proxy's /breakdown and /stats endpoints and of
# the GPU state, plus one line per instance with the most recent prefix-cache
# hit rates found in its vLLM log. The per-instance lines are printed and
# also written to $OUTDIR/apc.txt.
# Globals (read): PROXY_PORT, OUTDIR, N_INSTANCES
collect_artifacts() {
  echo "[collect] Saving artifacts..."

  # Every fetch is allowed to fail; a missing artifact must not stop the run.
  local endpoint
  for endpoint in breakdown stats; do
    curl -sf "http://localhost:$PROXY_PORT/$endpoint" > "$OUTDIR/$endpoint.json" 2>/dev/null || true
  done
  nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
    --format=csv > "$OUTDIR/gpu_snapshot.csv" 2>/dev/null || true

  # APC from vLLM logs
  local idx inst_log local_rate external_rate report
  for ((idx = 0; idx < N_INSTANCES; idx++)); do
    inst_log="$OUTDIR/vllm_inst_${idx}.log"
    # Last reported value wins; "0" when the log has no such line.
    local_rate=$(grep "Prefix cache hit rate" "$inst_log" 2>/dev/null | tail -1 | grep -oP "Prefix cache hit rate: \K[0-9.]+" || echo "0")
    # The external rate is optional and omitted from the line when absent.
    external_rate=$(grep "External prefix cache hit rate" "$inst_log" 2>/dev/null | tail -1 | grep -oP "External prefix cache hit rate: \K[0-9.]+" || echo "")
    report="inst_$idx: prefix=$local_rate%"
    if [[ -n "$external_rate" ]]; then
      report="$report ext=$external_rate%"
    fi
    echo "$report"
  done | tee "$OUTDIR/apc.txt"
}
# ─── Summary ───────────────────────────────────────────────────────────────
#######################################
# Print a latency summary computed from $OUTDIR/metrics.jsonl: overall
# success rate, TTFT/TPOT/E2E percentiles, a per-input-length breakdown
# (WARM < 5000 <= MEDIUM < 20000 <= HEAVY < 200000) and up to five errors.
# Globals (read): PYTHON, OUTDIR, TAG, MODE, POLICY
# Arguments: none
# Outputs:   the summary on stdout
# Returns:   1 when the metrics file does not exist; otherwise the status
#            of the Python interpreter
#######################################
print_summary() {
  local metrics="$OUTDIR/metrics.jsonl"
  if [[ ! -f "$metrics" ]]; then
    echo "[summary] $metrics not found; nothing to summarize" >&2
    return 1
  fi

  # Values are handed over as arguments rather than pasted into the Python
  # source, so quotes or backslashes in a path or tag cannot break (or
  # alter) the program. The quoted delimiter disables shell expansion.
  "$PYTHON" - "$metrics" "$TAG" "$MODE" "$POLICY" "$OUTDIR" << 'PY'
import json
import sys

metrics_path, tag, mode, policy, outdir = sys.argv[1:6]

# Blank lines (e.g. a trailing newline) are skipped rather than parsed.
with open(metrics_path) as fh:
    rows = [json.loads(line) for line in fh if line.strip()]

ok = [r for r in rows if not r.get('error')]
err = [r for r in rows if r.get('error')]


def pct(values, q):
    """Element at rank int(q * n) of the sorted values; 0 when empty."""
    if not values:
        return 0
    ordered = sorted(values)
    return ordered[min(int(q * len(ordered)), len(ordered) - 1)]


ttfts = [r['ttft_s'] for r in ok if r.get('ttft_s')]
tpots = [r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0]
e2es = [r['latency_s'] for r in ok]
# An empty metrics file must yield an (empty) summary, not ZeroDivisionError.
ok_pct = len(ok) * 100 / len(rows) if rows else 0.0

print()
print('=' * 70)
print(' RESULT: %s (%s, %s)' % (tag, mode, policy))
print('=' * 70)
print(' OK=%d/%d (%.1f%%) TTFT50=%.3f TTFT90=%.3f TPOT90=%.4f E2E50=%.3f' % (
    len(ok), len(rows), ok_pct, pct(ttfts, .5), pct(ttfts, .9),
    pct(tpots, .9), pct(e2es, .5)))
for lo, hi, label in [(0, 5000, 'WARM'), (5000, 20000, 'MEDIUM'), (20000, 200000, 'HEAVY')]:
    sub = [r for r in ok if lo <= r['input_length'] < hi and r.get('ttft_s')]
    if sub:
        t = [r['ttft_s'] for r in sub]
        tp = [r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s'] > 0]
        print(' %-8s n=%3d TTFT50=%.3f TTFT90=%.3f TPOT90=%.4f' % (
            label, len(sub), pct(t, .5), pct(t, .9), pct(tp, .9)))
if err:
    print(' Errors (%d):' % len(err))
    for e in err[:5]:
        # Failed requests may lack input_length; report 0 instead of crashing.
        print(' input=%d %s' % (e.get('input_length') or 0, str(e.get('error', ''))[:60]))
print(' Output: %s/' % outdir)
print('=' * 70)
PY
}
# ─── Main ──────────────────────────────────────────────────────────────────
echo "================================================================"
echo " bench.sh: $TAG"
echo " mode=$MODE policy=$POLICY requests=${REQUESTS:-all} overload_factor=${OVERLOAD_FACTOR_ARG:-2.0}"
echo " $(date)"
echo "================================================================"
cd "$PROJECT_DIR"

# Exit handler: when the script ends with a non-zero status (a failing step
# under errexit, or Ctrl-C / SIGTERM through the traps below), stop the GPU
# monitor if one was started and release the GPUs, so the "all GPU processes
# killed after finish" guarantee also holds for failed runs. The original
# exit status is preserved.
_bench_on_exit() {
  local rc=$?
  trap - EXIT INT TERM
  if [[ "$rc" -ne 0 ]]; then
    echo "[abort] bench.sh exiting with status $rc; releasing GPUs..." >&2
    if [[ -n "${GPU_MON_PID:-}" ]]; then
      kill "$GPU_MON_PID" 2> /dev/null || true
    fi
    # Subshell: cleanup_gpu may call `exit 1` itself, which must not replace
    # the status being reported. Failure paths that already ran cleanup_gpu
    # simply repeat it here, which is harmless.
    (cleanup_gpu) || true
  fi
  exit "$rc"
}

cleanup_gpu

# Installed only after the initial cleanup succeeded: before that point this
# run has started nothing that would need releasing.
trap _bench_on_exit EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

launch_instances
launch_proxy
run_benchmark
collect_artifacts
print_summary
cleanup_gpu
echo "[done] $(date)"