Files
agentic-kvc/scripts/bench.sh
Gahow Wang 5816aad731 A3: vLLM scheduler patch for step-level JSONL log
When AGENTIC_STEP_LOG_PATH is set, the scheduler emits one JSONL line
per scheduler step with t_unix, worker_id, prefill/decode token
counts, n_running/n_waiting, preempted ids, and per-request phase
labels. No-op when the env var is unset, so production engines are
not impacted. bench.sh now threads AGENTIC_STEP_LOG_DIR through to
each per-engine launch so step logs end up at engine_${i}.jsonl.

Required by Batch 2 (PD-colo interference index) and Batch 5
(same-worker overlap attribution); engine /metrics polling cannot
provide per-step granularity.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 16:19:11 +08:00

417 lines
16 KiB
Bash
Executable File

#!/bin/bash
# Standardized single-experiment harness with guaranteed fresh state.
#
# GUARANTEES:
# 1. All GPU processes killed before start (verified via nvidia-smi)
# 2. All GPU processes killed after finish (clean for next experiment)
# 3. Fresh vLLM instances + proxy for every run
# 4. All outputs saved to outputs/<tag>/ with metrics, breakdown, APC, GPU snapshot
#
# Usage:
# bash scripts/bench.sh --tag my_experiment --mode baseline
# bash scripts/bench.sh --tag my_experiment --mode elastic
# bash scripts/bench.sh --tag my_experiment --mode baseline --policy lmetric
# bash scripts/bench.sh --tag my_experiment --mode elastic --requests 1000
# Strict mode: abort on errors, on unset variables and on failures inside
# pipelines.
set -o errexit -o nounset -o pipefail

# ─── Locations ─────────────────────────────────────────────────────────────
# The project root is the parent of the directory holding this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Environment overrides: VENV_PATH (directory holding the python and vllm
# executables), MODEL_PATH (model to serve) and TRACE (trace file to replay).
VENV="${VENV_PATH:-$PROJECT_DIR/.venv/bin}"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="${TRACE:-$PROJECT_DIR/traces/w600_r0.0015_st30.jsonl}"

# ─── Defaults (each can be changed by the matching command-line option) ────
TAG=""                   # --tag: names outputs/<tag>/ (required)
REQUESTS=""              # empty = all requests in trace

MODE="baseline"          # baseline | elastic | pdsep
PD_RATIO="4:4"           # P:D split when MODE=pdsep
POLICY="linear"          # linear | lmetric | unified
POLICY_SET=false         # becomes true once --policy is given explicitly

# Engine i listens on BASE_PORT + i; the proxy listens on PROXY_PORT.
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090

# Offload / proxy tuning. Empty values are simply not forwarded.
NO_OFFLOAD=false
HEAVY_THRESHOLD=20000
OVERLOAD_FACTOR_ARG=""
MAX_OFFLOAD_INFLIGHT=""
CACHE_GATE_RATIO=""
OFFLOAD_MODE=""

# Engine tuning.
MAX_BATCHED_TOKENS=""
EAGER=false              # add --enforce-eager back (cuda-graph ablation)
# Parse args
# Abort with a readable message when a value-taking option is the last word
# on the command line. Without this check "$2" trips `set -u` and the user only
# sees "unbound variable".
# Arguments: the remaining command-line words, option first.
_require_value() {
  if [ "$#" -lt 2 ]; then
    echo "[ERROR] Option $1 requires a value"
    exit 1
  fi
}

# Consume the command line left to right. Value-taking options shift two
# words, boolean flags shift one, and anything unrecognised is fatal.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --tag)                  _require_value "$@"; TAG="$2"; shift 2 ;;
    --mode)                 _require_value "$@"; MODE="$2"; shift 2 ;;
    # POLICY_SET records that the policy was chosen explicitly, so the
    # elastic-mode default does not override it later.
    --policy)               _require_value "$@"; POLICY="$2"; POLICY_SET=true; shift 2 ;;
    --instances)            _require_value "$@"; N_INSTANCES="$2"; shift 2 ;;
    --requests)             _require_value "$@"; REQUESTS="$2"; shift 2 ;;
    --trace)                _require_value "$@"; TRACE="$2"; shift 2 ;;
    --heavy-threshold)      _require_value "$@"; HEAVY_THRESHOLD="$2"; shift 2 ;;
    --no-offload)           NO_OFFLOAD=true; shift ;;
    --overload-factor)      _require_value "$@"; OVERLOAD_FACTOR_ARG="$2"; shift 2 ;;
    --max-batched-tokens)   _require_value "$@"; MAX_BATCHED_TOKENS="$2"; shift 2 ;;
    --max-offload-inflight) _require_value "$@"; MAX_OFFLOAD_INFLIGHT="$2"; shift 2 ;;
    --cache-gate-ratio)     _require_value "$@"; CACHE_GATE_RATIO="$2"; shift 2 ;;
    --offload-mode)         _require_value "$@"; OFFLOAD_MODE="$2"; shift 2 ;;
    --pd-ratio)             _require_value "$@"; PD_RATIO="$2"; shift 2 ;;
    --eager)                EAGER=true; shift ;;
    *) echo "Unknown: $1"; exit 1 ;;
  esac
done
# ─── Validate arguments ────────────────────────────────────────────────────
# --tag is mandatory because it names the output directory.
if [ -z "$TAG" ]; then
  echo "Usage: bench.sh --tag NAME --mode {baseline|elastic|pdsep}"
  echo " [--policy {linear|lmetric|unified}] [--instances N]"
  echo " [--pd-ratio P:D] (only with --mode pdsep, default 4:4)"
  echo " [--eager] (re-enable --enforce-eager for the cuda-graph ablation)"
  echo " [--requests N] [--trace PATH]"
  echo " [--heavy-threshold N] [--no-offload] [--overload-factor F] [--offload-mode MODE]"
  echo " [--max-batched-tokens N] [--max-offload-inflight N] [--cache-gate-ratio R]"
  echo " Trace QPS is controlled by sample_trace.py --sample-ratio, not by bench.sh."
  exit 1
fi

# Reject unknown modes up front. Otherwise a typo would launch plain engines
# behind the combined proxy while the results are labelled with the bogus mode.
case "$MODE" in
  baseline|elastic|pdsep) ;;
  *)
    echo "[ERROR] --mode must be one of baseline|elastic|pdsep (got '$MODE')"
    exit 1
    ;;
esac

# The instance count feeds port/GPU arithmetic and loop bounds everywhere.
if ! [[ "$N_INSTANCES" =~ ^[1-9][0-9]*$ ]]; then
  echo "[ERROR] --instances must be a positive integer (got '$N_INSTANCES')"
  exit 1
fi

# Elastic mode defaults to the unified policy unless --policy was given.
if [ "$MODE" = "elastic" ] && [ "$POLICY_SET" = "false" ]; then
  POLICY="unified"
fi

# pdsep: split the P:D ratio into prefill/decode instance counts, which must
# add up to the total number of instances.
if [ "$MODE" = "pdsep" ]; then
  pd_ratio_re='^([0-9]+):([0-9]+)$'
  if ! [[ "$PD_RATIO" =~ $pd_ratio_re ]]; then
    echo "[ERROR] --pd-ratio must look like P:D with non-negative integers (got '$PD_RATIO')"
    exit 1
  fi
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  N_P_INST=$((10#${BASH_REMATCH[1]}))
  N_D_INST=$((10#${BASH_REMATCH[2]}))
  if [ $((N_P_INST + N_D_INST)) -ne "$N_INSTANCES" ]; then
    echo "[ERROR] --pd-ratio $PD_RATIO must sum to --instances $N_INSTANCES"
    exit 1
  fi
fi
# Render a shell string as a JSON string literal, escaping backslashes and
# double quotes. Control characters are not handled; the values written here
# are tags, paths and numbers.
# Arguments: $1 - raw value
# Outputs:   the quoted literal on stdout, without a trailing newline
_json_str() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '"%s"' "$s"
}

# Every run writes into its own directory. Refuse to reuse one that already
# holds replay metrics so earlier results are not mixed with new ones.
OUTDIR="$PROJECT_DIR/outputs/$TAG"
if [ -d "$OUTDIR" ] && [ -f "$OUTDIR/metrics.jsonl" ]; then
  echo "[ERROR] Output directory $OUTDIR already exists with data. Use a different --tag."
  exit 1
fi
mkdir -p "$OUTDIR"

# Save experiment config. Besides the original keys this also records the
# trace and the remaining command-line knobs, so a result directory fully
# describes how it was produced. N_INSTANCES and HEAVY_THRESHOLD are emitted as
# bare JSON numbers; everything else is a string.
cat > "$OUTDIR/config.json" << CONF
{
  "tag": $(_json_str "$TAG"),
  "mode": $(_json_str "$MODE"),
  "policy": $(_json_str "$POLICY"),
  "model": $(_json_str "$MODEL"),
  "trace": $(_json_str "$TRACE"),
  "n_instances": $N_INSTANCES,
  "pd_ratio": $(_json_str "$PD_RATIO"),
  "requests": $(_json_str "${REQUESTS:-all}"),
  "heavy_threshold": $HEAVY_THRESHOLD,
  "no_offload": $(_json_str "$NO_OFFLOAD"),
  "offload_mode": $(_json_str "${OFFLOAD_MODE:-default}"),
  "overload_factor": $(_json_str "${OVERLOAD_FACTOR_ARG:-2.0}"),
  "max_offload_inflight": $(_json_str "${MAX_OFFLOAD_INFLIGHT:-default}"),
  "cache_gate_ratio": $(_json_str "${CACHE_GATE_RATIO:-default}"),
  "max_batched_tokens": $(_json_str "${MAX_BATCHED_TOKENS:-default}"),
  "eager": $(_json_str "$EAGER"),
  "timestamp": $(_json_str "$(date -Iseconds)"),
  "hostname": $(_json_str "$(hostname)")
}
CONF
# ─── GPU Cleanup (verified) ────────────────────────────────────────────────
# Kill every process this harness may have started, then everything still
# holding an NVIDIA device node, and verify via nvidia-smi that the GPUs are
# (nearly) empty.
# Exits the script with status 1 when more than 100 MB remain allocated in
# total, or when nvidia-smi cannot be queried at all.
cleanup_gpu() {
  echo "[cleanup] Killing all vLLM/proxy/monitor processes..."
  # pkill -f matches the full command line only and never matches itself, so
  # no `grep -v grep` filtering is needed. "No process matched" is not an error.
  pkill -9 -f 'vllm serve|cache_aware_proxy|gpu_monitor' 2>/dev/null || true
  sleep 3

  # Second pass: anything that still has a /dev/nvidia* node open.
  local gpu_pids
  gpu_pids=$(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u | grep -v '^$' || true)
  if [ -n "$gpu_pids" ]; then
    echo "[cleanup] Killing GPU-holding PIDs: $gpu_pids"
    echo "$gpu_pids" | xargs -r kill -9 2>/dev/null || true
    sleep 5
  fi

  # Verification. nvidia-smi is queried on its own so that a failing query is
  # reported explicitly instead of aborting silently through errexit/pipefail.
  local smi_out
  if ! smi_out=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null); then
    echo "[ERROR] nvidia-smi query failed; cannot verify that GPUs are free. Aborting."
    exit 1
  fi
  local used
  # Sum the per-GPU values; "+ 0" makes awk print 0 when there is no input.
  used=$(awk '{s += $1} END {print s + 0}' <<< "$smi_out")
  if [ "${used:-0}" -gt 100 ]; then
    echo "[ERROR] GPUs still have ${used}MB allocated after cleanup. Aborting."
    nvidia-smi --query-gpu=index,memory.used --format=csv,noheader
    exit 1
  fi
  echo "[cleanup] All GPUs verified free."
}
# On INT/TERM only announce and exit. The `exit` fires the EXIT trap below,
# which performs the single cleanup pass. Calling cleanup_gpu here as well
# would run the whole kill/sleep/verify sequence twice.
trap 'echo "[bench.sh] Caught signal, cleaning up..."; exit 1' INT TERM
# Runs on every exit path (success, failure, signal) so the GPUs are left
# clean for the next experiment.
trap 'cleanup_gpu' EXIT
# ─── Launch vLLM instances ─────────────────────────────────────────────────
# Start N_INSTANCES vLLM engines in the background, one per GPU, and block
# until all of them are ready.
# Globals read: N_INSTANCES, MODE, BASE_PORT, MODEL, VLLM, OUTDIR,
#               MAX_BATCHED_TOKENS, EAGER, AGENTIC_STEP_LOG_DIR (optional env).
# Engine i uses GPU i, HTTP port BASE_PORT+i, MASTER_PORT 29500+i and, in the
# Mooncake modes, bootstrap port 8998+i. Its output goes to
# $OUTDIR/vllm_inst_<i>.log.
# Exits 1 (after cleanup_gpu) when an engine dies during startup or does not
# become healthy in time.
launch_instances() {
  echo "[launch] Starting $N_INSTANCES vLLM instances (mode=$MODE)..."

  # elastic and pdsep both run Mooncake kv_both; difference is only the
  # proxy routing. baseline runs plain vLLM (no Mooncake).
  local use_mooncake=false
  if [ "$MODE" = "elastic" ] || [ "$MODE" = "pdsep" ]; then
    use_mooncake=true
  fi

  # Optional: when AGENTIC_STEP_LOG_DIR is exported, point each engine at its
  # own JSONL file so the patched scheduler emits per-step records.
  local step_log_dir="${AGENTIC_STEP_LOG_DIR:-}"
  if [ -n "$step_log_dir" ]; then
    mkdir -p "$step_log_dir"
  fi

  local i port tries
  local -a env_vars serve_args
  local -a pids=()

  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))

    # Environment and arguments are collected in arrays so that paths with
    # spaces survive intact and both modes share a single launch command.
    env_vars=("MASTER_PORT=$((29500 + i))" "CUDA_VISIBLE_DEVICES=$i")
    if [ -n "$step_log_dir" ]; then
      env_vars+=(
        "AGENTIC_STEP_LOG_PATH=$step_log_dir/engine_${i}.jsonl"
        "AGENTIC_WORKER_ID=engine_${i}"
      )
    fi

    serve_args=(
      serve "$MODEL"
      --host 0.0.0.0 --port "$port"
      --tensor-parallel-size 1
      --trust-remote-code --enable-prefix-caching
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
    )

    if [ "$use_mooncake" = "true" ]; then
      env_vars+=(
        "PYTHONHASHSEED=42"
        "VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i))"
      )
      serve_args+=(
        --kv-transfer-config
        '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
      )
    fi

    # Optional vLLM args
    if [ -n "$MAX_BATCHED_TOKENS" ]; then
      serve_args+=(--max-num-batched-tokens "$MAX_BATCHED_TOKENS")
    fi
    if [ "$EAGER" = "true" ]; then
      serve_args+=(--enforce-eager)
    fi

    env "${env_vars[@]}" "$VLLM" "${serve_args[@]}" \
      > "$OUTDIR/vllm_inst_${i}.log" 2>&1 &
    pids+=("$!")

    echo " inst_$i: GPU=$i port=$port"
    sleep 2 # stagger to avoid port collision
  done

  # Wait for health: poll /health every 5 s, at most 120 times per engine.
  echo "[launch] Waiting for instances to become healthy..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    tries=0
    while ! curl -sf "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
      tries=$((tries + 1))
      # Fail fast when the engine process is already gone instead of polling
      # a dead port for the full timeout.
      if ! kill -0 "${pids[i]}" 2>/dev/null; then
        echo "[FAIL] Instance $i (port $port) exited during startup. Log:"
        tail -10 "$OUTDIR/vllm_inst_${i}.log"
        cleanup_gpu
        exit 1
      fi
      if [ "$tries" -ge 120 ]; then
        echo "[FAIL] Instance $i (port $port) failed to start. Log:"
        tail -10 "$OUTDIR/vllm_inst_${i}.log"
        cleanup_gpu
        exit 1
      fi
      sleep 5
    done
    echo " inst_$i healthy"
  done

  # Wait for bootstrap (Mooncake modes only): poll /query on each bootstrap
  # port every 2 s, at most 60 times.
  if [ "$use_mooncake" = "true" ]; then
    echo "[launch] Waiting for Mooncake bootstrap servers..."
    local bp
    for ((i = 0; i < N_INSTANCES; i++)); do
      bp=$((8998 + i))
      tries=0
      while ! curl -sf "http://127.0.0.1:$bp/query" > /dev/null 2>&1; do
        tries=$((tries + 1))
        if [ "$tries" -ge 60 ]; then
          echo "[FAIL] Bootstrap $bp failed"
          cleanup_gpu
          exit 1
        fi
        sleep 2
      done
      echo " bootstrap $bp ready"
    done
  fi
}
# ─── Launch proxy ──────────────────────────────────────────────────────────
# Start scripts/cache_aware_proxy.py in the background on PROXY_PORT and wait
# until its /stats endpoint answers.
# Globals read: MODE, POLICY, N_INSTANCES, BASE_PORT, PROXY_PORT, PYTHON,
#               PROJECT_DIR, OUTDIR, NO_OFFLOAD, HEAVY_THRESHOLD,
#               OVERLOAD_FACTOR_ARG, MAX_OFFLOAD_INFLIGHT, CACHE_GATE_RATIO,
#               OFFLOAD_MODE, and N_P_INST (pdsep only).
# Proxy output goes to $OUTDIR/proxy.log.
# Exits 1 (after cleanup_gpu) when the proxy does not come up in time.
launch_proxy() {
  echo "[proxy] Starting (mode=$MODE, policy=$POLICY)..."

  local i tries

  # Policy and tuning flags. Empty settings are not forwarded, so the proxy
  # keeps its own defaults for them.
  local -a policy_args=(--policy "$POLICY")
  if [ -n "$OVERLOAD_FACTOR_ARG" ]; then
    policy_args+=(--overload-factor "$OVERLOAD_FACTOR_ARG")
  fi
  if [ -n "$MAX_OFFLOAD_INFLIGHT" ]; then
    policy_args+=(--max-offload-inflight "$MAX_OFFLOAD_INFLIGHT")
  fi
  if [ -n "$CACHE_GATE_RATIO" ]; then
    policy_args+=(--cache-gate-ratio "$CACHE_GATE_RATIO")
  fi
  if [ -n "$OFFLOAD_MODE" ]; then
    policy_args+=(--offload-mode "$OFFLOAD_MODE")
  fi

  # Backend topology flags.
  local -a routing_args=()
  if [ "$MODE" = "pdsep" ]; then
    # First N_P_INST instances are prefill (with their bootstrap ports),
    # remaining N_D_INST are decode.
    for ((i = 0; i < N_P_INST; i++)); do
      routing_args+=(--prefill "http://127.0.0.1:$((BASE_PORT + i))" "$((8998 + i))")
    done
    for ((i = N_P_INST; i < N_INSTANCES; i++)); do
      routing_args+=(--decode "http://127.0.0.1:$((BASE_PORT + i))")
    done
  else
    routing_args+=(--combined)
    for ((i = 0; i < N_INSTANCES; i++)); do
      routing_args+=("http://127.0.0.1:$((BASE_PORT + i))")
    done

    if [ "$MODE" = "elastic" ]; then
      # Comma-separated bootstrap ports, one per instance.
      local bp_list=""
      for ((i = 0; i < N_INSTANCES; i++)); do
        bp_list="${bp_list:+$bp_list,}$((8998 + i))"
      done
      # --no-offload keeps the bootstrap ports but leaves offloading off.
      if [ "$NO_OFFLOAD" != "true" ]; then
        policy_args+=(--offload --heavy-threshold "$HEAVY_THRESHOLD")
      fi
      policy_args+=(--bootstrap-ports "$bp_list")
    fi
  fi

  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # older bash versions.
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    ${routing_args[@]+"${routing_args[@]}"} \
    --port "$PROXY_PORT" \
    "${policy_args[@]}" \
    > "$OUTDIR/proxy.log" 2>&1 &

  # Wait for proxy: poll /stats every 2 s, at most 30 times.
  tries=0
  while ! curl -sf "http://127.0.0.1:$PROXY_PORT/stats" > /dev/null 2>&1; do
    tries=$((tries + 1))
    if [ "$tries" -ge 30 ]; then
      echo "[FAIL] Proxy failed to start"
      cleanup_gpu
      exit 1
    fi
    sleep 2
  done
  echo "[proxy] Ready on port $PROXY_PORT"
}
# ─── Run benchmark ─────────────────────────────────────────────────────────
# Replay the trace against the proxy while a GPU monitor runs in the
# background.
# Globals read:    PYTHON, PROJECT_DIR, OUTDIR, TRACE, MODEL, PROXY_PORT, REQUESTS
# Globals written: GPU_MON_PID (PID of the background monitor)
# Outputs: $OUTDIR/metrics.jsonl (written by the replayer), $OUTDIR/replayer.log
#          (copy of the replayer's console output), $OUTDIR/gpu_util.csv
#          (written by gpu_monitor.sh).
run_benchmark() {
  # The command is built as an array so paths containing spaces stay intact
  # and the optional request limit needs no string splitting.
  local -a replay_cmd=(
    "$PYTHON" -m replayer
    --trace "$TRACE"
    --output "$OUTDIR/metrics.jsonl"
    --endpoint "http://localhost:$PROXY_PORT"
    --model "$MODEL"
  )
  if [ -n "$REQUESTS" ]; then
    replay_cmd+=(--request-limit "$REQUESTS")
    echo "[bench] Running $REQUESTS requests (trace-driven timing)..."
  else
    echo "[bench] Running all requests in trace (trace-driven timing)..."
  fi
  replay_cmd+=(-v)

  # Start GPU monitor in background.
  # NOTE(review): the trailing 5 is presumably the sampling interval in
  # seconds. Confirm against gpu_monitor.sh.
  bash "$PROJECT_DIR/scripts/gpu_monitor.sh" "$OUTDIR/gpu_util.csv" 5 &
  GPU_MON_PID=$!

  "${replay_cmd[@]}" 2>&1 | tee "$OUTDIR/replayer.log"

  # Stop GPU monitor. It may already be gone, which is fine.
  kill "$GPU_MON_PID" 2>/dev/null || true
  wait "$GPU_MON_PID" 2>/dev/null || true

  # Report the number of lines in the CSV, or 0 when the monitor never
  # produced the file.
  local n_samples=0
  if [ -f "$OUTDIR/gpu_util.csv" ]; then
    n_samples=$(wc -l < "$OUTDIR/gpu_util.csv")
  fi
  echo "[bench] GPU util saved: $n_samples samples"
}
# ─── Collect artifacts ─────────────────────────────────────────────────────
# Gather post-run artifacts into $OUTDIR. Every step is best effort: a failing
# curl or nvidia-smi never aborts the run.
#   breakdown.json, stats.json - bodies of the proxy's /breakdown and /stats
#   gpu_snapshot.csv           - one nvidia-smi utilisation/memory snapshot
#   apc.txt                    - last reported prefix-cache hit rate per engine
# Globals read: OUTDIR, PROXY_PORT, N_INSTANCES
collect_artifacts() {
  echo "[collect] Saving artifacts..."

  local endpoint
  for endpoint in breakdown stats; do
    curl -sf "http://localhost:$PROXY_PORT/$endpoint" > "$OUTDIR/$endpoint.json" 2>/dev/null || true
  done

  nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
    --format=csv > "$OUTDIR/gpu_snapshot.csv" 2>/dev/null || true

  # APC from vLLM logs. For each engine take the last log line that mentions
  # a hit rate and pull the number out of it. The local rate falls back to
  # "0"; the external rate is only printed when one was found.
  local idx engine_log local_rate external_rate suffix
  for ((idx = 0; idx < N_INSTANCES; idx++)); do
    engine_log="$OUTDIR/vllm_inst_${idx}.log"
    local_rate=$(grep "Prefix cache hit rate" "$engine_log" 2>/dev/null | tail -1 | grep -oP "Prefix cache hit rate: \K[0-9.]+" || echo "0")
    external_rate=$(grep "External prefix cache hit rate" "$engine_log" 2>/dev/null | tail -1 | grep -oP "External prefix cache hit rate: \K[0-9.]+" || echo "")
    suffix=""
    if [ -n "$external_rate" ]; then
      suffix=" ext=$external_rate%"
    fi
    printf 'inst_%s: prefix=%s%%%s\n' "$idx" "$local_rate" "$suffix"
  done | tee "$OUTDIR/apc.txt"
}
# ─── Summary ───────────────────────────────────────────────────────────────
# Print a latency summary of $OUTDIR/metrics.jsonl: overall success rate and
# percentiles, the same percentiles per input-length class, and up to five
# error rows.
# Globals read: PYTHON, OUTDIR, TAG, MODE, POLICY
# The shell values reach Python through argv rather than being pasted into
# the program text, so quotes or backslashes in a tag or path cannot break
# (or inject into) the Python source. The quoted heredoc delimiter keeps the
# shell from expanding anything inside the program.
print_summary() {
  "$PYTHON" - "$OUTDIR" "$TAG" "$MODE" "$POLICY" <<'PY'
import json
import sys

outdir, tag, mode, policy = sys.argv[1:5]

# One JSON object per line; blank lines are skipped.
with open(outdir + '/metrics.jsonl') as fh:
    rows = [json.loads(line) for line in fh if line.strip()]

ok = [r for r in rows if not r.get('error')]
err = [r for r in rows if r.get('error')]


def pct(values, q):
    """q-quantile of values by index, or 0 for an empty list."""
    if not values:
        return 0
    ordered = sorted(values)
    return ordered[min(int(q * len(ordered)), len(ordered) - 1)]


ttfts = [r['ttft_s'] for r in ok if r.get('ttft_s')]
tpots = [r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0]
e2es = [r['latency_s'] for r in ok]
# An empty metrics file must not raise ZeroDivisionError.
ok_pct = len(ok) * 100 / len(rows) if rows else 0.0

print()
print('=' * 70)
print(' RESULT: %s (%s, %s)' % (tag, mode, policy))
print('=' * 70)
print(' OK=%d/%d (%.1f%%) TTFT50=%.3f TTFT90=%.3f TPOT90=%.4f E2E50=%.3f' % (
    len(ok), len(rows), ok_pct,
    pct(ttfts, .5), pct(ttfts, .9), pct(tpots, .9), pct(e2es, .5)))

# Per-class breakdown by input length; only rows with a TTFT are counted.
for lo, hi, cl in [(0, 5000, 'WARM'), (5000, 20000, 'MEDIUM'), (20000, 200000, 'HEAVY')]:
    sub = [r for r in ok if lo <= r['input_length'] < hi and r.get('ttft_s')]
    if sub:
        t = [r['ttft_s'] for r in sub]
        tp = [r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s'] > 0]
        print(' %-8s n=%3d TTFT50=%.3f TTFT90=%.3f TPOT90=%.4f' % (
            cl, len(sub), pct(t, .5), pct(t, .9), pct(tp, .9)))

if err:
    print(' Errors (%d):' % len(err))
    for e in err[:5]:
        print(' input=%d %s' % (e['input_length'], str(e.get('error', ''))[:60]))

print(' Output: %s/' % outdir)
print('=' * 70)
PY
}
# ─── Main ──────────────────────────────────────────────────────────────────
# Banner: tag, effective settings and start time.
cat << BANNER
================================================================
 bench.sh: $TAG
 mode=$MODE policy=$POLICY requests=${REQUESTS:-all} overload_factor=${OVERLOAD_FACTOR_ARG:-2.0}
 $(date)
================================================================
BANNER

cd "$PROJECT_DIR"

# Run the stages strictly in order: start from clean GPUs, bring up the
# engines and the proxy, replay the trace, then save and summarise results.
# A failing stage aborts the script (errexit).
for stage in cleanup_gpu launch_instances launch_proxy run_benchmark collect_artifacts print_summary; do
  "$stage"
done

# cleanup_gpu runs automatically via EXIT trap
printf '[done] %s\n' "$(date)"