Adds analysis/pd_sep_paper_section/ as the home for the "PD separation is net negative under agentic workloads" paper section: plot scripts for C1 (workload chars), C6 (roofline), C7 (routing-vs-PD-sep lever), the C6/C7 PDFs already rendered, and a README mapping candidate claims to required figures plus open re-run items. Removes --enforce-eager from bench.sh and all active launch scripts so cuda graphs are captured -- the prior methodology suppressed one of PD-sep's structural advantages (D-node fixed-shape decode). Legacy scripts under scripts/legacy/ are intentionally untouched as historical records. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
363 lines
14 KiB
Bash
Executable File
363 lines
14 KiB
Bash
Executable File
#!/bin/bash
|
|
# Standardized single-experiment harness with guaranteed fresh state.
#
# GUARANTEES:
#   1. All GPU processes killed before start (verified via nvidia-smi)
#   2. All GPU processes killed after finish (clean for next experiment)
#   3. Fresh vLLM instances + proxy for every run
#   4. All outputs saved to outputs/<tag>/ with metrics, breakdown, APC, GPU snapshot
#
# Usage:
#   bash scripts/bench.sh --tag my_experiment --mode baseline
#   bash scripts/bench.sh --tag my_experiment --mode elastic
#   bash scripts/bench.sh --tag my_experiment --mode baseline --policy lmetric
#   bash scripts/bench.sh --tag my_experiment --mode elastic --requests 1000
|
|
|
|
set -euo pipefail

# Resolve the directory that holds this script and the project root one level
# above it. CDPATH is cleared for the `cd` so a user-set CDPATH can neither
# redirect a relative path nor make `cd` print the directory into the captured
# output; `--` protects paths that begin with a dash.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname -- "$SCRIPT_DIR")"

# Tool and data locations. VENV_PATH, MODEL_PATH and TRACE may be supplied via
# the environment; otherwise the project-relative / $HOME defaults are used.
VENV="${VENV_PATH:-$PROJECT_DIR/.venv/bin}"
PYTHON="$VENV/python"
VLLM="$VENV/vllm"
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="${TRACE:-$PROJECT_DIR/traces/w600_r0.0015_st30.jsonl}"
|
|
|
|
# Defaults (each may be overridden by the matching flag in parse_args)
TAG=""                  # experiment name; required, selects outputs/<tag>/
MODE="baseline"         # baseline | elastic
POLICY="linear"         # linear | lmetric | unified
POLICY_SET=false        # becomes true once --policy is given explicitly
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090
REQUESTS=""             # empty = all requests in trace
HEAVY_THRESHOLD=20000
NO_OFFLOAD=false
OVERLOAD_FACTOR_ARG=""
MAX_BATCHED_TOKENS=""
MAX_OFFLOAD_INFLIGHT=""
CACHE_GATE_RATIO=""
OFFLOAD_MODE=""

#######################################
# Parse command-line flags into the global settings above.
# Globals (written): TAG MODE POLICY POLICY_SET N_INSTANCES REQUESTS TRACE
#   HEAVY_THRESHOLD NO_OFFLOAD OVERLOAD_FACTOR_ARG MAX_BATCHED_TOKENS
#   MAX_OFFLOAD_INFLIGHT CACHE_GATE_RATIO OFFLOAD_MODE
# Arguments: the script's command-line arguments
# Exits 1 on: an unknown flag, a flag missing its value, a --mode other than
#   baseline/elastic, or an --instances value that is not a positive integer.
#######################################
parse_args() {
  local flag
  while [[ $# -gt 0 ]]; do
    flag=$1

    # First pass: classify the flag and make sure a value is present, so a
    # truncated command line yields a clear message instead of a `set -u`
    # "unbound variable" abort on $2.
    case "$flag" in
      --no-offload)
        NO_OFFLOAD=true
        shift
        continue
        ;;
      --tag | --mode | --policy | --instances | --requests | --trace | \
        --heavy-threshold | --overload-factor | --max-batched-tokens | \
        --max-offload-inflight | --cache-gate-ratio | --offload-mode)
        if [[ $# -lt 2 ]]; then
          echo "Missing value for $flag"
          exit 1
        fi
        ;;
      *)
        echo "Unknown: $flag"
        exit 1
        ;;
    esac

    # Second pass: store the value.
    case "$flag" in
      --tag) TAG=$2 ;;
      --mode)
        # Only "elastic" is special-cased later; any other string would
        # silently behave as baseline, so reject typos here.
        case "$2" in
          baseline | elastic) MODE=$2 ;;
          *)
            echo "Invalid --mode '$2' (expected baseline or elastic)"
            exit 1
            ;;
        esac
        ;;
      --policy)
        POLICY=$2
        POLICY_SET=true
        ;;
      --instances)
        # Used in shell arithmetic and written unquoted into config.json.
        if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
          echo "Invalid --instances '$2' (expected a positive integer)"
          exit 1
        fi
        N_INSTANCES=$2
        ;;
      --requests) REQUESTS=$2 ;;
      --trace) TRACE=$2 ;;
      --heavy-threshold) HEAVY_THRESHOLD=$2 ;;
      --overload-factor) OVERLOAD_FACTOR_ARG=$2 ;;
      --max-batched-tokens) MAX_BATCHED_TOKENS=$2 ;;
      --max-offload-inflight) MAX_OFFLOAD_INFLIGHT=$2 ;;
      --cache-gate-ratio) CACHE_GATE_RATIO=$2 ;;
      --offload-mode) OFFLOAD_MODE=$2 ;;
    esac
    shift 2
  done
}

# Parse args
parse_args "$@"
|
|
|
|
# --tag is mandatory: it names the output directory. The usage text lists every
# flag the argument parser accepts.
if [[ -z "$TAG" ]]; then
  echo "Usage: bench.sh --tag NAME --mode {baseline|elastic} [--instances N] [--policy {linear|lmetric|unified}] [--requests N]"
  echo "       [--trace FILE] [--heavy-threshold N] [--no-offload] [--overload-factor VALUE]"
  echo "       [--max-batched-tokens N] [--max-offload-inflight N] [--cache-gate-ratio VALUE] [--offload-mode VALUE]"
  echo " Trace QPS is controlled by sample_trace.py --sample-ratio, not by bench.sh."
  exit 1
fi

# Elastic mode switches the default policy to "unified" unless the user chose
# one explicitly with --policy.
if [[ "$MODE" == "elastic" && "$POLICY_SET" == "false" ]]; then
  POLICY="unified"
fi

# Refuse to overwrite a previous run: metrics.jsonl is the marker that an
# experiment already wrote results under this tag.
OUTDIR="$PROJECT_DIR/outputs/$TAG"
if [[ -f "$OUTDIR/metrics.jsonl" ]]; then
  echo "[ERROR] Output directory $OUTDIR already exists with data. Use a different --tag."
  exit 1
fi
mkdir -p "$OUTDIR"
|
|
|
|
# Save experiment config

#######################################
# Escape a string so it can sit inside a double-quoted JSON value.
# Handles backslash, double quote, newline, carriage return and tab.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout (no trailing newline)
#######################################
json_escape() {
  local s=$1
  local bs='\' dq='"'
  # Patterns and replacements are quoted so every character is literal.
  s=${s//"$bs"/"$bs$bs"} # backslashes first, before more are introduced
  s=${s//"$dq"/"$bs$dq"}
  s=${s//$'\n'/"${bs}n"}
  s=${s//$'\r'/"${bs}r"}
  s=${s//$'\t'/"${bs}t"}
  printf '%s' "$s"
}

#######################################
# Write $OUTDIR/config.json describing this experiment.
# Globals (read): OUTDIR TAG MODE POLICY MODEL TRACE N_INSTANCES REQUESTS
#   HEAVY_THRESHOLD NO_OFFLOAD OVERLOAD_FACTOR_ARG MAX_BATCHED_TOKENS
#   MAX_OFFLOAD_INFLIGHT CACHE_GATE_RATIO OFFLOAD_MODE
# Returns: 1 if OUTDIR is empty/unset; otherwise the status of the write.
# String values are JSON-escaped. Besides the original keys, the file records
# the trace path and the three offload knobs, using the same "default"
# placeholder as max_batched_tokens when a knob was not given.
#######################################
write_config() {
  if [[ -z "${OUTDIR:-}" ]]; then
    echo "[ERROR] OUTDIR is not set; cannot write config.json"
    return 1
  fi
  cat > "$OUTDIR/config.json" << CONF
{
  "tag": "$(json_escape "$TAG")",
  "mode": "$(json_escape "$MODE")",
  "policy": "$(json_escape "$POLICY")",
  "model": "$(json_escape "$MODEL")",
  "trace": "$(json_escape "$TRACE")",
  "n_instances": $N_INSTANCES,
  "requests": "$(json_escape "${REQUESTS:-all}")",
  "heavy_threshold": $HEAVY_THRESHOLD,
  "no_offload": "$(json_escape "$NO_OFFLOAD")",
  "overload_factor": "$(json_escape "${OVERLOAD_FACTOR_ARG:-2.0}")",
  "max_batched_tokens": "$(json_escape "${MAX_BATCHED_TOKENS:-default}")",
  "max_offload_inflight": "$(json_escape "${MAX_OFFLOAD_INFLIGHT:-default}")",
  "cache_gate_ratio": "$(json_escape "${CACHE_GATE_RATIO:-default}")",
  "offload_mode": "$(json_escape "${OFFLOAD_MODE:-default}")",
  "timestamp": "$(date -Iseconds)",
  "hostname": "$(json_escape "$(hostname)")"
}
CONF
}

write_config
|
|
|
|
# ─── GPU Cleanup (verified) ────────────────────────────────────────────────
|
|
|
|
#######################################
# Kill every vLLM / proxy / GPU-monitor process, then anything still holding a
# /dev/nvidia* device, and verify via nvidia-smi that GPU memory is released.
# Outputs: progress and error messages on stdout
# Exits 1 when nvidia-smi cannot be queried or when more than 100 MB (summed
#   over all GPUs) is still allocated after the kills.
# SIGKILL is used directly: the harness promises a fresh state and does not
# wait for graceful shutdown.
#######################################
cleanup_gpu() {
  echo "[cleanup] Killing all vLLM/proxy/monitor processes..."
  local pid procs
  # pgrep -f matches against the full command line.
  procs=$(pgrep -f 'vllm serve|cache_aware_proxy|gpu_monitor' || true)
  for pid in $procs; do
    # Never signal this harness itself: its own arguments (e.g. a --tag value)
    # may contain one of the patterns.
    if [[ "$pid" == "$$" || "$pid" == "$BASHPID" ]]; then
      continue
    fi
    kill -9 "$pid" 2>/dev/null || true
  done
  sleep 3

  # Second sweep: whatever still has an NVIDIA device node open.
  local gpu_pids
  gpu_pids=$(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u | grep -v '^$' || true)
  if [[ -n "$gpu_pids" ]]; then
    echo "[cleanup] Killing GPU-holding PIDs: $gpu_pids"
    for pid in $gpu_pids; do
      kill -9 "$pid" 2>/dev/null || true
    done
    sleep 5
  fi

  # Sum memory.used over all GPUs; printf %d guarantees an integer for the
  # arithmetic comparison below (prints 0 when there is no input).
  local used
  if ! used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null \
    | awk '{s+=$1} END{printf "%d\n", s}'); then
    echo "[ERROR] nvidia-smi query failed; cannot verify that GPUs are free. Aborting."
    exit 1
  fi
  if (( ${used:-0} > 100 )); then
    echo "[ERROR] GPUs still have ${used}MB allocated after cleanup. Aborting."
    nvidia-smi --query-gpu=index,memory.used --format=csv,noheader || true
    exit 1
  fi
  echo "[cleanup] All GPUs verified free."
}
|
|
|
|
# On Ctrl-C / SIGTERM only announce and exit: `exit` fires the EXIT trap below,
# which performs the GPU cleanup exactly once. (Calling cleanup_gpu here as
# well would run the whole kill/verify sequence twice.)
trap 'echo "[bench.sh] Caught signal, cleaning up..."; exit 1' INT TERM
trap 'cleanup_gpu' EXIT
|
|
|
|
# ─── Launch vLLM instances ─────────────────────────────────────────────────
|
|
|
|
#######################################
# Start N_INSTANCES `vllm serve` processes (one per GPU index) in the
# background and block until each answers /health; in elastic mode also wait
# for each instance's bootstrap port to answer /query.
# Globals (read): N_INSTANCES MODE BASE_PORT OUTDIR VLLM MODEL MAX_BATCHED_TOKENS
# Outputs: progress on stdout; each instance logs to $OUTDIR/vllm_inst_<i>.log
# Exits 1 when an instance exits early, or does not become healthy / expose its
#   bootstrap endpoint within the retry budget. GPU cleanup is left to the EXIT
#   trap this script installs before launching anything.
#######################################
launch_instances() {
  echo "[launch] Starting $N_INSTANCES vLLM instances (mode=$MODE)..."

  local -a cmd pids=()
  local i port logfile tries

  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    logfile="$OUTDIR/vllm_inst_${i}.log"

    # Build the command as an array so every value stays one word. `env`
    # carries the per-instance environment and then execs vllm, so $! below is
    # the server's PID.
    cmd=(env "MASTER_PORT=$((29500 + i))" "CUDA_VISIBLE_DEVICES=$i")
    if [[ "$MODE" == "elastic" ]]; then
      cmd+=("PYTHONHASHSEED=42" "VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i))")
    fi
    cmd+=("$VLLM" serve "$MODEL"
      --host 0.0.0.0 --port "$port"
      --tensor-parallel-size 1
      --trust-remote-code --enable-prefix-caching
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000)
    if [[ "$MODE" == "elastic" ]]; then
      cmd+=(--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}')
    fi
    if [[ -n "$MAX_BATCHED_TOKENS" ]]; then
      cmd+=(--max-num-batched-tokens "$MAX_BATCHED_TOKENS")
    fi

    "${cmd[@]}" > "$logfile" 2>&1 &
    pids[i]=$!

    echo " inst_$i: GPU=$i port=$port"
    sleep 2 # stagger to avoid port collision
  done

  # Wait for health: up to 120 polls, 5 s apart, per instance.
  echo "[launch] Waiting for instances to become healthy..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    logfile="$OUTDIR/vllm_inst_${i}.log"
    tries=0
    while ! curl -sf "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
      tries=$((tries + 1))
      # Fail fast: a process that already exited will never become healthy.
      if ! kill -0 "${pids[i]}" 2>/dev/null; then
        echo "[FAIL] Instance $i (port $port) exited before becoming healthy. Log:"
        tail -n 10 "$logfile" || true
        exit 1
      fi
      if (( tries >= 120 )); then
        echo "[FAIL] Instance $i (port $port) failed to start. Log:"
        tail -n 10 "$logfile" || true
        exit 1
      fi
      sleep 5
    done
    echo " inst_$i healthy"
  done

  # Wait for bootstrap (elastic only): up to 60 polls, 2 s apart.
  if [[ "$MODE" == "elastic" ]]; then
    echo "[launch] Waiting for Mooncake bootstrap servers..."
    local bp
    for ((i = 0; i < N_INSTANCES; i++)); do
      bp=$((8998 + i))
      tries=0
      while ! curl -sf "http://127.0.0.1:$bp/query" > /dev/null 2>&1; do
        tries=$((tries + 1))
        if ! kill -0 "${pids[i]}" 2>/dev/null || (( tries >= 60 )); then
          echo "[FAIL] Bootstrap $bp failed"
          exit 1
        fi
        sleep 2
      done
      echo " bootstrap $bp ready"
    done
  fi
}
|
|
|
|
# ─── Launch proxy ──────────────────────────────────────────────────────────
|
|
|
|
#######################################
# Start scripts/cache_aware_proxy.py in the background in front of all
# instances and block until its /stats endpoint answers.
# Globals (read): MODE POLICY N_INSTANCES BASE_PORT PROXY_PORT PYTHON
#   PROJECT_DIR OUTDIR OVERLOAD_FACTOR_ARG MAX_OFFLOAD_INFLIGHT
#   CACHE_GATE_RATIO OFFLOAD_MODE NO_OFFLOAD HEAVY_THRESHOLD
# Outputs: progress on stdout; proxy output goes to $OUTDIR/proxy.log
# Exits 1 when the proxy exits early or is not ready after 30 polls, 2 s apart.
#   GPU cleanup is left to the EXIT trap this script installs beforehand.
#######################################
launch_proxy() {
  echo "[proxy] Starting (mode=$MODE, policy=$POLICY)..."

  # Array-built command line: each flag value stays a single word.
  local -a cmd=("$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" --combined)
  local i
  for ((i = 0; i < N_INSTANCES; i++)); do
    cmd+=("http://127.0.0.1:$((BASE_PORT + i))")
  done
  cmd+=(--port "$PROXY_PORT" --policy "$POLICY")

  # Optional tuning flags are forwarded only when the user supplied them.
  if [[ -n "$OVERLOAD_FACTOR_ARG" ]]; then
    cmd+=(--overload-factor "$OVERLOAD_FACTOR_ARG")
  fi
  if [[ -n "$MAX_OFFLOAD_INFLIGHT" ]]; then
    cmd+=(--max-offload-inflight "$MAX_OFFLOAD_INFLIGHT")
  fi
  if [[ -n "$CACHE_GATE_RATIO" ]]; then
    cmd+=(--cache-gate-ratio "$CACHE_GATE_RATIO")
  fi
  if [[ -n "$OFFLOAD_MODE" ]]; then
    cmd+=(--offload-mode "$OFFLOAD_MODE")
  fi

  if [[ "$MODE" == "elastic" ]]; then
    # Comma-separated bootstrap ports, one per instance (8998 + index).
    local bp_list=""
    for ((i = 0; i < N_INSTANCES; i++)); do
      bp_list="${bp_list:+$bp_list,}$((8998 + i))"
    done
    if [[ "$NO_OFFLOAD" != "true" ]]; then
      cmd+=(--offload --heavy-threshold "$HEAVY_THRESHOLD")
    fi
    cmd+=(--bootstrap-ports "$bp_list")
  fi

  "${cmd[@]}" > "$OUTDIR/proxy.log" 2>&1 &
  local proxy_pid=$!

  # Wait for proxy
  local tries=0
  while ! curl -sf "http://127.0.0.1:$PROXY_PORT/stats" > /dev/null 2>&1; do
    tries=$((tries + 1))
    # Fail fast when the proxy process has already died.
    if ! kill -0 "$proxy_pid" 2>/dev/null || (( tries >= 30 )); then
      echo "[FAIL] Proxy failed to start"
      exit 1
    fi
    sleep 2
  done
  echo "[proxy] Ready on port $PROXY_PORT"
}
|
|
|
|
# ─── Run benchmark ─────────────────────────────────────────────────────────
|
|
|
|
#######################################
# Replay the trace against the proxy while scripts/gpu_monitor.sh runs in the
# background.
# Globals (read):    PYTHON TRACE OUTDIR PROXY_PORT MODEL REQUESTS PROJECT_DIR
# Globals (written): GPU_MON_PID (PID of the background GPU monitor)
# Outputs: replayer output on stdout, also saved to $OUTDIR/replayer.log
# Returns: the replayer pipeline's exit status. The GPU monitor is stopped
#   before returning, also when the replayer failed; under `set -e` a non-zero
#   return still aborts the caller.
#######################################
run_benchmark() {
  local -a cmd=("$PYTHON" -m replayer
    --trace "$TRACE"
    --output "$OUTDIR/metrics.jsonl"
    --endpoint "http://localhost:$PROXY_PORT"
    --model "$MODEL")
  if [[ -n "$REQUESTS" ]]; then
    cmd+=(--request-limit "$REQUESTS")
    echo "[bench] Running $REQUESTS requests (trace-driven timing)..."
  else
    echo "[bench] Running all requests in trace (trace-driven timing)..."
  fi
  cmd+=(-v)

  # Start GPU monitor in background
  bash "$PROJECT_DIR/scripts/gpu_monitor.sh" "$OUTDIR/gpu_util.csv" 5 &
  GPU_MON_PID=$!

  # Capture the status instead of letting `set -e` abort here, so the monitor
  # below is always stopped.
  local rc=0
  "${cmd[@]}" 2>&1 | tee "$OUTDIR/replayer.log" || rc=$?

  # Stop GPU monitor
  kill "$GPU_MON_PID" 2>/dev/null || true
  wait "$GPU_MON_PID" 2>/dev/null || true
  if [[ -f "$OUTDIR/gpu_util.csv" ]]; then
    echo "[bench] GPU util saved: $(wc -l < "$OUTDIR/gpu_util.csv") samples"
  else
    echo "[bench] GPU util file missing: $OUTDIR/gpu_util.csv"
  fi
  return "$rc"
}
|
|
|
|
# ─── Collect artifacts ─────────────────────────────────────────────────────
|
|
|
|
#######################################
# Print the number following "<label>: " on the LAST line of a log that
# contains <label>; prints nothing when there is no such line or number.
# Arguments: $1 - label text, $2 - log file
#######################################
_last_hit_rate() {
  local label=$1 log=$2
  # sed instead of `grep -oP`: no dependency on PCRE support in grep.
  grep -- "$label" "$log" 2>/dev/null \
    | tail -n 1 \
    | sed -n "s/.*${label}: \([0-9.][0-9.]*\).*/\1/p"
}

#######################################
# Save post-run artifacts into $OUTDIR: proxy /breakdown and /stats, an
# nvidia-smi snapshot, and per-instance prefix-cache hit rates (apc.txt)
# scraped from the vLLM logs.
# Globals (read): OUTDIR PROXY_PORT N_INSTANCES
# Outputs: progress, warnings and the APC lines on stdout
# Collection is best effort: a failed fetch prints a warning and never aborts.
#######################################
collect_artifacts() {
  echo "[collect] Saving artifacts..."
  curl -sf "http://localhost:$PROXY_PORT/breakdown" > "$OUTDIR/breakdown.json" 2>/dev/null \
    || echo "[collect] WARNING: could not fetch /breakdown from proxy"
  curl -sf "http://localhost:$PROXY_PORT/stats" > "$OUTDIR/stats.json" 2>/dev/null \
    || echo "[collect] WARNING: could not fetch /stats from proxy"
  nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
    --format=csv > "$OUTDIR/gpu_snapshot.csv" 2>/dev/null \
    || echo "[collect] WARNING: could not capture nvidia-smi snapshot"

  # APC from vLLM logs
  local i log pch ech
  for ((i = 0; i < N_INSTANCES; i++)); do
    log="$OUTDIR/vllm_inst_${i}.log"
    # `|| true`: a log without a matching line must not trip `set -e`.
    pch=$(_last_hit_rate "Prefix cache hit rate" "$log" || true)
    ech=$(_last_hit_rate "External prefix cache hit rate" "$log" || true)
    # Prefix rate defaults to 0; the external rate is shown only when present.
    echo "inst_$i: prefix=${pch:-0}%${ech:+ ext=$ech%}"
  done | tee "$OUTDIR/apc.txt"
}
|
|
|
|
# ─── Summary ───────────────────────────────────────────────────────────────
|
|
|
|
#######################################
# Print a latency summary of $OUTDIR/metrics.jsonl: overall success rate and
# TTFT/TPOT/E2E percentiles, a per-input-length breakdown (WARM/MEDIUM/HEAVY),
# and up to five error rows.
# Globals (read): PYTHON OUTDIR TAG MODE POLICY
# Outputs: the summary on stdout
# Returns: the Python interpreter's exit status
# Values are passed as argv and the program is a quoted here-doc, so quotes or
# other special characters in the tag or paths cannot alter the Python source.
#######################################
print_summary() {
  "$PYTHON" - "$OUTDIR" "$TAG" "$MODE" "$POLICY" <<'PY'
import json
import sys

outdir, tag, mode, policy = sys.argv[1:5]


def pct(values, q):
    """Value at index int(q*n) of the sorted values (clamped); 0 if empty."""
    if not values:
        return 0
    ordered = sorted(values)
    return ordered[min(int(q * len(ordered)), len(ordered) - 1)]


# One JSON object per line; blank lines are skipped.
rows = []
with open(outdir + '/metrics.jsonl') as fh:
    for line in fh:
        if line.strip():
            rows.append(json.loads(line))

ok = [r for r in rows if not r.get('error')]
err = [r for r in rows if r.get('error')]
ttfts = [r['ttft_s'] for r in ok if r.get('ttft_s')]
tpots = [r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0]
e2es = [r['latency_s'] for r in ok]
# An empty metrics file reports 0.0% instead of dividing by zero.
ok_pct = len(ok) * 100 / len(rows) if rows else 0.0

print()
print('=' * 70)
print(' RESULT: %s (%s, %s)' % (tag, mode, policy))
print('=' * 70)
print(' OK=%d/%d (%.1f%%) TTFT50=%.3f TTFT90=%.3f TPOT90=%.4f E2E50=%.3f' % (
    len(ok), len(rows), ok_pct, pct(ttfts, .5), pct(ttfts, .9),
    pct(tpots, .9), pct(e2es, .5)))

# Breakdown by prompt length (input_length), buckets are [lo, hi).
for lo, hi, cl in [(0, 5000, 'WARM'), (5000, 20000, 'MEDIUM'), (20000, 200000, 'HEAVY')]:
    sub = [r for r in ok if lo <= r['input_length'] < hi and r.get('ttft_s')]
    if sub:
        t = [r['ttft_s'] for r in sub]
        tp = [r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s'] > 0]
        print(' %-8s n=%3d TTFT50=%.3f TTFT90=%.3f TPOT90=%.4f' % (
            cl, len(sub), pct(t, .5), pct(t, .9), pct(tp, .9) if tp else 0))

if err:
    print(' Errors (%d):' % len(err))
    for e in err[:5]:
        # Error rows may lack input_length; show 0 rather than crash.
        print(' input=%d %s' % (int(e.get('input_length') or 0),
                                str(e.get('error', ''))[:60]))
print(' Output: %s/' % outdir)
print('=' * 70)
PY
}
|
|
|
|
# ─── Main ──────────────────────────────────────────────────────────────────
|
|
|
|
# Run banner: tag, effective settings and start time.
banner_rule="================================================================"
echo "$banner_rule"
echo " bench.sh: $TAG"
echo " mode=$MODE policy=$POLICY requests=${REQUESTS:-all} overload_factor=${OVERLOAD_FACTOR_ARG:-2.0}"
echo " $(date)"
echo "$banner_rule"

# Execute the pipeline stages in order from the project root; under `set -e`
# the first failing stage aborts the run.
cd "$PROJECT_DIR"
for stage in cleanup_gpu launch_instances launch_proxy run_benchmark collect_artifacts print_summary; do
  "$stage"
done
# cleanup_gpu runs automatically via EXIT trap

echo "[done] $(date)"
|