#!/bin/bash
# Standardized single-experiment harness with guaranteed fresh state.
#
# GUARANTEES:
#   1. All GPU processes killed before start (verified via nvidia-smi)
#   2. All GPU processes killed after finish (clean for next experiment)
#   3. Fresh vLLM instances + proxy for every run
#   4. All outputs saved to outputs/<tag>/ with metrics, breakdown, APC,
#      GPU snapshot
#
# Usage:
#   bash scripts/bench.sh --tag my_experiment --mode baseline
#   bash scripts/bench.sh --tag my_experiment --mode elastic
#   bash scripts/bench.sh --tag my_experiment --mode baseline --policy lmetric
#   bash scripts/bench.sh --tag my_experiment --mode elastic --requests 1000
#
# Environment overrides:
#   VENV_PATH   directory holding the `python` and `vllm` executables
#   MODEL_PATH  model passed to `vllm serve` and to the replayer
#   TRACE       request trace replayed by the benchmark (also --trace)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
VENV="${VENV_PATH:-$PROJECT_DIR/.venv/bin}"
PYTHON="$VENV/python"
VLLM="$VENV/vllm"

MODEL="${MODEL_PATH:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="${TRACE:-$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl}"

# Defaults
TAG=""
MODE="baseline"        # baseline | elastic
POLICY="linear"        # linear | lmetric
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090
REQUESTS=""            # empty = all requests in trace
HEAVY_THRESHOLD=20000
NO_OFFLOAD=false
OVERLOAD_FACTOR_ARG=""
MAX_BATCHED_TOKENS=""

# Instance i uses BASE_PORT+i for HTTP, MASTER_BASE_PORT+i as MASTER_PORT and
# (elastic mode only) BOOTSTRAP_BASE_PORT+i as its Mooncake bootstrap port.
readonly BOOTSTRAP_BASE_PORT=8998
readonly MASTER_BASE_PORT=29500

# PIDs of the launched vLLM instances, indexed by instance number.
INSTANCE_PIDS=()

# ─── Small helpers ─────────────────────────────────────────────────────────

# Print a message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the usage text to stdout.
usage() {
  echo "Usage: bench.sh --tag NAME --mode {baseline|elastic} [--instances N] [--policy {linear|lmetric}] [--requests N]"
  echo "  Trace QPS is controlled by sample_trace.py --sample-ratio, not by bench.sh."
}

# Abort unless the option in $1 is followed by a value.
# Call as: need_value "$@" (with the option still in $1).
need_value() {
  (( $# >= 2 )) || die "[ERROR] Option $1 requires a value."
}

# Succeed when $1 is a non-empty string of decimal digits.
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Write $1 to stdout escaped for use inside a double-quoted JSON string.
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# ─── Parse args ────────────────────────────────────────────────────────────
while (( $# > 0 )); do
  case "$1" in
    --tag)                need_value "$@"; TAG=$2; shift 2 ;;
    --mode)               need_value "$@"; MODE=$2; shift 2 ;;
    --policy)             need_value "$@"; POLICY=$2; shift 2 ;;
    --instances)          need_value "$@"; N_INSTANCES=$2; shift 2 ;;
    --requests)           need_value "$@"; REQUESTS=$2; shift 2 ;;
    --trace)              need_value "$@"; TRACE=$2; shift 2 ;;
    --heavy-threshold)    need_value "$@"; HEAVY_THRESHOLD=$2; shift 2 ;;
    --no-offload)         NO_OFFLOAD=true; shift ;;
    --overload-factor)    need_value "$@"; OVERLOAD_FACTOR_ARG=$2; shift 2 ;;
    --max-batched-tokens) need_value "$@"; MAX_BATCHED_TOKENS=$2; shift 2 ;;
    -h | --help)          usage; exit 0 ;;
    *)                    echo "Unknown: $1" >&2; exit 1 ;;
  esac
done

if [[ -z "$TAG" ]]; then
  usage >&2
  exit 1
fi

# ─── Validate before touching any process or file ──────────────────────────
case "$MODE" in
  baseline | elastic) ;;
  *) die "[ERROR] --mode must be 'baseline' or 'elastic' (got '$MODE')." ;;
esac

is_uint "$N_INSTANCES" \
  || die "[ERROR] --instances must be a positive integer (got '$N_INSTANCES')."
# Force base 10 so a value such as "08" is neither rejected as bad octal by
# shell arithmetic nor written into config.json as an invalid JSON number.
N_INSTANCES=$((10#$N_INSTANCES))
(( N_INSTANCES >= 1 )) || die "[ERROR] --instances must be at least 1."

is_uint "$HEAVY_THRESHOLD" \
  || die "[ERROR] --heavy-threshold must be a non-negative integer (got '$HEAVY_THRESHOLD')."
HEAVY_THRESHOLD=$((10#$HEAVY_THRESHOLD))

if [[ -n "$REQUESTS" ]]; then
  is_uint "$REQUESTS" \
    || die "[ERROR] --requests must be a non-negative integer (got '$REQUESTS')."
  REQUESTS=$((10#$REQUESTS))
fi

if [[ -n "$MAX_BATCHED_TOKENS" ]]; then
  is_uint "$MAX_BATCHED_TOKENS" \
    || die "[ERROR] --max-batched-tokens must be a non-negative integer (got '$MAX_BATCHED_TOKENS')."
  MAX_BATCHED_TOKENS=$((10#$MAX_BATCHED_TOKENS))
fi

# Fail before the (slow) instance launch rather than after it.
[[ -e "$TRACE" ]] || die "[ERROR] Trace file not found: $TRACE"
[[ -x "$PYTHON" ]] || die "[ERROR] Python interpreter not found or not executable: $PYTHON"
[[ -x "$VLLM" ]] || die "[ERROR] vllm executable not found or not executable: $VLLM"

OUTDIR="$PROJECT_DIR/outputs/$TAG"
if [[ -d "$OUTDIR" && -f "$OUTDIR/metrics.jsonl" ]]; then
  echo "[ERROR] Output directory $OUTDIR already exists with data. Use a different --tag." >&2
  exit 1
fi
mkdir -p "$OUTDIR"

# Save experiment config. String values are JSON-escaped so that quotes or
# backslashes in the tag, paths or hostname cannot produce invalid JSON.
cat > "$OUTDIR/config.json" << CONF
{
  "tag": "$(json_escape "$TAG")",
  "mode": "$(json_escape "$MODE")",
  "policy": "$(json_escape "$POLICY")",
  "model": "$(json_escape "$MODEL")",
  "n_instances": $N_INSTANCES,
  "requests": "${REQUESTS:-all}",
  "heavy_threshold": $HEAVY_THRESHOLD,
  "no_offload": "$NO_OFFLOAD",
  "overload_factor": "$(json_escape "${OVERLOAD_FACTOR_ARG:-2.0}")",
  "max_batched_tokens": "${MAX_BATCHED_TOKENS:-default}",
  "timestamp": "$(date -Iseconds)",
  "hostname": "$(json_escape "$(hostname)")"
}
CONF

# ─── GPU Cleanup (verified) ────────────────────────────────────────────────

#######################################
# Kill every vLLM / proxy / GPU-monitor process, then every process that
# still holds a /dev/nvidia* device, and verify via nvidia-smi that the
# total used GPU memory is at most 100 MB.
# Outputs: progress messages on stdout, errors on stderr.
# Exits:   with status 1 if nvidia-smi cannot be queried or memory is
#          still allocated after the cleanup.
#######################################
cleanup_gpu() {
  echo "[cleanup] Killing all vLLM/proxy/monitor processes..."
  local pid
  while IFS= read -r pid; do
    # The pattern is matched against full command lines, so this script
    # would match itself if e.g. its --tag contained "gpu_monitor".
    if [[ "$pid" == "$$" ]]; then
      continue
    fi
    kill -9 "$pid" 2>/dev/null || true
  done < <(pgrep -f 'vllm serve|cache_aware_proxy|gpu_monitor' || true)
  sleep 3

  local gpu_pids
  gpu_pids=$(fuser /dev/nvidia* 2>/dev/null \
    | tr ' ' '\n' | sort -u | grep -v '^$' || true)
  if [[ -n "$gpu_pids" ]]; then
    echo "[cleanup] Killing GPU-holding PIDs: $gpu_pids"
    printf '%s\n' "$gpu_pids" | xargs -r kill -9 2>/dev/null || true
    sleep 5
  fi

  local used
  if ! used=$(nvidia-smi --query-gpu=memory.used \
      --format=csv,noheader,nounits 2>/dev/null \
      | awk '{s+=$1} END{printf "%d\n", s}'); then
    echo "[ERROR] nvidia-smi query failed; cannot verify GPU state. Aborting." >&2
    exit 1
  fi
  if (( ${used:-0} > 100 )); then
    echo "[ERROR] GPUs still have ${used}MB allocated after cleanup. Aborting." >&2
    nvidia-smi --query-gpu=index,memory.used --format=csv,noheader >&2 || true
    exit 1
  fi
  echo "[cleanup] All GPUs verified free."
}

# INT/TERM handler: announce and exit; the EXIT trap performs the cleanup,
# so it happens exactly once.
on_signal() {
  echo "[bench.sh] Caught signal, cleaning up..."
  exit 1
}

# EXIT handler: run the GPU cleanup once and keep the script's exit status
# (cleanup_gpu itself exits 1 if the GPUs cannot be verified free).
on_exit() {
  local status=$?
  trap - EXIT
  trap '' INT TERM   # do not let a second signal interrupt the cleanup
  cleanup_gpu
  exit "$status"
}

trap on_signal INT TERM
trap on_exit EXIT

# ─── Launch vLLM instances ─────────────────────────────────────────────────

#######################################
# Start N_INSTANCES `vllm serve` processes (instance i on GPU i, port
# BASE_PORT+i), wait for each /health endpoint and, in elastic mode, for
# each Mooncake bootstrap server.
# Globals: reads MODE, MODEL, VLLM, OUTDIR, N_INSTANCES, BASE_PORT,
#          MAX_BATCHED_TOKENS; writes INSTANCE_PIDS.
# Outputs: one log file per instance at $OUTDIR/vllm_inst_<i>.log.
# Exits:   with status 1 if an instance dies or does not become healthy
#          in time (GPU cleanup then runs via the EXIT trap).
#######################################
launch_instances() {
  echo "[launch] Starting $N_INSTANCES vLLM instances (mode=$MODE)..."

  # Arguments shared by all instances; built as an array so that no value
  # is subject to word splitting.
  local -a serve_args=(
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching --enforce-eager
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
  )
  if [[ "$MODE" == "elastic" ]]; then
    serve_args+=(--kv-transfer-config
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}')
  fi
  if [[ -n "$MAX_BATCHED_TOKENS" ]]; then
    serve_args+=(--max-num-batched-tokens "$MAX_BATCHED_TOKENS")
  fi

  INSTANCE_PIDS=()
  local i port logfile
  local -a env_vars
  for (( i = 0; i < N_INSTANCES; i++ )); do
    port=$((BASE_PORT + i))
    logfile="$OUTDIR/vllm_inst_${i}.log"
    env_vars=(
      "MASTER_PORT=$((MASTER_BASE_PORT + i))"
      "CUDA_VISIBLE_DEVICES=$i"
    )
    if [[ "$MODE" == "elastic" ]]; then
      env_vars+=("VLLM_MOONCAKE_BOOTSTRAP_PORT=$((BOOTSTRAP_BASE_PORT + i))")
    fi

    env "${env_vars[@]}" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      "${serve_args[@]}" \
      > "$logfile" 2>&1 &
    INSTANCE_PIDS+=("$!")

    echo "  inst_$i: GPU=$i port=$port"
    sleep 2  # stagger to avoid port collision
  done

  # Wait for health
  echo "[launch] Waiting for instances to become healthy..."
  local tries
  for (( i = 0; i < N_INSTANCES; i++ )); do
    port=$((BASE_PORT + i))
    tries=0
    until curl -sf "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
      tries=$((tries + 1))
      # Give up after 120 polls, or immediately once the process is gone:
      # a crashed instance will never become healthy.
      if (( tries >= 120 )) || ! kill -0 "${INSTANCE_PIDS[i]}" 2>/dev/null; then
        echo "[FAIL] Instance $i (port $port) failed to start. Log:" >&2
        tail -10 "$OUTDIR/vllm_inst_${i}.log" >&2 || true
        exit 1
      fi
      sleep 5
    done
    echo "  inst_$i healthy"
  done

  # Wait for bootstrap (elastic only)
  if [[ "$MODE" == "elastic" ]]; then
    echo "[launch] Waiting for Mooncake bootstrap servers..."
    local bp
    for (( i = 0; i < N_INSTANCES; i++ )); do
      bp=$((BOOTSTRAP_BASE_PORT + i))
      tries=0
      until curl -sf "http://127.0.0.1:$bp/query" > /dev/null 2>&1; do
        tries=$((tries + 1))
        if (( tries >= 60 )); then
          echo "[FAIL] Bootstrap $bp failed" >&2
          exit 1
        fi
        sleep 2
      done
      echo "  bootstrap $bp ready"
    done
  fi
}

# ─── Launch proxy ──────────────────────────────────────────────────────────

#######################################
# Start scripts/cache_aware_proxy.py in front of all instances and wait
# until its /stats endpoint answers.
# Globals: reads MODE, POLICY, PYTHON, PROJECT_DIR, OUTDIR, N_INSTANCES,
#          BASE_PORT, PROXY_PORT, OVERLOAD_FACTOR_ARG, NO_OFFLOAD,
#          HEAVY_THRESHOLD.
# Outputs: proxy log at $OUTDIR/proxy.log.
# Exits:   with status 1 if the proxy dies or does not answer in time.
#######################################
launch_proxy() {
  echo "[proxy] Starting (mode=$MODE, policy=$POLICY)..."

  local -a proxy_cmd=("$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" --combined)
  local i
  for (( i = 0; i < N_INSTANCES; i++ )); do
    proxy_cmd+=("http://127.0.0.1:$((BASE_PORT + i))")
  done
  proxy_cmd+=(--port "$PROXY_PORT" --policy "$POLICY")

  if [[ -n "$OVERLOAD_FACTOR_ARG" ]]; then
    proxy_cmd+=(--overload-factor "$OVERLOAD_FACTOR_ARG")
  fi

  if [[ "$MODE" == "elastic" ]]; then
    local bp_list=""
    for (( i = 0; i < N_INSTANCES; i++ )); do
      bp_list="${bp_list:+$bp_list,}$((BOOTSTRAP_BASE_PORT + i))"
    done
    if [[ "$NO_OFFLOAD" != "true" ]]; then
      proxy_cmd+=(--offload --heavy-threshold "$HEAVY_THRESHOLD")
    fi
    proxy_cmd+=(--bootstrap-ports "$bp_list")
  fi

  "${proxy_cmd[@]}" > "$OUTDIR/proxy.log" 2>&1 &
  local proxy_pid=$!

  # Wait for proxy
  local tries=0
  until curl -sf "http://127.0.0.1:$PROXY_PORT/stats" > /dev/null 2>&1; do
    tries=$((tries + 1))
    if (( tries >= 30 )) || ! kill -0 "$proxy_pid" 2>/dev/null; then
      echo "[FAIL] Proxy failed to start" >&2
      tail -10 "$OUTDIR/proxy.log" >&2 || true
      exit 1
    fi
    sleep 2
  done
  echo "[proxy] Ready on port $PROXY_PORT"
}

# ─── Run benchmark ─────────────────────────────────────────────────────────

#######################################
# Replay the trace against the proxy with `python -m replayer` while
# scripts/gpu_monitor.sh runs in the background.
# Globals: reads PYTHON, TRACE, MODEL, OUTDIR, PROXY_PORT, REQUESTS,
#          PROJECT_DIR; writes GPU_MON_PID.
# Outputs: $OUTDIR/metrics.jsonl, $OUTDIR/replayer.log,
#          $OUTDIR/gpu_util.csv.
# Returns: the pipeline status of the replayer run (non-zero aborts the
#          script under `set -e`), after the GPU monitor has been stopped.
#######################################
run_benchmark() {
  local -a replay_cmd=(
    "$PYTHON" -m replayer
    --trace "$TRACE"
    --output "$OUTDIR/metrics.jsonl"
    --endpoint "http://localhost:$PROXY_PORT"
    --model "$MODEL"
  )
  if [[ -n "$REQUESTS" ]]; then
    replay_cmd+=(--request-limit "$REQUESTS")
    echo "[bench] Running $REQUESTS requests (trace-driven timing)..."
  else
    echo "[bench] Running all requests in trace (trace-driven timing)..."
  fi
  replay_cmd+=(-v)

  # Start GPU monitor in background
  bash "$PROJECT_DIR/scripts/gpu_monitor.sh" "$OUTDIR/gpu_util.csv" 5 &
  GPU_MON_PID=$!

  # Capture the status instead of letting `set -e` abort here, so that the
  # monitor is always stopped and reaped before the function returns.
  local rv=0
  "${replay_cmd[@]}" 2>&1 | tee "$OUTDIR/replayer.log" || rv=$?

  # Stop GPU monitor
  kill "$GPU_MON_PID" 2>/dev/null || true
  wait "$GPU_MON_PID" 2>/dev/null || true

  if [[ -f "$OUTDIR/gpu_util.csv" ]]; then
    echo "[bench] GPU util saved: $(wc -l < "$OUTDIR/gpu_util.csv") samples"
  else
    echo "[bench] GPU monitor produced no $OUTDIR/gpu_util.csv" >&2
  fi

  if (( rv != 0 )); then
    echo "[ERROR] Replayer exited with status $rv; see $OUTDIR/replayer.log" >&2
  fi
  return "$rv"
}

# ─── Collect artifacts ─────────────────────────────────────────────────────

#######################################
# Save the proxy's /breakdown and /stats responses, an nvidia-smi snapshot,
# and the last prefix-cache hit rates found in each vLLM log. Every step is
# best effort: a failing step does not abort the run.
# Globals: reads OUTDIR, PROXY_PORT, N_INSTANCES.
# Outputs: breakdown.json, stats.json, gpu_snapshot.csv and apc.txt in
#          $OUTDIR; apc.txt content is echoed to stdout as well.
#######################################
collect_artifacts() {
  echo "[collect] Saving artifacts..."
  curl -sf "http://localhost:$PROXY_PORT/breakdown" \
    > "$OUTDIR/breakdown.json" 2>/dev/null || true
  curl -sf "http://localhost:$PROXY_PORT/stats" \
    > "$OUTDIR/stats.json" 2>/dev/null || true
  nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
    --format=csv > "$OUTDIR/gpu_snapshot.csv" 2>/dev/null || true

  # APC from vLLM logs
  local i log pch ech ext_str
  for (( i = 0; i < N_INSTANCES; i++ )); do
    log="$OUTDIR/vllm_inst_${i}.log"
    pch=$(grep "Prefix cache hit rate" "$log" 2>/dev/null | tail -1 \
      | grep -oP "Prefix cache hit rate: \K[0-9.]+" || echo "0")
    ech=$(grep "External prefix cache hit rate" "$log" 2>/dev/null | tail -1 \
      | grep -oP "External prefix cache hit rate: \K[0-9.]+" || echo "")
    ext_str=""
    if [[ -n "$ech" ]]; then
      ext_str=" ext=$ech%"
    fi
    echo "inst_$i: prefix=$pch%$ext_str"
  done | tee "$OUTDIR/apc.txt"
}

# ─── Summary ───────────────────────────────────────────────────────────────

#######################################
# Print latency percentiles from $OUTDIR/metrics.jsonl, overall and per
# input-length class, plus the first few errors.
# Globals: reads PYTHON, OUTDIR, TAG, MODE, POLICY.
# Returns: non-zero if the metrics file is missing or holds no records.
#######################################
print_summary() {
  local metrics="$OUTDIR/metrics.jsonl"
  if [[ ! -s "$metrics" ]]; then
    echo "[ERROR] No metrics found at $metrics; nothing to summarize." >&2
    return 1
  fi

  # Values are passed as arguments and the program text is a quoted heredoc,
  # so quotes in the tag or paths can never alter the Python source.
  "$PYTHON" - "$metrics" "$TAG" "$MODE" "$POLICY" "$OUTDIR" << 'PY'
import json
import sys

metrics_path, tag, mode, policy, outdir = sys.argv[1:6]

with open(metrics_path) as fh:
    rows = [json.loads(line) for line in fh if line.strip()]
if not rows:
    sys.exit('[ERROR] %s contains no records; nothing to summarize.' % metrics_path)

ok = [r for r in rows if not r.get('error')]
err = [r for r in rows if r.get('error')]


def p(values, q):
    """Return the q-quantile (nearest rank) of values, or 0 if empty."""
    if not values:
        return 0
    ordered = sorted(values)
    return ordered[min(int(q * len(ordered)), len(ordered) - 1)]


ttfts = [r['ttft_s'] for r in ok if r.get('ttft_s')]
tpots = [r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0]
e2es = [r['latency_s'] for r in ok if r.get('latency_s') is not None]

print()
print('=' * 70)
print('  RESULT: %s (%s, %s)' % (tag, mode, policy))
print('=' * 70)
print('  OK=%d/%d (%.1f%%)  TTFT50=%.3f  TTFT90=%.3f  TPOT90=%.4f  E2E50=%.3f' % (
    len(ok), len(rows), len(ok) * 100 / len(rows),
    p(ttfts, .5), p(ttfts, .9), p(tpots, .9), p(e2es, .5)))

for lo, hi, cl in [(0, 5000, 'WARM'), (5000, 20000, 'MEDIUM'), (20000, 200000, 'HEAVY')]:
    sub = [r for r in ok
           if lo <= (r.get('input_length') or 0) < hi and r.get('ttft_s')]
    if sub:
        t = [r['ttft_s'] for r in sub]
        tp = [r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s'] > 0]
        print('  %-8s n=%3d  TTFT50=%.3f  TTFT90=%.3f  TPOT90=%.4f' % (
            cl, len(sub), p(t, .5), p(t, .9), p(tp, .9)))

if err:
    print('  Errors (%d):' % len(err))
    for e in err[:5]:
        print('    input=%d %s' % (e.get('input_length') or 0,
                                   str(e.get('error', ''))[:60]))

print('  Output: %s/' % outdir)
print('=' * 70)
PY
}

# ─── Main ──────────────────────────────────────────────────────────────────
echo "================================================================"
echo "  bench.sh: $TAG"
echo "  mode=$MODE policy=$POLICY requests=${REQUESTS:-all} overload_factor=${OVERLOAD_FACTOR_ARG:-2.0}"
echo "  $(date)"
echo "================================================================"

# `python -m replayer` is resolved relative to the project root.
cd "$PROJECT_DIR"

cleanup_gpu
launch_instances
launch_proxy
run_benchmark
collect_artifacts
print_summary

# cleanup_gpu runs automatically via EXIT trap
echo "[done] $(date)"