Files
agentic-kvc/scripts/run_elastic_stability_test.sh
Gahow Wang fc92410ec9 Invalidate prior A/B results + add proper experiment harness
Prior cross-machine comparison (commit 1e86285) was invalid: dash0
baseline used warm instances with residual KV cache, inflating TTFT
by 2x. Evidence: inst_7 APC=68.3% impossible from 25 cold-start
requests; WARM TTFT p90=3.3s vs fresh=0.26s.

Fair same-machine comparison (both fresh restart on dash0):
  Baseline:    TTFT50=1.075  TPOT90=0.076  E2E50=5.075  OK=198/200
  Elastic P2P: TTFT50=1.018  TPOT90=0.085  E2E50=6.977  OK=195/200
Elastic is WORSE due to Mooncake kv_both memory overhead.

Changes:
- REPORT.md: rewrite §3-4 with corrected results, add §3.5 errata
- pd_separation_analysis.md: update elastic TL;DR with correct numbers
- cache_aware_proxy.py: fix double-decrement bugs in offload path,
  add 120s prefill timeout with co-located fallback (HEAVY_COLO_FALLBACK)
- bench.sh: standardized experiment harness with guaranteed GPU cleanup
  and fresh-state verification (nvidia-smi check before start)
- run_elastic_stability_test.sh: two-phase elastic vs baseline test

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 17:54:21 +08:00

330 lines
11 KiB
Bash
Executable File

#!/bin/bash
# Elastic P2P stability test.
#
# Replays a 200-request benchmark twice -- once with the proxy in offload
# mode and once in baseline mode -- then compares the success rates.
#
# Must be run on dash0 (8 GPUs with Mooncake support).
#
# Usage:
#   bash scripts/run_elastic_stability_test.sh
#
# Optional environment overrides:
#   VENV_PATH   directory containing the `python` and `vllm` executables
#   MODEL_PATH  model handed to `vllm serve` and to the replayer
set -euo pipefail

# --- Locations (derived from where this script lives) ---
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
TRACE="${PROJECT_DIR}/traces/sampled_1000req_seed42.jsonl"

# --- Toolchain and model ---
VENV="${VENV_PATH:-${PROJECT_DIR}/.venv/bin}"
PYTHON="${VENV}/python"
VLLM="${VENV}/vllm"
MODEL="${MODEL_PATH:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

# --- Serving topology: instance i listens on BASE_PORT + i ---
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090

# --- Replay parameters forwarded to the replayer ---
REQUEST_LIMIT=200
TIME_SCALE=20
MAX_INFLIGHT=8

# --- Output directories; both phases share one timestamp ---
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
OUT_ELASTIC="${PROJECT_DIR}/outputs/elastic_stability_${TIMESTAMP}"
OUT_BASELINE="${PROJECT_DIR}/outputs/baseline_stability_${TIMESTAMP}"
# ─── Helper functions ────────────────────────────────────────────────────────
# Force-kill leftovers from a previous run: every process whose `ps aux`
# line mentions "vllm serve" or "cache_aware_proxy", then every process
# that `fuser` reports as holding a /dev/nvidia* device open.
# All kills are SIGKILL and best-effort (failures are ignored).
kill_all() {
  local pattern pid

  echo "[cleanup] Killing vLLM and proxy processes..."
  for pattern in 'vllm serve' 'cache_aware_proxy'; do
    # `grep -v grep` drops the grep process that is doing the matching.
    for pid in $(ps aux | grep "$pattern" | grep -v grep | awk '{print $2}' 2>/dev/null); do
      kill -9 "$pid" 2>/dev/null || true
    done
  done
  sleep 5

  echo "[cleanup] Releasing GPUs..."
  # fuser prints space-separated PIDs; split to one per line and de-duplicate.
  for pid in $(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u); do
    kill -9 "$pid" 2>/dev/null || true
  done
  sleep 10

  echo "[cleanup] Done."
}
# Block until the first $1 vLLM instances answer GET /health.
# Instance i is probed on port BASE_PORT + i, one instance after another.
# Each instance gets 120 probes with a 5 s pause between them.
# Returns 1 (after printing a [FAIL] line) as soon as one instance runs out
# of probes.
wait_for_instances() {
  local n=$1
  local idx port attempt

  echo "[wait] Waiting for $n vLLM instances to become healthy..."
  for ((idx = 0; idx < n; idx++)); do
    port=$((BASE_PORT + idx))
    attempt=0
    until curl -sf "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
      attempt=$((attempt + 1))
      if ((attempt >= 120)); then
        echo "[FAIL] Instance $idx (port $port) did not start in 600s"
        return 1
      fi
      sleep 5
    done
    echo " Instance $idx (port $port) healthy"
  done
}
# Block until every Mooncake bootstrap server answers GET /query.
# Server i is probed on port 8998 + i for i in [0, N_INSTANCES).
# Each server gets 60 probes with a 2 s pause between them.
# Returns 1 (after printing a [FAIL] line) when a server runs out of probes.
wait_for_bootstrap() {
  local idx bootstrap_port attempt

  echo "[wait] Waiting for Mooncake bootstrap servers..."
  for ((idx = 0; idx < N_INSTANCES; idx++)); do
    bootstrap_port=$((8998 + idx))
    attempt=0
    until curl -sf "http://127.0.0.1:$bootstrap_port/query" > /dev/null 2>&1; do
      attempt=$((attempt + 1))
      if ((attempt >= 60)); then
        echo "[FAIL] Bootstrap $bootstrap_port did not start in 120s"
        return 1
      fi
      sleep 2
    done
    echo " Bootstrap $bootstrap_port ready"
  done
}
# Block until the proxy answers GET /stats on PROXY_PORT.
# Makes at most 30 probes with a 2 s pause between them; returns 1 (after
# printing a [FAIL] line) when the last probe also fails.
wait_for_proxy() {
  local attempt

  echo "[wait] Waiting for proxy on port $PROXY_PORT..."
  for ((attempt = 1; ; attempt++)); do
    if curl -sf "http://127.0.0.1:$PROXY_PORT/stats" > /dev/null 2>&1; then
      break
    fi
    if ((attempt >= 30)); then
      echo "[FAIL] Proxy did not start in 60s"
      return 1
    fi
    sleep 2
  done
  echo " Proxy ready"
}
# Start N_INSTANCES vLLM servers in the background with the Mooncake
# connector in kv_both role, then wait for them to come up.
#
# Instance i is pinned to GPU i (CUDA_VISIBLE_DEVICES) and gets:
#   HTTP port          BASE_PORT + i
#   bootstrap port     8998 + i   (VLLM_MOONCAKE_BOOTSTRAP_PORT)
#   MASTER_PORT        29500 + i
#   log file           /tmp/elastic_test_<i>.log
#
# Globals:  N_INSTANCES, BASE_PORT, VLLM, MODEL (read)
# Returns:  non-zero when wait_for_instances or wait_for_bootstrap fails.
launch_vllm_kv_both() {
  local i port bp master log

  echo ""
  echo "=== Launching $N_INSTANCES vLLM instances (kv_both) ==="
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    bp=$((8998 + i))
    master=$((29500 + i))
    log="/tmp/elastic_test_${i}.log"
    # "$VLLM" is quoted so a venv path containing whitespace still resolves
    # to a single executable instead of being word-split.
    VLLM_MOONCAKE_BOOTSTRAP_PORT=$bp \
    MASTER_PORT=$master \
    CUDA_VISIBLE_DEVICES=$i \
      "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
      > "$log" 2>&1 &
    echo " Instance $i: GPU=$i port=$port bootstrap=$bp log=$log"
    # Stagger the launches by two seconds.
    sleep 2
  done
  wait_for_instances "$N_INSTANCES"
  wait_for_bootstrap
}
# Start N_INSTANCES plain vLLM servers (no --kv-transfer-config) in the
# background, then wait for them to come up.
#
# Instance i is pinned to GPU i (CUDA_VISIBLE_DEVICES) and gets:
#   HTTP port      BASE_PORT + i
#   MASTER_PORT    29500 + i
#   log file       /tmp/baseline_test_<i>.log
#
# Globals:  N_INSTANCES, BASE_PORT, VLLM, MODEL (read)
# Returns:  non-zero when wait_for_instances fails.
launch_vllm_baseline() {
  local i port master log

  echo ""
  echo "=== Launching $N_INSTANCES vLLM instances (baseline, no Mooncake) ==="
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    master=$((29500 + i))
    log="/tmp/baseline_test_${i}.log"
    # "$VLLM" is quoted so a venv path containing whitespace still resolves
    # to a single executable instead of being word-split.
    MASTER_PORT=$master \
    CUDA_VISIBLE_DEVICES=$i \
      "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      > "$log" 2>&1 &
    echo " Instance $i: GPU=$i port=$port log=$log"
    # Stagger the launches by two seconds.
    sleep 2
  done
  wait_for_instances "$N_INSTANCES"
}
# Start cache_aware_proxy.py in the background in offload mode and wait
# until it answers.
#
# The proxy is given one --combined backend URL per instance
# (http://127.0.0.1:<BASE_PORT + i>) and the matching comma-separated
# bootstrap ports (8998 + i). Its output goes to /tmp/proxy_elastic.log.
#
# Globals:  N_INSTANCES, BASE_PORT, PROXY_PORT, PYTHON, PROJECT_DIR (read)
# Returns:  non-zero when wait_for_proxy fails.
launch_proxy_elastic() {
  local i
  local bp_list=""
  local -a backends=()

  echo ""
  echo "=== Starting proxy (elastic offload mode) ==="
  for ((i = 0; i < N_INSTANCES; i++)); do
    backends+=("http://127.0.0.1:$((BASE_PORT + i))")
    bp_list="${bp_list:+$bp_list,}$((8998 + i))"
  done
  # An array keeps each URL a separate argument without relying on word
  # splitting; the ${arr[@]+...} form stays safe under `set -u` on older
  # bash releases when the array is empty.
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined ${backends[@]+"${backends[@]}"} \
    --bootstrap-ports "$bp_list" \
    --offload --heavy-threshold 20000 \
    --port "$PROXY_PORT" \
    > /tmp/proxy_elastic.log 2>&1 &
  wait_for_proxy
}
# Start cache_aware_proxy.py in the background without offload and wait
# until it answers.
#
# The proxy is given one --combined backend URL per instance
# (http://127.0.0.1:<BASE_PORT + i>). Its output goes to
# /tmp/proxy_baseline.log.
#
# Globals:  N_INSTANCES, BASE_PORT, PROXY_PORT, PYTHON, PROJECT_DIR (read)
# Returns:  non-zero when wait_for_proxy fails.
launch_proxy_baseline() {
  local i
  local -a backends=()

  echo ""
  echo "=== Starting proxy (baseline, no offload) ==="
  for ((i = 0; i < N_INSTANCES; i++)); do
    backends+=("http://127.0.0.1:$((BASE_PORT + i))")
  done
  # An array keeps each URL a separate argument without relying on word
  # splitting; the ${arr[@]+...} form stays safe under `set -u` on older
  # bash releases when the array is empty.
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined ${backends[@]+"${backends[@]}"} \
    --port "$PROXY_PORT" \
    > /tmp/proxy_baseline.log 2>&1 &
  wait_for_proxy
}
# Replay the trace against the proxy and store the results.
#
# Arguments:
#   $1 - tag, used only in the banner line
#   $2 - output directory (created if missing)
# Globals:  PYTHON, TRACE, MODEL, PROXY_PORT, TIME_SCALE, MAX_INFLIGHT,
#           REQUEST_LIMIT (read)
# Outputs:  <dir>/metrics.jsonl (written by the replayer via --output),
#           <dir>/replayer.log (replayer stdout+stderr, also echoed),
#           <dir>/breakdown.json and <dir>/stats.json (proxy endpoints,
#           best-effort).
run_benchmark() {
  local tag=$1
  local output_dir=$2

  mkdir -p "$output_dir"
  echo ""
  echo "=== Running benchmark: $tag ($REQUEST_LIMIT requests) ==="
  # "$PYTHON" is quoted so an interpreter path containing whitespace is not
  # word-split into a bogus command name.
  # NOTE: when the caller runs with `set -e -o pipefail`, a non-zero
  # replayer exit makes this pipeline fail and the function stops here.
  "$PYTHON" -m replayer \
    --trace "$TRACE" \
    --output "$output_dir/metrics.jsonl" \
    --endpoint "http://localhost:$PROXY_PORT" \
    --model "$MODEL" \
    --time-scale "$TIME_SCALE" \
    --max-inflight-sessions "$MAX_INFLIGHT" \
    --request-limit "$REQUEST_LIMIT" \
    -v 2>&1 | tee "$output_dir/replayer.log"
  # Save proxy breakdown and stats; a failed fetch is tolerated.
  curl -sf "http://localhost:$PROXY_PORT/breakdown" > "$output_dir/breakdown.json" 2>/dev/null || true
  curl -sf "http://localhost:$PROXY_PORT/stats" > "$output_dir/stats.json" 2>/dev/null || true
}
# Write a one-shot nvidia-smi snapshot (index, utilization, memory used and
# total, CSV format) to <dir>/gpu_snapshot.csv. Best-effort: any failure is
# ignored.
#
# Arguments:
#   $1 - output directory (created if missing)
collect_gpu_util() {
  local output_dir=$1

  # The directory must exist before the redirect below can open the file.
  # Without this, a call that precedes the first `mkdir` of the output
  # directory fails on the redirect and the snapshot is silently lost.
  mkdir -p "$output_dir" 2>/dev/null || return 0
  nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
    --format=csv > "$output_dir/gpu_snapshot.csv" 2>/dev/null || true
}
# Print request totals and the success rate for one benchmark run.
#
# Arguments:
#   $1 - label shown in the report
#   $2 - output directory expected to contain metrics.jsonl
#
# A request counts as successful when its JSONL line carries a null
# "error" field; if no line matches that form, lines with a numeric "ttft"
# are counted instead. Also echoes metrics.summary.json when it exists.
# Prints a notice and returns when metrics.jsonl is missing.
print_summary() {
  local label=$1
  local output_dir=$2
  local metrics="$output_dir/metrics.jsonl"
  local summary="$output_dir/metrics.summary.json"
  local total success errors rate

  if [[ ! -f "$metrics" ]]; then
    echo " [$label] No metrics file found!"
    return
  fi

  # Count total, success, error from metrics JSONL.
  total=$(wc -l < "$metrics")
  total=${total//[[:space:]]/} # some wc implementations pad the count

  # `grep -c` prints "0" AND exits 1 when nothing matches, so chaining
  # counts with `||` inside one substitution would concatenate several
  # numbers. Capture each count separately instead.
  success=$(grep -cE '"error":[[:space:]]*null' "$metrics" 2>/dev/null) || true
  if [[ -z "$success" || "$success" -eq 0 ]]; then
    success=$(grep -cE '"ttft":[[:space:]]*[0-9]' "$metrics" 2>/dev/null) || true
  fi
  success=${success:-0}
  errors=$((total - success))

  rate="N/A"
  if ((total > 0)); then
    # Values are passed with -v rather than spliced into the awk program.
    rate=$(awk -v ok="$success" -v all="$total" \
      'BEGIN { printf "%.1f", (ok / all) * 100 }')
  fi

  echo " [$label]"
  echo " Total requests: $total"
  echo " Successful: $success"
  echo " Errors: $errors"
  echo " Success rate: ${rate}%"

  # Print summary.json if it exists
  if [[ -f "$summary" ]]; then
    echo " Summary: $(cat "$summary")"
  fi
}
# ─── Main ────────────────────────────────────────────────────────────────────
# Flow: banner -> sanity checks -> phase 1 (elastic offload) -> phase 2
# (baseline) -> cleanup -> side-by-side summary.

# Append the prefix-cache related lines of every instance's /metrics
# endpoint to the file given as $1. Best-effort: an unreachable instance or
# an absence of matching lines is ignored.
save_apc_metrics() {
  local out_file=$1
  local i port

  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    curl -sf "http://127.0.0.1:$port/metrics" 2>/dev/null \
      | grep -E 'vllm:cache_hit|prefix_cache' \
      >> "$out_file" 2>/dev/null || true
  done
}

echo "================================================================"
echo " Elastic P2P Stability Test"
echo " $(date)"
echo " Model: $MODEL"
echo " Requests: $REQUEST_LIMIT"
echo " Output: elastic → $OUT_ELASTIC"
echo " baseline → $OUT_BASELINE"
echo "================================================================"

# Sanity checks
if [ ! -f "$TRACE" ]; then
  echo "[ERROR] Trace file not found: $TRACE"
  exit 1
fi
if [ ! -x "$PYTHON" ]; then
  echo "[ERROR] Python not found: $PYTHON"
  exit 1
fi

# From here on servers get started in the background. If any later step
# fails (a wait_for_* helper returns 1, the replayer exits non-zero, ...)
# `set -e` ends the script; the EXIT trap makes sure the vLLM and proxy
# processes are torn down instead of being left holding the GPUs.
trap kill_all EXIT

# ─── Phase 1: Elastic P2P offload ────────────────────────────────────────────
echo ""
echo "############################################################"
echo " Phase 1: Elastic P2P Offload"
echo "############################################################"
kill_all
launch_vllm_kv_both
launch_proxy_elastic
collect_gpu_util "$OUT_ELASTIC"
run_benchmark "elastic_p2p" "$OUT_ELASTIC"
echo ""
echo "[phase1] Saving APC stats..."
save_apc_metrics "$OUT_ELASTIC/apc_metrics.txt"

# ─── Phase 2: Baseline (no offload) ─────────────────────────────────────────
echo ""
echo "############################################################"
echo " Phase 2: Baseline (no offload, no Mooncake)"
echo "############################################################"
kill_all
launch_vllm_baseline
launch_proxy_baseline
collect_gpu_util "$OUT_BASELINE"
run_benchmark "baseline" "$OUT_BASELINE"
echo ""
echo "[phase2] Saving APC stats..."
save_apc_metrics "$OUT_BASELINE/apc_metrics.txt"

# ─── Cleanup ─────────────────────────────────────────────────────────────────
kill_all
# Everything is already down; drop the trap so cleanup does not run twice.
trap - EXIT

# ─── Comparison ──────────────────────────────────────────────────────────────
echo ""
echo "================================================================"
echo " Results Comparison"
echo "================================================================"
print_summary "Elastic P2P" "$OUT_ELASTIC"
echo ""
print_summary "Baseline" "$OUT_BASELINE"
echo ""
echo "Detailed outputs:"
echo " Elastic: $OUT_ELASTIC/"
echo " Baseline: $OUT_BASELINE/"
echo ""
echo "Breakdown analysis:"
echo " python scripts/analyze_breakdown.py $OUT_ELASTIC/breakdown.json"
echo ""
echo "================================================================"
echo " Done. $(date)"
echo "================================================================"