D2: run_benchmark.sh and run_experiments.sh still pass --time-scale and --max-inflight-sessions to the replayer, but those flags were removed when the project moved to trace-driven dispatch. The scripts cannot run as-is. D3: ~25 ad-hoc analyze_* / compare_* / profile_* / final_* scripts and a handful of single-experiment run_*.sh point at /home/admin/cpfs paths, deleted output directories, or a sampled trace file that no longer exists. Keep them in scripts/legacy/ for historical reference; the scripts that remain in scripts/ (analyze_trace, analyze_breakdown, analyze_cache_hit, analyze_eviction, compare_results, compute_roofline, sample_trace, analyze_agentic_patterns, simulate_cache_policies, plus launch_*.sh, gpu_monitor.sh, bench.sh) cover the current workflow. Adds scripts/legacy/README.md to document the archival policy.
330 lines
11 KiB
Bash
Executable File
330 lines
11 KiB
Bash
Executable File
#!/bin/bash
|
|
# Elastic P2P stability test: runs 200-request benchmark with offload mode
|
|
# and baseline mode, then compares success rates.
|
|
#
|
|
# Must be run on dash0 (8 GPUs with Mooncake support).
|
|
#
|
|
# Usage:
|
|
# bash scripts/run_elastic_stability_test.sh
|
|
|
|
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# Resolve the project root from this script's own location so the script does
# not depend on the caller's working directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname -- "$SCRIPT_DIR")"

# Tooling. VENV_PATH overrides the virtualenv "bin" directory.
VENV="${VENV_PATH:-$PROJECT_DIR/.venv/bin}"
PYTHON="$VENV/python"
VLLM="$VENV/vllm"

# Inputs. MODEL_PATH / TRACE_PATH override the defaults below, which are
# site-specific and may not exist on other machines.
MODEL="${MODEL_PATH:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="${TRACE_PATH:-$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl}"

# Topology and load parameters. Each one keeps its historical default but can
# be overridden from the environment, e.g. `N_INSTANCES=4 bash <this script>`.
N_INSTANCES="${N_INSTANCES:-8}"
BASE_PORT="${BASE_PORT:-8000}"
PROXY_PORT="${PROXY_PORT:-9090}"
REQUEST_LIMIT="${REQUEST_LIMIT:-200}"
TIME_SCALE="${TIME_SCALE:-20}"
MAX_INFLIGHT="${MAX_INFLIGHT:-8}"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# One timestamped output directory per phase so repeated runs never collide.
OUT_ELASTIC="$PROJECT_DIR/outputs/elastic_stability_${TIMESTAMP}"
OUT_BASELINE="$PROJECT_DIR/outputs/baseline_stability_${TIMESTAMP}"
|
|
|
|
# ─── Helper functions ────────────────────────────────────────────────────────
|
|
|
|
#######################################
# Force-stop every vLLM server and cache-aware proxy on this host, then kill
# whatever still holds an NVIDIA device node open.
# Globals:   none
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 (every kill is best-effort)
#######################################
kill_all() {
  echo "[cleanup] Killing vLLM and proxy processes..."
  # pkill -f matches the full command line and never matches itself, so no
  # `grep -v grep` dance is needed. A non-zero status only means "no match".
  pkill -9 -f 'vllm serve' 2>/dev/null || true
  pkill -9 -f 'cache_aware_proxy' 2>/dev/null || true
  sleep 5

  echo "[cleanup] Releasing GPUs..."
  local pid
  # fuser prints the PIDs holding the device nodes on stdout; de-duplicate
  # them because one process usually opens several /dev/nvidia* nodes.
  for pid in $(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u); do
    kill -9 "$pid" 2>/dev/null || true
  done
  sleep 10
  echo "[cleanup] Done."
}
|
|
|
|
#######################################
# Block until the first N vLLM instances answer their /health endpoint.
# Instances are checked in order; each gets up to 120 probes, 5 s apart.
# Globals:   BASE_PORT (read) - port of instance 0; instance i uses BASE_PORT+i
# Arguments: $1 - number of instances to wait for
# Outputs:   progress / failure messages on stdout
# Returns:   0 when all instances are healthy, 1 on the first timeout
#######################################
wait_for_instances() {
  local n=$1
  local i port tries
  echo "[wait] Waiting for $n vLLM instances to become healthy..."
  for ((i = 0; i < n; i++)); do
    port=$((BASE_PORT + i))
    tries=0
    # --max-time bounds each probe: without it a listener that accepts the
    # connection but never answers would stall the loop and defeat the timeout.
    while ! curl -sf --max-time 5 "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
      tries=$((tries + 1))
      if ((tries >= 120)); then
        echo "[FAIL] Instance $i (port $port) did not start in 600s"
        return 1
      fi
      sleep 5
    done
    echo " Instance $i (port $port) healthy"
  done
}
|
|
|
|
#######################################
# Block until the Mooncake bootstrap server of every instance answers /query.
# Each server gets up to 60 probes, 2 s apart.
# Globals:   N_INSTANCES (read)
#            BOOTSTRAP_BASE_PORT (read, optional) - bootstrap port of
#              instance 0, default 8998; instance i uses base+i
# Arguments: none
# Outputs:   progress / failure messages on stdout
# Returns:   0 when all servers respond, 1 on the first timeout
#######################################
wait_for_bootstrap() {
  local base="${BOOTSTRAP_BASE_PORT:-8998}"
  local i bp tries
  echo "[wait] Waiting for Mooncake bootstrap servers..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    bp=$((base + i))
    tries=0
    # --max-time keeps a hung connection from stalling the retry counter.
    while ! curl -sf --max-time 5 "http://127.0.0.1:$bp/query" > /dev/null 2>&1; do
      tries=$((tries + 1))
      if ((tries >= 60)); then
        echo "[FAIL] Bootstrap $bp did not start in 120s"
        return 1
      fi
      sleep 2
    done
    echo " Bootstrap $bp ready"
  done
}
|
|
|
|
#######################################
# Block until the proxy answers its /stats endpoint (up to 30 probes, 2 s
# apart).
# Globals:   PROXY_PORT (read)
# Arguments: none
# Outputs:   progress / failure messages on stdout
# Returns:   0 when the proxy responds, 1 on timeout
#######################################
wait_for_proxy() {
  local tries=0
  echo "[wait] Waiting for proxy on port $PROXY_PORT..."
  # --max-time keeps a hung connection from stalling the retry counter.
  while ! curl -sf --max-time 5 "http://127.0.0.1:$PROXY_PORT/stats" > /dev/null 2>&1; do
    tries=$((tries + 1))
    if ((tries >= 30)); then
      echo "[FAIL] Proxy did not start in 60s"
      return 1
    fi
    sleep 2
  done
  echo " Proxy ready"
}
|
|
|
|
#######################################
# Start one background vLLM server per GPU with the Mooncake connector in
# kv_both role, then wait for the servers and their bootstrap endpoints.
# Instance i runs on GPU i, serves on BASE_PORT+i, uses bootstrap port
# base+i and MASTER_PORT 29500+i.
# Globals:   N_INSTANCES, BASE_PORT, VLLM, MODEL (read)
#            BOOTSTRAP_BASE_PORT (read, optional) - default 8998
#            STABILITY_LOG_DIR (read, optional)   - log directory, default /tmp
# Arguments: none
# Outputs:   one status line per instance on stdout; server output goes to
#            <log dir>/elastic_test_<i>.log
# Returns:   non-zero if the health or bootstrap wait fails
#######################################
launch_vllm_kv_both() {
  local log_dir="${STABILITY_LOG_DIR:-/tmp}"
  local bootstrap_base="${BOOTSTRAP_BASE_PORT:-8998}"
  local i port bp master log

  echo ""
  echo "=== Launching $N_INSTANCES vLLM instances (kv_both) ==="
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    bp=$((bootstrap_base + i))
    master=$((29500 + i))
    log="$log_dir/elastic_test_${i}.log"

    # "$VLLM" is quoted so a virtualenv path containing spaces still works.
    VLLM_MOONCAKE_BOOTSTRAP_PORT=$bp \
    MASTER_PORT=$master \
    CUDA_VISIBLE_DEVICES=$i \
      "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
      > "$log" 2>&1 &

    echo " Instance $i: GPU=$i port=$port bootstrap=$bp log=$log"
    # Stagger the launches slightly.
    sleep 2
  done
  wait_for_instances "$N_INSTANCES"
  wait_for_bootstrap
}
|
|
|
|
#######################################
# Start one background vLLM server per GPU without any KV-transfer
# connector, then wait for all of them to become healthy.
# Instance i runs on GPU i, serves on BASE_PORT+i and uses MASTER_PORT
# 29500+i.
# Globals:   N_INSTANCES, BASE_PORT, VLLM, MODEL (read)
#            STABILITY_LOG_DIR (read, optional) - log directory, default /tmp
# Arguments: none
# Outputs:   one status line per instance on stdout; server output goes to
#            <log dir>/baseline_test_<i>.log
# Returns:   non-zero if the health wait fails
#######################################
launch_vllm_baseline() {
  local log_dir="${STABILITY_LOG_DIR:-/tmp}"
  local i port master log

  echo ""
  echo "=== Launching $N_INSTANCES vLLM instances (baseline, no Mooncake) ==="
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    master=$((29500 + i))
    log="$log_dir/baseline_test_${i}.log"

    # "$VLLM" is quoted so a virtualenv path containing spaces still works.
    MASTER_PORT=$master \
    CUDA_VISIBLE_DEVICES=$i \
      "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      > "$log" 2>&1 &

    echo " Instance $i: GPU=$i port=$port log=$log"
    # Stagger the launches slightly.
    sleep 2
  done
  wait_for_instances "$N_INSTANCES"
}
|
|
|
|
#######################################
# Start the cache-aware proxy in the background with offload enabled, in
# front of all vLLM instances, then wait until it answers.
# Globals:   N_INSTANCES, BASE_PORT, PROXY_PORT, PYTHON, PROJECT_DIR (read)
#            BOOTSTRAP_BASE_PORT (read, optional) - default 8998
#            STABILITY_LOG_DIR (read, optional)   - log directory, default /tmp
# Arguments: none
# Outputs:   banner on stdout; proxy output goes to
#            <log dir>/proxy_elastic.log
# Returns:   non-zero if the proxy does not come up in time
#######################################
launch_proxy_elastic() {
  local log_dir="${STABILITY_LOG_DIR:-/tmp}"
  local bootstrap_base="${BOOTSTRAP_BASE_PORT:-8998}"
  # Backend URLs are collected in an array so each one is passed as exactly
  # one argument, with no reliance on unquoted word splitting.
  local -a backends=()
  local bp_list=""
  local i

  echo ""
  echo "=== Starting proxy (elastic offload mode) ==="
  for ((i = 0; i < N_INSTANCES; i++)); do
    backends+=("http://127.0.0.1:$((BASE_PORT + i))")
    # Comma-join the bootstrap ports: the separator is only emitted once the
    # list is non-empty.
    bp_list="${bp_list:+$bp_list,}$((bootstrap_base + i))"
  done

  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${backends[@]}" \
    --bootstrap-ports "$bp_list" \
    --offload --heavy-threshold 20000 \
    --port "$PROXY_PORT" \
    > "$log_dir/proxy_elastic.log" 2>&1 &
  wait_for_proxy
}
|
|
|
|
#######################################
# Start the cache-aware proxy in the background without offload, in front of
# all vLLM instances, then wait until it answers.
# Globals:   N_INSTANCES, BASE_PORT, PROXY_PORT, PYTHON, PROJECT_DIR (read)
#            STABILITY_LOG_DIR (read, optional) - log directory, default /tmp
# Arguments: none
# Outputs:   banner on stdout; proxy output goes to
#            <log dir>/proxy_baseline.log
# Returns:   non-zero if the proxy does not come up in time
#######################################
launch_proxy_baseline() {
  local log_dir="${STABILITY_LOG_DIR:-/tmp}"
  # Backend URLs are collected in an array so each one is passed as exactly
  # one argument, with no reliance on unquoted word splitting.
  local -a backends=()
  local i

  echo ""
  echo "=== Starting proxy (baseline, no offload) ==="
  for ((i = 0; i < N_INSTANCES; i++)); do
    backends+=("http://127.0.0.1:$((BASE_PORT + i))")
  done

  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${backends[@]}" \
    --port "$PROXY_PORT" \
    > "$log_dir/proxy_baseline.log" 2>&1 &
  wait_for_proxy
}
|
|
|
|
#######################################
# Replay the trace against the proxy and store the results.
# Writes metrics.jsonl and replayer.log into the output directory, then
# saves the proxy's /breakdown and /stats responses (best-effort).
# Globals:   PYTHON, TRACE, MODEL, PROXY_PORT, REQUEST_LIMIT, TIME_SCALE,
#            MAX_INFLIGHT (read)
# Arguments: $1 - label printed in the banner
#            $2 - output directory (created if missing)
# Outputs:   replayer output is echoed to stdout and teed to replayer.log
# Returns:   non-zero if the replayer pipeline fails (under pipefail)
#######################################
run_benchmark() {
  local tag=$1
  local output_dir=$2
  mkdir -p "$output_dir"

  # Per the repository notes, trace-driven versions of the replayer no longer
  # accept --time-scale / --max-inflight-sessions, and an unknown flag makes
  # the run fail. Pass each pacing flag only when the installed replayer
  # lists it in its --help text, so both old and new replayers work.
  local help_text
  help_text=$("$PYTHON" -m replayer --help 2>&1 || true)
  local -a pacing_args=()
  if [[ "$help_text" == *"--time-scale"* ]]; then
    pacing_args+=(--time-scale "$TIME_SCALE")
  fi
  if [[ "$help_text" == *"--max-inflight-sessions"* ]]; then
    pacing_args+=(--max-inflight-sessions "$MAX_INFLIGHT")
  fi

  echo ""
  echo "=== Running benchmark: $tag ($REQUEST_LIMIT requests) ==="
  # The ${arr[@]+...} form expands to nothing for an empty array; a plain
  # "${arr[@]}" trips `set -u` on bash releases older than 4.4.
  "$PYTHON" -m replayer \
    --trace "$TRACE" \
    --output "$output_dir/metrics.jsonl" \
    --endpoint "http://localhost:$PROXY_PORT" \
    --model "$MODEL" \
    ${pacing_args[@]+"${pacing_args[@]}"} \
    --request-limit "$REQUEST_LIMIT" \
    -v 2>&1 | tee "$output_dir/replayer.log"

  # Save proxy breakdown and stats. Best-effort: a proxy that does not expose
  # these endpoints must not fail the benchmark.
  curl -sf "http://localhost:$PROXY_PORT/breakdown" > "$output_dir/breakdown.json" 2>/dev/null || true
  curl -sf "http://localhost:$PROXY_PORT/stats" > "$output_dir/stats.json" 2>/dev/null || true
}
|
|
|
|
#######################################
# Save a one-shot nvidia-smi CSV snapshot (index, utilisation, memory used,
# memory total) as gpu_snapshot.csv in the given directory. Best-effort: a
# missing or failing nvidia-smi never aborts the run.
# Globals:   none
# Arguments: $1 - output directory (created if missing)
# Returns:   0
#######################################
collect_gpu_util() {
  local output_dir=$1
  # The snapshot is taken before run_benchmark creates the output directory;
  # without this mkdir the redirection below fails and the snapshot is
  # silently lost.
  mkdir -p "$output_dir" 2>/dev/null || true
  nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
    --format=csv > "$output_dir/gpu_snapshot.csv" 2>/dev/null || true
}
|
|
|
|
#######################################
# Print request totals and the success rate for one benchmark run.
# A request counts as successful when its metrics line carries a null
# "error" field; if no line does, lines with a numeric "ttft" field are
# counted instead.
# Globals:   none
# Arguments: $1 - label shown in the report
#            $2 - run output directory containing metrics.jsonl
# Outputs:   report on stdout (plus metrics.summary.json if present)
# Returns:   0
#######################################
print_summary() {
  local label=$1
  local output_dir=$2
  local metrics="$output_dir/metrics.jsonl"

  if [[ ! -f "$metrics" ]]; then
    echo " [$label] No metrics file found!"
    return
  fi

  # Count total, success, error from metrics JSONL.
  # grep -c prints its count AND exits 1 when that count is 0, so each count
  # is captured on its own; chaining them with || would concatenate several
  # numbers into one variable and break the arithmetic below.
  local total success errors
  local rate="N/A"
  # grep -c '' also counts a final line that lacks a trailing newline.
  total=$(grep -c '' "$metrics" || true)
  total=${total:-0}
  success=$(grep -c -e '"error":null' -e '"error": null' "$metrics" || true)
  success=${success:-0}
  if ((success == 0)); then
    success=$(grep -c '"ttft":[0-9]' "$metrics" || true)
    success=${success:-0}
  fi
  errors=$((total - success))
  if ((total > 0)); then
    rate=$(awk -v ok="$success" -v all="$total" 'BEGIN { printf "%.1f", (ok / all) * 100 }')
  fi

  echo " [$label]"
  echo " Total requests: $total"
  echo " Successful: $success"
  echo " Errors: $errors"
  echo " Success rate: ${rate}%"

  # Print summary.json if it exists
  local summary="$output_dir/metrics.summary.json"
  if [[ -f "$summary" ]]; then
    echo " Summary: $(cat "$summary")"
  fi
}
|
|
|
|
# ─── Main ────────────────────────────────────────────────────────────────────

echo "================================================================"
echo " Elastic P2P Stability Test"
echo " $(date)"
echo " Model: $MODEL"
echo " Requests: $REQUEST_LIMIT"
echo " Output: elastic → $OUT_ELASTIC"
echo " baseline → $OUT_BASELINE"
echo "================================================================"

# Sanity checks: fail fast, before any process is killed or launched.
if [[ ! -f "$TRACE" ]]; then
  echo "[ERROR] Trace file not found: $TRACE"
  exit 1
fi
if [[ ! -x "$PYTHON" ]]; then
  echo "[ERROR] Python not found: $PYTHON"
  exit 1
fi
# The servers are started in the background with their output redirected to
# log files, so a missing vllm binary would otherwise only surface after the
# health-check wait times out.
if [[ ! -x "$VLLM" ]]; then
  echo "[ERROR] vLLM not found: $VLLM"
  exit 1
fi
|
|
|
|
# ─── Phase 1: Elastic P2P offload ────────────────────────────────────────────

echo ""
echo "############################################################"
echo " Phase 1: Elastic P2P Offload"
echo "############################################################"

kill_all

# From here on servers are running in the background. If the script aborts
# (a failing step under `set -e`, or an interrupt), stop them so they do not
# keep holding the GPUs. A normal exit (status 0) skips this, because the
# script performs its own final cleanup.
cleanup_on_failure() {
  local status=$?
  if ((status != 0)); then
    echo "[abort] Exiting with status $status; stopping vLLM and proxy processes"
    kill_all || true
  fi
}
trap cleanup_on_failure EXIT

launch_vllm_kv_both
launch_proxy_elastic
collect_gpu_util "$OUT_ELASTIC"
run_benchmark "elastic_p2p" "$OUT_ELASTIC"

echo ""
echo "[phase1] Saving APC stats..."
# Scrape the cache-related lines from each instance's /metrics endpoint.
# Best-effort: an unreachable instance or an empty match is not an error.
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  curl -sf "http://127.0.0.1:$port/metrics" 2>/dev/null \
    | grep -E 'vllm:cache_hit|prefix_cache' \
    >> "$OUT_ELASTIC/apc_metrics.txt" 2>/dev/null || true
done
|
|
|
|
# ─── Phase 2: Baseline (no offload) ─────────────────────────────────────────

printf '%s\n' \
  "" \
  "############################################################" \
  " Phase 2: Baseline (no offload, no Mooncake)" \
  "############################################################"

# Tear down the previous phase before bringing up the plain configuration.
kill_all
launch_vllm_baseline
launch_proxy_baseline
collect_gpu_util "$OUT_BASELINE"
run_benchmark "baseline" "$OUT_BASELINE"

printf '%s\n' "" "[phase2] Saving APC stats..."
# Append the cache-related lines of every instance's /metrics output to one
# file. Failures are ignored on purpose (best-effort collection).
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  curl -sf "http://127.0.0.1:$port/metrics" 2>/dev/null \
    | grep -E 'vllm:cache_hit|prefix_cache' \
    >> "$OUT_BASELINE/apc_metrics.txt" 2>/dev/null || true
done
|
|
|
|
# ─── Cleanup ─────────────────────────────────────────────────────────────────

kill_all

# ─── Comparison ──────────────────────────────────────────────────────────────

printf '%s\n' \
  "" \
  "================================================================" \
  " Results Comparison" \
  "================================================================"

# Per-run success statistics, elastic first, separated by a blank line.
print_summary "Elastic P2P" "$OUT_ELASTIC"
printf '\n'
print_summary "Baseline" "$OUT_BASELINE"

# Pointers to the raw outputs and the follow-up analysis command.
printf '%s\n' \
  "" \
  "Detailed outputs:" \
  " Elastic: $OUT_ELASTIC/" \
  " Baseline: $OUT_BASELINE/" \
  "" \
  "Breakdown analysis:" \
  " python scripts/analyze_breakdown.py $OUT_ELASTIC/breakdown.json" \
  "" \
  "================================================================" \
  " Done. $(date)" \
  "================================================================"
|