#!/bin/bash
# agentic-kvc/scripts/run_elastic_stability_test.sh
#
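# Elastic P2P stability test: runs a 200-request benchmark in offload mode
# and in baseline mode, then compares success rates.
#
# Must be run on dash0 (8 GPUs with Mooncake support).
#
# Usage:
#   bash scripts/run_elastic_stability_test.sh
#
# Optional environment overrides:
#   VENV_PATH   directory containing the python and vllm executables
#               (default: <project>/.venv/bin)
#   MODEL_PATH  model passed to `vllm serve` and to the replayer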

# Strict mode: abort on errors, on unset variables and on failing pipelines.
set -o errexit -o nounset -o pipefail

# Paths. PROJECT_DIR is the parent of the directory holding this script; the
# virtualenv bin directory may be overridden through VENV_PATH.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
if [ -n "${VENV_PATH:-}" ]; then
    VENV="$VENV_PATH"
else
    VENV="$PROJECT_DIR/.venv/bin"
fi
PYTHON="$VENV/python"
VLLM="$VENV/vllm"

# Workload inputs. The model may be overridden through MODEL_PATH.
if [ -n "${MODEL_PATH:-}" ]; then
    MODEL="$MODEL_PATH"
else
    MODEL="/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct"
fi
TRACE="$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl"

# Topology and replay parameters.
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090
REQUEST_LIMIT=200
TIME_SCALE=20
MAX_INFLIGHT=8

# Both output directories share one timestamp so a run's two phases pair up.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
OUT_ELASTIC="$PROJECT_DIR/outputs/elastic_stability_${TIMESTAMP}"
OUT_BASELINE="$PROJECT_DIR/outputs/baseline_stability_${TIMESTAMP}"

# ─── Helper functions ────────────────────────────────────────────────────────

# Force-stop leftovers from a previous run: first every process whose command
# line mentions the vLLM server or the proxy, then every process that still
# has an NVIDIA device node open. All kills are SIGKILL and best effort.
kill_all() {
    local pattern victim

    echo "[cleanup] Killing vLLM and proxy processes..."
    for pattern in 'vllm serve' 'cache_aware_proxy'; do
        # `grep -v grep` removes the grep process itself from the match list.
        for victim in $(ps aux | grep "$pattern" | grep -v grep | awk '{print $2}' 2>/dev/null); do
            kill -9 "$victim" 2>/dev/null || true
        done
    done
    sleep 5

    echo "[cleanup] Releasing GPUs..."
    # fuser lists the PIDs holding the device nodes; de-duplicate before killing.
    for victim in $(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u); do
        kill -9 "$victim" 2>/dev/null || true
    done
    sleep 10

    echo "[cleanup] Done."
}

# Block until the first $1 vLLM instances (ports BASE_PORT, BASE_PORT+1, ...)
# each answer GET /health. Instances are checked in order; each one gets up to
# 120 probes spaced 5 seconds apart.
# Returns 1 as soon as one instance exhausts its probes.
wait_for_instances() {
    local n=$1
    local idx port attempts

    echo "[wait] Waiting for $n vLLM instances to become healthy..."
    for ((idx = 0; idx < n; idx++)); do
        port=$((BASE_PORT + idx))
        attempts=0
        until curl -sf "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
            attempts=$((attempts + 1))
            if ((attempts >= 120)); then
                echo "[FAIL] Instance $idx (port $port) did not start in 600s"
                return 1
            fi
            sleep 5
        done
        echo "  Instance $idx (port $port) healthy"
    done
}

# Block until the bootstrap endpoint of every instance (ports 8998, 8999, ...
# one per N_INSTANCES) answers GET /query. Each port gets up to 60 probes
# spaced 2 seconds apart.
# Returns 1 as soon as one port exhausts its probes.
wait_for_bootstrap() {
    local idx bootstrap_port attempts

    echo "[wait] Waiting for Mooncake bootstrap servers..."
    for ((idx = 0; idx < N_INSTANCES; idx++)); do
        bootstrap_port=$((8998 + idx))
        attempts=0
        until curl -sf "http://127.0.0.1:$bootstrap_port/query" > /dev/null 2>&1; do
            attempts=$((attempts + 1))
            if ((attempts >= 60)); then
                echo "[FAIL] Bootstrap $bootstrap_port did not start in 120s"
                return 1
            fi
            sleep 2
        done
        echo "  Bootstrap $bootstrap_port ready"
    done
}

# Block until the proxy on PROXY_PORT answers GET /stats, probing up to 30
# times with a 2 second pause between probes.
# Returns 1 when the last probe also fails.
wait_for_proxy() {
    local attempt

    echo "[wait] Waiting for proxy on port $PROXY_PORT..."
    for ((attempt = 1; ; attempt++)); do
        if curl -sf "http://127.0.0.1:$PROXY_PORT/stats" > /dev/null 2>&1; then
            break
        fi
        if ((attempt >= 30)); then
            echo "[FAIL] Proxy did not start in 60s"
            return 1
        fi
        sleep 2
    done
    echo "  Proxy ready"
}

# Start N_INSTANCES vLLM servers in the background, one per GPU, each configured
# with the MooncakeConnector in the kv_both role, then call wait_for_instances
# and wait_for_bootstrap.
#
# Globals read: N_INSTANCES, BASE_PORT, VLLM, MODEL
# Per-instance layout (i = 0 .. N_INSTANCES-1):
#   HTTP port       BASE_PORT + i
#   bootstrap port  8998 + i   (passed as VLLM_MOONCAKE_BOOTSTRAP_PORT)
#   MASTER_PORT     29500 + i
#   GPU             CUDA_VISIBLE_DEVICES=i
#   log file        /tmp/elastic_test_<i>.log
launch_vllm_kv_both() {
    local i port bp master log

    echo ""
    echo "=== Launching $N_INSTANCES vLLM instances (kv_both) ==="
    for ((i = 0; i < N_INSTANCES; i++)); do
        port=$((BASE_PORT + i))
        bp=$((8998 + i))
        master=$((29500 + i))
        log="/tmp/elastic_test_${i}.log"

        # "$VLLM" is quoted so a virtualenv path containing whitespace is still
        # executed as a single command word.
        VLLM_MOONCAKE_BOOTSTRAP_PORT=$bp \
        MASTER_PORT=$master \
        CUDA_VISIBLE_DEVICES=$i \
        "$VLLM" serve "$MODEL" \
            --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 \
            --trust-remote-code --enable-prefix-caching --enforce-eager \
            --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
            --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
            > "$log" 2>&1 &

        echo "  Instance $i: GPU=$i port=$port bootstrap=$bp log=$log"
        # Stagger the launches by two seconds.
        sleep 2
    done
    wait_for_instances "$N_INSTANCES"
    wait_for_bootstrap
}

# Start N_INSTANCES plain vLLM servers in the background, one per GPU, without
# any KV-transfer configuration, then call wait_for_instances.
#
# Globals read: N_INSTANCES, BASE_PORT, VLLM, MODEL
# Per-instance layout (i = 0 .. N_INSTANCES-1):
#   HTTP port    BASE_PORT + i
#   MASTER_PORT  29500 + i
#   GPU          CUDA_VISIBLE_DEVICES=i
#   log file     /tmp/baseline_test_<i>.log
launch_vllm_baseline() {
    local i port master log

    echo ""
    echo "=== Launching $N_INSTANCES vLLM instances (baseline, no Mooncake) ==="
    for ((i = 0; i < N_INSTANCES; i++)); do
        port=$((BASE_PORT + i))
        master=$((29500 + i))
        log="/tmp/baseline_test_${i}.log"

        # "$VLLM" is quoted so a virtualenv path containing whitespace is still
        # executed as a single command word.
        MASTER_PORT=$master \
        CUDA_VISIBLE_DEVICES=$i \
        "$VLLM" serve "$MODEL" \
            --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 \
            --trust-remote-code --enable-prefix-caching --enforce-eager \
            --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
            > "$log" 2>&1 &

        echo "  Instance $i: GPU=$i port=$port log=$log"
        # Stagger the launches by two seconds.
        sleep 2
    done
    wait_for_instances "$N_INSTANCES"
}

# Start cache_aware_proxy.py in the background in offload mode, pointing it at
# every vLLM instance and at the matching bootstrap ports, then call
# wait_for_proxy. Proxy output goes to /tmp/proxy_elastic.log.
#
# Globals read: N_INSTANCES, BASE_PORT, PROXY_PORT, PYTHON, PROJECT_DIR
launch_proxy_elastic() {
    local -a backends=()
    local bp_list=""
    local i

    echo ""
    echo "=== Starting proxy (elastic offload mode) ==="
    for ((i = 0; i < N_INSTANCES; i++)); do
        backends+=("http://127.0.0.1:$((BASE_PORT + i))")
        # Comma-joined list: the separator is only emitted once the list is non-empty.
        bp_list="${bp_list:+$bp_list,}$((8998 + i))"
    done

    # The backend URLs travel as an array and "$PYTHON" is quoted, so no part of
    # the command line depends on word splitting.
    "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
        --combined "${backends[@]}" \
        --bootstrap-ports "$bp_list" \
        --offload --heavy-threshold 20000 \
        --port "$PROXY_PORT" \
        > /tmp/proxy_elastic.log 2>&1 &
    wait_for_proxy
}

# Start cache_aware_proxy.py in the background without the offload options,
# pointing it at every vLLM instance, then call wait_for_proxy. Proxy output
# goes to /tmp/proxy_baseline.log.
#
# Globals read: N_INSTANCES, BASE_PORT, PROXY_PORT, PYTHON, PROJECT_DIR
launch_proxy_baseline() {
    local -a backends=()
    local i

    echo ""
    echo "=== Starting proxy (baseline, no offload) ==="
    for ((i = 0; i < N_INSTANCES; i++)); do
        backends+=("http://127.0.0.1:$((BASE_PORT + i))")
    done

    # The backend URLs travel as an array and "$PYTHON" is quoted, so no part of
    # the command line depends on word splitting.
    "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
        --combined "${backends[@]}" \
        --port "$PROXY_PORT" \
        > /tmp/proxy_baseline.log 2>&1 &
    wait_for_proxy
}

# Replay the trace against the proxy and store the results.
#
# Arguments:
#   $1  tag shown in the progress banner
#   $2  output directory (created if missing)
# Globals read: PYTHON, TRACE, MODEL, PROXY_PORT, TIME_SCALE, MAX_INFLIGHT,
#               REQUEST_LIMIT
# Files written in $2:
#   metrics.jsonl    passed to the replayer as --output
#   replayer.log     copy of the replayer's stdout and stderr
#   breakdown.json   proxy /breakdown response (best effort)
#   stats.json       proxy /stats response (best effort)
run_benchmark() {
    local tag=$1
    local output_dir=$2
    mkdir -p "$output_dir"

    echo ""
    echo "=== Running benchmark: $tag ($REQUEST_LIMIT requests) ==="
    # "$PYTHON" is quoted so a virtualenv path containing whitespace is still
    # executed as a single command word.
    "$PYTHON" -m replayer \
        --trace "$TRACE" \
        --output "$output_dir/metrics.jsonl" \
        --endpoint "http://localhost:$PROXY_PORT" \
        --model "$MODEL" \
        --time-scale "$TIME_SCALE" \
        --max-inflight-sessions "$MAX_INFLIGHT" \
        --request-limit "$REQUEST_LIMIT" \
        -v 2>&1 | tee "$output_dir/replayer.log"

    # Proxy snapshots are optional extras: a failed fetch must not abort the run.
    curl -sf "http://localhost:$PROXY_PORT/breakdown" > "$output_dir/breakdown.json" 2>/dev/null || true
    curl -sf "http://localhost:$PROXY_PORT/stats" > "$output_dir/stats.json" 2>/dev/null || true
}

# Write a one-shot nvidia-smi snapshot (index, utilisation, memory used/total)
# as CSV to <output_dir>/gpu_snapshot.csv.
#
# Arguments:
#   $1  output directory (created if missing)
collect_gpu_util() {
    local output_dir=$1

    # The directory may not exist yet when this runs before the benchmark;
    # without it the redirection below fails and the snapshot is lost.
    mkdir -p "$output_dir"

    # Best effort: a missing or failing nvidia-smi must not abort the run.
    nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
        --format=csv > "$output_dir/gpu_snapshot.csv" 2>/dev/null || true
}

# Print request totals and the success rate for one benchmark run.
#
# Arguments:
#   $1  label shown in the report
#   $2  output directory containing metrics.jsonl
#
# A line counts as successful when it carries a null "error" field. If no line
# matches that form, lines with a numeric "ttft" field are counted instead.
# When metrics.summary.json exists in the directory its content is echoed too.
print_summary() {
    local label=$1
    local output_dir=$2
    local metrics="$output_dir/metrics.jsonl"
    local total success errors rate summary

    if [ ! -f "$metrics" ]; then
        echo "  [$label] No metrics file found!"
        return
    fi

    # grep -c prints the count even when it is 0 but then exits 1, so the exit
    # status is discarded and the printed number is the only thing inspected.
    # Counting with an empty pattern also includes a final unterminated line.
    total=$(grep -c '' "$metrics" 2>/dev/null || true)
    total=${total:-0}

    success=$(grep -c -E '"error": ?null' "$metrics" 2>/dev/null || true)
    if [ "${success:-0}" -eq 0 ]; then
        # Fallback for metrics that carry no "error" field.
        success=$(grep -c '"ttft":[0-9]' "$metrics" 2>/dev/null || true)
    fi
    success=${success:-0}

    errors=$((total - success))
    rate="N/A"
    if [ "$total" -gt 0 ]; then
        # Values are handed to awk as variables rather than spliced into the program text.
        rate=$(awk -v ok="$success" -v all="$total" 'BEGIN { printf "%.1f", (ok / all) * 100 }')
    fi

    echo "  [$label]"
    echo "    Total requests: $total"
    echo "    Successful:     $success"
    echo "    Errors:         $errors"
    echo "    Success rate:   ${rate}%"

    # Print summary.json if it exists
    summary="$output_dir/metrics.summary.json"
    if [ -f "$summary" ]; then
        echo "    Summary: $(cat "$summary")"
    fi
}

# ─── Main ────────────────────────────────────────────────────────────────────

# Append the cache-hit / prefix-cache lines from every vLLM instance's /metrics
# endpoint to <output_dir>/apc_metrics.txt.
# Best effort: an unreachable instance or an empty match is ignored.
#
# Arguments:
#   $1  output directory (must already exist)
save_apc_metrics() {
    local output_dir=$1
    local idx
    for ((idx = 0; idx < N_INSTANCES; idx++)); do
        curl -sf "http://127.0.0.1:$((BASE_PORT + idx))/metrics" 2>/dev/null \
            | grep -E 'vllm:cache_hit|prefix_cache' \
            >> "$output_dir/apc_metrics.txt" 2>/dev/null || true
    done
}

echo "================================================================"
echo "  Elastic P2P Stability Test"
echo "  $(date)"
echo "  Model:    $MODEL"
echo "  Requests: $REQUEST_LIMIT"
echo "  Output:   elastic → $OUT_ELASTIC"
echo "            baseline → $OUT_BASELINE"
echo "================================================================"

# Sanity checks: fail immediately instead of after the long health-check wait.
if [ ! -f "$TRACE" ]; then
    echo "[ERROR] Trace file not found: $TRACE"
    exit 1
fi
if [ ! -x "$PYTHON" ]; then
    echo "[ERROR] Python not found: $PYTHON"
    exit 1
fi
if [ ! -x "$VLLM" ]; then
    echo "[ERROR] vllm not found: $VLLM"
    exit 1
fi

# ─── Phase 1: Elastic P2P offload ────────────────────────────────────────────

echo ""
echo "############################################################"
echo "  Phase 1: Elastic P2P Offload"
echo "############################################################"

kill_all
launch_vllm_kv_both
launch_proxy_elastic
collect_gpu_util "$OUT_ELASTIC"
run_benchmark "elastic_p2p" "$OUT_ELASTIC"

echo ""
echo "[phase1] Saving APC stats..."
save_apc_metrics "$OUT_ELASTIC"

# ─── Phase 2: Baseline (no offload) ─────────────────────────────────────────

echo ""
echo "############################################################"
echo "  Phase 2: Baseline (no offload, no Mooncake)"
echo "############################################################"

# Tear the elastic deployment down before the baseline servers reuse the same
# ports and GPUs.
kill_all
launch_vllm_baseline
launch_proxy_baseline
collect_gpu_util "$OUT_BASELINE"
run_benchmark "baseline" "$OUT_BASELINE"

echo ""
echo "[phase2] Saving APC stats..."
save_apc_metrics "$OUT_BASELINE"

# ─── Cleanup ─────────────────────────────────────────────────────────────────

kill_all

# ─── Comparison ──────────────────────────────────────────────────────────────

echo ""
echo "================================================================"
echo "  Results Comparison"
echo "================================================================"

print_summary "Elastic P2P" "$OUT_ELASTIC"
echo ""
print_summary "Baseline" "$OUT_BASELINE"

echo ""
echo "Detailed outputs:"
echo "  Elastic: $OUT_ELASTIC/"
echo "  Baseline: $OUT_BASELINE/"
echo ""
echo "Breakdown analysis:"
echo "  python scripts/analyze_breakdown.py $OUT_ELASTIC/breakdown.json"
echo ""
echo "================================================================"
echo "  Done. $(date)"
echo "================================================================"