agentic-kvc/scripts/bench.sh

#!/bin/bash
# Standardized single-experiment harness with guaranteed fresh state.
#
# GUARANTEES:
#   1. All GPU processes killed before start (verified via nvidia-smi)
#   2. All GPU processes killed after finish (clean for next experiment)
#   3. Fresh vLLM instances + proxy for every run
#   4. All outputs saved to outputs/<tag>/ with metrics, breakdown, APC, GPU snapshot
#
# Usage:
#   bash scripts/bench.sh --tag my_experiment --mode baseline
#   bash scripts/bench.sh --tag my_experiment --mode elastic
#   bash scripts/bench.sh --tag my_experiment --mode baseline --policy lmetric
#   bash scripts/bench.sh --tag my_experiment --mode elastic --requests 1000

# Strict mode: stop on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -e -u -o pipefail

# Directory containing this script, and the project root one level above it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJECT_DIR=$(dirname "$SCRIPT_DIR")

# Tool locations. VENV_PATH, when set, replaces the in-project .venv/bin.
VENV=${VENV_PATH:-$PROJECT_DIR/.venv/bin}
PYTHON=$VENV/python
VLLM=$VENV/vllm

# Model directory (MODEL_PATH overrides) and replay trace (a non-empty TRACE
# already present in the environment is kept as-is).
MODEL=${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}
: "${TRACE:=$PROJECT_DIR/traces/w600_r0.0015_st30.jsonl}"

# Defaults. Most are replaced by the command-line flag named in the comment;
# BASE_PORT and PROXY_PORT have no flag.

# --tag (required): names the outputs/<tag>/ directory
TAG=
# --mode: baseline | elastic | pdsep
MODE=baseline
# --policy: linear | lmetric | unified; POLICY_SET records an explicit --policy
POLICY=linear
POLICY_SET=false
# --instances: instance i listens on BASE_PORT + i
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090
# --requests: empty = all requests in trace
REQUESTS=
# --heavy-threshold / --no-offload
HEAVY_THRESHOLD=20000
NO_OFFLOAD=false
# Optional tuning flags; an empty value means the flag is not forwarded
OVERLOAD_FACTOR_ARG=
MAX_BATCHED_TOKENS=
MAX_OFFLOAD_INFLIGHT=
CACHE_GATE_RATIO=
OFFLOAD_MODE=
# --pd-ratio: P:D split when MODE=pdsep
PD_RATIO=4:4
# --eager: add --enforce-eager back (cuda-graph ablation)
EAGER=false

# Parse args
#
# parse_args ARG...
#   Copies command-line flags into the global settings (TAG, MODE, POLICY,
#   POLICY_SET, N_INSTANCES, REQUESTS, TRACE, HEAVY_THRESHOLD, NO_OFFLOAD,
#   OVERLOAD_FACTOR_ARG, MAX_BATCHED_TOKENS, MAX_OFFLOAD_INFLIGHT,
#   CACHE_GATE_RATIO, OFFLOAD_MODE, PD_RATIO, EAGER).
#   Exits with status 1 on an unknown flag, or when a flag that takes a value
#   is the last argument.
parse_args() {
    while [[ $# -gt 0 ]]; do
        # Flags that consume the next argument. Checking here gives a clear
        # message instead of an "unbound variable" abort on "$2" under set -u.
        case "$1" in
            --tag|--mode|--policy|--instances|--requests|--trace|\
            --heavy-threshold|--overload-factor|--max-batched-tokens|\
            --max-offload-inflight|--cache-gate-ratio|--offload-mode|--pd-ratio)
                if [[ $# -lt 2 ]]; then
                    echo "[ERROR] Option $1 requires a value"
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            --tag) TAG="$2"; shift 2 ;;
            --mode) MODE="$2"; shift 2 ;;
            --policy) POLICY="$2"; POLICY_SET=true; shift 2 ;;
            --instances) N_INSTANCES="$2"; shift 2 ;;
            --requests) REQUESTS="$2"; shift 2 ;;
            --trace) TRACE="$2"; shift 2 ;;
            --heavy-threshold) HEAVY_THRESHOLD="$2"; shift 2 ;;
            --no-offload) NO_OFFLOAD=true; shift ;;
            --overload-factor) OVERLOAD_FACTOR_ARG="$2"; shift 2 ;;
            --max-batched-tokens) MAX_BATCHED_TOKENS="$2"; shift 2 ;;
            --max-offload-inflight) MAX_OFFLOAD_INFLIGHT="$2"; shift 2 ;;
            --cache-gate-ratio) CACHE_GATE_RATIO="$2"; shift 2 ;;
            --offload-mode) OFFLOAD_MODE="$2"; shift 2 ;;
            --pd-ratio) PD_RATIO="$2"; shift 2 ;;
            --eager) EAGER=true; shift ;;
            *) echo "Unknown: $1"; exit 1 ;;
        esac
    done
}

parse_args "$@"

# --tag is mandatory; print the usage text and stop when it is missing.
if [[ -z $TAG ]]; then
    cat << 'USAGE'
Usage: bench.sh --tag NAME --mode {baseline|elastic|pdsep}
                [--policy {linear|lmetric|unified}] [--instances N]
                [--pd-ratio P:D]   (only with --mode pdsep, default 4:4)
                [--eager]          (re-enable --enforce-eager for the cuda-graph ablation)
                [--requests N] [--trace PATH]
  Trace QPS is controlled by sample_trace.py --sample-ratio, not by bench.sh.
USAGE
    exit 1
fi

# Elastic runs use the "unified" policy unless --policy was given explicitly.
if [[ $MODE == elastic && $POLICY_SET == false ]]; then
    POLICY=unified
fi

# validate_pd_ratio
#   Splits PD_RATIO ("P:D") into N_P_INST and N_D_INST and checks the pair
#   against N_INSTANCES.
#   Globals read:    PD_RATIO, N_INSTANCES
#   Globals written: N_P_INST, N_D_INST
#   Exits 1 when PD_RATIO is not two positive integers joined by ':' (so
#   "8:0", "0:8" or "abc" are rejected up front instead of passing the sum
#   check or crashing in arithmetic), or when P + D differs from N_INSTANCES.
validate_pd_ratio() {
    if [[ ! $PD_RATIO =~ ^([1-9][0-9]*):([1-9][0-9]*)$ ]]; then
        echo "[ERROR] --pd-ratio must be P:D with positive integers (got '$PD_RATIO')"
        exit 1
    fi
    N_P_INST=${BASH_REMATCH[1]}
    N_D_INST=${BASH_REMATCH[2]}
    # Written as "not equal" so a non-numeric N_INSTANCES (where the test
    # itself errors out) is treated as a mismatch rather than silently passing.
    if ! [ $((N_P_INST + N_D_INST)) -eq "$N_INSTANCES" ]; then
        echo "[ERROR] --pd-ratio $PD_RATIO must sum to --instances $N_INSTANCES"
        exit 1
    fi
}

if [ "$MODE" = "pdsep" ]; then
    validate_pd_ratio
fi

# Per-tag output directory. A metrics.jsonl already sitting in it means the
# tag has produced data before, so refuse to run rather than overwrite it.
OUTDIR=$PROJECT_DIR/outputs/$TAG
if [[ -d $OUTDIR && -f $OUTDIR/metrics.jsonl ]]; then
    echo "[ERROR] Output directory $OUTDIR already exists with data. Use a different --tag."
    exit 1
fi
mkdir -p "$OUTDIR"

# Save experiment config
#
# write_config
#   Writes $OUTDIR/config.json describing this run. In addition to
#   tag/mode/policy/model it records the trace, the P:D split, the eager
#   switch and the optional proxy tuning flags, so the settings that shape a
#   run can be read back from its output directory.
#   Optional flags that were not given are stored as "default" (for
#   overload_factor the fallback text is "2.0"); pd_ratio is "n/a" unless
#   MODE is pdsep.
write_config() {
    local pd_ratio_cfg="n/a"
    if [ "$MODE" = "pdsep" ]; then
        pd_ratio_cfg="$PD_RATIO"
    fi

    cat > "$OUTDIR/config.json" << CONF
{
    "tag": "$TAG",
    "mode": "$MODE",
    "policy": "$POLICY",
    "model": "$MODEL",
    "trace": "$TRACE",
    "n_instances": $N_INSTANCES,
    "pd_ratio": "$pd_ratio_cfg",
    "requests": "${REQUESTS:-all}",
    "heavy_threshold": $HEAVY_THRESHOLD,
    "no_offload": "$NO_OFFLOAD",
    "eager": "$EAGER",
    "overload_factor": "${OVERLOAD_FACTOR_ARG:-2.0}",
    "max_batched_tokens": "${MAX_BATCHED_TOKENS:-default}",
    "max_offload_inflight": "${MAX_OFFLOAD_INFLIGHT:-default}",
    "cache_gate_ratio": "${CACHE_GATE_RATIO:-default}",
    "offload_mode": "${OFFLOAD_MODE:-default}",
    "timestamp": "$(date -Iseconds)",
    "hostname": "$(hostname)"
}
CONF
}

write_config

# ─── GPU Cleanup (verified) ────────────────────────────────────────────────

# cleanup_gpu
#   Kills every benchmark-related process and checks that the GPUs are free.
#     1. SIGKILL each process whose command line matches "vllm serve",
#        "cache_aware_proxy" or "gpu_monitor" (this script's own shell is
#        skipped).
#     2. SIGKILL whatever still holds a /dev/nvidia* node open (per fuser).
#     3. Sum memory.used over all GPUs as reported by nvidia-smi.
#   Exits the script with status 1 when nvidia-smi cannot be queried or when
#   the summed value exceeds 100.
cleanup_gpu() {
    echo "[cleanup] Killing all vLLM/proxy/monitor processes..."
    local pid
    # pgrep -f matches the full command line and never reports itself. Our own
    # shell is skipped as well: a --tag or path containing e.g. "gpu_monitor"
    # would otherwise make the script kill itself mid-run.
    for pid in $(pgrep -f 'vllm serve|cache_aware_proxy|gpu_monitor' 2>/dev/null || true); do
        if [ "$pid" = "$$" ] || [ "$pid" = "${BASHPID:-$$}" ]; then
            continue
        fi
        kill -9 "$pid" 2>/dev/null || true
    done
    sleep 3

    local gpu_pids
    gpu_pids=$(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u | grep -v '^$' || true)
    if [ -n "$gpu_pids" ]; then
        echo "[cleanup] Killing GPU-holding PIDs: $gpu_pids"
        echo "$gpu_pids" | xargs -r kill -9 2>/dev/null || true
        sleep 5
    fi

    # Query first, sum second: a failing nvidia-smi is reported explicitly
    # instead of aborting silently through pipefail + set -e.
    local smi_out used
    if ! smi_out=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null); then
        echo "[ERROR] nvidia-smi failed; cannot verify that GPUs are free. Aborting."
        exit 1
    fi
    used=$(awk '{s+=$1} END{print s+0}' <<< "$smi_out")
    if [ "$used" -gt 100 ]; then
        echo "[ERROR] GPUs still have ${used}MB allocated after cleanup. Aborting."
        nvidia-smi --query-gpu=index,memory.used --format=csv,noheader
        exit 1
    fi
    echo "[cleanup] All GPUs verified free."
}

# On INT/TERM only announce the signal and exit; the exit then fires the EXIT
# trap, which performs the cleanup. Calling cleanup_gpu in both handlers made
# an interrupted run clean up (and sleep) twice.
trap 'echo "[bench.sh] Caught signal, cleaning up..."; exit 1' INT TERM
trap 'cleanup_gpu' EXIT

# ─── Launch vLLM instances ─────────────────────────────────────────────────

# launch_instances
#   Starts N_INSTANCES `vllm serve` processes in the background and blocks
#   until all of them answer.
#   Globals read: N_INSTANCES, MODE, BASE_PORT, MODEL, VLLM, OUTDIR,
#                 MAX_BATCHED_TOKENS, EAGER; env AGENTIC_STEP_LOG_DIR (optional)
#   Instance i gets CUDA_VISIBLE_DEVICES=i, port BASE_PORT+i, MASTER_PORT
#   29500+i and logs to $OUTDIR/vllm_inst_<i>.log.
#   Exits 1 (after cleanup_gpu) when an instance process disappears during
#   startup, when /health does not answer after 120 polls 5 s apart, or when
#   a Mooncake bootstrap port does not answer after 60 polls 2 s apart.
launch_instances() {
    echo "[launch] Starting $N_INSTANCES vLLM instances (mode=$MODE)..."

    # elastic and pdsep both run Mooncake kv_both; difference is only the
    # proxy routing. baseline runs plain vLLM (no Mooncake).
    local use_mooncake=false
    if [ "$MODE" = "elastic" ] || [ "$MODE" = "pdsep" ]; then
        use_mooncake=true
    fi

    # Flags shared by every instance. Arrays keep each argument intact even
    # when a path or value contains whitespace.
    local -a serve_flags=(
        --tensor-parallel-size 1
        --trust-remote-code --enable-prefix-caching
        --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
    )
    if [ "$use_mooncake" = "true" ]; then
        serve_flags+=(--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}')
    fi
    if [ -n "$MAX_BATCHED_TOKENS" ]; then
        serve_flags+=(--max-num-batched-tokens "$MAX_BATCHED_TOKENS")
    fi
    if [ "$EAGER" = "true" ]; then
        serve_flags+=(--enforce-eager)
    fi

    # Optional: when AGENTIC_STEP_LOG_DIR is exported, point each engine at its
    # own JSONL file so the patched scheduler emits per-step records.
    local step_log_dir="${AGENTIC_STEP_LOG_DIR:-}"
    if [ -n "$step_log_dir" ]; then
        mkdir -p "$step_log_dir"
    fi

    local i port
    local -a inst_pids=()
    local -a inst_env
    for ((i = 0; i < N_INSTANCES; i++)); do
        port=$((BASE_PORT + i))
        inst_env=("MASTER_PORT=$((29500 + i))" "CUDA_VISIBLE_DEVICES=$i")
        if [ "$use_mooncake" = "true" ]; then
            inst_env+=("PYTHONHASHSEED=42" "VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i))")
        fi
        if [ -n "$step_log_dir" ]; then
            inst_env+=(
                "AGENTIC_STEP_LOG_PATH=$step_log_dir/engine_${i}.jsonl"
                "AGENTIC_WORKER_ID=engine_${i}"
            )
        fi

        env "${inst_env[@]}" "$VLLM" serve "$MODEL" \
            --host 0.0.0.0 --port "$port" \
            "${serve_flags[@]}" \
            > "$OUTDIR/vllm_inst_${i}.log" 2>&1 &
        inst_pids+=("$!")

        echo "  inst_$i: GPU=$i port=$port"
        sleep 2  # stagger to avoid port collision
    done

    # Wait for health
    echo "[launch] Waiting for instances to become healthy..."
    local tries
    for ((i = 0; i < N_INSTANCES; i++)); do
        port=$((BASE_PORT + i))
        tries=0
        while ! curl -sf "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
            # Fail fast: a server process that is already gone will never
            # become healthy, so do not poll it for the full timeout.
            if ! kill -0 "${inst_pids[i]}" 2>/dev/null; then
                echo "[FAIL] Instance $i (port $port) exited during startup. Log:"
                tail -10 "$OUTDIR/vllm_inst_${i}.log" || true
                cleanup_gpu
                exit 1
            fi
            tries=$((tries + 1))
            if [ "$tries" -ge 120 ]; then
                echo "[FAIL] Instance $i (port $port) failed to start. Log:"
                tail -10 "$OUTDIR/vllm_inst_${i}.log" || true
                cleanup_gpu
                exit 1
            fi
            sleep 5
        done
        echo "  inst_$i healthy"
    done

    # Wait for bootstrap (Mooncake modes only)
    if [ "$use_mooncake" = "true" ]; then
        echo "[launch] Waiting for Mooncake bootstrap servers..."
        local bp
        for ((i = 0; i < N_INSTANCES; i++)); do
            bp=$((8998 + i))
            tries=0
            while ! curl -sf "http://127.0.0.1:$bp/query" > /dev/null 2>&1; do
                tries=$((tries + 1))
                if [ "$tries" -ge 60 ]; then
                    echo "[FAIL] Bootstrap $bp failed"
                    cleanup_gpu
                    exit 1
                fi
                sleep 2
            done
            echo "  bootstrap $bp ready"
        done
    fi
}

# ─── Launch proxy ──────────────────────────────────────────────────────────

# launch_proxy
#   Starts scripts/cache_aware_proxy.py in the background and waits until its
#   /stats endpoint answers.
#   Globals read: MODE, POLICY, PYTHON, PROJECT_DIR, OUTDIR, PROXY_PORT,
#                 BASE_PORT, N_INSTANCES, N_P_INST (pdsep only), NO_OFFLOAD,
#                 HEAVY_THRESHOLD, OVERLOAD_FACTOR_ARG, MAX_OFFLOAD_INFLIGHT,
#                 CACHE_GATE_RATIO, OFFLOAD_MODE
#   Argument order on the proxy command line: topology flags, --port, then
#   policy/tuning flags.
#   Exits 1 (after cleanup_gpu) when the proxy process disappears during
#   startup or /stats does not answer after 30 polls 2 s apart.
launch_proxy() {
    echo "[proxy] Starting (mode=$MODE, policy=$POLICY)..."

    # Policy and optional tuning flags; each optional flag is forwarded only
    # when a value was supplied.
    local -a policy_args=(--policy "$POLICY")
    if [ -n "$OVERLOAD_FACTOR_ARG" ]; then
        policy_args+=(--overload-factor "$OVERLOAD_FACTOR_ARG")
    fi
    if [ -n "$MAX_OFFLOAD_INFLIGHT" ]; then
        policy_args+=(--max-offload-inflight "$MAX_OFFLOAD_INFLIGHT")
    fi
    if [ -n "$CACHE_GATE_RATIO" ]; then
        policy_args+=(--cache-gate-ratio "$CACHE_GATE_RATIO")
    fi
    if [ -n "$OFFLOAD_MODE" ]; then
        policy_args+=(--offload-mode "$OFFLOAD_MODE")
    fi

    # The whole command is built as an array so that paths containing
    # whitespace (PYTHON, PROJECT_DIR) stay single arguments.
    local -a proxy_cmd=("$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py")
    local i
    if [ "$MODE" = "pdsep" ]; then
        # First N_P_INST instances are prefill (with their bootstrap ports),
        # remaining N_D_INST are decode.
        for ((i = 0; i < N_P_INST; i++)); do
            proxy_cmd+=(--prefill "http://127.0.0.1:$((BASE_PORT + i))" "$((8998 + i))")
        done
        for ((i = N_P_INST; i < N_INSTANCES; i++)); do
            proxy_cmd+=(--decode "http://127.0.0.1:$((BASE_PORT + i))")
        done
    else
        proxy_cmd+=(--combined)
        for ((i = 0; i < N_INSTANCES; i++)); do
            proxy_cmd+=("http://127.0.0.1:$((BASE_PORT + i))")
        done
        if [ "$MODE" = "elastic" ]; then
            local bp_list=""
            for ((i = 0; i < N_INSTANCES; i++)); do
                bp_list="${bp_list:+$bp_list,}$((8998 + i))"
            done
            if [ "$NO_OFFLOAD" != "true" ]; then
                policy_args+=(--offload --heavy-threshold "$HEAVY_THRESHOLD")
            fi
            policy_args+=(--bootstrap-ports "$bp_list")
        fi
    fi
    proxy_cmd+=(--port "$PROXY_PORT" "${policy_args[@]}")

    "${proxy_cmd[@]}" > "$OUTDIR/proxy.log" 2>&1 &
    local proxy_pid=$!

    # Wait for proxy
    local tries=0
    while ! curl -sf "http://127.0.0.1:$PROXY_PORT/stats" > /dev/null 2>&1; do
        # Fail fast when the proxy process is already gone.
        if ! kill -0 "$proxy_pid" 2>/dev/null; then
            echo "[FAIL] Proxy exited during startup. Log:"
            tail -10 "$OUTDIR/proxy.log" || true
            cleanup_gpu
            exit 1
        fi
        tries=$((tries + 1))
        if [ "$tries" -ge 30 ]; then
            echo "[FAIL] Proxy failed to start"
            cleanup_gpu
            exit 1
        fi
        sleep 2
    done
    echo "[proxy] Ready on port $PROXY_PORT"
}

# ─── Run benchmark ─────────────────────────────────────────────────────────

# run_benchmark
#   Replays the trace against the proxy while scripts/gpu_monitor.sh samples
#   in the background.
#   Globals read:    PYTHON, PROJECT_DIR, OUTDIR, TRACE, MODEL, PROXY_PORT,
#                    REQUESTS (empty = no --request-limit)
#   Globals written: GPU_MON_PID (PID of the background monitor)
#   Outputs: $OUTDIR/metrics.jsonl (via --output), $OUTDIR/replayer.log (tee
#            of the replayer's stdout+stderr), $OUTDIR/gpu_util.csv (path
#            handed to the monitor).
#   Returns: 0 on success; otherwise the status of the replayer pipeline. The
#            monitor is stopped and reaped in both cases.
run_benchmark() {
    # Built as an array so paths with whitespace stay single arguments.
    local -a replayer_cmd=(
        "$PYTHON" -m replayer
        --trace "$TRACE"
        --output "$OUTDIR/metrics.jsonl"
        --endpoint "http://localhost:$PROXY_PORT"
        --model "$MODEL"
    )
    if [ -n "$REQUESTS" ]; then
        replayer_cmd+=(--request-limit "$REQUESTS")
        echo "[bench] Running $REQUESTS requests (trace-driven timing)..."
    else
        echo "[bench] Running all requests in trace (trace-driven timing)..."
    fi
    replayer_cmd+=(-v)

    # Start GPU monitor in background
    bash "$PROJECT_DIR/scripts/gpu_monitor.sh" "$OUTDIR/gpu_util.csv" 5 &
    GPU_MON_PID=$!

    # Capture the status instead of letting set -e abort here, so the monitor
    # below is always stopped before the failure is propagated.
    local rc=0
    "${replayer_cmd[@]}" 2>&1 | tee "$OUTDIR/replayer.log" || rc=$?

    # Stop GPU monitor
    kill "$GPU_MON_PID" 2>/dev/null || true
    wait "$GPU_MON_PID" 2>/dev/null || true

    if [ "$rc" -ne 0 ]; then
        echo "[bench] Replayer failed with status $rc"
        return "$rc"
    fi
    echo "[bench] GPU util saved: $(wc -l < "$OUTDIR/gpu_util.csv") samples"
}

# ─── Collect artifacts ─────────────────────────────────────────────────────

# collect_artifacts
#   Best-effort snapshot of post-run state into $OUTDIR:
#     breakdown.json, stats.json - bodies of the proxy's /breakdown and /stats
#     gpu_snapshot.csv           - one nvidia-smi CSV query
#     apc.txt                    - per-instance cache hit rates, taken from the
#                                  last matching line of each vLLM log
#   Failures of curl / nvidia-smi are ignored. An instance without a prefix
#   hit-rate line is reported as 0; the external rate is appended only when
#   one was found.
collect_artifacts() {
    echo "[collect] Saving artifacts..."

    local endpoint
    for endpoint in breakdown stats; do
        curl -sf "http://localhost:$PROXY_PORT/$endpoint" > "$OUTDIR/$endpoint.json" 2>/dev/null || true
    done

    nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
        --format=csv > "$OUTDIR/gpu_snapshot.csv" 2>/dev/null || true

    # APC from vLLM logs
    local idx=0 inst_log prefix_rate external_rate line
    while [ "$idx" -lt "$N_INSTANCES" ]; do
        inst_log="$OUTDIR/vllm_inst_${idx}.log"
        prefix_rate=$(grep "Prefix cache hit rate" "$inst_log" 2>/dev/null \
            | tail -n 1 \
            | grep -oP 'Prefix cache hit rate: \K[0-9.]+' || echo "0")
        external_rate=$(grep "External prefix cache hit rate" "$inst_log" 2>/dev/null \
            | tail -n 1 \
            | grep -oP 'External prefix cache hit rate: \K[0-9.]+' || echo "")

        line="inst_$idx: prefix=$prefix_rate%"
        if [ -n "$external_rate" ]; then
            line="$line ext=$external_rate%"
        fi
        printf '%s\n' "$line"
        idx=$((idx + 1))
    done | tee "$OUTDIR/apc.txt"
}

# ─── Summary ───────────────────────────────────────────────────────────────

# print_summary
#   Prints a latency summary of $OUTDIR/metrics.jsonl (one JSON object per
#   line) to stdout.
#   Globals read: PYTHON, OUTDIR, TAG, MODE, POLICY
#   TAG/MODE/POLICY/OUTDIR reach Python as argv rather than being pasted into
#   the program text, so quotes in them cannot break the script. Blank lines
#   in the metrics file are skipped and an empty file yields "OK=0/0 (0.0%)"
#   instead of a division by zero.
print_summary() {
    "$PYTHON" - "$OUTDIR/metrics.jsonl" "$TAG" "$MODE" "$POLICY" "$OUTDIR" << 'PY'
import json
import sys

metrics_path, tag, mode, policy, outdir = sys.argv[1:6]

with open(metrics_path) as fh:
    rows = [json.loads(line) for line in fh if line.strip()]
ok = [r for r in rows if not r.get('error')]
err = [r for r in rows if r.get('error')]


def p(v, q):
    # Element at index int(q * n) of the sorted values, clamped to the last
    # element; 0 for an empty list.
    return sorted(v)[min(int(q * len(v)), len(v) - 1)] if v else 0


ttfts = [r['ttft_s'] for r in ok if r.get('ttft_s')]
tpots = [r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0]
e2es = [r['latency_s'] for r in ok]
ok_pct = len(ok) * 100 / len(rows) if rows else 0.0

print()
print('=' * 70)
print('  RESULT: %s (%s, %s)' % (tag, mode, policy))
print('=' * 70)
print('  OK=%d/%d (%.1f%%)  TTFT50=%.3f  TTFT90=%.3f  TPOT90=%.4f  E2E50=%.3f' % (
    len(ok), len(rows), ok_pct, p(ttfts, .5), p(ttfts, .9), p(tpots, .9), p(e2es, .5)))

# Per-size buckets over input_length, half-open [lo, hi).
for lo, hi, cl in [(0, 5000, 'WARM'), (5000, 20000, 'MEDIUM'), (20000, 200000, 'HEAVY')]:
    sub = [r for r in ok if lo <= r['input_length'] < hi and r.get('ttft_s')]
    if sub:
        t = [r['ttft_s'] for r in sub]
        tp = [r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s'] > 0]
        print('  %-8s n=%3d  TTFT50=%.3f  TTFT90=%.3f  TPOT90=%.4f' % (
            cl, len(sub), p(t, .5), p(t, .9), p(tp, .9) if tp else 0))

if err:
    print('  Errors (%d):' % len(err))
    for e in err[:5]:
        print('    input=%d %s' % (e.get('input_length', -1), str(e.get('error', ''))[:60]))
print('  Output: %s/' % outdir)
print('=' * 70)
PY
}

# ─── Main ──────────────────────────────────────────────────────────────────

# Banner: tag, effective settings and start time.
printf '%s\n' \
    "================================================================" \
    "  bench.sh: $TAG" \
    "  mode=$MODE  policy=$POLICY  requests=${REQUESTS:-all}  overload_factor=${OVERLOAD_FACTOR_ARG:-2.0}" \
    "  $(date)" \
    "================================================================"

cd "$PROJECT_DIR"

# Stages run strictly in this order; under set -e the first failing stage
# ends the run.
for stage in cleanup_gpu launch_instances launch_proxy run_benchmark collect_artifacts print_summary; do
    "$stage"
done
# cleanup_gpu runs automatically via EXIT trap

printf '[done] %s\n' "$(date)"