agentic-kvc/scripts/run_lmetric_ab.sh

#!/bin/bash
# A/B comparison: linear (current baseline) vs lmetric (OSDI'26) routing policy.
# Both use same 8× TP=1 combined instances, fresh restart between experiments.
set -euo pipefail

# ---- Filesystem layout -------------------------------------------------
# Project checkout, model weights, and the request trace that is replayed.
PROJECT_DIR="/home/admin/cpfs/wjh/agentic-kv"
MODEL="/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct"
TRACE="${PROJECT_DIR}/traces/sampled_1000req_seed42.jsonl"

# Executables are taken from the project's virtualenv.
VENV="${PROJECT_DIR}/.venv/bin"
PYTHON="${VENV}/python"
VLLM="${VENV}/vllm"

# ---- Serving topology --------------------------------------------------
N_INSTANCES=8      # number of vLLM servers to launch
BASE_PORT=8000     # instance i listens on BASE_PORT + i
PROXY_PORT=9090    # port the routing proxy listens on

# ---- Replayer settings -------------------------------------------------
REQUEST_LIMIT=200  # passed to the replayer as --request-limit
TIME_SCALE=20      # passed to the replayer as --time-scale
MAX_SESSIONS=8     # passed to the replayer as --max-inflight-sessions

# Hard-reset the test bed before an experiment.
#
# Force-kills (SIGKILL) every process whose command line matches
# 'vllm serve' or 'cache_aware_proxy', sleeps 5 s, then force-kills every
# process that still has a /dev/nvidia* device node open and sleeps 10 s.
#
# NOTE: the /dev/nvidia* sweep is not restricted to processes started by
# this script; any process holding one of those device nodes is killed.
#
# Every kill is best-effort. A target may exit between being listed and
# being signalled; under `set -e` an unguarded failing `kill` would abort
# the whole script, so each one is followed by `|| true`.
#
# Globals:   none
# Arguments: none
# Returns:   0
cleanup() {
    local pid pattern

    for pattern in 'vllm serve' 'cache_aware_proxy'; do
        # pgrep never reports itself, so no `grep -v grep` filter is needed.
        # It exits 1 when nothing matches; that is not an error here.
        for pid in $(pgrep -f "$pattern" || true); do
            kill -9 "$pid" 2>/dev/null || true
        done
    done
    sleep 5

    # fuser prints the PIDs on stdout separated by spaces; split them onto
    # separate lines and de-duplicate before signalling.
    for pid in $(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u || true); do
        kill -9 "$pid" 2>/dev/null || true
    done
    sleep 10
}

# Launch N_INSTANCES vLLM servers in the background and block until each
# one answers on its /v1/models endpoint.
#
# Instance i is started with CUDA_VISIBLE_DEVICES=i, MASTER_PORT=29500+i
# and listens on BASE_PORT+i; its combined stdout/stderr goes to
# /tmp/lmetric_ab_inst_<i>.log.
#
# Globals:   N_INSTANCES, BASE_PORT, VLLM, MODEL (read)
# Arguments: none
# Outputs:   progress on stdout; a diagnostic on stderr if an instance
#            never becomes ready
# Returns:   0 when all instances respond; 1 if any instance does not
#            respond within 600 seconds
start_instances() {
    local i port

    echo "  Starting $N_INSTANCES vLLM instances..."
    for ((i = 0; i < N_INSTANCES; i++)); do
        port=$((BASE_PORT + i))
        # A distinct MASTER_PORT per instance, presumably so the
        # independent servers do not contend for one rendezvous port
        # (TODO confirm against vLLM's distributed init).
        MASTER_PORT=$((29500 + i)) CUDA_VISIBLE_DEVICES=$i \
        "$VLLM" serve "$MODEL" \
            --host 0.0.0.0 --port "$port" \
            --tensor-parallel-size 1 \
            --trust-remote-code --enable-prefix-caching --enforce-eager \
            --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
            > "/tmp/lmetric_ab_inst_$i.log" 2>&1 &
    done

    echo "  Waiting for instances..."
    for ((i = 0; i < N_INSTANCES; i++)); do
        port=$((BASE_PORT + i))
        # Poll every 5 s, for at most 600 s per instance. The timeout is
        # checked explicitly so that a server that never comes up produces
        # a message naming its log file instead of a silent `set -e` abort.
        if ! timeout 600 bash -c "until curl -s localhost:$port/v1/models > /dev/null 2>&1; do sleep 5; done"; then
            echo "  ERROR: instance $i (port $port) not ready after 600s; see /tmp/lmetric_ab_inst_$i.log" >&2
            return 1
        fi
        echo "    Instance $i (port $port) ready"
    done
}

# Run one benchmark pass through the routing proxy and collect its results.
#
# Steps: start cache_aware_proxy.py in front of the N_INSTANCES backends,
# smoke-test it with a 3-token completion, start the GPU monitor, replay
# the trace, save the proxy's /breakdown and /stats output, extract the
# last reported prefix-cache hit rate from each instance log, then stop
# the monitor and the proxy.
#
# Globals:   PROJECT_DIR, PYTHON, MODEL, TRACE, N_INSTANCES, BASE_PORT,
#            PROXY_PORT, TIME_SCALE, MAX_SESSIONS, REQUEST_LIMIT (read)
#            PROXY_PID, GPU_MON_PID (written)
# Arguments: $1 - routing policy name passed to the proxy as --policy
#            $2 - tag; results go to $PROJECT_DIR/outputs/<tag>/
# Outputs:   progress on stdout, warnings on stderr; files metrics.jsonl,
#            gpu_util.csv, breakdown.json, stats.json, apc.txt in the
#            output directory; proxy log in /tmp/lmetric_ab_proxy_<policy>.log
# Returns:   0 on success; 1 if the smoke test fails; otherwise the
#            replayer's non-zero exit status. The proxy and GPU monitor
#            are stopped on every return path.
run_experiment() {
    local policy=$1
    local tag=$2
    local outdir="$PROJECT_DIR/outputs/$tag"
    local -a backends=()
    local i result pch
    local rc=0
    local n_done=0

    mkdir -p "$outdir"

    # Build the backend URL list as an array rather than relying on
    # word-splitting of a generated string.
    for ((i = 0; i < N_INSTANCES; i++)); do
        backends+=("http://127.0.0.1:$((BASE_PORT + i))")
    done

    echo "  Starting proxy (policy=$policy)..."
    "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
        --combined "${backends[@]}" \
        --policy "$policy" \
        --port "$PROXY_PORT" > "/tmp/lmetric_ab_proxy_${policy}.log" 2>&1 &
    PROXY_PID=$!
    sleep 3

    # Smoke test. `|| true` keeps a curl failure (e.g. connection refused)
    # from aborting under `set -e`, so the error branch below can report it
    # and stop the proxy.
    result=$(curl -s -m 30 "http://localhost:$PROXY_PORT/v1/completions" \
        -X POST -H "Content-Type: application/json" \
        -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
    # A here-string avoids `echo | grep -q`, which under pipefail can fail
    # spuriously when grep exits before echo has finished writing.
    if ! grep -q "choices" <<< "$result"; then
        echo "  ERROR: Smoke test failed: $result"
        kill "$PROXY_PID" 2>/dev/null || true
        wait "$PROXY_PID" 2>/dev/null || true
        return 1
    fi
    echo "  Smoke test passed"

    # Start GPU monitor
    bash "$PROJECT_DIR/scripts/gpu_monitor.sh" > "$outdir/gpu_util.csv" &
    GPU_MON_PID=$!

    # Run benchmark. The status is captured instead of letting `set -e`
    # abort here, so the helpers started above are always torn down.
    echo "  Running benchmark (policy=$policy, $REQUEST_LIMIT requests)..."
    "$PYTHON" -m replayer \
        --trace "$TRACE" \
        --output "$outdir/metrics.jsonl" \
        --endpoint "http://localhost:$PROXY_PORT" \
        --model "$MODEL" \
        --time-scale "$TIME_SCALE" \
        --max-inflight-sessions "$MAX_SESSIONS" \
        --request-limit "$REQUEST_LIMIT" \
        -v || rc=$?
    if (( rc != 0 )); then
        echo "  ERROR: replayer exited with status $rc" >&2
    fi

    # Save breakdown (best-effort: a failed fetch must not discard the run)
    curl -s "http://localhost:$PROXY_PORT/breakdown" > "$outdir/breakdown.json" 2>/dev/null \
        || echo "  WARNING: could not fetch /breakdown from proxy" >&2
    curl -s "http://localhost:$PROXY_PORT/stats" > "$outdir/stats.json" 2>/dev/null \
        || echo "  WARNING: could not fetch /stats from proxy" >&2

    # Collect APC from vLLM logs: the last "Prefix cache hit rate" line of
    # each instance log, or 0 when the log has no such line.
    echo "  Collecting APC..."
    for ((i = 0; i < N_INSTANCES; i++)); do
        pch=$(grep "Prefix cache hit rate" "/tmp/lmetric_ab_inst_$i.log" 2>/dev/null | tail -1 | grep -oP "Prefix cache hit rate: \K[0-9.]+" || echo "0")
        echo "    inst_$i: prefix=$pch%"
    done | tee "$outdir/apc.txt"

    # Teardown. Either process may already be gone, and `wait` reports
    # 128+signal for a process that died from our SIGTERM; neither is an
    # error, so none of these may trip `set -e`.
    kill "$GPU_MON_PID" 2>/dev/null || true
    kill "$PROXY_PID" 2>/dev/null || true
    wait "$PROXY_PID" 2>/dev/null || true

    if [[ -f "$outdir/metrics.jsonl" ]]; then
        n_done=$(wc -l < "$outdir/metrics.jsonl")
    fi
    echo "  Done: $n_done requests -> $outdir"
    return "$rc"
}

# Print the given lines framed above and below by a 64-character rule.
print_banner() {
    local rule="================================================================"
    printf '%s\n' "$rule" "$@" "$rule"
}

print_banner "  A/B: Linear vs LMetric routing policy" "  $(date)"

# Experiments run in order, each against a freshly restarted set of
# instances. Fields: ordinal | display name | policy | output tag.
#   1: Linear  (current baseline)
#   2: LMetric (OSDI'26)
for exp_spec in "1|Linear|linear|ab_linear" "2|LMetric|lmetric|ab_lmetric"; do
    IFS='|' read -r exp_ordinal exp_title exp_policy exp_tag <<< "$exp_spec"
    printf '\n=== Experiment %s: %s policy ===\n' "$exp_ordinal" "$exp_title"
    cleanup
    start_instances
    run_experiment "$exp_policy" "$exp_tag"
done

# Compare: one summary line per experiment, computed from its metrics.jsonl
# (success count, TTFT p50/p90, TPOT p90, end-to-end latency p50).
printf '\n'
print_banner "  Results comparison"
"$PYTHON" -c "
import json, statistics

def summarize(path, label):
    rows = [json.loads(l) for l in open(path)]
    ok = [r for r in rows if not r.get('error')]
    p = lambda v,q: v[min(int(q*len(v)),len(v)-1)] if v else 0
    ttfts = sorted([r['ttft_s'] for r in ok if r.get('ttft_s')])
    tpots = sorted([r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s']>0])
    e2es = sorted([r['latency_s'] for r in ok])
    print('%-20s OK=%3d/%3d  TTFT50=%.3f  TTFT90=%.3f  TPOT90=%.3f  E2E50=%.3f' % (
        label, len(ok), len(rows), p(ttfts,.5), p(ttfts,.9), p(tpots,.9), p(e2es,.5)))

summarize('$PROJECT_DIR/outputs/ab_linear/metrics.jsonl', 'Linear')
summarize('$PROJECT_DIR/outputs/ab_lmetric/metrics.jsonl', 'LMetric')
"

printf '\nDone at %s\n' "$(date)"