Files
agentic-kvc/scripts/run_lmetric_ab.sh
Gahow Wang e4fa56cb1e LMetric routing policy (OSDI'26) + A/B results vs linear baseline
Implement LMetric (P_tokens × BS multiplication score) from "Simple is
Better" (Zhang et al., OSDI'26) as alternative routing policy for
combined mode. Key changes:

- cache_aware_proxy.py: add --policy {linear,lmetric} flag, track
  pending_prefill_tokens and num_requests per instance, /stats endpoint
- run_lmetric_ab.sh: automated A/B script for fair comparison

Results (200 req, fresh restart, same trace):
  Linear:  TTFT50=1.086  TPOT90=0.077  E2E50=5.423
  LMetric: TTFT50=1.099  TPOT90=0.073  E2E50=5.205
  Delta:   TTFT +1.2%    TPOT -5.9%    E2E -4.0%

LMetric improves TPOT/E2E modestly through better load balancing, but
routing policy headroom is limited vs elastic P2P offload (-44% E2E).

TODO: vLLM → Redis → router pipeline for exact state ablation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 16:57:32 +08:00

150 lines
5.2 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Head-to-head benchmark of two proxy routing policies: "linear" (the current
# baseline) and "lmetric" (OSDI'26).
# Both experiments run against the same 8x TP=1 combined instances, which are
# restarted from scratch before each experiment.
set -euo pipefail

# --- Filesystem layout -------------------------------------------------------
PROJECT_DIR=/home/admin/cpfs/wjh/agentic-kv
VENV="${PROJECT_DIR}/.venv/bin"
VLLM="${VENV}/vllm"
PYTHON="${VENV}/python"
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
TRACE="${PROJECT_DIR}/traces/sampled_1000req_seed42.jsonl"

# --- Serving topology --------------------------------------------------------
# Instance i listens on BASE_PORT + i; the routing proxy listens on PROXY_PORT.
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090

# --- Trace replay parameters (forwarded to the replayer) ---------------------
REQUEST_LIMIT=200
TIME_SCALE=20
MAX_SESSIONS=8
#######################################
# Force-kill leftovers from earlier runs so the next experiment starts clean.
#
# SIGKILLs every process whose command line matches 'vllm serve' or
# 'cache_aware_proxy', waits 5s, then SIGKILLs any process that still has a
# /dev/nvidia* device node open, and finally waits another 10s.
# NOTE: the last step is not limited to processes started by this script.
#
# Every kill is best-effort: a PID that already exited (or that we may not
# signal) must not abort the script under `set -e`.
#
# Arguments: none
# Returns:   0 under normal operation
#######################################
cleanup() {
  local pid

  # pgrep never reports itself, so no `grep -v grep` filtering is needed.
  for pid in $(pgrep -f 'vllm serve'); do
    kill -9 "$pid" 2>/dev/null || true
  done
  for pid in $(pgrep -f 'cache_aware_proxy'); do
    kill -9 "$pid" 2>/dev/null || true
  done
  sleep 5

  # fuser prints the PIDs on stdout; its per-file labels go to stderr.
  for pid in $(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u); do
    kill -9 "$pid" 2>/dev/null || true
  done
  sleep 10
}
#######################################
# Launch N_INSTANCES vLLM servers in the background and block until each one
# answers on /v1/models.
#
# Instance i is started with CUDA_VISIBLE_DEVICES=i and MASTER_PORT=29500+i,
# listens on BASE_PORT+i, and logs to /tmp/lmetric_ab_inst_<i>.log.
#
# Globals:   N_INSTANCES, BASE_PORT, VLLM, MODEL (read)
# Arguments: none
# Outputs:   progress on stdout, readiness failures on stderr
# Returns:   0 when all instances respond; 1 if one is not ready within 600s
#######################################
start_instances() {
  local i port

  echo " Starting $N_INSTANCES vLLM instances..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    MASTER_PORT=$((29500 + i)) CUDA_VISIBLE_DEVICES=$i \
      "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      > "/tmp/lmetric_ab_inst_$i.log" 2>&1 &
  done

  echo " Waiting for instances..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    # Poll every 5s; report which instance is stuck instead of letting
    # `set -e` abort silently when the timeout fires.
    if ! timeout 600 bash -c "until curl -s localhost:$port/v1/models > /dev/null 2>&1; do sleep 5; done"; then
      echo " ERROR: instance $i (port $port) not ready after 600s; see /tmp/lmetric_ab_inst_$i.log" >&2
      return 1
    fi
    echo " Instance $i (port $port) ready"
  done
}
#######################################
# Run one benchmark pass through the routing proxy with the given policy.
#
# Steps: start cache_aware_proxy.py in front of all instances, smoke-test it,
# start the GPU monitor, replay the trace, save the proxy's /breakdown and
# /stats output, scrape the last "Prefix cache hit rate" value from each
# instance log, then stop the monitor and the proxy.
#
# Background helpers are always torn down, including when the replayer fails,
# and best-effort steps (kill, wait, stats download) cannot abort the script
# under `set -e`.
#
# Globals:   PROJECT_DIR, PYTHON, N_INSTANCES, BASE_PORT, PROXY_PORT, MODEL,
#            TRACE, TIME_SCALE, MAX_SESSIONS, REQUEST_LIMIT (read)
# Arguments: $1 - routing policy passed to the proxy's --policy flag
#            $2 - tag; results are written to $PROJECT_DIR/outputs/<tag>
# Outputs:   progress on stdout; files under the output directory; proxy log
#            in /tmp/lmetric_ab_proxy_<policy>.log
# Returns:   0 on success, 1 if the smoke test fails, otherwise the replayer's
#            non-zero exit status
#######################################
run_experiment() {
  local policy=$1
  local tag=$2
  local outdir="$PROJECT_DIR/outputs/$tag"
  local -a backends=()
  local i result pch proxy_pid gpu_mon_pid
  local bench_rc=0

  mkdir -p "$outdir"

  for ((i = 0; i < N_INSTANCES; i++)); do
    backends+=("http://127.0.0.1:$((BASE_PORT + i))")
  done

  echo " Starting proxy (policy=$policy)..."
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${backends[@]}" \
    --policy "$policy" \
    --port "$PROXY_PORT" > "/tmp/lmetric_ab_proxy_${policy}.log" 2>&1 &
  proxy_pid=$!
  sleep 3

  # Smoke test. `|| true` keeps a connection failure from aborting the script
  # before the error below is reported and the proxy is stopped.
  result=$(curl -s -m 30 "http://localhost:$PROXY_PORT/v1/completions" \
    -X POST -H "Content-Type: application/json" \
    -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
  # A here-string avoids the `echo | grep -q` SIGPIPE false negative that
  # pipefail can produce.
  if ! grep -q "choices" <<<"$result"; then
    echo " ERROR: Smoke test failed: $result"
    kill "$proxy_pid" 2>/dev/null || true
    return 1
  fi
  echo " Smoke test passed"

  # Start GPU monitor
  bash "$PROJECT_DIR/scripts/gpu_monitor.sh" > "$outdir/gpu_util.csv" &
  gpu_mon_pid=$!

  # Run benchmark; capture the status so teardown still happens on failure.
  echo " Running benchmark (policy=$policy, $REQUEST_LIMIT requests)..."
  "$PYTHON" -m replayer \
    --trace "$TRACE" \
    --output "$outdir/metrics.jsonl" \
    --endpoint "http://localhost:$PROXY_PORT" \
    --model "$MODEL" \
    --time-scale "$TIME_SCALE" \
    --max-inflight-sessions "$MAX_SESSIONS" \
    --request-limit "$REQUEST_LIMIT" \
    -v || bench_rc=$?

  # Save breakdown and stats (best-effort).
  curl -s "http://localhost:$PROXY_PORT/breakdown" > "$outdir/breakdown.json" 2>/dev/null || true
  curl -s "http://localhost:$PROXY_PORT/stats" > "$outdir/stats.json" 2>/dev/null || true

  # Collect APC from vLLM logs; report 0 when no hit-rate line is found.
  echo " Collecting APC..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    pch=$(grep "Prefix cache hit rate" "/tmp/lmetric_ab_inst_$i.log" 2>/dev/null \
      | tail -1 \
      | grep -oP "Prefix cache hit rate: \K[0-9.]+" || echo "0")
    echo " inst_$i: prefix=$pch%"
  done | tee "$outdir/apc.txt"

  # Teardown. `wait` reports the proxy's exit status (non-zero if it died from
  # the signal), so it must not be allowed to trip `set -e`.
  kill "$gpu_mon_pid" 2>/dev/null || true
  kill "$proxy_pid" 2>/dev/null || true
  wait "$proxy_pid" 2>/dev/null || true

  if ((bench_rc != 0)); then
    echo " ERROR: benchmark exited with status $bench_rc (policy=$policy)" >&2
    return "$bench_rc"
  fi
  echo " Done: $(wc -l < "$outdir/metrics.jsonl") requests -> $outdir"
}
# ---------------------------------------------------------------------------
# Driver: run both experiments back to back, each on freshly restarted
# instances, then print a one-line latency summary per policy.
# ---------------------------------------------------------------------------
echo "================================================================"
echo " A/B: Linear vs LMetric routing policy"
echo " $(date)"
echo "================================================================"

# Experiment 1: Linear (current baseline)
echo ""
echo "=== Experiment 1: Linear policy ==="
cleanup
start_instances
run_experiment "linear" "ab_linear"

# Experiment 2: LMetric (OSDI'26)
echo ""
echo "=== Experiment 2: LMetric policy ==="
cleanup
start_instances
run_experiment "lmetric" "ab_lmetric"

# Compare
echo ""
echo "================================================================"
echo " Results comparison"
echo "================================================================"
# The metrics paths are passed as arguments rather than interpolated into the
# Python source; the quoted heredoc delimiter keeps the shell from expanding
# anything inside the program text.
"$PYTHON" - \
  "$PROJECT_DIR/outputs/ab_linear/metrics.jsonl" \
  "$PROJECT_DIR/outputs/ab_lmetric/metrics.jsonl" <<'PY'
import json
import sys


def percentile(values, q):
    """Pick the element at index int(q * len) of an ascending list (clamped to
    the last element); 0 when the list is empty."""
    if not values:
        return 0
    return values[min(int(q * len(values)), len(values) - 1)]


def summarize(path, label):
    """Print request counts and TTFT/TPOT/E2E percentiles for one metrics file."""
    with open(path) as fh:
        # Skip blank lines so a trailing newline does not break json.loads.
        rows = [json.loads(line) for line in fh if line.strip()]
    ok = [r for r in rows if not r.get('error')]
    ttfts = sorted(r['ttft_s'] for r in ok if r.get('ttft_s'))
    tpots = sorted(r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0)
    e2es = sorted(r['latency_s'] for r in ok if r.get('latency_s') is not None)
    print('%-20s OK=%3d/%3d TTFT50=%.3f TTFT90=%.3f TPOT90=%.3f E2E50=%.3f' % (
        label, len(ok), len(rows),
        percentile(ttfts, .5), percentile(ttfts, .9),
        percentile(tpots, .9), percentile(e2es, .5)))


summarize(sys.argv[1], 'Linear')
summarize(sys.argv[2], 'LMetric')
PY
echo ""
echo "Done at $(date)"