Implement LMetric (P_tokens × BS multiplication score) from "Simple is
Better" (Zhang et al., OSDI'26) as an alternative routing policy for
combined mode. Key changes:
- cache_aware_proxy.py: add --policy {linear,lmetric} flag, track
pending_prefill_tokens and num_requests per instance, add /stats endpoint
- run_lmetric_ab.sh: automated A/B script for fair comparison
Results (200 req, fresh restart, same trace):
Linear: TTFT50=1.086 TPOT90=0.077 E2E50=5.423
LMetric: TTFT50=1.099 TPOT90=0.073 E2E50=5.205
Delta: TTFT +1.2% TPOT -5.9% E2E -4.0%
LMetric improves TPOT/E2E modestly through better load balancing, but
routing policy headroom is limited vs elastic P2P offload (-44% E2E).
TODO: vLLM → Redis → router pipeline for exact state ablation.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
150 lines
5.2 KiB
Bash
Executable File
150 lines
5.2 KiB
Bash
Executable File
#!/bin/bash
# A/B comparison: linear (current baseline) vs lmetric (OSDI'26) routing policy.
# Both use same 8× TP=1 combined instances, fresh restart between experiments.
#
# Every setting below keeps its original default. Each one can be overridden
# with an AB_-prefixed environment variable, for example:
#   AB_REQUEST_LIMIT=50 AB_N_INSTANCES=4 ./run_lmetric_ab.sh
# The AB_ prefix avoids picking up unrelated variables such as MODEL or PYTHON
# that may already be exported in the caller's environment.
set -euo pipefail

# Project checkout and the executables taken from its virtualenv.
PROJECT_DIR="${AB_PROJECT_DIR:-/home/admin/cpfs/wjh/agentic-kv}"
VENV="$PROJECT_DIR/.venv/bin"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"
# Model path handed to `vllm serve` and sent as the "model" field of requests.
MODEL="${AB_MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Trace file passed to the replayer via --trace.
TRACE="${AB_TRACE:-$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl}"

# Instance i runs with CUDA_VISIBLE_DEVICES=i and listens on BASE_PORT + i.
N_INSTANCES="${AB_N_INSTANCES:-8}"
BASE_PORT="${AB_BASE_PORT:-8000}"
# Port the routing proxy listens on.
PROXY_PORT="${AB_PROXY_PORT:-9090}"
# Passed to the replayer as --request-limit, --time-scale and
# --max-inflight-sessions respectively.
REQUEST_LIMIT="${AB_REQUEST_LIMIT:-200}"
TIME_SCALE="${AB_TIME_SCALE:-20}"
MAX_SESSIONS="${AB_MAX_SESSIONS:-8}"

readonly PROJECT_DIR VENV VLLM PYTHON MODEL TRACE
readonly N_INSTANCES BASE_PORT PROXY_PORT REQUEST_LIMIT TIME_SCALE MAX_SESSIONS

#######################################
# SIGKILL every PID read from stdin (one per line, blank lines skipped).
# A failing kill (process already gone, or not ours to signal) is ignored on
# purpose: cleanup is best-effort and must not abort the script under `set -e`.
#######################################
_force_kill_stdin_pids() {
  local pid
  while IFS= read -r pid; do
    if [[ -n "$pid" ]]; then
      kill -9 "$pid" 2>/dev/null || true
    fi
  done
}

#######################################
# Force-stop leftovers from a previous experiment: every `vllm serve` process,
# every cache_aware_proxy process and, after a 5 s pause, whatever still holds
# a /dev/nvidia* device open. Ends with a further 10 s pause.
# Globals:   none
# Arguments: none
# Returns:   0; individual kill failures are ignored
#######################################
cleanup() {
  # pgrep never lists itself, so no `grep -v grep` filtering is required.
  # The producers run in process substitutions, so their non-zero status when
  # nothing matches cannot trip `set -e` / `pipefail`.
  _force_kill_stdin_pids < <(pgrep -f 'vllm serve')
  _force_kill_stdin_pids < <(pgrep -f 'cache_aware_proxy')
  sleep 5

  # fuser prints the PIDs on stdout separated by blanks; put one per line.
  _force_kill_stdin_pids < <(fuser /dev/nvidia* 2>/dev/null | tr -s ' \t' '\n' | sort -u)
  sleep 10
}

#######################################
# Launch N_INSTANCES `vllm serve` processes in the background (instance i on
# GPU i, port BASE_PORT + i, log in /tmp/lmetric_ab_inst_<i>.log), then wait
# for each one to answer on /v1/models.
# Globals:   N_INSTANCES, BASE_PORT, VLLM, MODEL (read)
# Arguments: none
# Outputs:   progress on stdout, failures on stderr
# Returns:   0 when every instance is ready; 1 if an instance exits before
#            becoming ready or is not ready within 600 s
#######################################
start_instances() {
  local -r ready_timeout_s=600 # per-instance readiness budget
  local -a pids=()
  local i port deadline

  echo " Starting $N_INSTANCES vLLM instances..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    MASTER_PORT=$((29500 + i)) CUDA_VISIBLE_DEVICES=$i \
      "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      > "/tmp/lmetric_ab_inst_$i.log" 2>&1 &
    pids+=("$!")
  done

  echo " Waiting for instances..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    deadline=$((SECONDS + ready_timeout_s))
    # -m bounds a single probe so a half-open socket cannot outlive the deadline.
    until curl -s -m 5 "localhost:$port/v1/models" > /dev/null 2>&1; do
      # Fail fast when the server process is gone instead of polling a dead port.
      if ! kill -0 "${pids[i]}" 2>/dev/null; then
        echo " ERROR: instance $i (port $port) exited before becoming ready;" \
          "see /tmp/lmetric_ab_inst_$i.log" >&2
        return 1
      fi
      if ((SECONDS >= deadline)); then
        echo " ERROR: instance $i (port $port) not ready after ${ready_timeout_s}s;" \
          "see /tmp/lmetric_ab_inst_$i.log" >&2
        return 1
      fi
      sleep 5
    done
    echo " Instance $i (port $port) ready"
  done
}

#######################################
# Run one benchmark pass against the already-running vLLM instances:
# start the routing proxy with the given policy, smoke-test it, start the GPU
# monitor, replay the trace, save the proxy's /breakdown and /stats output and
# the per-instance prefix-cache hit rate, then stop the monitor and the proxy.
# Globals:   PROJECT_DIR, PYTHON, N_INSTANCES, BASE_PORT, PROXY_PORT, MODEL,
#            TRACE, TIME_SCALE, MAX_SESSIONS, REQUEST_LIMIT (read)
#            PROXY_PID, GPU_MON_PID (written)
# Arguments: $1 - routing policy passed to the proxy's --policy flag
#            $2 - tag; results are written to $PROJECT_DIR/outputs/<tag>
# Returns:   0 on success, 1 if the smoke test fails, otherwise the replayer's
#            exit status. The proxy and monitor are stopped on every path.
#######################################
run_experiment() {
  local policy=$1
  local tag=$2
  local outdir="$PROJECT_DIR/outputs/$tag"
  local -a endpoints=()
  local i result pch
  local rc=0

  mkdir -p "$outdir"

  for ((i = 0; i < N_INSTANCES; i++)); do
    endpoints+=("http://127.0.0.1:$((BASE_PORT + i))")
  done

  echo " Starting proxy (policy=$policy)..."
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${endpoints[@]}" \
    --policy "$policy" \
    --port "$PROXY_PORT" > "/tmp/lmetric_ab_proxy_${policy}.log" 2>&1 &
  PROXY_PID=$!
  sleep 3

  # Smoke test. `|| true` keeps a curl failure (e.g. connection refused) from
  # aborting under `set -e` before the error branch can stop the proxy.
  result=$(curl -s -m 30 "http://localhost:$PROXY_PORT/v1/completions" \
    -X POST -H "Content-Type: application/json" \
    -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
  if ! grep -q "choices" <<< "$result"; then
    echo " ERROR: Smoke test failed: $result"
    kill "$PROXY_PID" 2>/dev/null || true
    return 1
  fi
  echo " Smoke test passed"

  # Start GPU monitor
  bash "$PROJECT_DIR/scripts/gpu_monitor.sh" > "$outdir/gpu_util.csv" &
  GPU_MON_PID=$!

  # Run benchmark. The status is captured rather than left to `set -e` so the
  # background processes below are always stopped.
  echo " Running benchmark (policy=$policy, $REQUEST_LIMIT requests)..."
  "$PYTHON" -m replayer \
    --trace "$TRACE" \
    --output "$outdir/metrics.jsonl" \
    --endpoint "http://localhost:$PROXY_PORT" \
    --model "$MODEL" \
    --time-scale "$TIME_SCALE" \
    --max-inflight-sessions "$MAX_SESSIONS" \
    --request-limit "$REQUEST_LIMIT" \
    -v || rc=$?
  if ((rc != 0)); then
    echo " ERROR: benchmark exited with status $rc" >&2
  fi

  # Save breakdown (best-effort: a failed fetch must not skip the teardown)
  curl -s "http://localhost:$PROXY_PORT/breakdown" > "$outdir/breakdown.json" 2>/dev/null \
    || echo " WARN: could not fetch /breakdown from proxy" >&2
  curl -s "http://localhost:$PROXY_PORT/stats" > "$outdir/stats.json" 2>/dev/null \
    || echo " WARN: could not fetch /stats from proxy" >&2

  # Collect APC from vLLM logs: last reported hit rate per instance, 0 if none.
  echo " Collecting APC..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    pch=$(grep "Prefix cache hit rate" "/tmp/lmetric_ab_inst_$i.log" 2>/dev/null \
      | tail -1 \
      | grep -oP "Prefix cache hit rate: \K[0-9.]+" || echo "0")
    echo " inst_$i: prefix=$pch%"
  done | tee "$outdir/apc.txt"

  # Teardown. kill fails if the process already exited, and wait returns the
  # status of the killed proxy (e.g. 143); neither may abort under `set -e`.
  kill "$GPU_MON_PID" 2>/dev/null || true
  kill "$PROXY_PID" 2>/dev/null || true
  wait "$PROXY_PID" 2>/dev/null || true

  if [[ -f "$outdir/metrics.jsonl" ]]; then
    echo " Done: $(wc -l < "$outdir/metrics.jsonl") requests -> $outdir"
  else
    echo " Done: no metrics file written -> $outdir"
  fi
  return "$rc"
}

# Driver: run both experiments back to back, each one after a full cleanup and
# a fresh start of the vLLM instances, then print one summary line per policy.
echo "================================================================"
echo " A/B: Linear vs LMetric routing policy"
echo " $(date)"
echo "================================================================"

# Experiment 1: Linear (current baseline)
echo ""
echo "=== Experiment 1: Linear policy ==="
cleanup
start_instances
run_experiment "linear" "ab_linear"

# Experiment 2: LMetric (OSDI'26)
echo ""
echo "=== Experiment 2: LMetric policy ==="
cleanup
start_instances
run_experiment "lmetric" "ab_lmetric"
# NOTE(review): no cleanup runs after this point, so the vLLM instances from
# experiment 2 are still running when the script ends; confirm that is intended.

# Compare
echo ""
echo "================================================================"
echo " Results comparison"
echo "================================================================"
# The metrics paths are passed as arguments and the heredoc is quoted, so the
# shell never splices $PROJECT_DIR into the Python source text.
"$PYTHON" - \
  "$PROJECT_DIR/outputs/ab_linear/metrics.jsonl" \
  "$PROJECT_DIR/outputs/ab_lmetric/metrics.jsonl" <<'PY'
import json
import sys


def percentile(values, q):
    """Return the element at rank int(q * len) of a sorted list, or 0 if empty."""
    if not values:
        return 0
    return values[min(int(q * len(values)), len(values) - 1)]


def summarize(path, label):
    """Print request counts and latency percentiles for one metrics.jsonl file."""
    with open(path) as fh:
        # Skip blank lines so a trailing empty line does not crash json.loads.
        rows = [json.loads(line) for line in fh if line.strip()]
    ok = [r for r in rows if not r.get('error')]
    ttfts = sorted(r['ttft_s'] for r in ok if r.get('ttft_s'))
    tpots = sorted(r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0)
    e2es = sorted(r['latency_s'] for r in ok)
    print('%-20s OK=%3d/%3d TTFT50=%.3f TTFT90=%.3f TPOT90=%.3f E2E50=%.3f' % (
        label, len(ok), len(rows),
        percentile(ttfts, .5), percentile(ttfts, .9),
        percentile(tpots, .9), percentile(e2es, .5)))


summarize(sys.argv[1], 'Linear')
summarize(sys.argv[2], 'LMetric')
PY

echo ""
echo "Done at $(date)"