Experiments run: - Phase 0: kv_both has zero idle overhead (TPOT +1.3%, noise) - PS V1 (cold prefill): REJECTED — PS always slower than cached C - PS V1+flexD: 92.5% OK, HEAVY TTFT 7.8s (baseline 5.0s) — PS bottleneck - V2 (C_s prefill + flexible D): E2E -9% but 6 errors, RDMA bimodal - H4 (cache-gate): 198/200 OK, GPU imbalance 4.0x→2.0x, but HEAVY_OFFLOAD TTFT=11.5s due to RDMA. HEAVY_COLO improved 10.5% from better balance. - H5: Mooncake RDMA transfer R²=0.095, bimodal (0.6s or 18-30s) Key findings: - Mooncake lacks layerwise KV transfer → RDMA is pure sequential overhead - 92% of HEAVY are turn-1 cold → offloading cold requests always loses - GPU balance improvement from routing IS real (-10.5% HEAVY_COLO TTFT) - RDMA transfer negates the routing benefit for offloaded requests Code changes: - bench.sh: add GPU timeline monitoring (gpu_monitor.sh during benchmark) - cache_aware_proxy.py: H4 cache-gate, flexible D, PS routing - mooncake_connector.py: elif→if fix (allow dual prefill+decode flags) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
157 lines
5.3 KiB
Bash
Executable File
157 lines
5.3 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Phase 1 experiment: Dedicated Prefill Service (7C + 1PS).
#   - 7 Combined instances: GPUs 0-6, ports 8000-8006, kv_role=kv_both
#   - 1 Prefill Service instance: GPU 7, port 8007, kv_role=kv_both
#
# Strict mode: abort on a failing command, an unset variable, or a failing
# pipeline stage.
set -o errexit -o nounset -o pipefail

# Work from the project checkout and use its virtualenv.
cd /home/admin/cpfs/wjh/agentic-kv
. .venv/bin/activate

# Model served by every instance, and the directory receiving logs/results.
MODEL="/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct"
OUTDIR="outputs/phase1_ps"

mkdir -p "$OUTDIR"
|
|
|
|
# ---- Cleanup ----
# Stop leftovers from earlier runs. pkill exits non-zero when nothing matches;
# `|| true` keeps that from aborting the script under `set -e`.
echo "=== Killing existing processes ==="
for stale_pattern in "vllm serve" "cache_aware_proxy"; do
  pkill -f "$stale_pattern" 2>/dev/null || true
done
sleep 3

# Print per-GPU memory usage so leftover allocations show up in the console.
# Informational only: the reported values are not checked.
echo "=== Verifying GPUs free ==="
nvidia-smi --query-gpu=index,memory.used --format=csv,noheader
sleep 2
|
|
|
|
# ---- Launch 7 Combined instances (GPUs 0-6) ----
echo "=== Launching 7 Combined instances ==="

# Flags that follow --port and are identical for every Combined instance.
combined_flags=(
  --tensor-parallel-size 1
  --trust-remote-code --enable-prefix-caching --enforce-eager
  --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
  --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
)

# Instance N runs on GPU N with HTTP port 8000+N, Mooncake bootstrap port
# 8998+N and MASTER_PORT 29500+N; its output goes to vllm_c_N.log.
for ((gpu = 0; gpu <= 6; gpu++)); do
  http_port=$((8000 + gpu))
  bootstrap_port=$((8998 + gpu))
  echo "Starting C instance $gpu on GPU $gpu, port $http_port, bootstrap $bootstrap_port"
  VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port MASTER_PORT=$((29500 + gpu)) CUDA_VISIBLE_DEVICES=$gpu \
    .venv/bin/vllm serve "$MODEL" --host 0.0.0.0 --port "$http_port" "${combined_flags[@]}" \
    > "$OUTDIR/vllm_c_$gpu.log" 2>&1 &
  # Pause briefly before starting the next instance.
  sleep 2
done
|
|
|
|
# ---- Launch 1 PS instance (GPU 7) ----
echo "=== Launching PS instance on GPU 7, port 8007, bootstrap 9005 ==="

# Full server command line for the Prefill Service instance.
ps_cmd=(
  .venv/bin/vllm serve "$MODEL"
  --host 0.0.0.0 --port 8007 --tensor-parallel-size 1
  --trust-remote-code --enable-prefix-caching --enforce-eager
  --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
  --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
)

# Run in the background pinned to GPU 7; output goes to vllm_ps_0.log.
VLLM_MOONCAKE_BOOTSTRAP_PORT=9005 MASTER_PORT=29507 CUDA_VISIBLE_DEVICES=7 \
  "${ps_cmd[@]}" > "$OUTDIR/vllm_ps_0.log" 2>&1 &
sleep 2
|
|
|
|
# ---- Wait for all instances healthy ----
# Poll each instance's /health endpoint every 5 s until it answers with success.
# The wait is bounded so that an instance that crashed or wedged during startup
# fails the run instead of hanging it forever:
#   HEALTH_TIMEOUT_S  per-instance limit in seconds (default 3600)
# Each probe is capped at 10 s so a server that accepts the connection but never
# answers cannot stall the loop either.
echo "=== Waiting for all instances to be healthy ==="
health_timeout_s=${HEALTH_TIMEOUT_S:-3600}
for port in 8000 8001 8002 8003 8004 8005 8006 8007; do
  echo -n " Waiting for port $port..."
  wait_started=$SECONDS
  until curl -sf --max-time 10 "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
    if (( SECONDS - wait_started >= health_timeout_s )); then
      echo " TIMEOUT"
      echo "ERROR: port $port not healthy after ${health_timeout_s}s; check the vllm_*.log files in $OUTDIR/" >&2
      exit 1
    fi
    sleep 5
    echo -n "."
  done
  echo " OK"
done

echo "=== All instances healthy ==="
sleep 5
|
|
|
|
# ---- Launch Proxy ----
# Start the routing proxy on port 9090 in front of the 7 Combined instances and
# the PS instance; its output goes to proxy.log.
echo "=== Launching cache_aware_proxy with PS ==="
.venv/bin/python scripts/cache_aware_proxy.py \
  --combined http://127.0.0.1:8000 http://127.0.0.1:8001 http://127.0.0.1:8002 \
    http://127.0.0.1:8003 http://127.0.0.1:8004 http://127.0.0.1:8005 http://127.0.0.1:8006 \
  --ps-instances http://127.0.0.1:8007 \
  --ps-bootstrap-ports 9005 \
  --bootstrap-ports 8998,8999,9000,9001,9002,9003,9004 \
  --port 9090 \
  > "$OUTDIR/proxy.log" 2>&1 &
PROXY_PID=$!
echo "Proxy PID: $PROXY_PID"

# Wait for proxy ready: poll /stats every 2 s.
# Fail fast if the proxy process has already exited (e.g. bad arguments or a
# port conflict) and give up after PROXY_TIMEOUT_S seconds (default 300), so a
# broken proxy cannot hang the run indefinitely.
echo -n " Waiting for proxy..."
proxy_timeout_s=${PROXY_TIMEOUT_S:-300}
proxy_wait_started=$SECONDS
until curl -sf --max-time 10 "http://127.0.0.1:9090/stats" > /dev/null 2>&1; do
  if ! kill -0 "$PROXY_PID" 2>/dev/null; then
    echo " FAILED"
    echo "ERROR: proxy (PID $PROXY_PID) exited before becoming ready; see $OUTDIR/proxy.log" >&2
    exit 1
  fi
  if (( SECONDS - proxy_wait_started >= proxy_timeout_s )); then
    echo " TIMEOUT"
    echo "ERROR: proxy not ready after ${proxy_timeout_s}s; see $OUTDIR/proxy.log" >&2
    exit 1
  fi
  sleep 2
  echo -n "."
done
echo " OK"
|
|
|
|
# Replay the sampled trace through the proxy; per-request metrics are written
# to metrics.jsonl in the output directory.
echo "=== Running benchmark ==="
replay_args=(
  --trace traces/sampled_1000req_seed42.jsonl
  --output "$OUTDIR/metrics.jsonl"
  --endpoint http://localhost:9090 --model "$MODEL"
  --time-scale 20 --max-inflight-sessions 7 --request-limit 200 -v
)
.venv/bin/python -m replayer "${replay_args[@]}"

# Snapshot the proxy's /breakdown and /stats endpoints next to the metrics.
echo "=== Saving proxy breakdown and stats ==="
for proxy_report in breakdown stats; do
  curl -s "http://127.0.0.1:9090/$proxy_report" > "$OUTDIR/$proxy_report.json"
done

echo "=== Benchmark complete ==="
echo "Results in $OUTDIR/"
echo ""
|
|
|
|
# ---- Quick analysis ----

#######################################
# Print a quick summary of one benchmark run.
# Reads <dir>/metrics.jsonl (one JSON record per line) and, best effort,
# <dir>/breakdown.json (a JSON list of per-request routing records).
# File paths are handed to Python through argv rather than spliced into the
# Python source, so unusual characters in the path cannot break the program.
# Arguments: $1 - directory containing metrics.jsonl / breakdown.json
# Outputs:   summary text on stdout; diagnostics on stderr
# Returns:   non-zero if metrics.jsonl cannot be read or parsed
#######################################
summarize_metrics() {
  local outdir=$1
  python3 - "$outdir/metrics.jsonl" "$outdir/breakdown.json" <<'PY'
import json
import sys
from collections import Counter

metrics_path, breakdown_path = sys.argv[1], sys.argv[2]

records = []
try:
    with open(metrics_path) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                records.append(json.loads(line))
except OSError as e:
    sys.exit(f'Cannot read metrics file: {e}')

ok = [r for r in records if r.get('status') == 'ok']
fail = [r for r in records if r.get('status') != 'ok']
print(f'Total: {len(records)}, OK: {len(ok)}, Failed: {len(fail)}')
if records:
    print(f'Success rate: {len(ok)/len(records)*100:.1f}%')
else:
    # An empty metrics file would otherwise divide by zero.
    print('Success rate: n/a (no records)')


def pct(vals, p):
    # Percentile lookup on an ascending list; the index is clamped to the
    # last element so p close to 100 never runs off the end.
    idx = int(len(vals) * p / 100)
    return vals[min(idx, len(vals) - 1)]


def report(label, key, digits):
    # Skip successful records that lack this metric or carry null for it.
    vals = sorted(r[key] for r in ok if r.get(key) is not None)
    if not vals:
        return
    p50, p90, p99 = (f'{pct(vals, p):.{digits}f}' for p in (50, 90, 99))
    print(f'{label} p50={p50} p90={p90} p99={p99}')


report('TTFT', 'ttft', 3)
report('TPOT', 'tpot', 4)
report('E2E', 'e2e', 3)

# Breakdown analysis (best effort: any problem is reported, not fatal)
try:
    with open(breakdown_path) as f:
        bd = json.load(f)
    classes = Counter(r.get('route_class', 'UNKNOWN') for r in bd)
    print(f'\nRoute class breakdown:')
    for rc, cnt in sorted(classes.items()):
        print(f' {rc}: {cnt}')

    # PS utilization
    ps_reqs = [r for r in bd if r.get('route_class') == 'HEAVY_PS']
    print(f'\nPS offloaded: {len(ps_reqs)} requests')

    # Offload reasons for HEAVY
    heavy = [r for r in bd if r.get('route_class', '').startswith('HEAVY')]
    reasons = Counter(r.get('offload_reason', 'unknown') for r in heavy)
    if reasons:
        print(f'HEAVY offload reasons:')
        for reason, cnt in sorted(reasons.items()):
            print(f' {reason}: {cnt}')
except Exception as e:
    print(f'Breakdown analysis error: {e}')
PY
}

echo "=== Quick metrics summary ==="
summarize_metrics "$OUTDIR"
|
|
|
|
# Blank separator line followed by the closing banner.
printf '\n%s\n' "=== Phase 1 experiment complete ==="
|