agentic-kvc/scripts/legacy/run_v2_offload.sh

#!/bin/bash
# V2: P2P KV offload — C_s (session-sticky, cached) prefills, D (least-loaded) decodes
# 8 combined instances (all kv_both), no dedicated PS
# Strict mode: stop on a failing command, an unset variable, or a failing
# pipeline stage.
set -o errexit -o nounset -o pipefail

# Every relative path used below (.venv, scripts/, traces/, outputs/) is
# resolved from this directory.
cd "/home/admin/cpfs/wjh/agentic-kv"
. .venv/bin/activate

# Checkpoint passed to each `vllm serve` instance and to the replayer.
MODEL="/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct"
# Destination for server/proxy logs, metrics and proxy dumps of this run.
OUTDIR="outputs/v2_offload"

#######################################
# Stop every vLLM server / cache_aware_proxy process, then free the GPUs.
# Best-effort: a PID that is already gone or not killable is silently skipped.
# WARNING: the last stage kills EVERY process that has a /dev/nvidia* node
# open, not only processes started by this script.
# Globals:   none
# Arguments: none
# Returns:   0 (always takes about 8 s because of the two settle pauses)
#######################################
cleanup() {
    local pid
    local -r server_pattern='vllm serve|cache_aware_proxy'

    # Ask politely first; the 3 s pause doubles as the grace period.
    for pid in $(pgrep -f "$server_pattern"); do
        kill -TERM "$pid" 2>/dev/null || true
    done
    sleep 3

    # Whatever still matches after the grace period is force-killed.
    for pid in $(pgrep -f "$server_pattern"); do
        kill -KILL "$pid" 2>/dev/null || true
    done

    # Processes that do not match the pattern (presumably orphaned workers) can
    # still hold a GPU: kill anything with an NVIDIA device node open.
    # fuser prints the PIDs on stdout; the file names go to stderr.
    for pid in $(fuser /dev/nvidia* 2>/dev/null | tr -s '[:space:]' '\n' | sort -u); do
        kill -KILL "$pid" 2>/dev/null || true
    done
    sleep 5
}

# Start from a clean slate: no stale servers, no process holding a GPU.
cleanup

# Background servers are started further down. If a later step fails under
# `set -e`, or the run is interrupted, tear them down instead of leaving them
# running with the GPUs allocated. ERR fires right before `set -e` exits;
# INT/TERM exit with the conventional 128+signal status.
trap cleanup ERR
trap 'cleanup; exit 130' INT
trap 'cleanup; exit 143' TERM

mkdir -p "$OUTDIR"

echo "=== Verifying GPUs free ==="
# Informational only: prints per-GPU memory use; nothing is asserted on it.
nvidia-smi --query-gpu=index,memory.used --format=csv,noheader

# ---- Launch 8 Combined instances (GPUs 0-7, all kv_both) ----
echo "=== Launching 8 Combined instances ==="
# Instance k is pinned to GPU k and uses HTTP port 8000+k, Mooncake bootstrap
# port 8998+k and MASTER_PORT 29500+k, so no two instances share a port.
for ((gpu = 0; gpu <= 7; gpu++)); do
    http_port=$((8000 + gpu))
    bootstrap_port=$((8998 + gpu))
    echo "Starting C instance $gpu on GPU $gpu, port $http_port, bootstrap $bootstrap_port"
    # Runs in the background; stdout and stderr go to a per-instance log file.
    VLLM_MOONCAKE_BOOTSTRAP_PORT="$bootstrap_port" \
    MASTER_PORT="$((29500 + gpu))" \
    CUDA_VISIBLE_DEVICES="$gpu" \
    .venv/bin/vllm serve "$MODEL" \
        --host 0.0.0.0 \
        --port "$http_port" \
        --tensor-parallel-size 1 \
        --trust-remote-code \
        --enable-prefix-caching \
        --enforce-eager \
        --dtype auto \
        --gpu-memory-utilization 0.9 \
        --max-model-len 200000 \
        --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
        > "$OUTDIR/vllm_$gpu.log" 2>&1 &
    # Stagger the launches by two seconds.
    sleep 2
done

# ---- Wait for all instances healthy ----
echo "=== Waiting for all instances to be healthy ==="
# Poll /health on each server every 5 s. `timeout` gives up after 600 s per
# port; its non-zero status then aborts the script under `set -e`.
for ((health_port = 8000; health_port <= 8007; health_port++)); do
    printf '  Waiting for port %s...' "$health_port"
    timeout 600 bash -c "until curl -sf http://127.0.0.1:$health_port/health > /dev/null 2>&1; do sleep 5; done"
    printf ' OK\n'
done

# Wait for bootstrap ports
# curl runs without -f here, so any HTTP answer (even an error status) counts
# as "up"; each port gets at most 120 s.
for ((bootstrap_port = 8998; bootstrap_port <= 9005; bootstrap_port++)); do
    timeout 120 bash -c "until curl -s localhost:$bootstrap_port/query > /dev/null 2>&1; do sleep 2; done"
done
echo "=== All instances healthy ==="
sleep 5

# ---- Launch Proxy with V2 offload ----
echo "=== Launching cache_aware_proxy with V2 offload ==="
# The proxy fronts the eight combined instances on port 9090; the bootstrap
# port list is given in the same order as the --combined URLs.
.venv/bin/python scripts/cache_aware_proxy.py \
    --combined http://127.0.0.1:8000 http://127.0.0.1:8001 http://127.0.0.1:8002 \
               http://127.0.0.1:8003 http://127.0.0.1:8004 http://127.0.0.1:8005 \
               http://127.0.0.1:8006 http://127.0.0.1:8007 \
    --bootstrap-ports 8998,8999,9000,9001,9002,9003,9004,9005 \
    --offload --port 9090 \
    > "$OUTDIR/proxy.log" 2>&1 &
PROXY_PID=$!
echo "Proxy PID: $PROXY_PID"

# Wait for proxy ready
# The wait is bounded: without a liveness check and a deadline, a proxy that
# crashes at startup would leave this loop spinning forever.
echo -n "  Waiting for proxy..."
proxy_deadline=$((SECONDS + 600))
until curl -sf "http://127.0.0.1:9090/stats" > /dev/null 2>&1; do
    if ! kill -0 "$PROXY_PID" 2>/dev/null; then
        echo " FAILED"
        echo "ERROR: proxy (PID $PROXY_PID) exited before becoming ready; see $OUTDIR/proxy.log" >&2
        # Release the GPUs held by the already-running servers before bailing out.
        cleanup
        exit 1
    fi
    if ((SECONDS >= proxy_deadline)); then
        echo " TIMEOUT"
        echo "ERROR: proxy not ready after 600s; see $OUTDIR/proxy.log" >&2
        cleanup
        exit 1
    fi
    sleep 2
    echo -n "."
done
echo " OK"

# ---- Run benchmark ----
echo "=== Running benchmark: 200 req, time_scale=20, max-inflight-sessions=8 ==="
# Replay the sampled trace through the proxy; per-request metrics are written
# to $OUTDIR/metrics.jsonl.
replayer_args=(
    --trace traces/sampled_1000req_seed42.jsonl
    --output "$OUTDIR/metrics.jsonl"
    --endpoint http://localhost:9090
    --model "$MODEL"
    --time-scale 20
    --max-inflight-sessions 8
    --request-limit 200
    -v
)
.venv/bin/python -m replayer "${replayer_args[@]}"

# ---- Save proxy data BEFORE cleanup ----
echo "=== Saving proxy breakdown and stats ==="
# cleanup kills the proxy, so its endpoints must be dumped first. Best-effort:
# a failed fetch is ignored, and the redirect still leaves an (empty) file.
for proxy_view in breakdown stats; do
    curl -sf "http://127.0.0.1:9090/$proxy_view" > "$OUTDIR/$proxy_view.json" 2>/dev/null || true
done

# ---- Quick analysis ----
echo "=== Quick metrics summary ==="
# Best-effort summary. A failure here (missing metrics file, unexpected record
# shape) is reported on stderr but must not abort the script under `set -e`,
# otherwise the teardown that follows would never run.
# The quoted heredoc keeps the shell from touching the Python source; the
# output directory is handed over as argv[1] instead of being interpolated.
.venv/bin/python - "$OUTDIR" <<'PY' || echo "WARNING: quick metrics summary failed (exit $?)" >&2
import json
import sys
from collections import Counter

outdir = sys.argv[1]

# One JSON object per line; blank lines (e.g. a trailing newline) are skipped.
with open(f'{outdir}/metrics.jsonl') as fh:
    rows = [json.loads(line) for line in fh if line.strip()]
ok = [r for r in rows if not r.get('error')]
fail = [r for r in rows if r.get('error')]


def p(v, q):
    # Quantile q of v: element at index int(q*len(v)) of the sorted values,
    # clamped to the last element; 0 when v is empty.
    if not v:
        return 0
    s = sorted(v)
    return s[min(int(q * len(s)), len(s) - 1)]


ttfts = [r['ttft_s'] for r in ok if r.get('ttft_s')]
tpots = [r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0]
e2es = [r['latency_s'] for r in ok]

print(f'OK={len(ok)}/{len(rows)}  TTFT50={p(ttfts,.5):.3f} TTFT90={p(ttfts,.9):.3f}  TPOT90={p(tpots,.9):.4f}  E2E50={p(e2es,.5):.3f}')

# Per-class breakdown, bucketed by input_length; rows without a TTFT are left out.
for lo, hi, cl in [(0, 5000, 'WARM'), (5000, 20000, 'MED'), (20000, 200000, 'HEAVY')]:
    sub = [r for r in ok if lo <= r['input_length'] < hi and r.get('ttft_s')]
    if sub:
        t = [r['ttft_s'] for r in sub]
        tp = [r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s'] > 0]
        e = [r['latency_s'] for r in sub]
        print(f'  {cl:6s} n={len(sub):3d}  TTFT50={p(t,.5):.3f} TTFT90={p(t,.9):.3f}  TPOT90={p(tp,.9):.4f}  E2E50={p(e,.5):.3f}')

# Route distribution from breakdown. The dump may be empty or missing when the
# proxy fetch failed, so every error in this section is reported, not raised.
try:
    with open(f'{outdir}/breakdown.json') as fh:
        bd = json.load(fh)
    rc = Counter(b.get('route_class', '') for b in bd)
    print('\nRoute class distribution:')
    for cls, cnt in sorted(rc.items()):
        print(f'  {cls}: {cnt}')

    # Offload timing: prefill duration for entries carrying both timestamps.
    offloaded = [b for b in bd if b.get('route_class') == 'HEAVY_OFFLOAD']
    if offloaded:
        pf = [b['t_prefill_done'] - b['t_prefill_sent'] for b in offloaded
              if b.get('t_prefill_done') and b.get('t_prefill_sent')]
        if pf:
            print(f'\nHEAVY_OFFLOAD prefill time: n={len(pf)} p50={p(pf,.5):.2f}s p90={p(pf,.9):.2f}s')

    # Offload reasons for HEAVY
    heavy = [b for b in bd if b.get('route_class', '').startswith('HEAVY')]
    reasons = Counter(b.get('offload_reason', '') for b in heavy)
    if reasons:
        print(f'HEAVY offload reasons: {dict(reasons)}')

    # Per-instance P and D counts
    p_counts = Counter(b.get('p_inst', '') for b in bd if b.get('p_inst'))
    d_counts = Counter(b.get('d_inst', '') for b in bd if b.get('d_inst'))
    if p_counts:
        print(f'\nP-instance distribution: {dict(p_counts)}')
    if d_counts:
        print(f'D-instance distribution: {dict(d_counts)}')

except Exception as e:
    print(f'Breakdown analysis error: {e}')

if fail:
    print(f'\nFailed requests ({len(fail)}):')
    for r in fail[:5]:
        # str() so a non-string error payload can still be truncated for display.
        print(f'  input={r.get("input_length")} error={str(r["error"])[:80]}')

print()
print('=== Baselines for comparison ===')
print('Phase0A (7C kv_both): OK=198/200 TTFT50=1.073 TPOT90=0.0738 E2E50=5.096')
print('Baseline (8C plain):  OK=198/200 TTFT50=1.075 TPOT90=0.0761 E2E50=5.075')
print('PS V1 flexD:          OK=172/186 TTFT50=0.978 TPOT90=0.0758 E2E50=5.623')
PY

# Final teardown: stop the servers and the proxy, then stamp the finish time.
cleanup
printf '=== DONE %s ===\n' "$(date)"