agentic-kvc/scripts/legacy/run_ps_flexd.sh

#!/bin/bash
# PS + flexible-D experiment: HEAVY prefill is sent to the PS instance, decode
# runs on the least-loaded C instance.

# Abort on a failed command, an unset variable, or a failed pipeline stage.
set -o errexit -o nounset -o pipefail

# Every relative path used below (.venv, scripts/, traces/, outputs/) is
# resolved from this checkout.
cd /home/admin/cpfs/wjh/agentic-kv
. .venv/bin/activate

# Where logs and metrics are written, and the model every server loads.
OUTDIR=outputs/ps_flexd
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct

# Stop every vLLM server and proxy process, then anything still holding a GPU.
#
# Processes whose command line matches "vllm serve" or "cache_aware_proxy" are
# first sent SIGTERM, given 3 seconds, and then sent SIGKILL if they are still
# listed. After that, every PID reported by fuser for /dev/nvidia* is sent
# SIGKILL, followed by a 5 second pause.
#
# Arguments: none
# Returns:   always 0; failures to signal a process are ignored because the
#            process may already have exited.
cleanup() {
    local pattern='vllm serve|cache_aware_proxy'
    local pid

    # pgrep never lists itself, so no "grep -v grep" filtering is needed.
    # "|| true" covers the normal case where nothing matches.
    while IFS= read -r pid; do
        kill -TERM "$pid" 2>/dev/null || true
    done < <(pgrep -f "$pattern" || true)
    sleep 3

    # Escalate for anything that ignored SIGTERM during the grace period.
    while IFS= read -r pid; do
        kill -KILL "$pid" 2>/dev/null || true
    done < <(pgrep -f "$pattern" || true)

    # Processes that still have a GPU device file open (their names need not
    # match the pattern above). fuser prints PIDs separated by blanks; split
    # them one per line, de-duplicate, and skip anything that is not a number.
    while IFS= read -r pid; do
        [[ $pid =~ ^[0-9]+$ ]] || continue
        kill -KILL "$pid" 2>/dev/null || true
    done < <(fuser /dev/nvidia* 2>/dev/null | tr -s ' \t' '\n' | sort -u || true)
    sleep 5
}

# Start from a clean slate: stop servers left behind by an earlier run.
cleanup
# The servers started below run in the background, so an abort under `set -e`
# (health-check timeout, benchmark failure, ...) would otherwise leave them
# running. On exit, tear down again, but only if a server or proxy process is
# still listed, so a run that already cleaned up does not repeat the sleeps.
trap 'if pgrep -f "vllm serve|cache_aware_proxy" > /dev/null; then cleanup; fi' EXIT
mkdir -p "$OUTDIR"

# Start one vLLM server in the background with the shared flag set.
# Every port is derived from the instance index so instances never collide:
# HTTP 8000+idx, Mooncake bootstrap 8998+idx, MASTER_PORT 29500+idx.
# Globals:   MODEL (read)
# Arguments: $1 - instance index, also used as the GPU id
#            $2 - log file that receives the server's stdout and stderr
launch_instance() {
    local idx=$1 logfile=$2
    VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + idx)) MASTER_PORT=$((29500 + idx)) CUDA_VISIBLE_DEVICES=$idx \
    .venv/bin/vllm serve "$MODEL" --host 0.0.0.0 --port "$((8000 + idx))" --tensor-parallel-size 1 \
        --trust-remote-code --enable-prefix-caching --enforce-eager \
        --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
        --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
        > "$logfile" 2>&1 &
}

# 7 C instances (kv_both, GPUs 0-6), started 2 seconds apart
for i in {0..6}; do
    launch_instance "$i" "$OUTDIR/vllm_c_$i.log"
    sleep 2
done

# 1 PS instance (kv_both, GPU 7)
launch_instance 7 "$OUTDIR/vllm_ps.log"

# Block until all eight HTTP servers (ports 8000-8007) answer /health and all
# eight bootstrap ports (8998-9005) answer /query. Each wait is bounded by
# `timeout`; on expiry the script reports which endpoint failed and exits with
# timeout's status.
echo "Waiting for instances..."
for i in {0..7}; do
    port=$((8000 + i))
    # -f makes curl exit non-zero on an HTTP error status, so a server that
    # answers /health with an error is not counted as healthy. The port is
    # handed to the inner shell as $1 rather than pasted into its code.
    timeout 600 bash -c 'until curl -sf "localhost:$1/health" > /dev/null 2>&1; do sleep 5; done' _ "$port" || {
        rc=$?
        echo "ERROR: inst_$i (port $port) not healthy after 600s; check the vllm_*.log files in $OUTDIR" >&2
        exit "$rc"
    }
    echo "  inst_$i healthy"
done
for bp in {8998..9005}; do
    # NOTE(review): only reachability is checked here (no -f); confirm what
    # status /query returns before tightening this.
    timeout 120 bash -c 'until curl -s "localhost:$1/query" > /dev/null 2>&1; do sleep 2; done' _ "$bp" || {
        rc=$?
        echo "ERROR: bootstrap port $bp not answering /query after 120s" >&2
        exit "$rc"
    }
done
echo "All ready"

# Proxy on :9090 (flexible D). The seven C servers on 8000-8006 are passed as
# --combined together with their bootstrap ports 8998-9004; the PS server on
# 8007 is passed separately with bootstrap port 9005.
combined_urls=()
combined_bootstrap=""
for n in 0 1 2 3 4 5 6; do
    combined_urls+=("http://127.0.0.1:$((8000 + n))")
    # Comma-separated list; the separator is only added once the list is non-empty.
    combined_bootstrap+="${combined_bootstrap:+,}$((8998 + n))"
done

proxy_cmd=(
    .venv/bin/python scripts/cache_aware_proxy.py
    --combined "${combined_urls[@]}"
    --ps-instances http://127.0.0.1:8007
    --ps-bootstrap-ports 9005
    --bootstrap-ports "$combined_bootstrap"
    --port 9090
)
"${proxy_cmd[@]}" > "$OUTDIR/proxy.log" 2>&1 &

# Fixed pause before the benchmark starts sending requests to the proxy.
sleep 3

echo "Running benchmark..."
# Replay the sampled trace against the proxy and write per-request metrics.
replay_args=(
    --trace traces/sampled_1000req_seed42.jsonl
    --output "$OUTDIR/metrics.jsonl"
    --endpoint http://localhost:9090
    --model "$MODEL"
    --time-scale 20
    --max-inflight-sessions 7
    --request-limit 200
    -v
)
.venv/bin/python -m replayer "${replay_args[@]}"

# Best-effort snapshot of the proxy's /breakdown and /stats output; a failed
# fetch is deliberately ignored so it cannot abort the run.
for report in breakdown stats; do
    curl -sf "http://localhost:9090/$report" > "$OUTDIR/$report.json" 2>/dev/null || true
done

# Quick analysis
# Summarises metrics.jsonl (overall and per input-length class) and, when
# available, the proxy's breakdown.json. The program is fed through a quoted
# here-document so the shell never rewrites the Python source; the output
# directory is passed as an argument instead of being spliced into the code.
.venv/bin/python - "$OUTDIR" <<'PY'
import json
import os
import sys
from collections import Counter

outdir = sys.argv[1]


def p(v, q):
    # Value at index int(q * len(v)) of the sorted list, clamped to the last
    # element; 0 when the list is empty.
    return sorted(v)[min(int(q * len(v)), len(v) - 1)] if v else 0


# One JSON object per line; blank lines are skipped.
with open(os.path.join(outdir, 'metrics.jsonl')) as fh:
    rows = [json.loads(line) for line in fh if line.strip()]

# Requests without an 'error' value count as successful.
ok = [r for r in rows if not r.get('error')]
ttfts = sorted([r['ttft_s'] for r in ok if r.get('ttft_s')])
tpots = sorted([r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0])
e2es = sorted([r['latency_s'] for r in ok])
print('OK=%d/%d  TTFT50=%.3f TTFT90=%.3f  TPOT90=%.4f  E2E50=%.3f' % (
    len(ok), len(rows), p(ttfts, .5), p(ttfts, .9), p(tpots, .9), p(e2es, .5)))

# Same statistics per input-length class (bounds are [lo, hi)).
for lo, hi, cl in [(0, 5000, 'WARM'), (5000, 20000, 'MED'), (20000, 200000, 'HEAVY')]:
    sub = [r for r in ok if lo <= r['input_length'] < hi and r.get('ttft_s')]
    if sub:
        t = sorted([r['ttft_s'] for r in sub])
        tp = sorted([r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s'] > 0])
        print('  %-6s n=%3d  TTFT50=%.3f  TPOT90=%.4f' % (cl, len(sub), p(t, .5), p(tp, .9)))

# The breakdown file is optional (its download is best-effort and may have
# produced a missing or empty file), so problems here are reported on stderr
# instead of aborting the analysis.
try:
    with open(os.path.join(outdir, 'breakdown.json')) as fh:
        bd = json.load(fh)
    rc = Counter(b.get('route_class', '') for b in bd)
    print('Routes: %s' % dict(rc))
    ps = [b for b in bd if b.get('route_class') == 'HEAVY_PS']
    if ps:
        pf = [b['t_prefill_done'] - b['t_prefill_sent'] for b in ps
              if b.get('t_prefill_done') and b.get('t_prefill_sent')]
        if pf:
            print('PS prefill: n=%d p50=%.1fs p90=%.1fs' % (len(pf), p(pf, .5), p(pf, .9)))
except Exception as exc:
    print('breakdown analysis skipped: %s: %s' % (type(exc).__name__, exc), file=sys.stderr)

print()
print('Compare: Phase0A: TTFT50=1.073 TPOT90=0.0738 E2E50=5.096')
print('Compare: Baseline 8C: TTFT50=1.075 TPOT90=0.0761 E2E50=5.075')
PY

# Stop all servers again and print a completion marker with the current time.
cleanup
printf '=== DONE %s ===\n' "$(date)"