agentic-kvc/scripts/launch_phase1_ps.sh

#!/usr/bin/env bash
# Phase 1: Dedicated Prefill Service (7C + 1PS)
# 7 Combined instances (GPUs 0-6, ports 8000-8006, kv_both)
# 1 Prefill Service instance (GPU 7, port 8007, kv_both)
# Strict mode: stop on the first failing command, on any use of an unset
# variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Model checkpoint passed to the `vllm serve` launches and to the replayer
# below. Override with MODEL_PATH.
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

# All later paths (.venv/, scripts/, traces/, outputs/) are relative to the
# project root, so switch there first. Override the root with PROJECT_DIR.
PROJECT_DIR="${PROJECT_DIR:-$HOME/phd/agentic-kv}"
cd "$PROJECT_DIR"
. .venv/bin/activate

# Server logs, proxy log, metrics and proxy dumps for this run are written
# here (relative to PROJECT_DIR).
OUTDIR=outputs/phase1_ps
mkdir -p "$OUTDIR"

# ---- Cleanup ----
# Stop leftovers from a previous run and wait until they have really exited.
# A fixed sleep after SIGTERM is not sufficient: a server that is still
# shutting down may keep holding GPU memory while the new instances below are
# started with --gpu-memory-utilization 0.9.
echo "=== Killing existing processes ==="
# pkill exits 1 when nothing matched; that is the normal case on a clean host,
# so it must not trip `set -e`.
pkill -f "vllm serve" 2>/dev/null || true
pkill -f "cache_aware_proxy" 2>/dev/null || true

# Give the old processes up to 30 s to exit after SIGTERM.
for attempt in {1..30}; do
    if ! pgrep -f "vllm serve" > /dev/null 2>&1 \
        && ! pgrep -f "cache_aware_proxy" > /dev/null 2>&1; then
        break
    fi
    sleep 1
done

# Whatever ignored SIGTERM for the whole grace period is force-killed; this is
# a no-op when the loop above ended because everything was already gone.
pkill -9 -f "vllm serve" 2>/dev/null || true
pkill -9 -f "cache_aware_proxy" 2>/dev/null || true
sleep 3

# Informational only: prints per-GPU memory use so the operator can confirm
# the GPUs are idle. The values are not checked by the script.
echo "=== Verifying GPUs free ==="
nvidia-smi --query-gpu=index,memory.used --format=csv,noheader
sleep 2

# ---- Launch 7 Combined instances (GPUs 0-6) ----
# Starts one background `vllm serve` per GPU. Instance i is pinned to GPU i via
# CUDA_VISIBLE_DEVICES and gets its own HTTP port (8000+i), Mooncake bootstrap
# port (8998+i) and MASTER_PORT (29500+i). Output of each server goes to
# $OUTDIR/vllm_c_<i>.log.
echo "=== Launching 7 Combined instances ==="
for i in {0..6}; do
    http_port=$((8000 + i))
    bootstrap_port=$((8998 + i))
    master_port=$((29500 + i))

    # Full argument list for `vllm serve`, kept in an array so every argument
    # (including the JSON blob) stays a single word.
    serve_args=(
        "$MODEL"
        --host 0.0.0.0
        --port "$http_port"
        --tensor-parallel-size 1
        --trust-remote-code
        --enable-prefix-caching
        --dtype auto
        --gpu-memory-utilization 0.9
        --max-model-len 200000
        --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
    )

    echo "Starting C instance $i on GPU $i, port $http_port, bootstrap $bootstrap_port"
    VLLM_MOONCAKE_BOOTSTRAP_PORT="$bootstrap_port" \
    MASTER_PORT="$master_port" \
    CUDA_VISIBLE_DEVICES="$i" \
        .venv/bin/vllm serve "${serve_args[@]}" > "$OUTDIR/vllm_c_$i.log" 2>&1 &

    # Stagger the launches by two seconds.
    sleep 2
done

# ---- Launch 1 PS instance (GPU 7) ----
# Starts the single Prefill Service server in the background on GPU 7 with
# HTTP port 8007, Mooncake bootstrap port 9005 and MASTER_PORT 29507. Its
# output goes to $OUTDIR/vllm_ps_0.log.
echo "=== Launching PS instance on GPU 7, port 8007, bootstrap 9005 ==="

# Argument list for `vllm serve`; an array keeps the JSON blob a single word.
ps_serve_args=(
    "$MODEL"
    --host 0.0.0.0
    --port 8007
    --tensor-parallel-size 1
    --trust-remote-code
    --enable-prefix-caching
    --dtype auto
    --gpu-memory-utilization 0.9
    --max-model-len 200000
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
)

VLLM_MOONCAKE_BOOTSTRAP_PORT=9005 \
MASTER_PORT=29507 \
CUDA_VISIBLE_DEVICES=7 \
    .venv/bin/vllm serve "${ps_serve_args[@]}" > "$OUTDIR/vllm_ps_0.log" 2>&1 &
sleep 2

# ---- Wait for all instances healthy ----
# Polls GET /health on every server port (8000-8007) until it answers with a
# success status. The wait is bounded: if a server crashed during start-up the
# endpoint never comes up, and an unbounded loop would hang the whole run.
# HEALTH_TIMEOUT (seconds, default 1800) is the total budget for all ports
# together; the servers were started together, so one shared deadline is used.
HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-1800}"
health_deadline=$((SECONDS + HEALTH_TIMEOUT))

echo "=== Waiting for all instances to be healthy ==="
for port in 8000 8001 8002 8003 8004 8005 8006 8007; do
    echo -n "  Waiting for port $port..."
    # --max-time keeps a single probe from blocking on a socket that accepts
    # the connection but never responds.
    until curl -sf --max-time 5 "http://127.0.0.1:$port/health" > /dev/null 2>&1; do
        if (( SECONDS >= health_deadline )); then
            echo " TIMEOUT"
            echo "ERROR: port $port not healthy after ${HEALTH_TIMEOUT}s;" \
                "check the vllm_*.log files in $OUTDIR/" >&2
            exit 1
        fi
        sleep 5
        echo -n "."
    done
    echo " OK"
done

echo "=== All instances healthy ==="
sleep 5

# ---- Launch Proxy ----
# Starts scripts/cache_aware_proxy.py in the background on port 9090, pointing
# it at the seven Combined servers (ports 8000-8006) and the PS server (8007)
# together with their bootstrap ports. Output goes to $OUTDIR/proxy.log and
# the proxy's PID is kept in PROXY_PID.
echo "=== Launching cache_aware_proxy with PS ==="

# Base URLs of the Combined instances, in port order.
combined_urls=()
for combined_port in {8000..8006}; do
    combined_urls+=("http://127.0.0.1:$combined_port")
done

proxy_args=(
    --combined "${combined_urls[@]}"
    --ps-instances http://127.0.0.1:8007
    --ps-bootstrap-ports 9005
    --bootstrap-ports 8998,8999,9000,9001,9002,9003,9004
    --port 9090
)

.venv/bin/python scripts/cache_aware_proxy.py "${proxy_args[@]}" \
    > "$OUTDIR/proxy.log" 2>&1 &
PROXY_PID=$!
echo "Proxy PID: $PROXY_PID"

# Wait for proxy ready
# Polls GET /stats on the proxy until it answers with a success status. Two
# guards keep this from hanging forever: the loop aborts as soon as the proxy
# process (PROXY_PID) is no longer alive, and after PROXY_TIMEOUT seconds
# (default 120) regardless.
PROXY_TIMEOUT="${PROXY_TIMEOUT:-120}"
proxy_deadline=$((SECONDS + PROXY_TIMEOUT))

echo -n "  Waiting for proxy..."
until curl -sf --max-time 5 "http://127.0.0.1:9090/stats" > /dev/null 2>&1; do
    # kill -0 sends no signal; it only reports whether the PID still exists.
    if ! kill -0 "$PROXY_PID" 2>/dev/null; then
        echo " FAILED"
        echo "ERROR: proxy (PID $PROXY_PID) exited before becoming ready;" \
            "see $OUTDIR/proxy.log" >&2
        exit 1
    fi
    if (( SECONDS >= proxy_deadline )); then
        echo " TIMEOUT"
        echo "ERROR: proxy not ready after ${PROXY_TIMEOUT}s;" \
            "see $OUTDIR/proxy.log" >&2
        exit 1
    fi
    sleep 2
    echo -n "."
done
echo " OK"

echo "=== Running benchmark ==="
# Replays the sampled trace against the proxy on port 9090 and writes the
# per-request metrics to $OUTDIR/metrics.jsonl. Runs in the foreground; under
# `set -e` a failing replayer aborts the script here.
.venv/bin/python -m replayer --trace traces/sampled_1000req_seed42.jsonl \
    --output "$OUTDIR/metrics.jsonl" \
    --endpoint http://localhost:9090 --model "$MODEL" \
    --time-scale 20 --max-inflight-sessions 7 --request-limit 200 -v

echo "=== Saving proxy breakdown and stats ==="
# -f: an HTTP error status makes curl fail instead of saving the error body
#     into a .json file as if it were valid data.
# -S: still print the failure reason even though -s silences progress output.
# A failure aborts the script (set -e), as a connection error already did.
curl -fsS "http://127.0.0.1:9090/breakdown" > "$OUTDIR/breakdown.json"
curl -fsS "http://127.0.0.1:9090/stats" > "$OUTDIR/stats.json"

echo "=== Benchmark complete ==="
echo "Results in $OUTDIR/"
echo ""

# ---- Quick analysis ----

#######################################
# Print a short summary of one benchmark run.
#
# Reads <dir>/metrics.jsonl (one JSON object per line) and prints request
# counts, the success rate and p50/p90/p99 of the 'ttft', 'tpot' and 'e2e'
# fields of records whose 'status' is 'ok'. Then reads <dir>/breakdown.json
# and prints per-'route_class' counts, the number of 'HEAVY_PS' records and
# the 'offload_reason' counts of records whose class starts with 'HEAVY'.
#
# The directory is passed to Python through argv and the program text comes
# from a quoted heredoc, so nothing from the shell is spliced into Python
# source. Blank lines in metrics.jsonl are skipped, and an empty metrics file
# yields "Success rate: n/a" instead of a ZeroDivisionError.
#
# Arguments: $1 - directory containing metrics.jsonl and breakdown.json
# Outputs:   summary on stdout, errors on stderr
# Returns:   0 on success; non-zero if metrics.jsonl is unreadable or holds
#            invalid JSON. Problems with breakdown.json are reported on stdout
#            and do not change the exit status.
#######################################
summarize_phase1_results() {
    python3 - "$1" <<'PY'
import json
import os
import sys

outdir = sys.argv[1]
metrics_path = os.path.join(outdir, 'metrics.jsonl')
breakdown_path = os.path.join(outdir, 'breakdown.json')

records = []
try:
    with open(metrics_path) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                records.append(json.loads(line))
except OSError as e:
    print(f'Cannot read {metrics_path}: {e}', file=sys.stderr)
    sys.exit(1)

ok = [r for r in records if r.get('status') == 'ok']
fail = [r for r in records if r.get('status') != 'ok']
print(f'Total: {len(records)}, OK: {len(ok)}, Failed: {len(fail)}')
if records:
    print(f'Success rate: {len(ok)/len(records)*100:.1f}%')
else:
    # Nothing was recorded; a rate would be a division by zero.
    print('Success rate: n/a (no records)')

if ok:
    ttfts = sorted([r['ttft'] for r in ok])
    tpots = sorted([r['tpot'] for r in ok])
    e2es = sorted([r['e2e'] for r in ok])

    def pct(vals, p):
        # Index-based percentile on the sorted list, clamped to the last item.
        idx = int(len(vals) * p / 100)
        return vals[min(idx, len(vals)-1)]

    print(f'TTFT p50={pct(ttfts,50):.3f} p90={pct(ttfts,90):.3f} p99={pct(ttfts,99):.3f}')
    print(f'TPOT p50={pct(tpots,50):.4f} p90={pct(tpots,90):.4f} p99={pct(tpots,99):.4f}')
    print(f'E2E  p50={pct(e2es,50):.3f} p90={pct(e2es,90):.3f} p99={pct(e2es,99):.3f}')

# Breakdown analysis (best effort: any problem is reported, not fatal)
try:
    with open(breakdown_path) as f:
        bd = json.load(f)
    classes = {}
    for r in bd:
        rc = r.get('route_class', 'UNKNOWN')
        classes[rc] = classes.get(rc, 0) + 1
    print(f'\nRoute class breakdown:')
    for rc, cnt in sorted(classes.items()):
        print(f'  {rc}: {cnt}')

    # PS utilization
    ps_reqs = [r for r in bd if r.get('route_class') == 'HEAVY_PS']
    print(f'\nPS offloaded: {len(ps_reqs)} requests')

    # Offload reasons for HEAVY
    heavy = [r for r in bd if r.get('route_class', '').startswith('HEAVY')]
    reasons = {}
    for r in heavy:
        reason = r.get('offload_reason', 'unknown')
        reasons[reason] = reasons.get(reason, 0) + 1
    if reasons:
        print(f'HEAVY offload reasons:')
        for reason, cnt in sorted(reasons.items()):
            print(f'  {reason}: {cnt}')
except Exception as e:
    print(f'Breakdown analysis error: {e}')
PY
}

echo "=== Quick metrics summary ==="
summarize_phase1_results "$OUTDIR"

echo ""
echo "=== Phase 1 experiment complete ==="