agentic-kvc/scripts/legacy/run_ps_ablation.sh

#!/bin/bash
# PS offload ablation: 3 experiments back-to-back with full cleanup between each.
# 1. always_offload: all HEAVY → PS (zero hyperparameters)
# 2. cost_model: cost-based decision (no interference gate)
# 3. high_load: cost-based + 1000 requests (higher contention)
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Every relative path used below (VENV, TRACE, outputs/, scripts/) is resolved
# against this checkout.
cd /home/admin/cpfs/wjh/agentic-kv
. .venv/bin/activate

# Directory holding the virtualenv executables (vllm, python).
VENV=.venv/bin
# Trace file handed to the replayer via --trace.
TRACE=traces/sampled_1000req_seed42.jsonl
# Model path handed to both "vllm serve" and the replayer.
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct

# SIGKILL every process that currently holds an NVIDIA device node open.
# Best effort: a missing fuser, no matching device node, or an already-dead
# PID is silently ignored.
_kill_gpu_holders() {
    local pid
    for pid in $(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u | grep -v '^$' || true); do
        kill -9 "$pid" 2>/dev/null || true
    done
}

# Print nvidia-smi's memory.used summed over all GPUs (the log messages below
# label this figure as MB). Prints 0 when nvidia-smi is missing, fails, or
# reports nothing.
_gpu_mem_used_mb() {
    local total
    # "|| true": with pipefail a failing nvidia-smi would otherwise abort the
    # whole script under set -e; the original code tolerated that failure.
    total=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null \
        | awk '{s+=$1} END{print s+0}') || true
    echo "${total:-0}"
}

# Hard reset between experiments: kill every "vllm serve" / cache_aware_proxy
# process, then anything still holding a GPU device, and check that GPU memory
# was released (threshold: 100). One retry is made after a 10s wait; if memory
# is still held after that, a warning is printed and the function returns
# normally so the caller can proceed.
# Arguments: none.  Returns: always 0.
cleanup() {
    local pid used

    echo "[cleanup] Killing all processes..."
    # pgrep -f matches the extended regex against the full command line and
    # never reports itself, so no "grep -v grep" filtering is needed.
    for pid in $(pgrep -f 'vllm serve|cache_aware_proxy' || true); do
        kill -9 "$pid" 2>/dev/null || true
    done
    sleep 3
    _kill_gpu_holders
    sleep 5

    used=$(_gpu_mem_used_mb)
    if [ "$used" -gt 100 ]; then
        echo "[ERROR] GPUs not free (${used}MB). Waiting 10s..."
        sleep 10
        _kill_gpu_holders
        sleep 5
        # Re-check so the log says whether the retry actually freed the GPUs.
        used=$(_gpu_mem_used_mb)
        if [ "$used" -gt 100 ]; then
            echo "[WARN] GPUs still not free after retry (${used}MB); continuing anyway."
        fi
    fi
    echo "[cleanup] Done."
}

# Start eight single-GPU "vllm serve" instances (all with kv_role kv_both) and
# block until every one of them is reachable.
#
# Instance i (0..7) uses GPU i, HTTP port 8000+i, Mooncake bootstrap port
# 8998+i and MASTER_PORT 29500+i. Instances 0..6 log to vllm_c_<i>.log;
# instance 7 is the PS instance and logs to vllm_ps.log.
#
# Globals:   VENV, MODEL (read)
# Arguments: $1 - directory that receives the per-instance log files
# Returns:   0 once all HTTP and bootstrap ports answer; otherwise the exit
#            status of the failed wait (124 when the timeout expired), after
#            printing which instance/port was not ready.
launch_7c_1ps() {
    local logdir=${1:?launch_7c_1ps: log directory required}
    local i bp port log rc
    # Options shared by all eight instances.
    local -a common_args=(
        --tensor-parallel-size 1
        --trust-remote-code --enable-prefix-caching --enforce-eager
        --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
        --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
    )

    echo "[launch] 7C (kv_both) + 1PS (kv_both)..."
    for ((i = 0; i < 8; i++)); do
        if ((i < 7)); then
            log="$logdir/vllm_c_$i.log"
        else
            log="$logdir/vllm_ps.log"
        fi
        VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i)) MASTER_PORT=$((29500 + i)) CUDA_VISIBLE_DEVICES=$i \
            "$VENV/vllm" serve "$MODEL" --host 0.0.0.0 --port "$((8000 + i))" "${common_args[@]}" \
            > "$log" 2>&1 &
        # Stagger the launches; nothing follows the last instance, so no pause there.
        if ((i < 7)); then
            sleep 2
        fi
    done

    for ((i = 0; i < 8; i++)); do
        port=$((8000 + i))
        # -f: an HTTP error status from /health counts as "not ready yet".
        timeout 600 bash -c "until curl -sf localhost:$port/health > /dev/null 2>&1; do sleep 5; done" || {
            rc=$?
            echo "[launch] ERROR: inst_$i (port $port) did not become healthy (timeout 600s)" >&2
            return "$rc"
        }
        echo "  inst_$i healthy"
    done
    for ((bp = 8998; bp <= 9005; bp++)); do
        timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done" || {
            rc=$?
            echo "[launch] ERROR: bootstrap port $bp did not answer (timeout 120s)" >&2
            return "$rc"
        }
    done
    echo "[launch] All ready."
}

# Run one ablation experiment end to end: clean the machine, launch the eight
# vLLM instances, start the cache-aware proxy on port 9090, replay the trace
# through it, save the proxy's /breakdown and /stats output, and print a short
# latency summary.
#
# Globals:   VENV, MODEL, TRACE (read)
# Arguments: $1 - tag; results are written to outputs/<tag>
#            $2 - value passed to the proxy as --offload-mode
#            $3 - value passed to the replayer as --request-limit
#            $4 - value passed to the replayer as --max-inflight-sessions
# Outputs:   progress and summary lines on stdout, warnings on stderr
# Returns:   non-zero (aborting the script under set -e) when cleanup, launch
#            or the replay fails; a failing summary only produces a warning.
run_experiment() {
    local tag=$1
    local offload_mode=$2
    local requests=$3
    local sessions=$4
    local outdir=outputs/$tag
    local proxy_pid

    echo ""
    echo "================================================================"
    echo "  Experiment: $tag (mode=$offload_mode, requests=$requests)"
    echo "  $(date)"
    echo "================================================================"

    cleanup
    mkdir -p "$outdir"

    launch_7c_1ps "$outdir"

    echo "[proxy] Starting (offload_mode=$offload_mode)..."
    "$VENV/python" scripts/cache_aware_proxy.py \
        --combined http://127.0.0.1:8000 http://127.0.0.1:8001 http://127.0.0.1:8002 \
                   http://127.0.0.1:8003 http://127.0.0.1:8004 http://127.0.0.1:8005 http://127.0.0.1:8006 \
        --ps-instances http://127.0.0.1:8007 \
        --ps-bootstrap-ports 9005 \
        --bootstrap-ports 8998,8999,9000,9001,9002,9003,9004 \
        --offload-mode "$offload_mode" \
        --port 9090 > "$outdir/proxy.log" 2>&1 &
    proxy_pid=$!
    sleep 3
    # Surface an early proxy crash instead of letting the benchmark fail
    # without explanation; only a warning, the run still proceeds.
    if ! kill -0 "$proxy_pid" 2>/dev/null; then
        echo "[proxy] WARNING: proxy process exited during startup; see $outdir/proxy.log" >&2
    fi

    echo "[bench] Running $requests requests, $sessions sessions..."
    "$VENV/python" -m replayer --trace "$TRACE" \
        --output "$outdir/metrics.jsonl" \
        --endpoint http://localhost:9090 --model "$MODEL" \
        --time-scale 20 --max-inflight-sessions "$sessions" \
        --request-limit "$requests" -v 2>&1 | tail -5

    # Best effort: the proxy endpoints may be unavailable; an empty file is left behind then.
    curl -sf http://localhost:9090/breakdown > "$outdir/breakdown.json" 2>/dev/null || true
    curl -sf http://localhost:9090/stats > "$outdir/stats.json" 2>/dev/null || true

    # Quick summary. Reporting only: a failure here must not abort the
    # remaining experiments. The output directory is passed as an argument
    # (not spliced into the Python source) and the here-document is quoted, so
    # the shell never rewrites the Python code.
    "$VENV/python" - "$outdir" <<'PY' || echo "[summary] WARNING: could not summarize $outdir/metrics.jsonl" >&2
import json
import sys
from collections import Counter

outdir = sys.argv[1]

def p(v, q):
    # v is already sorted: take the element at index int(q*len(v)), clamped
    # to the last one; 0 for an empty list.
    return v[min(int(q * len(v)), len(v) - 1)] if v else 0

with open(outdir + '/metrics.jsonl') as fh:
    rows = [json.loads(line) for line in fh]
ok = [r for r in rows if not r.get('error')]
ttfts = sorted([r['ttft_s'] for r in ok if r.get('ttft_s')])
tpots = sorted([r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0])
e2es = sorted([r['latency_s'] for r in ok])
print('  OK=%d/%d  TTFT50=%.3f  TTFT90=%.3f  TPOT90=%.4f  E2E50=%.3f' % (
    len(ok), len(rows), p(ttfts, .5), p(ttfts, .9), p(tpots, .9), p(e2es, .5)))

# Per-class breakdown by input length.
for lo, hi, cl in [(0, 5000, 'WARM'), (5000, 20000, 'MED'), (20000, 200000, 'HEAVY')]:
    sub = [r for r in ok if lo <= r['input_length'] < hi and r.get('ttft_s')]
    if sub:
        t = sorted([r['ttft_s'] for r in sub])
        tp = sorted([r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s'] > 0])
        print('  %-6s n=%3d  TTFT50=%.3f  TPOT90=%.4f' % (cl, len(sub), p(t, .5), p(tp, .9)))

# breakdown.json is optional (it is empty when the proxy endpoint failed).
try:
    with open(outdir + '/breakdown.json') as fh:
        bd = json.load(fh)
    rc = Counter(b.get('route_class', '') for b in bd)
    print('  Routes: %s' % dict(rc))
except Exception as e:
    print('  Routes: unavailable (%s)' % e)
PY
    echo "  Done: $tag"
}

# ─── Run experiments ───
# One entry per experiment: tag, offload mode, request limit, max in-flight sessions.
experiments=(
    "ps_always always 200 7"
    "ps_cost cost 200 7"
    "ps_highload cost 1000 7"
)
for spec in "${experiments[@]}"; do
    read -r exp_tag exp_mode exp_requests exp_sessions <<< "$spec"
    run_experiment "$exp_tag" "$exp_mode" "$exp_requests" "$exp_sessions"
done

# ─── Final comparison ───
# Free the GPUs first, then print one table row per metrics file.
cleanup

banner_rule="================================================================"
echo
echo "$banner_rule"
echo "  FINAL COMPARISON"
echo "$banner_rule"
$VENV/python -c "
import json

# (metrics file, row label), in the order the rows are printed.
RUNS = [
    ('outputs/phase0a_7c_kvboth/metrics.jsonl', 'Control: 7C no PS'),
    ('outputs/ps_always/metrics.jsonl', 'PS always offload'),
    ('outputs/ps_cost/metrics.jsonl', 'PS cost model'),
    ('outputs/ps_highload/metrics.jsonl', 'PS cost 1000req'),
    ('outputs/baseline_stability_fresh/metrics.jsonl', 'Baseline: 8C plain'),
]

def pct(ordered, q):
    # ordered is a sorted list: pick index int(q*n), clamped to the last
    # element; 0 when the list is empty.
    if not ordered:
        return 0
    return ordered[min(int(q * len(ordered)), len(ordered) - 1)]

print('%-25s  %7s  %7s %7s  %7s  %7s' % ('Config', 'OK/N', 'TTFT50', 'TTFT90', 'TPOT90', 'E2E50'))
print('-' * 80)
for metrics_path, label in RUNS:
    try:
        with open(metrics_path) as fh:
            rows = [json.loads(line) for line in fh]
        good = [r for r in rows if not r.get('error')]
        ttft = sorted([r['ttft_s'] for r in good if r.get('ttft_s')])
        tpot = sorted([r['tpot_s'] for r in good if r.get('tpot_s') and r['tpot_s'] > 0])
        e2e = sorted([r['latency_s'] for r in good])
        print('%-25s  %3d/%3d  %7.3f %7.3f  %7.4f  %7.3f' % (
            label, len(good), len(rows), pct(ttft, .5), pct(ttft, .9), pct(tpot, .9), pct(e2e, .5)))
    except Exception as err:
        # A missing or malformed metrics file becomes a row showing the error.
        print('%-25s  %s' % (label, err))
"
echo
echo "=== ALL DONE $(date) ==="