#!/bin/bash
# Exp (c): does wall-clock amplification survive Mode 2 (real think-time)?
# Launch N vLLM instances; replayer round-robins across them; replay the SAME
# annotated trace under Mode 1 (tracets) and Mode 2 (thinktime).
#
# Environment overrides:
#   N               number of vLLM instances; instance i is pinned to GPU i
#                   and listens on BASE_PORT+i (default: 4)
#   TRACE           trace file handed to the replayer
#                   (default: traces/w600_ttp_win.jsonl)
#   REQLIMIT        if non-empty, forwarded as --request-limit (default: unset)
#   BASE_PORT       port of instance 0 (default: 8000)
#   HEALTH_TIMEOUT  seconds to wait for each instance's /health (default: 900)
#
# Outputs, all under $OUT:
#   vllm_<port>.log        stdout+stderr of each server
#   metrics_<mode>.jsonl   replayer output for each dispatch mode
#   summary_<mode>.json    copy of metrics_<mode>.summary.json, if it exists
#
# Exit status: 0 when both replays succeeded; non-zero on a setup failure or
# when any replay returned a non-zero status.

# No `set -e` on purpose: a failed replay in one mode must not prevent the
# other mode from running. Every critical step is checked explicitly instead.
set -uo pipefail

cd /home/admin/cpfs/wjh/agentic-kv || {
  echo "cannot cd to /home/admin/cpfs/wjh/agentic-kv" >&2
  exit 1
}

PY=.venv/bin/python
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
N=${N:-4}
TRACE=${TRACE:-traces/w600_ttp_win.jsonl}
REQLIMIT=${REQLIMIT:-}
BASE_PORT=${BASE_PORT:-8000}
HEALTH_TIMEOUT=${HEALTH_TIMEOUT:-900}
OUT=v2/exp_c_dispatch_ablation/results

# N drives both the GPU index and the port offset, so it has to be a positive
# integer; anything else would silently yield an empty endpoint list.
if ! [[ "$N" =~ ^[1-9][0-9]*$ ]]; then
  echo "N must be a positive integer, got '$N'" >&2
  exit 2
fi

mkdir -p "$OUT" || exit 1

PIDS=()        # PIDs of the launched servers, in launch order (index == GPU)
ENDPOINTS=()   # matching base URLs, later joined with ',' into EPS
TORN_DOWN=0    # guard so teardown does its work only once

#######################################
# Start one vLLM OpenAI-compatible server in the background.
# Globals:   PY, MODEL, OUT (read); PIDS (appended with the new PID)
# Arguments: $1 - GPU index exported as CUDA_VISIBLE_DEVICES
#            $2 - TCP port to listen on
# Outputs:   server stdout+stderr go to $OUT/vllm_<port>.log
#######################################
launch() {
  local gpu=$1 port=$2
  CUDA_VISIBLE_DEVICES=$gpu VLLM_LOGGING_LEVEL=WARNING \
    "$PY" -m vllm.entrypoints.openai.api_server --model "$MODEL" \
    --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 --trust-remote-code \
    --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000 \
    --gpu-memory-utilization 0.9 > "$OUT/vllm_${port}.log" 2>&1 &
  PIDS+=("$!")
}

#######################################
# Stop the launched servers: TERM first, then SIGKILL whatever engine-core
# processes are still around. Safe to call more than once (explicit call plus
# the EXIT trap); only the first call does any work.
# Globals:   PIDS (read); TORN_DOWN (read/written)
#######################################
teardown() {
  (( TORN_DOWN )) && return 0
  TORN_DOWN=1

  local pid
  # Length check first: expanding an empty array under `set -u` is an error on
  # older bash, and a `:-` default would hand kill an empty argument.
  if (( ${#PIDS[@]} > 0 )); then
    for pid in "${PIDS[@]}"; do
      kill -TERM "$pid" 2>/dev/null
    done
  fi
  sleep 6
  # NOTE: matches by command line, so this hits every process whose command
  # line contains "VLLM::EngineCore" that we are allowed to signal, not only
  # the ones started by this script.
  pkill -KILL -f "VLLM::EngineCore" 2>/dev/null
  sleep 3
}

#######################################
# Poll an instance's /health endpoint until it answers successfully.
# Globals:   HEALTH_TIMEOUT (read)
# Arguments: $1 - port of the instance
#            $2 - PID of the server process
# Returns:   0 once healthy; 1 if the process exited or the deadline passed
#######################################
wait_healthy() {
  local port=$1 pid=$2
  local deadline=$(( SECONDS + HEALTH_TIMEOUT ))
  # --max-time keeps a hung connection from blocking past the deadline.
  until curl -sf --max-time 5 "http://127.0.0.1:${port}/health" >/dev/null 2>&1; do
    # Fail fast when the server has already died instead of waiting it out.
    kill -0 "$pid" 2>/dev/null || return 1
    (( SECONDS < deadline )) || return 1
    sleep 5
  done
}

trap teardown EXIT

echo ">>> launch $N instances"
for (( i = 0; i < N; i++ )); do
  port=$(( BASE_PORT + i ))
  launch "$i" "$port"
  ENDPOINTS+=("http://127.0.0.1:${port}")
done
# Comma-joined endpoint list, the form passed to the replayer's --endpoint.
EPS=$(IFS=,; printf '%s' "${ENDPOINTS[*]}")

for (( i = 0; i < N; i++ )); do
  port=$(( BASE_PORT + i ))
  echo -n " wait health ${port}..."
  if wait_healthy "$port" "${PIDS[i]}"; then
    echo ok
  else
    echo FAIL
    echo "instance on port ${port} not healthy; see $OUT/vllm_${port}.log" >&2
    exit 1
  fi
done

STATUS=0
for MODE in tracets thinktime; do
  echo "=== replay dispatch-mode=$MODE ==="
  # Build the command as an array so optional arguments stay properly quoted.
  REPLAY=("$PY" -m replayer --trace "$TRACE" --output "$OUT/metrics_$MODE.jsonl"
          --endpoint "$EPS" --model "$MODEL" --dispatch-mode "$MODE")
  if [[ -n "$REQLIMIT" ]]; then
    REPLAY+=(--request-limit "$REQLIMIT")
  fi
  if ! "${REPLAY[@]}"; then
    echo "replay dispatch-mode=$MODE failed" >&2
    STATUS=1
  fi
  # Best effort: the summary file may be absent, e.g. after a failed replay.
  cp "$OUT/metrics_$MODE.summary.json" "$OUT/summary_$MODE.json" 2>/dev/null || true
done

teardown
if (( STATUS == 0 )); then
  echo "=== exp (c) DONE ==="
else
  echo "=== exp (c) finished with replay failures ===" >&2
fi
exit "$STATUS"