Adds `--dispatch-mode {tracets,thinktime}` to the replayer and documents that
agentic serving should be benchmarked with `thinktime` (the faithful load).
- `tracets` (old default): dispatches turn-k at its absolute trace timestamp,
i.e. at max(prev_finished, trace_ts). When the system is behind, this collapses
inter-turn think-time to ~0 and manufactures request bursts.
- `thinktime`: turn-1 at trace arrival; turn-k at prev_finished +
time_to_parent_chat (real production gap). scripts/add_time_to_parent.py
annotates a trace with that gap from the raw trace's request_ready/end_ms.
exp(c) ablation (v2/exp_c_dispatch_ablation/): at N=8 (capacity slack) thinktime
beats tracets -- E2E p90 -28% (73.5 vs 102.8s), TTFT p90 -29%, TPS +7%, because
tracets' bursts spike concurrency -> KV pressure -> preemption. At N=6
(saturated) the two modes converge. So, when there is capacity slack, tracets
makes the system look ~30% worse on tail latency than realistic agent pacing
does. The root README.md carries the headline guidance; raw per-request metrics
are gitignored (perf_summary.json is kept).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
54 lines
1.9 KiB
Bash
54 lines
1.9 KiB
Bash
#!/bin/bash
# Exp (c): does wall-clock amplification survive Mode 2 (real think-time)?
# Launch N vLLM instances; replayer round-robins across them; replay the SAME
# annotated trace under Mode 1 (tracets) and Mode 2 (thinktime).
#
# Environment overrides:
#   N         number of vLLM instances to launch (default: 4)
#   TRACE     trace file to replay (default: traces/w600_ttp_win.jsonl)
#   REQLIMIT  if non-empty, forwarded to the replayer as --request-limit
set -uo pipefail

# PY, TRACE and OUT are all relative paths, so everything below depends on the
# working directory. Abort instead of running from wherever we were started.
cd /home/admin/cpfs/wjh/agentic-kv || {
  echo "cannot cd to /home/admin/cpfs/wjh/agentic-kv" >&2
  exit 1
}

PY=.venv/bin/python
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
N=${N:-4}
# N feeds shell arithmetic for GPU indices and ports; reject anything that is
# not a positive integer before any server is started.
if ! [[ "$N" =~ ^[1-9][0-9]*$ ]]; then
  echo "N must be a positive integer, got '$N'" >&2
  exit 2
fi
TRACE=${TRACE:-traces/w600_ttp_win.jsonl}
REQLIMIT=${REQLIMIT:-}
OUT=v2/exp_c_dispatch_ablation/results
mkdir -p "$OUT" || { echo "cannot create $OUT" >&2; exit 1; }
PIDS=()   # PIDs of the background servers
EPS=""    # comma-separated endpoint URLs handed to the replayer

#######################################
# Start one vLLM OpenAI-compatible API server in the background.
# Globals:   PY, MODEL, OUT (read); PIDS (the new server's PID is appended)
# Arguments: $1 - value exported as CUDA_VISIBLE_DEVICES for the server
#            $2 - TCP port the server listens on
# Outputs:   server stdout/stderr are redirected to "$OUT/vllm_<port>.log"
# Returns:   immediately; the server keeps running as a background job
#######################################
launch() {
  local gpu=$1 port=$2
  CUDA_VISIBLE_DEVICES="$gpu" VLLM_LOGGING_LEVEL=WARNING \
    "$PY" -m vllm.entrypoints.openai.api_server --model "$MODEL" \
    --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 --trust-remote-code \
    --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000 \
    --gpu-memory-utilization 0.9 > "$OUT/vllm_${port}.log" 2>&1 &
  # Record the PID of the job just backgrounded.
  PIDS+=("$!")
}

#######################################
# Stop every server recorded in PIDS, then force-kill leftover engine workers.
# Only the first call does any work, so an explicit call followed by the EXIT
# trap does not repeat the kills and the sleeps.
# Globals:   PIDS (read); TEARDOWN_DONE (written)
# Returns:   0
#######################################
teardown() {
  if [[ -n "${TEARDOWN_DONE:-}" ]]; then
    return 0
  fi
  TEARDOWN_DONE=1
  # Nothing was launched: skip the kills and both grace periods.
  if (( ${#PIDS[@]} == 0 )); then
    return 0
  fi
  local p
  for p in "${PIDS[@]}"; do kill -TERM "$p" 2>/dev/null; done
  sleep 6
  # pgrep -f matches any process whose command line contains this string, not
  # only descendants of the servers in PIDS.
  # NOTE(review): confirm no unrelated vLLM job shares this host.
  for p in $(pgrep -f "VLLM::EngineCore"); do kill -9 "$p" 2>/dev/null; done
  sleep 3
  return 0
}
trap teardown EXIT

#######################################
# Poll a server's /health endpoint until it answers successfully.
# Arguments: $1 - port of the server on 127.0.0.1
#            $2 - PID of the server process (empty string skips the liveness check)
# Returns:   0 once /health succeeds; 1 if the process is gone or 900 s elapse
#######################################
wait_healthy() {
  local port=$1 pid=$2
  local deadline=$(( SECONDS + 900 ))
  until curl -sf --max-time 5 "http://127.0.0.1:${port}/health" >/dev/null 2>&1; do
    # A server that has already exited will never become healthy; fail now
    # instead of polling until the deadline.
    if [[ -n "$pid" ]] && ! kill -0 "$pid" 2>/dev/null; then
      return 1
    fi
    (( SECONDS < deadline )) || return 1
    sleep 5
  done
}

echo ">>> launch $N instances"
# Instance i uses GPU index i and port 8000+i.
EPS=""
for (( i = 0; i < N; i++ )); do
  port=$(( 8000 + i ))
  launch "$i" "$port"
  EPS="${EPS:+${EPS},}http://127.0.0.1:${port}"
done

for (( i = 0; i < N; i++ )); do
  port=$(( 8000 + i ))
  printf ' wait health %s...' "$port"
  if wait_healthy "$port" "${PIDS[i]:-}"; then
    echo ok
  else
    echo FAIL
    exit 1   # the EXIT trap tears the servers down
  fi
done

rc=0
for MODE in tracets thinktime; do
  echo "=== replay dispatch-mode=$MODE ==="
  # Build the command as an array so the optional flag needs no word-splitting.
  replay_cmd=("$PY" -m replayer --trace "$TRACE" --output "$OUT/metrics_$MODE.jsonl"
    --endpoint "$EPS" --model "$MODEL" --dispatch-mode "$MODE")
  if [[ -n "$REQLIMIT" ]]; then
    replay_cmd+=(--request-limit "$REQLIMIT")
  fi
  # Keep going so the other mode still runs, but remember the failure.
  if ! "${replay_cmd[@]}"; then
    echo "replay dispatch-mode=$MODE failed" >&2
    rc=1
  fi
  # Best effort: the summary file may be absent, e.g. after a failed replay.
  cp "$OUT/metrics_$MODE.summary.json" "$OUT/summary_$MODE.json" 2>/dev/null || true
done
teardown
echo "=== exp (c) DONE ==="
exit "$rc"