Experiments run:
- Phase 0: kv_both has zero idle overhead (TPOT +1.3%, noise)
- PS V1 (cold prefill): REJECTED — PS always slower than cached C
- PS V1+flexD: 92.5% OK, HEAVY TTFT 7.8s (baseline 5.0s) — PS bottleneck
- V2 (C_s prefill + flexible D): E2E -9% but 6 errors, RDMA bimodal
- H4 (cache-gate): 198/200 OK, GPU imbalance 4.0x→2.0x, but HEAVY_OFFLOAD TTFT=11.5s due to RDMA. HEAVY_COLO improved 10.5% from better balance.
- H5: Mooncake RDMA transfer R²=0.095, bimodal (0.6s or 18-30s)

Key findings:
- Mooncake lacks layerwise KV transfer → RDMA is pure sequential overhead
- 92% of HEAVY are turn-1 cold → offloading cold requests always loses
- GPU balance improvement from routing IS real (-10.5% HEAVY_COLO TTFT)
- RDMA transfer negates the routing benefit for offloaded requests

Code changes:
- bench.sh: add GPU timeline monitoring (gpu_monitor.sh during benchmark)
- cache_aware_proxy.py: H4 cache-gate, flexible D, PS routing
- mooncake_connector.py: elif→if fix (allow dual prefill+decode flags)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
80 lines
3.3 KiB
Bash
Executable File
80 lines
3.3 KiB
Bash
Executable File
#!/bin/bash
# Runs the ps_cost and ps_highload experiments (ps_always was already done).

# Stop on any failing command, unset variable, or failing pipeline stage.
set -euo pipefail

# Model path handed to `vllm serve` and to the replayer's --model option.
MODEL="/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct"
# Directory (relative to the project root) holding the venv executables.
VENV=".venv/bin"
# Trace file handed to the replayer's --trace option.
TRACE="traces/sampled_1000req_seed42.jsonl"

# All relative paths in this script are resolved from the project root.
cd /home/admin/cpfs/wjh/agentic-kv
source .venv/bin/activate

# Force-kill leftover servers so the next run starts from a clean machine.
#
# Two passes, both best-effort (kill failures are ignored):
#   1. SIGKILL every process whose `ps aux` line matches 'vllm serve' or
#      'cache_aware_proxy', then wait 3 seconds.
#   2. SIGKILL every process that `fuser` reports on /dev/nvidia*, then wait
#      5 seconds.
cleanup() {
  local victim

  # `grep -v grep` drops the grep process itself from the match list.
  for victim in $(
    ps aux \
      | grep -E 'vllm serve|cache_aware_proxy' \
      | grep -v grep \
      | awk '{print $2}' 2>/dev/null
  ); do
    kill -9 "$victim" 2>/dev/null || true
  done
  sleep 3

  # One PID per line, de-duplicated; `|| true` tolerates "nothing found".
  for victim in $(
    fuser /dev/nvidia* 2>/dev/null \
      | tr ' ' '\n' \
      | sort -u \
      | grep -v '^$' || true
  ); do
    kill -9 "$victim" 2>/dev/null || true
  done
  sleep 5
}

# Start eight single-GPU vLLM servers in the background and block until all
# of them answer.
#
# Instance i (i = 0..7) is pinned to GPU i and uses:
#   HTTP port                     8000 + i
#   VLLM_MOONCAKE_BOOTSTRAP_PORT  8998 + i
#   MASTER_PORT                   29500 + i
# Instances 0-6 log to <outdir>/vllm_c_<i>.log; instance 7 logs to
# <outdir>/vllm_ps.log. All eight are started with identical serve options
# (kv_role "kv_both").
#
# Globals:   VENV (read), MODEL (read)
# Arguments: $1 - directory for the server logs (created if missing)
# Outputs:   progress lines on stdout; a diagnostic on stderr if a wait fails
# Returns:   0 when every /health and bootstrap /query endpoint responded;
#            otherwise the exit status of the wait that failed (124 when the
#            `timeout` expired).
launch_7c_1ps() {
  local outdir=$1
  local -r last_combined=6 ps_gpu=7
  # Options shared by every instance; only the port and log file differ.
  local -a serve_opts=(
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching --enforce-eager
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
  )
  local gpu log bp rc

  mkdir -p "$outdir"

  for ((gpu = 0; gpu <= ps_gpu; gpu++)); do
    if ((gpu <= last_combined)); then
      log="$outdir/vllm_c_$gpu.log"
    else
      log="$outdir/vllm_ps.log"
    fi

    VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + gpu)) \
      MASTER_PORT=$((29500 + gpu)) \
      CUDA_VISIBLE_DEVICES=$gpu \
      "$VENV/vllm" serve "$MODEL" --host 0.0.0.0 --port "$((8000 + gpu))" \
      "${serve_opts[@]}" \
      > "$log" 2>&1 &

    # Pause after each of instances 0-6; the last instance is followed
    # directly by the health wait below.
    if ((gpu <= last_combined)); then
      sleep 2
    fi
  done

  # The servers are background jobs, so one that dies early is only noticed
  # here, when its health endpoint never comes up.
  for ((gpu = 0; gpu <= ps_gpu; gpu++)); do
    rc=0
    timeout 600 bash -c "until curl -s localhost:$((8000 + gpu))/health > /dev/null 2>&1; do sleep 5; done" || rc=$?
    if ((rc != 0)); then
      echo " inst_$gpu did not become healthy (status $rc); see logs in $outdir" >&2
      return "$rc"
    fi
    echo " inst_$gpu healthy"
  done

  for ((bp = 8998; bp <= 8998 + ps_gpu; bp++)); do
    rc=0
    timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done" || rc=$?
    if ((rc != 0)); then
      echo " bootstrap port $bp did not respond (status $rc); see logs in $outdir" >&2
      return "$rc"
    fi
  done

  echo " All ready"
}

# Run one experiment end to end: reset the machine, start the servers and
# the proxy, replay the trace through the proxy, then save the proxy's
# /breakdown and /stats output.
#
# Globals:   VENV, TRACE, MODEL (read)
# Arguments: $1 - tag; results are written under outputs/<tag>
#            $2 - value for the proxy's --offload-mode
#            $3 - value for the replayer's --request-limit
#            $4 - value for the replayer's --max-inflight-sessions
# Outputs:   a banner, the last 5 lines of replayer output, and a completion
#            line on stdout
run_exp() {
  local tag=$1
  local mode=$2
  local reqs=$3
  local sess=$4
  local outdir=outputs/$tag
  local -r rule="================================================================"

  printf '%s\n' \
    "" \
    "$rule" \
    " $tag (mode=$mode, reqs=$reqs, sess=$sess)" \
    " $(date)" \
    "$rule"

  cleanup
  launch_7c_1ps "$outdir"

  # The instances on ports 8000-8006 are passed as --combined; the one on
  # 8007 is passed separately as the PS instance.
  local -a combined=()
  local port
  for ((port = 8000; port <= 8006; port++)); do
    combined+=("http://127.0.0.1:$port")
  done

  local -a proxy_cmd=(
    "$VENV/python" scripts/cache_aware_proxy.py
    --combined "${combined[@]}"
    --ps-instances http://127.0.0.1:8007 --ps-bootstrap-ports 9005
    --bootstrap-ports 8998,8999,9000,9001,9002,9003,9004
    --offload-mode "$mode" --port 9090
  )
  "${proxy_cmd[@]}" > "$outdir/proxy.log" 2>&1 &
  # Fixed pause before traffic is sent; the proxy is not polled.
  sleep 3

  local -a replay_cmd=(
    "$VENV/python" -m replayer --trace "$TRACE"
    --output "$outdir/metrics.jsonl" --endpoint http://localhost:9090 --model "$MODEL"
    --time-scale 20 --max-inflight-sessions "$sess" --request-limit "$reqs" -v
  )
  "${replay_cmd[@]}" 2>&1 | tail -5

  # Best-effort snapshots: a failed fetch must not abort the run.
  local endpoint
  for endpoint in breakdown stats; do
    curl -sf "http://localhost:9090/$endpoint" > "$outdir/$endpoint.json" 2>/dev/null || true
  done

  printf '%s\n' " Done: $tag"
}

# Experiments to run, in order. Each entry is
# "<tag> <offload-mode> <request-limit> <max-inflight-sessions>".
experiments=(
  "ps_cost cost 200 7"
  "ps_highload cost 1000 7"
)

for spec in "${experiments[@]}"; do
  read -r exp_tag exp_mode exp_reqs exp_sess <<< "$spec"
  run_exp "$exp_tag" "$exp_mode" "$exp_reqs" "$exp_sess"
done

# run_exp only cleans up before it starts, so stop what the last one left.
cleanup

printf '\n=== ALL REMAINING DONE %s ===\n' "$(date)"