Files
agentic-kvc/scripts/legacy/run_experiments.sh
Gahow Wang 547611e022 scripts: archive obsolete one-off shell/python scripts to legacy/ (D2, D3)
D2: run_benchmark.sh and run_experiments.sh still pass --time-scale and
--max-inflight-sessions to the replayer, but those flags were removed when
the project moved to trace-driven dispatch. The scripts cannot run as-is.

D3: ~25 ad-hoc analyze_* / compare_* / profile_* / final_* scripts and a
handful of single-experiment run_*.sh point at /home/admin/cpfs paths,
deleted output directories, or a sampled trace file that no longer exists.
Keep them in scripts/legacy/ for historical reference; the scripts that
remain in scripts/ (analyze_trace, analyze_breakdown, analyze_cache_hit,
analyze_eviction, compare_results, compute_roofline, sample_trace,
analyze_agentic_patterns, simulate_cache_policies, plus launch_*.sh,
gpu_monitor.sh, bench.sh) cover the current workflow.

Adds scripts/legacy/README.md to document the archival policy.
2026-05-23 20:57:32 +08:00

255 lines
9.0 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Run the complete experiment matrix:
# 1. Combined TP=2 DP=4 (4 instances, baseline)
# 2. Combined TP=1 DP=8 (8 instances, max throughput)
# 3. PD-Sep TP=1: P×4 + D×4 via Mooncake/RDMA
#
# All use the same trace, same concurrency, same timeout.
#
# Usage: run_experiments.sh {1|2|3|all|tp2dp4|tp1dp8|pdsep}   (default: all)
# Environment overrides: MODEL_PATH, MAX_SESSIONS, MAX_CONCURRENT,
#                        REQUEST_TIMEOUT, REQUEST_LIMIT
set -euo pipefail

# This script lives in <project>/scripts/legacy/, so the project root (which
# holds .venv/, traces/, outputs/ and scripts/) is two directories up.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
VENV="$PROJECT_DIR/.venv/bin"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl"

# Uniform benchmark params (shared by every experiment)
MAX_SESSIONS=${MAX_SESSIONS:-8}
MAX_CONCURRENT=${MAX_CONCURRENT:-16}
TIME_SCALE=10
REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-300}
REQUEST_LIMIT="${REQUEST_LIMIT:-}" # empty = all 1000
# Forcefully stop anything left over from a previous experiment: vLLM servers,
# the proxy processes, whatever listens on ports 9090/8000, and finally any
# process that still has an NVIDIA device node open.
# Every step is best-effort ("|| true") so a clean machine is not an error.
cleanup_gpu() {
  # -f matches against the full command line, not just the process name.
  pkill -9 -f "vllm" 2>/dev/null || true
  # pkill patterns are extended regular expressions: alternation is a bare
  # "|". The escaped form "\|" would only match a literal pipe character, so
  # none of these processes would ever be selected.
  pkill -9 -f "cache_aware_proxy|mooncake_connector_proxy|uvicorn" 2>/dev/null || true
  fuser 9090/tcp 8000/tcp 2>/dev/null | xargs -r kill -9 2>/dev/null || true
  sleep 5
  # fuser prints PIDs space-separated; split to one per line and de-duplicate
  # before killing.
  fuser /dev/nvidia* 2>/dev/null | tr " " "\n" | sort -u | xargs -r kill -9 2>/dev/null || true
  sleep 10
}
# Poll http://localhost:<port>/v1/models every 5 seconds until curl succeeds.
# Arguments:
#   $1 - port to probe
#   $2 - maximum number of seconds to keep polling (default: 600)
# The polling loop runs in a child bash under `timeout`, so the function's
# exit status is whatever `timeout` reports.
wait_for_server() {
  local srv_port="$1"
  local max_wait="${2:-600}"
  local probe_loop="until curl -s localhost:${srv_port}/v1/models >/dev/null 2>&1; do sleep 5; done"

  timeout "$max_wait" bash -c "$probe_loop"
}
#######################################
# Replay the trace against an endpoint and report how many metric lines
# were written.
# Globals (read): PROJECT_DIR, PYTHON, TRACE, MODEL, TIME_SCALE, MAX_SESSIONS,
#                 MAX_CONCURRENT, REQUEST_TIMEOUT, REQUEST_LIMIT
# Arguments:
#   $1 - tag; metrics go to $PROJECT_DIR/outputs/<tag>/metrics.jsonl
#   $2 - endpoint URL handed to the replayer
#   $3 - optional extra replayer arguments, whitespace-separated
# Outputs: progress lines on stdout
#######################################
run_benchmark() {
  local tag=$1
  local endpoint=$2
  local extra_args="${3:-}"
  local outdir="$PROJECT_DIR/outputs/$tag"

  echo " Running benchmark -> $outdir"

  # Make sure the destination exists so neither the replayer nor the line
  # count below fails on a missing directory.
  mkdir -p -- "$outdir"

  # Build the command as an array so paths containing spaces survive intact.
  # NOTE(review): this script was archived because the replayer reportedly no
  # longer accepts --time-scale / --max-inflight-sessions; confirm against the
  # current replayer CLI before reusing this function.
  local -a cmd=(
    "$PYTHON" -m replayer
    --trace "$TRACE"
    --output "$outdir/metrics.jsonl"
    --endpoint "$endpoint"
    --model "$MODEL"
    --time-scale "$TIME_SCALE"
    --max-inflight-sessions "$MAX_SESSIONS"
    --concurrency-limit "$MAX_CONCURRENT"
    --request-timeout "$REQUEST_TIMEOUT"
  )

  # An empty REQUEST_LIMIT means "replay the whole trace".
  if [[ -n "$REQUEST_LIMIT" ]]; then
    cmd+=(--request-limit "$REQUEST_LIMIT")
  fi

  # Forward caller-supplied extras (previously accepted but silently dropped).
  if [[ -n "$extra_args" ]]; then
    local -a extra_words
    read -r -a extra_words <<< "$extra_args"
    cmd+=("${extra_words[@]}")
  fi

  cmd+=(-v)
  "${cmd[@]}"

  echo " Done: $(wc -l < "$outdir/metrics.jsonl") requests"
}
#######################################################################
# Experiment 1: Combined TP=2 DP=4
#######################################################################
# Experiment 1: four combined vLLM servers, each with tensor parallelism 2 on
# a pair of adjacent GPUs (ports 8000-8003), fronted by the cache-aware proxy
# in --combined mode on port 9090. Results are written under the tag
# exp1_combined_tp2_dp4.
run_combined_tp2_dp4() {
  local rule="================================================================"
  local -a serve_flags=(
    --tensor-parallel-size 2
    --trust-remote-code --enable-prefix-caching --enforce-eager
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
  )
  local -a backends=()
  local idx first_gpu second_gpu listen_port

  printf '%s\n' "" "$rule" " Experiment 1: Combined TP=2 DP=4 (4 instances on 8 GPUs)" "$rule"
  cleanup_gpu

  # Launch every server in the background; readiness is checked afterwards.
  for ((idx = 0; idx < 4; idx++)); do
    first_gpu=$((idx * 2))
    second_gpu=$((first_gpu + 1))
    listen_port=$((8000 + idx))
    echo " Starting instance $idx: GPUs $first_gpu,$second_gpu, port $listen_port"
    CUDA_VISIBLE_DEVICES="${first_gpu},${second_gpu}" $VLLM serve "$MODEL" \
      --host 0.0.0.0 --port "$listen_port" \
      "${serve_flags[@]}" &
    backends+=("http://127.0.0.1:${listen_port}")
  done

  for ((idx = 0; idx < 4; idx++)); do
    wait_for_server "$((8000 + idx))"
    echo " Instance $idx ready"
  done
  echo " All 4 instances ready"

  # Start global scheduler (cache-aware proxy in combined mode)
  echo " Starting global scheduler..."
  $PYTHON "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${backends[@]}" \
    --port 9090 &
  # Fixed pause to let the proxy come up before traffic is sent.
  sleep 5

  run_benchmark "exp1_combined_tp2_dp4" "http://localhost:9090"
}
#######################################################################
# Experiment 2: Combined TP=1 DP=8
#######################################################################
# Experiment 2: eight combined vLLM servers, one GPU each with tensor
# parallelism 1 (ports 8000-8007), fronted by the cache-aware proxy in
# --combined mode on port 9090. Results are written under the tag
# exp2_combined_tp1_dp8.
run_combined_tp1_dp8() {
  local rule="================================================================"
  local -a serve_flags=(
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching --enforce-eager
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
  )
  local -a backends=()
  local idx listen_port

  printf '%s\n' "" "$rule" " Experiment 2: Combined TP=1 DP=8 (8 instances on 8 GPUs)" "$rule"
  cleanup_gpu

  # Launch every server in the background; readiness is checked afterwards.
  for ((idx = 0; idx <= 7; idx++)); do
    listen_port=$((8000 + idx))
    echo " Starting instance $idx: GPU $idx, port $listen_port"
    CUDA_VISIBLE_DEVICES="$idx" $VLLM serve "$MODEL" \
      --host 0.0.0.0 --port "$listen_port" \
      "${serve_flags[@]}" &
    backends+=("http://127.0.0.1:${listen_port}")
  done

  for ((idx = 0; idx <= 7; idx++)); do
    wait_for_server "$((8000 + idx))"
    echo " Instance $idx ready"
  done
  echo " All 8 instances ready"

  # Start global scheduler (cache-aware proxy in combined mode)
  echo " Starting global scheduler..."
  $PYTHON "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${backends[@]}" \
    --port 9090 &
  # Fixed pause to let the proxy come up before traffic is sent.
  sleep 5

  run_benchmark "exp2_combined_tp1_dp8" "http://localhost:9090"
}
#######################################################################
# Experiment 3: PD-Sep TP=1 P×4 D×4 (Mooncake/RDMA)
#######################################################################
# Experiment 3: prefill/decode separation with tensor parallelism 1.
# Starts 4 prefill servers (GPUs 0-3, ports 8010-8013, Mooncake bootstrap
# ports 8998-9001, kv_role=kv_producer) and 4 decode servers (GPUs 4-7, ports
# 8020-8023, kv_role=kv_consumer), waits for all of them, starts the proxy on
# port 9090, runs a small smoke test and then the benchmark under the tag
# exp3_pd_sep_tp1_mooncake.
# Globals (read): PROJECT_DIR, VLLM, PYTHON, MODEL
run_pd_sep_tp1() {
  echo ""
  echo "================================================================"
  echo " Experiment 3: PD-Sep TP=1 P×4 + D×4 (Mooncake/RDMA)"
  echo "================================================================"
  cleanup_gpu

  local proxy_script="$PROJECT_DIR/scripts/cache_aware_proxy.py"
  # Flags shared by every prefill and decode server.
  local -a serve_flags=(
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching --enforce-eager
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
  )
  local -a prefill_args=() decode_args=() bootstrap_ports=()
  local i port bootstrap gpu bp

  # Start 4 prefill instances (GPUs 0-3)
  for i in 0 1 2 3; do
    port=$((8010 + i))
    bootstrap=$((8998 + i))
    echo " Prefill $i: GPU $i, port $port, bootstrap $bootstrap"
    VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap CUDA_VISIBLE_DEVICES=$i \
      "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      "${serve_flags[@]}" \
      --kv-transfer-config \
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}' &
    # The proxy takes each prefill backend as "<url> <bootstrap port>".
    prefill_args+=(--prefill "http://127.0.0.1:$port" "$bootstrap")
    bootstrap_ports+=("$bootstrap")
  done

  # Start 4 decode instances (GPUs 4-7)
  for i in 0 1 2 3; do
    gpu=$((4 + i))
    port=$((8020 + i))
    echo " Decode $i: GPU $gpu, port $port"
    CUDA_VISIBLE_DEVICES=$gpu "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      "${serve_flags[@]}" \
      --kv-transfer-config \
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_consumer","kv_load_failure_policy":"recompute"}' &
    decode_args+=(--decode "http://127.0.0.1:$port")
  done

  # Wait for all instances
  for i in 0 1 2 3; do
    wait_for_server "$((8010 + i))"
    echo " Prefill $i ready"
  done
  for i in 0 1 2 3; do
    wait_for_server "$((8020 + i))"
    echo " Decode $i ready"
  done

  # Start proxy (wait for bootstrap to be queryable first)
  echo " Waiting for bootstrap servers..."
  for bp in "${bootstrap_ports[@]}"; do
    timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done"
    echo " Bootstrap $bp ready"
  done

  # The proxy listens on 9090 (9000 is one of the bootstrap ports).
  echo " Starting proxy on port 9090..."
  "$PYTHON" "$proxy_script" "${prefill_args[@]}" "${decode_args[@]}" --host 0.0.0.0 --port 9090 &
  sleep 15

  # Smoke test with retry
  echo " Smoke test..."
  local attempt result smoke_ok=0
  for attempt in 1 2 3; do
    # "|| true": under `set -e` a failing curl in a plain assignment would
    # abort the whole script on the first attempt, defeating the retry loop.
    result=$(curl -s -m 120 http://localhost:9090/v1/completions \
      -X POST -H "Content-Type: application/json" \
      -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
    # Here-string instead of `echo | grep -q`: with pipefail, grep -q exiting
    # early can make the pipeline report failure even though it matched.
    if grep -q "choices" <<< "$result"; then
      echo " Smoke test passed!"
      smoke_ok=1
      break
    fi
    echo " Attempt $attempt failed, retrying..."
    sleep 10
  done
  # Still best-effort: the benchmark runs either way, but say so explicitly.
  if (( ! smoke_ok )); then
    echo " WARNING: smoke test did not pass after 3 attempts; running benchmark anyway" >&2
  fi

  run_benchmark "exp3_pd_sep_tp1_mooncake" "http://localhost:9090"
}
#######################################################################
# Main
#######################################################################
# Entry point: print the run configuration, run the experiment(s) selected by
# the first argument (default: all three, in order), then print a completion
# banner and clean up. An unrecognised selector prints usage and exits 1.
main() {
  local selection="${1:-all}"
  local rule="================================================================"

  printf '%s\n' \
    "Starting experiment matrix on $(hostname)" \
    "Model: $MODEL" \
    "Trace: $TRACE" \
    "Params: sessions=$MAX_SESSIONS, concurrent=$MAX_CONCURRENT, time_scale=$TIME_SCALE" \
    ""

  case "$selection" in
    1 | tp2dp4)
      run_combined_tp2_dp4
      ;;
    2 | tp1dp8)
      run_combined_tp1_dp8
      ;;
    3 | pdsep)
      run_pd_sep_tp1
      ;;
    all)
      run_combined_tp2_dp4
      run_combined_tp1_dp8
      run_pd_sep_tp1
      ;;
    *)
      echo "Usage: $0 {1|2|3|all|tp2dp4|tp1dp8|pdsep}"
      exit 1
      ;;
  esac

  printf '%s\n' "" "$rule" " All experiments complete!" "$rule"
  cleanup_gpu
}

main "$@"