High-concurrency test (512 input, 64 output, rates 4-32 req/s):
  Rate=8: plain TTFT p90=94ms, mooncake_both=102ms → +9% tax
  Rate=16: plain TTFT p90=144ms, mooncake_both=156ms → +8% tax
  Rate=32: both saturated at ~6.1s → no distinguishable difference

Low-concurrency back-to-back retest (4096 input, 256 output):
  mooncake_both_v2 vs plain_v2: tax is ≈0% (within noise) because scheduler's 1.4ms/step is hidden behind model forward.

Decomposition of trace-replay's +45%:
  +7-9% from build_connector_meta per-step cost (this microbench)
  +20-30% from multi-instance coupling amplification (not measurable here)
  remainder from large-cache O(|cache|) scaling (Phase B follow-up)

Also: bench_loop.py now emits mean/p50/p90/p99 for all three metrics.
103 lines
3.1 KiB
Bash
Executable File
103 lines
3.1 KiB
Bash
Executable File
#!/bin/bash
# Run remaining configs for Phase A, skipping those with summary_A.json.
# Adds a hard timeout of 120s per cell (rate) and limits to rates 0.5,1,2
# (the non-saturated regime) to avoid wasting GPU-hours on queue buildup.
# NOTE(review): no 120s timeout is visible anywhere in this file; presumably
# bench_loop.py enforces the per-cell limit -- confirm.
#
# Usage: bash run_remaining.sh

# -u: referencing an unset variable is an error.
# pipefail: a pipeline's status is that of the last stage that failed.
# -e is not enabled, so a failing command does not abort the script.
set -uo pipefail

# Directory containing this script; launch scripts and bench_loop.py are
# resolved relative to it.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Results directory of the run being completed (one sub-directory per config).
RUN_ROOT="$HERE/results/20260526_1728"

# Server port, GPU index and model path; exported to the launch scripts later.
PORT=8000
GPU_ID=0
MODEL_PATH="$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct"

# Project root is two levels above this script; its virtualenv interpreter
# runs the benchmark, and the root is prepended to PYTHONPATH.
PROJ_DIR="$(cd "$HERE/../.." && pwd)"
PYTHON="$PROJ_DIR/.venv/bin/python"
export PYTHONPATH="$PROJ_DIR:${PYTHONPATH:-}"

# Configs to run (in priority order)
ALL_CONFIGS=(plain noop_connector mooncake_producer mooncake_both nixl_both)
# Skip: mooncake_consumer (needs dummy bootstrap, pre-flight likely fails)
#       lmcache_only (not installed)
#       multi_mooncake_lmcache (not installed)

# Rates: only non-saturated to avoid runaway drain
RATES="0.5,1,2"

#######################################
# Force-kill any running vLLM server processes and wait for the GPU to drain.
# Globals:   PORT (read), GPU_ID (read)
# Arguments: none
# Outputs:   "WARNING: GPU not free" on stderr if the GPU is still busy after
#            all polls
# Returns:   0 once the GPU's used memory drops below the threshold,
#            1 if it never does
#######################################
kill_all_vllm() {
  # Threshold on nvidia-smi's memory.used value (reported in MiB with
  # --format=nounits) below which the GPU is considered released.
  local -r free_below=1000
  local -r max_polls=20
  local used attempt

  # SIGKILL by command-line match. "No process matched" is an expected
  # outcome, so pkill's status and stderr are deliberately ignored.
  pkill -9 -f "port $PORT" 2>/dev/null || true
  pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
  pkill -9 -f "vllm.entrypoints" 2>/dev/null || true
  sleep 5

  for ((attempt = 0; attempt < max_polls; attempt++)); do
    used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits \
      -i "$GPU_ID" 2>/dev/null | tr -d ' ')
    # Only compare when the reading is a plain integer; nvidia-smi can emit
    # nothing or a non-numeric value, which would otherwise raise an
    # arithmetic error. 10# forces base 10 for readings with leading zeros.
    if [[ "$used" =~ ^[0-9]+$ ]] && (( 10#$used < free_below )); then
      return 0
    fi
    # Still busy: kill whatever holds the GPU device node open, then re-poll.
    fuser -k "/dev/nvidia${GPU_ID}" 2>/dev/null || true
    sleep 3
  done

  echo "WARNING: GPU not free" >&2
  return 1
}

#######################################
# Launch, benchmark and tear down every config in ALL_CONFIGS, one at a time.
# A config is skipped when its summary_A.json already holds at least one cell
# per requested rate, when it has no launch script, or when its launch script
# exits with 42 (dependency missing).
# Globals:   ALL_CONFIGS, RUN_ROOT, HERE, PYTHON, PORT, GPU_ID, MODEL_PATH,
#            RATES (read); RUN_DIR, AGENTIC_STEP_LOG_PATH (exported per
#            config); PORT, GPU_ID, MODEL_PATH (exported)
# Arguments: none
# Outputs:   progress, SKIP and FAIL lines on stdout
# Returns:   0
#######################################
main() {
  local config cfg_dir summary ncells launch_script rc
  local -a rate_list=()

  # One summary cell is expected per comma-separated rate in RATES.
  IFS=',' read -r -a rate_list <<< "${RATES:-}"
  local -r expected_cells=${#rate_list[@]}

  for config in "${ALL_CONFIGS[@]}"; do
    cfg_dir="$RUN_ROOT/$config"
    summary="$cfg_dir/summary_A.json"

    # Skip if already has valid summary
    if [[ -f "$summary" ]]; then
      # The path is passed as an argument rather than spliced into the Python
      # source, so quotes in the path cannot break the snippet.
      ncells=$(python3 -c \
        'import json, sys; print(len(json.load(open(sys.argv[1]))))' \
        "$summary" 2>/dev/null) || ncells=0
      [[ "$ncells" =~ ^[0-9]+$ ]] || ncells=0
      if (( expected_cells > 0 && ncells >= expected_cells )); then
        echo "SKIP $config (already has $ncells cells in summary_A.json)"
        continue
      fi
    fi

    echo ""
    echo "====== Running: $config ======"
    mkdir -p "$cfg_dir"

    # Launch: the launch script and the engine read these from the environment.
    export RUN_DIR="$cfg_dir"
    export PORT GPU_ID MODEL_PATH
    export AGENTIC_STEP_LOG_PATH="$cfg_dir/engine_step.jsonl"

    launch_script="$HERE/launch/launch_${config}.sh"
    if [[ ! -f "$launch_script" ]]; then
      echo "SKIP $config (no launch script)"
      continue
    fi

    bash "$launch_script" 2>&1 | tail -5
    # Take the launch script's own status, not tail's, so the result does not
    # depend on pipefail being enabled.
    rc=${PIPESTATUS[0]}
    if (( rc == 42 )); then
      echo "SKIP $config (dependency missing, rc=42)"
      kill_all_vllm
      continue
    fi
    if (( rc != 0 )); then
      echo "FAIL $config (launch rc=$rc)"
      kill_all_vllm
      continue
    fi

    # Run bench_loop over the configured rates
    echo " Benchmarking $config rates=$RATES ..."
    "$PYTHON" "$HERE/bench_loop.py" \
      --url "http://127.0.0.1:$PORT/v1/chat/completions" \
      --model "$MODEL_PATH" \
      --phase A \
      --rates "$RATES" \
      --shape "4096,256" \
      --duration 60 \
      --min-completed 200 \
      --warmup 10 \
      --output-dir "$cfg_dir" 2>&1 | tail -10
    rc=${PIPESTATUS[0]}

    # Report a benchmark failure instead of claiming success; the server is
    # torn down either way so the next config starts from a clean GPU.
    if (( rc == 0 )); then
      echo " Done: $config"
    else
      echo "FAIL $config (bench_loop rc=$rc)"
    fi

    if kill_all_vllm; then
      echo " GPU released."
    fi
  done

  echo ""
  echo "All configs processed. Check $RUN_ROOT/*/summary_A.json"
}

main "$@"