Per-step timing from engine_step.jsonl definitively resolves H3:

  plain:             53 μs/step (p50)
  noop_connector:    69 μs/step (+16 μs = negligible framework cost)
  mooncake_producer: 1461 μs/step (build_connector_meta = 1386 μs)
  mooncake_both:     1452 μs/step (same as producer)

The substrate tax is NOT in the v1 framework — it is specifically in Mooncake's build_connector_meta(), which walks set(cache.keys()) on every scheduler step (O(|cache|) per step, E2 audit §6.5).

Accumulated per-request tax: 256 decode steps × 1.4 ms ≈ 358 ms. Observed TTFT tax at rate=1.0: plain 378 ms vs mooncake_both 422 ms (+12%). At rate=2.0 (near saturation): +29%, approaching trace-replay's +45%.

Also fixes kill_vllm() to properly kill EngineCore subprocesses.
240 lines
8.7 KiB
Bash
Executable File
240 lines
8.7 KiB
Bash
Executable File
#!/bin/bash
# 5-stage barrier orchestrator for connector_tax microbench.
#
#   Stage 0 — pre-flight + (optional) step-timing patch
#   Stage 1 — Phase A (rate sweep) for all configs
#   Stage 2 — pick reference rates from Phase A
#   Stage 3 — Phase B (shape sweep at ref_safe) for all configs
#   Stage 4 — analyze + plot + revert patch
#
# Configurable via env:
#   CT_DATE        : run-id directory tag (default $(date +%Y%m%d_%H%M))
#   PORT           : vLLM port (default 8000)
#   GPU_ID         : single GPU index (default 0)
#   MODEL_PATH     : path to model
#   PHASE_A_RATES  : default 0.5,1,2,4,8,16,32
#   PHASE_B_SHAPES : default 512x64,512x256,512x1024,4096x64,4096x256,4096x1024,32768x64,32768x256,32768x1024
#   MIN_COMPLETED  : default 200
#   DURATION       : default 60
#   SKIP_PATCH     : set to 1 to bypass scheduler patch
#   STAGES         : space-separated list of stages to run, e.g. "1 3 4"
#                    defaults to "0 1 2 3 4"

# -e is deliberately left off: the phase runners in this script inspect $?
# themselves and record failures in the manifest instead of aborting the sweep.
set -uo pipefail

# Directory containing this script; every helper path is resolved against it.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
RESULTS_ROOT="$HERE/results"
RUN_DATE="${CT_DATE:-$(date +%Y%m%d_%H%M)}"
RUN_ROOT="$RESULTS_ROOT/$RUN_DATE"

# Every later stage writes under RUN_ROOT, so continuing without it is pointless.
if ! mkdir -p "$RUN_ROOT"; then
  echo "ERROR: cannot create run directory $RUN_ROOT" >&2
  exit 1
fi

# Environment overrides with their defaults.
PORT="${PORT:-8000}"
GPU_ID="${GPU_ID:-0}"
MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
PHASE_A_RATES="${PHASE_A_RATES:-0.5,1,2,4,8,16,32}"
PHASE_B_SHAPES="${PHASE_B_SHAPES:-512x64,512x256,512x1024,4096x64,4096x256,4096x1024,32768x64,32768x256,32768x1024}"
MIN_COMPLETED="${MIN_COMPLETED:-200}"
DURATION="${DURATION:-60}"
STAGES="${STAGES:-0 1 2 3 4}"

# Each name NAME maps to a launcher script launch/launch_NAME.sh.
ALL_CONFIGS=(
  plain
  noop_connector
  mooncake_producer
  mooncake_consumer
  mooncake_both
  nixl_both
  lmcache_only
  multi_mooncake_lmcache
)

# Shuffle config order for thermal-drift robustness.
#
# Arguments: config names, one per argument.
# Outputs:   the same names on stdout, one per line, in random order;
#            nothing at all when called without arguments.
# Returns:   0 for an empty argument list, otherwise the pipeline's status.
shuffle_configs() {
  # printf '%s\n' with no operands would still print one empty line, which
  # shuf would pass through as a bogus (empty) config name.
  (( $# > 0 )) || return 0
  printf '%s\n' "$@" | shuf
}

# Project root is two levels above this script. It hosts the optional .venv
# and is prepended to PYTHONPATH for every Python helper launched later.
PROJ_DIR="$(cd "$HERE/../.." && pwd)"

# Prefer the project's venv interpreter; otherwise use python3 from PATH.
PYTHON="$PROJ_DIR/.venv/bin/python"
[[ -x "$PYTHON" ]] || PYTHON="$(command -v python3)"

# Fail here with a clear message rather than later with an empty command name.
if [[ -z "$PYTHON" ]]; then
  echo "ERROR: no Python interpreter found (no $PROJ_DIR/.venv/bin/python and no python3 on PATH)" >&2
  exit 1
fi

export PYTHONPATH="$PROJ_DIR:${PYTHONPATH:-}"

#######################################
# Append one status record to the run manifest.
# Globals:   RUN_ROOT (read) - the record goes to $RUN_ROOT/MANIFEST.tsv
# Arguments: $1 - config name
#            $2 - stage label (e.g. phase_a)
#            $3 - status (e.g. OK / FAIL / SKIP)
#            $4 - free-form note; optional, defaults to empty
# Outputs:   nothing on stdout; appends "<ISO timestamp> | cfg | stage | status | note"
#######################################
manifest() {
  # ${4:-} keeps a three-argument call from aborting the script under `set -u`.
  local config="$1" stage="$2" status="$3" note="${4:-}"
  printf '%s | %s | %s | %s | %s\n' \
    "$(date -Iseconds)" "$config" "$stage" "$status" "$note" \
    >> "$RUN_ROOT/MANIFEST.tsv"
}

#######################################
# Tear down the vLLM server and wait for the GPU to report its memory as freed.
# Teardown is best-effort: every kill is allowed to fail silently.
# Globals:   PORT, GPU_ID (read)
# Arguments: $1 - path to the pidfile with the API server PID (may not exist)
# Outputs:   a warning on stderr if the GPU never drops below the threshold
# Returns:   0
#######################################
kill_vllm() {
  local pidfile="$1"
  local pid="" used="" attempt

  # Kill the API server PID. Only a plain positive integer is accepted:
  # passing "0" or "-1" from a corrupt pidfile to kill would signal our own
  # process group or every process we own.
  if [[ -f "$pidfile" ]]; then
    read -r pid _ < "$pidfile" || true
    if [[ "$pid" =~ ^[1-9][0-9]*$ ]]; then
      kill -9 "$pid" 2>/dev/null || true
    fi
  fi

  # Also kill any vLLM/EngineCore processes on this port or GPU.
  # NOTE(review): these patterns match by command line across the whole host,
  # not only children of this run — confirm no other vLLM shares the machine.
  pkill -9 -f "port $PORT" 2>/dev/null || true
  pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
  pkill -9 -f "vllm.entrypoints" 2>/dev/null || true
  sleep 5

  # Wait for GPU memory release (30 polls, 2 s apart = up to 60s).
  for (( attempt = 0; attempt < 30; attempt++ )); do
    used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits \
      -i "$GPU_ID" 2>/dev/null | tr -d ' ')
    # Compare only when the reading is numeric; values such as "[N/A]" would
    # otherwise raise an arithmetic error. Below 1000 counts as released.
    if [[ "$used" =~ ^[0-9]+$ ]] && (( 10#$used < 1000 )); then
      return 0
    fi
    # Try killing any remaining GPU holders
    fuser -k "/dev/nvidia${GPU_ID}" 2>/dev/null || true
    sleep 2
  done

  echo "WARNING: GPU $GPU_ID still not free after 60s" >&2
}

#######################################
# Run the Phase A rate sweep for one connector config.
# Launches the config's server, samples /metrics in the background while
# bench_loop.py sweeps PHASE_A_RATES at the fixed "4096,256" shape, tears the
# server down and records the outcome via manifest.
# Globals:   RUN_ROOT, HERE, PYTHON, PORT, GPU_ID, MODEL_PATH, PHASE_A_RATES,
#            DURATION, MIN_COMPLETED (read)
# Arguments: $1 - config name; launch/launch_<name>.sh is executed
# Returns:   0 always; the result is reported as SKIP / FAIL / OK in the manifest
#######################################
run_phase_a() {
  local config="$1"
  local cfg_dir="$RUN_ROOT/$config"
  local rc=0 bench_rc=0 sampler_pid="" dummy_pid=""

  mkdir -p "$cfg_dir"
  echo "=== [$config] Phase A ==="

  # Launch
  RUN_DIR="$cfg_dir" PORT="$PORT" GPU_ID="$GPU_ID" MODEL_PATH="$MODEL_PATH" \
    bash "$HERE/launch/launch_${config}.sh"
  rc=$?

  # Exit code 42 from the launcher is recorded as a missing dependency.
  if (( rc == 42 )); then
    manifest "$config" "phase_a" "SKIP" "dependency missing"
    return 0
  fi

  if (( rc == 0 )); then
    # /metrics sampler in background
    "$PYTHON" "$HERE/metrics_sampler.py" \
      --url "http://127.0.0.1:$PORT/metrics" \
      --output "$cfg_dir/metrics_A.jsonl" \
      --interval 1.0 &
    sampler_pid=$!

    # bench loop
    local -a bench_args=(
      --url "http://127.0.0.1:$PORT/v1/chat/completions"
      --model "$MODEL_PATH"
      --phase A
      --rates "$PHASE_A_RATES"
      --shape "4096,256"
      --duration "$DURATION"
      --min-completed "$MIN_COMPLETED"
      --warmup 10
      --output-dir "$cfg_dir"
    )
    "$PYTHON" "$HERE/bench_loop.py" "${bench_args[@]}"
    bench_rc=$?

    kill -9 "$sampler_pid" 2>/dev/null || true
    # Reap the sampler so it does not linger as a zombie / job notice.
    wait "$sampler_pid" 2>/dev/null || true
  fi

  # Teardown also runs after a failed launch: the launcher may have started a
  # server before giving up, and it must not hold the GPU or port for the
  # next config.
  kill_vllm "$cfg_dir/.vllm.pid"
  if [[ -f "$cfg_dir/.dummy.pid" ]]; then
    read -r dummy_pid _ < "$cfg_dir/.dummy.pid" || true
    if [[ "$dummy_pid" =~ ^[1-9][0-9]*$ ]]; then
      kill -9 "$dummy_pid" 2>/dev/null || true
    fi
  fi

  if (( rc != 0 )); then
    manifest "$config" "phase_a" "FAIL" "launch rc=$rc"
    return 0
  fi
  if (( bench_rc != 0 )); then
    manifest "$config" "phase_a" "FAIL" "bench rc=$bench_rc"
    return 0
  fi
  manifest "$config" "phase_a" "OK" ""
}

#######################################
# Run the Phase B shape sweep for one connector config.
# Reads the reference rate (.ref_safe) from $RUN_ROOT/aggregate.json, launches
# the config's server, samples /metrics in the background while bench_loop.py
# sweeps PHASE_B_SHAPES at that rate, tears the server down and records the
# outcome via manifest.
# Globals:   RUN_ROOT, HERE, PYTHON, PORT, GPU_ID, MODEL_PATH, PHASE_B_SHAPES,
#            DURATION, MIN_COMPLETED (read)
# Arguments: $1 - config name; launch/launch_<name>.sh is executed
# Returns:   0 always; the result is reported as SKIP / FAIL / OK in the manifest
#######################################
run_phase_b() {
  local config="$1"
  local cfg_dir="$RUN_ROOT/$config"
  local ref_safe="" rc=0 bench_rc=0 sampler_pid="" dummy_pid=""

  mkdir -p "$cfg_dir"

  # jq errors (missing file, missing jq) are silenced and end up as "no rate".
  ref_safe=$(jq -r '.ref_safe // empty' "$RUN_ROOT/aggregate.json" 2>/dev/null)
  if [[ -z "$ref_safe" || "$ref_safe" == "null" ]]; then
    echo "ref_safe not available; skipping Phase B for $config"
    manifest "$config" "phase_b" "SKIP" "ref_safe undefined"
    return 0
  fi
  echo "=== [$config] Phase B (rate=$ref_safe) ==="

  RUN_DIR="$cfg_dir" PORT="$PORT" GPU_ID="$GPU_ID" MODEL_PATH="$MODEL_PATH" \
    bash "$HERE/launch/launch_${config}.sh"
  rc=$?

  # Exit code 42 from the launcher is recorded as a missing dependency.
  if (( rc == 42 )); then
    manifest "$config" "phase_b" "SKIP" "dependency missing"
    return 0
  fi

  if (( rc == 0 )); then
    # /metrics sampler in background
    "$PYTHON" "$HERE/metrics_sampler.py" \
      --url "http://127.0.0.1:$PORT/metrics" \
      --output "$cfg_dir/metrics_B.jsonl" \
      --interval 1.0 &
    sampler_pid=$!

    local -a bench_args=(
      --url "http://127.0.0.1:$PORT/v1/chat/completions"
      --model "$MODEL_PATH"
      --phase B
      --rate "$ref_safe"
      --shapes "$PHASE_B_SHAPES"
      --duration "$DURATION"
      --min-completed "$MIN_COMPLETED"
      --warmup 10
      --output-dir "$cfg_dir"
    )
    "$PYTHON" "$HERE/bench_loop.py" "${bench_args[@]}"
    bench_rc=$?

    kill -9 "$sampler_pid" 2>/dev/null || true
    # Reap the sampler so it does not linger as a zombie / job notice.
    wait "$sampler_pid" 2>/dev/null || true
  fi

  # Teardown also runs after a failed launch: the launcher may have started a
  # server before giving up, and it must not hold the GPU or port for the
  # next config.
  kill_vllm "$cfg_dir/.vllm.pid"
  if [[ -f "$cfg_dir/.dummy.pid" ]]; then
    read -r dummy_pid _ < "$cfg_dir/.dummy.pid" || true
    if [[ "$dummy_pid" =~ ^[1-9][0-9]*$ ]]; then
      kill -9 "$dummy_pid" 2>/dev/null || true
    fi
  fi

  if (( rc != 0 )); then
    manifest "$config" "phase_b" "FAIL" "launch rc=$rc"
    return 0
  fi
  if (( bench_rc != 0 )); then
    manifest "$config" "phase_b" "FAIL" "bench rc=$bench_rc"
    return 0
  fi
  manifest "$config" "phase_b" "OK" ""
}

# ── Stage 0 — pre-flight ────────────────────────────────────────────────
# Records the software/GPU environment under $RUN_ROOT/preflight and applies
# the step-timing patch unless SKIP_PATCH=1. Nothing here aborts the run; the
# patch outcome is written to preflight/patch_status.txt.
if [[ " $STAGES " == *" 0 "* ]]; then
  echo "=== Stage 0 — pre-flight ==="
  mkdir -p "$RUN_ROOT/preflight"

  # Snapshot the packages of the interpreter the helpers actually run with; a
  # bare `pip` on PATH may belong to a different environment. It is used only
  # as a fallback when $PYTHON cannot run pip as a module.
  "$PYTHON" -m pip freeze > "$RUN_ROOT/preflight/pip_freeze.txt" 2>&1 \
    || pip freeze > "$RUN_ROOT/preflight/pip_freeze.txt" 2>&1 \
    || true
  nvidia-smi -L > "$RUN_ROOT/preflight/nvidia.txt" 2>&1 || true

  # Apply step-timing patch unless SKIP_PATCH=1
  if [[ "${SKIP_PATCH:-0}" == "1" ]]; then
    echo "step_timing_available=false (SKIP_PATCH=1)" > "$RUN_ROOT/preflight/patch_status.txt"
  elif "$PYTHON" "$HERE/patches/apply_step_timing.py" --apply \
    > "$RUN_ROOT/preflight/patch.log" 2>&1; then
    echo "step_timing_available=true" > "$RUN_ROOT/preflight/patch_status.txt"
  else
    echo "step_timing_available=false (apply failed)" > "$RUN_ROOT/preflight/patch_status.txt"
    # Surface the patch tool's output so the failure is visible in the console.
    cat "$RUN_ROOT/preflight/patch.log"
  fi
fi

# ── Stage 1 — Phase A all configs ──────────────────────────────────────
if [[ " $STAGES " == *" 1 "* ]]; then
  echo "=== Stage 1 — Phase A (all configs, randomized) ==="
  # Read the shuffled order into an array up front: this avoids word-splitting
  # and globbing on the command output, and keeps the loop body's stdin
  # untouched for the servers and benchmarks it spawns.
  mapfile -t phase_a_order < <(shuffle_configs "${ALL_CONFIGS[@]}")
  if (( ${#phase_a_order[@]} == 0 )); then
    echo "WARNING: Stage 1 got an empty config order (is shuf installed?); nothing to run" >&2
  else
    for cfg in "${phase_a_order[@]}"; do
      [[ -n "$cfg" ]] || continue
      run_phase_a "$cfg"
    done
  fi
fi

# ── Stage 2 — pick reference rates ─────────────────────────────────────
# Runs analyze.py over the run directory. A failure is reported but does not
# stop the script: Phase B skips each config on its own when it cannot read
# ref_safe from $RUN_ROOT/aggregate.json.
if [[ " $STAGES " == *" 2 "* ]]; then
  echo "=== Stage 2 — pick reference rates ==="
  if ! "$PYTHON" "$HERE/analyze.py" --root "$RUN_ROOT"; then
    echo "WARNING: analyze.py failed; Phase B needs ref_safe in $RUN_ROOT/aggregate.json and will be skipped without it" >&2
  fi
fi

# ── Stage 3 — Phase B all configs ──────────────────────────────────────
if [[ " $STAGES " == *" 3 "* ]]; then
  echo "=== Stage 3 — Phase B (all configs, randomized) ==="
  # Read the shuffled order into an array up front: this avoids word-splitting
  # and globbing on the command output, and keeps the loop body's stdin
  # untouched for the servers and benchmarks it spawns.
  mapfile -t phase_b_order < <(shuffle_configs "${ALL_CONFIGS[@]}")
  if (( ${#phase_b_order[@]} == 0 )); then
    echo "WARNING: Stage 3 got an empty config order (is shuf installed?); nothing to run" >&2
  else
    for cfg in "${phase_b_order[@]}"; do
      [[ -n "$cfg" ]] || continue
      run_phase_b "$cfg"
    done
  fi
fi

# ── Stage 4 — analyze + plot + revert ──────────────────────────────────
# Analysis and plotting failures are reported but never prevent the patch
# revert that follows them.
if [[ " $STAGES " == *" 4 "* ]]; then
  echo "=== Stage 4 — re-analyze with Phase B + plot + revert patch ==="
  "$PYTHON" "$HERE/analyze.py" --root "$RUN_ROOT" \
    || echo "WARNING: analyze.py failed (rc=$?)" >&2
  "$PYTHON" "$HERE/plot_connector_tax.py" --root "$RUN_ROOT" \
    || echo "WARNING: plot_connector_tax.py failed (rc=$?)" >&2

  # The log directory must exist before the revert: if the >> redirection
  # fails, bash does not run the command at all, and the patch would silently
  # stay applied (e.g. STAGES="4" on a run id that never went through Stage 0).
  mkdir -p "$RUN_ROOT/preflight"
  if ! "$PYTHON" "$HERE/patches/apply_step_timing.py" --revert \
    >> "$RUN_ROOT/preflight/patch.log" 2>&1; then
    echo "WARNING: step-timing patch revert failed; see $RUN_ROOT/preflight/patch.log" >&2
  fi
  echo "Done. Results in $RUN_ROOT"
fi