Files
Gahow Wang a473c71cac Connector tax Phase A: build_connector_meta is 1.4ms/step (the tax source)
Per-step timing from engine_step.jsonl definitively resolves H3:
  plain:            53 μs/step (p50)
  noop_connector:   69 μs/step (+16 μs = negligible framework cost)
  mooncake_producer: 1461 μs/step (build_connector_meta = 1386 μs)
  mooncake_both:    1452 μs/step (same as producer)

The substrate tax is NOT in the v1 framework — it's specifically in
Mooncake's build_connector_meta() which walks set(cache.keys()) every
scheduler step (O(|cache|) per step, E2 audit §6.5).

Accumulated per-request tax: 256 decode steps × 1.4ms = 358ms.
Observed TTFT tax at rate=1.0: plain 378ms vs mooncake_both 422ms (+12%).
At rate=2.0 (near saturation): +29%, approaching trace-replay's +45%.

Also fixes kill_vllm() to properly kill EngineCore subprocesses.
2026-05-26 19:33:15 +08:00

240 lines
8.7 KiB
Bash
Executable File

#!/bin/bash
# 5-stage barrier orchestrator for connector_tax microbench.
#
# Stage 0 — pre-flight + (optional) step-timing patch
# Stage 1 — Phase A (rate sweep) for all configs
# Stage 2 — pick reference rates from Phase A
# Stage 3 — Phase B (shape sweep at ref_safe) for all configs
# Stage 4 — analyze + plot + revert patch
#
# Configurable via env:
# CT_DATE : run-id directory tag (default $(date +%Y%m%d_%H%M))
# PORT : vLLM port (default 8000)
# GPU_ID : single GPU index (default 0)
# MODEL_PATH : path to model (default $HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
# PHASE_A_RATES : default 0.5,1,2,4,8,16,32
# PHASE_B_SHAPES : default 512x64,512x256,512x1024,4096x64,4096x256,4096x1024,32768x64,32768x256,32768x1024
# MIN_COMPLETED : default 200
# DURATION : default 60
# SKIP_PATCH : set to 1 to bypass scheduler patch
# STAGES : space-separated list of stages to run, e.g. "1 3 4"
# defaults to "0 1 2 3 4"
# Treat unset variables as errors and let a pipeline fail when any stage fails.
set -u
set -o pipefail

# Layout: every artifact of this run lives under results/<run tag>/ next to
# this script. The run tag is CT_DATE when given, else a minute-resolution
# timestamp.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
RESULTS_ROOT="$HERE/results"
RUN_DATE="${CT_DATE:-$(date +%Y%m%d_%H%M)}"
RUN_ROOT="$RESULTS_ROOT/$RUN_DATE"
mkdir -p "$RUN_ROOT"

# Tunables: a non-empty value from the environment wins, otherwise the default
# on the right-hand side is assigned.
: "${PORT:=8000}"
: "${GPU_ID:=0}"
: "${MODEL_PATH:=$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
: "${PHASE_A_RATES:=0.5,1,2,4,8,16,32}"
: "${PHASE_B_SHAPES:=512x64,512x256,512x1024,4096x64,4096x256,4096x1024,32768x64,32768x256,32768x1024}"
: "${MIN_COMPLETED:=200}"
: "${DURATION:=60}"
: "${STAGES:=0 1 2 3 4}"

# Every config name maps to launch/launch_<name>.sh.
ALL_CONFIGS=(
  plain
  noop_connector
  mooncake_producer
  mooncake_consumer
  mooncake_both
  nixl_both
  lmcache_only
  multi_mooncake_lmcache
)
# Print each argument on its own line in random order (via shuf). Randomizing
# the config order keeps thermal drift from always landing on the same config.
shuffle_configs() {
  shuf < <(printf '%s\n' "$@")
}
# Project root: two directory levels above this script. It holds the optional
# virtualenv and is put first on PYTHONPATH.
PROJ_DIR="$(cd "$HERE/../.." && pwd)"

# Prefer the project's virtualenv interpreter; otherwise use the first python3
# on PATH.
PYTHON="$PROJ_DIR/.venv/bin/python"
if [[ ! -x "$PYTHON" ]]; then
  PYTHON="$(command -v python3 || true)"
  if [[ -z "$PYTHON" ]]; then
    echo "WARNING: neither $PROJ_DIR/.venv/bin/python nor python3 on PATH was found" >&2
  fi
fi

# Append the inherited PYTHONPATH only when it is non-empty. Joining with a
# bare ":" would leave a trailing empty entry, which Python treats as the
# current working directory.
export PYTHONPATH="$PROJ_DIR${PYTHONPATH:+:$PYTHONPATH}"
# Append one record to $RUN_ROOT/MANIFEST.tsv.
# Arguments: $1 config, $2 stage, $3 status, $4 note
# Record:    "<date -Iseconds> | config | stage | status | note"
manifest() {
  local cfg="$1" phase="$2" outcome="$3" remark="$4"
  local stamp
  stamp="$(date -Iseconds)"
  printf '%s | %s | %s | %s | %s\n' \
    "$stamp" "$cfg" "$phase" "$outcome" "$remark" \
    >> "$RUN_ROOT/MANIFEST.tsv"
}
# Tear down the vLLM server of the current config and wait until the GPU's
# memory has been released.
#
# Globals:   PORT, GPU_ID (read)
# Arguments: $1 - pidfile holding the API-server PID (may be missing or empty)
# Outputs:   a warning on stderr if the GPU is still occupied at the end
# Returns:   0 (best effort)
kill_vllm() {
  local pidfile="$1"
  local pid="" used="" attempt

  # Kill the API server PID. Only a plain positive integer is accepted: "0" or
  # a negative number would make kill signal a whole process group (or every
  # process this user may signal) instead of a single server.
  if [[ -f "$pidfile" ]]; then
    read -r pid _ < "$pidfile" || true
    if [[ "$pid" =~ ^[1-9][0-9]*$ ]]; then
      kill -9 "$pid" 2>/dev/null || true
    fi
  fi

  # Also kill any vLLM/EngineCore processes on this port or GPU
  pkill -9 -f "port $PORT" 2>/dev/null || true
  pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
  pkill -9 -f "vllm.entrypoints" 2>/dev/null || true
  sleep 5

  # Wait for GPU memory release (up to 60s: 30 polls, 2s apart)
  for ((attempt = 0; attempt < 30; attempt++)); do
    used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits \
      -i "$GPU_ID" 2>/dev/null | tr -d ' ')
    # Compare only when the reading is a plain integer (MiB). Anything else
    # (empty output, an error text, a placeholder such as "[N/A]") counts as
    # "not free yet" instead of being fed to arithmetic evaluation. 10# keeps
    # a reading with leading zeros from being parsed as octal.
    if [[ "$used" =~ ^[0-9]+$ ]] && ((10#$used < 1000)); then
      return 0
    fi
    # Try killing any remaining GPU holders
    fuser -k "/dev/nvidia${GPU_ID}" 2>/dev/null || true
    sleep 2
  done
  echo "WARNING: GPU $GPU_ID still not free after 60s" >&2
}
# Run Phase A (request-rate sweep at shape 4096,256) for one config and record
# the outcome in the manifest.
#
# Globals:   RUN_ROOT, HERE, PORT, GPU_ID, MODEL_PATH, PYTHON, PHASE_A_RATES,
#            DURATION, MIN_COMPLETED (read)
# Arguments: $1 - config name; selects launch/launch_<config>.sh
# Outputs:   progress banner on stdout; exactly one manifest row
#            (SKIP, FAIL or OK) per call
# Returns:   0 always; the outcome is reported through the manifest
run_phase_a() {
  local config="$1"
  local cfg_dir="$RUN_ROOT/$config"
  local status="OK" note=""
  local rc bench_rc sampler_pid dummy_pid

  mkdir -p "$cfg_dir"
  echo "=== [$config] Phase A ==="

  # Launch
  RUN_DIR="$cfg_dir" PORT="$PORT" GPU_ID="$GPU_ID" MODEL_PATH="$MODEL_PATH" \
    bash "$HERE/launch/launch_${config}.sh"
  rc=$?
  # rc 42 from the launcher is recorded as a missing dependency, not a failure.
  if ((rc == 42)); then
    manifest "$config" "phase_a" "SKIP" "dependency missing"
    return 0
  fi

  if ((rc != 0)); then
    status="FAIL"
    note="launch rc=$rc"
  else
    # /metrics sampler in background
    "$PYTHON" "$HERE/metrics_sampler.py" \
      --url "http://127.0.0.1:$PORT/metrics" \
      --output "$cfg_dir/metrics_A.jsonl" \
      --interval 1.0 &
    sampler_pid=$!

    # bench loop
    "$PYTHON" "$HERE/bench_loop.py" \
      --url "http://127.0.0.1:$PORT/v1/chat/completions" \
      --model "$MODEL_PATH" \
      --phase A \
      --rates "$PHASE_A_RATES" \
      --shape "4096,256" \
      --duration "$DURATION" \
      --min-completed "$MIN_COMPLETED" \
      --warmup 10 \
      --output-dir "$cfg_dir"
    bench_rc=$?

    kill -9 "$sampler_pid" 2>/dev/null || true
    # Reap the sampler so no job is left behind for the shell to report later.
    wait "$sampler_pid" 2>/dev/null || true

    if ((bench_rc != 0)); then
      status="FAIL"
      note="bench rc=$bench_rc"
    fi
  fi

  # Teardown runs after a failed launch as well: a half-started server would
  # otherwise keep the port and the GPU busy for the next config.
  kill_vllm "$cfg_dir/.vllm.pid"
  if [[ -f "$cfg_dir/.dummy.pid" ]]; then
    dummy_pid=""
    read -r dummy_pid _ < "$cfg_dir/.dummy.pid" || true
    # Only a plain positive integer is a valid single-process target.
    if [[ "$dummy_pid" =~ ^[1-9][0-9]*$ ]]; then
      kill -9 "$dummy_pid" 2>/dev/null || true
    fi
  fi

  manifest "$config" "phase_a" "$status" "$note"
  return 0
}
# Run Phase B (shape sweep at the reference rate) for one config and record the
# outcome in the manifest.
#
# The rate is the `ref_safe` field of $RUN_ROOT/aggregate.json; when it is
# missing the config is skipped without launching anything.
#
# Globals:   RUN_ROOT, HERE, PORT, GPU_ID, MODEL_PATH, PYTHON, PHASE_B_SHAPES,
#            DURATION, MIN_COMPLETED (read)
# Arguments: $1 - config name; selects launch/launch_<config>.sh
# Outputs:   progress banner on stdout; exactly one manifest row
#            (SKIP, FAIL or OK) per call
# Returns:   0 always; the outcome is reported through the manifest
run_phase_b() {
  local config="$1"
  local cfg_dir="$RUN_ROOT/$config"
  local status="OK" note=""
  local ref_safe rc bench_rc sampler_pid dummy_pid

  mkdir -p "$cfg_dir"

  ref_safe=$(jq -r '.ref_safe // empty' "$RUN_ROOT/aggregate.json" 2>/dev/null)
  if [[ -z "$ref_safe" || "$ref_safe" == "null" ]]; then
    echo "ref_safe not available; skipping Phase B for $config"
    manifest "$config" "phase_b" "SKIP" "ref_safe undefined"
    return 0
  fi

  echo "=== [$config] Phase B (rate=$ref_safe) ==="
  RUN_DIR="$cfg_dir" PORT="$PORT" GPU_ID="$GPU_ID" MODEL_PATH="$MODEL_PATH" \
    bash "$HERE/launch/launch_${config}.sh"
  rc=$?
  # rc 42 from the launcher is recorded as a missing dependency, not a failure.
  if ((rc == 42)); then
    manifest "$config" "phase_b" "SKIP" "dependency missing"
    return 0
  fi

  if ((rc != 0)); then
    status="FAIL"
    note="launch rc=$rc"
  else
    # /metrics sampler in background
    "$PYTHON" "$HERE/metrics_sampler.py" \
      --url "http://127.0.0.1:$PORT/metrics" \
      --output "$cfg_dir/metrics_B.jsonl" \
      --interval 1.0 &
    sampler_pid=$!

    # bench loop
    "$PYTHON" "$HERE/bench_loop.py" \
      --url "http://127.0.0.1:$PORT/v1/chat/completions" \
      --model "$MODEL_PATH" \
      --phase B \
      --rate "$ref_safe" \
      --shapes "$PHASE_B_SHAPES" \
      --duration "$DURATION" \
      --min-completed "$MIN_COMPLETED" \
      --warmup 10 \
      --output-dir "$cfg_dir"
    bench_rc=$?

    kill -9 "$sampler_pid" 2>/dev/null || true
    # Reap the sampler so no job is left behind for the shell to report later.
    wait "$sampler_pid" 2>/dev/null || true

    if ((bench_rc != 0)); then
      status="FAIL"
      note="bench rc=$bench_rc"
    fi
  fi

  # Teardown runs after a failed launch as well: a half-started server would
  # otherwise keep the port and the GPU busy for the next config.
  kill_vllm "$cfg_dir/.vllm.pid"
  if [[ -f "$cfg_dir/.dummy.pid" ]]; then
    dummy_pid=""
    read -r dummy_pid _ < "$cfg_dir/.dummy.pid" || true
    # Only a plain positive integer is a valid single-process target.
    if [[ "$dummy_pid" =~ ^[1-9][0-9]*$ ]]; then
      kill -9 "$dummy_pid" 2>/dev/null || true
    fi
  fi

  manifest "$config" "phase_b" "$status" "$note"
  return 0
}
# ── Stage 0 — pre-flight ────────────────────────────────────────────────
# Snapshot the environment (pip package list, GPU list) under preflight/ and,
# unless SKIP_PATCH=1, apply the step-timing patch. Whether step timing is
# available is written to preflight/patch_status.txt.
case " $STAGES " in
  *" 0 "*)
    echo "=== Stage 0 — pre-flight ==="
    mkdir -p "$RUN_ROOT/preflight"
    pip freeze > "$RUN_ROOT/preflight/pip_freeze.txt" 2>&1 || true
    nvidia-smi -L > "$RUN_ROOT/preflight/nvidia.txt" 2>&1 || true
    if [[ "${SKIP_PATCH:-0}" == "1" ]]; then
      # Patch explicitly bypassed by the caller.
      echo "step_timing_available=false (SKIP_PATCH=1)" > "$RUN_ROOT/preflight/patch_status.txt"
    elif "$PYTHON" "$HERE/patches/apply_step_timing.py" --apply > "$RUN_ROOT/preflight/patch.log" 2>&1; then
      echo "step_timing_available=true" > "$RUN_ROOT/preflight/patch_status.txt"
    else
      # Apply failed: record it and show the patch log on the console.
      echo "step_timing_available=false (apply failed)" > "$RUN_ROOT/preflight/patch_status.txt"
      cat "$RUN_ROOT/preflight/patch.log"
    fi
    ;;
esac
# ── Stage 1 — Phase A all configs ──────────────────────────────────────
# One Phase A run per config, visiting the configs in a freshly shuffled order.
case " $STAGES " in
  *" 1 "*)
    echo "=== Stage 1 — Phase A (all configs, randomized) ==="
    # Left unquoted on purpose: word splitting turns the shuffled lines into
    # the loop items.
    for cfg in $(shuffle_configs "${ALL_CONFIGS[@]}"); do
      run_phase_a "$cfg"
    done
    ;;
esac
# ── Stage 2 — pick reference rates ─────────────────────────────────────
# Run analyze.py over the run directory.
# NOTE(review): presumably this is what produces $RUN_ROOT/aggregate.json with
# the ref_safe value Phase B reads — confirm against analyze.py.
case " $STAGES " in
  *" 2 "*)
    echo "=== Stage 2 — pick reference rates ==="
    "$PYTHON" "$HERE/analyze.py" --root "$RUN_ROOT"
    ;;
esac
# ── Stage 3 — Phase B all configs ──────────────────────────────────────
# One Phase B run per config, visiting the configs in a freshly shuffled order.
case " $STAGES " in
  *" 3 "*)
    echo "=== Stage 3 — Phase B (all configs, randomized) ==="
    # Left unquoted on purpose: word splitting turns the shuffled lines into
    # the loop items.
    for cfg in $(shuffle_configs "${ALL_CONFIGS[@]}"); do
      run_phase_b "$cfg"
    done
    ;;
esac
# ── Stage 4 — analyze + plot + revert ──────────────────────────────────
# Re-run the analysis, draw the plots, then revert the step-timing patch.
if [[ " $STAGES " == *" 4 "* ]]; then
  echo "=== Stage 4 — re-analyze with Phase B + plot + revert patch ==="
  "$PYTHON" "$HERE/analyze.py" --root "$RUN_ROOT"
  "$PYTHON" "$HERE/plot_connector_tax.py" --root "$RUN_ROOT"

  # preflight/ is only created by Stage 0. Make sure it exists before logging
  # into it: when the ">>" redirection cannot open its file, bash does not run
  # the command at all, so the revert would be skipped without any message.
  mkdir -p "$RUN_ROOT/preflight"
  if ! "$PYTHON" "$HERE/patches/apply_step_timing.py" --revert \
    >> "$RUN_ROOT/preflight/patch.log" 2>&1; then
    # Still best effort (the stage continues), but no longer silent.
    echo "WARNING: step-timing patch revert returned non-zero; check $RUN_ROOT/preflight/patch.log and whether the patch is still applied" >&2
  fi
  echo "Done. Results in $RUN_ROOT"
fi