bench.sh: add trap for graceful cleanup on kill/interrupt
Added EXIT/INT/TERM traps to ensure vLLM, proxy, and gpu_monitor processes are cleaned up even when bench.sh is killed externally. Also adds gpu_monitor to the process pattern matched by cleanup_gpu.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -87,12 +87,11 @@ CONF
|
|||||||
# ─── GPU Cleanup (verified) ────────────────────────────────────────────────
|
# ─── GPU Cleanup (verified) ────────────────────────────────────────────────
|
||||||
|
|
||||||
cleanup_gpu() {
|
cleanup_gpu() {
|
||||||
echo "[cleanup] Killing all vLLM/proxy processes..."
|
echo "[cleanup] Killing all vLLM/proxy/monitor processes..."
|
||||||
for p in $(ps aux | grep -E 'vllm serve|cache_aware_proxy' | grep -v grep | awk '{print $2}' 2>/dev/null); do
|
for p in $(ps aux | grep -E 'vllm serve|cache_aware_proxy|gpu_monitor' | grep -v grep | awk '{print $2}' 2>/dev/null); do
|
||||||
kill -9 "$p" 2>/dev/null || true
|
kill -9 "$p" 2>/dev/null || true
|
||||||
done
|
done
|
||||||
sleep 3
|
sleep 3
|
||||||
# Kill any remaining GPU holders
|
|
||||||
local gpu_pids
|
local gpu_pids
|
||||||
gpu_pids=$(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u | grep -v '^$' || true)
|
gpu_pids=$(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u | grep -v '^$' || true)
|
||||||
if [ -n "$gpu_pids" ]; then
|
if [ -n "$gpu_pids" ]; then
|
||||||
@@ -100,7 +99,6 @@ cleanup_gpu() {
|
|||||||
echo "$gpu_pids" | xargs -r kill -9 2>/dev/null || true
|
echo "$gpu_pids" | xargs -r kill -9 2>/dev/null || true
|
||||||
sleep 5
|
sleep 5
|
||||||
fi
|
fi
|
||||||
# Verify GPUs are free
|
|
||||||
local used
|
local used
|
||||||
used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | awk '{s+=$1} END{print s}')
|
used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | awk '{s+=$1} END{print s}')
|
||||||
if [ "${used:-0}" -gt 100 ]; then
|
if [ "${used:-0}" -gt 100 ]; then
|
||||||
@@ -111,6 +109,9 @@ cleanup_gpu() {
|
|||||||
echo "[cleanup] All GPUs verified free."
|
echo "[cleanup] All GPUs verified free."
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# On SIGINT/SIGTERM: announce, clean up once, and exit non-zero.
# The EXIT trap is cleared first because `exit` inside a signal handler
# still fires any EXIT trap, which would otherwise run cleanup_gpu a
# second time (repeating its kills and sleeps).
trap 'trap - EXIT; echo "[bench.sh] Caught signal, cleaning up..."; cleanup_gpu; exit 1' INT TERM
|
||||||
|
# Run cleanup_gpu whenever the script exits, whatever the exit path.
trap cleanup_gpu EXIT
|
||||||
|
|
||||||
# ─── Launch vLLM instances ─────────────────────────────────────────────────
|
# ─── Launch vLLM instances ─────────────────────────────────────────────────
|
||||||
|
|
||||||
launch_instances() {
|
launch_instances() {
|
||||||
@@ -335,6 +336,6 @@ launch_proxy
|
|||||||
run_benchmark
|
run_benchmark
|
||||||
collect_artifacts
|
collect_artifacts
|
||||||
print_summary
|
print_summary
|
||||||
cleanup_gpu
|
# cleanup_gpu runs automatically via EXIT trap
|
||||||
|
|
||||||
echo "[done] $(date)"
|
echo "[done] $(date)"
|
||||||
|
|||||||
Reference in New Issue
Block a user