#!/usr/bin/env bash
# Smoke sweep: validate backpressure code change on top of v5 Option D config.
# Designed to fit in ~3-4h GPU budget (4 runs × ~30-60 min).
#
# Usage:
#   bash scripts/sweep_backpressure_smoke.sh
#
# Environment overrides (all optional):
#   OUT_ROOT    directory that receives per-run output dirs and sweep.log
#   TRACE       trace file passed to every run via --trace
#   MODEL       model passed to every run via --model
#   MODEL_PATH  fallback name for MODEL (used only when MODEL is unset/empty)
#
# Prerequisites: GPUs available; trace at outputs/qwen35-swebench-50sess.jsonl;
# model at $MODEL (default Qwen3-30B-A3B-Instruct-2507).
#
# Failure behavior: `set -euo pipefail` is active, so the first run that exits
# non-zero aborts the whole sweep; later runs are not attempted.

set -euo pipefail

# Resolve the repository root from this script's location (one level up) and
# run from there, so the relative default paths below resolve consistently.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$REPO_ROOT"

OUT_ROOT=${OUT_ROOT:-outputs/sweep_backpressure_smoke}
TRACE=${TRACE:-outputs/qwen35-swebench-50sess.jsonl}
# MODEL wins when set; MODEL_PATH is accepted as a fallback because the
# prerequisites note historically referred to the model by that name.
MODEL=${MODEL:-${MODEL_PATH:-/mnt/kzlin/workflow/pd-hybrid/simm-swe-bench/models/Qwen3-30B-A3B-Instruct-2507}}

mkdir -p "$OUT_ROOT"
LOG="$OUT_ROOT/sweep.log"

#######################################
# Print a timestamped message to stdout and append it to the sweep log.
# Globals:   LOG (read)
# Arguments: $* - message text
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%F %T')" "$*" | tee -a "$LOG"
}

#######################################
# Log an error (to stderr and the sweep log) and abort the sweep.
# Arguments: $* - error text
#######################################
die() {
  log "ERROR: $*" >&2
  exit 1
}

#######################################
# Log a banner, then launch one `benchmark-live` run. The run's stdout and
# stderr are shown on the terminal and appended to the sweep log.
# Globals:   LOG (read)
# Arguments: $1   - banner text (wrapped in "=== ... ===")
#            $2.. - arguments forwarded verbatim to the CLI
# Returns:   non-zero if the benchmark (or tee) fails, via pipefail
#######################################
run_benchmark() {
  local banner=$1
  shift
  log "=== ${banner} ==="
  python -m agentic_pd_hybrid.cli benchmark-live "$@" 2>&1 | tee -a "$LOG"
}

log "Starting backpressure smoke sweep"
printf '%s\n' " Trace: $TRACE" | tee -a "$LOG"
printf '%s\n' " Model: $MODEL" | tee -a "$LOG"
printf '%s\n' " Output root: $OUT_ROOT" | tee -a "$LOG"

# Fail fast on a missing trace rather than discovering it after the benchmark
# launcher has started. The model check only warns: the default is a filesystem
# path, but whether the CLI also accepts non-path model identifiers is not
# visible here.
[[ -f "$TRACE" ]] || die "trace file not found: $TRACE (relative to $REPO_ROOT)"
[[ -e "$MODEL" ]] || log "WARNING: model path not found on this host: $MODEL"

# Arguments shared by the kvcache-centric runs (E1-E3):
# 2 prefill workers on GPUs 0,1 and 6 decode workers on GPUs 2-7.
KVC_COMMON_ARGS=(
  --trace "$TRACE"
  --model "$MODEL"
  --mechanism kvcache-centric
  --policy kv-aware
  --kvcache-admission-mode worker
  --kvcache-seed-min-turn-id 1
  --kvcache-seed-max-inflight-decode -1
  --kvcache-prefill-backup-policy release-after-transfer
  --kvcache-prefill-priority-eviction
  --prefill-workers 2
  --decode-workers 6
  --prefill-gpu-ids 0,1
  --decode-gpu-ids 2,3,4,5,6,7
  --transfer-backend mooncake
  --target-duration-s 2000
  --session-sample-rate 1.0
  --min-turns 2
  --concurrency-limit 32
)

# Arguments for the pd-colo comparison run (E4): 8 direct workers on GPUs 0-7.
DP_COMMON_ARGS=(
  --trace "$TRACE"
  --model "$MODEL"
  --mechanism pd-colo
  --policy kv-aware
  --direct-workers 8
  --direct-gpu-ids 0,1,2,3,4,5,6,7
  --transfer-backend mooncake
  --target-duration-s 2000
  --session-sample-rate 1.0
  --min-turns 2
  --concurrency-limit 32
)

# E1: kvcache-centric config without backpressure flags, time-scale 10.
run_kvc_baseline_ts10() {
  local out="$OUT_ROOT/E1_kvc_baseline_ts10"
  run_benchmark "E1: KVC baseline (no backpressure) time-scale=10" \
    "${KVC_COMMON_ARGS[@]}" \
    --output-root "$out" \
    --time-scale 10
}

# E2: same as E1 plus --enable-backpressure with a 2.0 s max pause.
run_kvc_backpressure_ts10() {
  local out="$OUT_ROOT/E2_kvc_backpressure_ts10"
  run_benchmark "E2: KVC + backpressure ON, time-scale=10" \
    "${KVC_COMMON_ARGS[@]}" \
    --output-root "$out" \
    --time-scale 10 \
    --enable-backpressure \
    --backpressure-max-pause-s 2.0
}

# E3: backpressure on, time-scale 1, target duration lowered to 1800.
# --target-duration-s is passed twice (2000 from the common args, then 1800);
# presumably the CLI honors the last occurrence -- TODO confirm.
# NOTE(review): the banner says "FIRST 1000 reqs", but no flag visible here
# caps the request count; only the target duration differs -- verify.
run_kvc_backpressure_ts1() {
  local out="$OUT_ROOT/E3_kvc_backpressure_ts1_short"
  run_benchmark "E3: KVC + backpressure ON, time-scale=1, FIRST 1000 reqs" \
    "${KVC_COMMON_ARGS[@]}" \
    --output-root "$out" \
    --time-scale 1 \
    --enable-backpressure \
    --backpressure-max-pause-s 2.0 \
    --target-duration-s 1800
}

# E4: pd-colo comparison at time-scale 1, target duration lowered to 1800.
# The duplicated --target-duration-s and the "FIRST 1000 reqs" wording carry
# the same caveats as E3 -- verify against the CLI.
run_dp_baseline_ts1() {
  local out="$OUT_ROOT/E4_dp_ts1_short"
  run_benchmark "E4: 8-way DP cache-aware, time-scale=1, FIRST 1000 reqs" \
    "${DP_COMMON_ARGS[@]}" \
    --output-root "$out" \
    --time-scale 1 \
    --target-duration-s 1800
}

# Sequence — add/remove as fits the budget.
run_kvc_baseline_ts10
run_kvc_backpressure_ts10
run_kvc_backpressure_ts1
run_dp_baseline_ts1

log "=== sweep DONE ==="
printf '%s\n' "Run analysis with: python scripts/analysis/analyze_backpressure_smoke.py $OUT_ROOT" | tee -a "$LOG"