Files
agentic-pd-hybrid/scripts/sweep_backpressure_smoke.sh
kzlin 7affb565b2 feat(kvc): add backpressure smoke sweep + analyzer (and v6 p1 profile script)
scripts/sweep_backpressure_smoke.sh: 4-run smoke matrix (KVC baseline /
KVC + backpressure / KVC + backpressure @ time-scale=1 / DP @
time-scale=1) designed to fit ~3-4h GPU budget. Validates §3 backpressure
implementation and partially probes §7 time-scale distortion.

scripts/analysis/analyze_backpressure_smoke.py: consumes the new
structural/* jsonl files plus request-metrics; emits headline metrics,
backpressure histograms, admission probe stats, and per-session pinning
distribution.

scripts/sweep_tp1_v6_p1_profile.sh: pre-existing v6 P1 profile sweep
script (was untracked; included for completeness).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-06 21:29:56 +08:00

115 lines
3.6 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Smoke sweep: validate backpressure code change on top of v5 Option D config.
# Designed to fit in ~3-4h GPU budget (4 runs × ~30-60 min).
#
# Usage:
# bash scripts/sweep_backpressure_smoke.sh
#
# Prerequisites: GPUs available; trace at $TRACE (default
# outputs/qwen35-swebench-50sess.jsonl); model at $MODEL (default: the
# Qwen3-30B-A3B-Instruct-2507 checkpoint path set below).
# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# The repo root is the parent of the directory holding this script. All
# relative default paths below are resolved from there.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$REPO_ROOT"

# Environment-overridable settings; the default applies when the variable is
# unset or empty.
: "${OUT_ROOT:=outputs/sweep_backpressure_smoke}"
: "${TRACE:=outputs/qwen35-swebench-50sess.jsonl}"
: "${MODEL:=/mnt/kzlin/workflow/pd-hybrid/simm-swe-bench/models/Qwen3-30B-A3B-Instruct-2507}"

mkdir -p "$OUT_ROOT"
LOG="$OUT_ROOT/sweep.log"

# Banner: shown on the terminal and appended to the sweep log.
{
  echo "[$(date '+%F %T')] Starting backpressure smoke sweep"
  echo " Trace: $TRACE"
  echo " Model: $MODEL"
  echo " Output root: $OUT_ROOT"
} | tee -a "$LOG"
# CLI flags shared by every kvcache-centric run in this sweep. Built up in
# groups; element order is significant because run functions append their own
# flags after these.
KVC_COMMON_ARGS=()
# Inputs: trace file and model path.
KVC_COMMON_ARGS+=(--trace "$TRACE" --model "$MODEL")
# Mechanism and policy selection.
KVC_COMMON_ARGS+=(--mechanism kvcache-centric --policy kv-aware)
# kvcache-* knobs.
KVC_COMMON_ARGS+=(
  --kvcache-admission-mode worker
  --kvcache-seed-min-turn-id 1
  --kvcache-seed-max-inflight-decode -1
  --kvcache-prefill-backup-policy release-after-transfer
  --kvcache-prefill-priority-eviction
)
# Worker counts and GPU assignment: 2 prefill workers on GPUs 0-1,
# 6 decode workers on GPUs 2-7.
KVC_COMMON_ARGS+=(
  --prefill-workers 2
  --decode-workers 6
  --prefill-gpu-ids 0,1
  --decode-gpu-ids 2,3,4,5,6,7
)
KVC_COMMON_ARGS+=(--transfer-backend mooncake)
# Run duration, sampling and concurrency settings.
KVC_COMMON_ARGS+=(
  --target-duration-s 2000
  --session-sample-rate 1.0
  --min-turns 2
  --concurrency-limit 32
)
# CLI flags shared by the pd-colo ("DP") runs in this sweep. Built up in
# groups; element order is significant because run functions append their own
# flags after these.
DP_COMMON_ARGS=()
# Inputs: trace file and model path.
DP_COMMON_ARGS+=(--trace "$TRACE" --model "$MODEL")
# Mechanism and policy selection.
DP_COMMON_ARGS+=(--mechanism pd-colo --policy kv-aware)
# 8 direct workers, one per GPU 0-7.
DP_COMMON_ARGS+=(--direct-workers 8 --direct-gpu-ids 0,1,2,3,4,5,6,7)
DP_COMMON_ARGS+=(--transfer-backend mooncake)
# Run duration, sampling and concurrency settings.
DP_COMMON_ARGS+=(
  --target-duration-s 2000
  --session-sample-rate 1.0
  --min-turns 2
  --concurrency-limit 32
)
#######################################
# E1: kvcache-centric baseline at time-scale 10, without the
# --enable-backpressure flag.
# Globals:   OUT_ROOT, LOG, KVC_COMMON_ARGS (read)
# Arguments: none
# Outputs:   banner plus the CLI's stdout/stderr, echoed to the terminal and
#            appended to $LOG; the CLI is pointed at
#            $OUT_ROOT/E1_kvc_baseline_ts10 via --output-root.
#######################################
run_kvc_baseline_ts10() {
  local run_dir="${OUT_ROOT}/E1_kvc_baseline_ts10"
  # Shared flags first, then the per-run flags.
  local -a cli_args=(
    "${KVC_COMMON_ARGS[@]}"
    --output-root "$run_dir"
    --time-scale 10
  )

  echo "[$(date '+%F %T')] === E1: KVC baseline (no backpressure) time-scale=10 ===" | tee -a "$LOG"
  python -m agentic_pd_hybrid.cli benchmark-live "${cli_args[@]}" 2>&1 | tee -a "$LOG"
}
#######################################
# E2: kvcache-centric run at time-scale 10 with --enable-backpressure and
# --backpressure-max-pause-s 2.0.
# Globals:   OUT_ROOT, LOG, KVC_COMMON_ARGS (read)
# Arguments: none
# Outputs:   banner plus the CLI's stdout/stderr, echoed to the terminal and
#            appended to $LOG; the CLI is pointed at
#            $OUT_ROOT/E2_kvc_backpressure_ts10 via --output-root.
#######################################
run_kvc_backpressure_ts10() {
  local run_dir="${OUT_ROOT}/E2_kvc_backpressure_ts10"
  # Shared flags first, then the per-run flags.
  local -a cli_args=(
    "${KVC_COMMON_ARGS[@]}"
    --output-root "$run_dir"
    --time-scale 10
    --enable-backpressure
    --backpressure-max-pause-s 2.0
  )

  echo "[$(date '+%F %T')] === E2: KVC + backpressure ON, time-scale=10 ===" | tee -a "$LOG"
  python -m agentic_pd_hybrid.cli benchmark-live "${cli_args[@]}" 2>&1 | tee -a "$LOG"
}
#######################################
# E3: kvcache-centric run at time-scale 1 with --enable-backpressure and
# --backpressure-max-pause-s 2.0, with --target-duration-s 1800.
# Globals:   OUT_ROOT, LOG, KVC_COMMON_ARGS (read)
# Arguments: none
# Outputs:   banner plus the CLI's stdout/stderr, echoed to the terminal and
#            appended to $LOG; the CLI is pointed at
#            $OUT_ROOT/E3_kvc_backpressure_ts1_short via --output-root.
#######################################
run_kvc_backpressure_ts1() {
  local run_dir="${OUT_ROOT}/E3_kvc_backpressure_ts1_short"
  # --target-duration-s also appears (as 2000) in KVC_COMMON_ARGS; the 1800
  # here comes later on the command line. Presumably the CLI lets the last
  # occurrence win -- TODO confirm.
  local -a cli_args=(
    "${KVC_COMMON_ARGS[@]}"
    --output-root "$run_dir"
    --time-scale 1
    --enable-backpressure
    --backpressure-max-pause-s 2.0
    --target-duration-s 1800
  )

  # NOTE(review): the banner says "FIRST 1000 reqs", but no request-count flag
  # is passed here; the only bound visible is --target-duration-s. Confirm.
  echo "[$(date '+%F %T')] === E3: KVC + backpressure ON, time-scale=1, FIRST 1000 reqs ===" | tee -a "$LOG"
  python -m agentic_pd_hybrid.cli benchmark-live "${cli_args[@]}" 2>&1 | tee -a "$LOG"
}
#######################################
# E4: pd-colo (8-way DP) run at time-scale 1 with --target-duration-s 1800.
# Globals:   OUT_ROOT, LOG, DP_COMMON_ARGS (read)
# Arguments: none
# Outputs:   banner plus the CLI's stdout/stderr, echoed to the terminal and
#            appended to $LOG; the CLI is pointed at
#            $OUT_ROOT/E4_dp_ts1_short via --output-root.
#######################################
run_dp_baseline_ts1() {
  local run_dir="${OUT_ROOT}/E4_dp_ts1_short"
  # --target-duration-s also appears (as 2000) in DP_COMMON_ARGS; the 1800
  # here comes later on the command line. Presumably the CLI lets the last
  # occurrence win -- TODO confirm.
  local -a cli_args=(
    "${DP_COMMON_ARGS[@]}"
    --output-root "$run_dir"
    --time-scale 1
    --target-duration-s 1800
  )

  # NOTE(review): the banner says "FIRST 1000 reqs", but no request-count flag
  # is passed here; the only bound visible is --target-duration-s. Confirm.
  echo "[$(date '+%F %T')] === E4: 8-way DP cache-aware, time-scale=1, FIRST 1000 reqs ===" | tee -a "$LOG"
  python -m agentic_pd_hybrid.cli benchmark-live "${cli_args[@]}" 2>&1 | tee -a "$LOG"
}
# Run order for the sweep. Comment out entries to fit the GPU budget.
# Steps run one after another; with `set -e` in effect, a failing step stops
# the sweep before the remaining steps and the DONE banner.
sweep_steps=(
  run_kvc_baseline_ts10
  run_kvc_backpressure_ts10
  run_kvc_backpressure_ts1
  run_dp_baseline_ts1
)
for step in "${sweep_steps[@]}"; do
  "$step"
done

# Closing banner and a pointer to the analysis script, both also logged.
echo "[$(date '+%F %T')] === sweep DONE ===" | tee -a "$LOG"
echo "Run analysis with: python scripts/analysis/analyze_backpressure_smoke.py $OUT_ROOT" | tee -a "$LOG"