agentic-pd-hybrid/scripts/sweep_tp1_v5_baseline_rerun_exp2.sh

#!/bin/bash
# P0: Re-run v5 baseline EXP2 (2P6D) three times to establish whether
# errors=9 is a stable property of the v5 config or single-run variance.
# Critic of V5_PROFILE_INVESTIGATION_ZH.md flagged that the 415 errors in
# v5+profile EXP2 may have been polling-induced. We need 3 baseline runs
# (no polling, identical config to original v5) to test reproducibility.
#
# Output:
#   outputs/qwen3-30b-tp1-v5-optD-baseline-rerun/
#     ├── sweep_results.txt   (timestamped log lines plus each run's summary JSON)
#     ├── exp2_2p6d_run{1,2,3}_summary.json
#     ├── exp2_2p6d_run{1,2,3}_metrics.jsonl
#     └── kvcache-centric-...<ts>/   (one per run)
# Strict mode: abort on any failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail
# Work from the parent of this script's directory so that every relative path
# below resolves the same way regardless of the caller's working directory.
cd "$(dirname "$0")/.."

# Model checkpoint handed to the benchmark via --model-path.
MODEL=/mnt/kzlin/workflow/pd-hybrid/simm-swe-bench/models/Qwen3-30B-A3B-Instruct-2507
# Request trace handed to the benchmark via --trace.
TRACE=outputs/qwen35-swebench-50sess.jsonl
# Root for benchmark run directories and for the per-run copies made below.
OUTPUT=outputs/qwen3-30b-tp1-v5-optD-baseline-rerun
# Python interpreter used for the benchmark CLI and for reading summary JSON.
VENV_PYTHON=.venv/bin/python
# Append-only log of the whole sweep (written by log and run_exp2).
RESULTS_FILE="$OUTPUT/sweep_results.txt"

# Quoted so the path is passed as a single argument.
mkdir -p "$OUTPUT"

#######################################
# Print a timestamped message and append the same line to the results file.
# Globals:
#   RESULTS_FILE (read) - file the line is appended to
# Arguments:
#   $* - message text; joined with single spaces (may be empty)
# Outputs:
#   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and appended to RESULTS_FILE
#######################################
log() {
  # RESULTS_FILE is quoted so a path containing whitespace or glob characters
  # is passed to tee as one file name instead of being split into several.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$RESULTS_FILE"
}

#######################################
# Run one EXP2 benchmark (2 prefill workers + 6 decode workers) and archive
# its results under a per-run label.
# Globals:
#   VENV_PYTHON, TRACE, OUTPUT, MODEL, RESULTS_FILE (all read)
# Arguments:
#   $1 - run index; used in log lines and in the archived file names
#   $2 - total number of runs, used only in log lines (optional, default 3)
# Outputs:
#   On success copies the run's summary and metrics to
#   $OUTPUT/exp2_2p6d_run<idx>_summary.json and ..._metrics.jsonl, logs the
#   error count, and appends the summary JSON to RESULTS_FILE.
# Returns:
#   0 when the benchmark command succeeds, even if no results were found
#   (a WARNING is logged instead). A failing benchmark command is not handled
#   here; under the script's `set -e` it aborts the sweep.
#######################################
run_exp2() {
  local run_idx=$1
  local total_runs=${2:-3}
  local label="exp2_2p6d_run${run_idx}"
  local dir

  # Snapshot the run directories that exist BEFORE this run. Picking "the
  # newest directory" afterwards would silently archive a previous run's
  # results as this run's if the benchmark produced no directory of its own.
  local known_dirs=$'\n'
  for dir in "$OUTPUT"/kvcache-centric-*/; do
    [[ -d "$dir" ]] || continue   # unmatched glob stays literal; skip it
    known_dirs+="${dir}"$'\n'
  done

  log ""
  log "=== [RUN ${run_idx}/${total_runs}] EXP2 2P6D KVC kv-aware Option D (no polling) ==="
  PYTHONPATH=src:third_party/sglang/python \
    "$VENV_PYTHON" -m agentic_pd_hybrid.cli benchmark-live \
    --trace "$TRACE" \
    --output-root "$OUTPUT" \
    --mechanism kvcache-centric \
    --policy kv-aware \
    --model-path "$MODEL" \
    --prefill-workers 2 --decode-workers 6 \
    --prefill-tp-size 1 --decode-tp-size 1 \
    --prefill-gpu-ids 0,1 --decode-gpu-ids 2,3,4,5,6,7 \
    --transfer-backend mooncake \
    --gpu-budget 8 \
    --time-scale 10 \
    --session-sample-rate 1.0 \
    --target-duration-s 100000 \
    --concurrency-limit 32 \
    --timeout-s 900 \
    --request-timeout-s 300 \
    --kvcache-admission-mode worker \
    --kvcache-seed-min-turn-id 1 \
    --kvcache-seed-max-inflight-decode -1 \
    --kvcache-prefill-backup-policy release-after-transfer \
    --kvcache-prefill-priority-eviction

  # Locate the directory created by this run: the most recently modified one
  # that was not present in the snapshot.
  local run_dir=""
  for dir in "$OUTPUT"/kvcache-centric-*/; do
    [[ -d "$dir" ]] || continue
    case "$known_dirs" in
      *$'\n'"$dir"$'\n'*) continue ;;
    esac
    if [[ -z "$run_dir" || "$dir" -nt "$run_dir" ]]; then
      run_dir=$dir
    fi
  done

  log "=== [RUN ${run_idx}/${total_runs}] $label COMPLETED ==="
  if [[ -z "$run_dir" ]]; then
    log "WARNING: no new kvcache-centric-* run directory under $OUTPUT"
    return 0
  fi

  local summary_src="$run_dir/request-metrics.jsonl.summary.json"
  local summary_copy="$OUTPUT/${label}_summary.json"
  if [[ -f "$summary_src" ]]; then
    cp "$summary_src" "$summary_copy"
    cp "$run_dir/request-metrics.jsonl" "$OUTPUT/${label}_metrics.jsonl"

    # The path is passed as an argument rather than spliced into the Python
    # source, so quotes or backslashes in it cannot break the snippet. A parse
    # failure is reported as "unknown" instead of being masked or aborting.
    local read_errors='import json, sys; print(json.load(open(sys.argv[1])).get("error_count", 0))'
    local errs
    errs=$("$VENV_PYTHON" -c "$read_errors" "$summary_copy") || errs="unknown"
    log "  errors = $errs (baseline reference = 9)"

    cat "$summary_src" >> "$RESULTS_FILE"
    echo "" >> "$RESULTS_FILE"
  else
    log "WARNING: no summary file in $run_dir"
  fi
}

# ---------------------------------------------------------------------------
# Driver: announce the experiment, execute the baseline runs back to back,
# then log one error-count line per run.
# ---------------------------------------------------------------------------
# Single source of truth for the number of repetitions (banner and both loops).
total_runs=3

log "=== P0: v5 baseline EXP2 reproducibility test (${total_runs} runs, no polling) ==="
log "Model: $MODEL"
# NOTE(review): the trace file name says "50sess" while this message says
# 52 sessions - confirm which figure is correct.
log "Trace: $TRACE (4449 requests, 52 sessions)"
log "Goal: confirm whether errors=9 in v5 baseline EXP2 is reproducible"
log "      (v5+profile saw 415 errors; we need to know if polling was causal)"

for ((i = 1; i <= total_runs; i++)); do
  run_exp2 "$i" "$total_runs"
done

log ""
log "=== P0 SUMMARY: errors per run ==="
for ((i = 1; i <= total_runs; i++)); do
  summary="$OUTPUT/exp2_2p6d_run${i}_summary.json"
  if [[ -f "$summary" ]]; then
    # Pass the path as an argument instead of splicing it into the Python
    # source, and report a parse failure as "unknown" rather than letting
    # `set -e` abort the summary half-way through.
    e=$("$VENV_PYTHON" -c 'import json, sys; print(json.load(open(sys.argv[1])).get("error_count", 0))' "$summary") || e="unknown"
    log "  run ${i}: errors = $e"
  else
    # Make a missing run visible instead of silently omitting it.
    log "  run ${i}: no summary file"
  fi
done
log "=== P0 ALL DONE ==="