agentic-pd-hybrid/scripts/sweep_ts1_migration_v2.sh

#!/bin/bash
# Migration v2 validation: KVC 1P3D ts=1 with BOTH:
#   (1) reset-on-success blacklist decay (replay.py code change)
#   (2) --kvcache-direct-max-uncached-tokens 8192 (was 2048 default)
#
# v1 results (kvc_1p3d_migration_run1) showed:
#   - lat mean WORSE +11.7%, TTFT mean WORSE +71.3% — thrashing tax
#   - direct-to-D rate UP +10.5pp (42.8 → 53.3%)
#   - Fallback breakdown surprise: 41.3% are 'real-large-append' (>2048 tok),
#     NOT 'session-not-resident' as we hypothesized
#
# v2 design (REFACTOR_PLAN_V1 + MIGRATION_V1_FINDINGS):
#   (1) reset-on-success: clear (sess,D) reject counter on successful direct-to-D
#       — eliminates blacklist-permanence bug → kills thrashing
#   (2) bump direct-append threshold 2048 → 8192: lets more large-append turns
#       go direct-to-D instead of falling through to seed (which often rejects)
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Work from the parent of the directory containing this script so the relative
# paths below (outputs/, .venv/, src/, third_party/) resolve the same way no
# matter where the script is invoked from.
cd "$(dirname "$0")/.."

# Inputs and outputs of this sweep.
MODEL=/mnt/kzlin/workflow/pd-hybrid/simm-swe-bench/models/Qwen3-30B-A3B-Instruct-2507
TRACE=outputs/qwen35-swebench-50sess.jsonl
OUTPUT=outputs/qwen3-30b-tp1-ts1-migration-v2
VENV_PYTHON=.venv/bin/python
RESULTS_FILE="$OUTPUT/sweep_results.txt"

# The results file is appended to by log(), so its directory must exist first.
mkdir -p -- "$OUTPUT"

#######################################
# Print a timestamped message and append the same line to the results file.
# Globals:   RESULTS_FILE (read) - file the line is appended to
# Arguments: message words; joined with single spaces via "$*"
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in RESULTS_FILE
#######################################
log() {
  # RESULTS_FILE is quoted so a path containing whitespace stays one tee target.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$RESULTS_FILE"
}

# Record the sweep banner, the reference numbers from earlier runs, and the
# goal of this run. Each entry is emitted through log(), so it gets its own
# timestamp and is appended to the results file.
banner_lines=(
  "=== TS=1 MIGRATION v2: reset-on-success + threshold=8192 ==="
  "Baselines:"
  "  baseline (no migration):        kvc_1p3d_run1 errors=5 lat_p50=0.811s ttft_p50=0.124s direct=42.8%"
  "  v1 (migration permanent):       kvc_1p3d_migration_run1 errors=6 lat_p50=0.773s ttft_p50=0.057s direct=53.3% lat_mean=1.758s"
  "  4DP ts=1:                       errors=0 lat_p50=0.659s ttft_p50=0.090s lat_mean=1.443s"
  "Goal: kill thrashing tax (lat_mean ≤ 1.5s, p99 ≤ 9s) while preserving v1's direct-to-D gains."
)
for banner_line in "${banner_lines[@]}"; do
  log "$banner_line"
done

# Label used to name the copied summary/metrics files of this run.
label=kvc_1p3d_migration_v2_run1
log ""
log "=== [migration v2] starting ==="

# Arguments for `agentic_pd_hybrid.cli benchmark-live`. They are kept in an
# array with every expansion quoted so each value reaches the CLI as exactly
# one word, whatever characters the configured paths contain.
bench_args=(
  --trace "$TRACE"
  --output-root "$OUTPUT"
  --mechanism kvcache-centric
  --policy kv-aware
  --model-path "$MODEL"
  --prefill-workers 1 --decode-workers 3
  --prefill-tp-size 1 --decode-tp-size 1
  --prefill-gpu-ids 0 --decode-gpu-ids 1,2,3
  --transfer-backend mooncake
  --gpu-budget 4
  --time-scale 1
  --session-sample-rate 1.0
  --target-duration-s 100000
  --concurrency-limit 32
  --timeout-s 900
  --request-timeout-s 300
  --kvcache-admission-mode worker
  --kvcache-seed-min-turn-id 1
  --kvcache-seed-max-inflight-decode -1
  --kvcache-prefill-backup-policy release-after-transfer
  --kvcache-prefill-priority-eviction
  --kvcache-migration-reject-threshold 3
  # v2 change (2) from the file header: 8192 instead of the 2048 default.
  --kvcache-direct-max-uncached-tokens 8192
)

# Capture the exit status instead of letting `set -e` abort the script
# silently: a failed run is recorded in the results file, then the script
# exits with the benchmark's own status.
bench_rc=0
PYTHONPATH=src:third_party/sglang/python \
  "$VENV_PYTHON" -m agentic_pd_hybrid.cli benchmark-live "${bench_args[@]}" \
  || bench_rc=$?
if (( bench_rc != 0 )); then
  log "=== [migration v2] $label FAILED (exit=$bench_rc) ==="
  exit "$bench_rc"
fi

# Collect the results of the run that just finished.
#
# The newest kvcache-centric-*/ directory under $OUTPUT (by modification time)
# is taken to be this run's output. It is located with a glob and `-nt`
# instead of `ls -td ... | head -1`: under `set -eo pipefail` an `ls` that
# matched nothing made the assignment fail, so the script aborted silently
# before the guard below could report anything.
run_dir=""
for candidate in "$OUTPUT"/kvcache-centric-*/; do
  [[ -d "$candidate" ]] || continue  # an unmatched glob stays a literal string
  if [[ -z "$run_dir" || "$candidate" -nt "$run_dir" ]]; then
    run_dir=$candidate
  fi
done
run_dir=${run_dir%/}

if [[ -z "$run_dir" ]]; then
  log "=== [migration v2] $label: no kvcache-centric-* run directory under $OUTPUT ==="
  exit 1
fi

log "=== [migration v2] $label COMPLETED ==="
summary_src="$run_dir/request-metrics.jsonl.summary.json"
summary_copy="$OUTPUT/${label}_summary.json"
if [[ -f "$summary_src" ]]; then
  # Keep labelled copies of the summary and the raw per-request metrics.
  cp -- "$summary_src" "$summary_copy"
  cp -- "$run_dir/request-metrics.jsonl" "$OUTPUT/${label}_metrics.jsonl"
  # The file path is passed through argv rather than spliced into the Python
  # source, so quotes or backslashes in the path cannot break the snippet.
  errs=$("$VENV_PYTHON" -c 'import json, sys; d = json.load(open(sys.argv[1])); print(d.get("error_count", 0))' "$summary_copy")
  p50=$("$VENV_PYTHON" -c 'import json, sys; d = json.load(open(sys.argv[1])); print(d.get("latency_stats_s", {}).get("p50", 0))' "$summary_copy")
  log "  errors=$errs lat_p50=${p50}s"
  # Append the full summary JSON to the results file for later inspection.
  cat -- "$summary_src" >> "$RESULTS_FILE"
else
  log "  WARNING: $summary_src not found; no metrics collected"
fi
log "=== migration v2 DONE ==="