agentic-pd-hybrid/scripts/sweep_e4_pressured.sh

#!/usr/bin/env bash
# E4-pressured — same as E4 but tuned to force admission rejections so the
# D→P snapshot fast-path actually fires.
#
# Key delta vs sweep_e4_kvc_v2_d_to_p_sync.sh:
#   --kvcache-migration-reject-threshold 1   (was 3)
#       After ONE rejection the policy migrates the session to a different
#       D, which in turn triggers _invoke_kvcache_seeded_router → D→P sync.
#   --decode-mem-fraction-static 0.4
#       Plumbed through cli.py → topology.decode_extra_server_args →
#       launcher. Shrinks per-decode KV pool, forcing admit_direct_append
#       to reject more often.
#
# Hypotheses (same as docs/E4_PROTOCOL_ZH.md but in a stressed regime):
#   H1'  E4-pressured TTFT p99 ≤ E1 TTFT p99
#   H2'  D→P snapshot succeeds for ≥ 20% of reseed-triggering requests
#   H3'  D→P-pushed-then-cache-hit reduces re-prefill segment of reseed
#        path TTFT measurably

# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Work from the parent of the directory that contains this script, so the
# relative TRACE/OUTPUT defaults below resolve the same way from any cwd.
cd "$(dirname "$0")/.."

# Refuse to start without CUDA_HOME; the message tells the operator which
# setup script to source.
[[ -n "${CUDA_HOME:-}" ]] || {
  echo "ERROR: CUDA_HOME not set. Source scripts/setup_env.sh first." >&2
  exit 1
}

# Tunables. Every value below can be overridden from the environment.
MODEL=${MODEL:-/mnt/models/Qwen/Qwen3-30B-A3B-Instruct-2507}
TRACE=${TRACE:-third_party/traces/qwen35-swebench-50sess.jsonl}
OUTPUT=${OUTPUT:-outputs/e4p_kvc_v2_d_to_p_sync_pressured_50sess}
IB_DEVICE=${IB_DEVICE:-mlx5_60}
LOAD_FLOOR_BONUS=${LOAD_FLOOR_BONUS:-200}
REJECT_THRESHOLD=${REJECT_THRESHOLD:-1}
# Decode-side static memory fraction. The benchmark command line reads
# DECODE_MEM_FRAC while the log banner prints MEM_FRACTION; previously the two
# had different defaults (0.4 vs 0.5), so the banner misreported the value in
# effect and MEM_FRACTION was a dead knob. Resolve both names to one value:
# DECODE_MEM_FRAC wins (it was the knob that took effect), then MEM_FRACTION,
# then the 0.4 default described in the header.
DECODE_MEM_FRAC=${DECODE_MEM_FRAC:-${MEM_FRACTION:-0.4}}
MEM_FRACTION=$DECODE_MEM_FRAC
# time-scale: 1 = realistic 5.44h timeline for the SWE-Bench trace;
# 10 = compress to ~33 min; 60 = compress to ~5.5 min (stress test).
TIME_SCALE=${TIME_SCALE:-1}

# Fail fast when the trace file is not a regular file at the given path.
[[ -f "$TRACE" ]] || {
  echo "ERROR: trace not found at $TRACE" >&2
  exit 1
}

# Create the output directory and point LOG at the sweep log inside it;
# log() appends to that file.
mkdir -p "$OUTPUT"
LOG="$OUTPUT/sweep.log"

# log MESSAGE...
# Prefix the arguments (joined into one string) with a local timestamp, print
# the line to stdout and append it to the file named by $LOG.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG"
}

# Banner: record the knobs and inputs for this sweep at the top of the log.
log "=== E4-pressured: KVC v2 + RDMA + load-floor K=$LOAD_FLOOR_BONUS + D→P sync + reject_threshold=$REJECT_THRESHOLD + mem_fraction=$MEM_FRACTION ==="
log "MODEL=$MODEL"
# The request count is the trace's line count. $TRACE is quoted in the
# redirection so a path containing spaces or glob characters still works.
log "TRACE=$TRACE ($(wc -l < "$TRACE") requests)"
log "OUTPUT=$OUTPUT"

# Label for this single run; it also names the artifacts copied into $OUTPUT
# at the end of the script.
label=e4p_kvc_v2_d_to_p_sync_run1
log "=== [E4p] $label starting ==="

# Background GPU utilization sampler: roughly once per second, append one CSV
# row per GPU reported by nvidia-smi to $GPU_CSV. No -i filter is passed, so
# every GPU nvidia-smi lists is sampled.
GPU_CSV="$OUTPUT/gpu_util.csv"
log "GPU sampling → $GPU_CSV (1 Hz, gpus 0-3)"
printf '%s\n' "timestamp_iso,gpu_index,util_pct,mem_used_MiB,mem_total_MiB,sm_clock_MHz,power_W,temperature_C" > "$GPU_CSV"

# Endless sampling loop; meant to run in the background and be killed by the
# parent script.
sample_gpus_forever() {
  local stamp
  local fields=index,utilization.gpu,memory.used,memory.total,clocks.sm,power.draw,temperature.gpu
  while :; do
    # UTC timestamp with millisecond precision (%3N).
    stamp=$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)
    # Prefix each row with the timestamp and strip the spaces nvidia-smi puts
    # after commas. Failures are ignored on purpose so a transient nvidia-smi
    # error does not end the loop.
    nvidia-smi --query-gpu="$fields" --format=csv,noheader,nounits 2>/dev/null \
      | sed -e "s/^/${stamp},/" -e 's/ //g' >> "$GPU_CSV" || true
    sleep 1
  done
}

sample_gpus_forever &
GPU_SAMPLER_PID=$!
log "GPU sampler pid=$GPU_SAMPLER_PID"

#######################################
# Stop the background GPU sampler and log how many lines its CSV holds.
# Idempotent: a second call is a no-op, so the "stopped" line is logged once
# even when both a signal handler and the EXIT handler lead here.
# Globals:
#   GPU_SAMPLER_PID      (read)    pid of the background sampler
#   GPU_CSV              (read)    path of the sampler's CSV file
#   GPU_SAMPLER_STOPPED  (written) guard flag, set on first call
# Returns: 0
#######################################
cleanup_gpu_sampler() {
  if [[ -n "${GPU_SAMPLER_STOPPED:-}" ]]; then
    return 0
  fi
  GPU_SAMPLER_STOPPED=1
  # Plain SIGTERM instead of SIGKILL: the sampler is a background shell loop
  # with no signal handlers of its own, so TERM ends it.
  kill "$GPU_SAMPLER_PID" 2>/dev/null || true
  # Reap the sampler; its non-zero (signal) status is expected.
  wait "$GPU_SAMPLER_PID" 2>/dev/null || true
  log "GPU sampler stopped (output: $GPU_CSV, $(wc -l < "$GPU_CSV") rows)"
}
# Clean up exactly once, on exit. INT/TERM are turned into an exit with the
# conventional 128+signal status so the script actually stops (instead of
# resuming after the handler) and the EXIT trap performs the cleanup.
trap cleanup_gpu_sampler EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Run the live benchmark. The command line is assembled in an array so the
# flags can be grouped and annotated; stdout and stderr are shown on the
# terminal and appended to $LOG.
benchmark_cmd=(
  uv run --no-sync python -m agentic_pd_hybrid.cli benchmark-live
  # Input trace and output location.
  --trace "$TRACE"
  --output-root "$OUTPUT"
  # Mechanism, routing policy and model.
  --mechanism kvcache-centric
  --policy kv-aware
  --model-path "$MODEL"
  # Topology: 1 prefill worker on GPU 0, 3 decode workers on GPUs 1-3, TP=1.
  --prefill-workers 1 --decode-workers 3
  --prefill-tp-size 1 --decode-tp-size 1
  --prefill-gpu-ids 0 --decode-gpu-ids 1,2,3
  # KV transfer: mooncake backend, RDMA forced on $IB_DEVICE.
  --transfer-backend mooncake
  --force-rdma --ib-device "$IB_DEVICE"
  --gpu-budget 4
  # Replay pacing and limits.
  --time-scale "$TIME_SCALE"
  --session-sample-rate 1.0
  --target-duration-s 100000
  --concurrency-limit 32
  --timeout-s 1800
  --request-timeout-s 300
  # KV-cache admission / seeding / migration knobs.
  --kvcache-admission-mode worker
  --kvcache-seed-min-turn-id 1
  --kvcache-seed-max-inflight-decode -1
  --kvcache-prefill-backup-policy release-after-transfer
  --kvcache-prefill-priority-eviction
  --kvcache-migration-reject-threshold "$REJECT_THRESHOLD"
  --kvcache-direct-max-uncached-tokens 8192
  --kvcache-load-floor-bonus "$LOAD_FLOOR_BONUS"
  # Memory pressure on the decode workers, plus the D→P sync under test.
  --decode-mem-fraction-static "${DECODE_MEM_FRAC:-0.4}"
  --enable-d-to-p-sync
)
"${benchmark_cmd[@]}" 2>&1 | tee -a "$LOG"

# Pick the most recently modified kvcache-centric-* directory under $OUTPUT.
# A glob plus an mtime comparison replaces `ls -td ... | head -1`: besides
# parsing ls output, that form made the script abort silently under
# `set -euo pipefail` when no directory matched, because ls exited non-zero.
run_dir=""
for candidate in "$OUTPUT"/kvcache-centric-*/; do
  # With no match the pattern stays literal and is not a directory.
  [[ -d "$candidate" ]] || continue
  if [[ -z "$run_dir" || "$candidate" -nt "$run_dir" ]]; then
    run_dir=$candidate
  fi
done

if [[ -z "$run_dir" ]]; then
  log "ERROR: [E4p] $label left no kvcache-centric-* run directory under $OUTPUT"
  exit 1
fi
log "=== [E4p] $label COMPLETED, artifacts at $run_dir ==="

# Copy the summary and per-request metrics next to the sweep log under
# label-based names. run_dir carries the glob's trailing slash; strip it
# before joining.
summary_src="${run_dir%/}/request-metrics.jsonl.summary.json"
if [[ -f "$summary_src" ]]; then
  cp -- "$summary_src" "$OUTPUT/${label}_summary.json"
  cp -- "${run_dir%/}/request-metrics.jsonl" "$OUTPUT/${label}_metrics.jsonl"
  log "=== summary saved to $OUTPUT/${label}_summary.json ==="
else
  # Keep going (as before), but say so instead of skipping silently.
  log "WARNING: no summary found at $summary_src; nothing copied"
fi