#!/usr/bin/env bash
# scripts/sweep_e4_pressured.sh
#
# E4-pressured — same as E4 but tuned to force admission rejections so the
# D→P snapshot fast-path actually fires.
#
# Key delta vs sweep_e4_kvc_v2_d_to_p_sync.sh:
#   --kvcache-migration-reject-threshold 1   (was 3)
#     After ONE rejection the policy migrates the session to a different D,
#     which in turn triggers _invoke_kvcache_seeded_router → D→P sync.
#     (Rely solely on reject_threshold=1 for now; mem-fraction reduction
#     would need extra_server_args plumbing, which is out of scope here.)
#
# Hypotheses (same as docs/E4_PROTOCOL_ZH.md but in a stressed regime):
#   H1'  E4-pressured TTFT p99 ≤ E1 TTFT p99
#   H2'  D→P snapshot succeeds for ≥ 20% of reseed-triggering requests
#   H3'  D→P-pushed-then-cache-hit measurably reduces the re-prefill segment
#        of the reseed-path TTFT
#
# Required environment:
#   CUDA_HOME          must be set (source scripts/setup_env.sh first)
#
# Optional environment overrides (defaults in parentheses):
#   MODEL              (/mnt/models/Qwen/Qwen3-30B-A3B-Instruct-2507)
#   TRACE              (outputs/inferact_50sess.jsonl)
#   OUTPUT             (outputs/e4p_kvc_v2_d_to_p_sync_pressured_50sess)
#   IB_DEVICE          (mlx5_60)
#   LOAD_FLOOR_BONUS   (200)
#   REJECT_THRESHOLD   (1)
#   MEM_FRACTION       (0.5)  logged only; NOT forwarded to the benchmark CLI

set -euo pipefail

# Run from the repository root (this script lives in scripts/).
cd -- "$(dirname -- "$0")/.."

if [[ -z "${CUDA_HOME:-}" ]]; then
  echo "ERROR: CUDA_HOME not set. Source scripts/setup_env.sh first." >&2
  exit 1
fi

MODEL=${MODEL:-/mnt/models/Qwen/Qwen3-30B-A3B-Instruct-2507}
TRACE=${TRACE:-outputs/inferact_50sess.jsonl}
OUTPUT=${OUTPUT:-outputs/e4p_kvc_v2_d_to_p_sync_pressured_50sess}
IB_DEVICE=${IB_DEVICE:-mlx5_60}
LOAD_FLOOR_BONUS=${LOAD_FLOOR_BONUS:-200}
REJECT_THRESHOLD=${REJECT_THRESHOLD:-1}
MEM_FRACTION=${MEM_FRACTION:-0.5}

if [[ ! -f "$TRACE" ]]; then
  echo "ERROR: trace not found at $TRACE" >&2
  exit 1
fi

mkdir -p -- "$OUTPUT"
LOG="$OUTPUT/sweep.log"

# Print a timestamped message to stdout and append it to $LOG.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG"
}

# Print the most recently modified kvcache-centric-*/ directory under $1
# (with its trailing slash), or an empty line when none exists.
# Uses a glob and -nt instead of parsing `ls -t` output.
latest_run_dir() {
  local root=$1
  local candidate
  local newest=""
  for candidate in "$root"/kvcache-centric-*/; do
    # An unmatched glob stays literal; skip anything that is not a directory.
    [[ -d "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s\n' "$newest"
}

log "=== E4-pressured: KVC v2 + RDMA + load-floor K=$LOAD_FLOOR_BONUS + D→P sync + reject_threshold=$REJECT_THRESHOLD + mem_fraction=$MEM_FRACTION ==="
# MEM_FRACTION appears in the banner above, but no flag below consumes it;
# say so explicitly so the log does not misrepresent the run configuration.
log "NOTE: MEM_FRACTION=$MEM_FRACTION is recorded only; it is not passed to the benchmark CLI"
log "MODEL=$MODEL"
log "TRACE=$TRACE ($(wc -l < "$TRACE") requests)"
log "OUTPUT=$OUTPUT"

label=e4p_kvc_v2_d_to_p_sync_run1
log "=== [E4p] $label starting ==="

benchmark_args=(
  --trace "$TRACE"
  --output-root "$OUTPUT"
  --mechanism kvcache-centric
  --policy kv-aware
  --model-path "$MODEL"
  --prefill-workers 1 --decode-workers 3
  --prefill-tp-size 1 --decode-tp-size 1
  --prefill-gpu-ids 0 --decode-gpu-ids 1,2,3
  --transfer-backend mooncake
  --force-rdma --ib-device "$IB_DEVICE"
  --gpu-budget 4
  --time-scale 1
  --session-sample-rate 1.0
  --target-duration-s 100000
  --concurrency-limit 32
  --timeout-s 1800
  --request-timeout-s 300
  --kvcache-admission-mode worker
  --kvcache-seed-min-turn-id 1
  --kvcache-seed-max-inflight-decode -1
  --kvcache-prefill-backup-policy release-after-transfer
  --kvcache-prefill-priority-eviction
  --kvcache-migration-reject-threshold "$REJECT_THRESHOLD"
  --kvcache-direct-max-uncached-tokens 8192
  --kvcache-load-floor-bonus "$LOAD_FLOOR_BONUS"
  --enable-d-to-p-sync
)

# Capture the pipeline status (pipefail makes a benchmark failure visible
# through tee) so the failure is recorded in the log before exiting with the
# same status that `set -e` would have produced.
rc=0
uv run --no-sync python -m agentic_pd_hybrid.cli benchmark-live \
  "${benchmark_args[@]}" 2>&1 \
  | tee -a "$LOG" || rc=$?
if ((rc != 0)); then
  log "ERROR: [E4p] $label benchmark exited with status $rc"
  exit "$rc"
fi

run_dir=$(latest_run_dir "$OUTPUT")
if [[ -z "$run_dir" ]]; then
  log "ERROR: [E4p] $label produced no kvcache-centric-* run directory under $OUTPUT"
  exit 1
fi
log "=== [E4p] $label COMPLETED, artifacts at $run_dir ==="

# Copy the headline artifacts next to the sweep log under a stable,
# label-based name. Skipped when the run produced no summary file.
if [[ -f "$run_dir/request-metrics.jsonl.summary.json" ]]; then
  cp -- "$run_dir/request-metrics.jsonl.summary.json" "$OUTPUT/${label}_summary.json"
  cp -- "$run_dir/request-metrics.jsonl" "$OUTPUT/${label}_metrics.jsonl"
  log "=== summary saved to $OUTPUT/${label}_summary.json ==="
fi