#!/usr/bin/env bash
# E3 — KVC v2 + RDMA + load-floor bonus, ts=1
#
# Validates the load-floor bonus fix proposed in
# docs/E1_E2_FIX_DESIGN_ZH.md §Q2.B. Identical to E2 except:
#   --kvcache-load-floor-bonus 200
#
# Pair-wise vs E1 (no KVC layer) and E2 (KVC v2 without bonus) on the
# exact same outputs/inferact_50sess.jsonl subset.
#
# Hypotheses being tested:
#   H1 (load balance): D2 should now receive non-trivial bindings
#                      (E1/E2 had 0 — see E1_E2_RESULTS_ZH.md §5d).
#   H2 (failure rate): mooncake batch_transfer_sync timeouts should
#                      stop firing because D0/D1 KV pool no longer
#                      saturates → no LRU thrash → control plane no
#                      longer starves. E2 had 1054 failures; expect
#                      ≤ E1's 85.
#   H3 (TTFT):         the 231 successful E2 reqs had TTFT p50 = 0.43s,
#                      well under E1's 88.6s. With the failure cascade
#                      removed, these should generalize to most reqs.
#
# Prerequisites:
#   - source scripts/setup_env.sh
#     (sets CUDA_HOME, MC_TRANSFER_TIMEOUT=1800, etc.)
#   - outputs/inferact_50sess.jsonl exists (md5 7bb263a32600ef5a6ef5099ba340a487)
#   - Previous sweep done; GPUs idle.
#
# Usage:
#   bash scripts/sweep_e3_kvc_v2_loadfloor_rdma.sh
#
# Override defaults via env (MODEL, TRACE, OUTPUT, IB_DEVICE,
# LOAD_FLOOR_BONUS), e.g.:
#   LOAD_FLOOR_BONUS=500 bash scripts/sweep_e3_kvc_v2_loadfloor_rdma.sh
#
# Exit status:
#   1 if CUDA_HOME is unset, the trace file is missing, or no
#   kvcache-centric-* run directory exists after the benchmark; otherwise
#   any failing command aborts the script (set -euo pipefail).

set -euo pipefail

# All relative paths below are resolved from the parent of this script's
# directory.
cd -- "$(dirname -- "$0")/.."

if [[ -z "${CUDA_HOME:-}" ]]; then
  echo "ERROR: CUDA_HOME not set. Source scripts/setup_env.sh first." >&2
  exit 1
fi

# Tunables; each may be overridden from the environment.
MODEL=${MODEL:-/mnt/models/Qwen/Qwen3-30B-A3B-Instruct-2507}
TRACE=${TRACE:-outputs/inferact_50sess.jsonl}
OUTPUT=${OUTPUT:-outputs/e3_kvc_v2_loadfloor_rdma_50sess}
IB_DEVICE=${IB_DEVICE:-mlx5_60}
LOAD_FLOOR_BONUS=${LOAD_FLOOR_BONUS:-200}

if [[ ! -f "$TRACE" ]]; then
  echo "ERROR: trace not found at $TRACE" >&2
  echo "Run: uv run --no-sync python scripts/sample_trace_subset.py --output $TRACE --sessions 50" >&2
  exit 1
fi

mkdir -p -- "$OUTPUT"
LOG="$OUTPUT/sweep.log"

#######################################
# Print a timestamped message and append it to the sweep log.
# Globals:   LOG (read)
# Arguments: message words, joined with spaces
# Outputs:   "[YYYY-mm-dd HH:MM:SS] message" to stdout and to $LOG
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG"
}

#######################################
# Find the most recently modified kvcache-centric-* directory.
# Arguments: $1 - directory to search
# Outputs:   the newest matching directory (with trailing slash) on stdout
# Returns:   0 if a directory was found, 1 otherwise
#######################################
newest_run_dir() {
  local candidate
  local newest=""
  # The trailing slash restricts the glob to directories; the -d test also
  # skips the literal pattern that bash leaves behind when nothing matches.
  for candidate in "$1"/kvcache-centric-*/; do
    [[ -d "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  [[ -n "$newest" ]] || return 1
  printf '%s\n' "$newest"
}

log "=== E3: KVC v2 + RDMA + load-floor bonus K=$LOAD_FLOOR_BONUS, ts=1 ==="
log "MODEL=$MODEL"
log "TRACE=$TRACE ($(wc -l < "$TRACE") requests)"
log "OUTPUT=$OUTPUT"
log "IB_DEVICE=$IB_DEVICE"
log "MC_TRANSFER_TIMEOUT=${MC_TRANSFER_TIMEOUT:-default-30s}"

label=e3_kvc_v2_loadfloor_run1
log ""
log "=== [E3] $label starting ==="

# With pipefail, a non-zero exit from the benchmark aborts the script here
# even though tee itself succeeds.
uv run --no-sync python -m agentic_pd_hybrid.cli benchmark-live \
  --trace "$TRACE" \
  --output-root "$OUTPUT" \
  --mechanism kvcache-centric \
  --policy kv-aware \
  --model-path "$MODEL" \
  --prefill-workers 1 --decode-workers 3 \
  --prefill-tp-size 1 --decode-tp-size 1 \
  --prefill-gpu-ids 0 --decode-gpu-ids 1,2,3 \
  --transfer-backend mooncake \
  --force-rdma --ib-device "$IB_DEVICE" \
  --gpu-budget 4 \
  --time-scale 1 \
  --session-sample-rate 1.0 \
  --target-duration-s 100000 \
  --concurrency-limit 32 \
  --timeout-s 1800 \
  --request-timeout-s 300 \
  --kvcache-admission-mode worker \
  --kvcache-seed-min-turn-id 1 \
  --kvcache-seed-max-inflight-decode -1 \
  --kvcache-prefill-backup-policy release-after-transfer \
  --kvcache-prefill-priority-eviction \
  --kvcache-migration-reject-threshold 3 \
  --kvcache-direct-max-uncached-tokens 8192 \
  --kvcache-load-floor-bonus "$LOAD_FLOOR_BONUS" 2>&1 | tee -a "$LOG"

# Report a missing run directory explicitly instead of letting a failed
# pipeline abort the script without any message.
if ! run_dir=$(newest_run_dir "$OUTPUT"); then
  log "ERROR: no kvcache-centric-* run directory found under $OUTPUT" >&2
  exit 1
fi
log "=== [E3] $label COMPLETED, artifacts at $run_dir ==="

summary_src="$run_dir/request-metrics.jsonl.summary.json"
if [[ -f "$summary_src" ]]; then
  cp -- "$summary_src" "$OUTPUT/${label}_summary.json"
  cp -- "$run_dir/request-metrics.jsonl" "$OUTPUT/${label}_metrics.jsonl"
  log "=== summary saved to $OUTPUT/${label}_summary.json ==="
else
  # Not fatal: the run directory is still available for manual inspection.
  log "WARNING: $summary_src not found; no summary copied"
fi