Uses the same outputs/inferact_50sess.jsonl subset as E1/E2 (md5
7bb263a32600ef5a6ef5099ba340a487). Identical to E2 except that it adds
--kvcache-load-floor-bonus 200. Tests three hypotheses:
H1 (load balance): D2 receives non-trivial bindings (E1/E2: 0)
H2 (failure rate): mooncake batch_transfer timeouts disappear
because D0/D1 KV pool no longer saturates
(E2 had 1054 fails; expect ≤ E1's 85)
H3 (TTFT): E2's 0.43s p50 (over the 231 successes)
generalizes to most reqs once cascade is gone
K can be overridden via the LOAD_FLOOR_BONUS env var (default 200).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
106 lines
3.7 KiB
Bash
Executable File
106 lines
3.7 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# E3 — KVC v2 + RDMA + load-floor bonus, ts=1
#
# Validates the load-floor bonus fix proposed in
# docs/E1_E2_FIX_DESIGN_ZH.md §Q2.B. Identical to E2 except:
#   --kvcache-load-floor-bonus 200
#
# Pair-wise vs E1 (no KVC layer) and E2 (KVC v2 without bonus) on the
# exact same outputs/inferact_50sess.jsonl subset.
#
# Hypotheses being tested:
#   H1 (load balance): D2 should now receive non-trivial bindings
#                      (E1/E2 had 0 — see E1_E2_RESULTS_ZH.md §5d).
#   H2 (failure rate): mooncake batch_transfer_sync timeouts should
#                      stop firing because D0/D1 KV pool no longer
#                      saturates → no LRU thrash → control plane no
#                      longer starves. E2 had 1054 failures; expect
#                      ≤ E1's 85.
#   H3 (TTFT):         the 231 successful E2 reqs had TTFT p50 = 0.43s,
#                      well under E1's 88.6s. With the failure cascade
#                      removed, these should generalize to most reqs.
#
# Prerequisites:
#   - source scripts/setup_env.sh
#     (sets CUDA_HOME, MC_TRANSFER_TIMEOUT=1800, etc.)
#   - outputs/inferact_50sess.jsonl exists (md5 7bb263a32600ef5a6ef5099ba340a487)
#   - Previous sweep done; GPUs idle.
#
# Usage:
#   bash scripts/sweep_e3_kvc_v2_loadfloor_rdma.sh
#
# Override defaults via env (e.g. K=500):
#   LOAD_FLOOR_BONUS=500 bash scripts/sweep_e3_kvc_v2_loadfloor_rdma.sh
|
|
|
|
# Strict mode: abort on any failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Run from the parent of the directory containing this script so that the
# relative paths used below (outputs/..., scripts/...) resolve consistently.
cd "$(dirname "$0")/.."

# Refuse to start unless CUDA_HOME is set and non-empty; the error message
# points at scripts/setup_env.sh as the place that sets it.
if [[ -z "${CUDA_HOME:-}" ]]; then
  echo "ERROR: CUDA_HOME not set. Source scripts/setup_env.sh first." >&2
  exit 1
fi
|
|
|
|
# Tunables. Each one keeps a value already present in the environment and
# falls back to the default shown here when unset or empty.
MODEL=${MODEL:-/mnt/models/Qwen/Qwen3-30B-A3B-Instruct-2507}  # passed as --model-path
TRACE=${TRACE:-outputs/inferact_50sess.jsonl}                 # passed as --trace
OUTPUT=${OUTPUT:-outputs/e3_kvc_v2_loadfloor_rdma_50sess}     # passed as --output-root; also holds sweep.log
IB_DEVICE=${IB_DEVICE:-mlx5_60}                               # passed as --ib-device
LOAD_FLOOR_BONUS=${LOAD_FLOOR_BONUS:-200}                     # passed as --kvcache-load-floor-bonus (K)
|
|
|
|
# Fail fast when the trace file is missing, and print the command that
# regenerates a 50-session subset at the expected path.
if [[ ! -f "$TRACE" ]]; then
  echo "ERROR: trace not found at $TRACE" >&2
  echo "Run: uv run --no-sync python scripts/sample_trace_subset.py --output $TRACE --sessions 50" >&2
  exit 1
fi

# Create the output directory (no error if it already exists). sweep.log
# inside it is appended to by log() and by the benchmark's tee'd output.
mkdir -p "$OUTPUT"
LOG="$OUTPUT/sweep.log"
|
|
|
|
# log MESSAGE...
# Joins all arguments with spaces, prefixes them with a local
# "[YYYY-MM-DD HH:MM:SS]" timestamp, writes the line to stdout and appends
# the same line to the file named by $LOG.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
    | tee -a "$LOG"
}
|
|
|
|
# Record the effective configuration at the top of this run's log section.
log "=== E3: KVC v2 + RDMA + load-floor bonus K=$LOAD_FLOOR_BONUS, ts=1 ==="
log "MODEL=$MODEL"
# The request count is the trace's line count. The redirection target is
# quoted so a TRACE path containing spaces or glob characters still works.
log "TRACE=$TRACE ($(wc -l < "$TRACE") requests)"
log "OUTPUT=$OUTPUT"
log "IB_DEVICE=$IB_DEVICE"
# Prints the placeholder "default-30s" when MC_TRANSFER_TIMEOUT is unset or
# empty. NOTE(review): 30s is presumably the transfer backend's own default —
# confirm.
log "MC_TRANSFER_TIMEOUT=${MC_TRANSFER_TIMEOUT:-default-30s}"

# Label used in log lines and as the filename prefix of the copied artifacts.
label=e3_kvc_v2_loadfloor_run1
log ""
log "=== [E3] $label starting ==="
|
|
|
|
# Arguments for the benchmark-live subcommand, grouped by concern. Flags and
# values are unchanged; keeping them in an array avoids a long chain of
# backslash continuations and keeps each value a single word.
bench_args=(
  # Input trace and output location.
  --trace "$TRACE"
  --output-root "$OUTPUT"

  # Mechanism, policy and model under test.
  --mechanism kvcache-centric
  --policy kv-aware
  --model-path "$MODEL"

  # Topology: 1 prefill worker on GPU 0, 3 decode workers on GPUs 1-3,
  # tensor-parallel size 1 for both, 4 GPUs budgeted in total.
  --prefill-workers 1 --decode-workers 3
  --prefill-tp-size 1 --decode-tp-size 1
  --prefill-gpu-ids 0 --decode-gpu-ids 1,2,3
  --gpu-budget 4

  # KV transfer over mooncake with RDMA forced on the chosen IB device.
  --transfer-backend mooncake
  --force-rdma --ib-device "$IB_DEVICE"

  # Replay pacing and limits (time-scale 1 is the "ts=1" in the run title).
  --time-scale 1
  --session-sample-rate 1.0
  --target-duration-s 100000
  --concurrency-limit 32
  --timeout-s 1800
  --request-timeout-s 300

  # KV-cache-centric settings shared with E2 (per the header comment).
  --kvcache-admission-mode worker
  --kvcache-seed-min-turn-id 1
  --kvcache-seed-max-inflight-decode -1
  --kvcache-prefill-backup-policy release-after-transfer
  --kvcache-prefill-priority-eviction
  --kvcache-migration-reject-threshold 3
  --kvcache-direct-max-uncached-tokens 8192

  # The one difference from E2: the load-floor bonus K.
  --kvcache-load-floor-bonus "$LOAD_FLOOR_BONUS"
)

# Run the benchmark, merging stderr into stdout and appending everything to
# the sweep log. With `set -e -o pipefail` in effect, a non-zero exit from
# the benchmark aborts the script at this line.
uv run --no-sync python -m agentic_pd_hybrid.cli benchmark-live \
  "${bench_args[@]}" 2>&1 | tee -a "$LOG"
|
|
|
|
# Find the most recently modified kvcache-centric-*/ directory under $OUTPUT.
# A glob plus -nt comparison replaces `ls -td ... | head -1`: parsing ls is
# fragile, and under `set -e -o pipefail` a no-match made that pipeline fail
# and abort the script without any message.
shopt -s nullglob
run_candidates=("$OUTPUT"/kvcache-centric-*/)
shopt -u nullglob

if (( ${#run_candidates[@]} == 0 )); then
  log "ERROR: [E3] $label finished but no kvcache-centric-* directory exists under $OUTPUT" >&2
  exit 1
fi

run_dir=${run_candidates[0]}
for candidate in "${run_candidates[@]}"; do
  # -nt is true when candidate's modification time is newer than run_dir's.
  if [[ "$candidate" -nt "$run_dir" ]]; then
    run_dir=$candidate
  fi
done
log "=== [E3] $label COMPLETED, artifacts at $run_dir ==="

# Copy the per-request metrics and their summary next to the sweep log under
# label-prefixed names. The glob leaves a trailing slash on run_dir, which is
# stripped before joining paths.
summary_src="${run_dir%/}/request-metrics.jsonl.summary.json"
metrics_src="${run_dir%/}/request-metrics.jsonl"
if [[ -f "$summary_src" ]]; then
  cp -- "$summary_src" "$OUTPUT/${label}_summary.json"
  cp -- "$metrics_src" "$OUTPUT/${label}_metrics.jsonl"
  log "=== summary saved to $OUTPUT/${label}_summary.json ==="
else
  # Previously skipped silently; say so, so a missing summary is noticed.
  log "WARNING: no summary found at $summary_src; nothing copied"
fi
|