Files
agentic-pd-hybrid/scripts/sweep_e2_kvc_v2_rdma.sh
tim ad8aaa8c5a feat(experiments): E2 sweep — KVC v2 + RDMA on the matched subset
KVC v2 config from sweep_ts1_migration_v2.sh (reset-on-success +
direct-append threshold 8192) layered on top of the RDMA-enabled
mooncake stack, against the same outputs/inferact_50sess.jsonl
subset that E1 uses. Pair-wise contrast tests H1 (KVC layer marginal
contribution on top of 1P3D + kv-aware) and H2/H3 (RDMA reducing
reseed slow-path tail).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 00:49:53 +08:00

91 lines
2.9 KiB
Bash
Executable File

#!/usr/bin/env bash
# E2 — KVC v2 + RDMA, ts=1
#
# Tests hypotheses H2/H3 from ONBOARDING_NEXT_AGENT_ZH §3.1: validate
# that enabling real RDMA pushes TTFT p99 from the reported 1.28s
# (TCP loopback) down toward ~0.7s (still expected to lose to DP 0.43s
# because the re-prefill segment of the reseed slow path remains).
#
# Mechanism = kvcache-centric; policy = kv-aware; topology = 1P3D.
# All --kvcache-* tuning flags are taken from sweep_ts1_migration_v2.sh
# (reset-on-success + threshold 8192). RDMA is on (IB device mlx5_60 by default).
#
# Uses the same outputs/inferact_50sess.jsonl as E1 — see
# scripts/sample_trace_subset.py — so the two runs are paired.
#
# Prerequisites:
# - source scripts/setup_env.sh
# - E1 must already have completed (releases GPUs)
#
# Usage:
# bash scripts/sweep_e2_kvc_v2_rdma.sh
# Strict mode: abort on command failure, on use of an unset variable, and on
# failure of any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Work from the parent of the directory that contains this script so the
# relative outputs/ and scripts/ paths below resolve.
cd "$(dirname "$0")/.."

# CUDA_HOME doubles as the "environment has been set up" marker.
[[ -n "${CUDA_HOME:-}" ]] || {
  echo "ERROR: CUDA_HOME not set. Source scripts/setup_env.sh first." >&2
  exit 1
}

# Tunables: each keeps a non-empty value from the caller's environment and
# otherwise falls back to the default given here.
: "${MODEL:=/mnt/models/Qwen/Qwen3-30B-A3B-Instruct-2507}"
: "${TRACE:=outputs/inferact_50sess.jsonl}"
: "${OUTPUT:=outputs/e2_kvc_v2_rdma_50sess}"
: "${IB_DEVICE:=mlx5_60}"

# The trace must already exist as a regular file; print the command that
# generates it when it does not.
[[ -f "$TRACE" ]] || {
  {
    echo "ERROR: trace not found at $TRACE"
    echo "Run: uv run --no-sync python scripts/sample_trace_subset.py --output $TRACE --sessions 50"
  } >&2
  exit 1
}

# Create the output directory; the sweep log lives inside it.
mkdir -p "$OUTPUT"
LOG="${OUTPUT}/sweep.log"
#######################################
# Print a timestamped message and append the same line to the sweep log.
# Globals:   LOG (read) - path of the file the line is appended to
# Arguments: message words; joined into a single line via "$*"
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in $LOG
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
    | tee -a "$LOG"
}
# Record the effective configuration at the top of this run's log output so
# the sweep log is self-describing.
log "=== E2: KVC v2 + RDMA, ts=1 ==="
log "MODEL=$MODEL"
# The line count of the trace is reported as the request count (presumably one
# request per JSONL line — confirm against the trace format). The redirect
# target is quoted: an unquoted $TRACE containing whitespace makes bash fail
# with "ambiguous redirect".
log "TRACE=$TRACE ($(wc -l < "$TRACE") requests)"
log "OUTPUT=$OUTPUT"
log "IB_DEVICE=$IB_DEVICE"

# Label used in log messages and as the filename prefix of the copied results.
label=e2_kvc_v2_rdma_run1
log ""
log "=== [E2] $label starting ==="
# Arguments for the live benchmark, collected in an array so each option sits
# on its own line without backslash continuations.
benchmark_args=(
  # Inputs and outputs.
  --trace "$TRACE"
  --output-root "$OUTPUT"
  # Mechanism, policy and model under test.
  --mechanism kvcache-centric
  --policy kv-aware
  --model-path "$MODEL"
  # Topology: 1 prefill worker on GPU 0, 3 decode workers on GPUs 1-3.
  --prefill-workers 1 --decode-workers 3
  --prefill-tp-size 1 --decode-tp-size 1
  --prefill-gpu-ids 0 --decode-gpu-ids 1,2,3
  # Transfer: mooncake backend with RDMA forced on the selected IB device.
  --transfer-backend mooncake
  --force-rdma --ib-device "$IB_DEVICE"
  # Load shape and timeouts.
  --gpu-budget 4
  --time-scale 1
  --session-sample-rate 1.0
  --target-duration-s 100000
  --concurrency-limit 32
  --timeout-s 1800
  --request-timeout-s 300
  # KV-cache tuning flags.
  --kvcache-admission-mode worker
  --kvcache-seed-min-turn-id 1
  --kvcache-seed-max-inflight-decode -1
  --kvcache-prefill-backup-policy release-after-transfer
  --kvcache-prefill-priority-eviction
  --kvcache-migration-reject-threshold 3
  --kvcache-direct-max-uncached-tokens 8192
)

# Run the benchmark, merging stderr into stdout and mirroring everything into
# the sweep log while it still streams to the terminal.
uv run --no-sync python -m agentic_pd_hybrid.cli benchmark-live "${benchmark_args[@]}" 2>&1 \
  | tee -a "$LOG"
# Find the most recently modified kvcache-centric-* directory under $OUTPUT.
# A glob plus the -nt test replaces `ls -td ... | head -1`: parsing ls is
# fragile, and under `set -euo pipefail` a missing directory made ls fail and
# the script abort silently at the assignment.
run_dir=""
for candidate in "$OUTPUT"/kvcache-centric-*/; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -d "$candidate" ]] || continue
  if [[ -z "$run_dir" || "$candidate" -nt "$run_dir" ]]; then
    run_dir=$candidate
  fi
done

# The benchmark returned success but left no run directory: report it in the
# log and on stderr, and keep the non-zero exit status.
if [[ -z "$run_dir" ]]; then
  log "ERROR: [E2] $label finished but no kvcache-centric-* directory was found under $OUTPUT" >&2
  exit 1
fi

log "=== [E2] $label COMPLETED, artifacts at $run_dir ==="

# Copy the results next to the sweep log under label-prefixed names. The copy
# is skipped when the summary file is absent. $run_dir ends in "/" (from the
# glob), so strip it before joining paths.
artifacts=${run_dir%/}
if [[ -f "$artifacts/request-metrics.jsonl.summary.json" ]]; then
  cp -- "$artifacts/request-metrics.jsonl.summary.json" "$OUTPUT/${label}_summary.json"
  cp -- "$artifacts/request-metrics.jsonl" "$OUTPUT/${label}_metrics.jsonl"
  log "=== summary saved to $OUTPUT/${label}_summary.json ==="
fi