cache_aware_proxy: add lmetric_decode_weight (decode-load penalty in the LMetric fallback score) and a v3 anti-hotspot recent-migration penalty (effective_load = num_req + recent-migration count over a sliding window), preventing back-to-back migration clustering. UNIFIED_ABLATION.md documents the A (overload_factor=1.3) + B' (decode-weight, max(num_req,1)) + RaceFix sweep: A+B'+RaceFix reaches TTFT p90 7770ms, beating v3 PD-sep migration by ~20%. Runners/analyzer for the b3 trace replay included. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
67 lines
2.2 KiB
Bash
Executable File
67 lines
2.2 KiB
Bash
Executable File
#!/usr/bin/env bash
# Trace replay for the new unified_v3 (offload-decode) policy.
#
# Runs the same trace as run_b3_replay.sh on a single policy:
#   unified_v3 — prefill on session-affinity host (uses prefix cache),
#                decode migrated to a low-load target via Mooncake
#                KV transfer (kv_role=kv_both). Session affinity rotates
#                to decode_target after migration so the next turn lands
#                where the KV now lives.
#
# Applies CT_DR_FIX so the run uses the "best Mooncake state" we have
# today (post-e3a1d70 + DR sync skipped).
#
# Optional environment overrides: PROJ_DIR, TRACE, OUTROOT, VLLM_ROOT.
#
# Usage: bash run_v3_replay.sh

# Shell options: unset variables are errors (-u) and a pipeline fails if any
# stage fails (pipefail).
# NOTE(review): -e is not enabled, so a failing command does not abort the
# script on its own — confirm this is intended.
set -uo pipefail

# --- Configuration ----------------------------------------------------------
# PROJ_DIR, TRACE, OUTROOT and VLLM_ROOT can be overridden from the
# environment; the remaining paths are always derived from PROJ_DIR.
PROJ_DIR="${PROJ_DIR:-/home/admin/cpfs/wjh/agentic-kv}"
TRACE="${TRACE:-$PROJ_DIR/traces/w600_r0.0015_st30.jsonl}"
# Minute-resolution timestamp used to give each run its own output directory.
DATE="$(date +%Y%m%d_%H%M)"
OUTROOT="${OUTROOT:-$PROJ_DIR/outputs/b3_v3_${DATE}}"
PYTHON="$PROJ_DIR/.venv/bin/python"
# Script invoked with --apply / --revert and --vllm-root to toggle CT_DR_FIX.
DR_FIX_SCRIPT="$PROJ_DIR/microbench/connector_tax/cache_sweep/apply_direct_read_fix.py"
VLLM_ROOT="${VLLM_ROOT:-$PROJ_DIR/.venv/lib/python3.12/site-packages/vllm}"

# --- Banner -----------------------------------------------------------------
mkdir -p "$OUTROOT"
echo "=== unified_v3 (offload-decode) trace replay ==="
echo "Trace : $TRACE"
echo "Out : $OUTROOT"
echo ""

#######################################
# Best-effort teardown: kill serving processes and revert CT_DR_FIX.
# Every step tolerates failure, so the function always returns 0 and can be
# used both as a pre-run reset and as an EXIT trap.
# Globals:   PYTHON, DR_FIX_SCRIPT, VLLM_ROOT (read)
# Arguments: none
# Returns:   0
#######################################
cleanup_all() {
  local pattern
  # pkill -f matches against the full command line; a non-zero status
  # (e.g. nothing matched) is deliberately ignored.
  for pattern in cache_aware_proxy "vllm serve" "EngineCore"; do
    pkill -9 -f "$pattern" 2>/dev/null || true
  done
  # Fixed pause after the kills, before touching the vLLM install.
  sleep 5
  # Revert is best-effort here: stderr is discarded and failure is ignored.
  "$PYTHON" "$DR_FIX_SCRIPT" --revert --vllm-root "$VLLM_ROOT" 2>/dev/null || true
}

# Run the teardown on every exit path, then once up front so the replay
# starts with no leftover proxy/vLLM processes and the fix reverted.
trap cleanup_all EXIT
cleanup_all

echo "[stage 0] applying CT_DR_FIX (env-gated)"
# Abort if the fix cannot be applied: continuing would replay the trace
# without CT_DR_FIX while the results are still labelled as using it.
# The EXIT trap performs the cleanup on this path.
if ! "$PYTHON" "$DR_FIX_SCRIPT" --apply --vllm-root "$VLLM_ROOT"; then
  echo "ERROR: failed to apply CT_DR_FIX via $DR_FIX_SCRIPT; aborting replay" >&2
  exit 1
fi

# Per-policy output directory; also receives the orchestrator log.
cfg_dir="$OUTROOT/unified_v3"
mkdir -p "$cfg_dir"

# Activate the DR-fix env-gate (unified_v3 uses Mooncake kv_both)
export VLLM_MOONCAKE_DISABLE_DIRECT_READ_SYNC=1

echo ""
echo "====== unified_v3 ; DR_SYNC_DISABLED=1 ======"
# Full output goes to orchestrator.log; only the last 30 lines reach the
# terminal.
bash "$PROJ_DIR/scripts/b3_isolated_policy.sh" "unified_v3" "$TRACE" "$cfg_dir" \
  2>&1 | tee "$cfg_dir/orchestrator.log" | tail -30
# Must be read immediately: PIPESTATUS is overwritten by the next command.
# Index 0 is the orchestrator itself rather than tee/tail.
run_status=${PIPESTATUS[0]}

# Tear down whatever the orchestrator left running.
pkill -9 -f cache_aware_proxy 2>/dev/null || true
pkill -9 -f "vllm serve" 2>/dev/null || true
pkill -9 -f "EngineCore" 2>/dev/null || true
sleep 5

echo ""
echo "[stage Z] reverting CT_DR_FIX"
# Unlike the trap's silent revert, this one shows its output; a failure is
# reported but does not stop the script (the EXIT trap retries the revert).
"$PYTHON" "$DR_FIX_SCRIPT" --revert --vllm-root "$VLLM_ROOT" \
  || echo "WARNING: CT_DR_FIX revert failed; vLLM at $VLLM_ROOT may still be patched" >&2

echo ""
# Propagate an orchestrator failure instead of always reporting success.
if (( run_status != 0 )); then
  echo "FAILED: unified_v3 replay exited with status $run_status; see $cfg_dir/orchestrator.log" >&2
  exit "$run_status"
fi
echo "Done. Artifacts: $OUTROOT/unified_v3/metrics.jsonl"