bench harness: env-tunable vLLM health timeout + both-modes 5-policy driver

- b3_isolated_policy.sh: HEALTH_MAX_TRIES is now env-overridable (the default
  stays at 180 tries = 360s); slow-node launches can pass HEALTH_MAX_TRIES=300
  (600s) to ride out a single-instance startup flake without aborting the
  whole arm.
- run_5policy_both_modes.sh: runs run_5policy_600s.sh twice on the SAME ttp
  trace with REPLAY_DISPATCH_MODE={tracets,thinktime}, so the only variable is
  dispatch mode. Outputs to outputs/policy5_600s_{mode}_<date>/.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-05-30 20:59:02 +08:00
parent 075f5bbc22
commit 68f21bef23
2 changed files with 23 additions and 2 deletions

View File

@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# 5-policy comparison in BOTH dispatch modes on the SAME ttp-annotated trace,
# so the only variable is dispatch-mode (tracets vs thinktime). Fresh vLLM
# (cold APC) per arm via run_5policy_600s.sh -> b3_isolated_policy.sh.
#
# Env overrides:
#   ROOT        repo root (default: /home/admin/cpfs/wjh/agentic-kv)
#   TRACE_FILE  trace handed to the driver in both modes
#               (default: $ROOT/traces/w600_r0.0015_st30_first600s_ttp.jsonl)
#
# Exit status: 0 when the driver succeeded in both modes; 1 when a pre-flight
# check failed or the driver exited non-zero in at least one mode.

# No `-e` on purpose: a failing mode must not stop the other mode from
# running. Failures are tracked explicitly and reported at the end instead.
set -uo pipefail

ROOT="${ROOT:-/home/admin/cpfs/wjh/agentic-kv}"
TRACE_FILE="${TRACE_FILE:-$ROOT/traces/w600_r0.0015_st30_first600s_ttp.jsonl}"
RUN5="$ROOT/microbench/connector_tax/cache_sweep/run_5policy_600s.sh"
DATE="$(date +%Y%m%d_%H%M)"

# Fail fast on a bad path instead of launching (and failing) both modes.
if [[ ! -f "$RUN5" ]]; then
  echo "ERROR: driver script not found: $RUN5" >&2
  exit 1
fi
if [[ ! -r "$TRACE_FILE" ]]; then
  echo "ERROR: trace file not readable: $TRACE_FILE" >&2
  exit 1
fi

echo "=== 5policy x {tracets,thinktime} | trace=$(basename "$TRACE_FILE") | $DATE ==="

failed_modes=()
for MODE in tracets thinktime; do
  OUT="$ROOT/outputs/policy5_600s_${MODE}_${DATE}"
  echo "############ MODE=$MODE OUT=$OUT $(date) ############"

  # TRACE / REPLAY_DISPATCH_MODE / OUTROOT are passed via the environment;
  # how the driver consumes them is defined in run_5policy_600s.sh.
  rc=0
  TRACE="$TRACE_FILE" REPLAY_DISPATCH_MODE="$MODE" OUTROOT="$OUT" \
    bash "$RUN5" || rc=$?
  if ((rc != 0)); then
    echo "WARNING: MODE=$MODE driver exited with status $rc" >&2
    failed_modes+=("$MODE")
  fi

  # The driver may have died before creating OUT; make sure the directory
  # exists so the run metadata is still recorded.
  if mkdir -p -- "$OUT"; then
    {
      echo "dispatch_mode=$MODE"
      echo "trace=$TRACE_FILE"
    } >> "$OUT/RUNINFO.txt"
  else
    echo "WARNING: cannot create $OUT; RUNINFO.txt not written" >&2
  fi
done

if ((${#failed_modes[@]} > 0)); then
  echo "=== DONE WITH FAILURES (modes: ${failed_modes[*]}) $(date) ===" >&2
  exit 1
fi
echo "=== ALL DONE (both modes) $(date) ==="

View File

@@ -126,8 +126,9 @@ done
echo "[isolated] waiting for vLLM health ..." echo "[isolated] waiting for vLLM health ..."
# NIXL init takes ~100-150s per instance even with concurrent launches; # NIXL init takes ~100-150s per instance even with concurrent launches;
# Mooncake is closer to ~30-60s. Use a generous 360s timeout to cover # Mooncake is closer to ~30-60s. Use a generous 360s timeout to cover
# both (90s -> 360s vs the previous 180s). # both (90s -> 360s vs the previous 180s). Override via env for slow nodes
HEALTH_MAX_TRIES=180 # (e.g. HEALTH_MAX_TRIES=300 -> 600s).
HEALTH_MAX_TRIES="${HEALTH_MAX_TRIES:-180}"
for i in $(seq 0 $((N_INSTANCES - 1))); do for i in $(seq 0 $((N_INSTANCES - 1))); do
port=$((BASE_PORT + i)) port=$((BASE_PORT + i))
tries=0 tries=0