bench harness: env-tunable vLLM health timeout + both-modes 5-policy driver
- b3_isolated_policy.sh: HEALTH_MAX_TRIES is now env-overridable (default
unchanged: 180 tries = 360s); slow-node launches can pass
HEALTH_MAX_TRIES=300 (= 600s) to ride out a single-instance startup flake
without aborting the whole arm.
- run_5policy_both_modes.sh: runs run_5policy_600s.sh twice on the SAME ttp
trace with REPLAY_DISPATCH_MODE={tracets,thinktime}, so the only variable is
dispatch mode. Outputs to outputs/policy5_600s_{mode}_<date>/.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env bash
# 5-policy comparison in BOTH dispatch modes on the SAME ttp-annotated trace,
# so the only variable is dispatch-mode (tracets vs thinktime). Fresh vLLM
# (cold APC) per arm via run_5policy_600s.sh -> b3_isolated_policy.sh.
#
# Environment (all optional):
#   ROOT        repository root (default: /home/admin/cpfs/wjh/agentic-kv)
#   TRACE_FILE  trace replayed in both modes (default: a file under $ROOT/traces)
#
# Output: one directory per mode, $ROOT/outputs/policy5_600s_<mode>_<date>/,
# with dispatch_mode= and trace= lines appended to its RUNINFO.txt.
#
# Exit status: 0 if every mode succeeded, 1 if any mode failed,
# 2 if the driver script or the trace file is missing.

# No -e on purpose: a failure in one dispatch mode must not prevent the other
# mode from running. Failures are tracked explicitly and reported at the end.
set -uo pipefail

ROOT="${ROOT:-/home/admin/cpfs/wjh/agentic-kv}"
TRACE_FILE="${TRACE_FILE:-$ROOT/traces/w600_r0.0015_st30_first600s_ttp.jsonl}"
RUN5="$ROOT/microbench/connector_tax/cache_sweep/run_5policy_600s.sh"
DATE="$(date +%Y%m%d_%H%M)"

# Fail fast on a bad setup instead of discovering it once per arm.
if [[ ! -f "$RUN5" ]]; then
  printf 'error: driver script not found: %s\n' "$RUN5" >&2
  exit 2
fi
if [[ ! -r "$TRACE_FILE" ]]; then
  printf 'error: trace file not readable: %s\n' "$TRACE_FILE" >&2
  exit 2
fi

echo "=== 5policy x {tracets,thinktime} | trace=$(basename "$TRACE_FILE") | $DATE ==="

failed=0
for MODE in tracets thinktime; do
  OUT="$ROOT/outputs/policy5_600s_${MODE}_${DATE}"
  echo "############ MODE=$MODE OUT=$OUT $(date) ############"

  # Capture the driver's status rather than discarding it.
  rc=0
  TRACE="$TRACE_FILE" REPLAY_DISPATCH_MODE="$MODE" OUTROOT="$OUT" \
    bash "$RUN5" || rc=$?
  if (( rc != 0 )); then
    printf 'warning: MODE=%s failed (exit %d)\n' "$MODE" "$rc" >&2
    failed=$((failed + 1))
  fi

  # The driver may have died before creating $OUT; make sure the directory
  # exists so the run metadata is still recorded. Done after the driver runs
  # so the driver never sees a pre-existing output directory.
  if mkdir -p -- "$OUT"; then
    {
      echo "dispatch_mode=$MODE"
      echo "trace=$TRACE_FILE"
    } >> "$OUT/RUNINFO.txt"
  else
    printf 'warning: cannot create %s; RUNINFO.txt not written\n' "$OUT" >&2
    failed=$((failed + 1))
  fi
done

echo "=== ALL DONE (both modes) $(date) ==="

if (( failed > 0 )); then
  printf 'error: %d step(s) failed; see warnings above\n' "$failed" >&2
  exit 1
fi
|
||||
@@ -126,8 +126,9 @@ done
|
||||
echo "[isolated] waiting for vLLM health ..."
|
||||
# NIXL init takes ~100-150s per instance even with concurrent launches;
|
||||
# Mooncake is closer to ~30-60s. Use a generous 360s timeout to cover
|
||||
# both (90s -> 360s vs the previous 180s).
|
||||
HEALTH_MAX_TRIES=180
|
||||
# both (90s -> 360s vs the previous 180s). Override via env for slow nodes
|
||||
# (e.g. HEALTH_MAX_TRIES=300 -> 600s).
|
||||
HEALTH_MAX_TRIES="${HEALTH_MAX_TRIES:-180}"
|
||||
for i in $(seq 0 $((N_INSTANCES - 1))); do
|
||||
port=$((BASE_PORT + i))
|
||||
tries=0
|
||||
|
||||
Reference in New Issue
Block a user