Captures 5 runs from the experiment matrix (combined-ca x3 seeds, pdsep-4p4d seed1, pdsep-6p2d seed1) on traces/w600_r0.0015_st30.jsonl with cuda graphs enabled.

The headline:
  combined-ca: TTFT p50 0.91s, success 99.5%
  pdsep-4p4d:  TTFT p50 62.8s, success 52% (69x worse, half dropped)
  pdsep-6p2d:  TTFT p50 51.1s, success 68% (56x worse, a third dropped)

C2 (fig_c2): headline bars per config with error bars.

C3 (fig_c3): per-instance KV utilization time-series. Both PD-sep splits hit the memory wall, but the side differs by P:D ratio -- 4P+4D pins the P-side, 6P+2D pins both sides (D-side back-pressures P-side).

C4 (fig_c4): TTFT stacked breakdown. 99% of PD-sep TTFT is P-side prefill compute; D-side wait + first token is <=1.2s. The bottleneck is P-side prefill queueing, not D-side decode wait as the original analytical model assumed.

system_analysis.md gains a Layer 5b that reconciles the analytical KV-wall model (which considered D-side only) with the empirical finding that the wall hits whichever side has fewer GPUs, and co-saturates both at extreme splits via D-side back-pressure.

plot_pd_matrix.py ingests outputs/pd_matrix/* into all four figures.

bench.sh gained AGENTIC_STEP_LOG_DIR hooks for future runs (set during this work but not used by the current matrix's data).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
181 lines
5.7 KiB
Bash
Executable File
181 lines
5.7 KiB
Bash
Executable File
#!/bin/bash
|
|
# Experiment matrix that rigorously evaluates PD separation vs Combined on
|
|
# the agentic trace (traces/w600_r0.0015_st30.jsonl), using cuda graphs by
|
|
# default and capturing step-level KV utilization for the C3 time-series
|
|
# figure.
|
|
#
|
|
# Matrix (default minimal set, ~2 h wall-clock on 8 x H20):
|
|
# Configs: combined-ca pdsep-4p4d pdsep-6p2d
|
|
# Modes: cudagraph
|
|
# Seeds: 1, 2, 3
|
|
#
|
|
# Optional extensions:
|
|
# --with-rr also run combined-rr (refresh C7 routing-lever data)
|
|
# --with-eager also run each config with --enforce-eager (cuda-graph
|
|
# ablation; this doubles wall-clock)
|
|
#
|
|
# Output structure (on the host that runs this):
|
|
# outputs/pd_matrix/
|
|
# <config>_<mode>_seed<N>/
|
|
# config.json, metrics.jsonl, metrics.summary.json
|
|
# breakdown.json, stats.json, apc.txt
|
|
# proxy.log, replayer.log, gpu_util.csv, gpu_snapshot.csv
|
|
# vllm_inst_*.log <-- step-level "KV cache: X%" lines for C3
|
|
#
|
|
# Run on dash0:
|
|
# cd ~/agentic-kv
|
|
# bash analysis/pd_sep_paper_section/scripts/bench_pd_matrix.sh
|
|
|
|
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Locations are resolved relative to this script so it can be launched from
# any working directory; the project root is three levels above the script.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
PROJECT_DIR="$(
  cd "${SCRIPT_DIR}/../../.." && pwd
)"
BENCH="${PROJECT_DIR}/scripts/bench.sh"
TRACE_DEFAULT="${PROJECT_DIR}/traces/w600_r0.0015_st30.jsonl"

# Defaults (each can be overridden by the command-line flags parsed below).
TRACE="${TRACE_DEFAULT}"   # trace file handed to bench.sh
REQUESTS=850               # value forwarded as bench.sh --requests
SEEDS=3                    # seeds 1..SEEDS are run for every config/mode
TAG_PREFIX="pd_matrix"     # outputs land under outputs/<TAG_PREFIX>/
ONLY=""                    # comma-separated list of tags to run (subset of the matrix)

# Boolean switches, stored as the strings "true" / "false".
WITH_RR=false
WITH_EAGER=false
DRY_RUN=false
|
|
|
|
# Print usage: the header comment of this script (line 2 up to the first
# non-comment line, with the leading "# " removed) followed by the flag list.
# Scanning for the end of the comment block avoids a hard-coded line range
# that spills into the code below the header.
print_help() {
  awk 'NR == 1 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "$0"
  cat <<'EOF'

All flags:
  --trace PATH         trace file forwarded to bench.sh --trace
  --requests N         value forwarded to bench.sh --requests
  --seeds N            run seeds 1..N for every config/mode
  --with-rr            also run the combined-rr config
  --with-eager         also run every config in eager mode
  --tag-prefix NAME    write results under outputs/NAME/
  --only TAG[,TAG...]  run only these tags (<config>_<mode>_seed<N>)
  --dry-run            print the bench.sh commands without running them
  -h, --help           show this help and exit
EOF
}

# Parse the command line into the global option variables.
#
# Globals written: TRACE, REQUESTS, SEEDS, WITH_RR, WITH_EAGER, TAG_PREFIX,
#                  ONLY, DRY_RUN
# Arguments:       the script's command-line arguments
# Exits:           0 after --help; 1 on an unknown flag, a flag that is
#                  missing its value, or a non-integer --seeds
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --trace | --requests | --seeds | --tag-prefix | --only)
        # Check explicitly: under `set -u` a missing value would otherwise
        # abort with an unhelpful "$2: unbound variable".
        if [[ $# -lt 2 ]]; then
          echo "[ERROR] $1 requires a value" >&2
          exit 1
        fi
        case "$1" in
          --trace) TRACE="$2" ;;
          --requests) REQUESTS="$2" ;;
          --seeds)
            # SEEDS is used in arithmetic and as a loop bound, so reject
            # anything that is not a plain non-negative integer.
            if [[ ! "$2" =~ ^[0-9]+$ ]]; then
              echo "[ERROR] --seeds expects a non-negative integer, got: $2" >&2
              exit 1
            fi
            # Force base 10 so values such as "08" are not read as octal.
            SEEDS=$((10#$2))
            ;;
          --tag-prefix) TAG_PREFIX="$2" ;;
          --only) ONLY="$2" ;;
        esac
        shift 2
        ;;
      --with-rr)
        WITH_RR=true
        shift
        ;;
      --with-eager)
        WITH_EAGER=true
        shift
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      -h | --help)
        print_help
        exit 0
        ;;
      *)
        echo "Unknown: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
|
|
|
# Build set of allowed tags from --only (if provided).
# Initialised with =() so the set always starts empty.
declare -A ALLOWED=()

# Split a comma-separated tag list and record each tag as a key of ALLOWED.
#
# Globals written: ALLOWED
# Arguments:       $1 - comma-separated tag list (may be empty)
#
# Surrounding whitespace is trimmed with parameter expansion rather than by
# piping through xargs, which would also interpret quotes and backslashes.
# Empty elements (as in "a,,b" or "a, ,b") are skipped: an empty string is
# not a valid associative-array subscript and would abort the script.
build_allowed_tags() {
  local raw="$1"
  local tag
  local -a parts=()

  [[ -n "$raw" ]] || return 0

  IFS=',' read -r -a parts <<< "$raw"
  # The ${parts[@]+...} form keeps an empty array safe under `set -u`.
  for tag in ${parts[@]+"${parts[@]}"}; do
    tag="${tag#"${tag%%[![:space:]]*}"}" # strip leading whitespace
    tag="${tag%"${tag##*[![:space:]]}"}" # strip trailing whitespace
    [[ -n "$tag" ]] || continue
    ALLOWED["$tag"]=1
  done
}

build_allowed_tags "${ONLY:-}"
|
|
# Decide whether a run tag passes the --only filter.
#
# Globals read: ONLY, ALLOWED
# Arguments:    $1 - tag to look up
# Returns:      0 when no filter is active or the tag is a key of ALLOWED,
#               non-zero otherwise
is_allowed() {
  # No --only given: every tag is accepted.
  if [[ -z "$ONLY" ]]; then
    return 0
  fi

  local wanted="$1"
  [[ -n "${ALLOWED[$wanted]:-}" ]]
}
|
|
|
|
# Pre-flight checks: stop before any run if the trace file is missing or
# bench.sh is absent / lacks the executable bit.
[[ -f "$TRACE" ]] || {
  echo "[ERROR] trace not found: $TRACE"
  exit 1
}

[[ -x "$BENCH" ]] || {
  echo "[ERROR] bench.sh not found or not executable: $BENCH"
  exit 1
}
|
|
|
|
# Config table: each entry is "name|bench.sh args". The matrix loop splits an
# entry at the "|" into the run name and the flags handed to bench.sh.
#
# Both arrays are initialised with an explicit list: a bare `declare -a NAME`
# would turn an inherited scalar of the same name into element 0, adding a
# bogus entry to the matrix.
declare -a CONFIGS=(
  "combined-ca|--mode baseline --policy linear"
  "pdsep-4p4d|--mode pdsep --pd-ratio 4:4 --policy linear"
  "pdsep-6p2d|--mode pdsep --pd-ratio 6:2 --policy linear"
)
if [[ "$WITH_RR" == "true" ]]; then
  # lmetric without affinity is the cleanest available proxy for round-robin;
  # pure RR would need a "--policy rr" added to cache_aware_proxy. Until
  # then this slot is a placeholder so the matrix script stays uniform.
  CONFIGS+=("combined-rr|--mode baseline --policy lmetric")
fi

# Mode table: "name|extra bench.sh args". cudagraph adds no extra flags.
declare -a MODES=(
  "cudagraph|"
)
if [[ "$WITH_EAGER" == "true" ]]; then
  MODES+=("eager|--eager")
fi
|
|
|
|
# Pretty-print matrix: one banner describing the inputs and the size of the
# full config x mode x seed product, emitted with a single printf.
printf '%s\n' \
  "==================================================================" \
  " PD-sep paper section experiment matrix" \
  " trace = $TRACE" \
  " requests = $REQUESTS" \
  " seeds = 1..$SEEDS" \
  " configs = ${#CONFIGS[@]}" \
  " modes = ${#MODES[@]}" \
  " total runs = $(( ${#CONFIGS[@]} * ${#MODES[@]} * SEEDS ))" \
  "=================================================================="
|
|
|
|
# Run one cell of the matrix (config x mode x seed) through bench.sh.
#
# Globals read: TAG_PREFIX, PROJECT_DIR, BENCH, TRACE, REQUESTS, DRY_RUN
# Arguments:
#   $1 - config name            $2 - config args (whitespace-separated flags)
#   $3 - mode name              $4 - mode args (may be empty)
#   $5 - seed number
# Outputs: progress lines on stdout
# Returns: 0 when the run is skipped, dry-run, or succeeds; otherwise the
#          exit status of bench.sh
run_one() {
  local config_name="$1" config_args="$2"
  local mode_name="$3" mode_args="$4"
  local seed="$5"

  local tag="$TAG_PREFIX/${config_name}_${mode_name}_seed${seed}"
  local outdir="$PROJECT_DIR/outputs/$tag"

  # A summary file marks a finished run; leave it alone.
  if [[ -f "$outdir/metrics.summary.json" ]]; then
    echo "[skip] $tag (already complete)"
    return 0
  fi

  # Build the command as an argv array rather than a string for eval, so a
  # trace path containing spaces or shell metacharacters reaches bench.sh as
  # one intact argument. The config/mode flag strings are split on
  # whitespace deliberately.
  local -a extra_args=()
  read -r -a extra_args <<< "$config_args $mode_args"
  local -a cmd=(bash "$BENCH" --tag "$tag")
  if (( ${#extra_args[@]} > 0 )); then
    cmd+=("${extra_args[@]}")
  fi
  cmd+=(--trace "$TRACE" --requests "$REQUESTS")

  echo
  echo "[run] $tag"
  echo " cmd: ${cmd[*]}"
  if [[ "$DRY_RUN" == "true" ]]; then
    # Nothing may be modified in dry-run mode, so return before the cleanup.
    return 0
  fi

  # Clear partial runs. ${outdir:?} aborts instead of expanding to an empty
  # path, and -- stops option parsing.
  rm -rf -- "${outdir:?}"

  # PYTHONHASHSEED is set inside bench.sh for elastic, but not for the other
  # modes. Export a different seed per run for reproducibility of any RNG
  # inside the proxy/replayer.
  local rc=0
  PYTHONHASHSEED=$((42 + seed)) "${cmd[@]}" || rc=$?
  if (( rc != 0 )); then
    echo "[FAIL] $tag exited rc=$rc"
    return "$rc"
  fi
  echo "[done] $tag"
}
|
|
|
|
# Walk the config x mode x seed matrix, run every cell that passes the --only
# filter, and print a summary.
#
# Globals read:    CONFIGS, MODES, SEEDS, ONLY, PROJECT_DIR, TAG_PREFIX
# Globals written: START_TS, N_DONE, N_FAIL, N_TOTAL, ELAPSED
# Outputs:         progress and summary lines on stdout; a warning on stderr
#                  when --only selected nothing
#
# N_TOTAL counts only the selected cells, so the "done/total" figure in the
# summary stays meaningful when --only restricts the matrix.
run_matrix() {
  local c m s cfg_name cfg_args mode_name mode_args tag_name

  START_TS=$(date +%s)
  N_DONE=0
  N_FAIL=0
  N_TOTAL=0

  for c in "${CONFIGS[@]}"; do
    # Entries are "name|args": the name is everything before the first "|",
    # the args everything after the last one.
    cfg_name="${c%%|*}"
    cfg_args="${c##*|}"
    for m in "${MODES[@]}"; do
      mode_name="${m%%|*}"
      mode_args="${m##*|}"
      for (( s = 1; s <= SEEDS; s++ )); do
        tag_name="${cfg_name}_${mode_name}_seed${s}"
        if ! is_allowed "$tag_name"; then
          continue
        fi
        N_TOTAL=$((N_TOTAL + 1))
        # A failed cell is counted and the matrix carries on with the rest.
        if run_one "$cfg_name" "$cfg_args" "$mode_name" "$mode_args" "$s"; then
          N_DONE=$((N_DONE + 1))
        else
          N_FAIL=$((N_FAIL + 1))
        fi
        ELAPSED=$(( $(date +%s) - START_TS ))
        echo "[progress] $N_DONE done, $N_FAIL failed, ${ELAPSED}s elapsed"
      done
    done
  done

  if [[ -n "${ONLY:-}" ]] && (( N_TOTAL == 0 )); then
    echo "[WARN] --only matched no run in the matrix: $ONLY" >&2
  fi

  echo
  echo "=================================================================="
  echo " MATRIX COMPLETE: $N_DONE/$N_TOTAL succeeded, $N_FAIL failed"
  echo " elapsed: $(( ($(date +%s) - START_TS) / 60 )) min"
  echo " outputs: $PROJECT_DIR/outputs/$TAG_PREFIX/"
  echo "=================================================================="
}

run_matrix

# Report failures through the exit status so wrappers can detect a partially
# failed matrix; the summary above has already been printed.
if (( N_FAIL > 0 )); then
  exit 1
fi
|