PD-sep matrix infrastructure: bench.sh pdsep mode + matrix driver
Adds the experiment harness that gates the empirical claims (C2/C3/C4/C5)
in the PD-sep paper section. Three pieces:
1. scripts/bench.sh: new --mode pdsep with --pd-ratio P:D, and an
--eager flag to re-enable --enforce-eager for the cuda-graph
ablation. pdsep reuses the elastic-mode Mooncake kv_both launch and
swaps the proxy command from --combined to --prefill/--decode.
The baseline and elastic flows are unchanged.
2. analysis/pd_sep_paper_section/scripts/bench_pd_matrix.sh: matrix
driver that runs {combined-ca, pdsep-4p4d, pdsep-6p2d} x cudagraph
x 3 seeds by default (~2 h on dash0). --with-rr adds combined-rr;
--with-eager doubles to ~5 h with the cuda-graph ablation. Skips
completed runs, captures per-instance vLLM logs (needed for C3
step-level KV-utilization mining).
3. fig_kv_memory_wall.pdf: empirical anchor (star) at REPORT.md §3.3's
observed 6P+2D 97% KV utilization. The marker lands on the model's
predicted curve at p90 input, confirming the steady-state analysis.
README updated with the run command, output layout, and the follow-up
plotters that consume outputs/pd_matrix/.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -25,7 +25,7 @@ TRACE="${TRACE:-$PROJECT_DIR/traces/w600_r0.0015_st30.jsonl}"
|
||||
|
||||
# Defaults
|
||||
TAG=""
|
||||
MODE="baseline" # baseline | elastic
|
||||
MODE="baseline" # baseline | elastic | pdsep
|
||||
POLICY="linear" # linear | lmetric | unified
|
||||
POLICY_SET=false
|
||||
N_INSTANCES=8
|
||||
@@ -39,6 +39,8 @@ MAX_BATCHED_TOKENS=""
|
||||
MAX_OFFLOAD_INFLIGHT=""
|
||||
CACHE_GATE_RATIO=""
|
||||
OFFLOAD_MODE=""
|
||||
PD_RATIO="4:4" # P:D split when MODE=pdsep
|
||||
EAGER=false # add --enforce-eager back (cuda-graph ablation)
|
||||
|
||||
# Parse args
|
||||
while [[ $# -gt 0 ]]; do
|
||||
@@ -56,12 +58,18 @@ while [[ $# -gt 0 ]]; do
|
||||
--max-offload-inflight) MAX_OFFLOAD_INFLIGHT="$2"; shift 2 ;;
|
||||
--cache-gate-ratio) CACHE_GATE_RATIO="$2"; shift 2 ;;
|
||||
--offload-mode) OFFLOAD_MODE="$2"; shift 2 ;;
|
||||
--pd-ratio) PD_RATIO="$2"; shift 2 ;;
|
||||
--eager) EAGER=true; shift ;;
|
||||
*) echo "Unknown: $1"; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ -z "$TAG" ]; then
|
||||
echo "Usage: bench.sh --tag NAME --mode {baseline|elastic} [--instances N] [--policy {linear|lmetric|unified}] [--requests N]"
|
||||
echo "Usage: bench.sh --tag NAME --mode {baseline|elastic|pdsep}"
|
||||
echo " [--policy {linear|lmetric|unified}] [--instances N]"
|
||||
echo " [--pd-ratio P:D] (only with --mode pdsep, default 4:4)"
|
||||
echo " [--eager] (re-enable --enforce-eager for the cuda-graph ablation)"
|
||||
echo " [--requests N] [--trace PATH]"
|
||||
echo " Trace QPS is controlled by sample_trace.py --sample-ratio, not by bench.sh."
|
||||
exit 1
|
||||
fi
|
||||
@@ -70,6 +78,15 @@ if [ "$MODE" = "elastic" ] && [ "$POLICY_SET" = "false" ]; then
|
||||
POLICY="unified"
|
||||
fi
|
||||
|
||||
if [ "$MODE" = "pdsep" ]; then
|
||||
N_P_INST=${PD_RATIO%%:*}
|
||||
N_D_INST=${PD_RATIO##*:}
|
||||
if [ $((N_P_INST + N_D_INST)) -ne "$N_INSTANCES" ]; then
|
||||
echo "[ERROR] --pd-ratio $PD_RATIO must sum to --instances $N_INSTANCES"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
OUTDIR="$PROJECT_DIR/outputs/$TAG"
|
||||
if [ -d "$OUTDIR" ] && [ -f "$OUTDIR/metrics.jsonl" ]; then
|
||||
echo "[ERROR] Output directory $OUTDIR already exists with data. Use a different --tag."
|
||||
@@ -133,13 +150,23 @@ launch_instances() {
|
||||
if [ -n "$MAX_BATCHED_TOKENS" ]; then
|
||||
vllm_extra_args="--max-num-batched-tokens $MAX_BATCHED_TOKENS"
|
||||
fi
|
||||
if [ "$EAGER" = "true" ]; then
|
||||
vllm_extra_args="$vllm_extra_args --enforce-eager"
|
||||
fi
|
||||
|
||||
# elastic and pdsep both run Mooncake kv_both; difference is only the
|
||||
# proxy routing. baseline runs plain vLLM (no Mooncake).
|
||||
local use_mooncake=false
|
||||
if [ "$MODE" = "elastic" ] || [ "$MODE" = "pdsep" ]; then
|
||||
use_mooncake=true
|
||||
fi
|
||||
|
||||
for i in $(seq 0 $((N_INSTANCES - 1))); do
|
||||
local port=$((BASE_PORT + i))
|
||||
local master=$((29500 + i))
|
||||
local logfile="$OUTDIR/vllm_inst_${i}.log"
|
||||
|
||||
if [ "$MODE" = "elastic" ]; then
|
||||
if [ "$use_mooncake" = "true" ]; then
|
||||
PYTHONHASHSEED=42 \
|
||||
VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i)) \
|
||||
MASTER_PORT=$master \
|
||||
@@ -186,8 +213,8 @@ launch_instances() {
|
||||
echo " inst_$i healthy"
|
||||
done
|
||||
|
||||
# Wait for bootstrap (elastic only)
|
||||
if [ "$MODE" = "elastic" ]; then
|
||||
# Wait for bootstrap (Mooncake modes only)
|
||||
if [ "$MODE" = "elastic" ] || [ "$MODE" = "pdsep" ]; then
|
||||
echo "[launch] Waiting for Mooncake bootstrap servers..."
|
||||
for i in $(seq 0 $((N_INSTANCES - 1))); do
|
||||
local bp=$((8998 + i))
|
||||
@@ -210,10 +237,6 @@ launch_instances() {
|
||||
|
||||
launch_proxy() {
|
||||
echo "[proxy] Starting (mode=$MODE, policy=$POLICY)..."
|
||||
local combined_args=""
|
||||
for i in $(seq 0 $((N_INSTANCES - 1))); do
|
||||
combined_args="$combined_args http://127.0.0.1:$((BASE_PORT + i))"
|
||||
done
|
||||
|
||||
local extra_args="--policy $POLICY"
|
||||
if [ -n "$OVERLOAD_FACTOR_ARG" ]; then
|
||||
@@ -228,20 +251,38 @@ launch_proxy() {
|
||||
if [ -n "$OFFLOAD_MODE" ]; then
|
||||
extra_args="$extra_args --offload-mode $OFFLOAD_MODE"
|
||||
fi
|
||||
if [ "$MODE" = "elastic" ]; then
|
||||
local bp_list=""
|
||||
for i in $(seq 0 $((N_INSTANCES - 1))); do
|
||||
bp_list="${bp_list:+$bp_list,}$((8998 + i))"
|
||||
|
||||
local proxy_mode_args=""
|
||||
if [ "$MODE" = "pdsep" ]; then
|
||||
# First N_P_INST instances are prefill (with their bootstrap ports),
|
||||
# remaining N_D_INST are decode.
|
||||
for i in $(seq 0 $((N_P_INST - 1))); do
|
||||
proxy_mode_args="$proxy_mode_args --prefill http://127.0.0.1:$((BASE_PORT + i)) $((8998 + i))"
|
||||
done
|
||||
if [ "$NO_OFFLOAD" = "true" ]; then
|
||||
extra_args="$extra_args --bootstrap-ports $bp_list"
|
||||
else
|
||||
extra_args="$extra_args --offload --heavy-threshold $HEAVY_THRESHOLD --bootstrap-ports $bp_list"
|
||||
for i in $(seq $N_P_INST $((N_INSTANCES - 1))); do
|
||||
proxy_mode_args="$proxy_mode_args --decode http://127.0.0.1:$((BASE_PORT + i))"
|
||||
done
|
||||
else
|
||||
local combined_args=""
|
||||
for i in $(seq 0 $((N_INSTANCES - 1))); do
|
||||
combined_args="$combined_args http://127.0.0.1:$((BASE_PORT + i))"
|
||||
done
|
||||
proxy_mode_args="--combined $combined_args"
|
||||
if [ "$MODE" = "elastic" ]; then
|
||||
local bp_list=""
|
||||
for i in $(seq 0 $((N_INSTANCES - 1))); do
|
||||
bp_list="${bp_list:+$bp_list,}$((8998 + i))"
|
||||
done
|
||||
if [ "$NO_OFFLOAD" = "true" ]; then
|
||||
extra_args="$extra_args --bootstrap-ports $bp_list"
|
||||
else
|
||||
extra_args="$extra_args --offload --heavy-threshold $HEAVY_THRESHOLD --bootstrap-ports $bp_list"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
$PYTHON "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
|
||||
--combined $combined_args \
|
||||
$proxy_mode_args \
|
||||
--port $PROXY_PORT \
|
||||
$extra_args \
|
||||
> "$OUTDIR/proxy.log" 2>&1 &
|
||||
|
||||
Reference in New Issue
Block a user