PD-sep matrix infrastructure: bench.sh pdsep mode + matrix driver

Adds the experiment harness that gates the empirical claims (C2/C3/C4/C5)
in the PD-sep paper section. Three pieces:

  1. scripts/bench.sh: new --mode pdsep with --pd-ratio P:D, and an
     --eager flag to re-enable --enforce-eager for the cuda-graph
     ablation. pdsep reuses the elastic-mode Mooncake kv_both launch and
     swaps the proxy command from --combined to --prefill/--decode.
     The baseline and elastic flows are unchanged.

  2. analysis/pd_sep_paper_section/scripts/bench_pd_matrix.sh: matrix
     driver that runs {combined-ca, pdsep-4p4d, pdsep-6p2d} x cudagraph
     x 3 seeds by default (~2 h on dash0). --with-rr adds combined-rr;
     --with-eager adds the cuda-graph ablation, doubling the matrix
     (~5 h). Skips completed runs and captures per-instance vLLM logs
     (needed for C3 step-level KV-utilization mining).

  3. fig_kv_memory_wall.pdf: adds an empirical anchor (star marker) at the
     97% KV utilization observed for 6P+2D in REPORT.md §3.3. The marker
     lands on the model's predicted curve at p90 input, confirming the
     steady-state analysis.

The README is updated with the run command, the output layout, and the
follow-up plotters that consume outputs/pd_matrix/.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 11:47:33 +08:00
parent 4028c587b1
commit 21ffb3d4f7
5 changed files with 279 additions and 27 deletions

View File

@@ -25,7 +25,7 @@ TRACE="${TRACE:-$PROJECT_DIR/traces/w600_r0.0015_st30.jsonl}"
# Defaults
TAG=""
MODE="baseline" # baseline | elastic
MODE="baseline" # baseline | elastic | pdsep
POLICY="linear" # linear | lmetric | unified
POLICY_SET=false
N_INSTANCES=8
@@ -39,6 +39,8 @@ MAX_BATCHED_TOKENS=""
MAX_OFFLOAD_INFLIGHT=""
CACHE_GATE_RATIO=""
OFFLOAD_MODE=""
PD_RATIO="4:4" # P:D split when MODE=pdsep
EAGER=false # add --enforce-eager back (cuda-graph ablation)
# Parse args
while [[ $# -gt 0 ]]; do
@@ -56,12 +58,18 @@ while [[ $# -gt 0 ]]; do
--max-offload-inflight) MAX_OFFLOAD_INFLIGHT="$2"; shift 2 ;;
--cache-gate-ratio) CACHE_GATE_RATIO="$2"; shift 2 ;;
--offload-mode) OFFLOAD_MODE="$2"; shift 2 ;;
--pd-ratio) PD_RATIO="$2"; shift 2 ;;
--eager) EAGER=true; shift ;;
*) echo "Unknown: $1"; exit 1 ;;
esac
done
if [ -z "$TAG" ]; then
echo "Usage: bench.sh --tag NAME --mode {baseline|elastic} [--instances N] [--policy {linear|lmetric|unified}] [--requests N]"
echo "Usage: bench.sh --tag NAME --mode {baseline|elastic|pdsep}"
echo " [--policy {linear|lmetric|unified}] [--instances N]"
echo " [--pd-ratio P:D] (only with --mode pdsep, default 4:4)"
echo " [--eager] (re-enable --enforce-eager for the cuda-graph ablation)"
echo " [--requests N] [--trace PATH]"
echo " Trace QPS is controlled by sample_trace.py --sample-ratio, not by bench.sh."
exit 1
fi
@@ -70,6 +78,15 @@ if [ "$MODE" = "elastic" ] && [ "$POLICY_SET" = "false" ]; then
POLICY="unified"
fi
#######################################
# Validate a prefill:decode split and publish it for the rest of the script.
# Globals:   N_P_INST (written) - number of prefill instances
#            N_D_INST (written) - number of decode instances
# Arguments: $1 - ratio string in the form "P:D"
#            $2 - total instance count the two parts must add up to
# Outputs:   an "[ERROR] ..." line on stdout when the ratio is rejected
# Returns:   0 when the ratio is well-formed and consistent, 1 otherwise
#######################################
_parse_pd_ratio() {
  local ratio=$1
  local total=$2

  # Require exactly "<digits>:<digits>". Plain %%:* / ##*: stripping returns
  # the whole string for a colon-less value, so "4" would silently act as 4:4.
  if ! [[ "$ratio" =~ ^([0-9]+):([0-9]+)$ ]]; then
    echo "[ERROR] --pd-ratio must look like P:D (two integers), got '$ratio'"
    return 1
  fi

  # Force base 10 so zero-padded values such as "08" are not read as octal.
  N_P_INST=$((10#${BASH_REMATCH[1]}))
  N_D_INST=$((10#${BASH_REMATCH[2]}))

  # Each side needs at least one instance; otherwise the proxy would be
  # started without any --prefill or without any --decode endpoint.
  if [ "$N_P_INST" -lt 1 ] || [ "$N_D_INST" -lt 1 ]; then
    echo "[ERROR] --pd-ratio $ratio needs at least one prefill and one decode instance"
    return 1
  fi

  # Written as a negated -eq so that a non-numeric instance count (test
  # exits with status 2) is reported as a mismatch rather than slipping by.
  if ! [ "$((N_P_INST + N_D_INST))" -eq "$total" ]; then
    echo "[ERROR] --pd-ratio $ratio must sum to --instances $total"
    return 1
  fi
  return 0
}

# pdsep mode: split the N_INSTANCES instances into prefill and decode groups.
if [ "$MODE" = "pdsep" ]; then
  _parse_pd_ratio "$PD_RATIO" "$N_INSTANCES" || exit 1
fi
OUTDIR="$PROJECT_DIR/outputs/$TAG"
if [ -d "$OUTDIR" ] && [ -f "$OUTDIR/metrics.jsonl" ]; then
echo "[ERROR] Output directory $OUTDIR already exists with data. Use a different --tag."
@@ -133,13 +150,23 @@ launch_instances() {
if [ -n "$MAX_BATCHED_TOKENS" ]; then
vllm_extra_args="--max-num-batched-tokens $MAX_BATCHED_TOKENS"
fi
if [ "$EAGER" = "true" ]; then
vllm_extra_args="$vllm_extra_args --enforce-eager"
fi
# elastic and pdsep both run Mooncake kv_both; difference is only the
# proxy routing. baseline runs plain vLLM (no Mooncake).
local use_mooncake=false
if [ "$MODE" = "elastic" ] || [ "$MODE" = "pdsep" ]; then
use_mooncake=true
fi
for i in $(seq 0 $((N_INSTANCES - 1))); do
local port=$((BASE_PORT + i))
local master=$((29500 + i))
local logfile="$OUTDIR/vllm_inst_${i}.log"
if [ "$MODE" = "elastic" ]; then
if [ "$use_mooncake" = "true" ]; then
PYTHONHASHSEED=42 \
VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i)) \
MASTER_PORT=$master \
@@ -186,8 +213,8 @@ launch_instances() {
echo " inst_$i healthy"
done
# Wait for bootstrap (elastic only)
if [ "$MODE" = "elastic" ]; then
# Wait for bootstrap (Mooncake modes only)
if [ "$MODE" = "elastic" ] || [ "$MODE" = "pdsep" ]; then
echo "[launch] Waiting for Mooncake bootstrap servers..."
for i in $(seq 0 $((N_INSTANCES - 1))); do
local bp=$((8998 + i))
@@ -210,10 +237,6 @@ launch_instances() {
launch_proxy() {
echo "[proxy] Starting (mode=$MODE, policy=$POLICY)..."
local combined_args=""
for i in $(seq 0 $((N_INSTANCES - 1))); do
combined_args="$combined_args http://127.0.0.1:$((BASE_PORT + i))"
done
local extra_args="--policy $POLICY"
if [ -n "$OVERLOAD_FACTOR_ARG" ]; then
@@ -228,20 +251,38 @@ launch_proxy() {
if [ -n "$OFFLOAD_MODE" ]; then
extra_args="$extra_args --offload-mode $OFFLOAD_MODE"
fi
if [ "$MODE" = "elastic" ]; then
local bp_list=""
for i in $(seq 0 $((N_INSTANCES - 1))); do
bp_list="${bp_list:+$bp_list,}$((8998 + i))"
local proxy_mode_args=""
if [ "$MODE" = "pdsep" ]; then
# First N_P_INST instances are prefill (with their bootstrap ports),
# remaining N_D_INST are decode.
for i in $(seq 0 $((N_P_INST - 1))); do
proxy_mode_args="$proxy_mode_args --prefill http://127.0.0.1:$((BASE_PORT + i)) $((8998 + i))"
done
if [ "$NO_OFFLOAD" = "true" ]; then
extra_args="$extra_args --bootstrap-ports $bp_list"
else
extra_args="$extra_args --offload --heavy-threshold $HEAVY_THRESHOLD --bootstrap-ports $bp_list"
for i in $(seq $N_P_INST $((N_INSTANCES - 1))); do
proxy_mode_args="$proxy_mode_args --decode http://127.0.0.1:$((BASE_PORT + i))"
done
else
local combined_args=""
for i in $(seq 0 $((N_INSTANCES - 1))); do
combined_args="$combined_args http://127.0.0.1:$((BASE_PORT + i))"
done
proxy_mode_args="--combined $combined_args"
if [ "$MODE" = "elastic" ]; then
local bp_list=""
for i in $(seq 0 $((N_INSTANCES - 1))); do
bp_list="${bp_list:+$bp_list,}$((8998 + i))"
done
if [ "$NO_OFFLOAD" = "true" ]; then
extra_args="$extra_args --bootstrap-ports $bp_list"
else
extra_args="$extra_args --offload --heavy-threshold $HEAVY_THRESHOLD --bootstrap-ports $bp_list"
fi
fi
fi
$PYTHON "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
--combined $combined_args \
$proxy_mode_args \
--port $PROXY_PORT \
$extra_args \
> "$OUTDIR/proxy.log" 2>&1 &