PD-sep matrix infrastructure: bench.sh pdsep mode + matrix driver

Adds the experiment harness that gates the empirical claims (C2/C3/C4/C5) in the PD-sep paper section. Three pieces:

1. scripts/bench.sh: new --mode pdsep with --pd-ratio P:D, and an --eager flag that re-enables --enforce-eager for the cuda-graph ablation. pdsep reuses the elastic-mode Mooncake kv_both launch and swaps the proxy command from --combined to --prefill/--decode. The baseline and elastic flows are unchanged.

2. analysis/pd_sep_paper_section/scripts/bench_pd_matrix.sh: matrix driver that runs {combined-ca, pdsep-4p4d, pdsep-6p2d} x cudagraph x 3 seeds by default (~2 h on dash0). --with-rr adds combined-rr; --with-eager adds the cuda-graph ablation, doubling the matrix (~5 h). Skips completed runs and captures per-instance vLLM logs (needed for C3 step-level KV-utilization mining).

3. fig_kv_memory_wall.pdf: adds an empirical anchor (star) at the 97% KV utilization observed for 6P+2D in REPORT.md §3.3. The marker lands on the model's predicted curve at p90 input, confirming the steady-state analysis.

README updated with the run command, output layout, and the follow-up plotters that consume outputs/pd_matrix/.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 11:47:33 +08:00
parent 4028c587b1
commit 21ffb3d4f7
5 changed files with 279 additions and 27 deletions
--- a/scripts/bench.sh
+++ b/scripts/bench.sh
@@ -25,7 +25,7 @@ TRACE="${TRACE:-$PROJECT_DIR/traces/w600_r0.0015_st30.jsonl}"

 # Defaults
 TAG=""
-MODE="baseline"     # baseline | elastic
+MODE="baseline"     # baseline | elastic | pdsep
 POLICY="linear"     # linear | lmetric | unified
 POLICY_SET=false
 N_INSTANCES=8
@@ -39,6 +39,8 @@ MAX_BATCHED_TOKENS=""
 MAX_OFFLOAD_INFLIGHT=""
 CACHE_GATE_RATIO=""
 OFFLOAD_MODE=""
+PD_RATIO="4:4"      # P:D split when MODE=pdsep
+EAGER=false         # add --enforce-eager back (cuda-graph ablation)

 # Parse args
 while [[ $# -gt 0 ]]; do
@@ -56,12 +58,18 @@ while [[ $# -gt 0 ]]; do
        --max-offload-inflight) MAX_OFFLOAD_INFLIGHT="$2"; shift 2 ;;
        --cache-gate-ratio) CACHE_GATE_RATIO="$2"; shift 2 ;;
        --offload-mode) OFFLOAD_MODE="$2"; shift 2 ;;
+        --pd-ratio) PD_RATIO="$2"; shift 2 ;;
+        --eager) EAGER=true; shift ;;
        *) echo "Unknown: $1"; exit 1 ;;
    esac
 done

 if [ -z "$TAG" ]; then
-    echo "Usage: bench.sh --tag NAME --mode {baseline|elastic} [--instances N] [--policy {linear|lmetric|unified}] [--requests N]"
+    echo "Usage: bench.sh --tag NAME --mode {baseline|elastic|pdsep}"
+    echo "                [--policy {linear|lmetric|unified}] [--instances N]"
+    echo "                [--pd-ratio P:D]   (only with --mode pdsep, default 4:4)"
+    echo "                [--eager]          (re-enable --enforce-eager for the cuda-graph ablation)"
+    echo "                [--requests N] [--trace PATH]"
    echo "  Trace QPS is controlled by sample_trace.py --sample-ratio, not by bench.sh."
    exit 1
 fi
@@ -70,6 +78,15 @@ if [ "$MODE" = "elastic" ] && [ "$POLICY_SET" = "false" ]; then
    POLICY="unified"
 fi

+if [ "$MODE" = "pdsep" ]; then
+    N_P_INST=${PD_RATIO%%:*}
+    N_D_INST=${PD_RATIO##*:}
+    if [ $((N_P_INST + N_D_INST)) -ne "$N_INSTANCES" ]; then
+        echo "[ERROR] --pd-ratio $PD_RATIO must sum to --instances $N_INSTANCES"
+        exit 1
+    fi
+fi
+
 OUTDIR="$PROJECT_DIR/outputs/$TAG"
 if [ -d "$OUTDIR" ] && [ -f "$OUTDIR/metrics.jsonl" ]; then
    echo "[ERROR] Output directory $OUTDIR already exists with data. Use a different --tag."
@@ -133,13 +150,23 @@ launch_instances() {
    if [ -n "$MAX_BATCHED_TOKENS" ]; then
        vllm_extra_args="--max-num-batched-tokens $MAX_BATCHED_TOKENS"
    fi
+    if [ "$EAGER" = "true" ]; then
+        vllm_extra_args="$vllm_extra_args --enforce-eager"
+    fi
+
+    # elastic and pdsep both run Mooncake kv_both; difference is only the
+    # proxy routing. baseline runs plain vLLM (no Mooncake).
+    local use_mooncake=false
+    if [ "$MODE" = "elastic" ] || [ "$MODE" = "pdsep" ]; then
+        use_mooncake=true
+    fi

    for i in $(seq 0 $((N_INSTANCES - 1))); do
        local port=$((BASE_PORT + i))
        local master=$((29500 + i))
        local logfile="$OUTDIR/vllm_inst_${i}.log"

-        if [ "$MODE" = "elastic" ]; then
+        if [ "$use_mooncake" = "true" ]; then
            PYTHONHASHSEED=42 \
            VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i)) \
            MASTER_PORT=$master \
@@ -186,8 +213,8 @@ launch_instances() {
        echo "  inst_$i healthy"
    done

-    # Wait for bootstrap (elastic only)
-    if [ "$MODE" = "elastic" ]; then
+    # Wait for bootstrap (Mooncake modes only)
+    if [ "$MODE" = "elastic" ] || [ "$MODE" = "pdsep" ]; then
        echo "[launch] Waiting for Mooncake bootstrap servers..."
        for i in $(seq 0 $((N_INSTANCES - 1))); do
            local bp=$((8998 + i))
@@ -210,10 +237,6 @@ launch_instances() {

 launch_proxy() {
    echo "[proxy] Starting (mode=$MODE, policy=$POLICY)..."
-    local combined_args=""
-    for i in $(seq 0 $((N_INSTANCES - 1))); do
-        combined_args="$combined_args http://127.0.0.1:$((BASE_PORT + i))"
-    done

    local extra_args="--policy $POLICY"
    if [ -n "$OVERLOAD_FACTOR_ARG" ]; then
@@ -228,20 +251,38 @@ launch_proxy() {
    if [ -n "$OFFLOAD_MODE" ]; then
        extra_args="$extra_args --offload-mode $OFFLOAD_MODE"
    fi
-    if [ "$MODE" = "elastic" ]; then
-        local bp_list=""
-        for i in $(seq 0 $((N_INSTANCES - 1))); do
-            bp_list="${bp_list:+$bp_list,}$((8998 + i))"
+
+    local proxy_mode_args=""
+    if [ "$MODE" = "pdsep" ]; then
+        # First N_P_INST instances are prefill (with their bootstrap ports),
+        # remaining N_D_INST are decode.
+        for i in $(seq 0 $((N_P_INST - 1))); do
+            proxy_mode_args="$proxy_mode_args --prefill http://127.0.0.1:$((BASE_PORT + i)) $((8998 + i))"
        done
-        if [ "$NO_OFFLOAD" = "true" ]; then
-            extra_args="$extra_args --bootstrap-ports $bp_list"
-        else
-            extra_args="$extra_args --offload --heavy-threshold $HEAVY_THRESHOLD --bootstrap-ports $bp_list"
+        for i in $(seq $N_P_INST $((N_INSTANCES - 1))); do
+            proxy_mode_args="$proxy_mode_args --decode http://127.0.0.1:$((BASE_PORT + i))"
+        done
+    else
+        local combined_args=""
+        for i in $(seq 0 $((N_INSTANCES - 1))); do
+            combined_args="$combined_args http://127.0.0.1:$((BASE_PORT + i))"
+        done
+        proxy_mode_args="--combined $combined_args"
+        if [ "$MODE" = "elastic" ]; then
+            local bp_list=""
+            for i in $(seq 0 $((N_INSTANCES - 1))); do
+                bp_list="${bp_list:+$bp_list,}$((8998 + i))"
+            done
+            if [ "$NO_OFFLOAD" = "true" ]; then
+                extra_args="$extra_args --bootstrap-ports $bp_list"
+            else
+                extra_args="$extra_args --offload --heavy-threshold $HEAVY_THRESHOLD --bootstrap-ports $bp_list"
+            fi
        fi
    fi

    $PYTHON "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
-        --combined $combined_args \
+        $proxy_mode_args \
        --port $PROXY_PORT \
        $extra_args \
        > "$OUTDIR/proxy.log" 2>&1 &