Systematic study of prefill-decode (PD) disaggregation for agentic LLM workloads, using a production GLM-5.1 coder trace (2.1M requests, 71B input tokens).

Key findings:
- Cache-aware routing improves TPOT p90 by 15% and raises APC from 20.8% to 44.7% without PD separation, matching PD-Sep's decode isolation benefit.
- PD separation adds +72% TTFT overhead (KV transfer) with no TPOT gain when using the same cache-aware scheduler.
- Prefill remains compute-bound even at 95% KV cache reuse (arithmetic intensity (AI) >1000x vs decode AI <2), but absolute FLOPs drop 71% from cache hits.
- For agentic MoE workloads, cache-aware routing > PD separation.

Infrastructure:
- Trace sampler preserving session structure + hash_ids for prefix sharing.
- Async trace replayer with streaming TTFT/TPOT/E2E measurement.
- Unified cache-aware + token-level load-balanced global scheduler proxy supporting both PD-colocated and PD-disaggregated (Mooncake/RDMA) modes.
- vLLM 0.18.1 scheduler patch for KV transfer abort race condition.
- Roofline analysis tool for prefill/decode compute characterization.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
78 lines
2.2 KiB
Bash
Executable File
78 lines
2.2 KiB
Bash
Executable File
#!/bin/bash
# Run the full benchmark suite: sample trace → replay against vLLM → collect metrics.
#
# Prerequisites:
#   - vLLM server running (use scripts/launch_vllm.sh)
#   - Sampled trace file exists (or will be created)
#
# Usage:
#   bash scripts/run_benchmark.sh [--endpoint URL] [--tag NAME]
#       [--target-requests N] [--time-scale X] [--max-inflight N]
#
# Every default below can also be overridden through the environment variable
# of the same name (TRACE_INPUT, ENDPOINT, TAG, TARGET_REQUESTS, TIME_SCALE,
# MAX_INFLIGHT, SEED), e.g.:
#   SEED=7 TAG=ablation bash scripts/run_benchmark.sh

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# Run from the project root (the parent of this script's directory) so the
# relative paths used further down resolve regardless of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_DIR"

# Defaults
# Raw trace handed to the sampler as --input.
TRACE_INPUT="${TRACE_INPUT:-$HOME/ali-trace/trace-glm5.1-formatted/051315-051317.jsonl}"
# Server URL handed to the replayer as --endpoint.
ENDPOINT="${ENDPOINT:-http://localhost:8000}"
# Label used as the prefix of the output directory name.
TAG="${TAG:-default}"
# Handed to the sampler as --target-requests; also part of the sampled file name.
TARGET_REQUESTS="${TARGET_REQUESTS:-5000}"
# Handed to the replayer as --time-scale.
TIME_SCALE="${TIME_SCALE:-1.0}"
# Handed to the replayer as --max-inflight-sessions.
MAX_INFLIGHT="${MAX_INFLIGHT:-32}"
# Handed to the sampler as --seed; also part of the sampled file name.
SEED="${SEED:-42}"
# Parse args
#
# Command-line flags override the values already held in ENDPOINT, TAG,
# TARGET_REQUESTS, TIME_SCALE, MAX_INFLIGHT, SEED and TRACE_INPUT.
# Every flag except -h/--help takes exactly one value.

# Print the option summary to stdout.
usage() {
  cat <<'EOF'
Usage: bash scripts/run_benchmark.sh [options]

Options:
  --endpoint URL         server URL passed to the replayer
  --tag NAME             prefix of the output directory name
  --target-requests N    request count passed to the trace sampler
  --time-scale X         time scale passed to the replayer
  --max-inflight N       max in-flight sessions passed to the replayer
  --seed N               seed passed to the trace sampler
  --trace-input PATH     raw trace passed to the trace sampler
  -h, --help             show this help and exit
EOF
}

# Abort with a clear message when flag $1 is not followed by a value.
# Without this check, `set -u` would report a bare "unbound variable" for $2.
# Arguments: the remaining command-line words, starting at the flag itself.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Missing value for $1" >&2
    exit 1
  fi
}

# Consume the given command-line words and update the configuration globals.
# Exits 1 on an unknown flag or a flag without a value, 0 after printing help.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --endpoint)
        require_value "$@"; ENDPOINT="$2"; shift 2 ;;
      --tag)
        require_value "$@"; TAG="$2"; shift 2 ;;
      --target-requests)
        require_value "$@"; TARGET_REQUESTS="$2"; shift 2 ;;
      --time-scale)
        require_value "$@"; TIME_SCALE="$2"; shift 2 ;;
      --max-inflight)
        require_value "$@"; MAX_INFLIGHT="$2"; shift 2 ;;
      --seed)
        require_value "$@"; SEED="$2"; shift 2 ;;
      --trace-input)
        require_value "$@"; TRACE_INPUT="$2"; shift 2 ;;
      -h|--help)
        usage; exit 0 ;;
      *)
        echo "Unknown arg: $1" >&2; exit 1 ;;
    esac
  done
}

parse_args "$@"
# Derived paths.
# The sampled-trace name encodes the request count and seed, so each
# (count, seed) pair maps to its own file under traces/.
SAMPLED_TRACE="traces/sampled_${TARGET_REQUESTS}req_seed${SEED}.jsonl"
# One output directory per run: the tag plus a second-resolution timestamp.
OUTPUT_DIR="outputs/${TAG}_$(date +%Y%m%d_%H%M%S)"

# Echo the effective configuration, including the seed and the derived paths
# that determine which sampled trace is used and where results are written.
printf '%s\n' \
  "=== Benchmark: tag=$TAG ===" \
  " Trace: $TRACE_INPUT" \
  " Endpoint: $ENDPOINT" \
  " Target requests: $TARGET_REQUESTS" \
  " Time scale: $TIME_SCALE" \
  " Max inflight sessions: $MAX_INFLIGHT" \
  " Seed: $SEED" \
  " Sampled trace: $SAMPLED_TRACE" \
  " Output dir: $OUTPUT_DIR"
# Step 1: Sample trace (if not already done)
#
# The sampled trace acts as a cache keyed by its file name. To keep a failed
# sampler run from leaving a truncated file that a later run would silently
# reuse, the sampler writes to a scratch file that is renamed into place only
# after it exits successfully. An empty file is treated the same as a missing
# one and is regenerated.
if [[ ! -s "$SAMPLED_TRACE" ]]; then
  echo ""
  echo "=== Step 1: Sampling trace ==="

  # Make sure the destination directory exists before the sampler writes to it.
  mkdir -p "$(dirname "$SAMPLED_TRACE")"

  # Scratch file next to the final one (same directory, so the rename stays on
  # one filesystem); the .jsonl suffix is kept and the PID avoids collisions
  # between concurrent runs.
  partial_trace="${SAMPLED_TRACE%.jsonl}.partial.$$.jsonl"

  # Capture the exit status instead of letting `set -e` abort immediately, so
  # the scratch file can be removed first.
  sampler_status=0
  python scripts/sample_trace.py \
    --input "$TRACE_INPUT" \
    --output "$partial_trace" \
    --target-requests "$TARGET_REQUESTS" \
    --seed "$SEED" || sampler_status=$?

  if [[ "$sampler_status" -ne 0 ]]; then
    rm -f -- "$partial_trace"
    echo "Trace sampling failed (exit $sampler_status)" >&2
    exit "$sampler_status"
  fi

  mv -f -- "$partial_trace" "$SAMPLED_TRACE"
else
  echo ""
  echo "=== Step 1: Using existing sampled trace: $SAMPLED_TRACE ==="
fi
# Step 2: Run replay
#
# Replays the sampled trace against the endpoint and writes per-request
# metrics into a fresh output directory.
echo ""
echo "=== Step 2: Replaying trace ==="
mkdir -p "$OUTPUT_DIR"

# Collect the replayer options in an array so each value stays a single word
# even if it contains spaces.
replay_args=(
  --trace "$SAMPLED_TRACE"
  --output "$OUTPUT_DIR/metrics.jsonl"
  --endpoint "$ENDPOINT"
  --time-scale "$TIME_SCALE"
  --max-inflight-sessions "$MAX_INFLIGHT"
  -v
)
python -m replayer "${replay_args[@]}"

# Final report. NOTE(review): the summary path is presumably written by the
# replayer alongside the metrics file; this script does not create it itself.
echo ""
echo "=== Done ==="
echo " Metrics: $OUTPUT_DIR/metrics.jsonl"
echo " Summary: $OUTPUT_DIR/metrics.summary.json"