Agentic workload PD separation analysis with trace-driven benchmarks

Systematic study of prefill-decode disaggregation for agentic LLM workloads
using production GLM-5.1 coder trace (2.1M requests, 71B input tokens).

Key findings:
- Cache-aware routing improves TPOT p90 by 15% and raises the APC (automatic
  prefix caching) hit rate from 20.8% to 44.7% without PD separation,
  matching PD-Sep's decode isolation benefit
- PD separation adds +72% TTFT overhead (KV transfer) with no TPOT gain
  when using the same cache-aware scheduler
- Prefill remains compute-bound even at 95% KV cache reuse (arithmetic
  intensity, AI, >1000x vs decode AI <2), but absolute FLOPs drop 71% from
  cache hits
- For agentic MoE workloads, cache-aware routing > PD separation

Infrastructure:
- Trace sampler preserving session structure + hash_ids for prefix sharing
- Async trace replayer with streaming TTFT/TPOT/E2E measurement
- Unified cache-aware + token-level load-balanced global scheduler proxy
  supporting both PD-colocated and PD-disaggregated (Mooncake/RDMA) modes
- vLLM 0.18.1 scheduler patch for KV transfer abort race condition
- Roofline analysis tool for prefill/decode compute characterization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-21 21:21:57 +08:00
commit 05592e6adc
22 changed files with 2837 additions and 0 deletions

102
scripts/compare_results.py Normal file
View File

@@ -0,0 +1,102 @@
"""Compare benchmark results between PD-combined and PD-separated modes.
Reads summary JSON files and per-request metrics to produce a detailed
comparison report including TTFT, TPOT, E2E, cache hit ratio, and
throughput analysis.
Usage:
python scripts/compare_results.py \
--combined outputs/combined_1000req/metrics.summary.json \
--separated outputs/separated_1000req/metrics.summary.json
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
def load_summary(path: Path) -> dict:
    """Load a benchmark summary JSON file.

    The file is read as raw bytes and passed to ``json.loads``, which
    detects UTF-8/16/32 (including a leading byte-order mark) on its own.
    This keeps the result independent of the platform's locale encoding
    and tolerates summaries written with a BOM.

    Args:
        path: Location of the summary JSON file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(path.read_bytes())
def load_metrics(path: Path) -> list[dict]:
    """Load per-request metrics from a JSON Lines file.

    Every non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of aborting the whole load with a decode error.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        The parsed rows, in file order. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # "utf-8-sig" decodes plain UTF-8 and also strips an optional BOM, so
    # the first row parses regardless of how the file was written.
    with path.open(encoding="utf-8-sig") as fh:
        return [json.loads(line) for line in fh if line.strip()]
def fmt_stat(stat: dict | None, unit: str = "s") -> str:
    """Render a statistics mapping as a one-line ``key=value`` string.

    Args:
        stat: Mapping with numeric ``mean``, ``p50``, ``p90`` and ``p99``
            entries, or ``None`` when the statistic is unavailable.
        unit: Suffix appended to every formatted value.

    Returns:
        ``"N/A"`` when *stat* is ``None``; otherwise the four values,
        each with three decimals, separated by single spaces.

    Raises:
        KeyError: If *stat* lacks one of the four expected keys.
    """
    if stat is None:
        return "N/A"
    fields = ("mean", "p50", "p90", "p99")
    return " ".join(f"{name}={stat[name]:.3f}{unit}" for name in fields)
def _num(value: object, default: float = 0.0) -> float:
    """Return *value* when it is an int/float, otherwise *default*."""
    return value if isinstance(value, (int, float)) else default


def _throughput(summary: dict) -> float:
    """Return successful requests per second, or 0.0 without a usable wall clock."""
    # A missing wall clock falls back to 1 second, as the report always did.
    wall = _num(summary.get("wall_clock_s"), 1)
    if wall <= 0:
        return 0.0
    return summary["success_count"] / wall


def compare(combined: dict, separated: dict) -> None:
    """Print a PD-combined vs PD-separated comparison report to stdout.

    The report has two parts:

    * a per-mode section with request counts, wall clock, TTFT/TPOT/E2E
      statistics and prefix-cache hit figures;
    * a relative section showing, for each latency statistic, how much
      slower or faster the separated run is, followed by the cache hit
      ratio and throughput of both runs.

    Missing or ``null`` numeric fields are treated as absent rather than
    raising. Relative figures are omitted whenever the combined baseline
    is not a positive number, so the function never divides by zero.

    Args:
        combined: Summary mapping of the PD-combined run.
        separated: Summary mapping of the PD-separated run.

    Raises:
        KeyError: If a summary lacks ``request_count``, ``success_count``
            or ``error_count``.
    """
    rule = "=" * 70
    print(rule)
    print("PD-Combined vs PD-Separated Performance Comparison")
    print(rule)

    for label, s in [("PD-Combined", combined), ("PD-Separated", separated)]:
        print(f"\n--- {label} ---")
        print(f" Requests: {s['request_count']} "
              f"(success: {s['success_count']}, errors: {s['error_count']})")
        print(f" Wall clock: {_num(s.get('wall_clock_s')):.1f}s")
        # "or None" maps an empty/absent stats object to fmt_stat's N/A path
        # instead of letting it fail on the missing percentile keys.
        print(f" TTFT: {fmt_stat(s.get('ttft_stats_s') or None)}")
        print(f" TPOT: {fmt_stat(s.get('tpot_stats_s') or None)}")
        print(f" E2E: {fmt_stat(s.get('latency_stats_s') or None)}")
        hit_ratio = _num(s.get('prefix_cache_hit_ratio'))
        print(f" Prefix cache hit ratio: {hit_ratio*100:.1f}%")
        queries = s.get('prefix_cache_queries_tokens', 0)
        hits = s.get('prefix_cache_hits_tokens', 0)
        print(f" ({hits}/{queries} tokens)")

    print("\n--- Comparison (Separated vs Combined) ---")
    for metric_key, label in [
        ("ttft_stats_s", "TTFT"),
        ("tpot_stats_s", "TPOT"),
        ("latency_stats_s", "E2E"),
    ]:
        c = combined.get(metric_key) or {}
        s = separated.get(metric_key) or {}
        if not (c and s):
            continue
        for pct in ("mean", "p50", "p90", "p99"):
            cv, sv = c.get(pct), s.get(pct)
            # Both sides must be real numbers; a missing value must not be
            # reported as a (bogus) 100% speed-up.
            if not isinstance(cv, (int, float)) or not isinstance(sv, (int, float)):
                continue
            if cv <= 0:
                continue  # no positive baseline to compute a relative change
            change = (sv - cv) / cv * 100
            direction = "slower" if change > 0 else "faster"
            print(f" {label} {pct}: {abs(change):.1f}% {direction} "
                  f"({cv:.3f}s → {sv:.3f}s)")

    c_ratio = _num(combined.get("prefix_cache_hit_ratio"))
    s_ratio = _num(separated.get("prefix_cache_hit_ratio"))
    print(f" Cache hit ratio: {c_ratio*100:.1f}% → {s_ratio*100:.1f}%")

    c_tput = _throughput(combined)
    s_tput = _throughput(separated)
    # The relative change is undefined when the baseline throughput is zero.
    delta = f"{(s_tput / c_tput - 1) * 100:+.1f}%" if c_tput > 0 else "N/A"
    print(f" Throughput: {c_tput:.1f} → {s_tput:.1f} req/s ({delta})")
def main():
    """Parse the command line and print the comparison report.

    Both ``--combined`` and ``--separated`` are required and name the
    summary JSON files of the two runs. The module docstring doubles as
    the ``--help`` description and is shown verbatim.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    for flag in ("--combined", "--separated"):
        parser.add_argument(flag, type=Path, required=True)
    opts = parser.parse_args()
    compare(load_summary(opts.combined), load_summary(opts.separated))


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()