Agentic workload PD separation analysis with trace-driven benchmarks

Systematic study of prefill-decode (PD) disaggregation for agentic LLM workloads, using a production GLM-5.1 coder trace (2.1M requests, 71B input tokens).

Key findings:
- Cache-aware routing improves TPOT p90 by 15% and APC from 20.8% to 44.7% without PD separation, matching PD-Sep's decode isolation benefit
- PD separation adds +72% TTFT overhead (KV transfer) with no TPOT gain when using the same cache-aware scheduler
- Prefill remains compute-bound even at 95% KV cache reuse (arithmetic intensity, AI, >1000x vs decode AI <2), but absolute FLOPs drop 71% from cache hits
- For agentic MoE workloads, cache-aware routing > PD separation

Infrastructure:
- Trace sampler preserving session structure + hash_ids for prefix sharing
- Async trace replayer with streaming TTFT/TPOT/E2E measurement
- Unified cache-aware + token-level load-balanced global scheduler proxy supporting both PD-colocated and PD-disaggregated (Mooncake/RDMA) modes
- vLLM 0.18.1 scheduler patch for KV transfer abort race condition
- Roofline analysis tool for prefill/decode compute characterization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-21 21:21:57 +08:00
commit 05592e6adc
22 changed files with 2837 additions and 0 deletions
--- a/scripts/compare_results.py
+++ b/scripts/compare_results.py
@@ -0,0 +1,102 @@
+"""Compare benchmark results between PD-combined and PD-separated modes.
+
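+Reads the summary JSON file of each mode and prints a detailed comparison
+report covering TTFT, TPOT, E2E latency, prefix cache hit ratio, and
+throughput. Per-request metrics files can be parsed with load_metrics(),
+but the command-line report uses only the two summaries.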
+
+Usage:
+    python scripts/compare_results.py \
+        --combined outputs/combined_1000req/metrics.summary.json \
+        --separated outputs/separated_1000req/metrics.summary.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def load_summary(path: Path) -> dict:
+    """Load a benchmark summary JSON document from *path*.
+
+    Args:
+        path: Location of the summary JSON file.
+
+    Returns:
+        The parsed JSON value (a dict for a well-formed summary file).
+
+    Raises:
+        OSError: If the file cannot be read.
+        json.JSONDecodeError: If the content is not valid JSON.
+    """
+    # Decode explicitly as UTF-8 so the result does not depend on the
+    # locale's default encoding.
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def load_metrics(path: Path) -> list[dict]:
+    """Load per-request metrics from a JSON Lines file.
+
+    Each non-blank line of the file is parsed as one JSON value.
+
+    Args:
+        path: Location of the JSON Lines file.
+
+    Returns:
+        The parsed values in file order; an empty list for an empty file.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    with path.open(encoding="utf-8") as fh:
+        # Blank or whitespace-only lines (e.g. a stray newline at the end
+        # of the file) carry no record, so skip them instead of failing.
+        return [json.loads(line) for line in fh if line.strip()]
+
+
+def fmt_stat(stat: dict | None, unit: str = "s") -> str:
+    """Format a statistics dict as a single ``name=value`` line.
+
+    Args:
+        stat: Mapping with numeric ``mean``, ``p50``, ``p90`` and ``p99``
+            entries, or ``None``/empty when no statistics are available.
+        unit: Suffix appended to every value.
+
+    Returns:
+        ``"N/A"`` when *stat* is ``None`` or empty, otherwise the four
+        values rendered with three decimals and separated by spaces.
+
+    Raises:
+        KeyError: If a non-empty *stat* lacks one of the four entries.
+    """
+    # An empty dict means "no data", the same way compare() treats it.
+    if not stat:
+        return "N/A"
+    return " ".join(
+        f"{name}={stat[name]:.3f}{unit}"
+        for name in ("mean", "p50", "p90", "p99")
+    )
+
+
+def compare(combined: dict, separated: dict) -> None:
+    """Print a PD-Combined vs PD-Separated comparison report to stdout.
+
+    The report has a per-run section (request counts, wall clock, latency
+    statistics, prefix cache hits) followed by a relative comparison of
+    latency statistics, cache hit ratio and throughput.
+
+    Args:
+        combined: Summary dict of the PD-combined run.
+        separated: Summary dict of the PD-separated run.
+
+    Raises:
+        KeyError: If either summary lacks ``request_count``,
+            ``success_count`` or ``error_count``.
+    """
+    print("=" * 70)
+    print("PD-Combined vs PD-Separated Performance Comparison")
+    print("=" * 70)
+
+    for label, summary in [("PD-Combined", combined), ("PD-Separated", separated)]:
+        _print_run_summary(label, summary)
+
+    print("\n--- Comparison (Separated vs Combined) ---")
+    _print_latency_deltas(combined, separated)
+
+    c_ratio = combined.get("prefix_cache_hit_ratio", 0)
+    s_ratio = separated.get("prefix_cache_hit_ratio", 0)
+    print(f"  Cache hit ratio: {c_ratio*100:.1f}% → {s_ratio*100:.1f}%")
+
+    _print_throughput(combined, separated)
+
+
+def _print_run_summary(label: str, summary: dict) -> None:
+    """Print request counts, latency stats and cache stats of one run."""
+    print(f"\n--- {label} ---")
+    print(f"  Requests: {summary['request_count']} (success: {summary['success_count']}, errors: {summary['error_count']})")
+    print(f"  Wall clock: {summary.get('wall_clock_s', 0):.1f}s")
+    print(f"  TTFT:    {fmt_stat(summary.get('ttft_stats_s'))}")
+    print(f"  TPOT:    {fmt_stat(summary.get('tpot_stats_s'))}")
+    print(f"  E2E:     {fmt_stat(summary.get('latency_stats_s'))}")
+    hit_ratio = summary.get('prefix_cache_hit_ratio', 0)
+    print(f"  Prefix cache hit ratio: {hit_ratio*100:.1f}%")
+    queries = summary.get('prefix_cache_queries_tokens', 0)
+    hits = summary.get('prefix_cache_hits_tokens', 0)
+    print(f"    ({hits}/{queries} tokens)")
+
+
+def _print_latency_deltas(combined: dict, separated: dict) -> None:
+    """Print the relative change of each latency statistic, combined as baseline."""
+    for metric_key, label in [
+        ("ttft_stats_s", "TTFT"),
+        ("tpot_stats_s", "TPOT"),
+        ("latency_stats_s", "E2E"),
+    ]:
+        base = combined.get(metric_key, {})
+        other = separated.get(metric_key, {})
+        # A metric is only comparable when both runs reported it.
+        if not (base and other):
+            continue
+        for pct in ("mean", "p50", "p90", "p99"):
+            cv, sv = base.get(pct, 0), other.get(pct, 0)
+            # A zero (or missing) baseline has no meaningful relative change.
+            if cv <= 0:
+                continue
+            change = (sv - cv) / cv * 100
+            if change > 0:
+                direction = "slower"
+            elif change < 0:
+                direction = "faster"
+            else:
+                # Identical values are neither faster nor slower.
+                direction = "unchanged"
+            print(f"  {label} {pct}: {abs(change):.1f}% {direction} "
+                  f"({cv:.3f}s → {sv:.3f}s)")
+
+
+def _print_throughput(combined: dict, separated: dict) -> None:
+    """Print successful requests per second for both runs and the change."""
+    # A missing wall_clock_s falls back to 1, so the printed figure is then
+    # simply the success count.
+    c_wall = combined.get("wall_clock_s", 1)
+    s_wall = separated.get("wall_clock_s", 1)
+    # Guard against a zero wall clock instead of raising ZeroDivisionError.
+    c_tput = combined["success_count"] / c_wall if c_wall > 0 else 0.0
+    s_tput = separated["success_count"] / s_wall if s_wall > 0 else 0.0
+    # Without a positive baseline throughput the relative change is undefined.
+    if c_tput > 0:
+        delta = f"{(s_tput / c_tput - 1) * 100:+.1f}%"
+    else:
+        delta = "N/A"
+    print(f"  Throughput: {c_tput:.1f} → {s_tput:.1f} req/s ({delta})")
+
+
+def main():
+    """Parse the command line, load both summaries and print the report."""
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    # Both summary paths are mandatory and are parsed straight into Path.
+    for flag in ("--combined", "--separated"):
+        parser.add_argument(flag, type=Path, required=True)
+    opts = parser.parse_args()
+
+    combined_summary = load_summary(opts.combined)
+    separated_summary = load_summary(opts.separated)
+    compare(combined_summary, separated_summary)
+
+
+# Run the comparison only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()