Agentic workload PD separation analysis with trace-driven benchmarks
Systematic study of prefill-decode disaggregation for agentic LLM workloads using production GLM-5.1 coder trace (2.1M requests, 71B input tokens).

Key findings:
- Cache-aware routing improves TPOT p90 by 15% and APC from 20.8% to 44.7% without PD separation, matching PD-Sep's decode isolation benefit
- PD separation adds +72% TTFT overhead (KV transfer) with no TPOT gain when using the same cache-aware scheduler
- Prefill remains compute-bound even at 95% KV cache reuse (AI >1000x vs decode AI <2), but absolute FLOPs drop 71% from cache hits
- For agentic MoE workloads, cache-aware routing > PD separation

Infrastructure:
- Trace sampler preserving session structure + hash_ids for prefix sharing
- Async trace replayer with streaming TTFT/TPOT/E2E measurement
- Unified cache-aware + token-level load-balanced global scheduler proxy supporting both PD-colocated and PD-disaggregated (Mooncake/RDMA) modes
- vLLM 0.18.1 scheduler patch for KV transfer abort race condition
- Roofline analysis tool for prefill/decode compute characterization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
102
scripts/compare_results.py
Normal file
102
scripts/compare_results.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Compare benchmark results between PD-combined and PD-separated modes.
|
||||
|
||||
Reads summary JSON files and per-request metrics to produce a detailed
|
||||
comparison report including TTFT, TPOT, E2E, cache hit ratio, and
|
||||
throughput analysis.
|
||||
|
||||
Usage:
|
||||
python scripts/compare_results.py \
|
||||
--combined outputs/combined_1000req/metrics.summary.json \
|
||||
--separated outputs/separated_1000req/metrics.summary.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_summary(path: Path) -> dict:
    """Load a benchmark summary from a JSON file.

    Args:
        path: Location of the summary JSON file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    return json.loads(path.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def load_metrics(path: Path) -> list[dict]:
    """Load per-request metrics from a JSON Lines file.

    Each non-blank line of the file is decoded as one JSON document.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        The decoded records, in file order. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open(encoding="utf-8") as fh:
        # Blank lines (e.g. a trailing newline at EOF) carry no record and
        # would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in fh if line.strip()]
|
||||
|
||||
|
||||
def fmt_stat(stat: dict | None, unit: str = "s") -> str:
    """Format a latency statistics mapping as a one-line string.

    Args:
        stat: Mapping with ``mean``, ``p50``, ``p90`` and ``p99`` entries,
            or ``None``/empty when the metric was not recorded.
        unit: Suffix appended to every formatted value.

    Returns:
        ``"N/A"`` when ``stat`` is ``None`` or empty; otherwise a string of
        the form ``"mean=… p50=… p90=… p99=…"`` with three decimals per
        value. An individual entry that is missing or ``None`` is rendered
        as ``<name>=N/A`` instead of raising.
    """
    if not stat:
        return "N/A"
    parts = []
    for key in ("mean", "p50", "p90", "p99"):
        value = stat.get(key)
        if value is None:
            parts.append(f"{key}=N/A")
        else:
            parts.append(f"{key}={value:.3f}{unit}")
    return " ".join(parts)
|
||||
|
||||
|
||||
def _print_summary(label: str, summary: dict) -> None:
    """Print the per-mode section of the report for one summary."""
    print(f"\n--- {label} ---")
    print(f" Requests: {summary['request_count']} "
          f"(success: {summary['success_count']}, errors: {summary['error_count']})")
    # `or 0` also covers keys that are present but null in the JSON.
    wall_clock = summary.get('wall_clock_s') or 0
    print(f" Wall clock: {wall_clock:.1f}s")
    print(f" TTFT: {fmt_stat(summary.get('ttft_stats_s'))}")
    print(f" TPOT: {fmt_stat(summary.get('tpot_stats_s'))}")
    print(f" E2E: {fmt_stat(summary.get('latency_stats_s'))}")
    hit_ratio = summary.get('prefix_cache_hit_ratio') or 0
    print(f" Prefix cache hit ratio: {hit_ratio*100:.1f}%")
    queries = summary.get('prefix_cache_queries_tokens', 0)
    hits = summary.get('prefix_cache_hits_tokens', 0)
    print(f" ({hits}/{queries} tokens)")


def _print_latency_deltas(combined: dict, separated: dict) -> None:
    """Print the relative change of each latency statistic, separated vs combined."""
    for metric_key, label in [
        ("ttft_stats_s", "TTFT"),
        ("tpot_stats_s", "TPOT"),
        ("latency_stats_s", "E2E"),
    ]:
        c_stats = combined.get(metric_key) or {}
        s_stats = separated.get(metric_key) or {}
        if not (c_stats and s_stats):
            continue
        for pct in ["mean", "p50", "p90", "p99"]:
            cv, sv = c_stats.get(pct), s_stats.get(pct)
            # A relative change needs both values and a positive baseline.
            if cv is None or sv is None or cv <= 0:
                continue
            change = (sv - cv) / cv * 100
            direction = "slower" if change > 0 else "faster"
            print(f" {label} {pct}: {abs(change):.1f}% {direction} "
                  f"({cv:.3f}s → {sv:.3f}s)")


def _throughput(summary: dict) -> float | None:
    """Return successful requests per second, or None without a positive wall clock."""
    wall_clock = summary.get("wall_clock_s")
    if not wall_clock or wall_clock <= 0:
        return None
    return summary["success_count"] / wall_clock


def compare(combined: dict, separated: dict) -> None:
    """Print a comparison report for PD-combined vs PD-separated runs.

    The report has one section per mode (request counts, wall clock,
    TTFT/TPOT/E2E statistics, prefix cache hit ratio) followed by a
    section with the relative change of each latency statistic, the cache
    hit ratios, and the request throughput of both modes.

    Args:
        combined: Summary of the PD-combined run.
        separated: Summary of the PD-separated run.

    Raises:
        KeyError: If a summary lacks ``request_count``, ``success_count``
            or ``error_count``.
    """
    print("=" * 70)
    print("PD-Combined vs PD-Separated Performance Comparison")
    print("=" * 70)

    for label, summary in [("PD-Combined", combined), ("PD-Separated", separated)]:
        _print_summary(label, summary)

    print("\n--- Comparison (Separated vs Combined) ---")
    _print_latency_deltas(combined, separated)

    c_ratio = combined.get("prefix_cache_hit_ratio") or 0
    s_ratio = separated.get("prefix_cache_hit_ratio") or 0
    print(f" Cache hit ratio: {c_ratio*100:.1f}% → {s_ratio*100:.1f}%")

    c_tput = _throughput(combined)
    s_tput = _throughput(separated)
    if c_tput is None or s_tput is None:
        # Without a real wall clock any throughput figure would be made up.
        print(" Throughput: N/A (wall clock time missing or zero)")
    elif c_tput == 0:
        # No successful baseline requests: the relative change is undefined.
        print(f" Throughput: {c_tput:.1f} → {s_tput:.1f} req/s (N/A)")
    else:
        print(f" Throughput: {c_tput:.1f} → {s_tput:.1f} req/s "
              f"({(s_tput/c_tput - 1)*100:+.1f}%)")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse command-line arguments, load both summaries and print the report.

    Exits through ``ArgumentParser.error`` (usage message on stderr, exit
    status 2) when a summary file cannot be read or is not valid JSON.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--combined", type=Path, required=True,
                        help="summary JSON file of the PD-combined run")
    parser.add_argument("--separated", type=Path, required=True,
                        help="summary JSON file of the PD-separated run")
    args = parser.parse_args()

    summaries = []
    for path in (args.combined, args.separated):
        try:
            summaries.append(load_summary(path))
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors;
            # report them as a usage problem rather than a traceback.
            parser.error(f"cannot load summary {path}: {exc}")

    compare(*summaries)
|
||||
|
||||
|
||||
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user