agentic-kvc/scripts/final_comparison.py

"""Final comparison of PD-Combined vs PD-Separated (Mooncake/RDMA)."""
import json, statistics, os

def pct(vals, q):
    """Pick the element of ``vals`` at rank ``int(q * len(vals))``.

    The rank is capped at the last valid index, so ``q = 1.0`` yields the
    final element. No sorting is done here; callers in this script pass
    lists that are already sorted ascending.

    Args:
        vals: Indexable sequence of values.
        q: Fraction used to compute the rank (e.g. 0.5, 0.9).

    Returns:
        The selected element, or 0 when ``vals`` is empty.
    """
    if not vals:
        return 0
    count = len(vals)
    rank = int(q * count)
    last_index = count - 1
    return vals[rank if rank < last_index else last_index]

def _read_jsonl(path, limit=None):
    """Parse a JSON-lines file into a list, one decoded object per line.

    Args:
        path: Location of the ``.jsonl`` file.
        limit: If given, stop once this many records have been collected;
            lines beyond that point are never read or parsed.

    Returns:
        List of decoded JSON objects in file order.

    Blank lines are skipped, and the file handle is closed before returning.
    """
    records = []
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            if limit is not None and len(records) >= limit:
                break
            if line.strip():
                records.append(json.loads(line))
    return records


# Combined (16 sessions) - completed run
rows_c = _read_jsonl("outputs/v18_combined_1000req/metrics.jsonl")
# A request counts as successful when its "error" field is missing or falsy.
ok_c = [r for r in rows_c if not r.get("error")]
# Each metric list is sorted ascending because pct() indexes by rank.
# Missing, None and zero values are dropped by the truthiness filter.
ttfts_c = sorted(r["ttft_s"] for r in ok_c if r.get("ttft_s"))
tpots_c = sorted(r["tpot_s"] for r in ok_c if r.get("tpot_s") and r["tpot_s"] > 0)
lats_c = sorted(r["latency_s"] for r in ok_c if r.get("latency_s"))
with open("outputs/v18_combined_1000req/metrics.summary.json", encoding="utf-8") as fh:
    sc = json.load(fh)

# PD-Separated Mooncake (first 200 stable requests)
# Only the first 200 records are read; anything after that window is ignored
# rather than parsed and then discarded.
rows_d = _read_jsonl("outputs/v18_pd_mooncake_lowconc/metrics.jsonl", limit=200)
ok_d = [r for r in rows_d if not r.get("error")]
ttfts_d = sorted(r["ttft_s"] for r in ok_d if r.get("ttft_s"))
tpots_d = sorted(r["tpot_s"] for r in ok_d if r.get("tpot_s") and r["tpot_s"] > 0)
lats_d = sorted(r["latency_s"] for r in ok_d if r.get("latency_s"))

# ---- Banner ----------------------------------------------------------------
sep = "=" * 70
for banner_line in (
    sep,
    "  PD-Combined vs PD-Separated (Mooncake/RDMA)",
    "  vLLM 0.18.1 | Qwen3-Coder-30B-A3B | 8xH20",
    sep,
):
    print(banner_line)

# ---- Column headings -------------------------------------------------------
# One template drives both the heading and the dashed rule so they line up.
column_fmt = "  {:<12} {:>16} {:>16} {:>10}"
header = column_fmt.format("Metric", "Combined(TP=8)", "PD-Sep(TP=4+4)", "Delta")
print(header)
dash = column_fmt.format("-" * 12, "-" * 16, "-" * 16, "-" * 10)
print(dash)

# ---- Successful / total request counts -------------------------------------
req_c = "{}/{}".format(len(ok_c), len(rows_c))
req_d = "{}/{}".format(len(ok_d), len(rows_d))
print("  {:<12} {:>16} {:>16}".format("Requests", req_c, req_d))

# ---- Latency percentiles ---------------------------------------------------
# Rows are (label, combined value, PD-separated value), grouped per metric
# with p50 before p90.
data = [
    ("{} {}".format(metric, tag), pct(combined, q), pct(separated, q))
    for metric, combined, separated in (
        ("TTFT", ttfts_c, ttfts_d),
        ("TPOT", tpots_c, tpots_d),
        ("E2E", lats_c, lats_d),
    )
    for tag, q in (("p50", 0.5), ("p90", 0.9))
]

for row_label, combined_val, separated_val in data:
    # The relative change is only defined for a positive combined baseline.
    if combined_val > 0:
        change = "{:+.0f}%".format((separated_val / combined_val - 1) * 100)
    else:
        change = "N/A"
    print("  {:<12} {:>15.3f}s {:>15.3f}s {:>10}".format(
        row_label, combined_val, separated_val, change))

# ---- Summary-file figures (combined run only) ------------------------------
cache_c = sc.get("prefix_cache_hit_ratio", 0)
print("  {:<12} {:>15.1f}% {:>16}".format("Cache hit", cache_c * 100, "N/A"))
# Throughput = successful requests divided by the summary's wall-clock seconds
# (divisor falls back to 1 when the key is absent).
tput_c = len(ok_c) / sc.get("wall_clock_s", 1)
print("  {:<12} {:>14.2f}/s {:>16}".format("Throughput", tput_c, "~0.06/s"))

def _p50_change(separated_vals, combined_vals):
    """Percent change of the PD-separated p50 relative to the combined p50.

    Returns 0 when there are no combined samples to compare against.
    """
    if not combined_vals:
        return 0
    return (pct(separated_vals, 0.5) / pct(combined_vals, 0.5) - 1) * 100


# Fixed text: section banner and workload description.
for text in (
    "",
    sep,
    "  CONCLUSIONS FOR AGENTIC WORKLOAD",
    sep,
    "",
    "  Trace characteristics:",
    "    - I/O ratio: 61.5x (strongly prefill-dominated)",
    "    - 39% requests > 32k input tokens",
    "    - 16% prefix block sharing across sessions",
    "    - 53% prefix cache hit ratio (APC)",
    "",
    "  PD separation findings:",
):
    print(text)

# Median deltas of PD-separated vs combined, in percent.
delta_tpot = _p50_change(tpots_d, tpots_c)
delta_ttft = _p50_change(ttfts_d, ttfts_c)
delta_e2e = _p50_change(lats_d, lats_c)

# A TPOT shift under 20% in either direction is labelled "marginal".
if abs(delta_tpot) < 20:
    tpot_verdict = "marginal"
else:
    tpot_verdict = "significant"

print("    1. TPOT {:+.0f}% - decode isolation benefit is {}".format(
    delta_tpot, tpot_verdict))
print("    2. TTFT {:+.0f}% - KV transfer + TP=4 overhead dominates".format(delta_ttft))
print("    3. E2E  {:+.0f}% - net negative on single-machine".format(delta_e2e))

# Fixed text: stability note and recommendations.
for text in (
    "    4. Stability: Mooncake connector crashes after ~200 reqs under load",
    "",
    "  Recommendation:",
    "    - Single-machine 8 GPU: Combined mode is better (lower TTFT, stable)",
    "    - Multi-machine: PD-Sep is promising IF cross-machine latency",
    "      is hidden by RDMA and prefill doesn't share GPU with decode",
    "    - Key bottleneck: this workload's heavy prefill (avg 32k tokens)",
    "      makes KV transfer cost non-trivial relative to prefill time",
    "    - Prefill-as-a-Service (Goal 5) should focus on cross-machine",
    "      KV cache sharing, not same-machine PD split",
):
    print(text)