Files
agentic-kvc/microbench/plot_breakdown.py
Gahow Wang 72790ae6c1 PD-sep server-side profiling: vLLM patches + per-request breakdown
Instrumentation patches (microbench/patches/):
  - pd_profile.py: shared event emitter (VLLM_PD_PROFILE_LOG env var)
  - apply_patches.py: idempotent patch installer for mooncake_connector.py
    and scheduler.py, marks insertions with # PD_PROFILE_PATCH
  - analyze_events.py: joins per-process JSONL event logs by transfer_id
    into per-request phase durations

Seven events captured per request:
  D_get_num_matched → P_zmq_received → P_prefill_done →
  P_rdma_start → P_rdma_end → D_recv_complete → D_request_promoted

Driver fix (microbench/lifecycle/driver.py):
  seed_prefix_cache now sends via the proxy URL so P and D both cache
  the seeded prefix with matching block hashes. Previously seeding D
  directly produced different block hashes than the proxy-routed
  measurement requests, making incremental transfer impossible.

Real breakdown (fig_breakdown_real.png, server_breakdown.csv, n=93):
  prefill_compute  620 ms median (95% of overhead)
  rdma_transfer     42 ms median (~71 Gbps effective)
  other overhead    10 ms median (dispatch + params + signal + promote)

Mooncake transfer is NOT the bottleneck. Even with bulk RDMA the
transfer cost is <10% of prefill cost for Qwen3-30B-A3B on H20.
2026-05-26 13:59:09 +08:00

163 lines
6.6 KiB
Python

#!/usr/bin/env python3
"""
Stacked-bar breakdown of PD-sep request latency.
Axes:
  X      : total input length (N_total), grouped by cache hit ratio
  Stacks : prefill compute (red) | KV transfer RDMA (orange) | decode (blue)

Measured constants (H20, Qwen3-Coder-30B-A3B, from microbench):
  cold_prefill_ms(n) ≈ 0.072 * n   (n = newly computed tokens; interference D=1 prefill_ttft, n=2k-16k)
  kv_transfer_ms(n)  = 35 + (n * 96 KiB * 8 bit/byte / 25 Gbps, expressed in ms)   (warm Mooncake RDMA)
  decode_ms          = output_tokens * 7.0 ms/token
"""
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from pathlib import Path
# The figure is written next to this script, under lifecycle/results/.
# The directory is created up front so the later save cannot fail on a
# missing parent.
HERE = Path(__file__).parent
OUT = HERE.joinpath("lifecycle/results/fig_breakdown.png")
OUT.parent.mkdir(parents=True, exist_ok=True)
# ── measured constants ───────────────────────────────────────────────────────
MS_PER_TOK_COLD = 0.072    # ms / new token (cold prefill, linear regime)
# 98304 B per token (Qwen3-30B-A3B).
# NOTE(review): the factors look like K+V x layers x KV heads x head dim x
# bytes per element -- confirm against the model config.
KV_BYTES_PER_TOK = 2 * 48 * 4 * 128 * 2
RDMA_BW_GBPS = 25          # effective Mooncake bandwidth (measured)
RDMA_OVERHEAD_MS = 35      # warm-connection fixed overhead (measured)
DECODE_MS_PER_TOK = 7.0    # TPOT baseline p50
OUTPUT_TOKENS = 128        # representative output length for decode bar


def prefill_ms(n_new):
    """Return the modeled cold-prefill time in milliseconds.

    The model is linear: ``MS_PER_TOK_COLD`` per newly computed token.

    Args:
        n_new: Number of tokens that are not served from cache. Values
            below 1 are clamped to 1, so even a fully cached prompt is
            charged for one token.

    Returns:
        Prefill time in milliseconds.
    """
    return MS_PER_TOK_COLD * max(1, n_new)


def transfer_ms(n_new):
    """Return the modeled KV-transfer time in milliseconds.

    The cost is a fixed ``RDMA_OVERHEAD_MS`` plus the time needed to move
    ``KV_BYTES_PER_TOK`` bytes per token at ``RDMA_BW_GBPS``.

    Args:
        n_new: Number of tokens whose KV data is transferred. Values
            below 1 are clamped to 1.

    Returns:
        Transfer time in milliseconds.
    """
    kv_bytes = KV_BYTES_PER_TOK * max(1, n_new)
    # bytes -> bits, divide by bits/second, then seconds -> milliseconds.
    bw_ms = kv_bytes * 8 / (RDMA_BW_GBPS * 1e9) * 1000
    return RDMA_OVERHEAD_MS + bw_ms
# ── sweep parameters ─────────────────────────────────────────────────────────
# Prompt lengths: powers of two from 1k to 32k tokens.
N_TOTALS = [2 ** exp for exp in range(10, 16)]

# Per-cache-ratio series. The four CR_* lists are index-aligned with
# CACHE_RATIOS; the labels are derived from the ratios so they cannot drift.
CACHE_RATIOS = [0.0, 0.25, 0.50, 0.75]
CR_LABELS = [f"{ratio:.0%}" for ratio in CACHE_RATIOS]
CR_ALPHAS = [1.0, 0.75, 0.50, 0.28]
CR_HATCHES = [None] * 2 + ["///"] * 2

# Fill colour for each latency phase.
C_PREFILL = "#d62728"
C_TRANSFER = "#ff7f0e"
C_DECODE = "#1f77b4"
# ── compute matrices ─────────────────────────────────────────────────────────
# Row i corresponds to N_TOTALS[i], column j to CACHE_RATIOS[j].
# Every entry is a latency in milliseconds.
nN, nC = len(N_TOTALS), len(CACHE_RATIOS)
pf_mat = np.zeros((nN, nC))
tr_mat = np.zeros((nN, nC))
# The decode term is the same for every cell: it depends only on the fixed
# output length, not on prompt length or cache ratio.
dec_mat = np.full((nN, nC), DECODE_MS_PER_TOK * OUTPUT_TOKENS)
for i, N in enumerate(N_TOTALS):
    for j, cr in enumerate(CACHE_RATIOS):
        # round() rather than int(): the float product can land just below
        # the intended integer (1000 * (1 - 0.9) == 99.99999999999997), and
        # truncation would then drop a token.
        n_new = max(1, round(N * (1 - cr)))
        pf_mat[i, j] = prefill_ms(n_new)
        tr_mat[i, j] = transfer_ms(n_new)
# ── plot ─────────────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(13, 6.5))
bar_w = 0.18
group_gap = 1.0
x_centers = np.arange(nN) * group_gap
# Spread the nC bars of each group symmetrically around the group centre.
offsets = np.linspace(-(nC - 1) / 2, (nC - 1) / 2, nC) * bar_w

# Fix the y-range before any text is placed. The value-label gap is a
# fraction of the axis height, so it must be computed from the final limit;
# reading ax.get_ylim() before set_ylim() would use whatever the autoscaler
# happens to report at that moment.
total_mat = pf_mat + tr_mat + dec_mat
ymax = total_mat.max() * 1.18
ax.set_ylim(0, ymax)
label_gap = ymax * 0.01

for j in range(nC):
    xp = x_centers + offsets[j]
    pf = pf_mat[:, j]
    tr = tr_mat[:, j]
    dc = dec_mat[:, j]
    alpha = CR_ALPHAS[j]
    hatch = CR_HATCHES[j]
    kw = dict(width=bar_w, alpha=alpha, edgecolor="white", linewidth=0.5)
    if hatch:
        kw["hatch"] = hatch
    # Stack order, bottom to top: prefill, transfer, decode.
    ax.bar(xp, pf, color=C_PREFILL, **kw)
    ax.bar(xp, tr, bottom=pf, color=C_TRANSFER, **kw)
    ax.bar(xp, dc, bottom=pf + tr, color=C_DECODE, **kw)
    # value labels on top: seconds once the total reaches 1000 ms
    for xpos, total in zip(xp, total_mat[:, j]):
        s = f"{total/1000:.1f}s" if total >= 1000 else f"{total:.0f}ms"
        ax.text(xpos, total + label_gap, s,
                ha="center", va="bottom",
                fontsize=7.2, color="black", alpha=max(alpha, 0.5))
# cache-ratio sub-labels below bars: one label under every bar, placed at a
# negative y so it sits under the x-axis line.
for j in range(nC):
    for x in x_centers:
        ax.text(x + offsets[j], -ymax * 0.032, CR_LABELS[j],
                ha="center", va="top", fontsize=7.8,
                color="dimgrey", alpha=max(CR_ALPHAS[j], 0.4))
# Single caption for the sub-label row, drawn once to the left of the first bar.
ax.text(x_centers[0] + offsets[0] - bar_w,
        -ymax * 0.032, "cache\nhit:",
        ha="right", va="top", fontsize=7.5,
        color="dimgrey", style="italic")
# One major tick per prompt-length group, labelled in units of 1024 tokens.
ax.set_xticks(x_centers)
ax.set_xticklabels([f"{N//1024}k" for N in N_TOTALS], fontsize=12)
ax.set_xlabel("Total input tokens (N)", fontsize=12)
ax.set_ylabel("Latency (ms)", fontsize=12)
# The output length in the subtitle comes from OUTPUT_TOKENS, so the title
# stays in step with the decode bar if that constant is changed.
ax.set_title(
    "PD-Disaggregated Request Latency Breakdown\n"
    f"Qwen3-Coder-30B-A3B · H20 · Mooncake RDMA · output={OUTPUT_TOKENS} tokens",
    fontsize=13, fontweight="bold")
# Horizontal guide lines only, drawn behind the bars.
ax.yaxis.grid(True, linestyle="--", alpha=0.35)
ax.set_axisbelow(True)
# ── legend ────────────────────────────────────────────────────────────────────
# Colour swatches for the three stacked phases.
phase_h = [
    mpatches.Patch(color=swatch, label=caption)
    for swatch, caption in (
        (C_PREFILL, "Prefill compute (P node)"),
        (C_TRANSFER, "KV transfer (Mooncake RDMA)"),
        (C_DECODE, "Decode generation (D node)"),
    )
]
# Invisible entry that separates the phase swatches from the cache-ratio ones.
spacer = mpatches.Patch(color="none", label="")
# Grey swatches showing the alpha/hatch encoding of each cache-hit ratio.
cr_h = [
    mpatches.Patch(facecolor="grey", alpha=level,
                   hatch=(pattern or ""),
                   label=f"KV cache hit {pct}")
    for level, pattern, pct in zip(CR_ALPHAS, CR_HATCHES, CR_LABELS)
]
ax.legend(handles=[*phase_h, spacer, *cr_h],
          loc="upper left", fontsize=9, framealpha=0.9,
          ncol=2, columnspacing=1.2, handlelength=1.5)
# Reserve the bottom 5% of the figure for the sub-label row, then write the
# PNG and report where it went.
fig.tight_layout(rect=[0, 0.05, 1, 1])
fig.savefig(OUT, dpi=160, bbox_inches="tight")
print(f"Saved: {OUT}")
# ── print table ──────────────────────────────────────────────────────────────
# Text dump of the plotted numbers (all in ms): one row per
# (prompt length, cache ratio) pair.
print(f"\n{'N':>6} {'cache%':>7} | {'prefill':>8} {'transfer':>9} {'decode':>8} | {'E2E':>8}")
print("-" * 60)
for i, N in enumerate(N_TOTALS):
    for j, cr in enumerate(CACHE_RATIOS):
        pf, tr, dc = pf_mat[i, j], tr_mat[i, j], dec_mat[i, j]
        print(f"{N:>6} {cr*100:>6.0f}% | {pf:>8.0f} {tr:>9.0f} {dc:>8.0f} | {pf+tr+dc:>8.0f}")
    # Blank line between prompt-length groups.
    print()