Instrumentation patches (microbench/patches/):
- pd_profile.py: shared event emitter (VLLM_PD_PROFILE_LOG env var)
- apply_patches.py: idempotent patch installer for mooncake_connector.py
and scheduler.py; marks each insertion with # PD_PROFILE_PATCH
- analyze_events.py: joins per-process JSONL event logs by transfer_id
into per-request phase durations
Seven events captured per request:
D_get_num_matched → P_zmq_received → P_prefill_done →
P_rdma_start → P_rdma_end → D_recv_complete → D_request_promoted
Driver fix (microbench/lifecycle/driver.py):
seed_prefix_cache now sends via the proxy URL so P and D both cache
the seeded prefix with matching block hashes. Previously seeding D
directly produced different block hashes than the proxy-routed
measurement requests, making incremental transfer impossible.
Real breakdown (fig_breakdown_real.png, server_breakdown.csv, n=93):
prefill_compute 620 ms median (95% of overhead)
rdma_transfer 42 ms median (~71 Gbps effective)
other overhead 10 ms median (dispatch + params + signal + promote)
Mooncake transfer is NOT the bottleneck. Even with bulk RDMA the
transfer cost is <10% of prefill cost for Qwen3-30B-A3B on H20.
163 lines
6.6 KiB
Python
163 lines
6.6 KiB
Python
#!/usr/bin/env python3
"""
Stacked-bar breakdown of PD-sep request latency.

Axes:
  X      : total input length (N_total), grouped by cache hit ratio
  Stacks : prefill compute (red) | KV transfer RDMA (orange) | decode (steelblue)

Measured constants (H20, Qwen3-Coder-30B-A3B, from microbench):
  cold_prefill_ms(n) ≈ 0.072 * n                      (interference D=1 prefill_ttft, n=2k-16k)
  kv_transfer_ms(n)  = 35 + n * 96 KiB * 8 / 25 Gbps  (wire time converted to ms; warm Mooncake RDMA)
  decode_ms          = output_tokens * 7.0 ms/token
"""

from pathlib import Path

import matplotlib
import numpy as np

# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use("Agg")

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

# Output location: <directory of this script>/lifecycle/results/fig_breakdown.png
HERE = Path(__file__).parent
OUT = HERE / "lifecycle" / "results" / "fig_breakdown.png"
# Create the results directory up front so savefig() cannot fail on a missing path.
OUT.parent.mkdir(parents=True, exist_ok=True)

# ── measured constants ───────────────────────────────────────────────────────
# All latencies are in milliseconds; the values feed prefill_ms(), transfer_ms()
# and the constant decode bar.
MS_PER_TOK_COLD = 0.072            # ms / new token (cold prefill, linear regime)
# NOTE(review): factors presumably are K+V (2) x 48 layers x 4 KV heads x 128 head
# dim x 2 bytes/element — confirm against the model config.
KV_BYTES_PER_TOK = 2 * 48 * 4 * 128 * 2   # 98304 B per token (Qwen3-30B-A3B)
RDMA_BW_GBPS = 25                  # effective Mooncake bandwidth (measured)
RDMA_OVERHEAD_MS = 35              # warm-connection fixed overhead (measured)
DECODE_MS_PER_TOK = 7.0            # TPOT baseline p50
OUTPUT_TOKENS = 128                # representative output length for decode bar

def prefill_ms(n_new: int) -> float:
    """Return the modelled cold-prefill latency in milliseconds.

    The model is linear: ``MS_PER_TOK_COLD`` milliseconds per uncached token.

    Args:
        n_new: Number of tokens that must actually be prefilled (not served
            from cache). Values below 1 are clamped to 1, so the result is
            never zero or negative.

    Returns:
        Prefill time in milliseconds.
    """
    billed_tokens = max(1, n_new)
    return MS_PER_TOK_COLD * billed_tokens


def transfer_ms(n_new: int) -> float:
    """Return the modelled KV-cache transfer latency in milliseconds.

    The model is a fixed per-transfer overhead (``RDMA_OVERHEAD_MS``) plus the
    wire time of ``n_new`` tokens' worth of KV data at ``RDMA_BW_GBPS``.

    Args:
        n_new: Number of tokens whose KV entries are transferred. Values
            below 1 are clamped to 1.

    Returns:
        Transfer time in milliseconds.
    """
    payload_bits = KV_BYTES_PER_TOK * max(1, n_new) * 8
    # bits / (bits per second) gives seconds; scale to milliseconds.
    wire_ms = payload_bits / (RDMA_BW_GBPS * 1e9) * 1000
    return RDMA_OVERHEAD_MS + wire_ms


# ── sweep parameters ─────────────────────────────────────────────────────────
N_TOTALS = [1024, 2048, 4096, 8192, 16384, 32768]   # total input tokens per group
CACHE_RATIOS = [0.0, 0.25, 0.50, 0.75]              # fraction of the input served from cache
# Labels are derived from the ratios so the two lists cannot drift apart.
CR_LABELS = [f"{cr:.0%}" for cr in CACHE_RATIOS]
# Per-ratio bar styling; both lists are indexed in parallel with CACHE_RATIOS.
CR_ALPHAS = [1.0, 0.75, 0.50, 0.28]
CR_HATCHES = [None, None, "///", "///"]

# Colours of the three stacked phases.
C_PREFILL = "#d62728"
C_TRANSFER = "#ff7f0e"
C_DECODE = "#1f77b4"

# ── compute matrices ─────────────────────────────────────────────────────────
# One row per total input length, one column per cache-hit ratio; values in ms.
nN, nC = len(N_TOTALS), len(CACHE_RATIOS)
pf_mat = np.zeros((nN, nC))
tr_mat = np.zeros((nN, nC))
# Decode time does not depend on input length or cache ratio, so fill it once.
dec_mat = np.full((nN, nC), DECODE_MS_PER_TOK * OUTPUT_TOKENS)

for i, n_total in enumerate(N_TOTALS):
    for j, cr in enumerate(CACHE_RATIOS):
        # Only the uncached part of the input is prefilled and transferred.
        n_new = max(1, int(n_total * (1 - cr)))
        pf_mat[i, j] = prefill_ms(n_new)
        tr_mat[i, j] = transfer_ms(n_new)

# ── plot ─────────────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(13, 6.5))

bar_w = 0.18
group_gap = 1.0
x_centers = np.arange(nN) * group_gap
# Spread the nC bars of a group symmetrically around the group's tick position.
offsets = np.linspace(-(nC - 1) / 2, (nC - 1) / 2, nC) * bar_w

# Fix the y-range before drawing so every value label is padded against the
# final axis height rather than whatever the autoscaled limit happens to be
# while the bars are still being added.
total_mat = pf_mat + tr_mat + dec_mat
ymax = total_mat.max() * 1.18
ax.set_ylim(0, ymax)
label_pad = ymax * 0.01

for j in range(nC):
    xp = x_centers + offsets[j]
    pf = pf_mat[:, j]
    tr = tr_mat[:, j]
    dc = dec_mat[:, j]
    alpha = CR_ALPHAS[j]
    hatch = CR_HATCHES[j]

    kw = dict(width=bar_w, alpha=alpha, edgecolor="white", linewidth=0.5)
    if hatch:
        kw["hatch"] = hatch

    # Stack order, bottom to top: prefill, KV transfer, decode.
    ax.bar(xp, pf, color=C_PREFILL, **kw)
    ax.bar(xp, tr, bottom=pf, color=C_TRANSFER, **kw)
    ax.bar(xp, dc, bottom=pf + tr, color=C_DECODE, **kw)

    # End-to-end value label on top of each stack (seconds from 1000 ms up).
    for xpos, total in zip(xp, total_mat[:, j]):
        s = f"{total/1000:.1f}s" if total >= 1000 else f"{total:.0f}ms"
        ax.text(xpos, total + label_pad, s,
                ha="center", va="bottom",
                fontsize=7.2, color="black", alpha=max(alpha, 0.5))

# cache-ratio sub-labels below bars
# Placed at a small negative y (data coordinates), i.e. just under the x axis.
sublabel_y = -ymax * 0.032
for j in range(nC):
    for x in x_centers:
        ax.text(x + offsets[j], sublabel_y, CR_LABELS[j],
                ha="center", va="top", fontsize=7.8,
                color="dimgrey", alpha=max(CR_ALPHAS[j], 0.4))

# Caption for the sub-label row, to the left of the first bar.
ax.text(x_centers[0] + offsets[0] - bar_w,
        sublabel_y, "cache\nhit:",
        ha="right", va="top", fontsize=7.5,
        color="dimgrey", style="italic")

# Axis ticks, labels, title and grid.
ax.set_xticks(x_centers)
ax.set_xticklabels([f"{n // 1024}k" for n in N_TOTALS], fontsize=12)
ax.set_xlabel("Total input tokens (N)", fontsize=12)
ax.set_ylabel("Latency (ms)", fontsize=12)
ax.set_title(
    "PD-Disaggregated Request Latency Breakdown\n"
    # Take the output length from the constant that sizes the decode bar.
    f"Qwen3-Coder-30B-A3B · H20 · Mooncake RDMA · output={OUTPUT_TOKENS} tokens",
    fontsize=13, fontweight="bold")
ax.yaxis.grid(True, linestyle="--", alpha=0.35)
# Draw the grid lines behind the bars.
ax.set_axisbelow(True)

# ── legend ────────────────────────────────────────────────────────────────────
# Colour entries identify the phase; grey entries identify the cache-hit ratio
# through the same alpha/hatch styling the bars use.
phase_h = [
    mpatches.Patch(color=C_PREFILL, label="Prefill compute (P node)"),
    mpatches.Patch(color=C_TRANSFER, label="KV transfer (Mooncake RDMA)"),
    mpatches.Patch(color=C_DECODE, label="Decode generation (D node)"),
]
# Invisible entry that separates the two groups of handles.
spacer = mpatches.Patch(color="none", label="")
cr_h = [
    mpatches.Patch(facecolor="grey", alpha=CR_ALPHAS[j],
                   hatch=(CR_HATCHES[j] or ""),
                   label=f"KV cache hit {CR_LABELS[j]}")
    for j in range(nC)
]
ax.legend(handles=phase_h + [spacer] + cr_h,
          loc="upper left", fontsize=9, framealpha=0.9,
          ncol=2, columnspacing=1.2, handlelength=1.5)

# Reserve the bottom 5% of the figure for the cache-hit sub-labels under the axis.
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.savefig(OUT, dpi=160, bbox_inches="tight")
# The figure is on disk; release it.
plt.close(fig)
print(f"Saved: {OUT}")

# ── print table ──────────────────────────────────────────────────────────────
# Same numbers as the figure, one row per (N, cache ratio), all values in ms.
print(f"\n{'N':>6} {'cache%':>7} | {'prefill':>8} {'transfer':>9} {'decode':>8} | {'E2E':>8}")
print("-" * 60)
for i, N in enumerate(N_TOTALS):
    for j, cr in enumerate(CACHE_RATIOS):
        pf, tr, dc = pf_mat[i, j], tr_mat[i, j], dec_mat[i, j]
        print(f"{N:>6} {cr*100:>6.0f}% | {pf:>8.0f} {tr:>9.0f} {dc:>8.0f} | {pf+tr+dc:>8.0f}")
    # Blank line between input-length groups.
    print()