Anchor experiment for the clean-stack PD comparison using the canonical
cache-aware proxy with --policy lmetric (scripts/bench.sh harness). Two
traces x four arms = eight runs on dash1.
Headline: with the right routing baseline (LMetric), PD-colo holds 100%
completion on both traces while every static PD-disagg ratio fails
(14-65% completion), and the failure mode rotates with the split --
no static partition has a working operating point on this workload.
LMetric improves colo dramatically (TTFT p50 1.0s vs original §3 RR
7.0s; 7x) but does NOT rescue PD-disagg, confirming the bottleneck is
structural (D-pool admission + multi-turn KV accumulation), not routing.
Completion matrix:

                 first600s   full
    colo         100%        100%
    pd6 (6:2)    58.7%       65.3%    (decode-bound)
    pd4 (4:4)    43.1%       43.9%    (both bottlenecks)
    pd2 (2:6)    22.3%       13.9%    (prefill-bound)
The original §3 RR "100% PD completion" appears to be a measurement
artifact of e13391e: producer-KV eviction acted as a relief valve,
letting more requests squeeze under the 600s timeout at the (uncosted)
price of cross-turn re-prefill. With the eviction off, PD-disagg is
worse than §3 advertised, not better.
Artifacts:
analysis/v2/fig4l_lmetric.json -- 8-arm summary data
analysis/v2/PD_DISAGG_LMETRIC.md -- writeup + reproduce recipe
figs/v2/fig4_lmetric_pd_vs_colo.png -- 4-panel comparison figure
microbench/fresh_setup/plot_fig4l_lmetric.py -- plot script
114 lines · 4.5 KiB · Python
"""Render the LMetric PD-colo vs PD-disagg figure on the real agentic trace.

Input  : analysis/v2/fig4l_lmetric.json  (8 runs = 4 arms x 2 traces)
Output : figs/v2/fig4_lmetric_pd_vs_colo.png

Four panels, each showing four arms x two traces:
  (a) completion rate %
  (b) E2E latency (p50 / p90, log scale)
  (c) throughput (output tokens / second)
  (d) bench wall-clock (minutes, log scale)

The thesis the figure visualizes: with LMetric routing,
  - colo (elastic 8-GPU pool) holds 100% completion on both traces
  - every PD-disagg ratio fails (completion 14-65%), and the failure mode
    rotates with the split (pd2 = prefill-bound, pd6 = decode-bound)
  - routing policy does not rescue PD-disagg; the bottleneck is structural.
"""
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
from pathlib import Path
|
||
|
||
import matplotlib
|
||
matplotlib.use("Agg")
|
||
import matplotlib.pyplot as plt
|
||
import numpy as np
|
||
|
||
# Repository root: parents[0] is this script's directory, so parents[2] is two
# levels above it (the repo root when the script lives in
# microbench/fresh_setup/).
ROOT = Path(__file__).resolve().parents[2]

# Summary rows consumed by main() and the PNG it writes.
DATA = ROOT / "analysis" / "v2" / "fig4l_lmetric.json"
OUT = ROOT / "figs" / "v2" / "fig4_lmetric_pd_vs_colo.png"

# Create the output directory up front so savefig() cannot fail on a fresh
# checkout. NOTE: this runs at import time, not only when executed as a script.
OUT.parent.mkdir(parents=True, exist_ok=True)
|
||
|
||
# X-axis order: the colo baseline first, then the disaggregated splits from
# prefill-heavy (6P+2D) to decode-heavy (2P+6D). Per the module docstring the
# failure mode runs the other way: pd6 is decode-bound, pd2 is prefill-bound.
ARMS = ["colo", "6P+2D", "4P+4D", "2P+6D"]

# Trace identifiers as they appear in each row's "trace" field.
TRACES = ["first600s", "full"]

# Legend text per trace.
TRACE_LABEL = {
    "first600s": "first600s (1.35 req/s, high load)",
    "full": "full w600 (0.42 req/s, original §3)",
}

# Bar colour per trace, shared by all four panels.
COLOR = {"first600s": "#1f77b4", "full": "#ff7f0e"}
|
||
|
||
|
||
def pick(rows: list[dict], trace: str, arm: str) -> dict | None:
    """Return the first row matching both *trace* and *arm*, or ``None``.

    Args:
        rows: Sequence of mappings, each carrying ``"trace"`` and ``"arm"``
            keys.
        trace: Value to match against a row's ``"trace"`` field.
        arm: Value to match against a row's ``"arm"`` field.

    Returns:
        The first matching row object itself (not a copy), or ``None`` when
        no row matches.

    Raises:
        KeyError: If a row inspected before a match lacks ``"trace"``, or
            lacks ``"arm"`` once its trace has matched.
    """
    for row in rows:
        if row["trace"] == trace and row["arm"] == arm:
            return row
    return None
|
||
|
||
|
||
def main():
    """Render the four-panel colo-vs-disagg comparison and save it to ``OUT``.

    Loads the summary rows from ``DATA`` and, for every trace in ``TRACES``,
    draws one bar per arm in ``ARMS`` on each of four panels:

    (a) completion percentage, computed as ``n / req * 100``;
    (b) E2E p50 overlaid on p90, log y-axis, with a 600 s reference line;
    (c) throughput, taken from the ``tps`` field;
    (d) wall-clock in minutes (``wall / 60``), log y-axis.

    Each row is expected to provide the keys ``trace``, ``arm``, ``n``,
    ``req``, ``e2e`` (with ``p50`` and ``p90``), ``tps`` and ``wall``.

    Raises:
        LookupError: If ``DATA`` has no row for some (trace, arm) pair.
    """
    # Context manager so the handle is closed deterministically; JSON text is
    # UTF-8, so do not depend on the locale's default encoding.
    with open(DATA, encoding="utf-8") as fh:
        rows = json.load(fh)

    def row(trace, arm):
        """Return the row for (trace, arm), failing with a readable message."""
        found = pick(rows, trace, arm)
        if found is None:
            raise LookupError(
                f"no row for trace={trace!r}, arm={arm!r} in {DATA}"
            )
        return found

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    width = 0.38
    x = np.arange(len(ARMS))

    # (a) completion %
    ax = axes[0, 0]
    for i, tr in enumerate(TRACES):
        runs = [row(tr, a) for a in ARMS]
        vals = [r["n"] / r["req"] * 100 for r in runs]
        # Shift each trace half a bar-width either side of the tick so the
        # two traces sit next to each other.
        pos = x + (i - 0.5) * width
        ax.bar(pos, vals, width, color=COLOR[tr], label=TRACE_LABEL[tr])
        for bx, bv in zip(pos, vals):
            ax.annotate(f"{bv:.0f}%", (bx, bv + 1.5), ha="center", fontsize=8)
    ax.axhline(100, color="grey", ls=":", lw=1)
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("completion (%)")
    ax.set_ylim(0, 115)
    ax.set_title("(a) request completion — colo holds 100%, all PD ratios fail")
    ax.legend(fontsize=8)
    ax.grid(alpha=.3, axis="y")

    # (b) E2E percentiles
    ax = axes[0, 1]
    for i, tr in enumerate(TRACES):
        runs = [row(tr, a) for a in ARMS]
        p50 = [r["e2e"]["p50"] for r in runs]
        p90 = [r["e2e"]["p90"] for r in runs]
        off = (i - 0.5) * width
        # Draw the faded p90 bar first so the solid p50 bar overlays it.
        ax.bar(x + off, p90, width, color=COLOR[tr], alpha=0.55, label=f"{tr} p90")
        ax.bar(x + off, p50, width, color=COLOR[tr], alpha=1.0, label=f"{tr} p50")
    ax.axhline(600, color="red", ls=":", lw=1, label="600 s request timeout")
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("E2E latency (s, log)")
    ax.set_yscale("log")
    ax.set_title("(b) E2E p50 (solid) + p90 (faded) — PD pegs at the timeout")
    ax.legend(fontsize=7, ncol=2)
    ax.grid(alpha=.3, which="both", axis="y")

    # (c) TPS
    ax = axes[1, 0]
    for i, tr in enumerate(TRACES):
        vals = [row(tr, a)["tps"] for a in ARMS]
        pos = x + (i - 0.5) * width
        ax.bar(pos, vals, width, color=COLOR[tr], label=TRACE_LABEL[tr])
        for bx, bv in zip(pos, vals):
            ax.annotate(f"{bv:.0f}", (bx, bv + 4), ha="center", fontsize=8)
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("throughput (output tokens/s)")
    ax.set_title("(c) throughput — PD throughput crashes 5–100×")
    ax.legend(fontsize=8)
    ax.grid(alpha=.3, axis="y")

    # (d) wall (min)
    ax = axes[1, 1]
    for i, tr in enumerate(TRACES):
        vals = [row(tr, a)["wall"] / 60 for a in ARMS]
        pos = x + (i - 0.5) * width
        ax.bar(pos, vals, width, color=COLOR[tr], label=TRACE_LABEL[tr])
        for bx, bv in zip(pos, vals):
            # Multiplicative offset keeps the label gap constant on a log axis.
            ax.annotate(f"{bv:.0f}m", (bx, bv * 1.05), ha="center", fontsize=8)
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("bench wall-clock (min, log)")
    ax.set_yscale("log")
    ax.set_title("(d) wall-clock — PD drain dilates the run")
    ax.legend(fontsize=8)
    ax.grid(alpha=.3, which="both", axis="y")

    fig.suptitle("Fig 4 (LMetric) — PD-colo vs PD-disagg on the real agentic trace "
                 "(`w600_r0.0015_st30`), clean stack, cache-aware LMetric routing",
                 fontsize=12, y=1.0)
    fig.tight_layout()
    fig.savefig(OUT, dpi=130, bbox_inches="tight")
    print(f"wrote {OUT}")
|
||
|
||
|
||
# Render the figure only when run as a script, not when imported.
if __name__ == "__main__":
    main()