Files
Gahow Wang dc6d24d1ca Add NIXL substrate isolation control + attribution decomposition
Adds unified_nixl_both to elastic_migration_v2: same picker as
unified_kv_both (never triggers PD-sep), but launches vLLM with
NixlConnector instead of MooncakeConnector. Compared against plain
unified and unified_kv_both (Mooncake) we can now attribute the
substrate overhead between "v1 connector framework irreducible
cost" (proxied by the leaner NIXL) and "Mooncake implementation
extra" (Mooncake - NIXL).

Result (vs plain unified, both substrates never PD-sep):

   metric          plain    NIXL          Mooncake
   TTFT p90        7.35s    +37.9%        +45.3%      (NIXL: +7pp better)
   TPOT p90        17.1ms   +15.5%        +24.5%      (NIXL: +9pp better)
   E2E p90         18.03s   +17.4%        +27.0%      (NIXL: +10pp better)
   hotspot         3.667    +0.2%         +19.0%      (NIXL: keeps it flat)
   APC             79.4%    -0.3pp        -1.1pp
   interference    -        5.58          8.57         (NIXL: ~35% lower)

The cleanest signal is hotspot: NIXL preserves plain-unified's
distribution (3.674 vs 3.667), while Mooncake's per-scheduler-step
O(|cache|) `set(self._block_pool.cache.keys())` diff against
_known_hash_keys (mooncake_connector.py:432-456) inflates routing
imbalance by 19%. The hash sync runs unconditionally even when no
direct_read consumer is present.

Attribution: (NIXL - plain) ~= v1 framework irreducible cost
(kv_buffer GPU memory, per-step SchedulerOutput.kv_connector_metadata
round-trip, altered kv_cache_manager block-lifecycle).
(Mooncake - NIXL) ~= Mooncake-specific overhead (the hash-sync loop
and stricter delay_free semantics).

Practical implication: NIXL is meaningfully better than Mooncake on
this stack, but even NIXL imposes a 16-38% overhead on the p90 latency
metrics (TPOT, E2E, TTFT) — too expensive for selective-PD-sep on
agentic workloads where the trigger rate is < 0.5%.

Launch fixes required for NIXL multi-instance:
- VLLM_NIXL_SIDE_CHANNEL_PORT must be unique per instance (default
  5600; we use 5600+i). Without this, 7 of 8 instances hit
  `zmq.error.ZMQError: Address already in use` and hang silently until
  the launcher trap kills all of them at health-check timeout.
- Health-check timeout raised from 180s to 360s; NIXL initialization
  (UCX agent + memory registration) is ~100-150s per instance under
  8-way concurrent load, vs Mooncake's ~30-60s.

New figure: fig_connector_substrate_attribution.png stacks plain /
framework / Mooncake-extra / v2-branch overhead per metric.
Existing figures (fig_kv_both_overhead, fig_three_way_hotspot)
updated to include NIXL as a fourth bar.

README updated with 4-way table, Result 1 reframed as "the cost is
mostly framework, not Mooncake — but Mooncake adds the hotspot
penalty", and the substrate-vs-PD-sep tradeoff math.

Refs: nixl_connector.py:700 handshake listener bind, factory.py
register_connector for the NixlConnector entry.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-26 16:02:12 +08:00

301 lines
12 KiB
Python

"""Render PNG figures for the elastic_migration_v2 section.
Inputs in ./data/ :
- b3_policy_comparison.json
- breakdown_unified.json, breakdown_unified_kv_both.json,
breakdown_unified_v2.json, breakdown_unified_v2_strict.json
- per_worker_<policy>.json for each of the four
Outputs in ./figures/ :
- fig_kv_both_overhead.png — four-way latency bars (plain vs kv_both vs nixl_both vs v2)
- fig_v2_trigger_funnel.png — request count per fall-through reason
- fig_v2_predicted_vs_actual.png — cost-model migrate prediction vs realized TTFT
- fig_three_way_hotspot.png — per-worker TTFT p90 grouped bars
- fig_connector_substrate_attribution.png — stacked plain / v1-framework / Mooncake-extra / v2-branch overhead per metric
"""
from __future__ import annotations
import json
from collections import Counter
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Directory containing this script; inputs and outputs are resolved relative to it.
ROOT = Path(__file__).parent
# JSON inputs are read from ./data/ (see _load).
DATA = ROOT / "data"
# Rendered PNG figures are written to ./figures/.
OUT = ROOT / "figures"
# Import-time side effect: create the output directory (and any missing parents).
OUT.mkdir(parents=True, exist_ok=True)
def _load(name: str):
    """Parse the JSON file ``DATA / name`` and return the decoded object.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so non-ASCII content loads the same way on every host.

    Raises:
        FileNotFoundError: if the file does not exist under ``DATA``.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    return json.loads((DATA / name).read_text(encoding="utf-8"))
# Bar colour per routing policy, shared by every figure so a policy keeps the
# same colour across plots.
POLICY_COLORS = {
    "unified": "#2ca02c",            # labelled "plain" in the figures
    "unified_kv_both": "#9467bd",    # Mooncake substrate
    "unified_nixl_both": "#1f77b4",  # NIXL substrate
    "unified_v2": "#d62728",         # v2.1 relaxed
    "unified_v2_strict": "#ff7f0e",  # v2.0 strict
}
def fig_kv_both_overhead():
    """Render ``fig_kv_both_overhead.png``: one bar panel per latency metric.

    Reads ``b3_policy_comparison.json`` and draws, for each of four metrics,
    one bar per policy (plain, kv_both, nixl_both, v2). Each bar carries its
    value on top and, for every non-baseline policy, the percentage change
    relative to the first (plain) bar at mid-height.
    """
    rows_by_policy = {
        row["policy"]: row
        for row in _load("b3_policy_comparison.json")["rows"]
    }
    policies = ["unified", "unified_kv_both", "unified_nixl_both", "unified_v2"]
    # (panel title, JSON key, unit multiplier) — TPOT is stored in seconds
    # and shown in milliseconds.
    panels = [
        ("TTFT p90 (s)", "ttft_p90_s", 1),
        ("TPOT p90 (ms)", "tpot_p90_s", 1000),
        ("E2E p90 (s)", "e2e_p90_s", 1),
        ("hotspot index", "hotspot_index_ttft_p90", 1),
    ]
    # The first policy is the baseline and is always labelled "plain".
    tick_names = ["plain"] + [
        name.replace("unified_", "") for name in policies[1:]
    ]
    bar_colors = [POLICY_COLORS[name] for name in policies]

    fig, axes = plt.subplots(1, 4, figsize=(15, 4.2))
    for ax, (title, key, factor) in zip(axes, panels):
        heights = [rows_by_policy[name][key] * factor for name in policies]
        rects = ax.bar(tick_names, heights, color=bar_colors,
                       edgecolor="black", linewidth=0.5)
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=15, labelsize=9)

        # Value label on top of every bar; drop decimals once values hit 100.
        for rect, height in zip(rects, heights):
            if height < 100:
                caption = f"{height:.2f}"
            else:
                caption = f"{height:.0f}"
            ax.text(rect.get_x() + rect.get_width() / 2, height, caption,
                    ha="center", va="bottom", fontsize=9)
        ax.grid(alpha=0.3, axis="y")

        # Relative change vs. the plain baseline, drawn at half bar height.
        baseline = heights[0]
        for pos, height in enumerate(heights[1:], start=1):
            pct = (height - baseline) / baseline * 100
            tint = "darkred" if pct > 0 else "darkgreen"
            ax.text(pos, height * 0.5, f"{pct:+.0f}%", ha="center",
                    fontsize=10, fontweight="bold", color=tint)

    fig.suptitle(
        "Mooncake substrate adds 19-45% across metrics; NIXL is 5-19pp better but\n"
        "still 16-38% above plain. v2's 5 PD-sep events don't recover the substrate tax."
    )
    fig.tight_layout()
    fig.savefig(OUT / "fig_kv_both_overhead.png", dpi=120)
    plt.close(fig)
def _bucket_reasons(data):
"""Collapse v2_reason strings into the funnel buckets."""
buckets = Counter()
for r in data:
if r.get("v2_pd_sep") is True:
buckets["PD-sep TRIGGERED"] += 1
continue
reason = (r.get("v2_reason") or "no_v2_reason").split(" (")[0]
if reason.startswith("local_cost"):
reason = "cost_benefit not enough margin"
buckets[reason] += 1
return buckets
def _pd_sep_summary(buckets):
    """Format the legend fragment ``PD-sep=<hits>/<total> = <pct>%``.

    Reports 0.00% for an empty breakdown instead of dividing by zero.
    """
    hits = buckets.get("PD-sep TRIGGERED", 0)
    total = sum(buckets.values())
    share = hits * 100 / total if total else 0.0
    return f"PD-sep={hits}/{total} = {share:.2f}%"


def fig_v2_trigger_funnel():
    """Render ``fig_v2_trigger_funnel.png``: request count per fall-through reason.

    Compares the strict (``breakdown_unified_v2_strict.json``) and relaxed
    (``breakdown_unified_v2.json``) runs as side-by-side bars on a log-scaled
    y axis. Buckets follow ``order``; a bucket appears only if at least one
    run has it. Requests whose reason is not in ``order`` are summed into a
    trailing "other" bar so the bars account for the same population as the
    legend totals.
    """
    strict = _load("breakdown_unified_v2_strict.json")
    relaxed = _load("breakdown_unified_v2.json")
    bs = _bucket_reasons(strict)
    br = _bucket_reasons(relaxed)
    order = [
        "new_local_below_threshold",
        "chosen_no_active_decode",
        "chosen_few_decodes",
        "src_cache_below_threshold",
        "src_not_meaningfully_more_cache",
        "cost_benefit not enough margin",
        "PD-sep TRIGGERED",
    ]
    labels = [k for k in order if k in bs or k in br]
    strict_vals = [bs.get(k, 0) for k in labels]
    relaxed_vals = [br.get(k, 0) for k in labels]

    # Reasons missing from `order` (e.g. the "no_v2_reason" fallback) used to
    # vanish from the chart while still counting in the legend denominator.
    listed = set(order)
    other_strict = sum(n for k, n in bs.items() if k not in listed)
    other_relaxed = sum(n for k, n in br.items() if k not in listed)
    if other_strict or other_relaxed:
        labels.append("other")
        strict_vals.append(other_strict)
        relaxed_vals.append(other_relaxed)

    x = range(len(labels))
    width = 0.4
    fig, ax = plt.subplots(figsize=(11, 5))
    ax.bar([i - width / 2 for i in x], strict_vals, width,
           label=f"v2.0 strict ({_pd_sep_summary(bs)})",
           color="#ff7f0e", edgecolor="black", linewidth=0.5)
    ax.bar([i + width / 2 for i in x], relaxed_vals, width,
           label=f"v2.1 relaxed ({_pd_sep_summary(br)})",
           color="#d62728", edgecolor="black", linewidth=0.5)
    ax.set_xticks(list(x))
    ax.set_xticklabels(labels, rotation=20, ha="right", fontsize=9)
    ax.set_ylabel("request count")
    ax.set_yscale("log")
    ax.set_title(
        "Why v2 rarely PD-seps: 88-76% of requests have new_local < threshold\n"
        "(intra-session cache already hot). Relaxing thresholds barely helps."
    )
    ax.legend()
    ax.grid(alpha=0.3, axis="y", which="both")
    # Count labels just above each non-empty bar (x1.05 is a small offset on
    # the log axis).
    for i, (s, r) in enumerate(zip(strict_vals, relaxed_vals)):
        if s > 0:
            ax.text(i - width / 2, s * 1.05, str(s), ha="center", fontsize=8)
        if r > 0:
            ax.text(i + width / 2, r * 1.05, str(r), ha="center", fontsize=8)
    fig.tight_layout()
    fig.savefig(OUT / "fig_v2_trigger_funnel.png", dpi=120)
    plt.close(fig)
def fig_v2_predicted_vs_actual():
    """Render ``fig_v2_predicted_vs_actual.png``: predicted vs realized cost.

    For each PD-sep'd request in ``breakdown_unified_v2.json``, plot the
    model-predicted migrate cost (``v2_cost_migrate_s``) against the realized
    TTFT (``t_first_token - t_proxy_recv``) on log-log axes. Points should sit
    near y=x if the model is calibrated; they sit far above if the mechanism
    is more expensive than modeled.

    Returns without writing a figure when no request triggered PD-sep, or
    when none of the triggered requests carries all three required fields.
    """
    relaxed = _load("breakdown_unified_v2.json")
    predicted = []
    actual = []
    sizes = []
    for r in relaxed:
        if r.get("v2_pd_sep") is not True:
            continue
        cm = r.get("v2_cost_migrate_s")
        t0 = r.get("t_proxy_recv")
        t_first = r.get("t_first_token")
        if cm is None or t0 is None or t_first is None:
            continue
        predicted.append(cm)
        actual.append(t_first - t0)
        sizes.append(r.get("input_length", 0))
    # Nothing plottable: bail out before max(actual) fails on an empty list.
    if not actual:
        return

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.scatter(predicted, actual,
               s=[max(100, sz / 100) for sz in sizes],
               color="#d62728", edgecolors="black", alpha=0.75)
    for p, a, sz in zip(predicted, actual, sizes):
        ax.annotate(f"input={sz}",
                    (p, a), xytext=(8, 6), textcoords="offset points",
                    fontsize=9)
    # y=x reference + 10x line + 20x line
    lo = 0.5
    hi = max(50, max(actual) * 1.2)
    ax.plot([lo, hi], [lo, hi], "k--", alpha=0.5, label="y = x (calibrated)")
    ax.plot([lo, hi], [lo * 10, hi * 10], color="gray", linestyle=":",
            alpha=0.4, label="10x")
    ax.plot([lo, hi], [lo * 20, hi * 20], color="lightgray", linestyle=":",
            alpha=0.4, label="20x")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlim(lo, hi)
    ax.set_ylim(lo, hi)
    ax.set_xlabel("Cost model: predicted migrate cost (s)")
    ax.set_ylabel("Realized TTFT (s)")
    ax.set_title(
        "All 5 PD-sep triggered requests in v2.1 sit far above y=x.\n"
        "Real transfer cost ~10-20x what the calibrated model predicted."
    )
    ax.grid(alpha=0.3, which="both")
    ax.legend(loc="lower right")
    fig.tight_layout()
    fig.savefig(OUT / "fig_v2_predicted_vs_actual.png", dpi=120)
    plt.close(fig)
def fig_three_way_hotspot():
    """Render ``fig_three_way_hotspot.png``: per-worker TTFT p90 grouped bars.

    Loads ``per_worker_<policy>.json`` for each policy and draws one group of
    bars per worker (workers taken, sorted, from the "unified" file), one bar
    per policy within the group. Each legend entry carries that policy's
    ``hotspot_index_ttft_p90``.
    """
    policies = ["unified", "unified_kv_both", "unified_nixl_both", "unified_v2"]
    stats = {name: _load(f"per_worker_{name}.json") for name in policies}
    worker_urls = sorted(stats["unified"]["per_worker_ttft_p90_s"])
    slots = range(len(worker_urls))
    group_size = len(policies)
    bar_width = 0.85 / group_size

    fig, ax = plt.subplots(figsize=(12, 5))
    for idx, name in enumerate(policies):
        p90_by_worker = stats[name]["per_worker_ttft_p90_s"]
        heights = [p90_by_worker[url] for url in worker_urls]
        # Centre the group of bars on the worker's integer tick position.
        shift = (idx - (group_size - 1) / 2) * bar_width
        if name == "unified":
            legend_name = "plain"
        else:
            legend_name = name.replace("unified_", "")
        hotspot = stats[name]["hotspot_index_ttft_p90"]
        ax.bar([slot + shift for slot in slots], heights, bar_width,
               label=f"{legend_name} (hotspot={hotspot:.2f})",
               color=POLICY_COLORS[name], edgecolor="black", linewidth=0.4)

    # Shorten worker URLs to just ":<port>" for the tick labels.
    tick_names = [url.replace("http://127.0.0.1:", ":") for url in worker_urls]
    ax.set_xticks(list(slots))
    ax.set_xticklabels(tick_names, rotation=0, fontsize=9)
    ax.set_ylabel("worker TTFT p90 (s)")
    ax.set_title(
        "Per-worker TTFT p90 distribution across substrates. Mooncake (kv_both)\n"
        "amplifies the hot worker (hotspot 4.36); NIXL keeps it close to plain (3.67)."
    )
    ax.legend(loc="upper left", fontsize=9)
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(OUT / "fig_three_way_hotspot.png", dpi=120)
    plt.close(fig)
def fig_connector_substrate_attribution():
    """Render ``fig_connector_substrate_attribution.png``: stacked overhead.

    Decomposes overhead into v1-framework cost (shared by all connectors,
    proxied by NIXL since it's the leanest) and Mooncake-specific cost.
    For each metric in ``b3_policy_comparison.json`` a single stacked bar is
    drawn from four segments:

    - plain unified value
    - NIXL - plain (v1 framework cost)
    - Mooncake - NIXL (Mooncake-specific extra)
    - v2 - Mooncake (PD-sep branch extra)

    Every delta is labelled with an explicit sign, so a negative segment
    reads "-0.01" rather than "+-0.01".
    """
    comp = _load("b3_policy_comparison.json")
    by = {r["policy"]: r for r in comp["rows"]}
    metrics = [
        ("TTFT p90 (s)", "ttft_p90_s", False),
        ("TPOT p90 (ms)", "tpot_p90_s", True),
        ("E2E p90 (s)", "e2e_p90_s", False),
        ("hotspot index", "hotspot_index_ttft_p90", False),
    ]
    fig, axes = plt.subplots(1, 4, figsize=(15, 4))
    for ax, (label, key, scale_ms) in zip(axes, metrics):
        scale = 1000 if scale_ms else 1
        plain = by["unified"][key] * scale
        nixl = by["unified_nixl_both"][key] * scale
        moon = by["unified_kv_both"][key] * scale
        v2 = by["unified_v2"][key] * scale
        framework_cost = nixl - plain   # what NIXL adds = v1 framework cost
        mooncake_extra = moon - nixl    # extra on top from Mooncake
        v2_branch_extra = v2 - moon     # extra from PD-sep branch (Mooncake + 5 events)
        # (segment height, colour, legend text), stacked bottom-up.
        segments = [
            (plain, "#cccccc", f"plain unified ({plain:.2f})"),
            (framework_cost, "#1f77b4",
             f"v1 framework ({framework_cost:+.2f})"),
            (mooncake_extra, "#9467bd",
             f"Mooncake extra ({mooncake_extra:+.2f})"),
            (v2_branch_extra, "#d62728",
             f"v2 PD-sep branch ({v2_branch_extra:+.2f})"),
        ]
        bottom = 0
        for height, color, legend in segments:
            ax.bar(["overhead"], [height], bottom=[bottom],
                   color=color, edgecolor="black", linewidth=0.4,
                   label=legend)
            bottom += height
        ax.set_title(label)
        ax.legend(fontsize=8, loc="upper right")
        ax.grid(alpha=0.3, axis="y")
        ax.tick_params(axis="x", labelbottom=False)
    fig.suptitle(
        "Attribution: plain unified vs NIXL substrate vs Mooncake substrate vs v2.\n"
        "Blue: cost shared by any v1 connector. Purple: cost specific to Mooncake."
    )
    fig.tight_layout()
    fig.savefig(OUT / "fig_connector_substrate_attribution.png", dpi=120)
    plt.close(fig)
def main():
    """Render every figure in a fixed order, then report the output directory."""
    renderers = (
        fig_kv_both_overhead,
        fig_v2_trigger_funnel,
        fig_v2_predicted_vs_actual,
        fig_three_way_hotspot,
        fig_connector_substrate_attribution,
    )
    for render in renderers:
        render()
    print(f"wrote 5 figures to {OUT}")


if __name__ == "__main__":
    main()