Files
agentic-kvc/microbench/fresh_setup/plot_fig4_linear_vs_lmetric.py
Gahow Wang 32f7f55990 v2: linear (default cache-aware) baseline + 2x wall-cap on first600s
Follow-up to the LMetric sweep: rerun with --policy linear (cache-aware
load + sticky session affinity, the cache_aware_proxy default) and cap
each PD-disagg arm at 2x the colo bench wall (SIGTERM bench.sh once cap
is exceeded; the cleanup trap clears vLLM and proxy; capped runs lack
metrics.summary.json so the analysis script computes from raw
metrics.jsonl).

Headline: the success-rate ceiling is policy-invariant.

  arm        linear (capped at 2x)    lmetric (uncapped)
  colo       807/807 = 100%, 964s     807/807 = 100%, 1021s
  pd6 (6:2)  472/807 =  58%, 2280s ⊗  474/807 =  59%, 3325s
  pd4 (4:4)  349/807 =  43%, 2281s ⊗  348/807 =  43%, 6850s
  pd2 (2:6)  176/807 =  22%, 2280s ⊗  180/807 =  22%, 19275s

Routing affects only how much wall is wasted timing out unreachable
requests at 600s each. Linear hits the same ceiling in 2280s as
LMetric does in 3300-19000s. This *strengthens* the §5 D-pool
capacity-ceiling thesis -- the cap is structural, not a routing
artifact.

Artifacts:
  analysis/v2/fig4r_linear.json          -- 4-arm linear summary
  analysis/v2/PD_DISAGG_LMETRIC.md       -- extended with wall-cap section
  figs/v2/fig4_linear_vs_lmetric.png     -- 3-panel side-by-side comparison
  microbench/fresh_setup/plot_fig4_linear_vs_lmetric.py
2026-06-01 00:55:40 +08:00

105 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Linear vs LMetric routing on the real agentic trace (first600s).
Visualizes the wall-cap finding: with the 2x-colo-wall cap on PD-disagg arms,
linear and LMetric reach the *same* success-rate ceiling -- the static P:D
split has a structural completion ceiling that does not depend on the routing
policy or on how long you keep retrying. Routing affects only how much wall
time is wasted on requests that will never succeed.
Inputs : analysis/v2/fig4l_lmetric.json  (8 arms, both traces; we use first600s)
         analysis/v2/fig4r_linear.json   (4 arms, first600s, PD wall-capped)
Output : figs/v2/fig4_linear_vs_lmetric.png
"""
from __future__ import annotations
import json
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
# Directory two levels above the folder that contains this script; the input
# and output paths below are resolved relative to it.
ROOT = Path(__file__).resolve().parents[2]
# Folder holding the per-policy JSON summaries read by main().
DATA = ROOT.joinpath("analysis", "v2")
# Destination of the rendered figure.
OUT = ROOT.joinpath("figs", "v2", "fig4_linear_vs_lmetric.png")

# Arm names in the left-to-right order used on every panel's x axis.
ARMS = [
    "colo",
    "6P+2D",
    "4P+4D",
    "2P+6D",
]

# Bar colour per routing policy.
POLICY_COLOR = dict(
    linear="#9467bd",
    lmetric="#2ca02c",
)

# Legend text per routing policy.
POLICY_LABEL = dict(
    linear="linear (cache-aware + session-affinity)",
    lmetric="LMetric (P_tokens × BS)",
)
def pick(rows, arm, trace="first600s"):
    """Return the first row whose ``"arm"`` and ``"trace"`` match, else ``None``.

    ``rows`` is an iterable of mappings; each one is indexed with ``"arm"``
    and (only if the arm matches) ``"trace"``.
    """
    matching = (
        row for row in rows
        if row["arm"] == arm and row["trace"] == trace
    )
    return next(matching, None)
def _rows_by_arm(rows, source):
    """Map every arm in ``ARMS`` to its ``first600s`` row from ``rows``.

    ``source`` is only used to name the offending input in the error message.

    Raises:
        ValueError: if any arm has no matching row. Failing here avoids an
            opaque ``TypeError`` from subscripting ``None`` later on.
    """
    by_arm = {arm: pick(rows, arm) for arm in ARMS}
    missing = [arm for arm, row in by_arm.items() if row is None]
    if missing:
        raise ValueError(f"{source}: no first600s row for arm(s) {missing}")
    return by_arm


def main():
    """Render the three-panel linear-vs-LMetric figure and save it to ``OUT``.

    Reads ``fig4r_linear.json`` and ``fig4l_lmetric.json`` from ``DATA`` and
    draws, for each arm in ``ARMS`` and each policy:
      (a) ``n / req`` as a percentage,
      (b) ``wall`` on a log axis, with a dashed line at 2x the linear colo wall,
      (c) ``n / wall * 60``.

    Raises:
        ValueError: if either input lacks a ``first600s`` row for some arm.
    """
    lin_path = DATA / "fig4r_linear.json"
    lme_path = DATA / "fig4l_lmetric.json"
    # Context managers close the files promptly; the encoding is pinned so the
    # result does not depend on the platform locale.
    with open(lin_path, encoding="utf-8") as fh:
        lin = json.load(fh)
    with open(lme_path, encoding="utf-8") as fh:
        lme = json.load(fh)

    # Look each arm up once per policy instead of re-scanning for every field.
    lin_rows = _rows_by_arm(lin, lin_path.name)
    lme_rows = _rows_by_arm(lme, lme_path.name)
    policies = [("linear", lin_rows), ("lmetric", lme_rows)]

    # colo wall (linear) sets the 2x cap reference
    cap = 2 * lin_rows["colo"]["wall"]

    fig, axes = plt.subplots(1, 3, figsize=(15, 4.5))
    x = np.arange(len(ARMS))
    w = 0.38

    # (a) success rate
    # presumably "n" = successful requests and "req" = requests in the trace
    # (inferred from the axis label) -- confirm against the JSON producer.
    ax = axes[0]
    for i, (pol, by_arm) in enumerate(policies):
        xs = x + (i - 0.5) * w  # the two policies sit side by side around each tick
        vals = [by_arm[a]["n"] / by_arm[a]["req"] * 100 for a in ARMS]
        ax.bar(xs, vals, w, color=POLICY_COLOR[pol], label=POLICY_LABEL[pol])
        for bx, bv in zip(xs, vals):
            ax.annotate(f"{bv:.0f}%", (bx, bv + 1.5), ha="center", fontsize=8)
    ax.axhline(100, color="grey", ls=":", lw=1)
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("success rate (% of trace)")
    ax.set_ylim(0, 115)
    ax.set_title("(a) success ceiling is policy-invariant")
    ax.legend(fontsize=8, loc="upper right")
    ax.grid(alpha=.3, axis="y")

    # (b) wall (log y) with cap line
    ax = axes[1]
    for i, (pol, by_arm) in enumerate(policies):
        xs = x + (i - 0.5) * w
        arm_rows = [by_arm[a] for a in ARMS]
        vals = [row["wall"] for row in arm_rows]
        ax.bar(xs, vals, w, color=POLICY_COLOR[pol], label=POLICY_LABEL[pol])
        for bx, bv, row in zip(xs, vals, arm_rows):
            # Flag rows with a truthy "capped" field using the symbol the panel
            # title explains; both branches used to be "", so it never showed.
            mark = " ⊗" if row.get("capped") else ""
            # Multiplicative offset keeps the label gap constant on a log axis.
            ax.annotate(f"{bv:.0f}s{mark}", (bx, bv * 1.05), ha="center", fontsize=7)
    ax.axhline(cap, color="red", ls="--", lw=1.5,
               label=f"2× colo wall cap = {cap:.0f}s")
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("wall-clock (s, log)")
    ax.set_yscale("log")
    ax.set_title("(b) linear w/ cap vs lmetric w/o cap — ⊗ = cap-killed")
    ax.legend(fontsize=8, loc="upper left")
    ax.grid(alpha=.3, which="both", axis="y")

    # (c) goodput per minute of wall (n / wall × 60)
    ax = axes[2]
    for i, (pol, by_arm) in enumerate(policies):
        xs = x + (i - 0.5) * w
        vals = [by_arm[a]["n"] / by_arm[a]["wall"] * 60 for a in ARMS]
        ax.bar(xs, vals, w, color=POLICY_COLOR[pol], label=POLICY_LABEL[pol])
        lift = max(vals) * 0.02  # label gap scaled to this policy's tallest bar
        for bx, bv in zip(xs, vals):
            ax.annotate(f"{bv:.1f}", (bx, bv + lift), ha="center", fontsize=8)
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("goodput (successful req / min)")
    # NOTE(review): "1.517×" is hard-coded, not derived from the plotted data,
    # and may be a garbled range such as "1.5–17×" -- confirm with the author.
    ax.set_title("(c) linear+cap is 1.517× more wall-efficient on PD")
    ax.legend(fontsize=8, loc="upper right")
    ax.grid(alpha=.3, axis="y")

    fig.suptitle("Fig 4r — Linear vs LMetric on the real agentic trace (first600s, "
                 "PD-disagg wall-capped at 2× colo)",
                 fontsize=12, y=1.0)
    fig.tight_layout()
    # savefig does not create directories; make sure figs/v2 exists first.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUT, dpi=130, bbox_inches="tight")
    print(f"wrote {OUT}")
if __name__ == "__main__":
main()