Follow-up to the LMetric sweep: rerun with --policy linear (cache-aware load + sticky session affinity, the cache_aware_proxy default) and cap each PD-disagg arm at 2x the colo bench wall. bench.sh is sent SIGTERM once the cap is exceeded, and its cleanup trap clears vLLM and the proxy. Capped runs lack metrics.summary.json, so the analysis script computes from the raw metrics.jsonl.

Headline: the success-rate ceiling is policy-invariant.

| arm       | linear (capped at 2x)    | lmetric (uncapped)     |
|-----------|--------------------------|------------------------|
| colo      | 807/807 = 100%, 964s     | 807/807 = 100%, 1021s  |
| pd6 (6:2) | 472/807 = 58%, 2280s ⊗   | 474/807 = 59%, 3325s   |
| pd4 (4:4) | 349/807 = 43%, 2281s ⊗   | 348/807 = 43%, 6850s   |
| pd2 (2:6) | 176/807 = 22%, 2280s ⊗   | 180/807 = 22%, 19275s  |

(⊗ = run was killed at the wall cap.)

Routing affects only how much wall time is wasted timing out unreachable requests at 600s each. Linear hits the same ceiling in 2280s that LMetric reaches in 3300-19000s. This *strengthens* the §5 D-pool capacity-ceiling thesis -- the cap is structural, not a routing artifact.

Artifacts:
- analysis/v2/fig4r_linear.json -- 4-arm linear summary
- analysis/v2/PD_DISAGG_LMETRIC.md -- extended with wall-cap section
- figs/v2/fig4_linear_vs_lmetric.png -- 3-panel side-by-side comparison
- microbench/fresh_setup/plot_fig4_linear_vs_lmetric.py
105 lines
4.3 KiB
Python
105 lines
4.3 KiB
Python
"""Linear vs LMetric routing on the real agentic trace (first600s).
|
||
|
||
Visualizes the wall-cap finding: with the 2x-colo-wall cap on PD-disagg arms,
|
||
linear and LMetric reach the *same* success-rate ceiling -- the static P:D
|
||
split has a structural completion ceiling that does not depend on the routing
|
||
policy or on how long you keep retrying. Routing affects only how much wall
|
||
time is wasted on requests that will never succeed.
|
||
|
||
Inputs : analysis/v2/fig4l_lmetric.json (8 arms, both traces; we use first600s)
|
||
analysis/v2/fig4r_linear.json (4 arms, first600s, PD wall-capped)
|
||
Output : figs/v2/fig4_linear_vs_lmetric.png
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
from pathlib import Path
|
||
|
||
import matplotlib
|
||
matplotlib.use("Agg")
|
||
import matplotlib.pyplot as plt
|
||
import numpy as np
|
||
|
||
# Project root, located two directory levels above the folder holding this
# script; all input and output paths below are derived from it.
ROOT = Path(__file__).resolve().parents[2]
# Directory holding the JSON summaries read by main().
DATA = ROOT / "analysis" / "v2"
# Destination of the rendered comparison figure.
OUT = ROOT / "figs" / "v2" / "fig4_linear_vs_lmetric.png"
|
||
|
||
# Arm names, in the left-to-right order they appear on every x axis.
ARMS = ["colo", "6P+2D", "4P+4D", "2P+6D"]
# Bar colour per routing policy; keys must match those of POLICY_LABEL.
POLICY_COLOR = {"linear": "#9467bd", "lmetric": "#2ca02c"}
# Legend text per routing policy.
POLICY_LABEL = {
    "linear": "linear (cache-aware + session-affinity)",
    "lmetric": "LMetric (P_tokens × BS)",
}
|
||
|
||
|
||
def pick(rows, arm, trace="first600s"):
    """Return the first row matching *arm* and *trace*, or None.

    Args:
        rows: Iterable of mappings, each with at least "arm" and "trace" keys.
        arm: Value the row's "arm" entry must equal.
        trace: Value the row's "trace" entry must equal.

    Returns:
        The first matching row (the same object, not a copy), or None when
        no row matches.
    """
    for r in rows:
        if r["arm"] == arm and r["trace"] == trace:
            return r
    return None
|
||
|
||
|
||
def _load_rows(path):
    """Decode one JSON summary file, closing the handle afterwards."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def _rows_by_arm(rows, source):
    """Map every arm in ARMS to its default-trace row, failing loudly if absent."""
    by_arm = {arm: pick(rows, arm) for arm in ARMS}
    missing = [arm for arm, row in by_arm.items() if row is None]
    if missing:
        raise KeyError(f"{source}: no row for arm(s) {missing}")
    return by_arm


def main():
    """Render the three-panel linear-vs-LMetric comparison and save it to OUT.

    Reads fig4r_linear.json and fig4l_lmetric.json from DATA, then draws one
    grouped bar chart per metric, with one bar per (arm, policy) pair:
    (a) success rate, (b) wall-clock on a log axis with the 2x-colo cap line,
    (c) successful requests per minute of wall. Each row is read through its
    "n", "req" and "wall" keys and an optional "capped" flag.

    Raises:
        KeyError: if either input lacks a row for one of ARMS.
    """
    lin_path = DATA / "fig4r_linear.json"
    lme_path = DATA / "fig4l_lmetric.json"
    lin = _rows_by_arm(_load_rows(lin_path), lin_path.name)
    lme = _rows_by_arm(_load_rows(lme_path), lme_path.name)
    # Plot order fixes which policy gets the left bar of each pair.
    policies = [("linear", lin), ("lmetric", lme)]

    # colo wall (linear) sets the 2x cap reference
    cap = 2 * lin["colo"]["wall"]

    fig, axes = plt.subplots(1, 3, figsize=(15, 4.5))
    x = np.arange(len(ARMS))
    w = 0.38

    # (a) success rate
    ax = axes[0]
    for i, (pol, by_arm) in enumerate(policies):
        picked = [by_arm[a] for a in ARMS]
        # Shift each policy half a bar-width off the tick centre.
        xs = x + (i - 0.5) * w
        vals = [r["n"] / r["req"] * 100 for r in picked]
        ax.bar(xs, vals, w, color=POLICY_COLOR[pol], label=POLICY_LABEL[pol])
        for bx, bv in zip(xs, vals):
            ax.annotate(f"{bv:.0f}%", (bx, bv + 1.5), ha="center", fontsize=8)
    ax.axhline(100, color="grey", ls=":", lw=1)
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("success rate (% of trace)")
    ax.set_ylim(0, 115)
    ax.set_title("(a) success ceiling is policy-invariant")
    ax.legend(fontsize=8, loc="upper right")
    ax.grid(alpha=.3, axis="y")

    # (b) wall (log y) with cap line
    ax = axes[1]
    for i, (pol, by_arm) in enumerate(policies):
        picked = [by_arm[a] for a in ARMS]
        xs = x + (i - 0.5) * w
        vals = [r["wall"] for r in picked]
        ax.bar(xs, vals, w, color=POLICY_COLOR[pol], label=POLICY_LABEL[pol])
        for bx, bv, r in zip(xs, vals, picked):
            mark = " ⊗" if r.get("capped") else ""
            # Multiplicative offset keeps the label gap constant on a log axis.
            ax.annotate(f"{bv:.0f}s{mark}", (bx, bv * 1.05), ha="center", fontsize=7)
    ax.axhline(cap, color="red", ls="--", lw=1.5,
               label=f"2× colo wall cap = {cap:.0f}s")
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("wall-clock (s, log)")
    ax.set_yscale("log")
    ax.set_title("(b) linear w/ cap vs lmetric w/o cap — ⊗ = cap-killed")
    ax.legend(fontsize=8, loc="upper left")
    ax.grid(alpha=.3, which="both", axis="y")

    # (c) goodput per minute of wall (successful requests / wall × 60)
    ax = axes[2]
    for i, (pol, by_arm) in enumerate(policies):
        picked = [by_arm[a] for a in ARMS]
        xs = x + (i - 0.5) * w
        vals = [r["n"] / r["wall"] * 60 for r in picked]
        ax.bar(xs, vals, w, color=POLICY_COLOR[pol], label=POLICY_LABEL[pol])
        # Label gap is 2% of this policy's tallest bar.
        gap = max(vals) * 0.02
        for bx, bv in zip(xs, vals):
            ax.annotate(f"{bv:.1f}", (bx, bv + gap),
                        ha="center", fontsize=8)
    ax.set_xticks(x)
    ax.set_xticklabels(ARMS)
    ax.set_ylabel("goodput (successful req / min)")
    ax.set_title("(c) linear+cap is 1.5–17× more wall-efficient on PD")
    ax.legend(fontsize=8, loc="upper right")
    ax.grid(alpha=.3, axis="y")

    fig.suptitle("Fig 4r — Linear vs LMetric on the real agentic trace (first600s, "
                 "PD-disagg wall-capped at 2× colo)",
                 fontsize=12, y=1.0)
    fig.tight_layout()
    # savefig does not create missing directories, so make sure it exists.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUT, dpi=130, bbox_inches="tight")
    plt.close(fig)
    print(f"wrote {OUT}")
|
||
|
||
|
||
# Render the figure only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|