diff --git a/figs/working_set/glm5_fp8_tp8_b300.png b/figs/working_set/glm5_fp8_tp8_b300.png
index dc2ca76..92edd2f 100644
Binary files a/figs/working_set/glm5_fp8_tp8_b300.png and b/figs/working_set/glm5_fp8_tp8_b300.png differ
diff --git a/scripts/working_set_analysis.py b/scripts/working_set_analysis.py
index 5cc43ad..d1d46ad 100644
--- a/scripts/working_set_analysis.py
+++ b/scripts/working_set_analysis.py
@@ -156,48 +156,72 @@ def plot(ws, hw, block_bytes, label, out_path):
     bgb = block_bytes / GB
     pool = hw["kv_pool_gb"]                      # KV pool per node (= per replica)
     gpr = hw["gpus_per_replica"]
-    node_lbl = f"1 node = {gpr}x {hw['gpu']} = {pool:.0f} GB KV"
-
-    # everything in node units: nodes = footprint_GB / pool
-    peak_nodes = np.array([r["peak_blocks"] * bgb / pool for r in ws["taus"]])
-    apc = np.array([r["apc"] * 100 for r in ws["taus"]])
-    oracle_nodes = ws["oracle_peak_blocks"] * bgb / pool
     ceil = ws["apc_ceiling"] * 100
+    oracle_nodes = ws["oracle_peak_blocks"] * bgb / pool
+
+    # keep only operating points with tau <= 300 s (hard-coded cutoff; presumably where APC reaches the ceiling -- confirm): beyond oracle, TTL is strictly worse, so drop.
+    rows = [r for r in ws["taus"] if r["tau"] <= 300]
+    nodes = np.array([r["peak_blocks"] * bgb / pool for r in rows])
+    apc = np.array([r["apc"] * 100 for r in rows])
+    tau = np.array([r["tau"] for r in rows])
+    XMAX = 16
 
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
 
-    # --- panel 1: APC vs nodes of HBM needed ---
-    ax1.plot(peak_nodes, apc, "o-", color="#1f77b4", lw=2, ms=7, label="TTL-LRU W(T)")
-    for r, x, y in zip(ws["taus"], peak_nodes, apc):
-        ax1.annotate(f"{r['tau']:g}s", (x, y), fontsize=8,
-                     textcoords="offset points", xytext=(4, 5))
-    ax1.scatter([oracle_nodes], [ceil], marker="*", s=320, color="#d62728", zorder=5,
-                label=f"oracle / ceiling ({ceil:.1f}% @ {oracle_nodes:.0f} nodes)")
+    # ===== panel 1: benefit vs cost -- APC you get per cluster size =====
+    ax1.plot(nodes, apc, "o-", color="#1f77b4", lw=2, ms=7, zorder=4, label="TTL-LRU cache")
+    # APC linearly interpolated at the 1-node budget (np.interp needs `nodes` increasing and clamps to the end values outside its range)
+    apc_at_1 = float(np.interp(1.0, nodes, apc))
+    ax1.scatter([1], [apc_at_1], s=90, facecolors="none", edgecolors="#ff7f0e",
+                lw=2, zorder=6)
+    ax1.annotate(f"1 node -> ~{apc_at_1:.0f}% APC\n(TTL model; real LRU higher)",
+                 (1, apc_at_1), textcoords="offset points", xytext=(12, -2),
+                 fontsize=9, color="#ff7f0e", va="top")
+    # label the well-separated decision-zone points
+    for r, x, y in zip(rows, nodes, apc):
+        if x >= 1.5:
+            ax1.annotate(f"keep {r['tau']:g}s reuse", (x, y),
+                         textcoords="offset points", xytext=(6, 6), fontsize=8.5)
+    ax1.annotate("T<=10s reuse:\nall < 1.4 nodes", (0.5, 22), fontsize=8.5,
+                 color="#1f77b4", ha="left")
+    # budget + ceiling
+    ax1.axvspan(0, 1, color="#2ca02c", alpha=.08)
+    ax1.axvline(1, ls="--", color="#2ca02c", lw=1.8)
+    ax1.text(1.05, 96, "1 B300 node (your budget)", color="#2ca02c", fontsize=9, va="top")
+    ax1.scatter([oracle_nodes], [ceil], marker="*", s=340, color="#d62728", zorder=7)
+    ax1.annotate(f"ceiling {ceil:.1f}%\noracle: {oracle_nodes:.0f} nodes",
+                 (oracle_nodes, ceil), textcoords="offset points", xytext=(-10, -8),
+                 fontsize=9, color="#d62728", ha="right", va="top")
     ax1.axhline(ceil, ls=":", color="#d62728", alpha=.5)
-    for k in (1, 2, 4, 8, 16, 32):
-        ax1.axvline(k, ls="--", color="#2ca02c", alpha=.45)
-        ax1.text(k, 2, f"{k} node / {k*gpr} GPU",
-                 rotation=90, va="bottom", ha="right", fontsize=8, color="#2ca02c")
-    ax1.axvspan(0.1, 1, color="#2ca02c", alpha=.06)   # "fits in 1 node" region
-    ax1.set_xscale("log")
-    ax1.set_xlabel(f"# nodes of GPU HBM that must hold the KV  ({node_lbl})")
-    ax1.set_ylabel("Achievable prefix-cache hit rate  (APC %)")
-    ax1.set_title("APC vs cluster size (nodes)")
-    ax1.grid(alpha=.3, which="both"); ax1.legend(loc="lower right"); ax1.set_ylim(0, 100)
+    ax1.set_xlim(0, XMAX); ax1.set_ylim(0, 100)
+    ax1.set_xticks(range(0, XMAX + 1, 2)); ax1.set_xticks(range(0, XMAX + 1), minor=True)
+    ax1.set_xlabel(f"# nodes of GPU HBM needed   (1 node = {gpr}x {hw['gpu']} = {pool:.0f} GB KV)")
+    ax1.set_ylabel("Prefix-cache hit rate  (APC %)")
+    ax1.set_title("Benefit vs cost: APC per cluster size", fontweight="bold")
+    ax1.grid(alpha=.3); ax1.grid(alpha=.15, which="minor"); ax1.legend(loc="center right")
 
-    # --- panel 2: nodes needed by retention window ---
-    sel = [r for r in ws["taus"] if r["tau"] in (2, 30, 300, 600)]
-    xs = np.arange(len(sel)); w = 0.38
-    ax2.bar(xs - w/2, [r["peak_blocks"]*bgb/pool for r in sel], w, label="peak", color="#1f77b4")
-    ax2.bar(xs + w/2, [r["p50_blocks"]*bgb/pool for r in sel], w, label="median", color="#aec7e8")
-    ax2.axhline(1, ls="--", color="#2ca02c", lw=2, label="your budget: 1 node")
-    ax2.axhline(oracle_nodes, ls=":", color="#d62728", lw=2,
-                label=f"oracle full-ceiling ({oracle_nodes:.0f} nodes)")
-    ax2.set_xticks(xs); ax2.set_xticklabels([f"T={r['tau']:g}s\nAPC={r['apc']*100:.0f}%" for r in sel])
-    ax2.set_ylabel("# nodes of HBM needed")
-    ax2.set_yscale("log")
-    ax2.set_title("Cluster size by retention window")
-    ax2.grid(alpha=.3, axis="y", which="both"); ax2.legend(loc="upper left", fontsize=9)
+    # ===== panel 2: cost -- nodes needed to retain T seconds of reuse =====
+    ax2.plot(tau, nodes, "s-", color="#1f77b4", lw=2, ms=7, zorder=4)
+    ax2.axhline(1, ls="--", color="#2ca02c", lw=1.8, label="1 node (your budget)")
+    ax2.axhline(oracle_nodes, ls=":", color="#d62728", lw=1.8,
+                label=f"full ceiling = {oracle_nodes:.0f} nodes")
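+    # where the curve crosses the 1-node budget (np.interp: linear in tau, needs `nodes` increasing, clamps out of range; x-axis is log, so the marker may sit slightly off the drawn segment)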
+    tau_at_1 = float(np.interp(1.0, nodes, tau))
+    ax2.scatter([tau_at_1], [1], s=90, facecolors="none", edgecolors="#ff7f0e", lw=2, zorder=6)
+    ax2.annotate(f"1 node only retains\n~{tau_at_1:.0f}s of reuse", (tau_at_1, 1),
+                 textcoords="offset points", xytext=(8, 14), fontsize=9, color="#ff7f0e")
+    for r, x, y in zip(rows, tau, nodes):
+        if y >= 1.5:
+            ax2.annotate(f"{r['tau']:g}s", (x, y), textcoords="offset points",
+                         xytext=(4, 5), fontsize=8.5)
+    ax2.set_xscale("log")
+    ax2.set_xticks([1, 2, 5, 10, 30, 60, 300])
+    ax2.set_xticklabels(["1s", "2s", "5s", "10s", "30s", "60s", "300s"])
+    ax2.set_ylim(0, XMAX); ax2.set_yticks(range(0, XMAX + 1, 2))
+    ax2.set_xlabel("retention window T  (how long-idle a session's KV we keep)")
+    ax2.set_ylabel("# nodes of GPU HBM needed")
+    ax2.set_title("Cost: nodes needed to retain T-seconds of reuse", fontweight="bold")
+    ax2.grid(alpha=.3, which="both"); ax2.legend(loc="upper left", fontsize=9)
 
     fig.suptitle(label, fontsize=13, fontweight="bold")
     fig.tight_layout(rect=[0, 0, 1, 0.97])