Adds `--dispatch-mode {tracets,thinktime}` to the replayer and documents that
agentic serving should be benchmarked with `thinktime` (the faithful load).
- `tracets` (old default): turn-k at the absolute trace timestamp, i.e.
max(prev_finished, trace_ts) -- collapses inter-turn think-time to ~0 when the
system is behind, manufacturing request bursts.
- `thinktime`: turn-1 at trace arrival; turn-k at prev_finished +
time_to_parent_chat (real production gap). scripts/add_time_to_parent.py
annotates a trace with that gap from the raw trace's request_ready/end_ms.
exp(c) ablation (v2/exp_c_dispatch_ablation/): at N=8 (capacity slack) thinktime
beats tracets -- E2E p90 -28% (73.5 vs 102.8s), TTFT p90 -29%, TPS +7%, because
tracets' bursts spike concurrency -> KV pressure -> preemption. At N=6
(saturated) they converge. So tracets makes the system look ~30% worse on tail
latency than realistic agent pacing. Root README.md carries the headline
guidance; raw per-request metrics gitignored (perf_summary.json kept).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
116 lines
4.5 KiB
Python
116 lines
4.5 KiB
Python
"""Mode 1 (tracets) vs Mode 2 (thinktime): wall-clock amplification + the
|
|
think-collapse mechanism.
|
|
|
|
Mechanism (the smoking gun): realized inter-turn gap = this.t_dispatch_unix -
|
|
prev.t_finish_unix, per session, consecutive turns. Mode 1 collapses it toward 0
|
|
when the system is behind; Mode 2 holds it at the real time_to_parent_chat.
|
|
|
|
Amplification: wall / ideal_span. Mode 1 ideal = trace_span; Mode 2 ideal = the
|
|
think-chain span under instant serving (analytic). If Mode 1 amplifies (>>1) with
|
|
collapsed gaps while Mode 2 stays ~1 with preserved gaps, the 8x was a Mode-1
|
|
artifact, not a workload property.
|
|
|
|
Usage: analyze.py <trace_ttp.jsonl> <m1.metrics.jsonl> <m2.metrics.jsonl> [fig.png]
|
|
"""
|
|
import json
|
|
import sys
|
|
from collections import defaultdict
|
|
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
def pct(v, q):
|
|
v = sorted(v)
|
|
return v[min(int(q * len(v)), len(v) - 1)] if v else 0.0
|
|
|
|
|
|
def load_trace(trace):
|
|
rows = [json.loads(l) for l in open(trace)]
|
|
span = max(r["timestamp"] for r in rows) - min(r["timestamp"] for r in rows)
|
|
by = defaultdict(list)
|
|
for r in rows:
|
|
by[r["session_id"]].append(r)
|
|
starts, ends, ttps = [], [], []
|
|
for sid, t in by.items():
|
|
t.sort(key=lambda r: (r["turn"], r["timestamp"]))
|
|
s = t[0]["timestamp"]
|
|
chain = sum((r.get("time_to_parent_chat") or 0.0) for r in t[1:])
|
|
starts.append(s); ends.append(s + chain)
|
|
ttps += [r["time_to_parent_chat"] for r in t[1:]
|
|
if r.get("time_to_parent_chat") is not None]
|
|
return span, (max(ends) - min(starts)), sorted(ttps)
|
|
|
|
|
|
def realized_gaps(metrics):
|
|
"""this.t_dispatch - prev.t_finish, per session, consecutive turns."""
|
|
rows = [json.loads(l) for l in open(metrics)]
|
|
ok = [r for r in rows if not r.get("error") and r.get("t_dispatch_unix")]
|
|
by = defaultdict(list)
|
|
for r in ok:
|
|
by[r["session_id"]].append(r)
|
|
gaps = []
|
|
for sid, t in by.items():
|
|
t.sort(key=lambda r: r["turn_id"])
|
|
for a, b in zip(t, t[1:]):
|
|
if a.get("t_finish_unix") and b.get("t_dispatch_unix"):
|
|
gaps.append(max(0.0, b["t_dispatch_unix"] - a["t_finish_unix"]))
|
|
return sorted(gaps), rows
|
|
|
|
|
|
def wallclock(metrics):
|
|
rows = [json.loads(l) for l in open(metrics)]
|
|
ds = [r["t_dispatch_unix"] for r in rows if r.get("t_dispatch_unix")]
|
|
fs = [r["t_finish_unix"] for r in rows if r.get("t_finish_unix")]
|
|
return (max(fs) - min(ds)) if ds and fs else 0.0
|
|
|
|
|
|
def e2e(metrics):
|
|
ok = [json.loads(l) for l in open(metrics)]
|
|
ok = [r for r in ok if not r.get("error")]
|
|
return pct([r["latency_s"] for r in ok if r.get("latency_s")], .9), len(ok)
|
|
|
|
|
|
def main():
|
|
trace, m1, m2 = sys.argv[1:4]
|
|
fig = sys.argv[4] if len(sys.argv) > 4 else "v2/figs/exp_c_dispatch_ablation.png"
|
|
span, ideal2, ttps = load_trace(trace)
|
|
g1, _ = realized_gaps(m1)
|
|
g2, _ = realized_gaps(m2)
|
|
w1, w2 = wallclock(m1), wallclock(m2)
|
|
e1, n1 = e2e(m1)
|
|
e2_, n2 = e2e(m2)
|
|
|
|
def collapsed(g): # fraction of realized gaps << the trace think-time median
|
|
return sum(1 for x in g if x < 0.5) / len(g) if g else 0
|
|
|
|
print(f"trace_span={span:.0f}s mode2_ideal_span={ideal2:.0f}s "
|
|
f"trace_ttp: p50={pct(ttps,.5):.2f}s frac<1s={sum(1 for x in ttps if x<1)/len(ttps):.0%}\n")
|
|
print(f"{'mode':<16}{'wall_s':>8}{'amp/ideal':>10}{'e2e_p90':>9}"
|
|
f"{'realgap_p50':>12}{'frac<0.5s':>10}")
|
|
for name, w, ideal, e, n, g in [
|
|
("Mode1 tracets", w1, span, e1, n1, g1),
|
|
("Mode2 thinktime", w2, ideal2, e2_, n2, g2)]:
|
|
print(f"{name:<16}{w:>8.0f}{w/ideal:>10.2f}{e:>9.1f}"
|
|
f"{pct(g,.5):>12.2f}{collapsed(g):>10.0%}")
|
|
|
|
# figure: realized inter-turn gap CDF, trace vs Mode1 vs Mode2
|
|
plt.figure(figsize=(7.2, 4.8))
|
|
for data, lab, c in [(ttps, "trace time_to_parent_chat (real)", "#1f77b4"),
|
|
(g2, "Mode2 thinktime: realized gap", "#2ca02c"),
|
|
(g1, "Mode1 tracets: realized gap", "#d62728")]:
|
|
if data:
|
|
d = sorted(x for x in data if x >= 0)
|
|
ys = [(i + 1) / len(d) for i in range(len(d))]
|
|
plt.plot([max(x, 1e-3) for x in d], ys, label=lab, c=c, lw=2)
|
|
plt.xscale("log"); plt.xlabel("inter-turn gap (s, log)"); plt.ylabel("CDF")
|
|
plt.title("Mode 1 collapses the inter-turn gap under load; Mode 2 preserves it")
|
|
plt.legend(); plt.grid(alpha=.3, which="both"); plt.tight_layout()
|
|
plt.savefig(fig, dpi=140)
|
|
print("wrote", fig)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|