Replayer think-time dispatch mode + benchmarking guidance

Adds `--dispatch-mode {tracets,thinktime}` to the replayer and documents that agentic serving should be benchmarked with `thinktime` (the faithful load). - `tracets` (old default): turn-k at the absolute trace timestamp, i.e. max(prev_finished, trace_ts) -- collapses inter-turn think-time to ~0 when the system is behind, manufacturing request bursts. - `thinktime`: turn-1 at trace arrival; turn-k at prev_finished + time_to_parent_chat (real production gap). scripts/add_time_to_parent.py annotates a trace with that gap from the raw trace's request_ready/end_ms. exp(c) ablation (v2/exp_c_dispatch_ablation/): at N=8 (capacity slack) thinktime beats tracets -- E2E p90 -28% (73.5 vs 102.8s), TTFT p90 -29%, TPS +7%, because tracets' bursts spike concurrency -> KV pressure -> preemption. At N=6 (saturated) they converge. So tracets makes the system look ~30% worse on tail latency than realistic agent pacing. Root README.md carries the headline guidance; raw per-request metrics gitignored (perf_summary.json kept). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 16:25:33 +08:00
parent f0d085ceda
commit 8a6b22c11c
11 changed files with 547 additions and 3 deletions
--- a/scripts/add_time_to_parent.py
+++ b/scripts/add_time_to_parent.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+"""Annotate a formatted/sampled trace with `time_to_parent_chat` (seconds).
+
+time_to_parent_chat = this_turn.request_ready_time_ms - parent_turn.request_end_time_ms
+i.e. the real external gap (tool exec + agent think) between the parent turn
+*finishing* in production and this turn *arriving*. Turn-1 (parent_chat_id == -1)
+gets null.
+
+The end/ready times live only in the raw trace (meta.*); the sampled trace has
+neither. So we scan the raw trace once (byte-level field extraction, early-exit
+once every needed chat_id is found) to build {chat_id: (ready_ms, end_ms)},
+then join.
+
+Run on dash0 (raw trace is there).
+"""
+from __future__ import annotations
+import json
+import sys
+import time
+from pathlib import Path
+
+RAW = "/home/admin/cpfs/wjh/ali-trace/trace-glm5.1-formatted/051315-051317-raw.jsonl"
+KCHAT = b'"chat_id":'
+KREADY = b'"request_ready_time_ms":'
+KEND = b'"request_end_time_ms":'
+
+
+def parse_int_after(line: bytes, key: bytes):
+    i = line.find(key)
+    if i < 0:
+        return None
+    i += len(key)
+    n = len(line)
+    while i < n and line[i] in (0x20, 0x09):  # space/tab
+        i += 1
+    j = i
+    if j < n and line[j] == 0x2D:  # '-'
+        j += 1
+    while j < n and 0x30 <= line[j] <= 0x39:
+        j += 1
+    return int(line[i:j]) if j > i and line[i:j] != b'-' else None
+
+
+def scan_timing(needed: set[int], raw_path: str) -> dict[int, tuple[int, int]]:
+    timing: dict[int, tuple[int, int]] = {}
+    t0 = time.time()
+    nbytes = 0
+    with open(raw_path, "rb", buffering=1 << 22) as f:
+        for line in f:
+            nbytes += len(line)
+            cid = parse_int_after(line, KCHAT)
+            if cid is None or cid not in needed or cid in timing:
+                continue
+            ready = parse_int_after(line, KREADY)
+            end = parse_int_after(line, KEND)
+            if ready is None or end is None:
+                continue
+            timing[cid] = (ready, end)
+            if len(timing) == len(needed):
+                break
+    print(f"[scan] found {len(timing)}/{len(needed)} chats in "
+          f"{nbytes/1e9:.1f} GB / {time.time()-t0:.0f}s", flush=True)
+    return timing
+
+
+def main():
+    in_trace = Path(sys.argv[1])
+    out_trace = Path(sys.argv[2])
+    raw_path = sys.argv[3] if len(sys.argv) > 3 else RAW
+
+    rows = [json.loads(l) for l in open(in_trace)]
+    chats = {r["chat_id"] for r in rows}
+    parents = {r["parent_chat_id"] for r in rows
+               if r.get("parent_chat_id") not in (None, -1, 0, "")}
+    needed = chats | parents
+    print(f"[trace] {len(rows)} reqs, {len(chats)} chats, "
+          f"{len(parents)} parents, {len(needed)} chat_ids to look up", flush=True)
+
+    timing = scan_timing(needed, raw_path)
+
+    # annotate
+    n_ann = n_neg = 0
+    ttps = []
+    min_ready = min((t[0] for t in timing.values()), default=0)
+    for r in rows:
+        p = r.get("parent_chat_id")
+        ttp = None
+        if p not in (None, -1, 0, "") and p in timing and r["chat_id"] in timing:
+            ttp = (timing[r["chat_id"]][0] - timing[p][1]) / 1000.0  # ready - parent.end
+            if ttp < 0:
+                n_neg += 1
+                ttp = 0.0
+            ttps.append(ttp)
+            n_ann += 1
+        r["time_to_parent_chat"] = ttp
+        # sanity field: re-derived ready offset (s) to cross-check vs timestamp
+        if r["chat_id"] in timing:
+            r["_ready_off_s"] = (timing[r["chat_id"]][0] - min_ready) / 1000.0
+
+    with open(out_trace, "w") as o:
+        for r in rows:
+            o.write(json.dumps(r) + "\n")
+
+    ttps.sort()
+    n = len(ttps)
+    pc = lambda q: ttps[min(int(q * n), n - 1)] if n else 0
+    print(f"[done] annotated {n_ann} turns with time_to_parent_chat "
+          f"({n_neg} negative clamped to 0)")
+    print(f"[ttp] p25={pc(.25):.2f}s p50={pc(.5):.2f}s p90={pc(.9):.2f}s "
+          f"p99={pc(.99):.2f}s  (f3a ref: p50~1.6s)")
+    print(f"[ttp] frac<1s={sum(1 for x in ttps if x<1)/n:.0%} "
+          f"frac<5s={sum(1 for x in ttps if x<5)/n:.0%}")
+    # cross-check timestamp vs re-derived ready offset on a few rows
+    chk = [(r["timestamp"], r["_ready_off_s"]) for r in rows if "_ready_off_s" in r][:5]
+    print("[sanity] (trace.timestamp, raw ready_off_s):", [(round(a,1),round(b,1)) for a,b in chk])
+    print(f"wrote {out_trace}")
+
+
+if __name__ == "__main__":
+    main()