Replayer think-time dispatch mode + benchmarking guidance
Adds `--dispatch-mode {tracets,thinktime}` to the replayer and documents that
agentic serving should be benchmarked with `thinktime` (the faithful load).
- `tracets` (old default): turn-k at the absolute trace timestamp, i.e.
max(prev_finished, trace_ts) -- collapses inter-turn think-time to ~0 when the
system is behind, manufacturing request bursts.
- `thinktime`: turn-1 at trace arrival; turn-k at prev_finished +
time_to_parent_chat (real production gap). scripts/add_time_to_parent.py
annotates a trace with that gap from the raw trace's request_ready/end_ms.
exp(c) ablation (v2/exp_c_dispatch_ablation/): at N=8 (capacity slack) thinktime
beats tracets -- E2E p90 -28% (73.5 vs 102.8s), TTFT p90 -29%, TPS +7%, because
tracets' bursts spike concurrency -> KV pressure -> preemption. At N=6
(saturated) they converge. So tracets makes the system look ~30% worse on tail
latency than realistic agent pacing. Root README.md carries the headline
guidance; raw per-request metrics gitignored (perf_summary.json kept).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
120
scripts/add_time_to_parent.py
Normal file
120
scripts/add_time_to_parent.py
Normal file
@@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Annotate a formatted/sampled trace with `time_to_parent_chat` (seconds).
|
||||
|
||||
time_to_parent_chat = this_turn.request_ready_time_ms - parent_turn.request_end_time_ms
|
||||
i.e. the real external gap (tool exec + agent think) between the parent turn
|
||||
*finishing* in production and this turn *arriving*. Turn-1 (parent_chat_id == -1)
|
||||
gets null.
|
||||
|
||||
The end/ready times live only in the raw trace (meta.*); the sampled trace has
|
||||
neither. So we scan the raw trace once (byte-level field extraction, early-exit
|
||||
once every needed chat_id is found) to build {chat_id: (ready_ms, end_ms)},
|
||||
then join.
|
||||
|
||||
Run on dash0 (raw trace is there).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
RAW = "/home/admin/cpfs/wjh/ali-trace/trace-glm5.1-formatted/051315-051317-raw.jsonl"
|
||||
KCHAT = b'"chat_id":'
|
||||
KREADY = b'"request_ready_time_ms":'
|
||||
KEND = b'"request_end_time_ms":'
|
||||
|
||||
|
||||
def parse_int_after(line: bytes, key: bytes):
|
||||
i = line.find(key)
|
||||
if i < 0:
|
||||
return None
|
||||
i += len(key)
|
||||
n = len(line)
|
||||
while i < n and line[i] in (0x20, 0x09): # space/tab
|
||||
i += 1
|
||||
j = i
|
||||
if j < n and line[j] == 0x2D: # '-'
|
||||
j += 1
|
||||
while j < n and 0x30 <= line[j] <= 0x39:
|
||||
j += 1
|
||||
return int(line[i:j]) if j > i and line[i:j] != b'-' else None
|
||||
|
||||
|
||||
def scan_timing(needed: set[int], raw_path: str) -> dict[int, tuple[int, int]]:
|
||||
timing: dict[int, tuple[int, int]] = {}
|
||||
t0 = time.time()
|
||||
nbytes = 0
|
||||
with open(raw_path, "rb", buffering=1 << 22) as f:
|
||||
for line in f:
|
||||
nbytes += len(line)
|
||||
cid = parse_int_after(line, KCHAT)
|
||||
if cid is None or cid not in needed or cid in timing:
|
||||
continue
|
||||
ready = parse_int_after(line, KREADY)
|
||||
end = parse_int_after(line, KEND)
|
||||
if ready is None or end is None:
|
||||
continue
|
||||
timing[cid] = (ready, end)
|
||||
if len(timing) == len(needed):
|
||||
break
|
||||
print(f"[scan] found {len(timing)}/{len(needed)} chats in "
|
||||
f"{nbytes/1e9:.1f} GB / {time.time()-t0:.0f}s", flush=True)
|
||||
return timing
|
||||
|
||||
|
||||
def main():
|
||||
in_trace = Path(sys.argv[1])
|
||||
out_trace = Path(sys.argv[2])
|
||||
raw_path = sys.argv[3] if len(sys.argv) > 3 else RAW
|
||||
|
||||
rows = [json.loads(l) for l in open(in_trace)]
|
||||
chats = {r["chat_id"] for r in rows}
|
||||
parents = {r["parent_chat_id"] for r in rows
|
||||
if r.get("parent_chat_id") not in (None, -1, 0, "")}
|
||||
needed = chats | parents
|
||||
print(f"[trace] {len(rows)} reqs, {len(chats)} chats, "
|
||||
f"{len(parents)} parents, {len(needed)} chat_ids to look up", flush=True)
|
||||
|
||||
timing = scan_timing(needed, raw_path)
|
||||
|
||||
# annotate
|
||||
n_ann = n_neg = 0
|
||||
ttps = []
|
||||
min_ready = min((t[0] for t in timing.values()), default=0)
|
||||
for r in rows:
|
||||
p = r.get("parent_chat_id")
|
||||
ttp = None
|
||||
if p not in (None, -1, 0, "") and p in timing and r["chat_id"] in timing:
|
||||
ttp = (timing[r["chat_id"]][0] - timing[p][1]) / 1000.0 # ready - parent.end
|
||||
if ttp < 0:
|
||||
n_neg += 1
|
||||
ttp = 0.0
|
||||
ttps.append(ttp)
|
||||
n_ann += 1
|
||||
r["time_to_parent_chat"] = ttp
|
||||
# sanity field: re-derived ready offset (s) to cross-check vs timestamp
|
||||
if r["chat_id"] in timing:
|
||||
r["_ready_off_s"] = (timing[r["chat_id"]][0] - min_ready) / 1000.0
|
||||
|
||||
with open(out_trace, "w") as o:
|
||||
for r in rows:
|
||||
o.write(json.dumps(r) + "\n")
|
||||
|
||||
ttps.sort()
|
||||
n = len(ttps)
|
||||
pc = lambda q: ttps[min(int(q * n), n - 1)] if n else 0
|
||||
print(f"[done] annotated {n_ann} turns with time_to_parent_chat "
|
||||
f"({n_neg} negative clamped to 0)")
|
||||
print(f"[ttp] p25={pc(.25):.2f}s p50={pc(.5):.2f}s p90={pc(.9):.2f}s "
|
||||
f"p99={pc(.99):.2f}s (f3a ref: p50~1.6s)")
|
||||
print(f"[ttp] frac<1s={sum(1 for x in ttps if x<1)/n:.0%} "
|
||||
f"frac<5s={sum(1 for x in ttps if x<5)/n:.0%}")
|
||||
# cross-check timestamp vs re-derived ready offset on a few rows
|
||||
chk = [(r["timestamp"], r["_ready_off_s"]) for r in rows if "_ready_off_s" in r][:5]
|
||||
print("[sanity] (trace.timestamp, raw ready_off_s):", [(round(a,1),round(b,1)) for a,b in chk])
|
||||
print(f"wrote {out_trace}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user