Replayer think-time dispatch mode + benchmarking guidance

Adds `--dispatch-mode {tracets,thinktime}` to the replayer and documents that agentic serving should be benchmarked with `thinktime` (the faithful load).

- `tracets` (the previous behavior; it remains the default): turn-k fires at its absolute trace timestamp, i.e. effectively at max(prev_finished, trace_ts). When the system is behind, this collapses inter-turn think-time to ~0 and manufactures request bursts.
- `thinktime`: turn-1 fires at trace arrival; turn-k fires at prev_finished + time_to_parent_chat (the real production gap). scripts/add_time_to_parent.py annotates a trace with that gap, derived from the raw trace's request_ready/end_ms. On a record without the annotation, the replayer waits 0 s after the previous turn finishes.

exp(c) ablation (v2/exp_c_dispatch_ablation/): at N=8 (capacity slack) thinktime beats tracets -- E2E p90 -28% (73.5 s vs 102.8 s), TTFT p90 -29%, TPS +7% -- because tracets' bursts spike concurrency -> KV pressure -> preemption. At N=6 (saturated) the two modes converge. So tracets makes the system look ~30% worse on tail latency than realistic agent pacing does.

Root README.md carries the headline guidance; raw per-request metrics are gitignored (perf_summary.json is kept).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 16:25:33 +08:00
parent f0d085ceda
commit 8a6b22c11c
11 changed files with 547 additions and 3 deletions
--- a/replayer/main.py
+++ b/replayer/main.py
@@ -30,6 +30,11 @@ def main() -> None:
                   default=float(_env_think) if _env_think else None,
                   help="Closed-loop think-time (s) after each turn completes; "
                        "ignore absolute trace schedule. Env: REPLAY_INTER_TURN_THINK_S")
+    p.add_argument("--dispatch-mode", choices=["tracets", "thinktime"],
+                   default=os.environ.get("REPLAY_DISPATCH_MODE", "tracets"),
+                   help="tracets (Mode 1): absolute trace ts = max(prev_finished, ts). "
+                        "thinktime (Mode 2): turn-k at prev_finished + "
+                        "time_to_parent_chat. Env: REPLAY_DISPATCH_MODE")
    p.add_argument("--request-timeout", type=float, default=600.0)
    p.add_argument("--request-limit", type=int, default=None,
                   help="Limit number of requests to replay")
@@ -51,6 +56,7 @@ def main() -> None:
        request_limit=args.request_limit,
        max_inflight_sessions=args.max_inflight_sessions,
        inter_turn_think_s=args.inter_turn_think,
+        dispatch_mode=args.dispatch_mode,
    )

    results = asyncio.run(replay_trace(config))
--- a/replayer/replay.py
+++ b/replayer/replay.py
@@ -66,6 +66,13 @@ class ReplayConfig:
    # max_inflight_sessions=N this is a stable N-user closed-loop (no open-loop
    # runaway), so it removes the "immediate retrigger under load" artifact.
    inter_turn_think_s: float | None = None
+    # Dispatch timing for intra-session turns:
+    #  "tracets"   (Mode 1): fire at absolute trace timestamp -> effectively
+    #              max(prev_finished, trace_ts); collapses think-time to 0 when
+    #              the system is behind (the amplification-inflation suspect).
+    #  "thinktime" (Mode 2): turn-1 at trace arrival; turn-k at
+    #              prev_finished + time_to_parent_chat (real production gap).
+    dispatch_mode: str = "tracets"


 def _build_prompt_token_ids(req: TraceRequest) -> list[int]:
@@ -286,14 +293,26 @@ async def _run_session(
    realized_context: list[int] = []
    try:
        for turn_idx, req in enumerate(state.turns):
-            if config.inter_turn_think_s is not None:
+            if config.dispatch_mode == "thinktime":
+                # Mode 2: turn-1 at absolute trace arrival (preserve session
+                # schedule); later turns wait the REAL per-record think-time after
+                # the previous turn completed -> no think-collapse under load.
+                if turn_idx == 0:
+                    target_wall = (req.timestamp_s - earliest_ts)
+                    elapsed = time.perf_counter() - sweep_start
+                    if elapsed < target_wall:
+                        await asyncio.sleep(target_wall - elapsed)
+                else:
+                    think = req.time_to_parent_chat_s
+                    await asyncio.sleep(think if think is not None else 0.0)
+            elif config.inter_turn_think_s is not None:
                # Closed-loop: turn 1 fires on admission; later turns wait a fixed
                # think-time AFTER the previous turn completed (no absolute schedule,
                # so no "fire immediately because timestamp is in the past").
                if turn_idx > 0:
                    await asyncio.sleep(config.inter_turn_think_s)
            else:
-                # Original: dispatch at the request's absolute trace timestamp.
+                # Mode 1: dispatch at the request's absolute trace timestamp.
                target_wall = (req.timestamp_s - earliest_ts)
                elapsed = time.perf_counter() - sweep_start
                if elapsed < target_wall:
--- a/replayer/trace.py
+++ b/replayer/trace.py
@@ -28,6 +28,9 @@ class TraceRequest:
    request_type: str
    turn_id: int
    hash_ids: tuple[int, ...]
+    # real production gap (s) from parent turn finishing to this turn arriving;
+    # None for turn-1 / unannotated traces. Used by --dispatch-mode thinktime.
+    time_to_parent_chat_s: float | None = None


 def load_trace(
@@ -66,6 +69,9 @@ def load_trace(
                request_type=str(row["type"]),
                turn_id=int(row["turn"]),
                hash_ids=tuple(int(h) for h in row.get("hash_ids", [])),
+                time_to_parent_chat_s=(
+                    float(row["time_to_parent_chat"])
+                    if row.get("time_to_parent_chat") is not None else None),
            ))

    return requests