Replayer think-time dispatch mode + benchmarking guidance
Adds `--dispatch-mode {tracets,thinktime}` to the replayer and documents that
agentic serving should be benchmarked with `thinktime` (the faithful load).
- `tracets` (pre-existing behavior; still the CLI/config default): turn-k fires at
the absolute trace timestamp, i.e. effectively max(prev_finished, trace_ts) --
this collapses inter-turn think-time to ~0 when the system is behind,
manufacturing request bursts.
- `thinktime`: turn-1 at trace arrival; turn-k at prev_finished +
time_to_parent_chat (real production gap). scripts/add_time_to_parent.py
annotates a trace with that gap from the raw trace's request_ready/end_ms.
exp(c) ablation (v2/exp_c_dispatch_ablation/): at N=8 (capacity slack) thinktime
beats tracets -- E2E p90 -28% (73.5 vs 102.8s), TTFT p90 -29%, TPS +7%, because
tracets' bursts spike concurrency -> KV pressure -> preemption. At N=6
(saturated) the two modes converge. So, when there is capacity slack, tracets
overstates tail latency: realistic agent pacing (thinktime) yields a ~30% lower
p90. Root README.md carries the headline guidance; raw per-request metrics are
gitignored (perf_summary.json is kept).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,11 @@ def main() -> None:
|
||||
default=float(_env_think) if _env_think else None,
|
||||
help="Closed-loop think-time (s) after each turn completes; "
|
||||
"ignore absolute trace schedule. Env: REPLAY_INTER_TURN_THINK_S")
|
||||
p.add_argument("--dispatch-mode", choices=["tracets", "thinktime"],
|
||||
default=os.environ.get("REPLAY_DISPATCH_MODE", "tracets"),
|
||||
help="tracets (Mode 1): absolute trace ts = max(prev_finished, ts). "
|
||||
"thinktime (Mode 2): turn-k at prev_finished + "
|
||||
"time_to_parent_chat. Env: REPLAY_DISPATCH_MODE")
|
||||
p.add_argument("--request-timeout", type=float, default=600.0)
|
||||
p.add_argument("--request-limit", type=int, default=None,
|
||||
help="Limit number of requests to replay")
|
||||
@@ -51,6 +56,7 @@ def main() -> None:
|
||||
request_limit=args.request_limit,
|
||||
max_inflight_sessions=args.max_inflight_sessions,
|
||||
inter_turn_think_s=args.inter_turn_think,
|
||||
dispatch_mode=args.dispatch_mode,
|
||||
)
|
||||
|
||||
results = asyncio.run(replay_trace(config))
|
||||
|
||||
@@ -66,6 +66,13 @@ class ReplayConfig:
|
||||
# max_inflight_sessions=N this is a stable N-user closed-loop (no open-loop
|
||||
# runaway), so it removes the "immediate retrigger under load" artifact.
|
||||
inter_turn_think_s: float | None = None
|
||||
# Dispatch timing for intra-session turns:
|
||||
# "tracets" (Mode 1): fire at absolute trace timestamp -> effectively
|
||||
# max(prev_finished, trace_ts); collapses think-time to 0 when
|
||||
# the system is behind (the amplification-inflation suspect).
|
||||
# "thinktime" (Mode 2): turn-1 at trace arrival; turn-k at
|
||||
# prev_finished + time_to_parent_chat (real production gap).
|
||||
dispatch_mode: str = "tracets"
|
||||
|
||||
|
||||
def _build_prompt_token_ids(req: TraceRequest) -> list[int]:
|
||||
@@ -286,14 +293,26 @@ async def _run_session(
|
||||
realized_context: list[int] = []
|
||||
try:
|
||||
for turn_idx, req in enumerate(state.turns):
|
||||
if config.inter_turn_think_s is not None:
|
||||
if config.dispatch_mode == "thinktime":
|
||||
# Mode 2: turn-1 at absolute trace arrival (preserve session
|
||||
# schedule); later turns wait the REAL per-record think-time after
|
||||
# the previous turn completed -> no think-collapse under load.
|
||||
if turn_idx == 0:
|
||||
target_wall = (req.timestamp_s - earliest_ts)
|
||||
elapsed = time.perf_counter() - sweep_start
|
||||
if elapsed < target_wall:
|
||||
await asyncio.sleep(target_wall - elapsed)
|
||||
else:
|
||||
think = req.time_to_parent_chat_s
|
||||
await asyncio.sleep(think if think is not None else 0.0)
|
||||
elif config.inter_turn_think_s is not None:
|
||||
# Closed-loop: turn 1 fires on admission; later turns wait a fixed
|
||||
# think-time AFTER the previous turn completed (no absolute schedule,
|
||||
# so no "fire immediately because timestamp is in the past").
|
||||
if turn_idx > 0:
|
||||
await asyncio.sleep(config.inter_turn_think_s)
|
||||
else:
|
||||
# Original: dispatch at the request's absolute trace timestamp.
|
||||
# Mode 1: dispatch at the request's absolute trace timestamp.
|
||||
target_wall = (req.timestamp_s - earliest_ts)
|
||||
elapsed = time.perf_counter() - sweep_start
|
||||
if elapsed < target_wall:
|
||||
|
||||
@@ -28,6 +28,9 @@ class TraceRequest:
|
||||
request_type: str
|
||||
turn_id: int
|
||||
hash_ids: tuple[int, ...]
|
||||
# real production gap (s) from parent turn finishing to this turn arriving;
|
||||
# None for turn-1 / unannotated traces. Used by --dispatch-mode thinktime.
|
||||
time_to_parent_chat_s: float | None = None
|
||||
|
||||
|
||||
def load_trace(
|
||||
@@ -66,6 +69,9 @@ def load_trace(
|
||||
request_type=str(row["type"]),
|
||||
turn_id=int(row["turn"]),
|
||||
hash_ids=tuple(int(h) for h in row.get("hash_ids", [])),
|
||||
time_to_parent_chat_s=(
|
||||
float(row["time_to_parent_chat"])
|
||||
if row.get("time_to_parent_chat") is not None else None),
|
||||
))
|
||||
|
||||
return requests
|
||||
|
||||
Reference in New Issue
Block a user