Add chatbot T_external CDF; overlay on f3a vs agentic
User-requested comparison of inter-turn external gap distribution between the production agentic trace (Qwen3-Coder) and a production chatbot trace (qwen3-max chat). Both computed as T_external = next_turn.start_ms - prev_turn.end_ms on the same kind of pipeline (raw input + raw output join on request_id, session structure from the formatted trace's parent_chat_id chains). The chatbot trace lives as two files on dash0: input : bailian-trace/qwen-trace-260321-260327/qwen3-max-input-032309-032311.jsonl output : bailian-trace/qwen-trace-260321-260327/qwen3-max-output-032109-032711.jsonl The raw input has no session_id (uuid is per-record, user_id has only 4 distinct tenant values for 346 k requests). We recover session structure from the formatted file (qwen_chat_blksz_64_032309-032311.jsonl, which groups requests by parent_chat_id), matching each formatted record to a raw record by (timestamp, output_length) — prompt_token_num is anonymized to 0 in this trace, so we use generate_token_num as the join key. End time is derived from time_to_finish_token (ms duration) not the "time" string field (which is the log-write time, not request completion). Numbers (chatbot, 42 228 inter-turn gaps over 32 262 multi-turn sessions): p25 4.85 s p50 7.18 s p75 8.22 s p90 15.0 s p99 43 s 4% gaps < 1 s 29% < 5 s 78% < 10 s 98% < 30 s Compare to agentic (same metric, scripts/compute_inter_turn_gap_remote.py): p25 0.69 s p50 1.6 s p75 8.6 s p90 44 s p99 738 s 39% gaps < 1 s 67% < 5 s 77% < 10 s 87% < 30 s Distributions differ in shape, not just location: - Chatbot is tight, unimodal around 5–10 s (human interaction). - Agentic is bimodal: a sub-second autonomous tool-call mode (39 % < 1 s) plus a long-pause tail (13 % > 30 s, p99 = 738 s) for sessions where the operator steps away. - The sub-second tool-call mass is where dispatch coupling lives — those turns have W_turn ≫ T_external for any current scheduler. The earlier "chatbot has T_human ≈ 30 s" hand-wave was wrong empirically. The right framing for §2.3 is "agentic has a sub-second tool-call mode that chatbot doesn't", not "chatbot has think-time and agentic doesn't". Adds: - scripts/compute_inter_turn_gap_chatbot.py: dash0-side aggregator (raw input/output join + formatted alignment by ts + output_length) - analysis/characterization/data/chatbot_inter_turn_gap.json: CDF cache - scripts/plot_inter_turn_gap.py: overlays both curves on log-x Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
220
scripts/compute_inter_turn_gap_chatbot.py
Normal file
220
scripts/compute_inter_turn_gap_chatbot.py
Normal file
@@ -0,0 +1,220 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compute inter-turn T_external on the chatbot trace, v2.
|
||||
|
||||
Uses formatted's parent_chat_id chains for sessions, and matches each
|
||||
formatted record to a raw input/output pair by (timestamp, input_length)
|
||||
rather than by index (index-by-index drifted up to 120s in v1).
|
||||
|
||||
Run on dash0. Writes /tmp/chatbot_inter_turn_gap.json.
|
||||
"""
|
||||
import json
|
||||
import bisect
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
|
||||
RAW_IN = "/home/admin/cpfs/wjh/bailian-trace/qwen-trace-260321-260327/qwen3-max-input-032309-032311.jsonl"
|
||||
RAW_OUT = "/home/admin/cpfs/wjh/bailian-trace/qwen-trace-260321-260327/qwen3-max-output-032109-032711.jsonl"
|
||||
FMT = "/home/admin/cpfs/wjh/bailian-trace/qwen-trace-260321-260327-formatted/qwen_chat_blksz_64_032309-032311.jsonl"
|
||||
OUT_JSON = "/tmp/chatbot_inter_turn_gap.json"
|
||||
|
||||
def parse_time_str_to_ms(s):
|
||||
try:
|
||||
if "." in s:
|
||||
dt = datetime.strptime(s, "%Y-%m-%d %H:%M:%S.%f")
|
||||
else:
|
||||
dt = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
|
||||
return int(dt.timestamp() * 1000)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
print("Reading raw output (joining by request_id)...")
|
||||
# In this trace prompt_token_num is anonymized to '0'; use generate_token_num
|
||||
# as the matching key (matches formatted output_length). For end_ms we use
|
||||
# time_to_finish_token (ms duration from request start) — the "time" string
|
||||
# field is log-write time, not request completion time.
|
||||
out_info = {} # request_id -> (ttf_ms, generate_token_num)
|
||||
n_out_seen = 0
|
||||
with open(RAW_OUT) as f:
|
||||
for line in f:
|
||||
try:
|
||||
d = json.loads(line)
|
||||
except: continue
|
||||
rid = d.get("request_id")
|
||||
if rid is None:
|
||||
continue
|
||||
n_out_seen += 1
|
||||
gtn = d.get("generate_token_num")
|
||||
ttf = d.get("time_to_finish_token")
|
||||
if ttf is None:
|
||||
continue
|
||||
try:
|
||||
ttf_ms = int(float(ttf))
|
||||
except: continue
|
||||
try:
|
||||
gtn = int(gtn) if gtn is not None else None
|
||||
except: gtn = None
|
||||
# Keep the largest ttf_ms if duplicates (multiple log lines per request)
|
||||
prev = out_info.get(rid)
|
||||
if prev is None or ttf_ms > prev[0]:
|
||||
out_info[rid] = (ttf_ms, gtn)
|
||||
print(f" scanned: {n_out_seen}, unique req with ttf: {len(out_info)}")
|
||||
|
||||
print("Reading raw input + joining...")
|
||||
joined = [] # list of (start_ms, end_ms, input_length, request_id)
|
||||
n_in_seen = 0
|
||||
seen_rids = set()
|
||||
with open(RAW_IN) as f:
|
||||
for line in f:
|
||||
try:
|
||||
d = json.loads(line)
|
||||
except: continue
|
||||
n_in_seen += 1
|
||||
rid = d.get("request_id")
|
||||
ts = d.get("timestamp")
|
||||
if rid is None or ts is None or rid in seen_rids:
|
||||
continue
|
||||
seen_rids.add(rid)
|
||||
info = out_info.get(rid)
|
||||
if info is None:
|
||||
continue
|
||||
ttf_ms, gtn = info
|
||||
try: ts = int(ts)
|
||||
except: continue
|
||||
end_ms = ts + ttf_ms
|
||||
joined.append((ts, end_ms, gtn, rid))
|
||||
print(f" input scanned: {n_in_seen}, joined start+end+gtn: {len(joined)}")
|
||||
|
||||
joined.sort(key=lambda x: x[0])
|
||||
starts = [j[0] for j in joined]
|
||||
gtns = [j[2] for j in joined] # generate_token_num (output_length-equivalent)
|
||||
ends = [j[1] for j in joined]
|
||||
print(f"start_ms range: [{starts[0]}, {starts[-1]}], duration {(starts[-1]-starts[0])/1000:.0f}s")
|
||||
|
||||
print("Reading formatted...")
|
||||
fmt_rows = []
|
||||
with open(FMT) as f:
|
||||
for line in f:
|
||||
try:
|
||||
d = json.loads(line)
|
||||
except: continue
|
||||
fmt_rows.append((
|
||||
int(d["chat_id"]),
|
||||
int(d["parent_chat_id"]),
|
||||
float(d["timestamp"]),
|
||||
int(d.get("input_length", 0)),
|
||||
int(d.get("output_length", 0)),
|
||||
))
|
||||
print(f" formatted records: {len(fmt_rows)}")
|
||||
print(f"fmt timestamp range: [{fmt_rows[0][2]}, {fmt_rows[-1][2]}]s "
|
||||
f"(duration {fmt_rows[-1][2] - fmt_rows[0][2]:.0f}s)")
|
||||
|
||||
# Calibrate T0 by matching first few formatted records with raw records.
|
||||
# We use output_length (formatted) vs generate_token_num (raw output) as the
|
||||
# matching key — prompt_token_num is anonymized to 0.
|
||||
print("Calibrating T0 (raw_ms anchor for formatted ts=0)...")
|
||||
T0_candidates = []
|
||||
for chat_id, _pcid, ts_rel, _il, ol in fmt_rows[:200]:
|
||||
for k in range(min(2000, len(joined))):
|
||||
if gtns[k] == ol:
|
||||
T0_candidates.append(starts[k] - int(ts_rel * 1000))
|
||||
break
|
||||
T0_candidates.sort()
|
||||
T0 = T0_candidates[len(T0_candidates) // 2] if T0_candidates else starts[0]
|
||||
print(f" T0 from {len(T0_candidates)} candidates -> {T0} ms")
|
||||
print(f" candidate T0 distribution: min={min(T0_candidates) if T0_candidates else 'n/a'} "
|
||||
f"max={max(T0_candidates) if T0_candidates else 'n/a'}")
|
||||
|
||||
print("Matching formatted -> raw by (ts_rel, output_length)...")
|
||||
TOLERANCE_MS = 60_000 # ±60 s window
|
||||
fmt_to_timing = {}
|
||||
matched = 0
|
||||
ambiguous = 0
|
||||
unmatched = 0
|
||||
for chat_id, _pcid, ts_rel, _il, ol in fmt_rows:
|
||||
target_ms = T0 + int(ts_rel * 1000)
|
||||
lo = bisect.bisect_left(starts, target_ms - TOLERANCE_MS)
|
||||
hi = bisect.bisect_right(starts, target_ms + TOLERANCE_MS)
|
||||
best = None
|
||||
best_drift = None
|
||||
for k in range(lo, hi):
|
||||
if gtns[k] == ol:
|
||||
d = abs(starts[k] - target_ms)
|
||||
if best_drift is None or d < best_drift:
|
||||
best_drift = d
|
||||
best = k
|
||||
if best is None:
|
||||
unmatched += 1
|
||||
continue
|
||||
fmt_to_timing[chat_id] = (starts[best], ends[best])
|
||||
matched += 1
|
||||
print(f" matched: {matched}, unmatched: {unmatched}, ambiguous: {ambiguous}")
|
||||
print(f" match rate: {matched/len(fmt_rows)*100:.1f}%")
|
||||
|
||||
# Build session structure from parent_chat_id chains
|
||||
chat_to_session = {}
|
||||
for chat_id, pcid, _ts, _il, _ol in fmt_rows:
|
||||
if pcid < 0:
|
||||
sid = chat_id
|
||||
else:
|
||||
sid = chat_to_session.get(pcid, pcid)
|
||||
chat_to_session[chat_id] = sid
|
||||
|
||||
sessions = defaultdict(list)
|
||||
for chat_id, _pcid, _ts, _il, _ol in fmt_rows:
|
||||
timing = fmt_to_timing.get(chat_id)
|
||||
if timing is None:
|
||||
continue
|
||||
sid = chat_to_session[chat_id]
|
||||
sessions[sid].append(timing)
|
||||
|
||||
gaps_ms = []
|
||||
neg = 0
|
||||
multi = 0
|
||||
for sid, turns in sessions.items():
|
||||
if len(turns) < 2:
|
||||
continue
|
||||
multi += 1
|
||||
turns.sort(key=lambda x: x[0])
|
||||
for i in range(len(turns) - 1):
|
||||
g = turns[i + 1][0] - turns[i][1]
|
||||
if g < 0:
|
||||
neg += 1
|
||||
continue
|
||||
gaps_ms.append(g)
|
||||
|
||||
gaps = np.array(gaps_ms, dtype=np.float64) / 1000.0
|
||||
print(f"multi_turn_sessions: {multi}, gaps_kept: {len(gaps)}, neg_dropped: {neg}")
|
||||
|
||||
if len(gaps) == 0:
|
||||
print("No gaps to summarize.")
|
||||
else:
|
||||
pcts = [1, 5, 25, 50, 75, 90, 95, 99]
|
||||
ps = {f"p{p}": float(np.percentile(gaps, p)) for p in pcts}
|
||||
print(f"stats_s: min={gaps.min():.3f} mean={gaps.mean():.3f} max={gaps.max():.3f} {ps}")
|
||||
for thr in [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 300.0, 1800.0]:
|
||||
pct = (gaps < thr).sum() / len(gaps) * 100
|
||||
print(f"frac < {thr:7.1f}s : {pct:5.1f}%")
|
||||
|
||||
n = len(gaps)
|
||||
arr = np.sort(gaps)
|
||||
idx_top = np.unique(np.round(np.geomspace(1, max(1, n // 100), 200)).astype(int)) - 1
|
||||
idx_rest = np.unique(np.linspace(n // 100, n - 1, 300).astype(int))
|
||||
idx = np.unique(np.concatenate([[0], idx_top, idx_rest, [n - 1]]))
|
||||
idx = idx[idx < n]
|
||||
samples = [{"rank_pct": float((i + 1) / n * 100), "gap_s": float(arr[i])} for i in idx]
|
||||
out = {
|
||||
"trace": "chatbot",
|
||||
"n_gaps": n,
|
||||
"n_sessions": multi,
|
||||
"negative_dropped": neg,
|
||||
"matched_formatted_to_raw": matched,
|
||||
"unmatched_formatted": unmatched,
|
||||
"stats_s": {**{"min": float(gaps.min()), "max": float(gaps.max()),
|
||||
"mean": float(gaps.mean())}, **ps},
|
||||
"fraction_below": {f"{thr}s": float((gaps < thr).sum() / n)
|
||||
for thr in [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 300.0, 1800.0]},
|
||||
"cdf_samples": samples,
|
||||
}
|
||||
open(OUT_JSON, "w").write(json.dumps(out))
|
||||
print(f"wrote {OUT_JSON}")
|
||||
@@ -30,52 +30,65 @@ def load(cache_path: Path) -> tuple[np.ndarray, np.ndarray, dict]:
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--data",
|
||||
"--agentic-data",
|
||||
default="analysis/characterization/data/agentic_inter_turn_gap.json",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chatbot-data",
|
||||
default="analysis/characterization/data/chatbot_inter_turn_gap.json",
|
||||
)
|
||||
parser.add_argument("--out", default="figs/f3a_inter_turn_gap.png")
|
||||
args = parser.parse_args()
|
||||
|
||||
xs, ys, d = load(Path(args.data))
|
||||
a_xs, a_ys, a_d = load(Path(args.agentic_data))
|
||||
c_xs, c_ys, c_d = load(Path(args.chatbot_data))
|
||||
|
||||
fig, ax = plt.subplots(figsize=(9, 5.2))
|
||||
ax.plot(xs, ys, color="#1f77b4", lw=2.2,
|
||||
label=f"agentic trace (n={d['n_gaps']:,} gaps, "
|
||||
f"{d['n_sessions']:,} multi-turn sessions)")
|
||||
fig, ax = plt.subplots(figsize=(10, 5.5))
|
||||
ax.plot(a_xs, a_ys, color="#1f77b4", lw=2.4,
|
||||
label=f"agentic (n={a_d['n_gaps']:,} gaps, "
|
||||
f"{a_d['n_sessions']:,} multi-turn sessions, Qwen3-Coder)")
|
||||
ax.plot(c_xs, c_ys, color="#c44e52", lw=2.4,
|
||||
label=f"chatbot (n={c_d['n_gaps']:,} gaps, "
|
||||
f"{c_d['n_sessions']:,} multi-turn sessions, qwen3-max)")
|
||||
|
||||
p = d["stats_s"]
|
||||
for pct, key in [(25, "p25"), (50, "p50"), (75, "p75"), (90, "p90")]:
|
||||
v = p[key]
|
||||
ax.scatter([v], [pct], color="#c44e52", s=55, zorder=5)
|
||||
ax.annotate(f"p{pct} = {v:.2g}s",
|
||||
xy=(v, pct), xytext=(8, -4),
|
||||
textcoords="offset points",
|
||||
fontsize=10, color="#7a1d1d")
|
||||
for d, color, side in [(a_d, "#1f4e79", "left"), (c_d, "#7a1d1d", "right")]:
|
||||
for pct, key in [(50, "p50")]:
|
||||
v = d["stats_s"][key]
|
||||
ax.scatter([v], [pct], color=color, s=55, zorder=5)
|
||||
xt = 8 if side == "left" else -110
|
||||
yt = -10 if side == "left" else 8
|
||||
ax.annotate(f"p50 = {v:.2g}s",
|
||||
xy=(v, pct), xytext=(xt, yt),
|
||||
textcoords="offset points",
|
||||
fontsize=10, color=color)
|
||||
|
||||
# Reference vertical lines: scheduler W_turn (TTFT p90 from our window_1 runs)
|
||||
refs = [
|
||||
("lmetric TTFT p90 = 15.7s", 15.7, "#888"),
|
||||
("unified TTFT p90 = 7.3s", 7.3, "#444"),
|
||||
]
|
||||
for label, v, color in refs:
|
||||
ax.axvline(v, color=color, ls=":", lw=1.2, alpha=0.85)
|
||||
ax.text(v * 1.05, 8, label, fontsize=8.5, color=color,
|
||||
ax.axvline(v, color=color, ls=":", lw=1.2, alpha=0.7)
|
||||
ax.text(v * 1.05, 5, label, fontsize=8.5, color=color,
|
||||
rotation=90, va="bottom")
|
||||
|
||||
ax.set_xscale("log")
|
||||
ax.set_xlim(0.05, 2000)
|
||||
ax.set_xlim(0.05, 5000)
|
||||
ax.set_ylim(0, 102)
|
||||
ax.set_xlabel(
|
||||
"Inter-turn gap T_external (s, log scale) "
|
||||
"— next_turn.ready − prev_turn.end"
|
||||
"Inter-turn gap T_external (s, log scale) — next.ready − prev.end"
|
||||
)
|
||||
ax.set_ylabel("Cumulative % of inter-turn intervals")
|
||||
ap = a_d["stats_s"]
|
||||
cp = c_d["stats_s"]
|
||||
af = a_d["fraction_below"]
|
||||
cf = c_d["fraction_below"]
|
||||
ax.set_title(
|
||||
"Inter-turn external gap CDF — production agentic trace\n"
|
||||
f"median T_external = {p['p50']:.2g}s; "
|
||||
f"{int(d['fraction_below']['1.0s']*100)}% gaps < 1s, "
|
||||
f"{int(d['fraction_below']['5.0s']*100)}% < 5s, "
|
||||
f"{int(d['fraction_below']['30.0s']*100)}% < 30s"
|
||||
f"Agentic vs chatbot inter-turn external gap — agentic has a "
|
||||
f"sub-second tool-call mode chatbot lacks\n"
|
||||
f"agentic p50={ap['p50']:.2g}s, frac<1s={af['1.0s']*100:.0f}%, "
|
||||
f"frac<5s={af['5.0s']*100:.0f}% · "
|
||||
f"chatbot p50={cp['p50']:.2g}s, frac<1s={cf['1.0s']*100:.0f}%, "
|
||||
f"frac<5s={cf['5.0s']*100:.0f}%"
|
||||
)
|
||||
ax.grid(True, which="both", alpha=0.3)
|
||||
ax.legend(loc="lower right", framealpha=0.92, fontsize=9)
|
||||
|
||||
Reference in New Issue
Block a user