"""Analyze core agentic workload patterns that matter for PD scheduling. Focus: what characteristics make agentic workloads different from chatbot, and how do they interact with PD-combined vs PD-sep architectures? """ import json, statistics from collections import defaultdict, Counter rows = [json.loads(l) for l in open("traces/sampled_1000req_seed42.jsonl")] rows.sort(key=lambda r: float(r["timestamp"])) BLOCK_SIZE = 512 # Build sessions chat_to_session = {} sessions = defaultdict(list) for idx, r in enumerate(rows): cid = r["chat_id"] pid = r["parent_chat_id"] sid = r.get("session_id", str(cid) if pid < 0 else chat_to_session.get(pid, str(pid))) chat_to_session[cid] = str(sid) sessions[str(sid)].append((idx, r)) mt = {k: v for k, v in sessions.items() if len(v) > 1} st = {k: v for k, v in sessions.items() if len(v) == 1} sep = "=" * 70 print(sep) print(" AGENTIC WORKLOAD CORE PATTERNS") print(sep) # Pattern 1: Bimodal request profile print("\n PATTERN 1: Bimodal Request Profile") print(" " + "-" * 40) # Compute per-request new tokens (simulating prefix cache) seen = set() warm_reqs = [] # high cache hit cold_reqs = [] # low cache hit for r in rows: hids = r.get("hash_ids", []) hit = 0 for hid in hids: if hid in seen: hit += 1 else: break for hid in hids: seen.add(hid) cache_ratio = (hit * BLOCK_SIZE) / r["input_length"] if r["input_length"] > 0 else 0 new_tokens = max(0, r["input_length"] - hit * BLOCK_SIZE) entry = {"input": r["input_length"], "new": new_tokens, "cache": cache_ratio, "output": r["output_length"]} if cache_ratio > 0.5: warm_reqs.append(entry) else: cold_reqs.append(entry) print(" Warm (cache>50%%): %d reqs (%.0f%%)" % (len(warm_reqs), len(warm_reqs)*100/len(rows))) print(" Cold (cache<=50%%): %d reqs (%.0f%%)" % (len(cold_reqs), len(cold_reqs)*100/len(rows))) warm_new = sorted([r["new"] for r in warm_reqs]) cold_new = sorted([r["new"] for r in cold_reqs]) p = lambda v, q: v[min(int(q*len(v)), len(v)-1)] if v else 0 print(" Warm new_tokens: p50=%d p90=%d" % (p(warm_new,.5), p(warm_new,.9))) print(" Cold new_tokens: p50=%d p90=%d" % (p(cold_new,.5), p(cold_new,.9))) warm_out = sorted([r["output"] for r in warm_reqs]) cold_out = sorted([r["output"] for r in cold_reqs]) print(" Warm output: p50=%d p90=%d" % (p(warm_out,.5), p(warm_out,.9))) print(" Cold output: p50=%d p90=%d" % (p(cold_out,.5), p(cold_out,.9))) # Pattern 2: Multi-turn session structure print("\n PATTERN 2: Multi-Turn Session Lifecycle") print(" " + "-" * 40) print(" Sessions: %d total, %d multi-turn (%.0f%%)" % ( len(sessions), len(mt), len(mt)*100/len(sessions))) # Per-session: KV growth across turns for sid in sorted(mt.keys(), key=lambda s: -len(mt[s]))[:5]: turns = mt[sid] turns.sort(key=lambda x: x[0]) print(" Session %s (%d turns):" % (sid[:8], len(turns))) for req_idx, r in turns[:5]: print(" turn %d: input=%d output=%d blocks=%d" % ( r.get("turn", 0), r["input_length"], r["output_length"], len(r.get("hash_ids", [])))) if len(turns) > 5: print(" ... (%d more turns)" % (len(turns) - 5)) # Pattern 3: Arrival burstiness print("\n PATTERN 3: Arrival Pattern and Concurrency") print(" " + "-" * 40) timestamps = [float(r["timestamp"]) for r in rows] inter_arrivals = [timestamps[i+1] - timestamps[i] for i in range(len(timestamps)-1)] inter_arrivals.sort() print(" Inter-arrival time (s): p50=%.2f p90=%.2f" % (p(inter_arrivals,.5), p(inter_arrivals,.9))) # Simulate concurrency at different time scales for window_s in [1, 5, 10, 30]: max_concurrent = 0 for i, ts in enumerate(timestamps): concurrent = sum(1 for t in timestamps if ts <= t < ts + window_s) max_concurrent = max(max_concurrent, concurrent) print(" Max concurrent in %ds window: %d" % (window_s, max_concurrent)) # Pattern 4: Prefill-decode compute ratio print("\n PATTERN 4: Compute Asymmetry") print(" " + "-" * 40) total_input = sum(r["input_length"] for r in rows) total_output = sum(r["output_length"] for r in rows) total_new = sum(r["new"] for r in warm_reqs + cold_reqs) print(" Total input tokens: %s" % "{:,}".format(total_input)) print(" Total output tokens: %s" % "{:,}".format(total_output)) print(" Total new tokens (after cache): %s" % "{:,}".format(total_new)) print(" I/O ratio: %.1fx" % (total_input / max(total_output, 1))) print(" New/O ratio (actual prefill/decode): %.1fx" % (total_new / max(total_output, 1))) print(" Prefill reduction from cache: %.0f%%" % ((1 - total_new/total_input) * 100)) # Pattern 5: Session KV reuse potential print("\n PATTERN 5: Where KV Reuse Comes From") print(" " + "-" * 40) # Decompose: intra-session reuse vs cross-session reuse intra_session_reuse = 0 cross_session_reuse = 0 no_reuse = 0 session_seen = defaultdict(set) global_seen = set() for r in rows: hids = r.get("hash_ids", []) cid = r["chat_id"] pid = r["parent_chat_id"] sid = r.get("session_id", str(cid) if pid < 0 else chat_to_session.get(pid, str(pid))) for hid in hids: if hid in session_seen[sid]: intra_session_reuse += BLOCK_SIZE elif hid in global_seen: cross_session_reuse += BLOCK_SIZE else: no_reuse += BLOCK_SIZE session_seen[sid].add(hid) global_seen.add(hid) total = intra_session_reuse + cross_session_reuse + no_reuse print(" Intra-session (multi-turn KV reuse): %s tokens (%.1f%%)" % ( "{:,}".format(intra_session_reuse), intra_session_reuse*100/total)) print(" Cross-session (shared prefix/system prompt): %s tokens (%.1f%%)" % ( "{:,}".format(cross_session_reuse), cross_session_reuse*100/total)) print(" New (no reuse possible): %s tokens (%.1f%%)" % ( "{:,}".format(no_reuse), no_reuse*100/total)) # Pattern 6: Implications for PD design print("\n" + sep) print(" IMPLICATIONS FOR PD DESIGN") print(sep) print(""" 1. BIMODAL PREFILL: 36%% of requests are warm (1.3k new tokens), 64%% cold (17k+). -> One-size-fits-all PD strategy suboptimal. Warm requests don't need P isolation. 2. MULTI-TURN DOMINATES REUSE: %.1f%% of reusable KV is intra-session. -> Session-sticky routing is critical. Breaking session affinity destroys APC. 3. HIGH I/O RATIO (%.1fx), but after cache: %.1fx actual prefill/decode. -> Cache dramatically reduces effective prefill compute. -> PD separation's benefit (isolate prefill compute) is reduced by cache. 4. SHORT INTER-TURN GAP (p50=2 req): multi-turn KV stays warm in LRU naturally. -> No special eviction policy needed IF routing is balanced. 5. CROSS-SESSION SHARING IS SMALL (%.1f%% of total tokens). -> System prompt sharing helps APC but is not the main source of reuse. -> Intra-session reuse (%.1f%%) is the dominant pattern. """ % ( intra_session_reuse * 100 / (intra_session_reuse + cross_session_reuse), total_input / max(total_output, 1), total_new / max(total_output, 1), cross_session_reuse * 100 / total, intra_session_reuse * 100 / total, ))