diff --git a/README.md b/README.md new file mode 100644 index 0000000..3e42f2f --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ +# agentic-kv + +Serving agentic LLM workloads by keeping the KV working set in GPU HBM +(GPU-hit-first). Research outline: [`PAPER_OUTLINE.md`](PAPER_OUTLINE.md). +Evidence + experiments: [`v2/`](v2/). + +--- + +## ⚠️ Benchmarking methodology — read this first + +> **Replay agentic traces with `--dispatch-mode thinktime`, not the default +> `tracets`.** It is the faithful, more realistic load — and the dispatch mode +> materially changes the performance you measure. + +The replayer offers two ways to time each turn: + +| mode | turn-k dispatched at | what it models | +|---|---|---| +| `tracets` (default) | `max(prev_turn_finished, trace_ts)` | absolute production schedule | +| **`thinktime` (use this)** | `prev_turn_finished + time_to_parent_chat` | real closed-loop agent pacing | + +**Why it matters.** `tracets` collapses the inter-turn think-time to ~0 whenever +the system falls behind (it fires the next turn immediately because the trace +timestamp is already in the past). That manufactures **artificial request +bursts** — spiking instantaneous concurrency → KV-pool pressure → preemption → +inflated tail latency and wasted throughput. `thinktime` keeps each turn's real +gap (tool-exec + agent think), so the offered load is what a real agent produces. + +**Measured (w600 first-300s window, 8×H20, round-robin, 100% completion):** + +| metric (N=8) | `tracets` (Mode 1) | **`thinktime` (Mode 2)** | Δ | +|---|---:|---:|---:| +| E2E p90 | 102.8 s | **73.5 s** | **−28%** | +| E2E p99 | 245 s | **227 s** | −7% | +| TTFT p90 | 56.1 s | **39.7 s** | **−29%** | +| system TPS | 111.8 | **119.3** | **+7%** | +| wall-clock | 967 s | **787 s** | −19% | +| TPOT p90 | 0.174 s | 0.188 s | ~flat | + +So under realistic capacity, `tracets` makes the system look **~30% worse on +tail latency** than it actually is. Tell-tale: scaling 6→8 instances barely helped +`tracets` (975→967 s — its bursts re-saturate regardless of capacity) but helped +`thinktime` a lot (1125→787 s). Under heavy saturation (N=6) the two converge +(E2E p90 ≈ 118–120 s), since there is no slack for bursts to harm. Decode (TPOT) +is dispatch-independent everywhere. + +**Recommendation:** benchmark with `--dispatch-mode thinktime`; use `tracets` +only as an explicit bursty stress case. Full ablation: +[`v2/exp_c_dispatch_ablation/`](v2/exp_c_dispatch_ablation/). + +### How to use it + +```bash +# 1. annotate a trace with the real per-turn gap (one-time; scans the raw trace) +python scripts/add_time_to_parent.py traces/w600_r0.0015_st30.jsonl traces/w600_ttp.jsonl + +# 2. replay closed-loop with faithful think-time +python -m replayer --trace traces/w600_ttp.jsonl --endpoint \ + --model --dispatch-mode thinktime +``` + +`time_to_parent_chat = this_turn.request_ready_time_ms − parent_turn.request_end_time_ms`, +computed from the raw trace and stored per request; turn-1 has none (fires at its +trace arrival). Traces without the field fall back to `tracets`. + +--- + +## Project map + +- [`PAPER_OUTLINE.md`](PAPER_OUTLINE.md) — GPU-hit-first paper outline (the thesis). +- [`v2/`](v2/) — evidence experiments: + - `exp_a_tier_latency/` — KV-hit cost by tier (GPU < CPU-local < remote-RDMA < miss). + - `exp_b_capacity_knee/` — realized APC / latency knee vs GPU capacity. + - `exp_c_dispatch_ablation/` — the replay-mode study above. +- `replayer/` — trace replayer (`--dispatch-mode`, closed-loop think-time). +- `scripts/add_time_to_parent.py` — trace annotation for `thinktime`. +- `microbench/`, `analysis/` — PD-disagg, routing, workload characterization. diff --git a/replayer/__main__.py b/replayer/__main__.py index d9aa0ce..105f1f4 100644 --- a/replayer/__main__.py +++ b/replayer/__main__.py @@ -30,6 +30,11 @@ def main() -> None: default=float(_env_think) if _env_think else None, help="Closed-loop think-time (s) after each turn completes; " "ignore absolute trace schedule. Env: REPLAY_INTER_TURN_THINK_S") + p.add_argument("--dispatch-mode", choices=["tracets", "thinktime"], + default=os.environ.get("REPLAY_DISPATCH_MODE", "tracets"), + help="tracets (Mode 1): absolute trace ts = max(prev_finished, ts). " + "thinktime (Mode 2): turn-k at prev_finished + " + "time_to_parent_chat. Env: REPLAY_DISPATCH_MODE") p.add_argument("--request-timeout", type=float, default=600.0) p.add_argument("--request-limit", type=int, default=None, help="Limit number of requests to replay") @@ -51,6 +56,7 @@ def main() -> None: request_limit=args.request_limit, max_inflight_sessions=args.max_inflight_sessions, inter_turn_think_s=args.inter_turn_think, + dispatch_mode=args.dispatch_mode, ) results = asyncio.run(replay_trace(config)) diff --git a/replayer/replay.py b/replayer/replay.py index eb16c77..0d55441 100644 --- a/replayer/replay.py +++ b/replayer/replay.py @@ -66,6 +66,13 @@ class ReplayConfig: # max_inflight_sessions=N this is a stable N-user closed-loop (no open-loop # runaway), so it removes the "immediate retrigger under load" artifact. inter_turn_think_s: float | None = None + # Dispatch timing for intra-session turns: + # "tracets" (Mode 1): fire at absolute trace timestamp -> effectively + # max(prev_finished, trace_ts); collapses think-time to 0 when + # the system is behind (the amplification-inflation suspect). + # "thinktime" (Mode 2): turn-1 at trace arrival; turn-k at + # prev_finished + time_to_parent_chat (real production gap). + dispatch_mode: str = "tracets" def _build_prompt_token_ids(req: TraceRequest) -> list[int]: @@ -286,14 +293,26 @@ async def _run_session( realized_context: list[int] = [] try: for turn_idx, req in enumerate(state.turns): - if config.inter_turn_think_s is not None: + if config.dispatch_mode == "thinktime": + # Mode 2: turn-1 at absolute trace arrival (preserve session + # schedule); later turns wait the REAL per-record think-time after + # the previous turn completed -> no think-collapse under load. + if turn_idx == 0: + target_wall = (req.timestamp_s - earliest_ts) + elapsed = time.perf_counter() - sweep_start + if elapsed < target_wall: + await asyncio.sleep(target_wall - elapsed) + else: + think = req.time_to_parent_chat_s + await asyncio.sleep(think if think is not None else 0.0) + elif config.inter_turn_think_s is not None: # Closed-loop: turn 1 fires on admission; later turns wait a fixed # think-time AFTER the previous turn completed (no absolute schedule, # so no "fire immediately because timestamp is in the past"). if turn_idx > 0: await asyncio.sleep(config.inter_turn_think_s) else: - # Original: dispatch at the request's absolute trace timestamp. + # Mode 1: dispatch at the request's absolute trace timestamp. target_wall = (req.timestamp_s - earliest_ts) elapsed = time.perf_counter() - sweep_start if elapsed < target_wall: diff --git a/replayer/trace.py b/replayer/trace.py index 17f6767..b8d84e6 100644 --- a/replayer/trace.py +++ b/replayer/trace.py @@ -28,6 +28,9 @@ class TraceRequest: request_type: str turn_id: int hash_ids: tuple[int, ...] + # real production gap (s) from parent turn finishing to this turn arriving; + # None for turn-1 / unannotated traces. Used by --dispatch-mode thinktime. + time_to_parent_chat_s: float | None = None def load_trace( @@ -66,6 +69,9 @@ def load_trace( request_type=str(row["type"]), turn_id=int(row["turn"]), hash_ids=tuple(int(h) for h in row.get("hash_ids", [])), + time_to_parent_chat_s=( + float(row["time_to_parent_chat"]) + if row.get("time_to_parent_chat") is not None else None), )) return requests diff --git a/scripts/add_time_to_parent.py b/scripts/add_time_to_parent.py new file mode 100644 index 0000000..76a9a51 --- /dev/null +++ b/scripts/add_time_to_parent.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +"""Annotate a formatted/sampled trace with `time_to_parent_chat` (seconds). + +time_to_parent_chat = this_turn.request_ready_time_ms - parent_turn.request_end_time_ms +i.e. the real external gap (tool exec + agent think) between the parent turn +*finishing* in production and this turn *arriving*. Turn-1 (parent_chat_id == -1) +gets null. + +The end/ready times live only in the raw trace (meta.*); the sampled trace has +neither. So we scan the raw trace once (byte-level field extraction, early-exit +once every needed chat_id is found) to build {chat_id: (ready_ms, end_ms)}, +then join. + +Run on dash0 (raw trace is there). +""" +from __future__ import annotations +import json +import sys +import time +from pathlib import Path + +RAW = "/home/admin/cpfs/wjh/ali-trace/trace-glm5.1-formatted/051315-051317-raw.jsonl" +KCHAT = b'"chat_id":' +KREADY = b'"request_ready_time_ms":' +KEND = b'"request_end_time_ms":' + + +def parse_int_after(line: bytes, key: bytes): + i = line.find(key) + if i < 0: + return None + i += len(key) + n = len(line) + while i < n and line[i] in (0x20, 0x09): # space/tab + i += 1 + j = i + if j < n and line[j] == 0x2D: # '-' + j += 1 + while j < n and 0x30 <= line[j] <= 0x39: + j += 1 + return int(line[i:j]) if j > i and line[i:j] != b'-' else None + + +def scan_timing(needed: set[int], raw_path: str) -> dict[int, tuple[int, int]]: + timing: dict[int, tuple[int, int]] = {} + t0 = time.time() + nbytes = 0 + with open(raw_path, "rb", buffering=1 << 22) as f: + for line in f: + nbytes += len(line) + cid = parse_int_after(line, KCHAT) + if cid is None or cid not in needed or cid in timing: + continue + ready = parse_int_after(line, KREADY) + end = parse_int_after(line, KEND) + if ready is None or end is None: + continue + timing[cid] = (ready, end) + if len(timing) == len(needed): + break + print(f"[scan] found {len(timing)}/{len(needed)} chats in " + f"{nbytes/1e9:.1f} GB / {time.time()-t0:.0f}s", flush=True) + return timing + + +def main(): + in_trace = Path(sys.argv[1]) + out_trace = Path(sys.argv[2]) + raw_path = sys.argv[3] if len(sys.argv) > 3 else RAW + + rows = [json.loads(l) for l in open(in_trace)] + chats = {r["chat_id"] for r in rows} + parents = {r["parent_chat_id"] for r in rows + if r.get("parent_chat_id") not in (None, -1, 0, "")} + needed = chats | parents + print(f"[trace] {len(rows)} reqs, {len(chats)} chats, " + f"{len(parents)} parents, {len(needed)} chat_ids to look up", flush=True) + + timing = scan_timing(needed, raw_path) + + # annotate + n_ann = n_neg = 0 + ttps = [] + min_ready = min((t[0] for t in timing.values()), default=0) + for r in rows: + p = r.get("parent_chat_id") + ttp = None + if p not in (None, -1, 0, "") and p in timing and r["chat_id"] in timing: + ttp = (timing[r["chat_id"]][0] - timing[p][1]) / 1000.0 # ready - parent.end + if ttp < 0: + n_neg += 1 + ttp = 0.0 + ttps.append(ttp) + n_ann += 1 + r["time_to_parent_chat"] = ttp + # sanity field: re-derived ready offset (s) to cross-check vs timestamp + if r["chat_id"] in timing: + r["_ready_off_s"] = (timing[r["chat_id"]][0] - min_ready) / 1000.0 + + with open(out_trace, "w") as o: + for r in rows: + o.write(json.dumps(r) + "\n") + + ttps.sort() + n = len(ttps) + pc = lambda q: ttps[min(int(q * n), n - 1)] if n else 0 + print(f"[done] annotated {n_ann} turns with time_to_parent_chat " + f"({n_neg} negative clamped to 0)") + print(f"[ttp] p25={pc(.25):.2f}s p50={pc(.5):.2f}s p90={pc(.9):.2f}s " + f"p99={pc(.99):.2f}s (f3a ref: p50~1.6s)") + print(f"[ttp] frac<1s={sum(1 for x in ttps if x<1)/n:.0%} " + f"frac<5s={sum(1 for x in ttps if x<5)/n:.0%}") + # cross-check timestamp vs re-derived ready offset on a few rows + chk = [(r["timestamp"], r["_ready_off_s"]) for r in rows if "_ready_off_s" in r][:5] + print("[sanity] (trace.timestamp, raw ready_off_s):", [(round(a,1),round(b,1)) for a,b in chk]) + print(f"wrote {out_trace}") + + +if __name__ == "__main__": + main() diff --git a/v2/.gitignore b/v2/.gitignore index 351cd46..db3cddd 100644 --- a/v2/.gitignore +++ b/v2/.gitignore @@ -1,3 +1,3 @@ # raw per-request replay dumps (~0.6 MB each) — regenerable; keep summary/m0/m1 -*/results/metrics_blk*.jsonl +*/results/metrics_*.jsonl */results/vllm_*.log diff --git a/v2/exp_c_dispatch_ablation/README.md b/v2/exp_c_dispatch_ablation/README.md new file mode 100644 index 0000000..995b727 --- /dev/null +++ b/v2/exp_c_dispatch_ablation/README.md @@ -0,0 +1,61 @@ +# exp (c) — Replay dispatch mode: `tracets` vs `thinktime` + +Which replay mode should we benchmark agentic serving with, and how much does it +change the measured performance? + +**Two dispatch modes** (`replayer --dispatch-mode`): +- **`tracets`** (default): turn-k at the absolute trace timestamp ⇒ effectively + `max(prev_finished, trace_ts)`. +- **`thinktime`**: turn-1 at trace arrival; turn-k at + `prev_finished + time_to_parent_chat` (the REAL production gap; annotated by + `scripts/add_time_to_parent.py` from the raw trace's + `request_ready_time_ms`/`request_end_time_ms`). + +Setup: w600 windowed to first 300 s (366 reqs, 223 multi-turn), round-robin across +N H20 instances, both modes on the same instances, 100% completion throughout. + +## Performance result + +| metric | N6 tracets | N6 thinktime | N8 tracets | **N8 thinktime** | +|---|---:|---:|---:|---:| +| system TPS | 110.9 | 96.1 | 111.8 | **119.3** | +| wall (s) | 975 | 1125 | 967 | **787** | +| TTFT p50 / p90 / p99 | 4.4 / 61.8 / 135 | 4.5 / 83.7 / 130 | 2.9 / 56.1 / 115 | 3.1 / **39.7** / **83.5** | +| TPOT p50 / p90 / p99 | .039 / .242 / .96 | .037 / .264 / .69 | .037 / .174 / .89 | .037 / .188 / .85 | +| E2E p50 / p90 / p99 | 17.1 / 118 / 298 | 15.0 / 120 / 338 | 11.9 / 102.8 / 245 | 12.3 / **73.5** / **227** | + +**At N=8 (capacity slack), `thinktime` is clearly better**: E2E p90 −28%, TTFT p90 +−29%, TPS +7%, wall −19%. **At N=6 (saturated) they converge** (E2E p90 ≈ 118–120 s). +TPOT (decode) is dispatch-independent everywhere. + +## Why — the mechanism (`figs/exp_c_dispatch_ablation.png`) + +`tracets` collapses the realized inter-turn gap to ~0 under load (p50 0.00 s, 75% +< 0.5 s) — it fires the next turn immediately because the trace timestamp is in the +past. `thinktime` preserves the real gap (p50 1.22 s = the trace). The figure shows +both realized-gap CDFs against the real `time_to_parent_chat`. + +That gap-collapse manufactures **bursts** → peak concurrency spikes → KV-pool +pressure → preemption → inflated tail latency + wasted throughput. The bursts +re-saturate the system regardless of capacity, which is why scaling 6→8 instances +barely helped `tracets` (975→967 s) but helped `thinktime` a lot (1125→787 s). +Under saturation (N=6) there is no slack for bursts to harm, so the modes converge. + +## Conclusion + +Benchmark agentic serving with **`--dispatch-mode thinktime`** — it is the faithful +closed-loop agent load and avoids the `tracets` burst artifact that makes the system +look ~30% worse on tail latency than it is. Use `tracets` only as an explicit bursty +stress case. (See the repo [`README.md`](../../README.md) for the headline guidance.) + +Caveat: round-robin pays full prefill every turn (no cache reuse), so absolute +latencies here are high; a cache-aware policy (LPWL) would lower them and likely +widen the `thinktime` advantage. The raw window is also heavy (E2E in tens of +seconds); a lighter load shows a healthier operating point. + +## Repro +```bash +N=8 TRACE=traces/w600_ttp_win.jsonl bash v2/exp_c_dispatch_ablation/run_ablation.sh +python v2/exp_c_dispatch_ablation/analyze.py traces/w600_ttp_win.jsonl \ + v2/exp_c_dispatch_ablation/results/metrics_{tracets,thinktime}.jsonl +``` diff --git a/v2/exp_c_dispatch_ablation/analyze.py b/v2/exp_c_dispatch_ablation/analyze.py new file mode 100644 index 0000000..736f28f --- /dev/null +++ b/v2/exp_c_dispatch_ablation/analyze.py @@ -0,0 +1,115 @@ +"""Mode 1 (tracets) vs Mode 2 (thinktime): wall-clock amplification + the +think-collapse mechanism. + +Mechanism (the smoking gun): realized inter-turn gap = this.t_dispatch_unix - +prev.t_finish_unix, per session, consecutive turns. Mode 1 collapses it toward 0 +when the system is behind; Mode 2 holds it at the real time_to_parent_chat. + +Amplification: wall / ideal_span. Mode 1 ideal = trace_span; Mode 2 ideal = the +think-chain span under instant serving (analytic). If Mode 1 amplifies (>>1) with +collapsed gaps while Mode 2 stays ~1 with preserved gaps, the 8x was a Mode-1 +artifact, not a workload property. + +Usage: analyze.py [fig.png] +""" +import json +import sys +from collections import defaultdict + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + + +def pct(v, q): + v = sorted(v) + return v[min(int(q * len(v)), len(v) - 1)] if v else 0.0 + + +def load_trace(trace): + rows = [json.loads(l) for l in open(trace)] + span = max(r["timestamp"] for r in rows) - min(r["timestamp"] for r in rows) + by = defaultdict(list) + for r in rows: + by[r["session_id"]].append(r) + starts, ends, ttps = [], [], [] + for sid, t in by.items(): + t.sort(key=lambda r: (r["turn"], r["timestamp"])) + s = t[0]["timestamp"] + chain = sum((r.get("time_to_parent_chat") or 0.0) for r in t[1:]) + starts.append(s); ends.append(s + chain) + ttps += [r["time_to_parent_chat"] for r in t[1:] + if r.get("time_to_parent_chat") is not None] + return span, (max(ends) - min(starts)), sorted(ttps) + + +def realized_gaps(metrics): + """this.t_dispatch - prev.t_finish, per session, consecutive turns.""" + rows = [json.loads(l) for l in open(metrics)] + ok = [r for r in rows if not r.get("error") and r.get("t_dispatch_unix")] + by = defaultdict(list) + for r in ok: + by[r["session_id"]].append(r) + gaps = [] + for sid, t in by.items(): + t.sort(key=lambda r: r["turn_id"]) + for a, b in zip(t, t[1:]): + if a.get("t_finish_unix") and b.get("t_dispatch_unix"): + gaps.append(max(0.0, b["t_dispatch_unix"] - a["t_finish_unix"])) + return sorted(gaps), rows + + +def wallclock(metrics): + rows = [json.loads(l) for l in open(metrics)] + ds = [r["t_dispatch_unix"] for r in rows if r.get("t_dispatch_unix")] + fs = [r["t_finish_unix"] for r in rows if r.get("t_finish_unix")] + return (max(fs) - min(ds)) if ds and fs else 0.0 + + +def e2e(metrics): + ok = [json.loads(l) for l in open(metrics)] + ok = [r for r in ok if not r.get("error")] + return pct([r["latency_s"] for r in ok if r.get("latency_s")], .9), len(ok) + + +def main(): + trace, m1, m2 = sys.argv[1:4] + fig = sys.argv[4] if len(sys.argv) > 4 else "v2/figs/exp_c_dispatch_ablation.png" + span, ideal2, ttps = load_trace(trace) + g1, _ = realized_gaps(m1) + g2, _ = realized_gaps(m2) + w1, w2 = wallclock(m1), wallclock(m2) + e1, n1 = e2e(m1) + e2_, n2 = e2e(m2) + + def collapsed(g): # fraction of realized gaps << the trace think-time median + return sum(1 for x in g if x < 0.5) / len(g) if g else 0 + + print(f"trace_span={span:.0f}s mode2_ideal_span={ideal2:.0f}s " + f"trace_ttp: p50={pct(ttps,.5):.2f}s frac<1s={sum(1 for x in ttps if x<1)/len(ttps):.0%}\n") + print(f"{'mode':<16}{'wall_s':>8}{'amp/ideal':>10}{'e2e_p90':>9}" + f"{'realgap_p50':>12}{'frac<0.5s':>10}") + for name, w, ideal, e, n, g in [ + ("Mode1 tracets", w1, span, e1, n1, g1), + ("Mode2 thinktime", w2, ideal2, e2_, n2, g2)]: + print(f"{name:<16}{w:>8.0f}{w/ideal:>10.2f}{e:>9.1f}" + f"{pct(g,.5):>12.2f}{collapsed(g):>10.0%}") + + # figure: realized inter-turn gap CDF, trace vs Mode1 vs Mode2 + plt.figure(figsize=(7.2, 4.8)) + for data, lab, c in [(ttps, "trace time_to_parent_chat (real)", "#1f77b4"), + (g2, "Mode2 thinktime: realized gap", "#2ca02c"), + (g1, "Mode1 tracets: realized gap", "#d62728")]: + if data: + d = sorted(x for x in data if x >= 0) + ys = [(i + 1) / len(d) for i in range(len(d))] + plt.plot([max(x, 1e-3) for x in d], ys, label=lab, c=c, lw=2) + plt.xscale("log"); plt.xlabel("inter-turn gap (s, log)"); plt.ylabel("CDF") + plt.title("Mode 1 collapses the inter-turn gap under load; Mode 2 preserves it") + plt.legend(); plt.grid(alpha=.3, which="both"); plt.tight_layout() + plt.savefig(fig, dpi=140) + print("wrote", fig) + + +if __name__ == "__main__": + main() diff --git a/v2/exp_c_dispatch_ablation/results/perf_summary.json b/v2/exp_c_dispatch_ablation/results/perf_summary.json new file mode 100644 index 0000000..4575015 --- /dev/null +++ b/v2/exp_c_dispatch_ablation/results/perf_summary.json @@ -0,0 +1,87 @@ +{ + "setup": "w600 first-300s window (366 req, 223 multi-turn), round-robin x N H20, Qwen3-Coder-30B-A3B", + "N6_tracets": { + "n": 366, + "ok": 366, + "wall_s": 974.8, + "tps": 110.9, + "ttft": { + "p50": 4.414, + "p90": 61.791, + "p99": 135.243 + }, + "tpot": { + "p50": 0.039, + "p90": 0.242, + "p99": 0.958 + }, + "e2e": { + "p50": 17.074, + "p90": 118.02, + "p99": 297.572 + } + }, + "N6_thinktime": { + "n": 366, + "ok": 366, + "wall_s": 1125.1, + "tps": 96.1, + "ttft": { + "p50": 4.52, + "p90": 83.662, + "p99": 130.373 + }, + "tpot": { + "p50": 0.037, + "p90": 0.264, + "p99": 0.694 + }, + "e2e": { + "p50": 15.029, + "p90": 119.68, + "p99": 338.466 + } + }, + "N8_tracets": { + "n": 366, + "ok": 366, + "wall_s": 967.2, + "tps": 111.8, + "ttft": { + "p50": 2.869, + "p90": 56.128, + "p99": 115.189 + }, + "tpot": { + "p50": 0.037, + "p90": 0.174, + "p99": 0.89 + }, + "e2e": { + "p50": 11.879, + "p90": 102.849, + "p99": 245.492 + } + }, + "N8_thinktime": { + "n": 365, + "ok": 365, + "wall_s": 787.0, + "tps": 119.3, + "ttft": { + "p50": 3.099, + "p90": 39.663, + "p99": 83.524 + }, + "tpot": { + "p50": 0.037, + "p90": 0.188, + "p99": 0.853 + }, + "e2e": { + "p50": 12.256, + "p90": 73.525, + "p99": 227.295 + } + } +} \ No newline at end of file diff --git a/v2/exp_c_dispatch_ablation/run_ablation.sh b/v2/exp_c_dispatch_ablation/run_ablation.sh new file mode 100644 index 0000000..69c86b1 --- /dev/null +++ b/v2/exp_c_dispatch_ablation/run_ablation.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Exp (c): does wall-clock amplification survive Mode 2 (real think-time)? +# Launch N vLLM instances; replayer round-robins across them; replay the SAME +# annotated trace under Mode 1 (tracets) and Mode 2 (thinktime). +set -uo pipefail +cd /home/admin/cpfs/wjh/agentic-kv +PY=.venv/bin/python +MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct +N=${N:-4} +TRACE=${TRACE:-traces/w600_ttp_win.jsonl} +REQLIMIT=${REQLIMIT:-} +OUT=v2/exp_c_dispatch_ablation/results +mkdir -p "$OUT" +PIDS=() +EPS="" + +launch() { # $1 gpu, $2 port + CUDA_VISIBLE_DEVICES=$1 VLLM_LOGGING_LEVEL=WARNING \ + $PY -m vllm.entrypoints.openai.api_server --model "$MODEL" \ + --host 0.0.0.0 --port $2 --tensor-parallel-size 1 --trust-remote-code \ + --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000 \ + --gpu-memory-utilization 0.9 > "$OUT/vllm_$2.log" 2>&1 & + PIDS+=($!) +} +teardown() { + for p in "${PIDS[@]:-}"; do kill -TERM "$p" 2>/dev/null; done + sleep 6 + for p in $(pgrep -f "VLLM::EngineCore"); do kill -9 "$p" 2>/dev/null; done + sleep 3 +} +trap teardown EXIT + +echo ">>> launch $N instances" +for i in $(seq 0 $((N-1))); do + launch "$i" $((8000+i)) + EPS="$EPS,http://127.0.0.1:$((8000+i))" +done +EPS="${EPS#,}" +for i in $(seq 0 $((N-1))); do + echo -n " wait health $((8000+i))..." + timeout 900 bash -c "until curl -sf http://127.0.0.1:$((8000+i))/health >/dev/null 2>&1; do sleep 5; done" \ + && echo ok || { echo FAIL; exit 1; } +done + +LIM=""; [ -n "$REQLIMIT" ] && LIM="--request-limit $REQLIMIT" +for MODE in tracets thinktime; do + echo "=== replay dispatch-mode=$MODE ===" + $PY -m replayer --trace "$TRACE" --output "$OUT/metrics_$MODE.jsonl" \ + --endpoint "$EPS" --model "$MODEL" --dispatch-mode "$MODE" $LIM + cp "$OUT/metrics_$MODE.summary.json" "$OUT/summary_$MODE.json" 2>/dev/null || true +done +teardown +echo "=== exp (c) DONE ===" diff --git a/v2/figs/exp_c_dispatch_ablation.png b/v2/figs/exp_c_dispatch_ablation.png new file mode 100644 index 0000000..1b9760a Binary files /dev/null and b/v2/figs/exp_c_dispatch_ablation.png differ