v2 exp(b): GPU KV-capacity APC/latency knee + writeup

Sweeps GPU KV-cache capacity (--num-gpu-blocks-override) under a closed-loop replay (concurrency 4) of a controlled multi-turn workload (cumulative intra-session prefix, gen_synth_trace.py), measuring realized APC (prefix_cache hits/queries delta) and latency per capacity. Result: a sharp knee at 3.6 GB = exactly the active working set (4 sessions x 0.91 GB). APC rises 7->12->36->80% then saturates at the ~71% intra-session ceiling; TTFT p90 collapses 13.0 s -> 0.53 s at the same point; dead flat to 14.5 GB, 100% completion throughout. So only the active working set needs HBM; capacity beyond it -- and the CPU/storage tier built to chase the reuse tail -- buys ~0. Knee scales linearly with concurrency = cluster GPU count. README.md ties exp(a)+exp(b) into the section-2.2 GPU-hit-first argument with tables, conclusions, and caveats. Raw per-request dumps gitignored; summary/m0/m1 deltas kept. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 11:23:31 +08:00
parent 837df6bc9e
commit ad754cfe0b
22 changed files with 393 additions and 0 deletions
--- a/v2/exp_b_capacity_knee/analyze_and_plot.py
+++ b/v2/exp_b_capacity_knee/analyze_and_plot.py
@@ -0,0 +1,71 @@
+"""Analyze + plot exp (b): realized APC and latency vs GPU KV capacity (the knee)."""
+import json
+import statistics
+import sys
+from pathlib import Path
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+# Results directory (argv[1]) and output figure path (argv[2]); both fall back
+# to the in-repo defaults when the argument is not given.
+R = Path(sys.argv[1] if len(sys.argv) > 1 else "v2/exp_b_capacity_knee/results")
+FIG = Path(sys.argv[2] if len(sys.argv) > 2 else "v2/figs/exp_b_capacity_knee.png")
+# Bytes of KV cache per block, used to convert a block count into GB.
+# NOTE(review): reads as 16 tokens/block x 98304 bytes/token; 98304 is presumably
+# the per-token KV footprint of the model under test -- confirm.
+BLOCK_BYTES = 16 * 98304  # 1.573 MB / block
+
+
+def pct(v, q):
+    """Return the q-quantile of v by nearest-rank lookup, or 0.0 when v is empty.
+
+    The rank is int(q * n) into the ascending sort of v, clamped to the last
+    element so q == 1.0 yields the maximum.
+    """
+    ordered = sorted(v)
+    if not ordered:
+        return 0.0
+    last = len(ordered) - 1
+    rank = int(q * len(ordered))
+    return ordered[rank if rank < last else last]
+
+
+# One summary row per capacity point, built from the replay dump
+# (metrics_blk<N>.jsonl) and the counter snapshots m0/m1 taken around the replay.
+rows = []
+# Sort numerically by block count; a lexical sort would put blk1024 before blk768.
+for mf in sorted(R.glob("metrics_blk*.jsonl"), key=lambda p: int(p.stem.split("blk")[1])):
+    blk = int(mf.stem.split("blk")[1])
+    gb = blk * BLOCK_BYTES / 1e9
+    # Close the dump deterministically and tolerate blank lines (e.g. a trailing newline).
+    with open(mf, encoding="utf-8") as fh:
+        recs = [json.loads(line) for line in fh if line.strip()]
+    ok = [r for r in recs if not r.get("error")]
+    # Requests without a (non-zero) latency value are left out of the percentiles.
+    ttft = [r["ttft_s"] for r in ok if r.get("ttft_s")]
+    e2e = [r["latency_s"] for r in ok if r.get("latency_s")]
+    # A capacity point whose snapshots are missing or empty (interrupted sweep,
+    # failed scrape) is skipped with a warning instead of aborting the analysis.
+    try:
+        m0 = json.loads((R / f"m0_blk{blk}.json").read_text(encoding="utf-8"))
+        m1 = json.loads((R / f"m1_blk{blk}.json").read_text(encoding="utf-8"))
+    except (FileNotFoundError, json.JSONDecodeError) as exc:
+        print(f"skipping blocks={blk}: unusable counter snapshot ({exc})", file=sys.stderr)
+        continue
+    # Realized APC = hits / queries accumulated during the replay (m1 - m0).
+    dq = m1["gpu_queries"] - m0["gpu_queries"]
+    dh = m1["gpu_hits"] - m0["gpu_hits"]
+    apc = dh / dq if dq > 0 else 0.0
+    rows.append({
+        "blocks": blk, "gb": gb,
+        "apc": apc,
+        "completion": len(ok) / len(recs) if recs else 0,
+        "n_ok": len(ok), "n": len(recs),
+        "ttft_p50": pct(ttft, .5), "ttft_p90": pct(ttft, .9),
+        "e2e_p50": pct(e2e, .5), "e2e_p90": pct(e2e, .9),
+    })
+
+# Human-readable table on stdout, one line per capacity point.
+print(f"{'GB':>6} {'blocks':>7} {'APC':>7} {'compl':>6} {'TTFTp50':>8} {'TTFTp90':>8} {'E2Ep90':>8}")
+for r in rows:
+    print(f"{r['gb']:>6.1f} {r['blocks']:>7} {r['apc']:>6.1%} {r['completion']:>6.0%} "
+          f"{r['ttft_p50']:>8.3f} {r['ttft_p90']:>8.3f} {r['e2e_p90']:>8.3f}")
+# Machine-readable copy of the same rows; the context manager guarantees the
+# file is flushed and closed before the script moves on to plotting.
+with open(R / "summary.json", "w", encoding="utf-8") as fh:
+    json.dump(rows, fh, indent=2)
+
+# Dual-axis figure: realized APC (left, %) and TTFT p90 (right, s) vs capacity.
+if rows:
+    # Pull the x axis and both series out of the rows once, up front.
+    capacity_gb = [row["gb"] for row in rows]
+    apc_percent = [row["apc"] * 100 for row in rows]
+    ttft_p90_s = [row["ttft_p90"] for row in rows]
+    apc_color = "#2ca02c"
+    ttft_color = "#d62728"
+
+    fig, ax1 = plt.subplots(figsize=(7.4, 5.0))
+
+    # Left axis: realized APC.
+    ax1.plot(capacity_gb, apc_percent, "o-", color=apc_color,
+             linewidth=2.2, markersize=8, label="Realized APC")
+    ax1.set_xlabel("GPU KV-cache capacity (GB)")
+    ax1.set_ylabel("Realized APC (%)", color=apc_color)
+    ax1.tick_params(axis="y", labelcolor=apc_color)
+    ax1.set_ylim(0, 100)
+    ax1.grid(True, alpha=0.3)
+
+    # Right axis: tail time-to-first-token, sharing the capacity x axis.
+    ax2 = ax1.twinx()
+    ax2.plot(capacity_gb, ttft_p90_s, "s--", color=ttft_color,
+             linewidth=2, markersize=7, label="TTFT p90")
+    ax2.set_ylabel("TTFT p90 (s)", color=ttft_color)
+    ax2.tick_params(axis="y", labelcolor=ttft_color)
+
+    ax1.set_title("APC and latency saturate at small GPU KV capacity\n"
+                  "Qwen3-Coder-30B-A3B, 1xH20, agentic trace replay")
+    fig.tight_layout()
+
+    # Create the figure directory on demand before saving.
+    FIG.parent.mkdir(parents=True, exist_ok=True)
+    fig.savefig(FIG, dpi=140)
+    print("wrote", FIG)
--- a/v2/exp_b_capacity_knee/gen_synth_trace.py
+++ b/v2/exp_b_capacity_knee/gen_synth_trace.py
@@ -0,0 +1,55 @@
+"""Controlled multi-turn agentic workload for the capacity->APC knee.
+
+Each session grows its prefix cumulatively: turn k appends G fresh blocks and
+reuses all blocks of turns 1..k-1 (intra-session prefix reuse, the dominant
+mode per the trace, 93% intra-session). Block ids are namespaced per session so
+cross-session reuse is ~0. Intra-session APC ceiling = (T-1)/(T+1).
+
+timestamp=0 => the replayer fires closed-loop, gated only by max-inflight-sessions.
+"""
+import argparse
+import json
+
+BLOCK = 16  # tokens/block (vLLM default)
+
+
+def main():
+    """Generate the synthetic multi-turn trace and write it as JSONL to --out.
+
+    Each session emits --turns requests. Request k carries the hash ids of every
+    block appended in turns 1..k, i.e. the full prefix of the previous turn plus
+    --blocks-per-turn new blocks. A short summary (request count, per-session
+    working set, intra-session APC ceiling) is printed to stdout.
+
+    Exits with an argparse usage error when the requested sizes would make the
+    arithmetically derived ids of different sessions overlap.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--sessions", type=int, default=40)
+    ap.add_argument("--turns", type=int, default=8)
+    ap.add_argument("--blocks-per-turn", type=int, default=192)  # 3072 tok/turn
+    ap.add_argument("--output-len", type=int, default=100)
+    ap.add_argument("--out", required=True)
+    a = ap.parse_args()
+
+    ws_blocks = a.turns * a.blocks_per_turn
+    # chat_id is s*1000 + k and block ids are s*10_000_000 + i. Beyond these
+    # bounds two sessions would share ids, silently creating duplicate chat_ids
+    # or cross-session prefix reuse, so refuse such sizes up front.
+    if a.sessions > 1 and a.turns > 1000:
+        ap.error("--turns must be <= 1000 so chat_ids stay unique across sessions")
+    if a.sessions > 1 and ws_blocks > 10_000_000:
+        ap.error("--turns x --blocks-per-turn must be <= 10000000 so block ids "
+                 "stay unique across sessions")
+
+    rows = []
+    for s in range(a.sessions):
+        base = s * 10_000_000  # unique block namespace per session
+        cum = []  # every block id this session has issued so far
+        for k in range(1, a.turns + 1):
+            # Append this turn's fresh blocks after the already-issued prefix.
+            cum.extend(range(base + len(cum), base + len(cum) + a.blocks_per_turn))
+            rows.append({
+                "chat_id": s * 1000 + k,
+                # The first turn of a session has no parent (0).
+                "parent_chat_id": (s * 1000 + k - 1) if k > 1 else 0,
+                "timestamp": 0.0,
+                "input_length": len(cum) * BLOCK,
+                "output_length": a.output_len,
+                "type": "coder",
+                "turn": k,
+                # Snapshot copy: cum keeps growing on later turns.
+                "hash_ids": list(cum),
+                "session_id": f"s{s}",
+            })
+    with open(a.out, "w", encoding="utf-8") as o:
+        o.writelines(json.dumps(r) + "\n" for r in rows)
+    apc = (a.turns - 1) / (a.turns + 1)
+    print(f"wrote {len(rows)} reqs ({a.sessions} sessions x {a.turns} turns) -> {a.out}")
+    # 98304 is the same bytes-per-token factor analyze_and_plot.py uses in BLOCK_BYTES.
+    print(f"session working set = {ws_blocks} blocks ({ws_blocks*BLOCK} tok, "
+          f"{ws_blocks*BLOCK*98304/1e9:.2f} GB); max req = {ws_blocks*BLOCK} tok")
+    print(f"intra-session APC ceiling = {apc:.1%}")
+
+
+if __name__ == "__main__":
+    main()
--- a/v2/exp_b_capacity_knee/results/m0_blk1024.json
+++ b/v2/exp_b_capacity_knee/results/m0_blk1024.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780084807.7091374, "gpu_queries": 1780084807.7091217, "ext_hits": 1780084807.7091625, "ext_queries": 1780084807.7091503}
--- a/v2/exp_b_capacity_knee/results/m0_blk1536.json
+++ b/v2/exp_b_capacity_knee/results/m0_blk1536.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780085167.731176, "gpu_queries": 1780085167.73116, "ext_hits": 1780085167.7312036, "ext_queries": 1780085167.7311893}
--- a/v2/exp_b_capacity_knee/results/m0_blk2304.json
+++ b/v2/exp_b_capacity_knee/results/m0_blk2304.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780085450.084966, "gpu_queries": 1780085450.0849319, "ext_hits": 1780085450.085004, "ext_queries": 1780085450.0849845}
--- a/v2/exp_b_capacity_knee/results/m0_blk3072.json
+++ b/v2/exp_b_capacity_knee/results/m0_blk3072.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780085701.1922042, "gpu_queries": 1780085701.1921885, "ext_hits": 1780085701.1922336, "ext_queries": 1780085701.1922188}
--- a/v2/exp_b_capacity_knee/results/m0_blk4608.json
+++ b/v2/exp_b_capacity_knee/results/m0_blk4608.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780085943.247891, "gpu_queries": 1780085943.247875, "ext_hits": 1780085943.247915, "ext_queries": 1780085943.2479026}
--- a/v2/exp_b_capacity_knee/results/m0_blk6144.json
+++ b/v2/exp_b_capacity_knee/results/m0_blk6144.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780086191.0650043, "gpu_queries": 1780086191.06498, "ext_hits": 1780086191.0650318, "ext_queries": 1780086191.0650187}
--- a/v2/exp_b_capacity_knee/results/m0_blk768.json
+++ b/v2/exp_b_capacity_knee/results/m0_blk768.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780084321.73404, "gpu_queries": 1780084321.7340264, "ext_hits": 1780084321.7340639, "ext_queries": 1780084321.7340522}
--- a/v2/exp_b_capacity_knee/results/m0_blk9216.json
+++ b/v2/exp_b_capacity_knee/results/m0_blk9216.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780086433.7639863, "gpu_queries": 1780086433.7639701, "ext_hits": 1780086433.764013, "ext_queries": 1780086433.7640002}
--- a/v2/exp_b_capacity_knee/results/m1_blk1024.json
+++ b/v2/exp_b_capacity_knee/results/m1_blk1024.json
@@ -0,0 +1 @@
+{"gpu_hits": 1783032455.7091374, "gpu_queries": 1804304455.7091217, "ext_hits": 1780084807.7091625, "ext_queries": 1780084807.7091503}
--- a/v2/exp_b_capacity_knee/results/m1_blk1536.json
+++ b/v2/exp_b_capacity_knee/results/m1_blk1536.json
@@ -0,0 +1 @@
+{"gpu_hits": 1784993167.731176, "gpu_queries": 1793597359.73116, "ext_hits": 1780085167.7312036, "ext_queries": 1780085167.7311893}
--- a/v2/exp_b_capacity_knee/results/m1_blk2304.json
+++ b/v2/exp_b_capacity_knee/results/m1_blk2304.json
@@ -0,0 +1 @@
+{"gpu_hits": 1781831882.084966, "gpu_queries": 1782260426.0849319, "ext_hits": 1780085450.085004, "ext_queries": 1780085450.0849845}
--- a/v2/exp_b_capacity_knee/results/m1_blk3072.json
+++ b/v2/exp_b_capacity_knee/results/m1_blk3072.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780650181.1922042, "gpu_queries": 1780859845.1921885, "ext_hits": 1780085701.1922336, "ext_queries": 1780085701.1922188}
--- a/v2/exp_b_capacity_knee/results/m1_blk4608.json
+++ b/v2/exp_b_capacity_knee/results/m1_blk4608.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780650423.247891, "gpu_queries": 1780860087.247875, "ext_hits": 1780085943.247915, "ext_queries": 1780085943.2479026}
--- a/v2/exp_b_capacity_knee/results/m1_blk6144.json
+++ b/v2/exp_b_capacity_knee/results/m1_blk6144.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780650671.0650043, "gpu_queries": 1780860335.06498, "ext_hits": 1780086191.0650318, "ext_queries": 1780086191.0650187}
--- a/v2/exp_b_capacity_knee/results/m1_blk768.json
+++ b/v2/exp_b_capacity_knee/results/m1_blk768.json
@@ -0,0 +1 @@
+{"gpu_hits": 1782356641.73404, "gpu_queries": 1810984033.7340264, "ext_hits": 1780084321.7340639, "ext_queries": 1780084321.7340522}
--- a/v2/exp_b_capacity_knee/results/m1_blk9216.json
+++ b/v2/exp_b_capacity_knee/results/m1_blk9216.json
@@ -0,0 +1 @@
+{"gpu_hits": 1780650913.7639863, "gpu_queries": 1780860577.7639701, "ext_hits": 1780086433.764013, "ext_queries": 1780086433.7640002}
--- a/v2/exp_b_capacity_knee/results/summary.json
+++ b/v2/exp_b_capacity_knee/results/summary.json
@@ -0,0 +1,98 @@
+[
+  {
+    "blocks": 768,
+    "gb": 1.207959552,
+    "apc": 0.07353854948550977,
+    "completion": 1.0,
+    "n_ok": 144,
+    "n": 144,
+    "ttft_p50": 8.315758996002842,
+    "ttft_p90": 13.000879739003722,
+    "e2e_p50": 11.904735280026216,
+    "e2e_p90": 16.53674147298443
+  },
+  {
+    "blocks": 1024,
+    "gb": 1.610612736,
+    "apc": 0.12170482411635379,
+    "completion": 1.0,
+    "n_ok": 144,
+    "n": 144,
+    "ttft_p50": 4.015194748993963,
+    "ttft_p90": 8.895869197003776,
+    "e2e_p50": 7.799231034005061,
+    "e2e_p90": 12.4102137539885
+  },
+  {
+    "blocks": 1536,
+    "gb": 2.415919104,
+    "apc": 0.36322752074570874,
+    "completion": 1.0,
+    "n_ok": 144,
+    "n": 144,
+    "ttft_p50": 0.46762072801357135,
+    "ttft_p90": 4.615992321021622,
+    "e2e_p50": 4.144864278001478,
+    "e2e_p90": 8.661657008022303
+  },
+  {
+    "blocks": 2304,
+    "gb": 3.623878656,
+    "apc": 0.8029661016949152,
+    "completion": 1.0,
+    "n_ok": 144,
+    "n": 144,
+    "ttft_p50": 0.4056103950133547,
+    "ttft_p90": 0.532125736004673,
+    "e2e_p50": 4.129167931008851,
+    "e2e_p90": 4.328828729019733
+  },
+  {
+    "blocks": 3072,
+    "gb": 4.831838208,
+    "apc": 0.7291666666666666,
+    "completion": 1.0,
+    "n_ok": 144,
+    "n": 144,
+    "ttft_p50": 0.4871154689753894,
+    "ttft_p90": 0.6493310299993027,
+    "e2e_p50": 4.035265229002107,
+    "e2e_p90": 4.273102787992684
+  },
+  {
+    "blocks": 4608,
+    "gb": 7.247757312,
+    "apc": 0.7291666666666666,
+    "completion": 1.0,
+    "n_ok": 144,
+    "n": 144,
+    "ttft_p50": 0.4874342739931308,
+    "ttft_p90": 0.6399849629960954,
+    "e2e_p50": 4.077990949008381,
+    "e2e_p90": 4.249602819007123
+  },
+  {
+    "blocks": 6144,
+    "gb": 9.663676416,
+    "apc": 0.7291666666666666,
+    "completion": 1.0,
+    "n_ok": 144,
+    "n": 144,
+    "ttft_p50": 0.4956600739969872,
+    "ttft_p90": 0.649673483974766,
+    "e2e_p50": 4.049805466987891,
+    "e2e_p90": 4.187004164006794
+  },
+  {
+    "blocks": 9216,
+    "gb": 14.495514624,
+    "apc": 0.7291666666666666,
+    "completion": 1.0,
+    "n_ok": 144,
+    "n": 144,
+    "ttft_p50": 0.49285231801331975,
+    "ttft_p90": 0.6484746419882867,
+    "e2e_p50": 4.013530449010432,
+    "e2e_p90": 4.254351082985522
+  }
+]
--- a/v2/exp_b_capacity_knee/run_sweep.sh
+++ b/v2/exp_b_capacity_knee/run_sweep.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Exp (b): capacity -> realized-APC -> latency knee. Runs on dash0, one H20.
+# -u: unset variables are errors; pipefail: a pipeline fails if any stage fails.
+# -e is not set, so a failing command does not abort the script by itself.
+set -uo pipefail
+# All relative paths below (PY, TRACE, OUT, v2/...) are resolved from this directory.
+cd /home/admin/cpfs/wjh/agentic-kv
+PY=.venv/bin/python
+MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
+# Every ${VAR:-default} setting below can be overridden from the environment.
+GPU=${GPU:-1}
+PORT=${PORT:-8200}
+EP=http://127.0.0.1:$PORT
+# Filtered trace (inputs <= 60k tok) so max-model-len can be 64k and the low
+# capacity points still boot; raw trace has p90=89k/max=167k single requests.
+TRACE=${TRACE:-traces/sampled_pfx_r0.004_le60k.jsonl}
+MAXLEN=${MAXLEN:-65536}
+REQLIMIT=${REQLIMIT:-600}
+INFLIGHT=${INFLIGHT:-8}
+OUT=v2/exp_b_capacity_knee/results
+mkdir -p "$OUT"
+
+# GPU KV-block counts to sweep (16 tok/block; 1 GiB ~= 683 blocks).
+# floor 4096 blk (6.4GB, holds one 64k req) -> 24000 blk (37.7GB, full instance):
+# NOTE(review): the results committed next to this script include block counts
+# that are not in this default list (768, 1024, 1536, ...) and 144 requests per
+# point, so they were presumably produced with CAPS/TRACE/REQLIMIT/INFLIGHT
+# overridden -- confirm and record the exact invocation.
+CAPS=${CAPS:-"4096 6144 8192 12288 16384 20480 24000"}
+
+VLLM_PID=""
+# Start a vLLM OpenAI API server with its GPU KV pool overridden to $1 blocks,
+# logging to $OUT/vllm_blk$1.log, and record its PID in VLLM_PID. Then run
+# common.util.wait_healthy against $EP (second argument 900); the function's
+# exit status is 0 when wait_healthy returns a truthy value and 1 otherwise.
+launch() {
+    local blocks=$1
+    local logfile="$OUT/vllm_blk$blocks.log"
+
+    CUDA_VISIBLE_DEVICES=$GPU VLLM_LOGGING_LEVEL=WARNING \
+    $PY -m vllm.entrypoints.openai.api_server \
+        --model "$MODEL" \
+        --host 0.0.0.0 --port $PORT \
+        --tensor-parallel-size 1 --trust-remote-code \
+        --enable-prefix-caching --enforce-eager --dtype auto \
+        --max-model-len $MAXLEN \
+        --num-gpu-blocks-override "$blocks" \
+        > "$logfile" 2>&1 &
+    VLLM_PID=$!
+
+    $PY -c "import sys; sys.path.insert(0,'v2'); from common.util import wait_healthy; \
+            sys.exit(0 if wait_healthy('$EP',900) else 1)"
+}
+# Stop the server started by launch(), if any, and clear VLLM_PID.
+# Sends SIGTERM, waits up to 40 s for the process to disappear, escalates to
+# SIGKILL if it is still alive, then pauses 3 s before returning.
+teardown() {
+    # No server recorded (e.g. the EXIT trap firing after a completed sweep):
+    # nothing to kill and no reason to sleep.
+    [ -n "$VLLM_PID" ] || return 0
+    kill -TERM "$VLLM_PID" 2>/dev/null
+    for _ in $(seq 1 40); do kill -0 "$VLLM_PID" 2>/dev/null || break; sleep 1; done
+    # A server that ignored SIGTERM would otherwise keep the port and GPU busy
+    # and break the next capacity point.
+    if kill -0 "$VLLM_PID" 2>/dev/null; then
+        echo "vLLM pid $VLLM_PID still alive after 40s; sending SIGKILL" >&2
+        kill -KILL "$VLLM_PID" 2>/dev/null
+    fi
+    # Reap the background job so its PID cannot linger as a zombie.
+    wait "$VLLM_PID" 2>/dev/null
+    sleep 3; VLLM_PID=""
+}
+trap teardown EXIT
+
+# Print, as one JSON line on stdout, whatever common.util.scrape_prefix_cache
+# returns for the endpoint $EP.
+scrape() {
+    $PY -c "import sys,json; sys.path.insert(0,'v2'); from common.util import scrape_prefix_cache; print(json.dumps(scrape_prefix_cache('$EP')))"
+}
+
+# One full server lifecycle per capacity point: boot, snapshot the prefix-cache
+# counters, replay the trace, snapshot again, shut down.
+for BLK in $CAPS; do
+    echo "==================== blocks=$BLK ===================="
+    # A failed boot is not fatal: show the tail of the server log, clean up, and
+    # move on to the next capacity.
+    launch "$BLK" || { echo "launch failed at $BLK (pool too small for model?)"; tail -20 "$OUT/vllm_blk$BLK.log"; teardown; continue; }
+    # Counter snapshot before the replay; analyze_and_plot.py works on m1 - m0.
+    M0=$(scrape)
+    # NOTE(review): the replayer's exit status is not checked, so m0/m1 are
+    # written even if the replay failed part-way -- confirm this is intended.
+    $PY -m replayer --trace "$TRACE" --output "$OUT/metrics_blk$BLK.jsonl" \
+        --endpoint $EP --model "$MODEL" --max-inflight-sessions $INFLIGHT --request-limit $REQLIMIT
+    # Counter snapshot after the replay, taken while the server is still up.
+    M1=$(scrape)
+    echo "$M0" > "$OUT/m0_blk$BLK.json"; echo "$M1" > "$OUT/m1_blk$BLK.json"
+    teardown
+done
+
+echo "=== exp (b) sweep DONE ==="
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1780084807.7091374, "gpu_queries": 1780084807.7091217, "ext_hits": 1780084807.7091625, "ext_queries": 1780084807.7091503}`
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1780085167.731176, "gpu_queries": 1780085167.73116, "ext_hits": 1780085167.7312036, "ext_queries": 1780085167.7311893}`
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1780085450.084966, "gpu_queries": 1780085450.0849319, "ext_hits": 1780085450.085004, "ext_queries": 1780085450.0849845}`
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1780085701.1922042, "gpu_queries": 1780085701.1921885, "ext_hits": 1780085701.1922336, "ext_queries": 1780085701.1922188}`
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1780085943.247891, "gpu_queries": 1780085943.247875, "ext_hits": 1780085943.247915, "ext_queries": 1780085943.2479026}`
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1780086191.0650043, "gpu_queries": 1780086191.06498, "ext_hits": 1780086191.0650318, "ext_queries": 1780086191.0650187}`
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1780084321.73404, "gpu_queries": 1780084321.7340264, "ext_hits": 1780084321.7340639, "ext_queries": 1780084321.7340522}`
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1780086433.7639863, "gpu_queries": 1780086433.7639701, "ext_hits": 1780086433.764013, "ext_queries": 1780086433.7640002}`
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1783032455.7091374, "gpu_queries": 1804304455.7091217, "ext_hits": 1780084807.7091625, "ext_queries": 1780084807.7091503}`
				`@@ -0,0 +1 @@`
				`{"gpu_hits": 1784993167.731176, "gpu_queries": 1793597359.73116, "ext_hits": 1780085167.7312036, "ext_queries": 1780085167.7311893}`