diff --git a/analysis/working_set/README.md b/analysis/working_set/README.md new file mode 100644 index 0000000..a781c66 --- /dev/null +++ b/analysis/working_set/README.md @@ -0,0 +1,67 @@ +# KV-cache Working-Set Sizing — GLM-5.1-FP8 · TP=8 · 1× B300 node + +工具:`scripts/working_set_analysis.py`(可配置 GPU 型号 / 并行度 TP·PP·EP / 模型 config.json / +KV dtype / 权重大小)。图:`figs/working_set/glm5_fp8_tp8_b300.png`。 + +## 复现 + +```bash +.venv/bin/python scripts/working_set_analysis.py \ + /home/gahow/phd/kvcache-simulator/bailian-traces/glm_coder_blksz_512_040915-040917.jsonl \ + --model-config /home/gahow/phd/kvcache-simulator/models/GLM-5/config.json \ + --gpu B300 --tp 8 --ep 8 --kv-dtype-bytes 1 --weight-gb 744 --min-ts 0 \ + --out figs/working_set/glm5_fp8_tp8_b300.png +``` + +## 方法 + +`hash_ids` 是全局内容寻址 block id(同内容=同 id,复用=同 id 再现)。vLLM prefix cache 是 +block 级,所以**集群级 KV footprint = 任一时刻必须常驻的 distinct block 数**,与 placement 无关 +(affinity 只搬运 block,不改总量)。三种 working set: +- `W_all` 永不淘汰(真上界) +- `W_oracle` 每 block 只在 `[首次, 末次复用]` 常驻(Belady 完美预知 → 满 APC 上界的最小 HBM) +- `W_denning(T)` 滑窗 T 内被访问的 distinct block(现实 TTL-LRU) + +KV/token:MLA → `L·(kv_lora_rank+qk_rope_head_dim)·dtype`;GQA → `2·L·kv_heads·head_dim·dtype` +(与 `kvcache-simulator/src/config.rs::kv_block_bytes` 一致)。 + +## 配置 + +| 项 | 值 | +|---|---| +| 模型 | GLM-5.1-FP8(MLA, L=78, kv_lora=512+rope=64) | +| KV/token · KV/block(512) | **43.9 KiB** · **23.0 MB**(≈ Qwen3 GQA 96 KiB 的一半) | +| 硬件 | 8× B300 (288 GB) = 2304 GB HBM/replica | +| 预算 | FP8 权重 744 GB + act 32 GB → **KV pool = 1528 GB/node** | +| trace | dash0 glm_coder,475k req,**1.25h active @ 106 QPS**,~40k tok/req(剔除 77 条负 ts 暖机) | +| APC 上界 | **80.4%** | + +## 结果 + +| 保留窗口 T | peak footprint | = 节点 (GPU) | APC@T | +|---:|---:|---:|---:| +| 2s(在飞下限)| 533 GB | 0.3 (3) | 1.7% | +| 10s | 2,068 GB | 1.4 (11) | 15% | +| 30s | 4,906 GB | 3.2 (26) | 42% | +| 60s | 7,698 GB | 5.0 (40) | 56% | +| 300s | 21,960 GB | 14.4 (115) | 74% | +| **oracle(满 80.4%)** | **21,399 GB** | 
**14.0 (112)** | 80.4% | +| retain-forever | 167,018 GB | 109 (874) | — | + +## 结论 + +1. **Serving:1 节点绰绰有余。** 在飞 KV(τ≈2-5s)仅 533–1157 GB ≪ 单节点 1528 GB。 + MLA + B300 大 HBM 让 live footprint 微不足道——跑起来根本不缺显存。 +2. **缓存全部复用(80.4%):1 节点差 ~14×。** oracle 下限 21.4 TB = 14 节点(112 GPU), + 真实 LRU ~2× → ~28 节点。单节点(1528 GB)只能 hold ~10s 窗口 → cache 侧 APC 仅 ~10-15%。 + 要 ~56% 需 5 节点,~74% 需 ~14 节点。 +3. **瓶颈在长尾,不在 live。** 把 APC 50%→80% 装进 GPU HBM 要 5→14 节点,极不经济 + → offload/migration 到 CPU DRAM(每节点 ~1.5 TB)是定量动机。与 Qwen 结论方向一致。 + +## 注意 + +- footprint 是 TTL-LRU(最浪费)+ shared-cache 下限:真实 capacity-LRU 同容量下 APC 更高, + 但分区/affinity 不均衡又抬高需求;oracle / retain-forever 给出下/上界。 +- GLM trace mean ~40k tok/req,是 Qwen trace(11k)的 ~3.5×(tokenizer + 抽取不同), + **绝对 GB 不可跨模型横比**,方法与定性结论可比。 +- EP 不改变 KV 总量(只影响 expert 权重分布),`--ep` 仅作标注。 diff --git a/figs/working_set/glm5_fp8_tp8_b300.png b/figs/working_set/glm5_fp8_tp8_b300.png new file mode 100644 index 0000000..3e15512 Binary files /dev/null and b/figs/working_set/glm5_fp8_tp8_b300.png differ diff --git a/scripts/working_set_analysis.py b/scripts/working_set_analysis.py new file mode 100644 index 0000000..a738ee3 --- /dev/null +++ b/scripts/working_set_analysis.py @@ -0,0 +1,276 @@ +"""KV-cache working-set sizing for agentic traces, across GPU / model / parallelism. + +WHAT IT COMPUTES + hash_ids in these traces are global content-addressed block ids (same content + -> same id; reuse = repeated id). vLLM prefix cache is block-level, so the + cluster-wide KV footprint at any instant = the set of distinct block ids that + must be resident. Session/instance placement only moves blocks between GPUs; + it does not change this aggregate, so the analysis is placement-independent. 
+ + Three working-set notions, swept over a retention window T: + W_all retain every block forever (true upper bound) + W_oracle keep block in [first_use, last_use] (Belady foresight floor) + W_denning(T) distinct blocks touched in (t-T, t] (realistic TTL=T LRU) + and the APC actually captured at each T (validates vs the trie ceiling). + +HARDWARE MODEL + KV pool per serving replica = + gpus_per_replica * hbm_per_gpu - model_weights - activation_reserve + (TP/EP shard weights+KV across the replica's GPUs; the *aggregate* KV pool is + what we size against, so only gpus_per_replica and total weights matter.) + + KV bytes / token: + GQA/MHA : 2 * L * kv_heads * head_dim * kv_dtype_bytes + MLA : L * (kv_lora_rank + qk_rope_head_dim) * kv_dtype_bytes + (matches kvcache-simulator/src/config.rs::kv_block_bytes) + +All sizes reported in GB = 1e9 bytes (matches the simulator's `hbm_bytes` e9 +convention). +""" +from __future__ import annotations +import argparse, json +import numpy as np + +GB = 1e9 + +# Nominal HBM per GPU, in GB (decimal). 
# Keys double as the accepted values of the --gpu command-line option.
GPU_HBM_GB = {
    "H100": 80, "H200": 141, "H20": 96, "H20-141G": 141,
    "A100-40G": 40, "A100-80G": 80,
    "B200": 192, "B300": 288, "GB200": 192,
}


# ----------------------------------------------------------------------------- model
def load_model(config_json: str) -> dict:
    """Read the attention geometry needed for KV sizing from a config.json file.

    Args:
        config_json: path to a JSON model config (HF-style keys are read).

    Returns:
        A dict with ``name``, ``L`` (layer count) and ``mla``. When the config
        has a ``kv_lora_rank`` key the model is treated as MLA and the dict
        also carries ``kv_lora_rank`` and ``qk_rope_head_dim``; otherwise it
        carries ``kv_heads`` and ``head_dim``.

    Raises:
        KeyError: a required key is absent from the config.
        ValueError: ``head_dim`` has to be derived from ``hidden_size`` but the
            config has no usable ``num_attention_heads``.
    """
    # Context manager so the handle is closed even if parsing fails.
    with open(config_json, encoding="utf-8") as fh:
        cfg = json.load(fh)

    out = {"name": cfg.get("model_type", "?"), "L": int(cfg["num_hidden_layers"])}

    if "kv_lora_rank" in cfg:  # MLA (DeepSeek / GLM-MoE-DSA)
        out["mla"] = True
        out["kv_lora_rank"] = int(cfg["kv_lora_rank"])
        out["qk_rope_head_dim"] = int(cfg["qk_rope_head_dim"])
        return out

    # GQA / MHA
    out["mla"] = False
    n_heads = int(cfg.get("num_attention_heads", 0))
    # A missing or falsy num_key_value_heads falls back to the query head count.
    out["kv_heads"] = int(cfg.get("num_key_value_heads", n_heads) or n_heads)
    head_dim = cfg.get("head_dim")
    if not head_dim:
        if n_heads <= 0:
            raise ValueError(
                f"{config_json}: cannot derive head_dim without 'head_dim' "
                "or a positive 'num_attention_heads'"
            )
        head_dim = cfg["hidden_size"] // n_heads
    out["head_dim"] = int(head_dim)
    return out


def kv_bytes_per_token(model: dict, kv_dtype_bytes: int) -> int:
    """Return the KV-cache bytes one token occupies across all layers.

    MLA models store ``kv_lora_rank + qk_rope_head_dim`` values per layer;
    other models store K and V (hence the factor 2) for every KV head.

    Args:
        model: dict in the shape produced by ``load_model``.
        kv_dtype_bytes: bytes per stored KV element.
    """
    n_layers = model["L"]
    if model["mla"]:
        per_layer = model["kv_lora_rank"] + model["qk_rope_head_dim"]
        return n_layers * per_layer * kv_dtype_bytes
    return 2 * n_layers * model["kv_heads"] * model["head_dim"] * kv_dtype_bytes


# ----------------------------------------------------------------------------- trace
def load_trace(path: str, min_ts=None, max_ts=None):
    """Load a JSONL trace into flat per-block-appearance arrays.

    Each non-blank line is parsed as a JSON object. Its ``hash_ids`` field may
    be a list or a JSON-encoded string of a list; records with no ids are
    skipped. ``timestamp`` defaults to 0.0 when absent.

    Args:
        path: JSONL trace file.
        min_ts: when not None, records with ``timestamp < min_ts`` are dropped.
        max_ts: when not None, records with ``timestamp > max_ts`` are dropped.

    Returns:
        ``(n, ids, ts)``: the number of kept requests, an int64 array with
        every block id of every kept request in file order, and a float64
        array of the same length holding the owning request's timestamp.
    """
    ids = []
    req_ts, req_len = [], []
    dropped = 0
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            blocks = rec.get("hash_ids")
            if isinstance(blocks, str):
                blocks = json.loads(blocks)
            if not blocks:
                continue
            t = float(rec.get("timestamp", 0.0))
            if (min_ts is not None and t < min_ts) or (max_ts is not None and t > max_ts):
                dropped += 1
                continue
            ids.extend(blocks)
            # Keep one timestamp per request and expand it once at the end,
            # instead of growing a Python list with one float per block.
            req_ts.append(t)
            req_len.append(len(blocks))
    if dropped:
        print(f" (clipped {dropped} reqs outside [{min_ts}, {max_ts}])")
    ts = np.repeat(
        np.asarray(req_ts, dtype=np.float64),
        np.asarray(req_len, dtype=np.int64),
    )
    return len(req_ts), np.asarray(ids, dtype=np.int64), ts


def _sweep_peak(starts, ends):
    """Peak concurrency of intervals [start, end); ends applied before starts at ties.

    Returns 0 when there are no events at all.
    """
    ev = np.concatenate([starts, ends])
    if ev.size == 0:
        return 0
    d = np.concatenate([np.ones(len(starts), np.int64), -np.ones(len(ends), np.int64)])
    # lexsort's last key is the primary one: order by time, then by delta, so
    # at equal time: -1 (end) before +1 (start).
    order = np.lexsort((d, ev))
    return int(np.cumsum(d[order]).max())


def _series(starts, ends, grid):
    """Number of intervals open at each grid point (starts <= g minus ends <= g)."""
    s = np.sort(starts)
    e = np.sort(ends)
    return np.searchsorted(s, grid, side="right") - np.searchsorted(e, grid, side="right")


def compute_working_set(ids, ts, taus):
    """Return dict with appearance stats + per-tau Denning peaks + oracle/all.

    Args:
        ids: block id of every block appearance (integer array-like).
        ts: timestamp of every block appearance, same length as ``ids``.
        taus: iterable of retention windows T, in the same unit as ``ts``.

    Returns:
        A dict with
          ``A``                  total block appearances,
          ``n_unique``           distinct block ids,
          ``n_reuse``            appearances that repeat an already-seen id,
          ``apc_ceiling``        ``n_reuse / A``,
          ``oracle_peak_blocks`` peak count of blocks inside [first, last] use,
          ``span``               ``ts.max() - ts.min()``,
          ``taus``               one row per T with ``tau``, ``peak_blocks``,
                                 ``p99_blocks``, ``p50_blocks`` (percentiles of
                                 the footprint sampled on a 400-point time
                                 grid) and ``apc`` (share of appearances whose
                                 previous use of the same id is at most T ago).

    Raises:
        ValueError: ``ids`` is empty.
    """
    ids = np.asarray(ids)
    ts = np.asarray(ts)
    A = len(ids)
    if A == 0:
        raise ValueError("compute_working_set: trace has no block appearances")

    # Sort by (id, ts): all uses of one block become adjacent, in time order.
    order = np.lexsort((ts, ids))
    ids_s, ts_s = ids[order], ts[order]

    same = ids_s[1:] == ids_s[:-1]   # entry i+1 is the same block as entry i
    gaps = ts_s[1:] - ts_s[:-1]
    same_prev = np.zeros(A, bool)
    same_prev[1:] = same
    same_next = np.zeros(A, bool)
    same_next[:-1] = same

    # Time since the previous / until the next use of the same block;
    # inf marks the first / last use of that block.
    prev_gap = np.full(A, np.inf)
    prev_gap[1:][same] = gaps[same]
    next_gap = np.full(A, np.inf)
    next_gap[:-1][same] = gaps[same]

    n_unique = int((~same_prev).sum())
    t_lo, t_hi = ts.min(), ts.max()
    grid = np.linspace(t_lo, t_hi, 400)

    # oracle [first,last]: read straight off the sorted runs. This avoids a
    # dense array of size max(id)+1, which is unusable for large hash ids and
    # invalid for negative ones.
    first = ts_s[~same_prev]
    last = ts_s[~same_next]
    oracle_peak = _sweep_peak(first, last)

    rows = []
    for T in taus:
        # A block (re-)enters the window when its previous use is older than T
        # and leaves T after a use that is not followed by another within T.
        enter = ts_s[prev_gap > T]
        exit_ = ts_s[next_gap > T] + T
        ser = _series(enter, exit_, grid)
        rows.append({
            "tau": T,
            "peak_blocks": _sweep_peak(enter, exit_),
            "p99_blocks": float(np.percentile(ser, 99)),
            "p50_blocks": float(np.percentile(ser, 50)),
            "apc": float((prev_gap <= T).sum() / A),
        })

    return {
        "A": A, "n_unique": n_unique, "n_reuse": A - n_unique,
        "apc_ceiling": (A - n_unique) / A,
        "oracle_peak_blocks": oracle_peak,
        "span": float(t_hi - t_lo),
        "taus": rows,
    }


# ----------------------------------------------------------------------------- plot
def plot(ws, hw, block_bytes, label, out_path):
    """Render the two-panel sizing figure and save it to ``out_path``.

    Panel 1 plots APC against the peak footprint for every T, with the oracle
    point and vertical markers at 1/2/4/8 replica pools. Panel 2 shows peak
    and median footprint bars for the T values 2, 30, 300 and 600 that are
    present in ``ws``.

    Args:
        ws: result dict of ``compute_working_set``.
        hw: dict with ``kv_pool_gb`` and ``gpus_per_replica``.
        block_bytes: KV bytes per block.
        label: figure title.
        out_path: image file to write.
    """
    import matplotlib
    matplotlib.use("Agg")  # file output only; no display backend required
    import matplotlib.pyplot as plt

    bgb = block_bytes / GB
    rows = ws["taus"]
    peak_gb = np.array([r["peak_blocks"] * bgb for r in rows])
    apc = np.array([r["apc"] * 100 for r in rows])
    oracle_gb = ws["oracle_peak_blocks"] * bgb
    ceil = ws["apc_ceiling"] * 100
    pool = hw["kv_pool_gb"]  # per replica

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # --- panel 1: APC vs required KV footprint ---
    ax1.plot(peak_gb, apc, "o-", color="#1f77b4", lw=2, ms=7, label="TTL-LRU W(T)")
    for r, x, y in zip(rows, peak_gb, apc):
        ax1.annotate(f"{r['tau']:g}s", (x, y), fontsize=8,
                     textcoords="offset points", xytext=(4, 5))
    ax1.scatter([oracle_gb], [ceil], marker="*", s=320, color="#d62728", zorder=5,
                label=f"oracle / ceiling ({ceil:.1f}%)")
    ax1.axhline(ceil, ls=":", color="#d62728", alpha=.5)
    for k in (1, 2, 4, 8):
        x = pool * k
        ax1.axvline(x, ls="--", color="#2ca02c", alpha=.55)
        ax1.text(x, 2, f"{k} replica\n{k*hw['gpus_per_replica']} GPU",
                 rotation=90, va="bottom", ha="right", fontsize=8, color="#2ca02c")
    ax1.set_xscale("log")
    ax1.set_xlabel("KV footprint that must be resident (GB, log)")
    ax1.set_ylabel("Achievable prefix-cache hit rate (APC %)")
    ax1.set_title("APC vs KV-pool budget")
    ax1.grid(alpha=.3, which="both")
    ax1.legend(loc="lower right")
    ax1.set_ylim(0, 100)

    # --- panel 2: footprint by retention window ---
    # ws only stores per-T summary statistics, not the time series, so this
    # panel shows peak/p50 bars (compact, robust).
    sel = [r for r in rows if r["tau"] in (2, 30, 300, 600)]
    xs = np.arange(len(sel))
    w = 0.38
    ax2.bar(xs - w/2, [r["peak_blocks"]*bgb for r in sel], w, label="peak", color="#1f77b4")
    ax2.bar(xs + w/2, [r["p50_blocks"]*bgb for r in sel], w, label="median", color="#aec7e8")
    ax2.axhline(pool, ls="--", color="#2ca02c", lw=2, label=f"1 replica KV pool ({pool:.0f} GB)")
    ax2.axhline(oracle_gb, ls=":", color="#d62728", lw=2, label=f"oracle full-ceiling ({oracle_gb:.0f} GB)")
    ax2.set_xticks(xs)
    ax2.set_xticklabels([f"T={r['tau']:g}s\nAPC={r['apc']*100:.0f}%" for r in sel])
    ax2.set_ylabel("KV footprint (GB)")
    ax2.set_yscale("log")
    ax2.set_title("Footprint by retention window vs pool")
    ax2.grid(alpha=.3, axis="y", which="both")
    ax2.legend(loc="upper left", fontsize=9)

    fig.suptitle(label, fontsize=13, fontweight="bold")
    fig.tight_layout(rect=[0, 0, 1, 0.97])
    fig.savefig(out_path, dpi=130)
    print(f" figure -> {out_path}")


# ----------------------------------------------------------------------------- main
def main():
    """CLI entry point: size the KV pool, analyse the trace, print and plot."""
    ap = argparse.ArgumentParser()
    ap.add_argument("trace")
    ap.add_argument("--model-config", required=True, help="path to HF config.json")
    ap.add_argument("--gpu", required=True, choices=sorted(GPU_HBM_GB))
    ap.add_argument("--tp", type=int, default=8)
    ap.add_argument("--pp", type=int, default=1)
    ap.add_argument("--ep", type=int, default=0, help="informational only (KV unchanged by EP)")
    ap.add_argument("--kv-dtype-bytes", type=int, default=1, help="1=FP8, 2=BF16")
    ap.add_argument("--weight-gb", type=float, required=True, help="total resident model weights, GB")
    ap.add_argument("--activation-gb", type=float, default=32.0, help="activation+ctx reserve, GB")
    ap.add_argument("--block-size", type=int, default=512)
    ap.add_argument("--min-ts", type=float, default=None, help="drop reqs with timestamp < this")
    ap.add_argument("--max-ts", type=float, default=None, help="drop reqs with timestamp > this")
    ap.add_argument("--label", default="")
    ap.add_argument("--out", default="figs/working_set.png")
    a = ap.parse_args()

    model = load_model(a.model_config)
    kv_tok = kv_bytes_per_token(model, a.kv_dtype_bytes)
    block_bytes = kv_tok * a.block_size

    gpus_per_replica = a.tp * a.pp
    total_hbm = gpus_per_replica * GPU_HBM_GB[a.gpu]
    # NOTE(review): this treats the replica's HBM as one aggregate KV pool,
    # i.e. KV is assumed to be sharded (not replicated) across TP ranks.
    # Confirm this holds for MLA models on the target serving engine.
    kv_pool_gb = total_hbm - a.weight_gb - a.activation_gb
    if kv_pool_gb <= 0:
        # Every "replicas" figure below divides by the pool size.
        ap.error(
            f"no HBM left for KV: {total_hbm:.0f} GB total - {a.weight_gb:.0f} GB weights"
            f" - {a.activation_gb:.0f} GB activations = {kv_pool_gb:.0f} GB"
        )
    hw = {"gpus_per_replica": gpus_per_replica, "kv_pool_gb": kv_pool_gb}

    taus = [1, 2, 5, 10, 30, 60, 300, 600, 1800]
    n, ids, ts = load_trace(a.trace, a.min_ts, a.max_ts)
    if ids.size == 0:
        ap.error(f"{a.trace}: no block appearances left after filtering")
    ws = compute_working_set(ids, ts, taus)
    # A trace whose requests all share one timestamp has zero span.
    qps = n / ws["span"] if ws["span"] > 0 else float("nan")

    label = a.label or f"{model['name']} {a.gpu} TP{a.tp}" + (f" EP{a.ep}" if a.ep else "")
    print("=" * 84)
    print(f" {label}")
    print("=" * 84)
    print(f" model {model['name']} L={model['L']} "
          + (f"MLA(kv_lora={model['kv_lora_rank']}+rope={model['qk_rope_head_dim']})"
             if model["mla"] else f"GQA(kv_heads={model['kv_heads']}xhd={model['head_dim']})"))
    print(f" KV / token {kv_tok:,} B ({kv_tok/1024:.1f} KiB) KV / block({a.block_size}) {block_bytes/1e6:.1f} MB")
    print(f" hardware {gpus_per_replica}x {a.gpu} ({GPU_HBM_GB[a.gpu]} GB) = {total_hbm:.0f} GB HBM/replica"
          + (f" EP={a.ep}" if a.ep else ""))
    print(f" weights {a.weight_gb:.0f} GB ({a.kv_dtype_bytes}B-KV) + act {a.activation_gb:.0f} GB"
          f" => KV pool/replica = {kv_pool_gb:.0f} GB")
    print()
    print(f" trace {n:,} reqs span {ws['span']:.0f}s ({ws['span']/3600:.2f}h) QPS~{qps:.1f}")
    print(f" block appearances {ws['A']:,} distinct {ws['n_unique']:,} APC ceiling {ws['apc_ceiling']*100:.2f}%")
    bgb = block_bytes / GB
    all_gb = ws["n_unique"] * bgb
    oracle_gb = ws["oracle_peak_blocks"] * bgb
    print(f" W_all (retain forever) {all_gb:>10,.0f} GB"
          f" = {all_gb/kv_pool_gb:6.1f} replicas ({all_gb/kv_pool_gb*gpus_per_replica:,.0f} GPU)")
    print(f" W_oracle (full ceiling) {oracle_gb:>10,.0f} GB"
          f" = {oracle_gb/kv_pool_gb:6.1f} replicas ({oracle_gb/kv_pool_gb*gpus_per_replica:,.0f} GPU)")
    print()
    print(f" {'T':>7} | {'peak GB':>9} {'p50 GB':>8} | {'replicas':>8} {'GPUs':>6} | {'APC@T':>6}")
    print(" " + "-" * 60)
    for r in ws["taus"]:
        pg = r["peak_blocks"] * bgb
        rep = pg / kv_pool_gb
        print(f" {r['tau']:>6g}s | {pg:>9,.0f} {r['p50_blocks']*bgb:>8,.0f} | "
              f"{rep:>8.1f} {rep*gpus_per_replica:>6.0f} | {r['apc']*100:>5.1f}%")
    print()
    print(f" [ref] 1 replica = {gpus_per_replica} GPU = {kv_pool_gb:.0f} GB KV pool")

    import os
    os.makedirs(os.path.dirname(a.out) or ".", exist_ok=True)
    plot(ws, hw, block_bytes, label, a.out)


if __name__ == "__main__":
    main()