# Qwen3-Coder geometry (from config.json): 48 layers, 4 KV heads, head_dim 128, bf16.
# Per token: 48 layers * 4 heads * 128 dims * 2 (K and V) * 2 bytes = 98304 bytes.
KV_BYTES_PER_TOKEN = 98304  # 96 KiB
VOCAB = 151936
# Safe token-id range: avoid low special-ish ids and the high special tokens (>=151643)
TOK_LO, TOK_HI = 1000, 151000

# Prometheus series name -> key in the dict returned by scrape_prefix_cache().
# Only the cumulative `_total` counters are listed; the matching `_created`
# series hold epoch timestamps and would dominate any sum they leaked into.
_PREFIX_CACHE_COUNTERS = {
    "vllm:prefix_cache_hits_total": "gpu_hits",
    "vllm:prefix_cache_queries_total": "gpu_queries",
    "vllm:external_prefix_cache_hits_total": "ext_hits",
    "vllm:external_prefix_cache_queries_total": "ext_queries",
}


def make_token_prompt(length: int, seed: int) -> list[int]:
    """Deterministic, content-addressed token-id prompt of exact `length`.

    Same (length, seed) -> same ids -> prefix-cache hit.
    Different seed -> fresh ids -> miss.

    Ids are drawn uniformly from [TOK_LO, TOK_HI] (both ends inclusive).
    Note that two prompts built from the same seed but different lengths
    share a common prefix, because the RNG stream is identical.
    """
    rng = random.Random(seed)
    return [rng.randint(TOK_LO, TOK_HI) for _ in range(length)]


def parse_prefix_cache_metrics(text: str) -> dict:
    """Sum the prefix-cache counters found in a Prometheus text exposition.

    Args:
        text: body of a `/metrics` response (may be empty).

    Returns:
        Dict with keys gpu_hits, gpu_queries, ext_hits, ext_queries (floats).
        Series that differ only by labels are summed; counters that do not
        appear stay at 0.0. Comment lines, blank lines and lines whose last
        space-separated field is not a number are ignored.
    """
    totals = {"gpu_hits": 0.0, "gpu_queries": 0.0, "ext_hits": 0.0, "ext_queries": 0.0}
    for line in text.splitlines():
        if not line or line.startswith("#"):
            continue
        try:
            # The sample value is the last field; label values may contain
            # spaces, hence rsplit rather than split.
            name, raw_value = line.rsplit(" ", 1)
            value = float(raw_value)
        except ValueError:
            continue
        # Strip the `{label="..."}` part before matching the series name.
        key = _PREFIX_CACHE_COUNTERS.get(name.split("{", 1)[0])
        if key is not None:
            totals[key] += value
    return totals


def scrape_prefix_cache(endpoint: str) -> dict:
    """Return cumulative prefix-cache counters from vLLM /metrics.

    Keys: gpu_hits, gpu_queries, ext_hits, ext_queries (floats, cumulative).

    Best effort: if the HTTP request itself fails, every counter is reported
    as 0.0 instead of raising. Callers that subtract two scrapes should keep
    in mind that a failed scrape is indistinguishable from "no traffic".
    """
    try:
        text = requests.get(f"{endpoint}/metrics", timeout=10).text
    except requests.RequestException:
        # Transport-level failure only; programming errors still surface.
        return parse_prefix_cache_metrics("")
    return parse_prefix_cache_metrics(text)


def measure_ttft(endpoint: str, model: str, prompt_ids: list[int],
                 max_tokens: int = 1, timeout: float = 600.0) -> dict:
    """Send one streaming /v1/completions request; return TTFT and e2e seconds.

    TTFT = time from send to the first streamed chunk that carries a choice
    (== prefill wall time). The chunk is counted even when its `text` is
    empty, so a first token that decodes to an empty string no longer makes
    TTFT fall back to the end-to-end time.

    Args:
        endpoint: server base URL, without a trailing slash.
        model: model name placed in the request payload.
        prompt_ids: prompt as token ids.
        max_tokens: number of tokens to generate.
        timeout: `requests` timeout in seconds.

    Returns:
        {"ttft_s": float, "e2e_s": float, "usage": dict | None}. `ttft_s`
        equals `e2e_s` only when the stream contained no choice at all.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
    """
    # Function-scope import: json is not among this module's top-level imports.
    import json

    url = f"{endpoint}/v1/completions"
    payload = {
        "model": model,
        "prompt": prompt_ids,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    started = time.perf_counter()
    ttft = None
    usage = None
    with requests.post(url, json=payload, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        for raw in resp.iter_lines():
            if not raw:
                continue
            line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
            if not line.startswith("data: "):
                continue
            data = line[6:]
            if data.strip() == "[DONE]":
                break
            event = json.loads(data)
            if event.get("usage"):
                usage = event["usage"]
            # The trailing usage-only event has an empty `choices` list, so
            # the first non-empty one marks the first generated token.
            if ttft is None and event.get("choices"):
                ttft = time.perf_counter() - started
    e2e = time.perf_counter() - started
    return {"ttft_s": ttft if ttft is not None else e2e, "e2e_s": e2e, "usage": usage}


def wait_healthy(endpoint: str, timeout: float = 900.0) -> bool:
    """Poll `{endpoint}/health` every 3 s until it answers 200 or time runs out.

    Returns True as soon as a 200 is seen, False once `timeout` seconds have
    elapsed. Connection errors while the server is still starting are treated
    as "not ready yet".
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(f"{endpoint}/health", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server not reachable yet; keep polling
        time.sleep(3)
    return False
LENGTHS = [1024, 2048, 4096, 8192, 16384, 32768, 65536]


def delta(a: dict, b: dict) -> dict:
    """Return ``b[k] - a[k]`` for every key of `a` (counter growth between scrapes)."""
    return {k: b[k] - a[k] for k in a}


def one_measurement(ep, model, prompt, expect):
    """Time one request and classify which cache tier served it.

    The request is bracketed by two /metrics scrapes. The tier is derived
    from the counter growth: any external (offload) hit means "cpu", else
    any GPU prefix-cache hit means "gpu", else "miss". The counters are
    server-wide, so the classification is only meaningful when no other
    traffic reaches the server during the measurement.

    Returns a row with ttft_s, e2e_s, tier_observed, expect and the two hit
    deltas.
    """
    before = scrape_prefix_cache(ep)
    res = measure_ttft(ep, model, prompt)
    after = scrape_prefix_cache(ep)
    d = delta(before, after)
    # Deltas are floats; 0.5 is simply a "strictly more than zero" threshold.
    if d["ext_hits"] > 0.5:
        tier = "cpu"
    elif d["gpu_hits"] > 0.5:
        tier = "gpu"
    else:
        tier = "miss"
    return {"ttft_s": res["ttft_s"], "e2e_s": res["e2e_s"],
            "tier_observed": tier, "expect": expect,
            "d_gpu_hits": d["gpu_hits"], "d_ext_hits": d["ext_hits"]}


def run_miss(ep, model, L, reps, base):
    """Measure `reps` cold requests of length `L`; each uses a fresh seed."""
    rows = []
    for i in range(reps):
        p = make_token_prompt(L, seed=base + i)  # fresh each time
        rows.append(one_measurement(ep, model, p, "miss"))
    return rows


def run_gpu(ep, model, L, reps, base):
    """Measure `reps` GPU-tier hits: send each prompt once to warm, then time it."""
    rows = []
    for i in range(reps):
        p = make_token_prompt(L, seed=base + i)
        measure_ttft(ep, model, p)  # warm
        rows.append(one_measurement(ep, model, p, "gpu"))  # hit
    return rows


def run_cpu(ep, model, L, reps, base, flood_tokens, flood_chunk):
    """Measure `reps` CPU-tier hits: warm, flood the GPU pool, then re-request.

    After warming a prompt, at least `flood_tokens` tokens of distinct content
    are sent in prompts of `flood_chunk` tokens, with the aim of evicting the
    warmed prompt from the GPU pool so that the timed re-request is served
    from the offload tier.

    Raises:
        ValueError: if flooding is requested with a non-positive
            `flood_chunk`, which would otherwise loop forever.
    """
    if flood_tokens > 0 and flood_chunk <= 0:
        raise ValueError("flood_chunk must be positive when flood_tokens > 0")
    rows = []
    for i in range(reps):
        p = make_token_prompt(L, seed=base + i)
        measure_ttft(ep, model, p)  # warm -> GPU (+offload)
        # flood with distinct content to evict p from the GPU pool to CPU tier
        sent = 0
        # Seed space far above every prompt seed so flood content never
        # coincides with a measured prompt.
        fseed = 10_000_000 + (base + i) * 1000
        while sent < flood_tokens:
            fp = make_token_prompt(flood_chunk, seed=fseed)
            measure_ttft(ep, model, fp)
            fseed += 1
            sent += flood_chunk
        rows.append(one_measurement(ep, model, p, "cpu"))  # should hit CPU tier
    return rows


def summarize(rows):
    """Aggregate measurement rows into TTFT statistics.

    Returns n, ttft_p50/mean/min/max, the most common observed tier and
    `verified_frac` (share of rows whose observed tier equals the expected
    one). For an empty `rows` the statistics are None and the fraction is 0.
    """
    t = sorted(r["ttft_s"] for r in rows)
    return {
        "n": len(rows),
        "ttft_p50": statistics.median(t) if t else None,
        "ttft_mean": statistics.fmean(t) if t else None,
        "ttft_min": t[0] if t else None,
        "ttft_max": t[-1] if t else None,
        "tier_observed": _modal([r["tier_observed"] for r in rows]),
        "verified_frac": (sum(r["tier_observed"] == r["expect"] for r in rows) / len(rows)
                          if rows else 0),
    }


def _modal(xs):
    """Return the most frequent element of `xs`, or None when `xs` is empty."""
    from collections import Counter
    return Counter(xs).most_common(1)[0][0] if xs else None


def _parse_lengths(spec):
    """Parse a comma-separated list of positive ints, ignoring blank items."""
    lengths = [int(item) for item in spec.split(",") if item.strip()]
    if not lengths:
        raise ValueError("no lengths given")
    if any(n <= 0 for n in lengths):
        raise ValueError("lengths must be positive")
    return lengths


def main():
    """CLI entry point: run one mode over all lengths and write a JSON report.

    The report is rewritten after every length, so a run that is interrupted
    part-way still leaves the completed lengths on disk.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--endpoint", required=True)
    ap.add_argument("--model", required=True)
    ap.add_argument("--mode", required=True, choices=["miss", "gpu", "cpu"])
    ap.add_argument("--reps", type=int, default=8)
    ap.add_argument("--out", required=True)
    ap.add_argument("--lengths", type=str, default=None,
                    help="comma list override, e.g. 1024,4096")
    ap.add_argument("--flood-tokens", type=int, default=120000,
                    help="cpu mode: distinct tokens to flush GPU pool")
    ap.add_argument("--flood-chunk", type=int, default=8192)
    args = ap.parse_args()

    # With zero reps the per-length summary has no p50 and the progress line
    # below could not be formatted.
    if args.reps < 1:
        ap.error("--reps must be >= 1")
    if args.mode == "cpu" and args.flood_tokens > 0 and args.flood_chunk < 1:
        ap.error("--flood-chunk must be >= 1")
    if args.lengths:
        try:
            lengths = _parse_lengths(args.lengths)
        except ValueError as exc:
            ap.error(f"--lengths: {exc}")
    else:
        lengths = LENGTHS

    out = {"mode": args.mode, "reps": args.reps, "by_length": {}, "raw": {}}
    # Disjoint seed ranges per mode so prompts of different modes never collide.
    base = {"miss": 1_000, "gpu": 2_000, "cpu": 3_000}[args.mode]

    for L in lengths:
        t0 = time.monotonic()
        if args.mode == "miss":
            rows = run_miss(args.endpoint, args.model, L, args.reps, base)
        elif args.mode == "gpu":
            rows = run_gpu(args.endpoint, args.model, L, args.reps, base)
        else:
            rows = run_cpu(args.endpoint, args.model, L, args.reps, base,
                           args.flood_tokens, args.flood_chunk)
        # Fresh seed range per length: prompts built from the same seed share
        # a prefix across lengths, which would turn into unintended hits.
        base += 100_000
        s = summarize(rows)
        out["by_length"][str(L)] = s
        out["raw"][str(L)] = rows
        print(f"[{args.mode}] L={L:>6} ttft_p50={s['ttft_p50']:.4f}s "
              f"tier={s['tier_observed']} verified={s['verified_frac']:.0%} "
              f"({time.monotonic()-t0:.0f}s)", flush=True)
        Path(args.out).write_text(json.dumps(out, indent=2))

    print(f"wrote {args.out}")


if __name__ == "__main__":
    main()
# Qwen3-Coder, bf16: 48 layers * 4 KV heads * 128 head_dim * 2 (K and V) * 2 bytes.
KV_BYTES_PER_TOKEN = 98304
LENGTHS = [1024, 2048, 4096, 8192, 16384, 32768, 65536]


def time_h2d(nbytes: int, reps: int) -> float:
    """Return the median wall time (seconds) of a pinned-host -> GPU copy.

    Args:
        nbytes: size of the transfer in bytes.
        reps: number of timed copies; must be at least 1.

    Returns:
        Median of the `reps` per-copy times. Each time covers issuing the
        copy and waiting for the device to finish it.

    Raises:
        ValueError: if `reps` is smaller than 1.
    """
    import statistics  # function-scope: not among this module's top-level imports

    if reps < 1:
        raise ValueError("reps must be >= 1")
    # Byte-typed buffers move exactly `nbytes`, whatever dtype the KV cache uses.
    host = torch.empty(nbytes, dtype=torch.uint8, pin_memory=True)
    dev = torch.empty(nbytes, dtype=torch.uint8, device="cuda")
    # warmup
    for _ in range(3):
        dev.copy_(host, non_blocking=True)
    torch.cuda.synchronize()
    samples = []
    for _ in range(reps):
        t0 = time.perf_counter()
        dev.copy_(host, non_blocking=True)
        # The copy is asynchronous; synchronize so the timer sees its completion.
        torch.cuda.synchronize()
        samples.append(time.perf_counter() - t0)
    return statistics.median(samples)


def main():
    """CLI entry point: time the transfer for every length and write a JSON report."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--reps", type=int, default=20)
    ap.add_argument("--out", required=True)
    args = ap.parse_args()
    if args.reps < 1:
        ap.error("--reps must be >= 1")
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not torch.cuda.is_available():
        raise SystemExit("need a GPU")
    device_name = torch.cuda.get_device_name(0)
    print("device:", device_name)

    out = {"device": device_name, "by_length": {}}
    for L in LENGTHS:
        nbytes = L * KV_BYTES_PER_TOKEN
        sec = time_h2d(nbytes, args.reps)
        bw = nbytes / sec / 1e9
        out["by_length"][str(L)] = {
            "kv_bytes": nbytes, "transfer_s": sec, "bw_GBps": bw,
        }
        print(f"L={L:>6} KV={nbytes/1e9:6.3f}GB t={sec*1000:7.2f}ms bw={bw:6.1f} GB/s", flush=True)
    with open(args.out, "w") as fh:
        json.dump(out, fh, indent=2)
    print("wrote", args.out)


if __name__ == "__main__":
    main()
"""Plot exp (a): TTFT vs prefix length for miss / gpu-hit / cpu-hit (+ PCIe floor)."""
import json
import sys
from pathlib import Path

import matplotlib
matplotlib.use("Agg")  # file-only backend; must be selected before importing pyplot
import matplotlib.pyplot as plt

# argv[1]: directory holding miss.json / gpu.json / cpu.json / pcie.json
# argv[2]: path of the figure to write
R = Path(sys.argv[1] if len(sys.argv) > 1 else "v2/exp_a_tier_latency/results")
FIG = Path(sys.argv[2] if len(sys.argv) > 2 else "v2/figs/exp_a_tier_latency.png")
KV_BYTES_PER_TOKEN = 98304


def load(name):
    """Return the parsed JSON file `name` from the results dir, or None if absent."""
    p = R / name
    if not p.exists():
        return None
    with open(p) as fh:  # close the handle instead of leaking it
        return json.load(fh)


miss, gpu, cpu, pcie = load("miss.json"), load("gpu.json"), load("cpu.json"), load("pcie.json")


def series(d):
    """Return (lengths, p50 TTFTs) of a driver report, sorted by length."""
    if not d:
        return [], []
    items = sorted(((int(k), v["ttft_p50"]) for k, v in d["by_length"].items()),
                   key=lambda x: x[0])
    return [a for a, _ in items], [b for _, b in items]


def _fmt(x):
    """Format a TTFT cell of the table; a dash stands for a missing value."""
    return f"{x:.4f}" if x is not None else " - "


def _p50(d, L):
    """Return the p50 TTFT stored in report `d` for length `L`, or None."""
    return d["by_length"].get(str(L), {}).get("ttft_p50") if d else None


fig, ax = plt.subplots(figsize=(7.2, 5.0))
for d, lab, mk, c in [(miss, "miss (recompute)", "o", "#d62728"),
                      (cpu, "CPU-tier hit (DRAM offload)", "s", "#ff7f0e"),
                      (gpu, "GPU-tier hit (HBM APC)", "^", "#2ca02c")]:
    xs, ys = series(d)
    if xs:
        ax.plot(xs, ys, marker=mk, label=lab, color=c, linewidth=2, markersize=7)

if pcie:
    items = sorted(((int(k), v["transfer_s"]) for k, v in pcie["by_length"].items()))
    xs = [a for a, _ in items]
    ys = [b for _, b in items]
    ax.plot(xs, ys, "--", color="#7f7f7f", linewidth=1.4,
            label="CPU-hit transfer floor (PCIe H2D)")

ax.set_xscale("log", base=2)
ax.set_yscale("log")
ax.set_xlabel("Reused prefix length (tokens)")
ax.set_ylabel("TTFT (s, log)")
ax.set_title("Cost of serving a reused prefix from each KV tier\nQwen3-Coder-30B-A3B, 1xH20")
ax.grid(True, which="both", alpha=0.3)
ax.legend()
FIG.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(FIG, dpi=140)
print("wrote", FIG)

# Table
print(f"\n{'L':>7} {'miss(s)':>10} {'cpu(s)':>10} {'gpu(s)':>10} {'miss/cpu':>9} {'cpu/gpu':>9}")
allL = sorted({int(k) for d in (miss, gpu, cpu) if d for k in d["by_length"]})
for L in allL:
    m = _p50(miss, L)
    c = _p50(cpu, L)
    g = _p50(gpu, L)
    # Ratios are shown only when both operands are present and non-zero.
    r1 = f"{m/c:.1f}x" if (m and c) else " -"
    r2 = f"{c/g:.1f}x" if (c and g) else " -"
    print(f"{L:>7} {_fmt(m):>10} {_fmt(c):>10} {_fmt(g):>10} {r1:>9} {r2:>9}")

if cpu:
    vf = {k: v.get("verified_frac") for k, v in cpu["by_length"].items()}
    print("\nCPU-tier verified fraction (ext_hits>0):", vf)
0.10476256850233767, + "ttft_mean": 0.10426848525821697, + "ttft_min": 0.10178845902555622, + "ttft_max": 0.10576034500263631, + "tier_observed": "cpu", + "verified_frac": 1.0 + }, + "32768": { + "n": 4, + "ttft_p50": 0.15755228500347584, + "ttft_mean": 0.15822456649766536, + "ttft_min": 0.15535229499801062, + "ttft_max": 0.16244140098569915, + "tier_observed": "cpu", + "verified_frac": 1.0 + }, + "65536": { + "n": 4, + "ttft_p50": 0.27175235451431945, + "ttft_mean": 0.2716009145078715, + "ttft_min": 0.26732781299506314, + "ttft_max": 0.27557113600778393, + "tier_observed": "cpu", + "verified_frac": 1.0 + } + }, + "raw": { + "1024": [ + { + "ttft_s": 0.18232788098976016, + "e2e_s": 0.182449893996818, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 1024.0 + }, + { + "ttft_s": 0.05692533400724642, + "e2e_s": 0.057079910999163985, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 1024.0 + }, + { + "ttft_s": 0.05785340498550795, + "e2e_s": 0.05796545499470085, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 1024.0 + }, + { + "ttft_s": 0.055113587994128466, + "e2e_s": 0.055113587994128466, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 1024.0 + } + ], + "2048": [ + { + "ttft_s": 0.05582832600339316, + "e2e_s": 0.055943820014363155, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 2048.0 + }, + { + "ttft_s": 0.057600113010266796, + "e2e_s": 0.05772249499568716, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 2048.0 + }, + { + "ttft_s": 0.05600581501494162, + "e2e_s": 0.05611848901025951, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 2048.0 + }, + { + "ttft_s": 0.05810087500140071, + "e2e_s": 0.0582130889815744, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 2048.0 + } + ], + "4096": [ + { + "ttft_s": 
0.0655005699954927, + "e2e_s": 0.06560429997625761, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 4096.0 + }, + { + "ttft_s": 0.063741421996383, + "e2e_s": 0.06384500698186457, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 4096.0 + }, + { + "ttft_s": 0.06445824800175615, + "e2e_s": 0.06458494698745199, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 4096.0 + }, + { + "ttft_s": 0.06389946999843232, + "e2e_s": 0.06403137900633737, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 4096.0 + } + ], + "8192": [ + { + "ttft_s": 0.0759067680046428, + "e2e_s": 0.0759067680046428, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 8192.0 + }, + { + "ttft_s": 0.07678147399565205, + "e2e_s": 0.07678147399565205, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 8192.0 + }, + { + "ttft_s": 0.07633477600757033, + "e2e_s": 0.0764636590029113, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 8192.0 + }, + { + "ttft_s": 0.07499952000216581, + "e2e_s": 0.07499952000216581, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 8192.0 + } + ], + "16384": [ + { + "ttft_s": 0.10444335200008936, + "e2e_s": 0.104556644015247, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 16384.0 + }, + { + "ttft_s": 0.10178845902555622, + "e2e_s": 0.10188649001065642, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 16384.0 + }, + { + "ttft_s": 0.10508178500458598, + "e2e_s": 0.10518974298611283, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 16384.0 + }, + { + "ttft_s": 0.10576034500263631, + "e2e_s": 0.10587632199167274, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 16384.0 + } + ], + "32768": [ + { + 
"ttft_s": 0.15535229499801062, + "e2e_s": 0.155460513982689, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 32768.0 + }, + { + "ttft_s": 0.15777600501314737, + "e2e_s": 0.15777600501314737, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 32768.0 + }, + { + "ttft_s": 0.16244140098569915, + "e2e_s": 0.16244140098569915, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 32768.0 + }, + { + "ttft_s": 0.15732856499380432, + "e2e_s": 0.1574467390018981, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 32768.0 + } + ], + "65536": [ + { + "ttft_s": 0.27309533301740885, + "e2e_s": 0.273235183005454, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 65536.0 + }, + { + "ttft_s": 0.27040937601123005, + "e2e_s": 0.27040937601123005, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 65536.0 + }, + { + "ttft_s": 0.27557113600778393, + "e2e_s": 0.27557113600778393, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 65536.0 + }, + { + "ttft_s": 0.26732781299506314, + "e2e_s": 0.2674778919899836, + "tier_observed": "cpu", + "expect": "cpu", + "d_gpu_hits": 0.0, + "d_ext_hits": 65536.0 + } + ] + } +} \ No newline at end of file diff --git a/v2/exp_a_tier_latency/results/gpu.json b/v2/exp_a_tier_latency/results/gpu.json new file mode 100644 index 0000000..1f4f2f6 --- /dev/null +++ b/v2/exp_a_tier_latency/results/gpu.json @@ -0,0 +1,533 @@ +{ + "mode": "gpu", + "reps": 8, + "by_length": { + "1024": { + "n": 8, + "ttft_p50": 0.04180275100225117, + "ttft_mean": 0.05689269150025211, + "ttft_min": 0.041313502995762974, + "ttft_max": 0.1606091230059974, + "tier_observed": "gpu", + "verified_frac": 1.0 + }, + "2048": { + "n": 8, + "ttft_p50": 0.044922845510882325, + "ttft_mean": 0.04646045462868642, + "ttft_min": 0.04261300901998766, + "ttft_max": 0.06082483098725788, 
+ "tier_observed": "gpu", + "verified_frac": 1.0 + }, + "4096": { + "n": 8, + "ttft_p50": 0.0462174300046172, + "ttft_mean": 0.04691218675361597, + "ttft_min": 0.044408742018276826, + "ttft_max": 0.05101387499598786, + "tier_observed": "gpu", + "verified_frac": 1.0 + }, + "8192": { + "n": 8, + "ttft_p50": 0.052487702007056214, + "ttft_mean": 0.05252782000388834, + "ttft_min": 0.050384567002765834, + "ttft_max": 0.055209266021847725, + "tier_observed": "gpu", + "verified_frac": 1.0 + }, + "16384": { + "n": 8, + "ttft_p50": 0.06340778700541705, + "ttft_mean": 0.06307360512437299, + "ttft_min": 0.059953891002805904, + "ttft_max": 0.06587072199909016, + "tier_observed": "gpu", + "verified_frac": 1.0 + }, + "32768": { + "n": 8, + "ttft_p50": 0.07986902150150854, + "ttft_mean": 0.08412684850554797, + "ttft_min": 0.07615292401169427, + "ttft_max": 0.11761908099288121, + "tier_observed": "gpu", + "verified_frac": 1.0 + }, + "65536": { + "n": 8, + "ttft_p50": 0.11140661900572013, + "ttft_mean": 0.10751268000240088, + "ttft_min": 0.07390080401091836, + "ttft_max": 0.1206158839922864, + "tier_observed": "gpu", + "verified_frac": 1.0 + } + }, + "raw": { + "1024": [ + { + "ttft_s": 0.1606091230059974, + "e2e_s": 0.16078226300305687, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 1008.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.043477901024743915, + "e2e_s": 0.0436010490229819, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 1008.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04137404798530042, + "e2e_s": 0.04146770399529487, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 1008.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04179324599681422, + "e2e_s": 0.041887808009050786, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 1008.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04326947100344114, + "e2e_s": 0.04335355598595925, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 1008.0, + "d_ext_hits": 0.0 + }, + { + 
"ttft_s": 0.04149198398226872, + "e2e_s": 0.04157822398701683, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 1008.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04181225600768812, + "e2e_s": 0.04190706100780517, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 1008.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.041313502995762974, + "e2e_s": 0.041313502995762974, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 1008.0, + "d_ext_hits": 0.0 + } + ], + "2048": [ + { + "ttft_s": 0.04491939002764411, + "e2e_s": 0.045019031007541344, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 2032.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.045014784001978114, + "e2e_s": 0.04511277299025096, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 2032.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04492630099412054, + "e2e_s": 0.04502850098651834, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 2032.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04566141500254162, + "e2e_s": 0.04576313399593346, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 2032.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04261300901998766, + "e2e_s": 0.04271370900096372, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 2032.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.06082483098725788, + "e2e_s": 0.06096197199076414, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 2032.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04355804901570082, + "e2e_s": 0.04355804901570082, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 2032.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.044165857980260625, + "e2e_s": 0.044268568977713585, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 2032.0, + "d_ext_hits": 0.0 + } + ], + "4096": [ + { + "ttft_s": 0.05101387499598786, + "e2e_s": 0.051123478973750025, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 4080.0, + "d_ext_hits": 0.0 + }, + { + 
"ttft_s": 0.044408742018276826, + "e2e_s": 0.044408742018276826, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 4080.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04538871700060554, + "e2e_s": 0.045498208986828104, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 4080.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04754545699688606, + "e2e_s": 0.047664124984294176, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 4080.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04840670898556709, + "e2e_s": 0.04840670898556709, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 4080.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.0462190090038348, + "e2e_s": 0.04632823000429198, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 4080.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.04609913402236998, + "e2e_s": 0.046204126003431156, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 4080.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.0462158510053996, + "e2e_s": 0.0462158510053996, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 4080.0, + "d_ext_hits": 0.0 + } + ], + "8192": [ + { + "ttft_s": 0.05042222701013088, + "e2e_s": 0.05053543800022453, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 8176.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.05319672200130299, + "e2e_s": 0.053308423986891285, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 8176.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.05063424099353142, + "e2e_s": 0.05073276098119095, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 8176.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.050384567002765834, + "e2e_s": 0.05048462699051015, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 8176.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.055105848994571716, + "e2e_s": 0.055215683998540044, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 8176.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 
0.05349100599414669, + "e2e_s": 0.053595816978486255, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 8176.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.05177868201280944, + "e2e_s": 0.05188246400211938, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 8176.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.055209266021847725, + "e2e_s": 0.05531894601881504, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 8176.0, + "d_ext_hits": 0.0 + } + ], + "16384": [ + { + "ttft_s": 0.0633803239907138, + "e2e_s": 0.06349112599855289, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 16368.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.06337857199832797, + "e2e_s": 0.06350608498905785, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 16368.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.06098292299429886, + "e2e_s": 0.061115075019188225, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 16368.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.06343525002012029, + "e2e_s": 0.06355450401315466, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 16368.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.0636955969966948, + "e2e_s": 0.0636955969966948, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 16368.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.06389156199293211, + "e2e_s": 0.06389156199293211, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 16368.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.06587072199909016, + "e2e_s": 0.06587072199909016, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 16368.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.059953891002805904, + "e2e_s": 0.060058912000386044, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 16368.0, + "d_ext_hits": 0.0 + } + ], + "32768": [ + { + "ttft_s": 0.07615292401169427, + "e2e_s": 0.07625289200223051, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 32752.0, + "d_ext_hits": 0.0 + }, + { + 
"ttft_s": 0.07992992899380624, + "e2e_s": 0.0800386439950671, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 32752.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.07980811400921084, + "e2e_s": 0.07995001602103002, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 32752.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.11761908099288121, + "e2e_s": 0.11776423300034367, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 32752.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.07834753501811065, + "e2e_s": 0.07834753501811065, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 32752.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.0814115820103325, + "e2e_s": 0.0814115820103325, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 32752.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.08212830501724966, + "e2e_s": 0.08224253499065526, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 32752.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.07761731799109839, + "e2e_s": 0.07772363899857737, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 32752.0, + "d_ext_hits": 0.0 + } + ], + "65536": [ + { + "ttft_s": 0.1206158839922864, + "e2e_s": 0.1206158839922864, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 65520.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.10727833199780434, + "e2e_s": 0.10727833199780434, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 65520.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.11010084400186315, + "e2e_s": 0.11023741000099108, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 65520.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.11566799599677324, + "e2e_s": 0.1157765949901659, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 65520.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.11422122400836088, + "e2e_s": 0.11422122400836088, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 65520.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 
0.07390080401091836, + "e2e_s": 0.07390080401091836, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 65520.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.11271239400957711, + "e2e_s": 0.11271239400957711, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 65520.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.10560396200162359, + "e2e_s": 0.10572021701955236, + "tier_observed": "gpu", + "expect": "gpu", + "d_gpu_hits": 65520.0, + "d_ext_hits": 0.0 + } + ] + } +} \ No newline at end of file diff --git a/v2/exp_a_tier_latency/results/miss.json b/v2/exp_a_tier_latency/results/miss.json new file mode 100644 index 0000000..b4f8c38 --- /dev/null +++ b/v2/exp_a_tier_latency/results/miss.json @@ -0,0 +1,533 @@ +{ + "mode": "miss", + "reps": 8, + "by_length": { + "1024": { + "n": 8, + "ttft_p50": 0.07843716802017298, + "ttft_mean": 0.1441972145003092, + "ttft_min": 0.0768489159818273, + "ttft_max": 0.6051040079910308, + "tier_observed": "miss", + "verified_frac": 1.0 + }, + "2048": { + "n": 8, + "ttft_p50": 0.13254689799214248, + "ttft_mean": 0.14012471562818973, + "ttft_min": 0.13148935200297274, + "ttft_max": 0.19460841300315224, + "tier_observed": "miss", + "verified_frac": 1.0 + }, + "4096": { + "n": 8, + "ttft_p50": 0.2613626064994605, + "ttft_mean": 0.2620435417484259, + "ttft_min": 0.26085699602845125, + "ttft_max": 0.2652779980271589, + "tier_observed": "miss", + "verified_frac": 1.0 + }, + "8192": { + "n": 8, + "ttft_p50": 0.588181210012408, + "ttft_mean": 0.5881437246280257, + "ttft_min": 0.5867919930024073, + "ttft_max": 0.5897468629991636, + "tier_observed": "miss", + "verified_frac": 1.0 + }, + "16384": { + "n": 8, + "ttft_p50": 1.5470821364870062, + "ttft_mean": 1.5479571051182575, + "ttft_min": 1.5448924789961893, + "ttft_max": 1.5552692519850098, + "tier_observed": "miss", + "verified_frac": 1.0 + }, + "32768": { + "n": 8, + "ttft_p50": 4.603862981006387, + "ttft_mean": 4.601982127005613, + "ttft_min": 4.594119774992578, + "ttft_max": 
4.608500114001799, + "tier_observed": "miss", + "verified_frac": 1.0 + }, + "65536": { + "n": 8, + "ttft_p50": 15.230140178493457, + "ttft_mean": 15.229316346121777, + "ttft_min": 15.177009812992765, + "ttft_max": 15.28301460199873, + "tier_observed": "miss", + "verified_frac": 1.0 + } + }, + "raw": { + "1024": [ + { + "ttft_s": 0.6051040079910308, + "e2e_s": 0.6052766389911994, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.08014080999419093, + "e2e_s": 0.08025239198468626, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.07735737299663015, + "e2e_s": 0.07746260898420587, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.0768489159818273, + "e2e_s": 0.0768489159818273, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.07845149302738719, + "e2e_s": 0.07860825702664442, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.07842284301295877, + "e2e_s": 0.07842284301295877, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.07716230300138704, + "e2e_s": 0.07729722000658512, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.08008996999706142, + "e2e_s": 0.08020236299489625, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + } + ], + "2048": [ + { + "ttft_s": 0.19460841300315224, + "e2e_s": 0.19472959099221043, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.13326648401562124, + "e2e_s": 0.13339112501125783, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.1324451889959164, + "e2e_s": 
0.13255105601274408, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.13292257199645974, + "e2e_s": 0.1330343289882876, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.13170140300644562, + "e2e_s": 0.13180866098264232, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.13264860698836856, + "e2e_s": 0.13274579899734817, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.13148935200297274, + "e2e_s": 0.13160509002045728, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.13191570501658134, + "e2e_s": 0.1320405500009656, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + } + ], + "4096": [ + { + "ttft_s": 0.2652779980271589, + "e2e_s": 0.2653735490166582, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.26092263497412205, + "e2e_s": 0.2610465929901693, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.26085699602845125, + "e2e_s": 0.2609657910070382, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.2613706949923653, + "e2e_s": 0.2613706949923653, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.26135451800655574, + "e2e_s": 0.2614478030009195, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.2610389889741782, + "e2e_s": 0.2611535779724363, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.26337735800188966, + "e2e_s": 0.2634820609819144, + "tier_observed": "miss", + 
"expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.26214914498268627, + "e2e_s": 0.26226521999342367, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + } + ], + "8192": [ + { + "ttft_s": 0.5897468629991636, + "e2e_s": 0.5898835949774366, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.5884731799887959, + "e2e_s": 0.5884731799887959, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.588430589006748, + "e2e_s": 0.588430589006748, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.5895502530038357, + "e2e_s": 0.5896741840115283, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.5879318310180679, + "e2e_s": 0.5879318310180679, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.5870601140195504, + "e2e_s": 0.5871822330227587, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.5871649739856366, + "e2e_s": 0.5871649739856366, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 0.5867919930024073, + "e2e_s": 0.5869127069890965, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + } + ], + "16384": [ + { + "ttft_s": 1.5552692519850098, + "e2e_s": 1.5552692519850098, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 1.5471642419870477, + "e2e_s": 1.547288056986872, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 1.5476988689915743, + "e2e_s": 1.5476988689915743, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + 
"d_ext_hits": 0.0 + }, + { + "ttft_s": 1.5459686409740243, + "e2e_s": 1.5460858139849734, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 1.548691272008, + "e2e_s": 1.548691272008, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 1.5470000309869647, + "e2e_s": 1.5471212370030116, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 1.5448924789961893, + "e2e_s": 1.5448924789961893, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 1.5469720550172497, + "e2e_s": 1.5470902820234187, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + } + ], + "32768": [ + { + "ttft_s": 4.603710585011868, + "e2e_s": 4.603829422005219, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 4.608500114001799, + "e2e_s": 4.608500114001799, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 4.604015377000906, + "e2e_s": 4.604134508001152, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 4.606255694001447, + "e2e_s": 4.606392626999877, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 4.598494538018713, + "e2e_s": 4.598494538018713, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 4.594119774992578, + "e2e_s": 4.59425267498591, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 4.594318103016121, + "e2e_s": 4.594462227018084, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 4.60644283000147, + "e2e_s": 4.60644283000147, 
+ "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + } + ], + "65536": [ + { + "ttft_s": 15.276076515991008, + "e2e_s": 15.276189939002506, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 15.275313204998383, + "e2e_s": 15.275428983004531, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 15.274311708984897, + "e2e_s": 15.274442903988529, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 15.28301460199873, + "e2e_s": 15.283127713017166, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 15.185968648002017, + "e2e_s": 15.185968648002017, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 15.177009812992765, + "e2e_s": 15.177117884973995, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 15.181360610004049, + "e2e_s": 15.181482268002583, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + }, + { + "ttft_s": 15.18147566600237, + "e2e_s": 15.18147566600237, + "tier_observed": "miss", + "expect": "miss", + "d_gpu_hits": 0.0, + "d_ext_hits": 0.0 + } + ] + } +} \ No newline at end of file diff --git a/v2/exp_a_tier_latency/results/pcie.json b/v2/exp_a_tier_latency/results/pcie.json new file mode 100644 index 0000000..12d38cb --- /dev/null +++ b/v2/exp_a_tier_latency/results/pcie.json @@ -0,0 +1,40 @@ +{ + "device": "NVIDIA H20", + "by_length": { + "1024": { + "kv_bytes": 100663296, + "transfer_s": 0.001876260997960344, + "bw_GBps": 53.65100916633112 + }, + "2048": { + "kv_bytes": 201326592, + "transfer_s": 0.003709116979734972, + "bw_GBps": 54.27884671741612 + }, + "4096": { + "kv_bytes": 402653184, + "transfer_s": 0.007338636991335079, + "bw_GBps": 
54.86757070494469 + }, + "8192": { + "kv_bytes": 805306368, + "transfer_s": 0.01476299500791356, + "bw_GBps": 54.548983290201164 + }, + "16384": { + "kv_bytes": 1610612736, + "transfer_s": 0.02972855800180696, + "bw_GBps": 54.17729093695375 + }, + "32768": { + "kv_bytes": 3221225472, + "transfer_s": 0.059267577016726136, + "bw_GBps": 54.35055107940257 + }, + "65536": { + "kv_bytes": 6442450944, + "transfer_s": 0.11847134301206097, + "bw_GBps": 54.37982536708583 + } + } +} \ No newline at end of file
diff --git a/v2/exp_a_tier_latency/run.sh b/v2/exp_a_tier_latency/run.sh
new file mode 100644
index 0000000..a849f5c
--- /dev/null
+++ b/v2/exp_a_tier_latency/run.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Exp (a): three-tier hit-latency. Runs on dash0. One H20 (GPU $GPU).
+set -uo pipefail
+cd /home/admin/cpfs/wjh/agentic-kv
+PY=.venv/bin/python
+MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
+GPU=${GPU:-0}
+PORT=${PORT:-8100}
+EP=http://127.0.0.1:$PORT
+OUT=v2/exp_a_tier_latency/results
+mkdir -p "$OUT"
+
+VLLM_PID=""
+launch() {  # $1 = extra vLLM args (left unquoted so they word-split), $2 = logfile; sets VLLM_PID, returns 1 if never healthy
+  echo ">>> launch vllm: $1"
+  CUDA_VISIBLE_DEVICES=$GPU VLLM_LOGGING_LEVEL=WARNING \
+  $PY -m vllm.entrypoints.openai.api_server --model "$MODEL" \
+    --host 0.0.0.0 --port $PORT --tensor-parallel-size 1 --trust-remote-code \
+    --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000 \
+    $1 > "$2" 2>&1 &
+  VLLM_PID=$!
+  echo " pid=$VLLM_PID waiting for health..."
+  $PY -c "import sys; sys.path.insert(0,'v2'); from common.util import wait_healthy; \
+    sys.exit(0 if wait_healthy('$EP',900) else 1)" || { echo "LAUNCH FAILED"; tail -30 "$2"; return 1; }
+  echo " healthy."
+}
+teardown() {  # stop the current server: SIGTERM, up to 40 s grace, then SIGKILL; also the EXIT trap
+  [ -n "$VLLM_PID" ] && kill -TERM "$VLLM_PID" 2>/dev/null
+  for _ in $(seq 1 40); do kill -0 "$VLLM_PID" 2>/dev/null || break; sleep 1; done
+  kill -0 "$VLLM_PID" 2>/dev/null && kill -KILL "$VLLM_PID" 2>/dev/null  # escalate: a 2nd TERM cannot stop a server that ignored the 1st
+  sleep 3; VLLM_PID=""
+}
+trap teardown EXIT
+
+# ---- Config A1: big GPU pool, NO offload -> measure MISS + GPU hit ----
+launch "--gpu-memory-utilization 0.9" "$OUT/vllm_a1.log" || exit 1
+$PY v2/exp_a_tier_latency/driver.py --endpoint $EP --model "$MODEL" --mode miss --reps 8 --out "$OUT/miss.json"
+$PY v2/exp_a_tier_latency/driver.py --endpoint $EP --model "$MODEL" --mode gpu --reps 8 --out "$OUT/gpu.json"
+teardown
+
+# ---- Config A2: small GPU pool (80k tok = 5000 blocks) + CPU offload 40GB -> CPU hit ----
+launch "--num-gpu-blocks-override 5000 --kv-offloading-size 40 --kv-offloading-backend native --disable-hybrid-kv-cache-manager" "$OUT/vllm_a2.log" || exit 1  # HMA off: same A2 flags as run_cpu.sh
+$PY v2/exp_a_tier_latency/driver.py --endpoint $EP --model "$MODEL" --mode cpu --reps 4 \
+  --flood-tokens 88000 --flood-chunk 16384 --out "$OUT/cpu.json"
+teardown
+
+# ---- PCIe backstop (uses the now-free GPU) ----
+CUDA_VISIBLE_DEVICES=$GPU $PY v2/exp_a_tier_latency/pcie_transfer.py --reps 20 --out "$OUT/pcie.json"
+
+echo "=== exp (a) DONE ==="
diff --git a/v2/exp_a_tier_latency/run_cpu.sh b/v2/exp_a_tier_latency/run_cpu.sh
new file mode 100644
index 0000000..6f8e337
--- /dev/null
+++ b/v2/exp_a_tier_latency/run_cpu.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Exp (a) CPU-tier + PCIe only (miss/gpu already done). HMA fix applied.
+set -uo pipefail +cd /home/admin/cpfs/wjh/agentic-kv +PY=.venv/bin/python +MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct +GPU=${GPU:-0} +PORT=${PORT:-8100} +EP=http://127.0.0.1:$PORT +OUT=v2/exp_a_tier_latency/results +mkdir -p "$OUT" + +VLLM_PID="" +teardown() { + [ -n "$VLLM_PID" ] && kill -TERM "$VLLM_PID" 2>/dev/null + for _ in $(seq 1 40); do kill -0 "$VLLM_PID" 2>/dev/null || break; sleep 1; done + sleep 3; VLLM_PID="" +} +trap teardown EXIT + +echo ">>> launch A2: small pool + CPU offload (HMA disabled)" +CUDA_VISIBLE_DEVICES=$GPU VLLM_LOGGING_LEVEL=WARNING \ +$PY -m vllm.entrypoints.openai.api_server --model "$MODEL" \ + --host 0.0.0.0 --port $PORT --tensor-parallel-size 1 --trust-remote-code \ + --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000 \ + --num-gpu-blocks-override 5000 --kv-offloading-size 40 --kv-offloading-backend native \ + --disable-hybrid-kv-cache-manager > "$OUT/vllm_a2.log" 2>&1 & +VLLM_PID=$! +echo " pid=$VLLM_PID waiting for health..." +$PY -c "import sys; sys.path.insert(0,'v2'); from common.util import wait_healthy; sys.exit(0 if wait_healthy('$EP',900) else 1)" \ + || { echo "LAUNCH FAILED"; tail -25 "$OUT/vllm_a2.log"; exit 1; } +echo " healthy." + +$PY v2/exp_a_tier_latency/driver.py --endpoint $EP --model "$MODEL" --mode cpu --reps 4 \ + --flood-tokens 88000 --flood-chunk 16384 --out "$OUT/cpu.json" +teardown + +CUDA_VISIBLE_DEVICES=$GPU $PY v2/exp_a_tier_latency/pcie_transfer.py --reps 20 --out "$OUT/pcie.json" +echo "=== exp (a) CPU+PCIe DONE ===" diff --git a/v2/figs/exp_a_tier_latency.png b/v2/figs/exp_a_tier_latency.png new file mode 100644 index 0000000..cbeae7d Binary files /dev/null and b/v2/figs/exp_a_tier_latency.png differ