v2 exp(a): three-tier KV-hit latency microbench (GPU >> CPU >> miss)
Measures TTFT to serve a reused prefix of length L from each KV tier on a single H20 (Qwen3-Coder-30B-A3B, vLLM 0.18.1): miss (recompute), CPU-tier hit (native DRAM offload), GPU-tier hit (HBM prefix cache). Each measured request is bracketed by /metrics scrapes so the tier is verified (vllm:prefix_cache_hits vs external_prefix_cache_hits), not assumed. Result: GPU hit is ~flat (42->111 ms over 1k->64k tokens); CPU hit is transfer-bound (PCIe H2D ~54 GB/s, 57->272 ms); miss grows superlinearly (78 ms -> 15.2 s). GPU beats CPU 1.4-2.5x (gap grows with context); miss/CPU up to 56x, miss/GPU up to 137x. pcie_transfer.py is the independent CPU-hit floor backstop. Evidence for the GPU-hit-first principle (paper section 2.2). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
3
v2/.gitignore
vendored
Normal file
3
v2/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
# raw per-request replay dumps (~0.6 MB each) — regenerable; keep summary/m0/m1
|
||||
*/results/metrics_blk*.jsonl
|
||||
*/results/vllm_*.log
|
||||
106
v2/common/util.py
Normal file
106
v2/common/util.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Shared helpers for v2 GPU-hit-first experiments."""
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import time
|
||||
import requests
|
||||
|
||||
# Qwen3-Coder geometry (from config.json): 48 layers, 4 KV heads, head_dim 128, bf16.
# The per-token figure below is consistent with 2 (K, V) * 48 * 4 * 128 * 2 bytes.
KV_BYTES_PER_TOKEN = 98304  # 96 KiB
VOCAB = 151936
# Token ids are drawn from a conservative band: skip the low special-ish ids and
# stay below the high special tokens (>= 151643).
TOK_LO, TOK_HI = 1000, 151000


def make_token_prompt(length: int, seed: int) -> list[int]:
    """Build a reproducible token-id prompt containing exactly `length` ids.

    The ids come from a private `random.Random(seed)` stream, so the result
    depends only on (length, seed):

    * same (length, seed) -> identical ids -> eligible for a prefix-cache hit;
    * different seed      -> a different id stream -> intended to be a miss.

    Every id lies in the inclusive range [TOK_LO, TOK_HI].
    """
    draw = random.Random(seed).randint
    return [draw(TOK_LO, TOK_HI) for _ in range(length)]
|
||||
|
||||
|
||||
def scrape_prefix_cache(endpoint: str) -> dict:
    """Scrape `{endpoint}/metrics` and total the prefix-cache counters.

    Returns a dict of cumulative floats under the keys gpu_hits, gpu_queries,
    ext_hits and ext_queries. Samples that share a metric name (for example
    with different label sets) are summed together.

    If the HTTP request raises, the error is swallowed and all four values are
    0.0, so a failed scrape cannot be told apart from a server that has not
    counted anything yet.
    """
    # Exact metric name (labels stripped) -> output key. Matching the full
    # `_total` name keeps the `_created` epoch-timestamp series out of the sums.
    wanted = {
        "vllm:external_prefix_cache_hits_total": "ext_hits",
        "vllm:external_prefix_cache_queries_total": "ext_queries",
        "vllm:prefix_cache_hits_total": "gpu_hits",
        "vllm:prefix_cache_queries_total": "gpu_queries",
    }
    totals = {"gpu_hits": 0.0, "gpu_queries": 0.0, "ext_hits": 0.0, "ext_queries": 0.0}
    try:
        body = requests.get(f"{endpoint}/metrics", timeout=10).text
    except Exception:
        return totals
    for row in body.splitlines():
        # skip blank lines and the "# HELP" / "# TYPE" comment lines
        if not row or row.startswith("#"):
            continue
        try:
            # the sample value is whatever follows the last space on the line
            series_name, raw_value = row.rsplit(" ", 1)
            value = float(raw_value)
        except ValueError:
            continue
        key = wanted.get(series_name.split("{", 1)[0])
        if key is not None:
            totals[key] += value
    return totals
|
||||
|
||||
|
||||
def measure_ttft(endpoint: str, model: str, prompt_ids: list[int],
                 max_tokens: int = 1, timeout: float = 600.0) -> dict:
    """Send one streaming /v1/completions request; return TTFT and e2e seconds.

    TTFT is the time from just before the POST is issued to the first streamed
    `data:` chunk whose first choice carries non-empty text (intended as the
    prefill wall time). If no chunk ever carries text, TTFT falls back to the
    end-to-end time.

    Args:
        endpoint: Base URL of the server; "/v1/completions" is appended.
        model: Model name placed in the request payload.
        prompt_ids: Prompt given as a list of token ids.
        max_tokens: Number of tokens to generate.
        timeout: Timeout in seconds handed to `requests.post`.

    Returns:
        {"ttft_s": float, "e2e_s": float, "usage": dict | None}, where usage is
        the last non-empty "usage" object seen in the stream.

    Raises:
        requests.HTTPError: via `raise_for_status()` on an error status.
        ValueError: if a `data:` line other than "[DONE]" is not valid JSON
            (propagated from `json.loads`).
    """
    # Function-scope import: resolved once per call instead of once per
    # streamed line, as it was when it sat inside the read loop.
    import json

    url = f"{endpoint}/v1/completions"
    payload = {
        "model": model,
        "prompt": prompt_ids,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    t0 = time.perf_counter()
    ttft = None
    usage = None
    with requests.post(url, json=payload, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        for raw in r.iter_lines():
            if not raw:
                continue
            line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
            # only server-sent-event payload lines are of interest
            if not line.startswith("data: "):
                continue
            data = line[6:]
            if data.strip() == "[DONE]":
                break
            obj = json.loads(data)
            if obj.get("usage"):
                usage = obj["usage"]
            # a chunk may have no choices at all, hence the `or []`
            choices = obj.get("choices") or []
            if ttft is None and choices and choices[0].get("text"):
                ttft = time.perf_counter() - t0
    e2e = time.perf_counter() - t0
    return {"ttft_s": ttft if ttft is not None else e2e, "e2e_s": e2e, "usage": usage}
|
||||
|
||||
|
||||
def wait_healthy(endpoint: str, timeout: float = 900.0) -> bool:
    """Poll `{endpoint}/health` until it answers 200 or `timeout` seconds pass.

    Args:
        endpoint: Base URL of the server.
        timeout: Overall budget in seconds for the polling loop.

    Returns:
        True as soon as a health probe returns HTTP 200, False if the budget
        runs out first.
    """
    # monotonic clock: the deadline must not move if the wall clock is adjusted
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(f"{endpoint}/health", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            # connection refused / timed out while the server is still starting;
            # keep polling
            pass
        time.sleep(3)
    return False
|
||||
140
v2/exp_a_tier_latency/driver.py
Normal file
140
v2/exp_a_tier_latency/driver.py
Normal file
@@ -0,0 +1,140 @@
|
||||
"""Exp (a): three-tier hit-latency microbench.
|
||||
|
||||
Measures TTFT of serving a prefix of length L from each tier:
|
||||
- miss : fresh unique prompt -> full prefill (recompute)
|
||||
- gpu : re-request same prompt -> HBM prefix-cache hit
|
||||
- cpu : warm -> evict to CPU offload tier -> re-request -> DRAM hit
|
||||
|
||||
Each measured request is bracketed by /metrics scrapes so the tier is *verified*
|
||||
(gpu_hits delta vs external_prefix_cache_hits delta), not assumed.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
||||
from common.util import make_token_prompt, scrape_prefix_cache, measure_ttft # noqa: E402
|
||||
|
||||
# Default sweep of prefix lengths, in tokens.
LENGTHS = [1024, 2048, 4096, 8192, 16384, 32768, 65536]


def delta(a: dict, b: dict) -> dict:
    """Return `b[k] - a[k]` for every key `k` of `a`.

    Keys present only in `b` are ignored; a key of `a` that is missing from
    `b` raises KeyError.
    """
    diff = {}
    for key, before in a.items():
        diff[key] = b[key] - before
    return diff
|
||||
|
||||
|
||||
def one_measurement(ep, model, prompt, expect):
    """Time one request and classify which KV tier served it.

    The request is bracketed by two /metrics scrapes; the counter deltas decide
    the observed tier:

    * external (offload) hit counter grew -> "cpu"
    * otherwise GPU prefix-cache hit counter grew -> "gpu"
    * neither grew -> "miss"

    Args:
        ep: Base URL of the server.
        model: Model name for the request.
        prompt: Prompt as a list of token ids.
        expect: Tier the caller intended to exercise; recorded unchanged.

    Returns:
        A row dict with ttft_s, e2e_s, tier_observed, expect, d_gpu_hits and
        d_ext_hits.
    """
    # NOTE: scrape_prefix_cache reports all zeros when a scrape fails, so a
    # failed scrape on either side distorts these deltas.
    m0 = scrape_prefix_cache(ep)
    res = measure_ttft(ep, model, prompt)
    m1 = scrape_prefix_cache(ep)
    d = delta(m0, m1)
    # classify; counters are floats, so compare against 0.5 rather than 0
    if d["ext_hits"] > 0.5:
        tier = "cpu"
    elif d["gpu_hits"] > 0.5:
        tier = "gpu"
    else:
        tier = "miss"
    return {"ttft_s": res["ttft_s"], "e2e_s": res["e2e_s"],
            "tier_observed": tier, "expect": expect,
            "d_gpu_hits": d["gpu_hits"], "d_ext_hits": d["ext_hits"]}
|
||||
|
||||
|
||||
def run_miss(ep, model, L, reps, base):
    """Measure `reps` requests of length `L`, each with its own seed.

    Seeds are base, base + 1, ...; every prompt is therefore different from
    the others in this call and is expected to need a full prefill.
    Returns the list of rows produced by one_measurement.
    """
    return [
        one_measurement(ep, model, make_token_prompt(L, seed=base + rep), "miss")
        for rep in range(reps)
    ]
|
||||
|
||||
|
||||
def run_gpu(ep, model, L, reps, base):
    """Measure `reps` repeat requests of length `L` right after warming them.

    Each repetition sends its prompt once untimed (warm-up), then sends the
    identical prompt again through one_measurement, expecting a GPU-tier hit.
    Returns the list of measured rows.
    """
    measured = []
    for seed in range(base, base + reps):
        prompt = make_token_prompt(L, seed=seed)
        measure_ttft(ep, model, prompt)  # warm-up pass, result discarded
        measured.append(one_measurement(ep, model, prompt, "gpu"))  # timed repeat
    return measured
|
||||
|
||||
|
||||
def run_cpu(ep, model, L, reps, base, flood_tokens, flood_chunk):
    """Measure `reps` requests of length `L` that should be served from the CPU tier.

    Per repetition: send the prompt once (warm-up), then send distinct filler
    prompts of `flood_chunk` tokens until at least `flood_tokens` tokens have
    been sent. The filler is intended to push the warmed prompt out of the GPU
    pool into the offload tier. The original prompt is then re-sent and measured.
    Returns the list of measured rows.
    """
    measured = []
    for rep in range(reps):
        seed = base + rep
        prompt = make_token_prompt(L, seed=seed)
        measure_ttft(ep, model, prompt)  # warm-up pass, result discarded

        # filler seeds live in their own range, 1000 apart per repetition
        flood_seed = 10_000_000 + seed * 1000
        flooded = 0
        while flooded < flood_tokens:
            measure_ttft(ep, model, make_token_prompt(flood_chunk, seed=flood_seed))
            flood_seed += 1
            flooded += flood_chunk

        measured.append(one_measurement(ep, model, prompt, "cpu"))  # expected CPU-tier hit
    return measured
|
||||
|
||||
|
||||
def summarize(rows):
    """Collapse measurement rows into summary statistics.

    Returns a dict with:
      n              number of rows
      ttft_p50/mean/min/max
                     statistics over the rows' "ttft_s" (None when no rows)
      tier_observed  most frequent "tier_observed" value (None when no rows)
      verified_frac  share of rows whose "tier_observed" equals "expect"
                     (0 when no rows)
    """
    ttfts = sorted(row["ttft_s"] for row in rows)
    if ttfts:
        p50 = statistics.median(ttfts)
        mean = statistics.fmean(ttfts)
        fastest, slowest = ttfts[0], ttfts[-1]
    else:
        p50 = mean = fastest = slowest = None

    if rows:
        matching = sum(1 for row in rows if row["tier_observed"] == row["expect"])
        verified = matching / len(rows)
    else:
        verified = 0

    return {
        "n": len(rows),
        "ttft_p50": p50,
        "ttft_mean": mean,
        "ttft_min": fastest,
        "ttft_max": slowest,
        "tier_observed": _modal([row["tier_observed"] for row in rows]),
        "verified_frac": verified,
    }


def _modal(xs):
    """Return the most common element of `xs`, or None when `xs` is empty."""
    from collections import Counter

    if not xs:
        return None
    (value, _count), = Counter(xs).most_common(1)
    return value
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: sweep prefix lengths for one tier mode.

    For every length the selected runner (miss / gpu / cpu) produces `--reps`
    rows; the summary and the raw rows are stored under that length and the
    JSON file named by `--out` is rewritten, so an interrupted sweep keeps the
    lengths finished so far.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--endpoint", required=True)
    ap.add_argument("--model", required=True)
    ap.add_argument("--mode", required=True, choices=["miss", "gpu", "cpu"])
    ap.add_argument("--reps", type=int, default=8)
    ap.add_argument("--out", required=True)
    ap.add_argument("--lengths", type=str, default=None,
                    help="comma list override, e.g. 1024,4096")
    ap.add_argument("--flood-tokens", type=int, default=120000,
                    help="cpu mode: distinct tokens to flush GPU pool")
    ap.add_argument("--flood-chunk", type=int, default=8192)
    args = ap.parse_args()

    lengths = ([int(x) for x in args.lengths.split(",")] if args.lengths else LENGTHS)
    out = {"mode": args.mode, "reps": args.reps, "by_length": {}, "raw": {}}
    # each mode starts from its own seed base
    base = {"miss": 1_000, "gpu": 2_000, "cpu": 3_000}[args.mode]

    for L in lengths:
        t0 = time.time()
        if args.mode == "miss":
            rows = run_miss(args.endpoint, args.model, L, args.reps, base)
        elif args.mode == "gpu":
            rows = run_gpu(args.endpoint, args.model, L, args.reps, base)
        else:
            rows = run_cpu(args.endpoint, args.model, L, args.reps, base,
                           args.flood_tokens, args.flood_chunk)
        # move to a fresh seed range for the next length
        base += 100_000
        s = summarize(rows)
        out["by_length"][str(L)] = s
        out["raw"][str(L)] = rows
        # summarize() yields None statistics when there are no rows (e.g.
        # --reps 0); format defensively instead of failing on `None:.4f`
        p50 = s["ttft_p50"]
        p50_txt = f"{p50:.4f}s" if p50 is not None else "n/a"
        print(f"[{args.mode}] L={L:>6} ttft_p50={p50_txt} "
              f"tier={s['tier_observed']} verified={s['verified_frac']:.0%} "
              f"({time.time()-t0:.0f}s)", flush=True)
        # checkpoint after every length
        Path(args.out).write_text(json.dumps(out, indent=2))

    print(f"wrote {args.out}")


if __name__ == "__main__":
    main()
|
||||
59
v2/exp_a_tier_latency/pcie_transfer.py
Normal file
59
v2/exp_a_tier_latency/pcie_transfer.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Exp (a) backstop: direct CPU(DRAM)->GPU(HBM) KV-transfer cost.
|
||||
|
||||
Independent lower bound on a CPU-tier hit: fetching L tokens' KV over the
|
||||
host<->device link. CPU_hit(L) >= GPU_hit(L) + KV_bytes(L) / BW_h2d.
|
||||
Uses pinned host memory (best case for the offload tier, which pins buffers).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
KV_BYTES_PER_TOKEN = 98304  # Qwen3-Coder, bf16
# Prefix lengths (tokens) whose KV footprint is transferred.
LENGTHS = [1024, 2048, 4096, 8192, 16384, 32768, 65536]


def time_h2d(nbytes: int, reps: int) -> float:
    """Time a pinned-host -> GPU copy of about `nbytes` bytes.

    Args:
        nbytes: Payload size in bytes; the buffers hold `nbytes // 2` two-byte
            elements.
        reps: Number of timed copies; must be at least 1.

    Returns:
        The middle element of the sorted per-copy wall times in seconds
        (`ts[len(ts) // 2]`, i.e. the upper median for an even `reps`).

    Raises:
        ValueError: if `reps` is less than 1.
    """
    # validate before allocating anything: with no samples there is no median
    if reps < 1:
        raise ValueError(f"reps must be >= 1, got {reps}")
    # 2-byte elements; float16 has the same element size as the bf16 KV cache
    n = nbytes // 2
    host = torch.empty(n, dtype=torch.float16, pin_memory=True)
    dev = torch.empty(n, dtype=torch.float16, device="cuda")
    # warmup
    for _ in range(3):
        dev.copy_(host, non_blocking=True)
    torch.cuda.synchronize()
    ts = []
    for _ in range(reps):
        t0 = time.perf_counter()
        dev.copy_(host, non_blocking=True)
        # the copy is issued non-blocking; wait for it before reading the clock
        torch.cuda.synchronize()
        ts.append(time.perf_counter() - t0)
    ts.sort()
    return ts[len(ts) // 2]
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: time the H2D copy for each length in LENGTHS.

    Prints one line per length and writes a JSON file to `--out` holding the
    device name and, per length, kv_bytes, transfer_s and bw_GBps.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--reps", type=int, default=20)
    ap.add_argument("--out", required=True)
    args = ap.parse_args()
    # explicit check rather than `assert`, which is stripped under `python -O`
    if not torch.cuda.is_available():
        raise SystemExit("need a GPU")
    device = torch.cuda.get_device_name(0)
    print("device:", device)

    out = {"device": device, "by_length": {}}
    for L in LENGTHS:
        nbytes = L * KV_BYTES_PER_TOKEN
        sec = time_h2d(nbytes, args.reps)
        bw = nbytes / sec / 1e9
        out["by_length"][str(L)] = {
            "kv_bytes": nbytes, "transfer_s": sec, "bw_GBps": bw,
        }
        print(f"L={L:>6} KV={nbytes/1e9:6.3f}GB t={sec*1000:7.2f}ms bw={bw:6.1f} GB/s", flush=True)
    # context manager so the file is flushed and closed deterministically
    with open(args.out, "w") as fh:
        json.dump(out, fh, indent=2)
    print("wrote", args.out)


if __name__ == "__main__":
    main()
|
||||
68
v2/exp_a_tier_latency/plot.py
Normal file
68
v2/exp_a_tier_latency/plot.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""Plot exp (a): TTFT vs prefix length for miss / gpu-hit / cpu-hit (+ PCIe floor)."""
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Results directory (argv[1]) and output figure path (argv[2]), with defaults.
R = Path(sys.argv[1] if len(sys.argv) > 1 else "v2/exp_a_tier_latency/results")
FIG = Path(sys.argv[2] if len(sys.argv) > 2 else "v2/figs/exp_a_tier_latency.png")
KV_BYTES_PER_TOKEN = 98304


def load(name):
    """Return the parsed JSON stored at R/name, or None if that file is absent."""
    p = R / name
    if not p.exists():
        return None
    # context manager: close the handle instead of leaving it to the GC
    with open(p) as fh:
        return json.load(fh)


miss, gpu, cpu, pcie = load("miss.json"), load("gpu.json"), load("cpu.json"), load("pcie.json")
|
||||
|
||||
|
||||
def series(d):
    """Turn a results dict into parallel (lengths, ttft_p50) lists.

    Lengths are the integer form of the "by_length" keys, in ascending order;
    the second list holds the matching "ttft_p50" values. A falsy `d` (None or
    empty) gives two empty lists.
    """
    if not d:
        return [], []
    pairs = [(int(length), stats["ttft_p50"]) for length, stats in d["by_length"].items()]
    pairs.sort(key=lambda pair: pair[0])
    lengths, p50s = [], []
    for length, p50 in pairs:
        lengths.append(length)
        p50s.append(p50)
    return lengths, p50s
|
||||
|
||||
|
||||
# Figure: median TTFT against prefix length, one curve per tier with results.
fig, ax = plt.subplots(figsize=(7.2, 5.0))
tier_curves = [
    (miss, "miss (recompute)", "o", "#d62728"),
    (cpu, "CPU-tier hit (DRAM offload)", "s", "#ff7f0e"),
    (gpu, "GPU-tier hit (HBM APC)", "^", "#2ca02c"),
]
for results, label, marker, colour in tier_curves:
    lengths, p50s = series(results)
    if not lengths:
        continue  # no results file for this tier
    ax.plot(lengths, p50s, marker=marker, label=label, color=colour,
            linewidth=2, markersize=7)

# Measured H2D transfer time, drawn as a dashed reference line when available.
if pcie:
    floor = sorted((int(k), v["transfer_s"]) for k, v in pcie["by_length"].items())
    ax.plot([length for length, _ in floor], [sec for _, sec in floor],
            "--", color="#7f7f7f", linewidth=1.4,
            label="CPU-hit transfer floor (PCIe H2D)")

ax.set_xscale("log", base=2)
ax.set_yscale("log")
ax.set_xlabel("Reused prefix length (tokens)")
ax.set_ylabel("TTFT (s, log)")
ax.set_title("Cost of serving a reused prefix from each KV tier\nQwen3-Coder-30B-A3B, 1xH20")
ax.grid(True, which="both", alpha=0.3)
ax.legend()

# make sure the output directory exists before saving
FIG.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(FIG, dpi=140)
print("wrote", FIG)
|
||||
|
||||
# Table
def _p50_at(d, L):
    """Median TTFT stored for length L in results dict d, or None if missing."""
    return d["by_length"].get(str(L), {}).get("ttft_p50") if d else None


def _cell(x):
    """Format a TTFT in seconds for the table; a dash stands in for None."""
    return f"{x:.4f}" if x is not None else " - "


def _ratio(num, den):
    """Format num/den as e.g. '2.5x'; a dash when either side is missing or zero."""
    return f"{num/den:.1f}x" if (num and den) else " -"


print(f"\n{'L':>7} {'miss(s)':>10} {'cpu(s)':>10} {'gpu(s)':>10} {'miss/cpu':>9} {'cpu/gpu':>9}")
# union of the lengths present in any of the three result files
allL = sorted({int(k) for d in (miss, gpu, cpu) if d for k in d["by_length"]})
for L in allL:
    m, c, g = _p50_at(miss, L), _p50_at(cpu, L), _p50_at(gpu, L)
    print(f"{L:>7} {_cell(m):>10} {_cell(c):>10} {_cell(g):>10} "
          f"{_ratio(m, c):>9} {_ratio(c, g):>9}")

if cpu:
    vf = {k: v.get("verified_frac") for k, v in cpu["by_length"].items()}
    print("\nCPU-tier verified fraction (ext_hits>0):", vf)
|
||||
309
v2/exp_a_tier_latency/results/cpu.json
Normal file
309
v2/exp_a_tier_latency/results/cpu.json
Normal file
@@ -0,0 +1,309 @@
|
||||
{
|
||||
"mode": "cpu",
|
||||
"reps": 4,
|
||||
"by_length": {
|
||||
"1024": {
|
||||
"n": 4,
|
||||
"ttft_p50": 0.057389369496377185,
|
||||
"ttft_mean": 0.08805505199416075,
|
||||
"ttft_min": 0.055113587994128466,
|
||||
"ttft_max": 0.18232788098976016,
|
||||
"tier_observed": "cpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"2048": {
|
||||
"n": 4,
|
||||
"ttft_p50": 0.05680296401260421,
|
||||
"ttft_mean": 0.05688378225750057,
|
||||
"ttft_min": 0.05582832600339316,
|
||||
"ttft_max": 0.05810087500140071,
|
||||
"tier_observed": "cpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"4096": {
|
||||
"n": 4,
|
||||
"ttft_p50": 0.06417885900009423,
|
||||
"ttft_mean": 0.06439992749801604,
|
||||
"ttft_min": 0.063741421996383,
|
||||
"ttft_max": 0.0655005699954927,
|
||||
"tier_observed": "cpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"8192": {
|
||||
"n": 4,
|
||||
"ttft_p50": 0.07612077200610656,
|
||||
"ttft_mean": 0.07600563450250775,
|
||||
"ttft_min": 0.07499952000216581,
|
||||
"ttft_max": 0.07678147399565205,
|
||||
"tier_observed": "cpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"16384": {
|
||||
"n": 4,
|
||||
"ttft_p50": 0.10476256850233767,
|
||||
"ttft_mean": 0.10426848525821697,
|
||||
"ttft_min": 0.10178845902555622,
|
||||
"ttft_max": 0.10576034500263631,
|
||||
"tier_observed": "cpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"32768": {
|
||||
"n": 4,
|
||||
"ttft_p50": 0.15755228500347584,
|
||||
"ttft_mean": 0.15822456649766536,
|
||||
"ttft_min": 0.15535229499801062,
|
||||
"ttft_max": 0.16244140098569915,
|
||||
"tier_observed": "cpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"65536": {
|
||||
"n": 4,
|
||||
"ttft_p50": 0.27175235451431945,
|
||||
"ttft_mean": 0.2716009145078715,
|
||||
"ttft_min": 0.26732781299506314,
|
||||
"ttft_max": 0.27557113600778393,
|
||||
"tier_observed": "cpu",
|
||||
"verified_frac": 1.0
|
||||
}
|
||||
},
|
||||
"raw": {
|
||||
"1024": [
|
||||
{
|
||||
"ttft_s": 0.18232788098976016,
|
||||
"e2e_s": 0.182449893996818,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 1024.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.05692533400724642,
|
||||
"e2e_s": 0.057079910999163985,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 1024.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.05785340498550795,
|
||||
"e2e_s": 0.05796545499470085,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 1024.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.055113587994128466,
|
||||
"e2e_s": 0.055113587994128466,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 1024.0
|
||||
}
|
||||
],
|
||||
"2048": [
|
||||
{
|
||||
"ttft_s": 0.05582832600339316,
|
||||
"e2e_s": 0.055943820014363155,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 2048.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.057600113010266796,
|
||||
"e2e_s": 0.05772249499568716,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 2048.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.05600581501494162,
|
||||
"e2e_s": 0.05611848901025951,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 2048.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.05810087500140071,
|
||||
"e2e_s": 0.0582130889815744,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 2048.0
|
||||
}
|
||||
],
|
||||
"4096": [
|
||||
{
|
||||
"ttft_s": 0.0655005699954927,
|
||||
"e2e_s": 0.06560429997625761,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 4096.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.063741421996383,
|
||||
"e2e_s": 0.06384500698186457,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 4096.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.06445824800175615,
|
||||
"e2e_s": 0.06458494698745199,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 4096.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.06389946999843232,
|
||||
"e2e_s": 0.06403137900633737,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 4096.0
|
||||
}
|
||||
],
|
||||
"8192": [
|
||||
{
|
||||
"ttft_s": 0.0759067680046428,
|
||||
"e2e_s": 0.0759067680046428,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 8192.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07678147399565205,
|
||||
"e2e_s": 0.07678147399565205,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 8192.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07633477600757033,
|
||||
"e2e_s": 0.0764636590029113,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 8192.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07499952000216581,
|
||||
"e2e_s": 0.07499952000216581,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 8192.0
|
||||
}
|
||||
],
|
||||
"16384": [
|
||||
{
|
||||
"ttft_s": 0.10444335200008936,
|
||||
"e2e_s": 0.104556644015247,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 16384.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.10178845902555622,
|
||||
"e2e_s": 0.10188649001065642,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 16384.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.10508178500458598,
|
||||
"e2e_s": 0.10518974298611283,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 16384.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.10576034500263631,
|
||||
"e2e_s": 0.10587632199167274,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 16384.0
|
||||
}
|
||||
],
|
||||
"32768": [
|
||||
{
|
||||
"ttft_s": 0.15535229499801062,
|
||||
"e2e_s": 0.155460513982689,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 32768.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.15777600501314737,
|
||||
"e2e_s": 0.15777600501314737,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 32768.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.16244140098569915,
|
||||
"e2e_s": 0.16244140098569915,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 32768.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.15732856499380432,
|
||||
"e2e_s": 0.1574467390018981,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 32768.0
|
||||
}
|
||||
],
|
||||
"65536": [
|
||||
{
|
||||
"ttft_s": 0.27309533301740885,
|
||||
"e2e_s": 0.273235183005454,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 65536.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.27040937601123005,
|
||||
"e2e_s": 0.27040937601123005,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 65536.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.27557113600778393,
|
||||
"e2e_s": 0.27557113600778393,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 65536.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.26732781299506314,
|
||||
"e2e_s": 0.2674778919899836,
|
||||
"tier_observed": "cpu",
|
||||
"expect": "cpu",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 65536.0
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
533
v2/exp_a_tier_latency/results/gpu.json
Normal file
533
v2/exp_a_tier_latency/results/gpu.json
Normal file
@@ -0,0 +1,533 @@
|
||||
{
|
||||
"mode": "gpu",
|
||||
"reps": 8,
|
||||
"by_length": {
|
||||
"1024": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.04180275100225117,
|
||||
"ttft_mean": 0.05689269150025211,
|
||||
"ttft_min": 0.041313502995762974,
|
||||
"ttft_max": 0.1606091230059974,
|
||||
"tier_observed": "gpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"2048": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.044922845510882325,
|
||||
"ttft_mean": 0.04646045462868642,
|
||||
"ttft_min": 0.04261300901998766,
|
||||
"ttft_max": 0.06082483098725788,
|
||||
"tier_observed": "gpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"4096": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.0462174300046172,
|
||||
"ttft_mean": 0.04691218675361597,
|
||||
"ttft_min": 0.044408742018276826,
|
||||
"ttft_max": 0.05101387499598786,
|
||||
"tier_observed": "gpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"8192": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.052487702007056214,
|
||||
"ttft_mean": 0.05252782000388834,
|
||||
"ttft_min": 0.050384567002765834,
|
||||
"ttft_max": 0.055209266021847725,
|
||||
"tier_observed": "gpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"16384": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.06340778700541705,
|
||||
"ttft_mean": 0.06307360512437299,
|
||||
"ttft_min": 0.059953891002805904,
|
||||
"ttft_max": 0.06587072199909016,
|
||||
"tier_observed": "gpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"32768": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.07986902150150854,
|
||||
"ttft_mean": 0.08412684850554797,
|
||||
"ttft_min": 0.07615292401169427,
|
||||
"ttft_max": 0.11761908099288121,
|
||||
"tier_observed": "gpu",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"65536": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.11140661900572013,
|
||||
"ttft_mean": 0.10751268000240088,
|
||||
"ttft_min": 0.07390080401091836,
|
||||
"ttft_max": 0.1206158839922864,
|
||||
"tier_observed": "gpu",
|
||||
"verified_frac": 1.0
|
||||
}
|
||||
},
|
||||
"raw": {
|
||||
"1024": [
|
||||
{
|
||||
"ttft_s": 0.1606091230059974,
|
||||
"e2e_s": 0.16078226300305687,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 1008.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.043477901024743915,
|
||||
"e2e_s": 0.0436010490229819,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 1008.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04137404798530042,
|
||||
"e2e_s": 0.04146770399529487,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 1008.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04179324599681422,
|
||||
"e2e_s": 0.041887808009050786,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 1008.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04326947100344114,
|
||||
"e2e_s": 0.04335355598595925,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 1008.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04149198398226872,
|
||||
"e2e_s": 0.04157822398701683,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 1008.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04181225600768812,
|
||||
"e2e_s": 0.04190706100780517,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 1008.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.041313502995762974,
|
||||
"e2e_s": 0.041313502995762974,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 1008.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"2048": [
|
||||
{
|
||||
"ttft_s": 0.04491939002764411,
|
||||
"e2e_s": 0.045019031007541344,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 2032.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.045014784001978114,
|
||||
"e2e_s": 0.04511277299025096,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 2032.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04492630099412054,
|
||||
"e2e_s": 0.04502850098651834,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 2032.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04566141500254162,
|
||||
"e2e_s": 0.04576313399593346,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 2032.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04261300901998766,
|
||||
"e2e_s": 0.04271370900096372,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 2032.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.06082483098725788,
|
||||
"e2e_s": 0.06096197199076414,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 2032.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04355804901570082,
|
||||
"e2e_s": 0.04355804901570082,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 2032.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.044165857980260625,
|
||||
"e2e_s": 0.044268568977713585,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 2032.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"4096": [
|
||||
{
|
||||
"ttft_s": 0.05101387499598786,
|
||||
"e2e_s": 0.051123478973750025,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 4080.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.044408742018276826,
|
||||
"e2e_s": 0.044408742018276826,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 4080.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04538871700060554,
|
||||
"e2e_s": 0.045498208986828104,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 4080.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04754545699688606,
|
||||
"e2e_s": 0.047664124984294176,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 4080.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04840670898556709,
|
||||
"e2e_s": 0.04840670898556709,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 4080.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.0462190090038348,
|
||||
"e2e_s": 0.04632823000429198,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 4080.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.04609913402236998,
|
||||
"e2e_s": 0.046204126003431156,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 4080.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.0462158510053996,
|
||||
"e2e_s": 0.0462158510053996,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 4080.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"8192": [
|
||||
{
|
||||
"ttft_s": 0.05042222701013088,
|
||||
"e2e_s": 0.05053543800022453,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 8176.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.05319672200130299,
|
||||
"e2e_s": 0.053308423986891285,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 8176.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.05063424099353142,
|
||||
"e2e_s": 0.05073276098119095,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 8176.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.050384567002765834,
|
||||
"e2e_s": 0.05048462699051015,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 8176.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.055105848994571716,
|
||||
"e2e_s": 0.055215683998540044,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 8176.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.05349100599414669,
|
||||
"e2e_s": 0.053595816978486255,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 8176.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.05177868201280944,
|
||||
"e2e_s": 0.05188246400211938,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 8176.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.055209266021847725,
|
||||
"e2e_s": 0.05531894601881504,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 8176.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"16384": [
|
||||
{
|
||||
"ttft_s": 0.0633803239907138,
|
||||
"e2e_s": 0.06349112599855289,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 16368.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.06337857199832797,
|
||||
"e2e_s": 0.06350608498905785,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 16368.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.06098292299429886,
|
||||
"e2e_s": 0.061115075019188225,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 16368.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.06343525002012029,
|
||||
"e2e_s": 0.06355450401315466,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 16368.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.0636955969966948,
|
||||
"e2e_s": 0.0636955969966948,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 16368.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.06389156199293211,
|
||||
"e2e_s": 0.06389156199293211,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 16368.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.06587072199909016,
|
||||
"e2e_s": 0.06587072199909016,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 16368.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.059953891002805904,
|
||||
"e2e_s": 0.060058912000386044,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 16368.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"32768": [
|
||||
{
|
||||
"ttft_s": 0.07615292401169427,
|
||||
"e2e_s": 0.07625289200223051,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 32752.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07992992899380624,
|
||||
"e2e_s": 0.0800386439950671,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 32752.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07980811400921084,
|
||||
"e2e_s": 0.07995001602103002,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 32752.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.11761908099288121,
|
||||
"e2e_s": 0.11776423300034367,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 32752.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07834753501811065,
|
||||
"e2e_s": 0.07834753501811065,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 32752.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.0814115820103325,
|
||||
"e2e_s": 0.0814115820103325,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 32752.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.08212830501724966,
|
||||
"e2e_s": 0.08224253499065526,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 32752.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07761731799109839,
|
||||
"e2e_s": 0.07772363899857737,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 32752.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"65536": [
|
||||
{
|
||||
"ttft_s": 0.1206158839922864,
|
||||
"e2e_s": 0.1206158839922864,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 65520.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.10727833199780434,
|
||||
"e2e_s": 0.10727833199780434,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 65520.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.11010084400186315,
|
||||
"e2e_s": 0.11023741000099108,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 65520.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.11566799599677324,
|
||||
"e2e_s": 0.1157765949901659,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 65520.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.11422122400836088,
|
||||
"e2e_s": 0.11422122400836088,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 65520.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07390080401091836,
|
||||
"e2e_s": 0.07390080401091836,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 65520.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.11271239400957711,
|
||||
"e2e_s": 0.11271239400957711,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 65520.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.10560396200162359,
|
||||
"e2e_s": 0.10572021701955236,
|
||||
"tier_observed": "gpu",
|
||||
"expect": "gpu",
|
||||
"d_gpu_hits": 65520.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
533
v2/exp_a_tier_latency/results/miss.json
Normal file
533
v2/exp_a_tier_latency/results/miss.json
Normal file
@@ -0,0 +1,533 @@
|
||||
{
|
||||
"mode": "miss",
|
||||
"reps": 8,
|
||||
"by_length": {
|
||||
"1024": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.07843716802017298,
|
||||
"ttft_mean": 0.1441972145003092,
|
||||
"ttft_min": 0.0768489159818273,
|
||||
"ttft_max": 0.6051040079910308,
|
||||
"tier_observed": "miss",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"2048": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.13254689799214248,
|
||||
"ttft_mean": 0.14012471562818973,
|
||||
"ttft_min": 0.13148935200297274,
|
||||
"ttft_max": 0.19460841300315224,
|
||||
"tier_observed": "miss",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"4096": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.2613626064994605,
|
||||
"ttft_mean": 0.2620435417484259,
|
||||
"ttft_min": 0.26085699602845125,
|
||||
"ttft_max": 0.2652779980271589,
|
||||
"tier_observed": "miss",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"8192": {
|
||||
"n": 8,
|
||||
"ttft_p50": 0.588181210012408,
|
||||
"ttft_mean": 0.5881437246280257,
|
||||
"ttft_min": 0.5867919930024073,
|
||||
"ttft_max": 0.5897468629991636,
|
||||
"tier_observed": "miss",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"16384": {
|
||||
"n": 8,
|
||||
"ttft_p50": 1.5470821364870062,
|
||||
"ttft_mean": 1.5479571051182575,
|
||||
"ttft_min": 1.5448924789961893,
|
||||
"ttft_max": 1.5552692519850098,
|
||||
"tier_observed": "miss",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"32768": {
|
||||
"n": 8,
|
||||
"ttft_p50": 4.603862981006387,
|
||||
"ttft_mean": 4.601982127005613,
|
||||
"ttft_min": 4.594119774992578,
|
||||
"ttft_max": 4.608500114001799,
|
||||
"tier_observed": "miss",
|
||||
"verified_frac": 1.0
|
||||
},
|
||||
"65536": {
|
||||
"n": 8,
|
||||
"ttft_p50": 15.230140178493457,
|
||||
"ttft_mean": 15.229316346121777,
|
||||
"ttft_min": 15.177009812992765,
|
||||
"ttft_max": 15.28301460199873,
|
||||
"tier_observed": "miss",
|
||||
"verified_frac": 1.0
|
||||
}
|
||||
},
|
||||
"raw": {
|
||||
"1024": [
|
||||
{
|
||||
"ttft_s": 0.6051040079910308,
|
||||
"e2e_s": 0.6052766389911994,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.08014080999419093,
|
||||
"e2e_s": 0.08025239198468626,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07735737299663015,
|
||||
"e2e_s": 0.07746260898420587,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.0768489159818273,
|
||||
"e2e_s": 0.0768489159818273,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07845149302738719,
|
||||
"e2e_s": 0.07860825702664442,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07842284301295877,
|
||||
"e2e_s": 0.07842284301295877,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.07716230300138704,
|
||||
"e2e_s": 0.07729722000658512,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.08008996999706142,
|
||||
"e2e_s": 0.08020236299489625,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"2048": [
|
||||
{
|
||||
"ttft_s": 0.19460841300315224,
|
||||
"e2e_s": 0.19472959099221043,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.13326648401562124,
|
||||
"e2e_s": 0.13339112501125783,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.1324451889959164,
|
||||
"e2e_s": 0.13255105601274408,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.13292257199645974,
|
||||
"e2e_s": 0.1330343289882876,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.13170140300644562,
|
||||
"e2e_s": 0.13180866098264232,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.13264860698836856,
|
||||
"e2e_s": 0.13274579899734817,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.13148935200297274,
|
||||
"e2e_s": 0.13160509002045728,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.13191570501658134,
|
||||
"e2e_s": 0.1320405500009656,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"4096": [
|
||||
{
|
||||
"ttft_s": 0.2652779980271589,
|
||||
"e2e_s": 0.2653735490166582,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.26092263497412205,
|
||||
"e2e_s": 0.2610465929901693,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.26085699602845125,
|
||||
"e2e_s": 0.2609657910070382,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.2613706949923653,
|
||||
"e2e_s": 0.2613706949923653,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.26135451800655574,
|
||||
"e2e_s": 0.2614478030009195,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.2610389889741782,
|
||||
"e2e_s": 0.2611535779724363,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.26337735800188966,
|
||||
"e2e_s": 0.2634820609819144,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.26214914498268627,
|
||||
"e2e_s": 0.26226521999342367,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"8192": [
|
||||
{
|
||||
"ttft_s": 0.5897468629991636,
|
||||
"e2e_s": 0.5898835949774366,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.5884731799887959,
|
||||
"e2e_s": 0.5884731799887959,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.588430589006748,
|
||||
"e2e_s": 0.588430589006748,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.5895502530038357,
|
||||
"e2e_s": 0.5896741840115283,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.5879318310180679,
|
||||
"e2e_s": 0.5879318310180679,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.5870601140195504,
|
||||
"e2e_s": 0.5871822330227587,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.5871649739856366,
|
||||
"e2e_s": 0.5871649739856366,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 0.5867919930024073,
|
||||
"e2e_s": 0.5869127069890965,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"16384": [
|
||||
{
|
||||
"ttft_s": 1.5552692519850098,
|
||||
"e2e_s": 1.5552692519850098,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 1.5471642419870477,
|
||||
"e2e_s": 1.547288056986872,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 1.5476988689915743,
|
||||
"e2e_s": 1.5476988689915743,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 1.5459686409740243,
|
||||
"e2e_s": 1.5460858139849734,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 1.548691272008,
|
||||
"e2e_s": 1.548691272008,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 1.5470000309869647,
|
||||
"e2e_s": 1.5471212370030116,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 1.5448924789961893,
|
||||
"e2e_s": 1.5448924789961893,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 1.5469720550172497,
|
||||
"e2e_s": 1.5470902820234187,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"32768": [
|
||||
{
|
||||
"ttft_s": 4.603710585011868,
|
||||
"e2e_s": 4.603829422005219,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 4.608500114001799,
|
||||
"e2e_s": 4.608500114001799,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 4.604015377000906,
|
||||
"e2e_s": 4.604134508001152,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 4.606255694001447,
|
||||
"e2e_s": 4.606392626999877,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 4.598494538018713,
|
||||
"e2e_s": 4.598494538018713,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 4.594119774992578,
|
||||
"e2e_s": 4.59425267498591,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 4.594318103016121,
|
||||
"e2e_s": 4.594462227018084,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 4.60644283000147,
|
||||
"e2e_s": 4.60644283000147,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
],
|
||||
"65536": [
|
||||
{
|
||||
"ttft_s": 15.276076515991008,
|
||||
"e2e_s": 15.276189939002506,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 15.275313204998383,
|
||||
"e2e_s": 15.275428983004531,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 15.274311708984897,
|
||||
"e2e_s": 15.274442903988529,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 15.28301460199873,
|
||||
"e2e_s": 15.283127713017166,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 15.185968648002017,
|
||||
"e2e_s": 15.185968648002017,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 15.177009812992765,
|
||||
"e2e_s": 15.177117884973995,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 15.181360610004049,
|
||||
"e2e_s": 15.181482268002583,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
},
|
||||
{
|
||||
"ttft_s": 15.18147566600237,
|
||||
"e2e_s": 15.18147566600237,
|
||||
"tier_observed": "miss",
|
||||
"expect": "miss",
|
||||
"d_gpu_hits": 0.0,
|
||||
"d_ext_hits": 0.0
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
40
v2/exp_a_tier_latency/results/pcie.json
Normal file
40
v2/exp_a_tier_latency/results/pcie.json
Normal file
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"device": "NVIDIA H20",
|
||||
"by_length": {
|
||||
"1024": {
|
||||
"kv_bytes": 100663296,
|
||||
"transfer_s": 0.001876260997960344,
|
||||
"bw_GBps": 53.65100916633112
|
||||
},
|
||||
"2048": {
|
||||
"kv_bytes": 201326592,
|
||||
"transfer_s": 0.003709116979734972,
|
||||
"bw_GBps": 54.27884671741612
|
||||
},
|
||||
"4096": {
|
||||
"kv_bytes": 402653184,
|
||||
"transfer_s": 0.007338636991335079,
|
||||
"bw_GBps": 54.86757070494469
|
||||
},
|
||||
"8192": {
|
||||
"kv_bytes": 805306368,
|
||||
"transfer_s": 0.01476299500791356,
|
||||
"bw_GBps": 54.548983290201164
|
||||
},
|
||||
"16384": {
|
||||
"kv_bytes": 1610612736,
|
||||
"transfer_s": 0.02972855800180696,
|
||||
"bw_GBps": 54.17729093695375
|
||||
},
|
||||
"32768": {
|
||||
"kv_bytes": 3221225472,
|
||||
"transfer_s": 0.059267577016726136,
|
||||
"bw_GBps": 54.35055107940257
|
||||
},
|
||||
"65536": {
|
||||
"kv_bytes": 6442450944,
|
||||
"transfer_s": 0.11847134301206097,
|
||||
"bw_GBps": 54.37982536708583
|
||||
}
|
||||
}
|
||||
}
|
||||
50
v2/exp_a_tier_latency/run.sh
Normal file
50
v2/exp_a_tier_latency/run.sh
Normal file
@@ -0,0 +1,50 @@
|
||||
#!/bin/bash
# Exp (a): three-tier hit-latency. Runs on dash0. One H20 (GPU $GPU).
# NOTE: `-e` is not set, so a failing command does not stop the script;
# failures that matter are checked explicitly.
set -uo pipefail
# PY, OUT and the driver paths below are all relative to this directory, so
# abort instead of continuing from the wrong place if the cd fails.
cd /home/admin/cpfs/wjh/agentic-kv || exit 1
PY=.venv/bin/python
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
GPU=${GPU:-0}        # value exported as CUDA_VISIBLE_DEVICES; override via env
PORT=${PORT:-8100}   # server port; override via env
EP=http://127.0.0.1:$PORT
OUT=v2/exp_a_tier_latency/results
mkdir -p "$OUT"

VLLM_PID=""          # PID of the running vLLM server; empty when none is up
|
||||
launch() {
  # Start the vLLM OpenAI-compatible API server in the background, then wait
  # for common.util.wait_healthy to succeed against $EP.
  #   $1  extra server flags passed as ONE string; expanded unquoted on purpose
  #       so it word-splits into separate arguments
  #   $2  file that receives the server's stdout and stderr
  # Sets VLLM_PID to the server's PID. Returns 1, after printing the last 30
  # log lines, when the health check fails.
  local extra_flags=$1 server_log=$2
  local -a base_flags=(
    --model "$MODEL"
    --host 0.0.0.0 --port $PORT --tensor-parallel-size 1 --trust-remote-code
    --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000
  )

  echo ">>> launch vllm: $extra_flags"
  CUDA_VISIBLE_DEVICES=$GPU VLLM_LOGGING_LEVEL=WARNING \
    $PY -m vllm.entrypoints.openai.api_server "${base_flags[@]}" $extra_flags \
    > "$server_log" 2>&1 &
  VLLM_PID=$!
  echo " pid=$VLLM_PID waiting for health..."

  # 900 is wait_healthy's second argument (presumably a timeout in seconds;
  # confirm in v2/common/util.py).
  if ! $PY -c "import sys; sys.path.insert(0,'v2'); from common.util import wait_healthy; sys.exit(0 if wait_healthy('$EP',900) else 1)"; then
    echo "LAUNCH FAILED"
    tail -30 "$server_log"
    return 1
  fi
  echo " healthy."
}
|
||||
teardown() {
  # Stop the vLLM server recorded in VLLM_PID and clear the variable.
  # Sends SIGTERM, polls once per second for up to 40 s, then escalates to
  # SIGKILL if the process is still alive.
  # No-op when VLLM_PID is empty, so the EXIT trap returns immediately after
  # an explicit teardown instead of sleeping for nothing.
  [ -n "$VLLM_PID" ] || return 0
  kill -TERM "$VLLM_PID" 2>/dev/null
  for _ in $(seq 1 40); do kill -0 "$VLLM_PID" 2>/dev/null || break; sleep 1; done
  if kill -0 "$VLLM_PID" 2>/dev/null; then
    # The process ignored SIGTERM for the whole grace period; re-sending
    # SIGTERM cannot help, so force-kill it to free the GPU for the next step.
    kill -KILL "$VLLM_PID" 2>/dev/null
  fi
  # Short pause before the caller continues (presumably to let the GPU be
  # released; confirm the required settle time).
  sleep 3; VLLM_PID=""
}
|
||||
# Make sure the server is stopped on any exit path, including `exit 1` below.
trap teardown EXIT

# ---- Config A1: big GPU pool, NO offload -> measure MISS + GPU hit ----
launch "--gpu-memory-utilization 0.9" "$OUT/vllm_a1.log" || exit 1
$PY v2/exp_a_tier_latency/driver.py --endpoint $EP --model "$MODEL" --mode miss --reps 8 --out "$OUT/miss.json"
$PY v2/exp_a_tier_latency/driver.py --endpoint $EP --model "$MODEL" --mode gpu --reps 8 --out "$OUT/gpu.json"
teardown

# ---- Config A2: small GPU pool (80k tok = 5000 blocks) + CPU offload 40GB -> CPU hit ----
# --disable-hybrid-kv-cache-manager matches the A2 launch in run_cpu.sh, which
# passes it for this same offload configuration.
launch "--num-gpu-blocks-override 5000 --kv-offloading-size 40 --kv-offloading-backend native --disable-hybrid-kv-cache-manager" "$OUT/vllm_a2.log" || exit 1
$PY v2/exp_a_tier_latency/driver.py --endpoint $EP --model "$MODEL" --mode cpu --reps 4 \
  --flood-tokens 88000 --flood-chunk 16384 --out "$OUT/cpu.json"
teardown

# ---- PCIe backstop (uses the now-free GPU) ----
CUDA_VISIBLE_DEVICES=$GPU $PY v2/exp_a_tier_latency/pcie_transfer.py --reps 20 --out "$OUT/pcie.json"

echo "=== exp (a) DONE ==="
|
||||
39
v2/exp_a_tier_latency/run_cpu.sh
Normal file
39
v2/exp_a_tier_latency/run_cpu.sh
Normal file
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Exp (a) CPU-tier + PCIe only (miss/gpu already done). HMA fix applied.
# NOTE: `-e` is not set, so a failing command does not stop the script;
# failures that matter are checked explicitly.
set -uo pipefail
# PY, OUT and the driver paths below are all relative to this directory, so
# abort instead of continuing from the wrong place if the cd fails.
cd /home/admin/cpfs/wjh/agentic-kv || exit 1
PY=.venv/bin/python
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
GPU=${GPU:-0}        # value exported as CUDA_VISIBLE_DEVICES; override via env
PORT=${PORT:-8100}   # server port; override via env
EP=http://127.0.0.1:$PORT
OUT=v2/exp_a_tier_latency/results
mkdir -p "$OUT"

VLLM_PID=""          # PID of the running vLLM server; empty when none is up
|
||||
teardown() {
  # Stop the vLLM server recorded in VLLM_PID and clear the variable.
  # Sends SIGTERM, polls once per second for up to 40 s, then escalates to
  # SIGKILL if the process is still alive, so the PCIe benchmark that follows
  # does not start while the server still holds the GPU.
  # No-op when VLLM_PID is empty, so the EXIT trap returns immediately after
  # an explicit teardown instead of sleeping for nothing.
  [ -n "$VLLM_PID" ] || return 0
  kill -TERM "$VLLM_PID" 2>/dev/null
  for _ in $(seq 1 40); do kill -0 "$VLLM_PID" 2>/dev/null || break; sleep 1; done
  if kill -0 "$VLLM_PID" 2>/dev/null; then
    # SIGTERM was not honoured within the grace period; force-kill.
    kill -KILL "$VLLM_PID" 2>/dev/null
  fi
  # Short pause before the caller continues (presumably to let the GPU be
  # released; confirm the required settle time).
  sleep 3; VLLM_PID=""
}
|
||||
# Make sure the server is stopped on any exit path, including `exit 1` below.
trap teardown EXIT

# Config A2: GPU block count overridden to 5000, native KV offloading with
# size 40, and the hybrid KV cache manager disabled.
A2_LOG="$OUT/vllm_a2.log"
A2_FLAGS=(
  --model "$MODEL"
  --host 0.0.0.0 --port $PORT --tensor-parallel-size 1 --trust-remote-code
  --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000
  --num-gpu-blocks-override 5000 --kv-offloading-size 40 --kv-offloading-backend native
  --disable-hybrid-kv-cache-manager
)

echo ">>> launch A2: small pool + CPU offload (HMA disabled)"
CUDA_VISIBLE_DEVICES=$GPU VLLM_LOGGING_LEVEL=WARNING \
  $PY -m vllm.entrypoints.openai.api_server "${A2_FLAGS[@]}" > "$A2_LOG" 2>&1 &
VLLM_PID=$!
echo " pid=$VLLM_PID waiting for health..."

# 900 is wait_healthy's second argument (presumably a timeout in seconds;
# confirm in v2/common/util.py). On failure, show the log tail and bail out;
# the EXIT trap stops the server.
if ! $PY -c "import sys; sys.path.insert(0,'v2'); from common.util import wait_healthy; sys.exit(0 if wait_healthy('$EP',900) else 1)"; then
  echo "LAUNCH FAILED"
  tail -25 "$A2_LOG"
  exit 1
fi
echo " healthy."

# CPU-tier measurement against the running server.
$PY v2/exp_a_tier_latency/driver.py --endpoint $EP --model "$MODEL" --mode cpu --reps 4 \
  --flood-tokens 88000 --flood-chunk 16384 --out "$OUT/cpu.json"
teardown

# PCIe transfer benchmark, run after the server has been torn down.
CUDA_VISIBLE_DEVICES=$GPU $PY v2/exp_a_tier_latency/pcie_transfer.py --reps 20 --out "$OUT/pcie.json"
echo "=== exp (a) CPU+PCIe DONE ==="
|
||||
BIN
v2/figs/exp_a_tier_latency.png
Normal file
BIN
v2/figs/exp_a_tier_latency.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 81 KiB |
Reference in New Issue
Block a user