Breakdown profiling at proxy level captures: t_proxy_recv → t_prefill_sent → t_prefill_done → t_decode_sent → t_first_token Key finding: 87.7% of TTFT is spent in kv+decode phase, NOT prefill. Root cause: decode instance KV cache memory saturation (97.1% usage). With 6P+2D config, 2 decode GPUs have only ~56GB total KV cache. Large agentic requests (avg 33.6k tokens) fill this quickly. Small requests (49 tokens, prefill=0.044s) wait 114s for KV cache to be freed by large requests completing decode. vLLM log confirms: Running=0, Waiting=6, KV cache=97.1% GPU is idle but requests queue for KV cache memory, not compute. This is the fundamental bottleneck of single-machine PD separation for long-context agentic workloads: concentrating decode onto fewer GPUs creates a KV cache memory wall. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
55 lines
1.9 KiB
Python
55 lines
1.9 KiB
Python
"""Analyze per-request breakdown data from the proxy."""
|
|
import json, statistics, sys
|
|
|
|
url = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:9090/breakdown"
|
|
|
|
if url.startswith("http"):
|
|
import urllib.request
|
|
data = json.loads(urllib.request.urlopen(url, timeout=10).read())
|
|
else:
|
|
data = json.load(open(url))
|
|
|
|
print("Total records: %d" % len(data))
|
|
|
|
results = []
|
|
for d in data:
|
|
keys = ["t_proxy_recv", "t_prefill_sent", "t_prefill_done", "t_decode_sent", "t_first_token"]
|
|
if not all(k in d for k in keys):
|
|
continue
|
|
results.append({
|
|
"input": d["input_length"],
|
|
"prefill": d["t_prefill_done"] - d["t_prefill_sent"],
|
|
"proxy_gap": d["t_decode_sent"] - d["t_prefill_done"],
|
|
"kv_decode": d["t_first_token"] - d["t_decode_sent"],
|
|
"ttft": d["t_first_token"] - d["t_proxy_recv"],
|
|
})
|
|
|
|
results.sort(key=lambda x: x["input"])
|
|
print("Complete breakdown: %d" % len(results))
|
|
|
|
if not results:
|
|
print("No complete records yet")
|
|
sys.exit(0)
|
|
|
|
print()
|
|
print(" %8s %9s %9s %9s %9s" % ("input", "prefill", "proxy", "kv+dec", "TTFT"))
|
|
print(" %8s %9s %9s %9s %9s" % ("-----", "-------", "-----", "------", "----"))
|
|
for r in results[:25]:
|
|
print(" %8d %9.3f %9.3f %9.3f %9.3f" % (
|
|
r["input"], r["prefill"], r["proxy_gap"], r["kv_decode"], r["ttft"]))
|
|
|
|
print()
|
|
for key in ["prefill", "proxy_gap", "kv_decode", "ttft"]:
|
|
vals = sorted([r[key] for r in results])
|
|
p = lambda q: vals[min(int(q * len(vals)), len(vals) - 1)]
|
|
print(" %s: p50=%.3fs p90=%.3fs mean=%.3fs" % (
|
|
key, p(.5), p(.9), statistics.fmean(vals)))
|
|
|
|
# Fraction of TTFT by stage
|
|
print()
|
|
print(" TTFT breakdown (fraction of total):")
|
|
for key in ["prefill", "proxy_gap", "kv_decode"]:
|
|
fracs = [r[key] / r["ttft"] * 100 for r in results if r["ttft"] > 0.01]
|
|
if fracs:
|
|
print(" %s: mean=%.1f%% of TTFT" % (key, statistics.fmean(fracs)))
|