Critical:
- cache_aware_proxy: _handle_pd_sep leaked p_inst.num_requests (never decremented) and never managed d_inst.num_requests
- cache_aware_proxy: fix media_type from application/json to text/event-stream for the SSE stream

High:
- b3_sweep/b3_isolated_policy/b3_analyze: replace hardcoded /home/admin/cpfs/wjh/ ROOT with script-relative $(dirname "$0")/..
- b3_analyze: replace hardcoded 8-port WORKER_MAP with dynamic generation from BASE_PORT and N_INSTANCES

Medium:
- analyze_breakdown: warn on stderr when records are skipped (was silent)
- deploy_vllm_patches: fail fast on SSH/SCP errors instead of continuing with an empty VENV_SITE
- pyproject.toml: declare fastapi and uvicorn as runtime dependencies
- launch_elastic_p2p: kill EngineCore and proxy in the trap handler to prevent GPU memory leaks on exit
60 lines
2.0 KiB
Python
60 lines
2.0 KiB
Python
"""Analyze per-request breakdown data from the proxy."""
|
|
import json, statistics, sys
|
|
|
|
# Source used when no URL/path is given on the command line.
DEFAULT_SOURCE = "http://localhost:9090/breakdown"

# Fields a record must carry, with non-null values, before its stage
# durations can be derived.  "input_length" is part of the check because
# every output row reads it; without it a record holding all timestamps but
# no length would abort the whole run with KeyError.
REQUIRED_KEYS = (
    "input_length",
    "t_proxy_recv",
    "t_prefill_sent",
    "t_prefill_done",
    "t_decode_sent",
    "t_first_token",
)

# Stage durations that together make up the time to first token.
STAGE_KEYS = ("prefill", "proxy_gap", "kv_decode")

# The per-request table only shows this many rows (smallest inputs first).
MAX_TABLE_ROWS = 25


def load_records(source):
    """Load the JSON breakdown payload from a URL or a local file.

    Args:
        source: A string. If it starts with "http" it is fetched over the
            network (10 second timeout); otherwise it is opened as a file
            path.

    Returns:
        The decoded JSON document.
    """
    if source.startswith("http"):
        import urllib.request

        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(source, timeout=10) as response:
            return json.loads(response.read())
    # Binary mode lets the json module detect the encoding itself, and the
    # "with" block guarantees the file handle is closed.
    with open(source, "rb") as handle:
        return json.load(handle)


def build_breakdown(records):
    """Derive per-stage durations for every complete record.

    A record is complete when it is a dict in which every key of
    REQUIRED_KEYS is present and not None.  Anything else is counted as
    skipped rather than raising.

    Args:
        records: Iterable of decoded JSON records.

    Returns:
        A ``(rows, skipped)`` tuple.  ``rows`` is a list of dicts with the
        keys "input", "prefill", "proxy_gap", "kv_decode" and "ttft", sorted
        by "input" ascending; ``skipped`` is the number of incomplete
        records.
    """
    rows = []
    skipped = 0
    for rec in records:
        incomplete = not isinstance(rec, dict) or any(
            rec.get(key) is None for key in REQUIRED_KEYS
        )
        if incomplete:
            skipped += 1
            continue
        rows.append({
            "input": rec["input_length"],
            "prefill": rec["t_prefill_done"] - rec["t_prefill_sent"],
            "proxy_gap": rec["t_decode_sent"] - rec["t_prefill_done"],
            "kv_decode": rec["t_first_token"] - rec["t_decode_sent"],
            "ttft": rec["t_first_token"] - rec["t_proxy_recv"],
        })
    rows.sort(key=lambda row: row["input"])
    return rows, skipped


def percentile(sorted_vals, q):
    """Pick the element at fraction ``q`` of an ascending, non-empty list.

    The index is ``int(q * len)`` clamped to the last element, so no
    interpolation is performed.
    """
    last = len(sorted_vals) - 1
    return sorted_vals[min(int(q * len(sorted_vals)), last)]


def main(argv=None):
    """Print the breakdown report; return the process exit code.

    Args:
        argv: Command-line arguments without the program name.  Defaults to
            ``sys.argv[1:]``.  The first argument, if any, is the URL or
            file path to read; otherwise DEFAULT_SOURCE is used.
    """
    args = sys.argv[1:] if argv is None else argv
    source = args[0] if args else DEFAULT_SOURCE

    data = load_records(source)
    print("Total records: %d" % len(data))

    rows, skipped = build_breakdown(data)
    print("Complete breakdown: %d" % len(rows))
    if skipped:
        # Goes to stderr so it does not pollute the report on stdout.
        print("WARNING: %d records skipped (missing breakdown timestamps)" % skipped,
              file=sys.stderr)

    if not rows:
        print("No complete records yet")
        return 0

    # Per-request table.
    print()
    print(" %8s %9s %9s %9s %9s" % ("input", "prefill", "proxy", "kv+dec", "TTFT"))
    print(" %8s %9s %9s %9s %9s" % ("-----", "-------", "-----", "------", "----"))
    for row in rows[:MAX_TABLE_ROWS]:
        print(" %8d %9.3f %9.3f %9.3f %9.3f" % (
            row["input"], row["prefill"], row["proxy_gap"], row["kv_decode"], row["ttft"]))

    # Distribution summary for each stage and for TTFT itself.
    print()
    for key in STAGE_KEYS + ("ttft",):
        vals = sorted(row[key] for row in rows)
        print(" %s: p50=%.3fs p90=%.3fs mean=%.3fs" % (
            key, percentile(vals, .5), percentile(vals, .9), statistics.fmean(vals)))

    # Fraction of TTFT by stage
    print()
    print(" TTFT breakdown (fraction of total):")
    for key in STAGE_KEYS:
        # Rows with a TTFT of 0.01 or less are left out of the ratio.
        fracs = [row[key] / row["ttft"] * 100 for row in rows if row["ttft"] > 0.01]
        if fracs:
            print(" %s: mean=%.1f%% of TTFT" % (key, statistics.fmean(fracs)))
    return 0


if __name__ == "__main__":
    sys.exit(main())