D2: run_benchmark.sh and run_experiments.sh still pass --time-scale and --max-inflight-sessions to the replayer, but those flags were removed when the project moved to trace-driven dispatch. The scripts cannot run as-is. D3: ~25 ad-hoc analyze_* / compare_* / profile_* / final_* scripts and a handful of single-experiment run_*.sh point at /home/admin/cpfs paths, deleted output directories, or a sampled trace file that no longer exists. Keep them in scripts/legacy/ for historical reference; the scripts that remain in scripts/ (analyze_trace, analyze_breakdown, analyze_cache_hit, analyze_eviction, compare_results, compute_roofline, sample_trace, analyze_agentic_patterns, simulate_cache_policies, plus launch_*.sh, gpu_monitor.sh, bench.sh) cover the current workflow. Adds scripts/legacy/README.md to document the archival policy.
88 lines
3.7 KiB
Python
88 lines
3.7 KiB
Python
"""Deep profile: why fire-and-forget TTFT is 5x worse than await."""
|
|
import json, statistics
|
|
|
|
# Default metrics files compared by this script (one JSON object per line).
AWAIT_METRICS = "outputs/gpu_ab_6p2d/metrics.jsonl"
FNF_METRICS = "outputs/gpu_ab_6p2d_fnf/metrics.jsonl"

# Only the first N per-request rows (smallest inputs) are printed.
MAX_DETAIL_ROWS = 25

# (inclusive lower bound, exclusive upper bound, label) on input_length.
# The last bucket is unbounded so very large inputs are not silently dropped.
INPUT_BUCKETS = [
    (0, 5000, "<5k"),
    (5000, 20000, "5-20k"),
    (20000, 50000, "20-50k"),
    (50000, float("inf"), ">50k"),
]


def load_rows(path):
    """Read a JSONL metrics file and return its records as a list of dicts.

    The file is closed as soon as it has been read, and blank lines
    (e.g. a trailing newline at end of file) are skipped instead of being
    handed to ``json.loads``.
    """
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def _percentile(sorted_vals, q):
    """Return the element at fraction *q* of an already-sorted, non-empty list."""
    idx = min(int(q * len(sorted_vals)), len(sorted_vals) - 1)
    return sorted_vals[idx]


def _pair_requests(await_rows, fnf_rows):
    """Match error-free rows of both runs by ``request_id``.

    Returns ``(common_count, diffs)``: the number of request ids that are
    error-free in both runs, and one comparison dict per pair whose
    ``ttft_s`` is truthy in both rows and positive in the await row.
    ``diffs`` is sorted by ``input_length``.
    """
    await_by_id = {r["request_id"]: r for r in await_rows if not r.get("error")}
    fnf_by_id = {r["request_id"]: r for r in fnf_rows if not r.get("error")}
    common_count = len(await_by_id.keys() & fnf_by_id.keys())

    diffs = []
    # Walk the await rows in file order (rather than iterating a set) so
    # that rows with equal input_length come out in a reproducible order.
    for rid, a in await_by_id.items():
        f = fnf_by_id.get(rid)
        if f is None:
            continue
        if not (a.get("ttft_s") and f.get("ttft_s") and a["ttft_s"] > 0):
            continue
        diffs.append({
            "id": rid,
            "input": a["input_length"],
            "a_ttft": a["ttft_s"],
            "f_ttft": f["ttft_s"],
            "ratio": f["ttft_s"] / a["ttft_s"],
            "a_e2e": a["latency_s"],
            "f_e2e": f["latency_s"],
            # `or 0` also covers a key that is present but null, which a
            # plain .get(key, 0) would pass through as None.
            "a_tpot": a.get("tpot_s") or 0,
            "f_tpot": f.get("tpot_s") or 0,
            "a_out": a.get("actual_output_tokens") or 0,
            "f_out": f.get("actual_output_tokens") or 0,
        })

    diffs.sort(key=lambda d: d["input"])
    return common_count, diffs


def _ratio_summary(diffs):
    """Return report lines for ratio percentiles, input buckets and TPOT.

    *diffs* must be non-empty.
    """
    out = []

    ratios = sorted(d["ratio"] for d in diffs)
    out.append("\n TTFT ratio (FnF / Await):")
    out.append(" p10=%.2fx p50=%.2fx p90=%.2fx mean=%.2fx" % (
        _percentile(ratios, .1), _percentile(ratios, .5),
        _percentile(ratios, .9), statistics.fmean(ratios)))

    faster = sum(1 for r in ratios if r < 1.0)
    out.append(" FnF faster: %d/%d (%.0f%%)" % (
        faster, len(ratios), faster * 100 / len(ratios)))

    # Bucket by input size; empty buckets are omitted.
    out.append("\n TTFT ratio by input size bucket:")
    for lo, hi, label in INPUT_BUCKETS:
        subset = [d for d in diffs if lo <= d["input"] < hi]
        if not subset:
            continue
        out.append(" %6s: n=%3d await_TTFT=%.3f fnf_TTFT=%.3f ratio=%.2fx" % (
            label, len(subset),
            statistics.fmean(d["a_ttft"] for d in subset),
            statistics.fmean(d["f_ttft"] for d in subset),
            statistics.fmean(d["ratio"] for d in subset)))

    # TPOT comparison: only positive values count, and the section is
    # skipped unless both runs have at least one.
    a_tpots = sorted(d["a_tpot"] for d in diffs if d["a_tpot"] > 0)
    f_tpots = sorted(d["f_tpot"] for d in diffs if d["f_tpot"] > 0)
    if a_tpots and f_tpots:
        out.append("\n TPOT comparison:")
        out.append(" Await: mean=%.4f p50=%.4f" % (
            statistics.fmean(a_tpots), a_tpots[len(a_tpots) // 2]))
        out.append(" FnF: mean=%.4f p50=%.4f" % (
            statistics.fmean(f_tpots), f_tpots[len(f_tpots) // 2]))

    return out


def build_report(await_rows, fnf_rows):
    """Build the fire-and-forget vs await-prefill comparison report.

    Args:
        await_rows: metric records (dicts) from the await-prefill run.
        fnf_rows: metric records (dicts) from the fire-and-forget run.

    Returns:
        The report as a list of strings, one per ``print`` call; entries
        starting with a newline begin a new section.
    """
    common_count, diffs = _pair_requests(await_rows, fnf_rows)

    lines = [
        "=" * 75,
        " PROFILE: Fire-and-Forget vs Await-Prefill (same 6P+2D instances)",
        "=" * 75,
        f" Common requests: {common_count}",
    ]

    # Per-request comparison table.
    lines.append("\n Per-request (sorted by input_length):")
    hdr = "%8s %10s %10s %7s %10s %10s %8s %8s" % (
        "input", "await_TTFT", "fnf_TTFT", "ratio",
        "await_E2E", "fnf_E2E", "a_TPOT", "f_TPOT")
    lines.append(" " + hdr)
    lines.append(" " + "-" * len(hdr))
    for d in diffs[:MAX_DETAIL_ROWS]:
        lines.append(" %8d %10.3f %10.3f %6.1fx %10.3f %10.3f %8.4f %8.4f" % (
            d["input"], d["a_ttft"], d["f_ttft"], d["ratio"],
            d["a_e2e"], d["f_e2e"], d["a_tpot"], d["f_tpot"]))

    # Aggregate statistics need at least one matched pair.
    if diffs:
        lines.extend(_ratio_summary(diffs))

    # Requests that errored in the FnF run but not in the await run.
    await_err_ids = {r["request_id"] for r in await_rows if r.get("error")}
    fnf_only_err = [
        r for r in fnf_rows
        if r.get("error") and r["request_id"] not in await_err_ids
    ]
    lines.append("\n Errors unique to FnF: %d" % len(fnf_only_err))
    for r in fnf_only_err[:5]:
        # str() keeps the slice safe when the error is not a plain string.
        lines.append(" input=%d err=%s" % (r["input_length"], str(r["error"])[:60]))

    return lines


def main(await_path=AWAIT_METRICS, fnf_path=FNF_METRICS):
    """Load both metrics files and print the comparison report."""
    for line in build_report(load_rows(await_path), load_rows(fnf_path)):
        print(line)


if __name__ == "__main__":
    main()