Fixed offload decision: removed p>=d gate (was blocking all offloads), added MAX_OFFLOAD_INFLIGHT=4 cap and p_saturated threshold.

Result (200 req, fresh restart):
  Baseline: 99% success, TTFT=1.080/9.410 (p50/p90), TPOT90=0.076, E2E=5.306
  Elastic:  96% success, TTFT=0.946/15.843 (p50/p90), TPOT90=0.077, E2E=5.717

Architectural tradeoff confirmed:
- Median (p50) improves: D instances not disrupted by heavy prefill
- Tail (p90) worsens: offloaded HEAVY requests pay KV transfer cost
- TPOT unchanged: decode isolation is not the bottleneck

To improve p90: need layerwise pipelined KV transfer (overlap with prefill compute) or smarter offload gating that avoids offloading the very largest requests (which have the longest prefill time and generate the most KV).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
57 lines
2.2 KiB
Python
57 lines
2.2 KiB
Python
"""Compare elastic v4 (cap=4, relaxed conditions) vs baseline."""
|
|
import json, os
|
|
|
|
def s(path):
    """Summarize one metrics JSONL file into headline statistics.

    Every non-blank line of *path* is parsed as one JSON object. A row whose
    ``"error"`` value is truthy counts as a failure; every other row counts
    as a success.

    Returns a dict with these keys:
        ok, n        -- number of successful rows / total rows
        t50, t90     -- p50 / p90 of ``ttft_s`` over successful rows
                        (rows with a missing or falsy ``ttft_s`` are skipped)
        p50, p90     -- p50 / p90 of ``tpot_s`` over successful rows
                        (only values > 0 are kept)
        e50          -- p50 of ``latency_s`` over successful rows
        inp50, inp90 -- p50 / p90 of ``input_length`` over successful rows
        err_inp50    -- p50 of ``input_length`` over failed rows

    A percentile over an empty sample is reported as 0.

    Raises:
        OSError: if *path* cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a successful row lacks ``latency_s`` or any row lacks
            ``input_length``.
    """
    # Use a context manager so the handle is closed promptly, and skip blank
    # lines (e.g. a trailing empty line) instead of crashing in json.loads.
    with open(path, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]

    def pct(values, q):
        # Index-based percentile on an already-sorted list: take the element
        # at floor(q * len), clamped to the last element. Empty input -> 0.
        if not values:
            return 0
        return values[min(int(q * len(values)), len(values) - 1)]

    ok = [r for r in rows if not r.get("error")]
    failed = [r for r in rows if r.get("error")]

    ttfts = sorted(r["ttft_s"] for r in ok if r.get("ttft_s"))
    tpots = sorted(r["tpot_s"] for r in ok if r.get("tpot_s") and r["tpot_s"] > 0)
    lats = sorted(r["latency_s"] for r in ok)
    ok_inp = sorted(r["input_length"] for r in ok)
    err_inp = sorted(r["input_length"] for r in failed)

    return {
        "ok": len(ok),
        "n": len(rows),
        "t50": pct(ttfts, 0.5),
        "t90": pct(ttfts, 0.9),
        "p50": pct(tpots, 0.5),
        "p90": pct(tpots, 0.9),
        "e50": pct(lats, 0.5),
        "inp50": pct(ok_inp, 0.5),
        "inp90": pct(ok_inp, 0.9),
        "err_inp50": pct(err_inp, 0.5),
    }
|
|
|
|
# Report: one summary row per run whose metrics file exists, followed by the
# elastic-vs-baseline deltas when both runs are available.
print("ELASTIC P2P v4 vs BASELINE (both 200 req)")
print("=" * 80)
fmt = "%-32s %7s %8s %8s %8s %8s %8s %8s"
print(fmt % ("Config", "OK/N", "TTFT50", "TTFT90", "TPOT90", "E2E50", "inp_p50", "err_inp"))
print("-" * 80)

# (metrics path, display label). Order matters: baseline first, elastic
# second; the delta section below relies on that insertion order.
configs = [
    ("outputs/baseline_dash1/metrics.jsonl", "Baseline (8 combined, dash1)"),
    ("outputs/elastic_v4/metrics.jsonl", "Elastic P2P (cap=4, dash0)"),
]
results = {}
for path, label in configs:
    # Runs whose metrics file is absent are left out of the table.
    if not os.path.exists(path):
        continue
    r = s(path)
    results[label] = r
    print(fmt % (label, "%d/%d" % (r["ok"], r["n"]),
                 "%.3f" % r["t50"], "%.3f" % r["t90"], "%.3f" % r["p90"],
                 "%.3f" % r["e50"], str(r["inp50"]), str(r["err_inp50"])))

# Deltas are only meaningful when both runs produced a summary.
if len(results) == 2:
    # dicts preserve insertion order, so this is (baseline, elastic).
    b, a = results.values()
    print()
    print("DELTA (Elastic vs Baseline):")
    for label, bv, av in [
        ("TTFT p50", b["t50"], a["t50"]),
        ("TTFT p90", b["t90"], a["t90"]),
        ("TPOT p90", b["p90"], a["p90"]),
        ("E2E p50", b["e50"], a["e50"]),
    ]:
        # Relative change in percent; reported as 0 when the baseline value
        # is not positive, to avoid dividing by zero.
        d = (av / bv - 1) * 100 if bv > 0 else 0
        print(" %s: %.3f -> %.3f (%+.1f%%)" % (label, bv, av, d))
    # A metrics file that exists but holds no rows gives n == 0; report a
    # 0.0% success rate in that case rather than raising ZeroDivisionError.
    b_rate = b["ok"] * 100 / b["n"] if b["n"] else 0.0
    a_rate = a["ok"] * 100 / a["n"] if a["n"] else 0.0
    print(" Success: %d/%d (%.1f%%) -> %d/%d (%.1f%%)" % (
        b["ok"], b["n"], b_rate,
        a["ok"], a["n"], a_rate))
    print(" Input coverage p50: %s -> %s (bias check)" % (b["inp50"], a["inp50"]))
|