Same-condition comparison (both fresh restart, same trace, same params):
Baseline (combined): TTFT=2.383/27.622, TPOT90=0.117, E2E=10.232
Elastic P2P (cap=4): TTFT=1.315/13.179, TPOT90=0.075, E2E=5.708
Delta: TTFT -45% / -52%, TPOT90 -36%, E2E -44%
Key finding: TPOT p90 dropped 36%, confirming that heavy prefill DOES disrupt decode in combined mode and that elastic offload effectively isolates it. Previous comparisons missed this because the baselines were run under different conditions (stale instances, different time_scale).
GPU util: elastic uses less GPU (15.8% vs 28.7%) but achieves better latency — higher efficiency through better cache distribution.
APC: elastic has a more balanced per-instance APC (36-38% prefix + 30-35% external) vs the baseline's skewed distribution (3.8% - 68.3%).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
102 lines
3.2 KiB
Python
102 lines
3.2 KiB
Python
"""Plot per-GPU utilization timeline for elastic vs baseline."""
|
|
import csv, json, sys, os
|
|
|
|
def load_gpu(path):
    """Load a GPU utilization CSV and group its samples by GPU.

    The file must have a header row naming at least the ``gpu``,
    ``timestamp`` and ``util_pct`` columns.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A dict ``{gpu_id: [(timestamp, util_pct), ...]}`` where ``gpu_id`` is
        an int and both tuple members are floats. Timestamps are shifted so
        that the earliest sample in the whole file is at 0. Samples keep the
        order in which they appear in the file. A file without data rows
        yields an empty dict.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a required column is missing.
        ValueError: If a field cannot be parsed as a number.
    """
    by_gpu = {}
    # newline="" lets the csv module do its own line-ending handling.
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            sample = (float(row["timestamp"]), float(row["util_pct"]))
            by_gpu.setdefault(int(row["gpu"]), []).append(sample)

    if not by_gpu:
        return by_gpu

    # Normalize timestamps to start at 0. Scan every sample rather than only
    # the first row per GPU, so rows that are not in chronological order
    # cannot produce a wrong origin (and negative offsets).
    t0 = min(t for pts in by_gpu.values() for t, _ in pts)
    return {
        gpu: [(t - t0, u) for t, u in pts]
        for gpu, pts in by_gpu.items()
    }
|
|
|
|
def _util_glyph(avg):
    """Map an average utilization percentage to its one-character glyph."""
    if avg < 1:
        return " "
    if avg < 10:
        return "."
    if avg < 30:
        return "o"
    if avg < 60:
        return "O"
    return "#"


def print_timeline(by_gpu, label, max_time=None):
    """Print an ASCII timeline of GPU utilization, one row per GPU.

    Args:
        by_gpu: Dict ``{gpu_id: [(timestamp, util_pct), ...]}``; timestamps
            are expected to start at 0 and every list to be non-empty.
        label: Title printed in the banner above the timeline.
        max_time: Time shown at the right edge of the timeline. Defaults to
            the largest timestamp found in ``by_gpu``.

    Each column averages the samples that fall into one time window. Windows
    are 10 wide in timestamp units; when that would need more than 40 columns
    the window is widened so that 40 columns cover the whole ``max_time``
    span. Windows without samples count as 0. After the timeline, the plain
    per-GPU sample mean, the share of non-zero samples and the sample count
    are printed.
    """
    print(f"\n{'='*70}")
    print(f" {label}")
    print(f"{'='*70}")

    if not by_gpu:
        print(" No data")
        return

    max_columns = 40
    if max_time is None:
        max_time = max(t for pts in by_gpu.values() for t, _ in pts)

    # Bucket into 10s windows; stretch the window for long traces so the tail
    # is spread over the columns instead of being piled into the last one.
    window = max(10.0, max_time / max_columns)
    # At least one bucket, even for a zero or negative explicit max_time.
    n_windows = max(1, min(int(max_time / window) + 1, max_columns))

    for gpu in sorted(by_gpu):
        buckets = [[] for _ in range(n_windows)]
        for t, u in by_gpu[gpu]:
            # Clamp so samples outside [0, max_time] land in an edge bucket.
            idx = min(max(int(t / window), 0), n_windows - 1)
            buckets[idx].append(u)

        avgs = [sum(b) / len(b) if b else 0 for b in buckets]
        bar = "".join(_util_glyph(a) for a in avgs)
        mean = sum(avgs) / len(avgs)
        print(f" GPU{gpu}: |{bar}| mean={mean:.0f}%")

    # A negative width in a format spec raises ValueError for strings, which
    # used to crash traces with fewer than 6 columns; build the gap by hand
    # and keep at least one space between the two axis labels.
    pad = " " * max(n_windows - 6, 1)
    print(f" Time: 0{pad}{int(max_time)}s")
    print(f" Legend: ' '=0% .=1-10% o=10-30% O=30-60% #=60-100%")

    # Per-GPU stats
    print(f"\n Per-GPU mean utilization:")
    for gpu in sorted(by_gpu):
        vals = [u for _, u in by_gpu[gpu]]
        mean = sum(vals) / len(vals)
        nz = sum(1 for v in vals if v > 0)
        print(f" GPU{gpu}: mean={mean:.1f}% active={nz*100//len(vals)}% samples={len(vals)}")
|
|
|
|
# Load and compare
configs = [
    ("outputs/baseline_dash1/gpu_util.csv", "Baseline (8 combined, dash1)"),
    ("outputs/elastic_v4/gpu_util.csv", "Elastic P2P v4 (dash0)"),
]

# Parse every CSV once and keep the result, so the imbalance section below
# reuses the same data instead of re-checking and re-reading each file.
loaded = []
for path, label in configs:
    if os.path.exists(path):
        by_gpu = load_gpu(path)
        loaded.append((label, by_gpu))
        print_timeline(by_gpu, label)
    else:
        print(f"\n {label}: {path} NOT FOUND")

# Imbalance metric: ratio of the busiest GPU's mean utilization to the
# idlest one's. Missing configs were already reported above and are skipped.
print(f"\n{'='*70}")
print(f" LOAD IMBALANCE ANALYSIS")
print(f"{'='*70}")

for label, by_gpu in loaded:
    means = [
        sum(u for _, u in pts) / len(pts)
        for _, pts in sorted(by_gpu.items())
    ]
    if not means:
        continue
    avg = sum(means) / len(means)
    max_m = max(means)
    min_m = min(means)
    # Floor the denominator at 0.1% so a fully idle GPU cannot divide by zero.
    imbalance = max_m / max(min_m, 0.1)
    print(f" {label}:")
    print(f" Per-GPU means: {['%.1f' % m for m in means]}")
    print(f" Avg={avg:.1f}% Min={min_m:.1f}% Max={max_m:.1f}% Imbalance={imbalance:.1f}x")
|