Files
agentic-kvc/scripts/plot_gpu_timeline.py
Gahow Wang 1e8628581b Fair A/B: Elastic P2P wins on ALL metrics vs baseline (fresh restart)
Same-condition comparison (both fresh restart, same trace, same params):
  Baseline (combined):  TTFT=2.383/27.622  TPOT90=0.117  E2E=10.232
  Elastic P2P (cap=4):  TTFT=1.315/13.179  TPOT90=0.075  E2E=5.708
  Delta:                TTFT=-45%/-52%     TPOT90=-36%   E2E=-44%

Key finding: TPOT p90 dropped 36% — confirming heavy prefill DOES
disrupt decode in combined mode, and elastic offload effectively
isolates it. Previous comparisons missed this because baselines
were run under different conditions (stale instances, different time_scale).

GPU util: elastic uses less GPU (15.8% vs 28.7%) but achieves better
latency — higher efficiency through better cache distribution.

APC: elastic has more balanced per-instance APC (36-38% prefix + 30-35%
external) vs baseline's skewed distribution (3.8% - 68.3%).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 15:48:51 +08:00

102 lines
3.2 KiB
Python

"""Plot per-GPU utilization timeline for elastic vs baseline."""
import csv, json, sys, os
def load_gpu(path):
    """Load a GPU-utilization CSV and group its samples by GPU.

    The file must start with a header row naming the columns ``gpu``,
    ``timestamp`` and ``util_pct``; they are parsed as int, float and float
    respectively.

    Args:
        path: Path of the CSV file to read.

    Returns:
        ``{gpu_id: [(seconds_since_start, util_pct), ...]}`` with each GPU's
        samples kept in file order.  Timestamps are shifted so that the
        earliest sample in the whole file sits at 0.  A file with no data
        rows yields ``{}``.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if one of the required columns is missing.
        ValueError: if a field cannot be parsed as a number.
    """
    by_gpu = {}
    # The csv module asks for newline="" so that it can do its own
    # line-ending handling.
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            by_gpu.setdefault(int(row["gpu"]), []).append(
                (float(row["timestamp"]), float(row["util_pct"]))
            )
    if not by_gpu:
        return by_gpu
    # Use the true minimum over every sample rather than each GPU's first
    # row: rows are not guaranteed to be time-ordered, and a too-late origin
    # would produce negative relative timestamps.
    t0 = min(t for pts in by_gpu.values() for t, _ in pts)
    return {g: [(t - t0, u) for t, u in pts] for g, pts in by_gpu.items()}
def _util_glyph(avg):
    """Map an average utilization percentage to its one-character bar glyph."""
    if avg < 1:
        return " "
    if avg < 10:
        return "."
    if avg < 30:
        return "o"
    if avg < 60:
        return "O"
    return "#"


def print_timeline(by_gpu, label, max_time=None):
    """Print an ASCII utilization timeline, one row per GPU, plus summary stats.

    Each row is a bar with one character per time window; the character
    encodes the average utilization of the samples that fall in that window
    (windows without samples count as 0).  A per-GPU summary computed over
    the raw samples follows the bars.

    Args:
        by_gpu: ``{gpu_id: [(seconds, util_pct), ...]}``.
        label: Heading printed above the timeline.
        max_time: Right edge of the time axis in seconds.  Defaults to the
            largest timestamp found in ``by_gpu``.

    Returns:
        None.  All output goes to stdout.
    """
    print(f"\n{'='*70}")
    print(f" {label}")
    print(f"{'='*70}")
    if not by_gpu:
        print(" No data")
        return

    window = 10.0  # default bucket width in seconds
    max_columns = 40  # keep every bar within a terminal line
    if max_time is None:
        max_time = max(
            (t for pts in by_gpu.values() for t, _ in pts), default=0.0
        )
    n_windows = max(min(int(max_time / window) + 1, max_columns), 1)
    # When the column cap applies, widen the windows so the bar still spans
    # [0, max_time] evenly; otherwise every sample past 400 s would be
    # averaged into the last column while the axis claims the full range.
    window = max(window, max_time / n_windows)

    for gpu in sorted(by_gpu):
        buckets = [[] for _ in range(n_windows)]
        for t, u in by_gpu[gpu]:
            # Clamp on both sides: a negative index would silently wrap
            # around to the end of the bar.
            idx = min(max(int(t / window), 0), n_windows - 1)
            buckets[idx].append(u)
        avgs = [sum(b) / len(b) if b else 0 for b in buckets]
        bar = "".join(_util_glyph(a) for a in avgs)
        mean = sum(avgs) / len(avgs)
        print(f" GPU{gpu}: |{bar}| mean={mean:.0f}%")

    # A negative width is rejected by the format mini-language for strings,
    # so runs shorter than six windows need the padding clamped to zero.
    pad = " " * max(n_windows - 6, 0)
    print(f" Time: 0{pad}{int(max_time)}s")
    print(" Legend: ' '=0% .=1-10% o=10-30% O=30-60% #=60-100%")

    # Per-GPU stats over the raw samples (not the bucket averages).
    print("\n Per-GPU mean utilization:")
    for gpu in sorted(by_gpu):
        vals = [u for _, u in by_gpu[gpu]]
        count = len(vals)
        mean = sum(vals) / count if count else 0.0
        nonzero = sum(1 for v in vals if v > 0)
        active = nonzero * 100 // count if count else 0
        print(f" GPU{gpu}: mean={mean:.1f}% active={active}% samples={count}")
# Runs to compare: (path of the GPU-utilization CSV, heading to print).
configs = [
    ("outputs/baseline_dash1/gpu_util.csv", "Baseline (8 combined, dash1)"),
    ("outputs/elastic_v4/gpu_util.csv", "Elastic P2P v4 (dash0)"),
]

# Pass 1: draw the timeline of every run whose CSV is present.
for path, label in configs:
    if not os.path.exists(path):
        print(f"\n {label}: {path} NOT FOUND")
        continue
    by_gpu = load_gpu(path)
    print_timeline(by_gpu, label)

# Pass 2: summarize how evenly the load is spread across GPUs in each run.
print(f"\n{'='*70}")
print(f" LOAD IMBALANCE ANALYSIS")
print(f"{'='*70}")
for path, label in configs:
    if not os.path.exists(path):
        continue
    by_gpu = load_gpu(path)
    # Mean utilization of each GPU, in ascending GPU-id order.
    means = [
        sum(u for _, u in by_gpu[gpu]) / len(by_gpu[gpu])
        for gpu in sorted(by_gpu)
    ]
    if not means:
        continue
    avg = sum(means) / len(means)
    max_m, min_m = max(means), min(means)
    # Floor the denominator at 0.1 so an idle GPU cannot divide by zero.
    imbalance = max_m / max(min_m, 0.1)
    print(f" {label}:")
    print(f" Per-GPU means: {['%.1f' % m for m in means]}")
    print(f" Avg={avg:.1f}% Min={min_m:.1f}% Max={max_m:.1f}% Imbalance={imbalance:.1f}x")