Files
agentic-kvc/scripts/legacy/plot_gpu_timeline.py
Gahow Wang 547611e022 scripts: archive obsolete one-off shell/python scripts to legacy/ (D2, D3)
D2: run_benchmark.sh and run_experiments.sh still pass --time-scale and
--max-inflight-sessions to the replayer, but those flags were removed when
the project moved to trace-driven dispatch. The scripts cannot run as-is.

D3: ~25 ad-hoc analyze_* / compare_* / profile_* / final_* scripts and a
handful of single-experiment run_*.sh point at /home/admin/cpfs paths,
deleted output directories, or a sampled trace file that no longer exists.
Keep them in scripts/legacy/ for historical reference; the scripts that
remain in scripts/ (analyze_trace, analyze_breakdown, analyze_cache_hit,
analyze_eviction, compare_results, compute_roofline, sample_trace,
analyze_agentic_patterns, simulate_cache_policies, plus launch_*.sh,
gpu_monitor.sh, bench.sh) cover the current workflow.

Adds scripts/legacy/README.md to document the archival policy.
2026-05-23 20:57:32 +08:00

102 lines
3.2 KiB
Python

"""Plot per-GPU utilization timeline for elastic vs baseline."""
import csv, json, sys, os
def load_gpu(path):
    """Load a GPU-utilization CSV and group its samples by GPU.

    The file is read with ``csv.DictReader`` and must provide the columns
    ``gpu``, ``timestamp`` and ``util_pct``.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A dict ``{gpu_id: [(seconds, util_pct), ...]}`` where ``gpu_id`` is an
        ``int`` and both tuple members are ``float``. Timestamps are shifted
        so that the earliest sample in the whole file sits at 0. Samples keep
        the order in which they appear in the file. A file without data rows
        yields ``{}``.

    Raises:
        KeyError: If one of the required column headers is absent.
        ValueError: If a field cannot be converted to a number.
    """
    by_gpu = {}
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation recommends for file objects handed to a reader.
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            gpu = int(row["gpu"])
            timestamp = float(row["timestamp"])
            util = float(row["util_pct"])
            by_gpu.setdefault(gpu, []).append((timestamp, util))

    if not by_gpu:
        return by_gpu

    # Normalize against the global minimum rather than each GPU's first row,
    # so rows that are not in time order cannot end up with negative offsets.
    t0 = min(t for pts in by_gpu.values() for t, _ in pts)
    return {gpu: [(t - t0, u) for t, u in pts] for gpu, pts in by_gpu.items()}
def _util_glyph(avg):
    """Return the timeline character for one window's average utilization (%)."""
    if avg < 1:
        return " "
    if avg < 10:
        return "."
    if avg < 30:
        return "o"
    if avg < 60:
        return "O"
    return "#"


def print_timeline(by_gpu, label, max_time=None):
    """Print an ASCII utilization timeline and per-GPU statistics.

    One row is printed per GPU. Each column is the average utilization of
    the samples that fall into one time window. Windows are 10 s wide; when
    the covered span would need more than 40 columns, the window is widened
    so that 40 columns cover the whole span, and the axis label stays
    truthful. Windows without samples count as 0%.

    Args:
        by_gpu: Mapping ``{gpu_id: [(seconds, util_pct), ...]}``.
        label: Title printed in the banner.
        max_time: Time span (seconds) the timeline should cover. Defaults to
            the largest timestamp found in ``by_gpu``. Samples beyond it are
            folded into the last column.

    Returns:
        None. All output goes to stdout.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f" {label}")
    print(rule)
    if not by_gpu:
        print(" No data")
        return

    window = 10.0  # seconds per column
    max_cols = 40  # cap on the timeline width
    if max_time is None:
        # default=0.0 covers the case where every GPU has an empty sample list.
        max_time = max((t for pts in by_gpu.values() for t, _ in pts), default=0.0)
    if max_time > window * max_cols:
        # Widen the window instead of piling every late sample into the last
        # column, so the bar really spans 0..max_time.
        window = max_time / max_cols
    n_windows = max(1, min(int(max_time / window) + 1, max_cols))

    for gpu in sorted(by_gpu):
        buckets = [[] for _ in range(n_windows)]
        for t, u in by_gpu[gpu]:
            # Clamp on both sides: late samples go to the last column and a
            # negative offset must not index from the end of the list.
            idx = min(max(int(t / window), 0), n_windows - 1)
            buckets[idx].append(u)
        avgs = [sum(b) / len(b) if b else 0 for b in buckets]
        bar = "".join(_util_glyph(a) for a in avgs)
        mean = sum(avgs) / len(avgs)
        print(f" GPU{gpu}: |{bar}| mean={mean:.0f}%")

    # A negative width in a format spec raises ValueError, so timelines
    # narrower than 6 columns get no padding instead.
    pad = " " * max(n_windows - 6, 0)
    print(f" Time: 0{pad}{int(max_time)}s")
    print(" Legend: ' '=0% .=1-10% o=10-30% O=30-60% #=60-100%")

    # Per-GPU stats over the raw samples (not the window averages).
    print("\n Per-GPU mean utilization:")
    for gpu in sorted(by_gpu):
        vals = [u for _, u in by_gpu[gpu]]
        n = len(vals)
        mean = sum(vals) / n if n else 0.0
        active = sum(1 for v in vals if v > 0) * 100 // n if n else 0
        print(f" GPU{gpu}: mean={mean:.1f}% active={active}% samples={n}")
# Load and compare: each entry is (CSV path, label printed in the reports).
configs = [
    ("outputs/baseline_dash1/gpu_util.csv", "Baseline (8 combined, dash1)"),
    ("outputs/elastic_v4/gpu_util.csv", "Elastic P2P v4 (dash0)"),
]

# Parse every CSV once and keep the result, so the imbalance report below
# reuses the same data instead of re-reading the file.
loaded = {}
for path, label in configs:
    if os.path.exists(path):
        loaded[path] = load_gpu(path)
        print_timeline(loaded[path], label)
    else:
        print(f"\n {label}: {path} NOT FOUND")

# Imbalance metric: ratio of the busiest GPU's mean utilization to the
# idlest one's, with the denominator floored at 0.1 to avoid dividing by zero.
print(f"\n{'='*70}")
print(" LOAD IMBALANCE ANALYSIS")
print(f"{'='*70}")
for path, label in configs:
    by_gpu = loaded.get(path)
    if by_gpu is None:
        # File was missing above; already reported as NOT FOUND.
        continue
    means = []
    for gpu in sorted(by_gpu):
        vals = [u for _, u in by_gpu[gpu]]
        if vals:
            means.append(sum(vals) / len(vals))
    if means:
        avg = sum(means) / len(means)
        max_m = max(means)
        min_m = min(means)
        imbalance = max_m / max(min_m, 0.1)
        print(f" {label}:")
        print(f" Per-GPU means: {['%.1f' % m for m in means]}")
        print(f" Avg={avg:.1f}% Min={min_m:.1f}% Max={max_m:.1f}% Imbalance={imbalance:.1f}x")