V2_DEEP_ANALYSIS §3.1 (execution_mode distribution) and §3.2 (path-level latency vs DP) had hand-typed tables with approximate latencies (e.g. "~1.0s") and required readers to mentally compare 5+ rows × 5 columns. Both sections now reference generated PNG figures derived directly from the v2 + DP metrics.jsonl files. §3.1 figure (v2_execution_mode_distribution.png): Horizontal bar chart, log x-axis. 4076 direct-to-D fast-path requests (green) dwarf the rest by ~30x; the long tail of slow / fallback / failure modes is visible at a glance. Counts and percentages annotated on each bar. §3.2 figure (v2_path_level_latency.png): Grouped bar chart, log y-axis. Per-path TTFT p50 / TTFT p99 / Lat p50 with exact numeric labels (no more "~1.0s" approximations). Sample counts annotated below each path. Quick visual reads: - KVC fast path TTFT p50 41ms vs DP 92ms (2.2x faster) - KVC reseed TTFT p99 5.12s vs DP 0.43s (12x slower) -- the cost - KVC no-d-capacity TTFT p99 7.65s (worst case) Bundled: - scripts/analysis/plot_v2_path_breakdown.py -- the script that generates both figures; re-runnable when v2 data changes. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
224 lines
8.2 KiB
Python
224 lines
8.2 KiB
Python
#!/usr/bin/env python3
|
||
"""Generate the two figures referenced by docs/V2_DEEP_ANALYSIS_ZH.md §3.1 and §3.2.
|
||
|
||
Inputs:
|
||
outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl
|
||
outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl
|
||
|
||
Outputs:
|
||
docs/figures/v2_execution_mode_distribution.png (for §3.1)
|
||
docs/figures/v2_path_level_latency.png (for §3.2)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import statistics
|
||
from collections import Counter, defaultdict
|
||
from pathlib import Path
|
||
|
||
import matplotlib.pyplot as plt
|
||
import numpy as np
|
||
|
||
# Base directory for all inputs/outputs: parents[2] of this file's resolved
# path, i.e. two directories above the directory that contains this script.
ROOT = Path(__file__).resolve().parents[2]

# Input metrics (JSON Lines): the KVC v2 run and the DP baseline run.
KVC = (
    ROOT
    / "outputs"
    / "qwen3-30b-tp1-ts1-migration-v2"
    / "kvc_1p3d_migration_v2_run1_metrics.jsonl"
)
DP = ROOT / "outputs" / "qwen3-30b-tp1-ts1-validation" / "dp4_metrics.jsonl"

# Directory the PNG figures are written to; created at import time if missing.
OUT = ROOT / "docs" / "figures"
OUT.mkdir(parents=True, exist_ok=True)
|
||
|
||
|
||
def load(p: Path) -> list[dict]:
    """Read a JSON Lines file and return its records in file order.

    Args:
        p: Path of the ``.jsonl`` file; each non-blank line must hold one
            JSON document.

    Returns:
        One parsed object per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps parsing independent of the host locale, and the
    # context manager closes the handle even when a line fails to parse.
    with p.open(encoding="utf-8") as fh:
        # Whitespace-only lines (e.g. a trailing newline at EOF) are skipped
        # instead of crashing json.loads.
        return [json.loads(line) for line in fh if line.strip()]
|
||
|
||
|
||
def is_failed(r: dict) -> bool:
    """Tell whether a metrics record represents a failed request.

    A record counts as failed when its ``error`` field is truthy, or when its
    ``finish_reason`` (stringified, case-insensitive) contains ``abort`` or
    ``badrequest``.
    """
    if r.get("error"):
        return True

    reason = r.get("finish_reason")
    if not reason:
        return False

    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))
|
||
|
||
|
||
def pct(vals: list[float], q: float) -> float:
    """Return the element of *vals* at quantile *q*, or NaN for empty input.

    The values are sorted and the element at index ``int(len(vals) * q)`` is
    returned, with the index clamped into the valid range. No interpolation
    is performed.
    """
    ordered = sorted(vals)
    count = len(ordered)
    if count == 0:
        return float("nan")

    idx = int(count * q)
    if idx > count - 1:
        idx = count - 1
    if idx < 0:
        idx = 0
    return ordered[idx]
|
||
|
||
|
||
# Execution mode served by the direct-to-D fast path (green bar in Figure 1).
_FAST_MODE = "kvcache-direct-to-d-session"

# Figure 1 display labels keyed by raw execution_mode; modes missing from this
# table are plotted under their raw name.
_MODE_LABELS = {
    "kvcache-direct-to-d-session": "direct-to-D-session (fast path)",
    "pd-router-d-session-reseed": "d-session-reseed (mooncake reseed)",
    "pd-router-fallback-session-not-resident-session-cap":
        "fallback: session-not-resident + session-cap",
    "pd-router-fallback-session-not-resident-seed-filter-early-turn":
        "fallback: session-not-resident + seed-filter",
    "pd-router-turn1-seed": "turn1-seed (first turn of each session)",
    "pd-router-fallback-no-d-capacity": "fallback: no-d-capacity",
    "pd-router-fallback-real-large-append-session-cap":
        "fallback: real-large-append",
    "pd-router-fallback-policy-no-bypass-session-cap":
        "fallback: policy-no-bypass",
    "pd-router-d-session-reseed-after-eviction":
        "d-session-reseed-after-eviction",
    "kvcache-centric": "kvcache-centric (admit-but-then-error)",
}

# Figure 2 KVC path groups as (group key, x-tick label template), listed in
# plotting order (fast -> slow). The "other" group has no entry here, so it is
# never plotted. "{share}" is filled with the group's share of ALL KVC rows.
_KVC_PATH_GROUPS = (
    ("direct", "KVC direct-to-D\n(fast path, {share:.1f}%)"),
    ("session-not-resident", "KVC session-not-resident\n(misc, {share:.1f}%)"),
    ("reseed", "KVC reseed\n(slow path, {share:.1f}%)"),
    ("no-d-capacity", "KVC no-d-capacity\n(fallback, {share:.1f}%)"),
)

# X-tick label of the DP baseline column in Figure 2.
_DP_LABEL = "DP dp-colo-router\n(100%)"

# Figure 2 bar series as (legend label, key into the _stats() dict) and the
# matching bar colours.
_METRICS = (
    ("TTFT p50", "ttft_p50"),
    ("TTFT p99", "ttft_p99"),
    ("Latency p50", "lat_p50"),
)
_METRIC_COLORS = ("#1F77B4", "#FF7F0E", "#9467BD")


def _kvc_group(mode: str) -> str:
    """Map a raw KVC execution_mode to its Figure 2 group key."""
    if mode == _FAST_MODE:
        return "direct"
    # Order matters: a mode is assigned to the first substring it contains.
    for key in ("reseed", "no-d-capacity", "session-not-resident"):
        if key in mode:
            return key
    return "other"


def _stats(rows: list[dict]) -> dict[str, float]:
    """Summarise rows as sample count, TTFT p50/p99 and latency p50.

    Rows whose ``ttft_s`` / ``latency_s`` is missing or None are left out of
    the respective percentile but still counted in ``n``.
    """
    ttfts = [r["ttft_s"] for r in rows if r.get("ttft_s") is not None]
    lats = [r["latency_s"] for r in rows if r.get("latency_s") is not None]
    return {
        "n": len(rows),
        "ttft_p50": pct(ttfts, 0.50),
        "ttft_p99": pct(ttfts, 0.99),
        "lat_p50": pct(lats, 0.50),
    }


def _plot_mode_distribution(kvc: list[dict]) -> list[tuple[str, int, float]]:
    """Draw Figure 1 (§3.1): execution_mode distribution as horizontal bars.

    Uses ALL rows (including failures) so the percentages match the doc.

    Returns:
        ``(label, count, percentage)`` per mode, most common first.
    """
    ranked = Counter(r["execution_mode"] for r in kvc).most_common()
    total_kvc = len(kvc)

    labels = [_MODE_LABELS.get(mode, mode) for mode, _ in ranked]
    counts = [count for _, count in ranked]
    pcts = [count / total_kvc * 100 for count in counts]
    # Colour by the raw mode rather than by a substring of the display label,
    # so relabelling a mode cannot silently change its colour.
    colors = ["#2C8C2C" if mode == _FAST_MODE else "#D62728" for mode, _ in ranked]

    fig, ax = plt.subplots(figsize=(11, 5.5))
    # Reversed positions put the most common mode at the top of the chart.
    y = np.arange(len(labels))[::-1]
    ax.barh(y, counts, color=colors, edgecolor="black", linewidth=0.5)
    ax.set_yticks(y)
    ax.set_yticklabels(labels, fontsize=10)
    ax.set_xscale("log")
    ax.set_xlabel("Request count (log scale)", fontsize=11)
    ax.set_xlim(left=1)

    # Annotate count + percentage at the end of each bar.
    for yi, c, p in zip(y, counts, pcts):
        ax.text(c * 1.05, yi, f"{c} ({p:.1f}%)", va="center", fontsize=9.5)

    ax.set_title(
        f"KVC v2 execution_mode distribution (n = {total_kvc} total requests)\n"
        "green = fast path (direct-to-D), red = slow / fallback / failure paths",
        fontsize=12, pad=12,
    )
    ax.grid(axis="x", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)
    plt.tight_layout()
    out1 = OUT / "v2_execution_mode_distribution.png"
    plt.savefig(out1, dpi=150)
    print(f"wrote {out1}")
    plt.close(fig)

    return list(zip(labels, counts, pcts))


def _plot_path_latency(
    kvc: list[dict], kvc_ok: list[dict], dp_ok: list[dict]
) -> dict[str, dict[str, float]]:
    """Draw Figure 2 (§3.2): per-path latency as grouped bars on a log y-axis.

    Args:
        kvc: All KVC rows; only used as the denominator for the percentage
            shown in each path label (same denominator as Figure 1).
        kvc_ok: Non-failed KVC rows; the latency statistics come from these.
        dp_ok: Non-failed DP baseline rows.

    Returns:
        Mapping of x-tick label -> ``_stats()`` result, in plotting order.
    """
    group_totals = Counter(_kvc_group(r["execution_mode"]) for r in kvc)
    total_kvc = len(kvc)

    groups = defaultdict(list)
    for r in kvc_ok:
        groups[_kvc_group(r["execution_mode"])].append(r)

    # Keep only groups that actually have successful rows; the share in each
    # label is computed from the data instead of being hand-typed.
    path_stats = {}
    for key, template in _KVC_PATH_GROUPS:
        if key not in groups:
            continue
        label = template.format(share=group_totals[key] / total_kvc * 100)
        path_stats[label] = _stats(groups[key])
    path_stats[_DP_LABEL] = _stats(dp_ok)
    ordered_paths = list(path_stats)

    bar_w = 0.25
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(ordered_paths))

    for i, (label, key) in enumerate(_METRICS):
        vals = [path_stats[p][key] for p in ordered_paths]
        # Offsets -1, 0, +1 bar widths centre the three series on each tick.
        positions = x + (i - 1) * bar_w
        ax.bar(positions, vals, bar_w, label=label,
               color=_METRIC_COLORS[i], edgecolor="black", linewidth=0.4)
        for xi, v in zip(positions, vals):
            # NaN compares False against 0, so empty groups get no label.
            if v > 0:
                fmt = f"{v*1000:.0f}ms" if v < 1 else f"{v:.2f}s"
                ax.text(xi, v * 1.10, fmt,
                        ha="center", va="bottom", fontsize=8.5, rotation=0)

    ax.set_yscale("log")
    ax.set_xticks(x)
    ax.set_xticklabels(ordered_paths, fontsize=9.5)
    ax.set_ylabel("Latency (seconds, log scale)", fontsize=11)
    ax.set_title(
        "Path-level latency: KVC v2 paths vs DP single-path baseline\n"
        "log y-axis · same SWE-Bench 50sess trace · ts=1 · 4× H100 80GB",
        fontsize=12, pad=12,
    )
    ax.legend(loc="upper left", fontsize=10, framealpha=0.95)
    ax.grid(axis="y", linestyle=":", alpha=0.4, which="both")
    ax.set_axisbelow(True)

    # Annotate sample counts under each path label.
    ymin = ax.get_ylim()[0]
    for xi, p in zip(x, ordered_paths):
        n = path_stats[p]["n"]
        ax.text(xi, ymin * 0.5, f"n={n}", ha="center", va="top",
                fontsize=8.5, color="#555")

    plt.tight_layout()
    out2 = OUT / "v2_path_level_latency.png"
    plt.savefig(out2, dpi=150)
    print(f"wrote {out2}")
    plt.close(fig)

    return path_stats


def _print_summary(
    mode_rows: list[tuple[str, int, float]],
    path_stats: dict[str, dict[str, float]],
) -> None:
    """Print the plotted numbers to stdout (for doc reference)."""
    print("\n=== Numeric values plotted ===")
    print("\nExecution mode counts (KVC v2):")
    for label, c, p in mode_rows:
        print(f" {c:>5} ({p:>5.2f}%) {label}")

    print("\nPath-level latency:")
    for p, s in path_stats.items():
        nl = " | ".join([
            f"n={s['n']}",
            f"TTFT p50={s['ttft_p50']*1000:.1f}ms",
            f"TTFT p99={s['ttft_p99']*1000:.1f}ms",
            f"Lat p50={s['lat_p50']:.3f}s",
        ])
        # Tick labels contain a newline; flatten it for one-line output.
        print(f" {p.replace(chr(10), ' '):<55} {nl}")


def main() -> None:
    """Generate both figures from the KVC and DP metrics and print the numbers.

    Reads the KVC and DP metrics files, writes the two PNGs into OUT, and
    prints the plotted values to stdout.
    """
    kvc = load(KVC)
    dp = load(DP)

    kvc_ok = [r for r in kvc if not is_failed(r)]
    dp_ok = [r for r in dp if not is_failed(r)]

    mode_rows = _plot_mode_distribution(kvc)
    path_stats = _plot_path_latency(kvc, kvc_ok, dp_ok)
    _print_summary(mode_rows, path_stats)
|
||
|
||
|
||
# Generate the figures only when this file is executed as a script; importing
# it as a module does not call main().
if __name__ == "__main__":
    main()
|