Hostile audit of the original report flagged three load-bearing errors:
1. held_tokens semantic was inverted. session_held_tokens() at
session_aware_cache.py:278-282 sums (kv_allocated_len - cache_protected_len)
per slot, i.e. slot-private (NOT in radix tree). So "other = cap - held -
avail" actually CONTAINS the radix-tree protected prefix cache (likely the
single biggest component for shared agentic prefixes), not just running
batch + in-flight as the original report claimed.
2. Admission-race causal hypothesis for the 415 EXP2+profile errors is
contradicted by the data: 414/415 errors have kv_transfer_blocks > 0 — they
passed admission and died downstream ("generate stream ended before
producing any token", raised by the client when a 200 response had an empty
stream).
3. Polling deconfound was too quickly dismissed. Mode counts shift ~1:1
(session-cap-fb -356 / kvcache-centric +406), and /server_info is not a
passive read — it dispatches into the scheduler main loop and iterates
every session slot.
Plus: per-D error% confounded by sticky session affinity (only 18 unique
sessions cause 415 errors, decode-3 had 0 errors only because no high-error
session landed there); decile 10 "recovery" was an equal-time binning
artifact (24.5% under equal-count); v5 vs v5+profile time gap was 21h not
6h; p50/p90 latency comparison is N=1.
Rewritten report (docs/V5_PROFILE_INVESTIGATION_ZH.md) marks each correction
with ⚠️ and demotes admission-race to one of four hypotheses (H1-H4).
Action items split into P0 (verify, must do first) and P1 (instrument):
P0 — scripts/sweep_tp1_v5_baseline_rerun_exp2.sh runs 3x v5 baseline EXP2
(no polling, identical config to the original v5 run) to test whether the
9-error baseline result is reproducible. If 3 runs give ~9 errors and
profile gives 415, polling is the leading suspect. Currently running
in background.
P1 — scheduler.py:_compute_pool_breakdown_for_diagnostics adds a read-only
"pool_breakdown" dict to /server_info covering: radix_evictable_tokens,
radix_protected_tokens, slot_private_held_tokens, session_slot_count,
running_batch_{reqs,kv_tokens}, transfer_queue_{reqs,tokens},
prealloc_queue_{reqs,tokens}, retracted_queue_{reqs,tokens}. With these,
"unaccounted = cap - sum(known)" exposes true leakage. replay.py captures
all fields into the per-tick row; analyzer prints the decomposition and
gracefully handles old timeseries (prints "P1 instrument absent").
Mock-tested end-to-end. SGLang patch is read-only and does not affect
admission/scheduling. Old v5+profile data still analyzes correctly.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
347 lines
13 KiB
Python
Executable File
347 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Analyze d-pool-timeseries.jsonl produced by --pool-poll-interval-s.
|
|
|
|
Answers v6's main question: where is D's KV pool actually spent?
|
|
|
|
For each decode worker, decomposes capacity over the run wall-clock into:
|
|
- resident_held_active = held - idle_evictable (sessions in active use)
|
|
- resident_held_idle = idle_evictable (sessions kept around but evictable)
|
|
- prefill_backup_or_other = capacity - held - available (everything else: backup blocks,
|
|
in-flight transfers, fragmentation)
|
|
- free_available = available
|
|
|
|
Also reports session residency churn (how many distinct sessions ever resided per D, and
|
|
how often a session bounced between workers — a strong starvation signal).
|
|
|
|
Usage:
|
|
python scripts/analysis/analyze_pool_timeseries.py <run_dir>
|
|
or
|
|
python scripts/analysis/analyze_pool_timeseries.py <pool_timeseries.jsonl>
|
|
|
|
Output: human-readable text. Add --json to also print a machine-readable summary.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import statistics
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
def _load_jsonl(path: Path) -> list[dict[str, Any]]:
|
|
rows: list[dict[str, Any]] = []
|
|
with path.open() as fh:
|
|
for line in fh:
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
rows.append(json.loads(line))
|
|
return rows
|
|
|
|
|
|
def _resolve_input(path: Path) -> Path:
|
|
if path.is_file():
|
|
return path
|
|
if path.is_dir():
|
|
candidate = path / "d-pool-timeseries.jsonl"
|
|
if candidate.is_file():
|
|
return candidate
|
|
raise FileNotFoundError(
|
|
f"{candidate} not found; pass the file directly or a run dir containing it."
|
|
)
|
|
raise FileNotFoundError(path)
|
|
|
|
|
|
def _percentile(values: list[float], p: float) -> float:
|
|
if not values:
|
|
return 0.0
|
|
s = sorted(values)
|
|
idx = min(len(s) - 1, max(0, int(round((len(s) - 1) * p))))
|
|
return s[idx]
|
|
|
|
|
|
def _fmt_tokens(n: float) -> str:
|
|
if n >= 1_000_000:
|
|
return f"{n / 1_000_000:.2f}M"
|
|
if n >= 1_000:
|
|
return f"{n / 1_000:.1f}K"
|
|
return f"{int(n)}"
|
|
|
|
|
|
def _fmt_pct(n: float, total: float) -> str:
|
|
if total <= 0:
|
|
return " - "
|
|
return f"{100 * n / total:5.1f}%"
|
|
|
|
|
|
def analyze(timeseries_path: Path) -> dict[str, Any]:
|
|
rows = _load_jsonl(timeseries_path)
|
|
if not rows:
|
|
raise ValueError(f"empty timeseries: {timeseries_path}")
|
|
|
|
by_worker: dict[str, list[dict[str, Any]]] = defaultdict(list)
|
|
for row in rows:
|
|
if row.get("error") and "session_cache_enabled" not in row:
|
|
# poller failed at this tick — skip
|
|
continue
|
|
wid = row.get("worker_id") or "?"
|
|
by_worker[wid].append(row)
|
|
|
|
summary: dict[str, Any] = {
|
|
"timeseries_path": str(timeseries_path),
|
|
"total_rows": len(rows),
|
|
"tick_count": len(by_worker[next(iter(by_worker))]) if by_worker else 0,
|
|
"wall_s_span": (
|
|
max(r.get("wall_s", 0.0) for r in rows)
|
|
- min(r.get("wall_s", 0.0) for r in rows)
|
|
),
|
|
"workers": {},
|
|
}
|
|
|
|
print(f"\n=== Pool timeseries: {timeseries_path}")
|
|
print(
|
|
f" rows={summary['total_rows']} workers={len(by_worker)} "
|
|
f"span={summary['wall_s_span']:.1f}s"
|
|
)
|
|
|
|
# Print per-worker decomposition table
|
|
header = (
|
|
f"{'worker':<12} {'role':<8} {'cap':>8} | "
|
|
f"{'avg_active':>10} {'avg_idle':>10} {'avg_other':>10} {'avg_free':>10} | "
|
|
f"{'p90_held':>10} {'max_held':>10} {'p90_avail':>10}"
|
|
)
|
|
print(header)
|
|
print("-" * len(header))
|
|
|
|
for wid in sorted(by_worker.keys()):
|
|
ws = by_worker[wid]
|
|
role = ws[0].get("worker_role", "?")
|
|
cap_vals = [int(r.get("capacity_tokens") or 0) for r in ws]
|
|
held_vals = [int(r.get("held_tokens") or 0) for r in ws]
|
|
avail_vals = [int(r.get("available_tokens") or 0) for r in ws]
|
|
idle_vals = [int(r.get("idle_evictable_tokens") or 0) for r in ws]
|
|
# active = held - idle (sessions in active use)
|
|
active_vals = [max(0, h - i) for h, i in zip(held_vals, idle_vals)]
|
|
# other = capacity - held - available (prefill backup blocks, in-flight, fragmentation)
|
|
other_vals = [
|
|
max(0, c - h - a) for c, h, a in zip(cap_vals, held_vals, avail_vals)
|
|
]
|
|
cap = max(cap_vals) if cap_vals else 0
|
|
|
|
avg_active = statistics.fmean(active_vals) if active_vals else 0.0
|
|
avg_idle = statistics.fmean(idle_vals) if idle_vals else 0.0
|
|
avg_other = statistics.fmean(other_vals) if other_vals else 0.0
|
|
avg_avail = statistics.fmean(avail_vals) if avail_vals else 0.0
|
|
|
|
p90_held = _percentile([float(v) for v in held_vals], 0.90)
|
|
max_held = max(held_vals) if held_vals else 0
|
|
p90_avail = _percentile([float(v) for v in avail_vals], 0.90)
|
|
|
|
sess_counts = [int(r.get("session_count") or 0) for r in ws]
|
|
resident_counts = [int(r.get("resident_session_count") or 0) for r in ws]
|
|
|
|
print(
|
|
f"{wid:<12} {role:<8} {_fmt_tokens(cap):>8} | "
|
|
f"{_fmt_tokens(avg_active):>4} {_fmt_pct(avg_active, cap):>5} "
|
|
f"{_fmt_tokens(avg_idle):>4} {_fmt_pct(avg_idle, cap):>5} "
|
|
f"{_fmt_tokens(avg_other):>4} {_fmt_pct(avg_other, cap):>5} "
|
|
f"{_fmt_tokens(avg_avail):>4} {_fmt_pct(avg_avail, cap):>5} | "
|
|
f"{_fmt_tokens(p90_held):>10} {_fmt_tokens(max_held):>10} "
|
|
f"{_fmt_tokens(p90_avail):>10}"
|
|
)
|
|
|
|
summary["workers"][wid] = {
|
|
"role": role,
|
|
"capacity_tokens": cap,
|
|
"avg_active_held_tokens": avg_active,
|
|
"avg_idle_evictable_tokens": avg_idle,
|
|
"avg_other_tokens": avg_other,
|
|
"avg_available_tokens": avg_avail,
|
|
"p90_held_tokens": p90_held,
|
|
"max_held_tokens": max_held,
|
|
"p90_available_tokens": p90_avail,
|
|
"max_session_count": max(sess_counts) if sess_counts else 0,
|
|
"max_resident_session_count": (
|
|
max(resident_counts) if resident_counts else 0
|
|
),
|
|
"ticks": len(ws),
|
|
}
|
|
|
|
print(
|
|
"\nLegend: active=held-idle idle=idle_evictable "
|
|
"other=cap-held-avail (radix-protected + running-batch + in-flight + frag)"
|
|
)
|
|
|
|
# P1: decomposition of "other" using pool_breakdown fields (zeros if instrument absent)
|
|
has_breakdown = any(
|
|
any(r.get(k) for k in (
|
|
"radix_evictable_tokens",
|
|
"radix_protected_tokens",
|
|
"running_batch_kv_tokens",
|
|
"transfer_queue_tokens",
|
|
"prealloc_queue_tokens",
|
|
"retracted_queue_tokens",
|
|
))
|
|
for r in rows
|
|
)
|
|
|
|
if has_breakdown:
|
|
print("\n=== P1 'other' decomposition (per worker, mean over run) ===")
|
|
print(
|
|
f"{'worker':<12} {'role':<8} | "
|
|
f"{'r_evictable':>11} {'r_protected':>11} {'slot_private':>12} | "
|
|
f"{'run_batch':>10} {'transfer':>9} {'prealloc':>9} {'retracted':>10} | "
|
|
f"{'unaccounted':>11}"
|
|
)
|
|
for wid in sorted(by_worker.keys()):
|
|
ws = by_worker[wid]
|
|
role = ws[0].get("worker_role", "?")
|
|
cap = max(int(r.get("capacity_tokens") or 0) for r in ws)
|
|
|
|
def m(field: str) -> float:
|
|
vals = [int(r.get(field) or 0) for r in ws]
|
|
return statistics.fmean(vals) if vals else 0.0
|
|
|
|
r_ev = m("radix_evictable_tokens")
|
|
r_pr = m("radix_protected_tokens")
|
|
slot = m("slot_private_held_tokens")
|
|
rb = m("running_batch_kv_tokens")
|
|
tq = m("transfer_queue_tokens")
|
|
pq = m("prealloc_queue_tokens")
|
|
rq = m("retracted_queue_tokens")
|
|
avail = m("available_tokens")
|
|
# `running_batch_kv_tokens` overlaps with radix_protected for tree-tracked
|
|
# reqs — do NOT subtract it again. Decomposition assumes:
|
|
# capacity ≈ avail + r_evictable + r_protected + slot_private
|
|
# + transfer_queue + prealloc_queue + retracted_queue + unaccounted
|
|
unacc = max(
|
|
0,
|
|
cap - avail - r_ev - r_pr - slot - tq - pq - rq,
|
|
)
|
|
print(
|
|
f"{wid:<12} {role:<8} | "
|
|
f"{_fmt_tokens(r_ev):>11} {_fmt_tokens(r_pr):>11} {_fmt_tokens(slot):>12} | "
|
|
f"{_fmt_tokens(rb):>10} {_fmt_tokens(tq):>9} {_fmt_tokens(pq):>9} {_fmt_tokens(rq):>10} | "
|
|
f"{_fmt_tokens(unacc):>11}"
|
|
)
|
|
|
|
summary["workers"][wid]["pool_breakdown_avg"] = {
|
|
"radix_evictable": r_ev,
|
|
"radix_protected": r_pr,
|
|
"slot_private_held": slot,
|
|
"running_batch_kv": rb,
|
|
"transfer_queue": tq,
|
|
"prealloc_queue": pq,
|
|
"retracted_queue": rq,
|
|
"available": avail,
|
|
"unaccounted": unacc,
|
|
}
|
|
print(
|
|
"\nNote: running_batch_kv_tokens overlaps with radix_protected_tokens "
|
|
"(tree-tracked decode reqs are also in protected); not summed."
|
|
)
|
|
else:
|
|
print("\n(P1 instrument absent: pool_breakdown fields are all zero)")
|
|
|
|
# Session residency churn: how many distinct sessions ever sat on each worker,
|
|
# and how many sessions hopped across workers (= starvation indicator).
|
|
print("\n=== Session residency churn ===")
|
|
sessions_per_worker: dict[str, set[str]] = defaultdict(set)
|
|
workers_per_session: dict[str, set[str]] = defaultdict(set)
|
|
resident_ticks_per_session: Counter[str] = Counter()
|
|
resident_ticks_per_worker: Counter[str] = Counter()
|
|
|
|
for row in rows:
|
|
wid = row.get("worker_id")
|
|
if wid is None or row.get("worker_role") != "decode":
|
|
continue
|
|
sessions = row.get("sessions") or []
|
|
if not isinstance(sessions, list):
|
|
continue
|
|
for entry in sessions:
|
|
if not isinstance(entry, dict):
|
|
continue
|
|
sid = entry.get("session_id")
|
|
if sid is None:
|
|
continue
|
|
if entry.get("resident"):
|
|
sessions_per_worker[wid].add(sid)
|
|
workers_per_session[sid].add(wid)
|
|
resident_ticks_per_session[(wid, sid)] += 1
|
|
resident_ticks_per_worker[wid] += 1
|
|
|
|
# Per-decode worker: distinct session count
|
|
print(f" {'worker':<12} {'distinct_sess':>14} {'resident_ticks':>16}")
|
|
for wid in sorted(sessions_per_worker.keys()):
|
|
print(
|
|
f" {wid:<12} {len(sessions_per_worker[wid]):>14} "
|
|
f"{resident_ticks_per_worker[wid]:>16}"
|
|
)
|
|
|
|
# Per session: how many workers it hopped across
|
|
hops = Counter(len(ws) for ws in workers_per_session.values())
|
|
print(f"\n Sessions seen on N workers (decode side):")
|
|
for n, count in sorted(hops.items()):
|
|
print(f" on {n} worker(s): {count} sessions")
|
|
|
|
starvation = [sid for sid, ws in workers_per_session.items() if len(ws) == 0]
|
|
multi_hopper = sorted(
|
|
((sid, ws) for sid, ws in workers_per_session.items() if len(ws) >= 2),
|
|
key=lambda x: -len(x[1]),
|
|
)[:10]
|
|
if multi_hopper:
|
|
print(
|
|
"\n Top sessions seen resident on multiple workers (potential thrashing):"
|
|
)
|
|
for sid, ws in multi_hopper:
|
|
print(f" {sid}: {len(ws)} workers ({sorted(ws)})")
|
|
|
|
summary["session_residency"] = {
|
|
"distinct_sessions_per_worker": {
|
|
wid: len(s) for wid, s in sessions_per_worker.items()
|
|
},
|
|
"session_hop_count_distribution": dict(hops),
|
|
"starvation_session_count": len(starvation),
|
|
}
|
|
|
|
# If a request-metrics file is co-located, also bucket fallback reasons
|
|
# against contemporaneous pool state (rough — uses tick nearest to median tick).
|
|
metrics_path = timeseries_path.with_name("request-metrics.jsonl")
|
|
if metrics_path.exists():
|
|
print(f"\n=== Request-metrics summary ({metrics_path.name}) ===")
|
|
mrows = _load_jsonl(metrics_path)
|
|
modes = Counter(r.get("execution_mode") or "?" for r in mrows)
|
|
total = sum(modes.values())
|
|
for mode, count in modes.most_common():
|
|
print(f" {count:>6} ({100 * count / total:5.1f}%) {mode}")
|
|
summary["execution_modes"] = dict(modes)
|
|
|
|
return summary
|
|
|
|
|
|
def main() -> None:
|
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
parser.add_argument(
|
|
"path",
|
|
type=Path,
|
|
help="Path to d-pool-timeseries.jsonl OR a run dir containing it",
|
|
)
|
|
parser.add_argument(
|
|
"--json",
|
|
action="store_true",
|
|
help="Also print a machine-readable JSON summary",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
resolved = _resolve_input(args.path)
|
|
summary = analyze(resolved)
|
|
if args.json:
|
|
print("\n=== JSON summary ===")
|
|
print(json.dumps(summary, indent=2, sort_keys=True, default=str))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|