Files
agentic-pd-hybrid/scripts/analysis/analyze_pool_timeseries.py
kzlin 4978c0d0cd profile(kvc): rewrite v5+profile report after critic audit + P0/P1 instrument
Hostile audit of the original report flagged three load-bearing errors:

1. held_tokens semantic was inverted. session_held_tokens() at
   session_aware_cache.py:278-282 sums (kv_allocated_len - cache_protected_len)
   per slot, i.e. slot-private (NOT in radix tree). So "other = cap - held -
   avail" actually CONTAINS the radix-tree protected prefix cache (likely the
   single biggest component for shared agentic prefixes), not just running
   batch + in-flight as the original report claimed.

2. Admission-race causal hypothesis for the 415 EXP2+profile errors is
   contradicted by the data: 414/415 errors have kv_transfer_blocks > 0 — they
   passed admission and died downstream ("generate stream ended before
   producing any token", raised by the client when a 200 response had an empty
   stream).

3. Polling deconfound was too quickly dismissed. Mode counts shift ~1:1
   (session-cap-fb -356 / kvcache-centric +406), and /server_info is not a
   passive read — it dispatches into the scheduler main loop and iterates
   every session slot.

Plus: per-D error% confounded by sticky session affinity (only 18 unique
sessions cause 415 errors, decode-3 had 0 errors only because no high-error
session landed there); decile 10 "recovery" was an equal-time binning
artifact (24.5% under equal-count); v5 vs v5+profile time gap was 21h not
6h; p50/p90 latency comparison is N=1.

Rewritten report (docs/V5_PROFILE_INVESTIGATION_ZH.md) marks each correction
with ⚠️ and demotes admission-race to one of four hypotheses (H1-H4).

Action items split into P0 (verify, must do first) and P1 (instrument):

P0 — scripts/sweep_tp1_v5_baseline_rerun_exp2.sh runs 3x v5 baseline EXP2
(no polling, identical config to the original v5 run) to test whether the
9-error baseline result is reproducible. If 3 runs give ~9 errors and
profile gives 415, polling is the leading suspect. Currently running
in background.

P1 — scheduler.py:_compute_pool_breakdown_for_diagnostics adds a read-only
"pool_breakdown" dict to /server_info covering: radix_evictable_tokens,
radix_protected_tokens, slot_private_held_tokens, session_slot_count,
running_batch_{reqs,kv_tokens}, transfer_queue_{reqs,tokens},
prealloc_queue_{reqs,tokens}, retracted_queue_{reqs,tokens}. With these,
"unaccounted = cap - sum(known)" exposes true leakage. replay.py captures
all fields into the per-tick row; analyzer prints the decomposition and
gracefully handles old timeseries (prints "P1 instrument absent").

Mock-tested end-to-end. SGLang patch is read-only and does not affect
admission/scheduling. Old v5+profile data still analyzes correctly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 22:29:21 +08:00

347 lines
13 KiB
Python
Executable File

#!/usr/bin/env python3
"""Analyze d-pool-timeseries.jsonl produced by --pool-poll-interval-s.
Answers v6's main question: where is D's KV pool actually spent?
For each decode worker, decomposes capacity over the run wall-clock into:
- resident_held_active = held - idle_evictable (sessions in active use)
- resident_held_idle = idle_evictable (sessions kept around but evictable)
- prefill_backup_or_other = capacity - held - available (everything else: backup blocks,
in-flight transfers, fragmentation)
- free_available = available
Also reports session residency churn (how many distinct sessions ever resided per D, and
how often a session bounced between workers — a strong starvation signal).
Usage:
python scripts/analysis/analyze_pool_timeseries.py <run_dir>
or
python scripts/analysis/analyze_pool_timeseries.py <pool_timeseries.jsonl>
Output: human-readable text. Add --json to also print a machine-readable summary.
"""
from __future__ import annotations
import argparse
import json
import statistics
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any
def _load_jsonl(path: Path) -> list[dict[str, Any]]:
rows: list[dict[str, Any]] = []
with path.open() as fh:
for line in fh:
line = line.strip()
if not line:
continue
rows.append(json.loads(line))
return rows
def _resolve_input(path: Path) -> Path:
if path.is_file():
return path
if path.is_dir():
candidate = path / "d-pool-timeseries.jsonl"
if candidate.is_file():
return candidate
raise FileNotFoundError(
f"{candidate} not found; pass the file directly or a run dir containing it."
)
raise FileNotFoundError(path)
def _percentile(values: list[float], p: float) -> float:
if not values:
return 0.0
s = sorted(values)
idx = min(len(s) - 1, max(0, int(round((len(s) - 1) * p))))
return s[idx]
def _fmt_tokens(n: float) -> str:
if n >= 1_000_000:
return f"{n / 1_000_000:.2f}M"
if n >= 1_000:
return f"{n / 1_000:.1f}K"
return f"{int(n)}"
def _fmt_pct(n: float, total: float) -> str:
if total <= 0:
return " - "
return f"{100 * n / total:5.1f}%"
def analyze(timeseries_path: Path) -> dict[str, Any]:
rows = _load_jsonl(timeseries_path)
if not rows:
raise ValueError(f"empty timeseries: {timeseries_path}")
by_worker: dict[str, list[dict[str, Any]]] = defaultdict(list)
for row in rows:
if row.get("error") and "session_cache_enabled" not in row:
# poller failed at this tick — skip
continue
wid = row.get("worker_id") or "?"
by_worker[wid].append(row)
summary: dict[str, Any] = {
"timeseries_path": str(timeseries_path),
"total_rows": len(rows),
"tick_count": len(by_worker[next(iter(by_worker))]) if by_worker else 0,
"wall_s_span": (
max(r.get("wall_s", 0.0) for r in rows)
- min(r.get("wall_s", 0.0) for r in rows)
),
"workers": {},
}
print(f"\n=== Pool timeseries: {timeseries_path}")
print(
f" rows={summary['total_rows']} workers={len(by_worker)} "
f"span={summary['wall_s_span']:.1f}s"
)
# Print per-worker decomposition table
header = (
f"{'worker':<12} {'role':<8} {'cap':>8} | "
f"{'avg_active':>10} {'avg_idle':>10} {'avg_other':>10} {'avg_free':>10} | "
f"{'p90_held':>10} {'max_held':>10} {'p90_avail':>10}"
)
print(header)
print("-" * len(header))
for wid in sorted(by_worker.keys()):
ws = by_worker[wid]
role = ws[0].get("worker_role", "?")
cap_vals = [int(r.get("capacity_tokens") or 0) for r in ws]
held_vals = [int(r.get("held_tokens") or 0) for r in ws]
avail_vals = [int(r.get("available_tokens") or 0) for r in ws]
idle_vals = [int(r.get("idle_evictable_tokens") or 0) for r in ws]
# active = held - idle (sessions in active use)
active_vals = [max(0, h - i) for h, i in zip(held_vals, idle_vals)]
# other = capacity - held - available (prefill backup blocks, in-flight, fragmentation)
other_vals = [
max(0, c - h - a) for c, h, a in zip(cap_vals, held_vals, avail_vals)
]
cap = max(cap_vals) if cap_vals else 0
avg_active = statistics.fmean(active_vals) if active_vals else 0.0
avg_idle = statistics.fmean(idle_vals) if idle_vals else 0.0
avg_other = statistics.fmean(other_vals) if other_vals else 0.0
avg_avail = statistics.fmean(avail_vals) if avail_vals else 0.0
p90_held = _percentile([float(v) for v in held_vals], 0.90)
max_held = max(held_vals) if held_vals else 0
p90_avail = _percentile([float(v) for v in avail_vals], 0.90)
sess_counts = [int(r.get("session_count") or 0) for r in ws]
resident_counts = [int(r.get("resident_session_count") or 0) for r in ws]
print(
f"{wid:<12} {role:<8} {_fmt_tokens(cap):>8} | "
f"{_fmt_tokens(avg_active):>4} {_fmt_pct(avg_active, cap):>5} "
f"{_fmt_tokens(avg_idle):>4} {_fmt_pct(avg_idle, cap):>5} "
f"{_fmt_tokens(avg_other):>4} {_fmt_pct(avg_other, cap):>5} "
f"{_fmt_tokens(avg_avail):>4} {_fmt_pct(avg_avail, cap):>5} | "
f"{_fmt_tokens(p90_held):>10} {_fmt_tokens(max_held):>10} "
f"{_fmt_tokens(p90_avail):>10}"
)
summary["workers"][wid] = {
"role": role,
"capacity_tokens": cap,
"avg_active_held_tokens": avg_active,
"avg_idle_evictable_tokens": avg_idle,
"avg_other_tokens": avg_other,
"avg_available_tokens": avg_avail,
"p90_held_tokens": p90_held,
"max_held_tokens": max_held,
"p90_available_tokens": p90_avail,
"max_session_count": max(sess_counts) if sess_counts else 0,
"max_resident_session_count": (
max(resident_counts) if resident_counts else 0
),
"ticks": len(ws),
}
print(
"\nLegend: active=held-idle idle=idle_evictable "
"other=cap-held-avail (radix-protected + running-batch + in-flight + frag)"
)
# P1: decomposition of "other" using pool_breakdown fields (zeros if instrument absent)
has_breakdown = any(
any(r.get(k) for k in (
"radix_evictable_tokens",
"radix_protected_tokens",
"running_batch_kv_tokens",
"transfer_queue_tokens",
"prealloc_queue_tokens",
"retracted_queue_tokens",
))
for r in rows
)
if has_breakdown:
print("\n=== P1 'other' decomposition (per worker, mean over run) ===")
print(
f"{'worker':<12} {'role':<8} | "
f"{'r_evictable':>11} {'r_protected':>11} {'slot_private':>12} | "
f"{'run_batch':>10} {'transfer':>9} {'prealloc':>9} {'retracted':>10} | "
f"{'unaccounted':>11}"
)
for wid in sorted(by_worker.keys()):
ws = by_worker[wid]
role = ws[0].get("worker_role", "?")
cap = max(int(r.get("capacity_tokens") or 0) for r in ws)
def m(field: str) -> float:
vals = [int(r.get(field) or 0) for r in ws]
return statistics.fmean(vals) if vals else 0.0
r_ev = m("radix_evictable_tokens")
r_pr = m("radix_protected_tokens")
slot = m("slot_private_held_tokens")
rb = m("running_batch_kv_tokens")
tq = m("transfer_queue_tokens")
pq = m("prealloc_queue_tokens")
rq = m("retracted_queue_tokens")
avail = m("available_tokens")
# `running_batch_kv_tokens` overlaps with radix_protected for tree-tracked
# reqs — do NOT subtract it again. Decomposition assumes:
# capacity ≈ avail + r_evictable + r_protected + slot_private
# + transfer_queue + prealloc_queue + retracted_queue + unaccounted
unacc = max(
0,
cap - avail - r_ev - r_pr - slot - tq - pq - rq,
)
print(
f"{wid:<12} {role:<8} | "
f"{_fmt_tokens(r_ev):>11} {_fmt_tokens(r_pr):>11} {_fmt_tokens(slot):>12} | "
f"{_fmt_tokens(rb):>10} {_fmt_tokens(tq):>9} {_fmt_tokens(pq):>9} {_fmt_tokens(rq):>10} | "
f"{_fmt_tokens(unacc):>11}"
)
summary["workers"][wid]["pool_breakdown_avg"] = {
"radix_evictable": r_ev,
"radix_protected": r_pr,
"slot_private_held": slot,
"running_batch_kv": rb,
"transfer_queue": tq,
"prealloc_queue": pq,
"retracted_queue": rq,
"available": avail,
"unaccounted": unacc,
}
print(
"\nNote: running_batch_kv_tokens overlaps with radix_protected_tokens "
"(tree-tracked decode reqs are also in protected); not summed."
)
else:
print("\n(P1 instrument absent: pool_breakdown fields are all zero)")
# Session residency churn: how many distinct sessions ever sat on each worker,
# and how many sessions hopped across workers (= starvation indicator).
print("\n=== Session residency churn ===")
sessions_per_worker: dict[str, set[str]] = defaultdict(set)
workers_per_session: dict[str, set[str]] = defaultdict(set)
resident_ticks_per_session: Counter[str] = Counter()
resident_ticks_per_worker: Counter[str] = Counter()
for row in rows:
wid = row.get("worker_id")
if wid is None or row.get("worker_role") != "decode":
continue
sessions = row.get("sessions") or []
if not isinstance(sessions, list):
continue
for entry in sessions:
if not isinstance(entry, dict):
continue
sid = entry.get("session_id")
if sid is None:
continue
if entry.get("resident"):
sessions_per_worker[wid].add(sid)
workers_per_session[sid].add(wid)
resident_ticks_per_session[(wid, sid)] += 1
resident_ticks_per_worker[wid] += 1
# Per-decode worker: distinct session count
print(f" {'worker':<12} {'distinct_sess':>14} {'resident_ticks':>16}")
for wid in sorted(sessions_per_worker.keys()):
print(
f" {wid:<12} {len(sessions_per_worker[wid]):>14} "
f"{resident_ticks_per_worker[wid]:>16}"
)
# Per session: how many workers it hopped across
hops = Counter(len(ws) for ws in workers_per_session.values())
print(f"\n Sessions seen on N workers (decode side):")
for n, count in sorted(hops.items()):
print(f" on {n} worker(s): {count} sessions")
starvation = [sid for sid, ws in workers_per_session.items() if len(ws) == 0]
multi_hopper = sorted(
((sid, ws) for sid, ws in workers_per_session.items() if len(ws) >= 2),
key=lambda x: -len(x[1]),
)[:10]
if multi_hopper:
print(
"\n Top sessions seen resident on multiple workers (potential thrashing):"
)
for sid, ws in multi_hopper:
print(f" {sid}: {len(ws)} workers ({sorted(ws)})")
summary["session_residency"] = {
"distinct_sessions_per_worker": {
wid: len(s) for wid, s in sessions_per_worker.items()
},
"session_hop_count_distribution": dict(hops),
"starvation_session_count": len(starvation),
}
# If a request-metrics file is co-located, also bucket fallback reasons
# against contemporaneous pool state (rough — uses tick nearest to median tick).
metrics_path = timeseries_path.with_name("request-metrics.jsonl")
if metrics_path.exists():
print(f"\n=== Request-metrics summary ({metrics_path.name}) ===")
mrows = _load_jsonl(metrics_path)
modes = Counter(r.get("execution_mode") or "?" for r in mrows)
total = sum(modes.values())
for mode, count in modes.most_common():
print(f" {count:>6} ({100 * count / total:5.1f}%) {mode}")
summary["execution_modes"] = dict(modes)
return summary
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"path",
type=Path,
help="Path to d-pool-timeseries.jsonl OR a run dir containing it",
)
parser.add_argument(
"--json",
action="store_true",
help="Also print a machine-readable JSON summary",
)
args = parser.parse_args()
resolved = _resolve_input(args.path)
summary = analyze(resolved)
if args.json:
print("\n=== JSON summary ===")
print(json.dumps(summary, indent=2, sort_keys=True, default=str))
if __name__ == "__main__":
main()