Add v4 sweep results and post-mortem analysis showing:
- direct-to-D path: 54.3% (1P7D) / 58.0% (2P6D) of requests now use
KVC cleanly. P50=0.5s and TTFT P50=0.043s; this path beats baseline
8DP across the board (P50 -24%, TTFT P50 -54%, TTFT P90 -79%).
- Overall vs baseline (errors+truncated excluded):
v4 2P6D P50=0.85s vs baseline 0.66s (28% slower).
Reason is not errors -- 35% of requests still hit
fallback-large-append-session-cap, where capacity-based
cap = usable_tokens / target_tokens evaluates to 1-2 (not 16)
for large agentic inputs.
- 9-10% errors on KVC variants are mooncake TCP transfer timeouts,
not SGLang logic bugs. Prefill log shows
"Failed to send kv chunk ... 32s timeout ... session not alive".
Errors concentrate in turn>=31 (large inputs) after run >44.8%.
Track:
- docs/KVC_DEBUG_JOURNEY_V1_TO_V4.md: append v4 results table,
per-mode breakdown, and error root cause.
- scripts/analysis/{analyze_v3,analyze_v4,analyze_errors,compare_no_error}.py
- outputs/qwen3-30b-tp1-v{3,4}*/exp*_summary.json (force-added,
small JSON; metrics.jsonl excluded due to size).
- outputs/qwen3-30b-tp1-v{3,4}*/sweep_results.txt
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
137 lines
5.2 KiB
Python
137 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Compare KVC variants vs baseline, EXCLUDING errors and truncated requests."""
|
|
import json
|
|
import numpy as np
|
|
from pathlib import Path
|
|
|
|
OUT = Path("/mnt/kzlin/workflow/pd-hybrid/agentic-pd-hybrid/outputs")
|
|
|
|
DATASETS = [
|
|
("baseline 8DP", OUT / "qwen3-30b-tp1-v2-fixed/exp1_8way_dp_cache_aware_metrics.jsonl"),
|
|
("v3 1P7D", OUT / "qwen3-30b-tp1-v3-kvaware/exp1_1p7d_kvc_kvaware_metrics.jsonl"),
|
|
("v3 2P6D", OUT / "qwen3-30b-tp1-v3-kvaware/exp2_2p6d_kvc_kvaware_metrics.jsonl"),
|
|
("v4 1P7D", OUT / "qwen3-30b-tp1-v4-cap16/exp1_1p7d_kvc_cap16_metrics.jsonl"),
|
|
("v4 2P6D", OUT / "qwen3-30b-tp1-v4-cap16/exp2_2p6d_kvc_cap16_metrics.jsonl"),
|
|
]
|
|
|
|
def load_rows(path):
|
|
rows = []
|
|
with open(path) as f:
|
|
for line in f:
|
|
rows.append(json.loads(line))
|
|
return rows
|
|
|
|
def is_truncated(row):
|
|
a = row.get("actual_output_tokens")
|
|
r = row.get("requested_output_tokens")
|
|
if a is not None and r is not None and r > 1:
|
|
return a < r * 0.5
|
|
return False
|
|
|
|
def stats(values):
|
|
if not values:
|
|
return {"n": 0}
|
|
a = np.array(values)
|
|
return {
|
|
"n": len(a),
|
|
"mean": float(np.mean(a)),
|
|
"p50": float(np.percentile(a, 50)),
|
|
"p90": float(np.percentile(a, 90)),
|
|
"p99": float(np.percentile(a, 99)),
|
|
}
|
|
|
|
def fmt(s, key):
|
|
if s["n"] == 0:
|
|
return "N/A"
|
|
v = s[key]
|
|
return f"{v:.3f}s" if v < 100 else f"{v:.1f}s"
|
|
|
|
results = []
|
|
for label, path in DATASETS:
|
|
if not path.exists():
|
|
print(f"SKIP {label}")
|
|
continue
|
|
rows = load_rows(path)
|
|
total = len(rows)
|
|
err_n = sum(1 for r in rows if r.get("error") is not None)
|
|
trunc_n = sum(1 for r in rows if r.get("error") is None and is_truncated(r))
|
|
|
|
# Filter: error=None AND not truncated AND latency present
|
|
clean = [r for r in rows
|
|
if r.get("error") is None
|
|
and not is_truncated(r)
|
|
and r.get("latency_s") is not None]
|
|
|
|
lats = [r["latency_s"] for r in clean]
|
|
ttfts = [r["ttft_s"] for r in clean if r.get("ttft_s") is not None]
|
|
|
|
results.append({
|
|
"label": label,
|
|
"total": total,
|
|
"err": err_n,
|
|
"trunc": trunc_n,
|
|
"clean_n": len(clean),
|
|
"lat": stats(lats),
|
|
"ttft": stats(ttfts),
|
|
})
|
|
|
|
# Print comparison table
|
|
print(f"\n{'='*100}")
|
|
print("LATENCY (excluding errors AND truncated)")
|
|
print(f"{'='*100}")
|
|
print(f"{'config':<16}{'total':>7}{'err':>6}{'trunc':>7}{'clean':>7} {'mean':>9}{'P50':>9}{'P90':>9}{'P99':>9}")
|
|
for r in results:
|
|
print(f"{r['label']:<16}{r['total']:>7}{r['err']:>6}{r['trunc']:>7}{r['clean_n']:>7} "
|
|
f"{fmt(r['lat'],'mean'):>9}{fmt(r['lat'],'p50'):>9}{fmt(r['lat'],'p90'):>9}{fmt(r['lat'],'p99'):>9}")
|
|
|
|
print(f"\n{'='*100}")
|
|
print("TTFT (excluding errors AND truncated)")
|
|
print(f"{'='*100}")
|
|
print(f"{'config':<16}{'clean':>7} {'mean':>9}{'P50':>9}{'P90':>9}{'P99':>9}")
|
|
for r in results:
|
|
print(f"{r['label']:<16}{r['clean_n']:>7} "
|
|
f"{fmt(r['ttft'],'mean'):>9}{fmt(r['ttft'],'p50'):>9}{fmt(r['ttft'],'p90'):>9}{fmt(r['ttft'],'p99'):>9}")
|
|
|
|
# Also: per-execution-mode breakdown for v4 only (the most interesting)
|
|
print(f"\n{'='*100}")
|
|
print("V4 2P6D: per-execution-mode (excluding errors and truncated)")
|
|
print(f"{'='*100}")
|
|
v4_2p6d = next((p for l, p in DATASETS if l == "v4 2P6D"), None)
|
|
if v4_2p6d:
|
|
rows = load_rows(v4_2p6d)
|
|
clean = [r for r in rows if r.get("error") is None and not is_truncated(r)]
|
|
from collections import Counter
|
|
modes = Counter(r["execution_mode"] for r in clean)
|
|
print(f"{'mode':<55}{'n':>7}{'%':>7} {'mean':>9}{'P50':>9}{'P90':>9}{'P99':>9}")
|
|
for mode, count in modes.most_common(10):
|
|
m_rows = [r for r in clean if r["execution_mode"] == mode]
|
|
s = stats([r["latency_s"] for r in m_rows])
|
|
pct = count/len(clean)*100
|
|
print(f" {mode:<53}{count:>7}{pct:>6.1f}% {fmt(s,'mean'):>9}{fmt(s,'p50'):>9}{fmt(s,'p90'):>9}{fmt(s,'p99'):>9}")
|
|
|
|
# Also: WHAT IF we only count direct-to-D? (Pure KVC performance)
|
|
print(f"\n{'='*100}")
|
|
print("Pure KVC (kvcache-direct-to-d-session ONLY) vs Baseline")
|
|
print(f"{'='*100}")
|
|
for label, path in DATASETS:
|
|
if not path.exists() or "1P7D" not in label and "2P6D" not in label:
|
|
continue
|
|
rows = load_rows(path)
|
|
direct = [r for r in rows
|
|
if r.get("error") is None and not is_truncated(r)
|
|
and r.get("execution_mode") == "kvcache-direct-to-d-session"]
|
|
if not direct:
|
|
continue
|
|
s_lat = stats([r["latency_s"] for r in direct])
|
|
s_ttft = stats([r["ttft_s"] for r in direct if r.get("ttft_s") is not None])
|
|
print(f"{label:<16}n={s_lat['n']:>5} lat: P50={fmt(s_lat,'p50')} P90={fmt(s_lat,'p90')} ttft: P50={fmt(s_ttft,'p50')} P90={fmt(s_ttft,'p90')}")
|
|
|
|
# Baseline for reference (already non-fallback by definition)
|
|
print()
|
|
baseline_path = OUT / "qwen3-30b-tp1-v2-fixed/exp1_8way_dp_cache_aware_metrics.jsonl"
|
|
baseline_rows = load_rows(baseline_path)
|
|
clean = [r for r in baseline_rows if r.get("error") is None and not is_truncated(r)]
|
|
s_lat = stats([r["latency_s"] for r in clean])
|
|
s_ttft = stats([r["ttft_s"] for r in clean if r.get("ttft_s") is not None])
|
|
print(f"{'baseline 8DP':<16}n={s_lat['n']:>5} lat: P50={fmt(s_lat,'p50')} P90={fmt(s_lat,'p90')} ttft: P50={fmt(s_ttft,'p50')} P90={fmt(s_ttft,'p90')}")
|