Files
agentic-pd-hybrid/scripts/analysis/recompute_summary.py
kzlin 5eac9b4f6b fix(metrics): exclude aborted requests from latency/ttft/tpot stats
The old filter `if row.latency_s is not None` accepted SGLang's fast
input-length-aborts (latency_s ~ 0.08s, finish_reason='abort/BadRequest')
as if they were successful zero-cost requests. This deflated mean/p50
of any run where the model rejected oversized inputs.

Impact on existing comparisons (ts=1 4-run validation + v2):
  KVC v2 has 40 aborts + 5 ReadTimeouts (was reported as just 5);
  DP 4w  has 67 aborts (was reported as 5).
Both runs have abort behavior; the asymmetry (40 vs 67) is purely from
SGLang's mem-fraction-derived max-input-len: KVC decode-only worker gets
~10 GB free GPU mem -> max-input=92098, DP fused worker gets ~9 GB ->
max-input=87811, because DP also needs chunked-prefill workspace.

The KVC-vs-DP latency-win direction holds and widens slightly under the
fixed filter (lat mean delta: -0.8% -> -1.4%); see V2_DEEP_ANALYSIS_ZH
§4.3 for the recomputed table.

Changes:
- metrics.py: new _is_failed_request(row) helper; latency/ttft/tpot
  stats now exclude both errors and aborts. New summary fields
  abort_count and failure_count expose the counts directly.
- scripts/analysis/recompute_summary.py: re-derives summary.json from
  existing metrics.jsonl using the fixed code, with optional --diff
  against the old buggy summary for inspection.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 17:29:18 +08:00

106 lines
4.0 KiB
Python

#!/usr/bin/env python3
"""Re-derive summary.json from existing metrics.jsonl using the fixed metrics.py.
Bug fixed: requests aborted by SGLang (e.g. input > max-input-len returns
a fast 400 with latency_s ~ 0.08s) were previously counted in latency_stats
as if successful, deflating mean/p50/p90. The fixed metrics.py excludes
all failed requests (errors or aborts) from latency/ttft/tpot stats and
exposes abort_count / failure_count.
Usage:
python3 scripts/analysis/recompute_summary.py path/to/metrics.jsonl ...
python3 scripts/analysis/recompute_summary.py --diff path/to/metrics.jsonl path/to/old_summary.json
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src"))
from agentic_pd_hybrid.metrics import RequestMetrics, write_summary_json
def load_rows(metrics_path: Path) -> list[RequestMetrics]:
rows = []
field_names = {f for f in RequestMetrics.__dataclass_fields__}
with metrics_path.open() as handle:
for line in handle:
line = line.strip()
if not line:
continue
raw = json.loads(line)
kwargs = {k: raw.get(k) for k in field_names}
rows.append(RequestMetrics(**kwargs))
return rows
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("metrics_paths", nargs="+", type=Path)
parser.add_argument(
"--out",
type=Path,
default=None,
help="output summary path (default: alongside metrics with .recomputed_summary.json)",
)
parser.add_argument(
"--diff",
action="store_true",
help="print before/after diff against the old <metrics>.summary.json",
)
args = parser.parse_args()
for metrics_path in args.metrics_paths:
rows = load_rows(metrics_path)
out_path = args.out or metrics_path.with_suffix(".recomputed_summary.json")
write_summary_json(
out_path,
rows,
trace_path=metrics_path,
router_url=None,
)
new = json.load(out_path.open())
print(f"\n=== {metrics_path} ===")
print(f" written: {out_path}")
print(f" total rows: {new['request_count']}")
print(f" error_count: {new['error_count']}")
print(f" abort_count: {new.get('abort_count', '?')}")
print(f" failure_count: {new.get('failure_count', '?')}")
ls = new.get("latency_stats_s", {}) or {}
ts = new.get("ttft_stats_s", {}) or {}
print(f" lat: n={ls.get('count')} mean={ls.get('mean'):.4f} p50={ls.get('p50'):.4f} p90={ls.get('p90'):.4f} p99={ls.get('p99'):.4f}")
print(f" ttft: n={ts.get('count')} mean={ts.get('mean'):.4f} p50={ts.get('p50'):.4f} p90={ts.get('p90'):.4f} p99={ts.get('p99'):.4f}")
if args.diff:
# find old summary (sibling file)
candidates = [
metrics_path.parent / f"{metrics_path.stem}.summary.json",
metrics_path.with_suffix(".summary.json"),
]
old_path = next((p for p in candidates if p.exists()), None)
if old_path:
old = json.load(old_path.open())
print(f" vs old {old_path}:")
old_ls = old.get("latency_stats_s", {}) or {}
old_ts = old.get("ttft_stats_s", {}) or {}
for k in ("count", "mean", "p50", "p90", "p99"):
o = old_ls.get(k)
n = ls.get(k)
if o is not None and n is not None:
delta = n - o
print(f" lat.{k}: {o:.4f} -> {n:.4f} ({delta:+.4f})")
for k in ("count", "mean", "p50", "p90", "p99"):
o = old_ts.get(k)
n = ts.get(k)
if o is not None and n is not None:
delta = n - o
print(f" ttft.{k}: {o:.4f} -> {n:.4f} ({delta:+.4f})")
if __name__ == "__main__":
main()