fix(metrics): exclude aborted requests from latency/ttft/tpot stats

The old filter `if row.latency_s is not None` accepted SGLang's fast
input-length-aborts (latency_s ~ 0.08s, finish_reason='abort/BadRequest')
as if they were successful zero-cost requests. This deflated mean/p50
of any run where the model rejected oversized inputs.

Impact on existing comparisons (ts=1 4-run validation + v2):
  KVC v2 has 40 aborts + 5 ReadTimeouts (was reported as just 5);
  DP 4w  has 67 aborts (was reported as 5).
Both runs have abort behavior; the asymmetry (40 vs 67) is purely from
SGLang's mem-fraction-derived max-input-len: KVC decode-only worker gets
~10 GB free GPU mem -> max-input=92098, DP fused worker gets ~9 GB ->
max-input=87811, because DP also needs chunked-prefill workspace.

The KVC-vs-DP latency-win direction holds and widens slightly under the
fixed filter (lat mean delta: -0.8% -> -1.4%); see V2_DEEP_ANALYSIS_ZH
§4.3 for the recomputed table.

Changes:
- metrics.py: new _is_failed_request(row) helper; latency/ttft/tpot
  stats now exclude both errors and aborts. New summary fields
  abort_count and failure_count expose the counts directly.
- scripts/analysis/recompute_summary.py: re-derives summary.json from
  existing metrics.jsonl using the fixed code, with optional --diff
  against the old buggy summary for inspection.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
kzlin
2026-05-11 17:29:18 +08:00
parent 0c25168cad
commit 5eac9b4f6b
2 changed files with 130 additions and 3 deletions

View File

@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""Re-derive summary.json from existing metrics.jsonl using the fixed metrics.py.
Bug fixed: requests aborted by SGLang (e.g. input > max-input-len returns
a fast 400 with latency_s ~ 0.08s) were previously counted in latency_stats
as if successful, deflating mean/p50/p90. The fixed metrics.py excludes
all failed requests (errors or aborts) from latency/ttft/tpot stats and
exposes abort_count / failure_count.
Usage:
python3 scripts/analysis/recompute_summary.py path/to/metrics.jsonl ...
python3 scripts/analysis/recompute_summary.py --diff path/to/metrics.jsonl path/to/old_summary.json
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src"))
from agentic_pd_hybrid.metrics import RequestMetrics, write_summary_json
def load_rows(metrics_path: Path) -> list[RequestMetrics]:
rows = []
field_names = {f for f in RequestMetrics.__dataclass_fields__}
with metrics_path.open() as handle:
for line in handle:
line = line.strip()
if not line:
continue
raw = json.loads(line)
kwargs = {k: raw.get(k) for k in field_names}
rows.append(RequestMetrics(**kwargs))
return rows
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("metrics_paths", nargs="+", type=Path)
parser.add_argument(
"--out",
type=Path,
default=None,
help="output summary path (default: alongside metrics with .recomputed_summary.json)",
)
parser.add_argument(
"--diff",
action="store_true",
help="print before/after diff against the old <metrics>.summary.json",
)
args = parser.parse_args()
for metrics_path in args.metrics_paths:
rows = load_rows(metrics_path)
out_path = args.out or metrics_path.with_suffix(".recomputed_summary.json")
write_summary_json(
out_path,
rows,
trace_path=metrics_path,
router_url=None,
)
new = json.load(out_path.open())
print(f"\n=== {metrics_path} ===")
print(f" written: {out_path}")
print(f" total rows: {new['request_count']}")
print(f" error_count: {new['error_count']}")
print(f" abort_count: {new.get('abort_count', '?')}")
print(f" failure_count: {new.get('failure_count', '?')}")
ls = new.get("latency_stats_s", {}) or {}
ts = new.get("ttft_stats_s", {}) or {}
print(f" lat: n={ls.get('count')} mean={ls.get('mean'):.4f} p50={ls.get('p50'):.4f} p90={ls.get('p90'):.4f} p99={ls.get('p99'):.4f}")
print(f" ttft: n={ts.get('count')} mean={ts.get('mean'):.4f} p50={ts.get('p50'):.4f} p90={ts.get('p90'):.4f} p99={ts.get('p99'):.4f}")
if args.diff:
# find old summary (sibling file)
candidates = [
metrics_path.parent / f"{metrics_path.stem}.summary.json",
metrics_path.with_suffix(".summary.json"),
]
old_path = next((p for p in candidates if p.exists()), None)
if old_path:
old = json.load(old_path.open())
print(f" vs old {old_path}:")
old_ls = old.get("latency_stats_s", {}) or {}
old_ts = old.get("ttft_stats_s", {}) or {}
for k in ("count", "mean", "p50", "p90", "p99"):
o = old_ls.get(k)
n = ls.get(k)
if o is not None and n is not None:
delta = n - o
print(f" lat.{k}: {o:.4f} -> {n:.4f} ({delta:+.4f})")
for k in ("count", "mean", "p50", "p90", "p99"):
o = old_ts.get(k)
n = ts.get(k)
if o is not None and n is not None:
delta = n - o
print(f" ttft.{k}: {o:.4f} -> {n:.4f} ({delta:+.4f})")
if __name__ == "__main__":
main()

View File

@@ -114,6 +114,16 @@ def write_metrics_jsonl(path: Path, rows: list[RequestMetrics]) -> None:
handle.write(json.dumps(asdict(row), sort_keys=True) + "\n")
def _is_failed_request(row: RequestMetrics) -> bool:
if row.error is not None:
return True
if row.finish_reason is not None:
fr = str(row.finish_reason).lower()
if "abort" in fr or "badrequest" in fr:
return True
return False
def write_summary_json(
path: Path,
rows: list[RequestMetrics],
@@ -121,9 +131,10 @@ def write_summary_json(
trace_path: Path,
router_url: str | None,
) -> None:
latencies = [row.latency_s for row in rows if row.latency_s is not None]
ttfts = [row.ttft_s for row in rows if row.ttft_s is not None]
tpots = [row.tpot_s for row in rows if row.tpot_s is not None]
successful = [row for row in rows if not _is_failed_request(row)]
latencies = [row.latency_s for row in successful if row.latency_s is not None]
ttfts = [row.ttft_s for row in successful if row.ttft_s is not None]
tpots = [row.tpot_s for row in successful if row.tpot_s is not None]
per_decode_load = Counter(row.assigned_decode_node for row in rows)
per_prefill_load = Counter(row.assigned_prefill_node for row in rows)
prefill_priorities = Counter(
@@ -167,6 +178,17 @@ def write_summary_json(
str(key): value for key, value in sorted(decode_priorities.items())
},
"error_count": sum(1 for row in rows if row.error is not None),
"abort_count": sum(
1
for row in rows
if row.error is None
and row.finish_reason is not None
and (
"abort" in str(row.finish_reason).lower()
or "badrequest" in str(row.finish_reason).lower()
)
),
"failure_count": sum(1 for row in rows if _is_failed_request(row)),
"truncated_request_count": sum(
1
for row in rows