fix(metrics): exclude aborted requests from latency/ttft/tpot stats
The old filter `if row.latency_s is not None` accepted SGLang's fast input-length-aborts (latency_s ~ 0.08s, finish_reason='abort/BadRequest') as if they were successful zero-cost requests. This deflated mean/p50 of any run where the model rejected oversized inputs. Impact on existing comparisons (ts=1 4-run validation + v2): KVC v2 has 40 aborts + 5 ReadTimeouts (was reported as just 5); DP 4w has 67 aborts (was reported as 5). Both runs have abort behavior; the asymmetry (40 vs 67) is purely from SGLang's mem-fraction-derived max-input-len: KVC decode-only worker gets ~10 GB free GPU mem -> max-input=92098, DP fused worker gets ~9 GB -> max-input=87811, because DP also needs chunked-prefill workspace. The KVC-vs-DP latency-win direction holds and widens slightly under the fixed filter (lat mean delta: -0.8% -> -1.4%); see V2_DEEP_ANALYSIS_ZH §4.3 for the recomputed table. Changes: - metrics.py: new _is_failed_request(row) helper; latency/ttft/tpot stats now exclude both errors and aborts. New summary fields abort_count and failure_count expose the counts directly. - scripts/analysis/recompute_summary.py: re-derives summary.json from existing metrics.jsonl using the fixed code, with optional --diff against the old buggy summary for inspection. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
105
scripts/analysis/recompute_summary.py
Normal file
105
scripts/analysis/recompute_summary.py
Normal file
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Re-derive summary.json from existing metrics.jsonl using the fixed metrics.py.
|
||||
|
||||
Bug fixed: requests aborted by SGLang (e.g. input > max-input-len returns
|
||||
a fast 400 with latency_s ~ 0.08s) were previously counted in latency_stats
|
||||
as if successful, deflating mean/p50/p90. The fixed metrics.py excludes
|
||||
all failed requests (errors or aborts) from latency/ttft/tpot stats and
|
||||
exposes abort_count / failure_count.
|
||||
|
||||
Usage:
|
||||
python3 scripts/analysis/recompute_summary.py path/to/metrics.jsonl ...
|
||||
python3 scripts/analysis/recompute_summary.py --diff path/to/metrics.jsonl path/to/old_summary.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src"))
|
||||
|
||||
from agentic_pd_hybrid.metrics import RequestMetrics, write_summary_json
|
||||
|
||||
|
||||
def load_rows(metrics_path: Path) -> list[RequestMetrics]:
|
||||
rows = []
|
||||
field_names = {f for f in RequestMetrics.__dataclass_fields__}
|
||||
with metrics_path.open() as handle:
|
||||
for line in handle:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
raw = json.loads(line)
|
||||
kwargs = {k: raw.get(k) for k in field_names}
|
||||
rows.append(RequestMetrics(**kwargs))
|
||||
return rows
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("metrics_paths", nargs="+", type=Path)
|
||||
parser.add_argument(
|
||||
"--out",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="output summary path (default: alongside metrics with .recomputed_summary.json)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--diff",
|
||||
action="store_true",
|
||||
help="print before/after diff against the old <metrics>.summary.json",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
for metrics_path in args.metrics_paths:
|
||||
rows = load_rows(metrics_path)
|
||||
out_path = args.out or metrics_path.with_suffix(".recomputed_summary.json")
|
||||
write_summary_json(
|
||||
out_path,
|
||||
rows,
|
||||
trace_path=metrics_path,
|
||||
router_url=None,
|
||||
)
|
||||
new = json.load(out_path.open())
|
||||
print(f"\n=== {metrics_path} ===")
|
||||
print(f" written: {out_path}")
|
||||
print(f" total rows: {new['request_count']}")
|
||||
print(f" error_count: {new['error_count']}")
|
||||
print(f" abort_count: {new.get('abort_count', '?')}")
|
||||
print(f" failure_count: {new.get('failure_count', '?')}")
|
||||
ls = new.get("latency_stats_s", {}) or {}
|
||||
ts = new.get("ttft_stats_s", {}) or {}
|
||||
print(f" lat: n={ls.get('count')} mean={ls.get('mean'):.4f} p50={ls.get('p50'):.4f} p90={ls.get('p90'):.4f} p99={ls.get('p99'):.4f}")
|
||||
print(f" ttft: n={ts.get('count')} mean={ts.get('mean'):.4f} p50={ts.get('p50'):.4f} p90={ts.get('p90'):.4f} p99={ts.get('p99'):.4f}")
|
||||
|
||||
if args.diff:
|
||||
# find old summary (sibling file)
|
||||
candidates = [
|
||||
metrics_path.parent / f"{metrics_path.stem}.summary.json",
|
||||
metrics_path.with_suffix(".summary.json"),
|
||||
]
|
||||
old_path = next((p for p in candidates if p.exists()), None)
|
||||
if old_path:
|
||||
old = json.load(old_path.open())
|
||||
print(f" vs old {old_path}:")
|
||||
old_ls = old.get("latency_stats_s", {}) or {}
|
||||
old_ts = old.get("ttft_stats_s", {}) or {}
|
||||
for k in ("count", "mean", "p50", "p90", "p99"):
|
||||
o = old_ls.get(k)
|
||||
n = ls.get(k)
|
||||
if o is not None and n is not None:
|
||||
delta = n - o
|
||||
print(f" lat.{k}: {o:.4f} -> {n:.4f} ({delta:+.4f})")
|
||||
for k in ("count", "mean", "p50", "p90", "p99"):
|
||||
o = old_ts.get(k)
|
||||
n = ts.get(k)
|
||||
if o is not None and n is not None:
|
||||
delta = n - o
|
||||
print(f" ttft.{k}: {o:.4f} -> {n:.4f} ({delta:+.4f})")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -114,6 +114,16 @@ def write_metrics_jsonl(path: Path, rows: list[RequestMetrics]) -> None:
|
||||
handle.write(json.dumps(asdict(row), sort_keys=True) + "\n")
|
||||
|
||||
|
||||
def _is_failed_request(row: RequestMetrics) -> bool:
|
||||
if row.error is not None:
|
||||
return True
|
||||
if row.finish_reason is not None:
|
||||
fr = str(row.finish_reason).lower()
|
||||
if "abort" in fr or "badrequest" in fr:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def write_summary_json(
|
||||
path: Path,
|
||||
rows: list[RequestMetrics],
|
||||
@@ -121,9 +131,10 @@ def write_summary_json(
|
||||
trace_path: Path,
|
||||
router_url: str | None,
|
||||
) -> None:
|
||||
latencies = [row.latency_s for row in rows if row.latency_s is not None]
|
||||
ttfts = [row.ttft_s for row in rows if row.ttft_s is not None]
|
||||
tpots = [row.tpot_s for row in rows if row.tpot_s is not None]
|
||||
successful = [row for row in rows if not _is_failed_request(row)]
|
||||
latencies = [row.latency_s for row in successful if row.latency_s is not None]
|
||||
ttfts = [row.ttft_s for row in successful if row.ttft_s is not None]
|
||||
tpots = [row.tpot_s for row in successful if row.tpot_s is not None]
|
||||
per_decode_load = Counter(row.assigned_decode_node for row in rows)
|
||||
per_prefill_load = Counter(row.assigned_prefill_node for row in rows)
|
||||
prefill_priorities = Counter(
|
||||
@@ -167,6 +178,17 @@ def write_summary_json(
|
||||
str(key): value for key, value in sorted(decode_priorities.items())
|
||||
},
|
||||
"error_count": sum(1 for row in rows if row.error is not None),
|
||||
"abort_count": sum(
|
||||
1
|
||||
for row in rows
|
||||
if row.error is None
|
||||
and row.finish_reason is not None
|
||||
and (
|
||||
"abort" in str(row.finish_reason).lower()
|
||||
or "badrequest" in str(row.finish_reason).lower()
|
||||
)
|
||||
),
|
||||
"failure_count": sum(1 for row in rows if _is_failed_request(row)),
|
||||
"truncated_request_count": sum(
|
||||
1
|
||||
for row in rows
|
||||
|
||||
Reference in New Issue
Block a user