"""Writers and report builders for trace-analysis output artifacts.

Produces normalized record dumps (jsonl/csv/parquet), a per-request
feature CSV, a JSON summary, and a Markdown report.
"""

import csv
import json
from collections import Counter
from pathlib import Path

from .features import feature_to_row
from .helpers import series_stats
from .parser import flatten_record, record_to_dict


def ensure_output_dir(path):
    """Create *path* (including parents) if needed and return it."""
    path.mkdir(parents=True, exist_ok=True)
    return path


def write_jsonl(path, rows):
    """Write *rows* (dicts) to *path*, one JSON object per line."""
    with open(path, "w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, ensure_ascii=False) + "\n")


def write_csv(path, rows):
    """Write *rows* (dicts) to *path* as CSV.

    An empty *rows* produces an empty file.  The header is the union of
    keys across all rows in first-seen order, so a row carrying a key
    absent from the first row no longer makes DictWriter raise
    ValueError; missing values render as empty cells (DictWriter's
    default restval).
    """
    if not rows:
        with open(path, "w", encoding="utf-8", newline="") as handle:
            handle.write("")
        return
    # dict.fromkeys deduplicates while preserving insertion order.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    with open(path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def write_parquet(path, rows):
    """Write *rows* to *path* in Parquet format.

    Raises RuntimeError (chained from ImportError) when pyarrow is not
    installed; pyarrow is imported lazily so the rest of the module works
    without it.
    """
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError as exc:
        raise RuntimeError("Parquet output requires pyarrow to be installed.") from exc
    table = pa.Table.from_pylist(rows)
    pq.write_table(table, path)


def write_normalized(records, output_dir, output_format="jsonl"):
    """Write normalized *records* into *output_dir* and return the file path.

    *output_format* is one of "jsonl", "csv", or "parquet"; anything else
    raises ValueError.  Rows are built inside each branch so the csv path
    (which uses the flattened representation instead) does no wasted work.
    """
    output_dir = ensure_output_dir(output_dir)
    if output_format == "jsonl":
        path = output_dir / "normalized.jsonl"
        write_jsonl(path, [record_to_dict(record) for record in records])
        return path
    if output_format == "csv":
        # CSV uses the flattened form so nested fields become columns.
        path = output_dir / "normalized.csv"
        write_csv(path, [flatten_record(record) for record in records])
        return path
    if output_format == "parquet":
        path = output_dir / "normalized.parquet"
        write_parquet(path, [record_to_dict(record) for record in records])
        return path
    raise ValueError(f"Unsupported format: {output_format}")


def write_features(features, output_dir):
    """Write one CSV row per feature to features.csv and return its path."""
    output_dir = ensure_output_dir(output_dir)
    path = output_dir / "features.csv"
    write_csv(path, [feature_to_row(feature) for feature in features])
    return path


def _top_requests(features, predicate, row_builder, sort_key, limit=10):
    """Rows for features matching *predicate*, sorted descending by *sort_key*, capped at *limit*."""
    rows = [row_builder(feature) for feature in features if predicate(feature)]
    rows.sort(key=sort_key, reverse=True)
    return rows[:limit]


def build_summary(records, features):
    """Aggregate *records* and per-request *features* into the summary dict.

    The returned mapping feeds both summary.json and the Markdown report;
    its key layout is a de-facto schema — do not rename keys casually.
    """
    model_counts = Counter(feature.model or "unknown" for feature in features)
    status_code_counts = Counter(feature.status_code or "unknown" for feature in features)

    # Role-transition totals summed across all requests.
    role_transition_counts = Counter()
    for feature in features:
        role_transition_counts["assistant->tool"] += feature.assistant_to_tool_count
        role_transition_counts["tool->assistant"] += feature.tool_to_assistant_count
        role_transition_counts["tool->tool"] += feature.tool_to_tool_count
        role_transition_counts["assistant->user"] += feature.assistant_to_user_count
        role_transition_counts["user->assistant"] += feature.user_to_assistant_count

    latency_stats = series_stats([feature.latency_ms for feature in features])
    cache_ratio_stats = series_stats([feature.cache_hit_ratio for feature in features])
    cached_token_stats = series_stats([feature.cached_tokens for feature in features])
    declared_tool_stats = series_stats([feature.declared_tool_count for feature in features])
    burst_stats = series_stats([feature.max_consecutive_tool_msgs for feature in features])

    high_burst_requests = _top_requests(
        features,
        predicate=lambda f: f.tool_burst_alert,
        row_builder=lambda f: {
            "request_id": f.request_id,
            "session_id": f.session_id,
            "max_consecutive_tool_msgs": f.max_consecutive_tool_msgs,
            "tool_to_tool_count": f.tool_to_tool_count,
        },
        sort_key=lambda item: (
            item["max_consecutive_tool_msgs"],
            item["tool_to_tool_count"],
        ),
    )
    slow_despite_cache = _top_requests(
        features,
        predicate=lambda f: "slow-despite-cache" in f.pattern_labels,
        row_builder=lambda f: {
            "request_id": f.request_id,
            "session_id": f.session_id,
            "latency_ms": f.latency_ms,
            "cache_hit_ratio": f.cache_hit_ratio,
        },
        sort_key=lambda item: item["latency_ms"],
    )
    long_context_no_cache = _top_requests(
        features,
        predicate=lambda f: "long-context-no-cache" in f.pattern_labels,
        row_builder=lambda f: {
            "request_id": f.request_id,
            "session_id": f.session_id,
            "input_tokens": f.input_tokens,
            "cache_hit_ratio": f.cache_hit_ratio,
        },
        sort_key=lambda item: item["input_tokens"],
    )

    cache_buckets = []
    # Upper bound 1.01 keeps a cache_hit_ratio of exactly 1.0 inside the
    # last half-open bucket.
    for label, low, high in [
        ("lt_0_2", 0.0, 0.2),
        ("0_2_to_0_8", 0.2, 0.8),
        ("ge_0_8", 0.8, 1.01),
    ]:
        bucket = [feature for feature in features if low <= feature.cache_hit_ratio < high]
        cache_buckets.append(
            {
                "bucket": label,
                "count": len(bucket),
                "avg_latency_ms": series_stats(
                    [feature.latency_ms for feature in bucket]
                )["mean"],
                "avg_cache_hit_ratio": series_stats(
                    [feature.cache_hit_ratio for feature in bucket]
                )["mean"],
            }
        )

    return {
        "record_count": len(records),
        # NOTE(review): "1000" is treated as a success code alongside "200"
        # — presumably a provider-specific convention; confirm upstream.
        "success_count": sum(
            1 for feature in features if feature.status_code in {"1000", "200"}
        ),
        "session_count": len(
            {record.meta.session_id for record in records if record.meta.session_id}
        ),
        "model_counts": dict(model_counts),
        "status_code_counts": dict(status_code_counts),
        "thresholds": {
            "long_context": 32000,
            "high_cache": 0.8,
            "tool_burst_alert": 4,
            "tool_loop_alert": 3,
            "slow_request_p90_latency_ms": latency_stats["p90"],
        },
        "tool_patterns": {
            "role_transitions": dict(role_transition_counts),
            "declared_tool_count": declared_tool_stats,
            "max_consecutive_tool_msgs": burst_stats,
            "tool_burst_alert_count": sum(feature.tool_burst_alert for feature in features),
            "tool_loop_alert_count": sum(feature.tool_loop_alert for feature in features),
            "high_burst_requests": high_burst_requests,
        },
        "cache_patterns": {
            "cached_tokens": cached_token_stats,
            "cache_hit_ratio": cache_ratio_stats,
            "latency_ms": latency_stats,
            "cache_buckets": cache_buckets,
        },
        "anomalies": {
            "slow_despite_cache": slow_despite_cache,
            "long_context_no_cache": long_context_no_cache,
        },
    }


def _format_top_requests(rows, columns):
    """Render *rows* as a Markdown table over *columns*; "_none_" when empty."""
    if not rows:
        return "_none_"
    header = "| " + " | ".join(columns) + " |"
    divider = "| " + " | ".join(["---"] * len(columns)) + " |"
    lines = [header, divider]
    for row in rows:
        cells = " | ".join(_render_value(row.get(column, "")) for column in columns)
        lines.append("| " + cells + " |")
    return "\n".join(lines)


def _render_value(value):
    """Format floats to at most 4 decimals (trailing zeros trimmed); str() otherwise."""
    if isinstance(value, float):
        return f"{value:.4f}".rstrip("0").rstrip(".")
    return str(value)


def _render_mapping(mapping):
    """Recursively round floats to 4 decimals; a dict is serialized to JSON text.

    NOTE(review): a dict nested inside another dict is JSON-encoded into a
    string before the outer dump; callers here only pass flat mappings, so
    this never triggers in practice.
    """
    if isinstance(mapping, dict):
        rendered = {key: _render_mapping(value) for key, value in mapping.items()}
        return json.dumps(rendered, ensure_ascii=False)
    if isinstance(mapping, list):
        return [_render_mapping(value) for value in mapping]
    if isinstance(mapping, float):
        return float(f"{mapping:.4f}")
    return mapping


def build_markdown_report(summary):
    """Render the *summary* mapping (from build_summary) as a Markdown string."""
    lines = [
        "# Trace Analysis Report",
        "",
        "## Data Overview",
        f"- Records: {summary['record_count']}",
        f"- Success count: {summary['success_count']}",
        f"- Session count: {summary['session_count']}",
        f"- Models: {_render_mapping(summary['model_counts'])}",
        f"- Status codes: {_render_mapping(summary['status_code_counts'])}",
        "",
        "## Tool Patterns",
        f"- Role transitions: {_render_mapping(summary['tool_patterns']['role_transitions'])}",
        f"- Declared tool count stats: {_render_mapping(summary['tool_patterns']['declared_tool_count'])}",
        f"- Max consecutive tool msg stats: {_render_mapping(summary['tool_patterns']['max_consecutive_tool_msgs'])}",
        f"- Tool burst alerts: {summary['tool_patterns']['tool_burst_alert_count']}",
        f"- Tool loop alerts: {summary['tool_patterns']['tool_loop_alert_count']}",
        "",
        "High burst requests:",
        _format_top_requests(
            summary["tool_patterns"]["high_burst_requests"],
            ["request_id", "session_id", "max_consecutive_tool_msgs", "tool_to_tool_count"],
        ),
        "",
        "## Cache Patterns",
        f"- Cached token stats: {_render_mapping(summary['cache_patterns']['cached_tokens'])}",
        f"- Cache hit ratio stats: {_render_mapping(summary['cache_patterns']['cache_hit_ratio'])}",
        f"- Latency stats: {_render_mapping(summary['cache_patterns']['latency_ms'])}",
        "",
        "Cache buckets:",
        _format_top_requests(
            summary["cache_patterns"]["cache_buckets"],
            ["bucket", "count", "avg_latency_ms", "avg_cache_hit_ratio"],
        ),
        "",
        "## Anomalies",
        "Slow despite cache:",
        _format_top_requests(
            summary["anomalies"]["slow_despite_cache"],
            ["request_id", "session_id", "latency_ms", "cache_hit_ratio"],
        ),
        "",
        "Long context no cache:",
        _format_top_requests(
            summary["anomalies"]["long_context_no_cache"],
            ["request_id", "session_id", "input_tokens", "cache_hit_ratio"],
        ),
        "",
    ]
    return "\n".join(lines)


def write_report(records, features, output_dir):
    """Write summary.json and report.md into *output_dir*.

    Returns a (summary_path, report_path) tuple.
    """
    output_dir = ensure_output_dir(output_dir)
    summary = build_summary(records, features)
    summary_path = output_dir / "summary.json"
    with open(summary_path, "w", encoding="utf-8") as handle:
        json.dump(summary, handle, ensure_ascii=False, indent=2)
    report_path = output_dir / "report.md"
    with open(report_path, "w", encoding="utf-8") as handle:
        handle.write(build_markdown_report(summary))
    return summary_path, report_path