"""Format, sort, sessionize, and release-export raw request traces."""

from __future__ import annotations

import hashlib
import heapq
import json
import os
import shutil
import subprocess
import sys
import tempfile
from array import array
from concurrent.futures import ProcessPoolExecutor, as_completed
from contextlib import contextmanager, nullcontext
from dataclasses import asdict
from pathlib import Path
from typing import Iterator, TextIO

from tokenizers import Tokenizer
from tqdm.auto import tqdm

from trace_analyzer.helpers import normalize_unicode_text, parse_jsonish, safe_int
from trace_model_meta import infer_model_family_from_request_model, resolve_tokenizer_path

from . import SCHEMA_VERSION
from .raw_parser import get_raw_adapter
from .sessionization import (
    LogicalSessionizer,
    build_message_fingerprints,
    build_sequence_hashes,
    decode_prefix_hashes,
    decode_roles,
    encode_prefix_hashes,
    encode_roles,
    extract_user_id,
)
from .time_windows import infer_time_offset_ms, infer_time_window, parse_time_to_ms


def _is_supported_trace_file(path: Path) -> bool:
    return path.name.endswith(".jsonl") or path.name.endswith(".jsonl.zst")


def derive_trace_name(input_path: str | Path) -> str:
    resolved = Path(input_path)
    if resolved.is_dir():
        return resolved.name
    name = resolved.name
    if name.endswith(".jsonl.zst"):
        return name[: -len(".jsonl.zst")]
    if name.endswith(".jsonl"):
        return name[: -len(".jsonl")]
    return resolved.stem


def default_formatted_name(input_path: str | Path) -> str:
    base_name = derive_trace_name(input_path)
    return base_name if base_name.endswith("-formatted") else f"{base_name}-formatted"


def derive_output_label(input_path: str | Path, *, time_window=None) -> str:
    if time_window is not None and getattr(time_window, "label", None):
        return str(time_window.label)
    return derive_trace_name(input_path)


def discover_source_files(input_dir: str | Path) -> list[Path]:
    root = Path(input_dir)
    if not root.exists():
        raise FileNotFoundError(f"Input path does not exist: {root}")
    if root.is_file():
        if not _is_supported_trace_file(root):
            raise FileNotFoundError(f"Input file must be .jsonl or .jsonl.zst: {root}")
        return [root]
    # Prefer the .jsonl.zst variant when a compressed and a plain file share a stem.
    preferred: dict[str, Path] = {}
    for path in sorted(root.iterdir()):
        if not path.is_file():
            continue
        if path.name.endswith(".jsonl.zst"):
            stem = path.name[: -len(".jsonl.zst")]
            preferred[stem] = path
        elif path.name.endswith(".jsonl"):
            preferred.setdefault(path.stem, path)
    files = [preferred[key] for key in sorted(preferred)]
    if not files:
        raise FileNotFoundError(f"No .jsonl or .jsonl.zst files found under {root}")
    return files


@contextmanager
def open_trace_text(path: str | Path) -> Iterator[TextIO]:
    resolved = Path(path)
    if resolved.suffix == ".zst":
        # Stream compressed traces through zstdcat instead of decompressing to disk.
        proc = subprocess.Popen(
            ["zstdcat", str(resolved)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding="utf-8",
        )
        if proc.stdout is None:
            raise RuntimeError(f"Failed to stream {resolved}")
        try:
            yield proc.stdout
        finally:
            stdout = proc.stdout
            stdout.close()
            stderr = proc.stderr.read() if proc.stderr else ""
            return_code = proc.wait()
            if return_code != 0:
                raise RuntimeError(f"zstdcat failed for {resolved}: {stderr.strip()}")
        return
    with resolved.open("r", encoding="utf-8") as handle:
        yield handle


def _normalize_time_ms(*, raw_time_ms: int, wall_clock_ms: int, time_offset_ms: int) -> int:
    if not raw_time_ms:
        return wall_clock_ms - time_offset_ms if wall_clock_ms and time_offset_ms else wall_clock_ms
    if not wall_clock_ms or not time_offset_ms:
        return raw_time_ms
    # Both timestamps are present: decide whether raw_time_ms already sits on the
    # offset-adjusted clock (delta close to the offset) or on the wall clock
    # (delta close to zero), tolerating up to ten minutes of skew.
    delta_ms = wall_clock_ms - raw_time_ms
    tolerance_ms = 10 * 60 * 1000
    if abs(delta_ms - time_offset_ms) <= tolerance_ms:
        return raw_time_ms
    if abs(delta_ms) <= tolerance_ms:
        return raw_time_ms - time_offset_ms
    return raw_time_ms


def _extract_sort_time_ms(raw: dict, attributes: dict, *, time_offset_ms: int = 0) -> int:
    wall_clock_ms = parse_time_to_ms(str(raw.get("time", ""))) if raw.get("time") else 0
    ready_ms = safe_int(attributes.get("x-dashscope-inner-requestreadytime"))
    if ready_ms:
        return _normalize_time_ms(raw_time_ms=ready_ms, wall_clock_ms=wall_clock_ms, time_offset_ms=time_offset_ms)
    if wall_clock_ms:
        return _normalize_time_ms(raw_time_ms=0, wall_clock_ms=wall_clock_ms, time_offset_ms=time_offset_ms)
    raw_epoch_seconds = safe_int(raw.get("__time__"))
    if raw_epoch_seconds:
        return raw_epoch_seconds * 1000
    return 0


def _extract_response_message(response_payload: dict) -> dict:
    output = response_payload.get("output", {}) if isinstance(response_payload, dict) else {}
    if not isinstance(output, dict):
        return {}
    choices = output.get("choices", [])
    if not isinstance(choices, list) or not choices:
        return {}
    message = choices[0].get("message", {})
    return message if isinstance(message, dict) else {}


def _extract_usage(response_payload: dict) -> dict:
    usage_payload = response_payload.get("usage", {}) if isinstance(response_payload, dict) else {}
    output_payload = response_payload.get("output", {}) if isinstance(response_payload, dict) else {}
    if (not isinstance(usage_payload, dict) or not usage_payload) and isinstance(output_payload, dict):
        usage_payload = output_payload.get("usage", {})
    output_details = parse_jsonish(usage_payload.get("output_tokens_details", {}))
    prompt_details = parse_jsonish(usage_payload.get("prompt_tokens_details", {}))
    return {
        "input_tokens": safe_int(usage_payload.get("input_tokens", usage_payload.get("prompt_tokens"))),
        "output_tokens": safe_int(usage_payload.get("output_tokens", usage_payload.get("completion_tokens"))),
        "total_tokens": safe_int(usage_payload.get("total_tokens")),
        "reasoning_tokens": safe_int(
            output_details.get("reasoning_tokens") if isinstance(output_details, dict) else 0
        ),
        "cached_tokens": safe_int(
            prompt_details.get("cached_tokens") if isinstance(prompt_details, dict) else 0
        ),
    }


def _extract_request_components(raw: dict) -> tuple[dict, dict | None, dict, dict, list]:
    request_params = parse_jsonish(raw.get("request_params", {}))
    response_params = parse_jsonish(raw.get("response_params", {}))
    request_header = request_params.get("header", {}) if isinstance(request_params, dict) else {}
    request_attributes = request_header.get("attributes", {}) if isinstance(request_header, dict) else {}
    request_payload = request_params.get("payload", {}) if isinstance(request_params, dict) else {}
    request_input = request_payload.get("input", {}) if isinstance(request_payload, dict) else {}
    messages = request_input.get("messages", [])
    return request_params, response_params, request_payload, request_attributes, messages


def _build_unified_row_from_components(
    raw: dict,
    *,
    request_params: dict,
    response_params: dict | None,
    request_payload: dict,
    request_attributes: dict,
    messages: list,
    source_file: str,
    source_line: int,
    time_offset_ms: int = 0,
) -> dict:
    adapter = get_raw_adapter(raw)
    request_parameters = request_payload.get("parameters", {}) if isinstance(request_payload, dict) else {}
    response_payload = response_params.get("payload", {}) if isinstance(response_params, dict) else {}
    response_header = response_params.get("header", {}) if isinstance(response_params, dict) else {}
    response_attributes = response_header.get("attributes", {}) if isinstance(response_header, dict) else {}
    sort_time_ms = _extract_sort_time_ms(raw, request_attributes, time_offset_ms=time_offset_ms)
    total_cost_time_ms = safe_int(raw.get("total_cost_time"))
    request_end_time_ms = sort_time_ms + total_cost_time_ms if sort_time_ms else total_cost_time_ms
    declared_tools = request_parameters.get("tools", [])
    canonical_prompt = adapter.build_canonical_prompt(request_payload)
    usage = _extract_usage(response_payload)
    message_events = [asdict(adapter.parse_message(message)) for message in messages if isinstance(message, dict)]
    tool_specs = [asdict(adapter.parse_tool(tool)) for tool in declared_tools if isinstance(tool, dict)]
    role_sequence = [event["role"] for event in message_events]
    user_id = extract_user_id(request_params)
    model_family = infer_model_family_from_request_model(raw.get("request_model")) or "glm5"
    raw_messages = [message for message in messages if isinstance(message, dict)]
    backend_first_request_time_ms = safe_int(response_attributes.get("x-ds-backend-first-request-time"))
    backend_first_response_time_ms = safe_int(response_attributes.get("x-ds-backend-first-response-time"))
    return {
        "schema_version": SCHEMA_VERSION,
        "sort_time_ms": sort_time_ms,
        "meta": {
            "model_family": model_family,
            "request_id": normalize_unicode_text(str(raw.get("request_id", ""))),
            "session_id": "",
            "raw_session_id": normalize_unicode_text(str(raw.get("session_id", ""))),
            "user_id": user_id,
            "parent_request_id": "",
            "parent_chat_id": -1,
            "chat_id": -1,
            "turn": 0,
            "request_model": normalize_unicode_text(str(raw.get("request_model", ""))),
            "time": normalize_unicode_text(str(raw.get("time", ""))),
            "status_code": normalize_unicode_text(str(raw.get("status_code", ""))),
            "status_name": normalize_unicode_text(str(raw.get("status_name", ""))),
            "request_ready_time_ms": sort_time_ms,
            "request_end_time_ms": request_end_time_ms,
            "total_cost_time_ms": total_cost_time_ms,
            "backend_first_request_time_ms": backend_first_request_time_ms,
            "backend_first_response_time_ms": backend_first_response_time_ms,
        },
        "role_sequence": role_sequence,
        "message_events": message_events,
        "declared_tools": tool_specs,
        "usage": usage,
        "canonical_prompt": canonical_prompt,
        "response_message": _extract_response_message(response_payload),
        "raw_messages": raw_messages,
    }


def _has_empty_response_params(raw: dict, response_params) -> bool:
    raw_value = raw.get("response_params")
    if raw_value is None:
        return True
    if isinstance(raw_value, str) and raw_value.strip().lower() in {"", "none", "null"}:
        return True
    return response_params is None or (isinstance(response_params, dict) and not response_params)


def build_unified_row(raw: dict, *, source_file: str, source_line: int, time_offset_ms: int = 0) -> dict:
    request_params, response_params, request_payload, request_attributes, messages = _extract_request_components(raw)
    return _build_unified_row_from_components(
        raw,
        request_params=request_params,
        response_params=response_params,
        request_payload=request_payload,
        request_attributes=request_attributes,
        messages=messages,
        source_file=source_file,
        source_line=source_line,
        time_offset_ms=time_offset_ms,
    )


def _build_unified_row_payload(
    raw: dict,
    *,
    source_file: str,
    source_line: int,
    time_offset_ms: int = 0,
) -> tuple[int, str, str, str, str, str]:
    request_params, response_params, request_payload, request_attributes, messages = _extract_request_components(raw)
    return _build_unified_row_payload_from_components(
        raw,
        request_params=request_params,
        response_params=response_params,
        request_payload=request_payload,
        request_attributes=request_attributes,
        messages=messages,
        source_file=source_file,
        source_line=source_line,
        time_offset_ms=time_offset_ms,
    )


def _build_unified_row_payload_from_components(
    raw: dict,
    *,
    request_params: dict,
    response_params: dict | None,
    request_payload: dict,
    request_attributes: dict,
    messages: list,
    source_file: str,
    source_line: int,
    time_offset_ms: int = 0,
) -> tuple[int, str, str, str, str, str]:
    normalized_messages = [message for message in messages if isinstance(message, dict)]
    row = _build_unified_row_from_components(
        raw,
        request_params=request_params,
        response_params=response_params,
        request_payload=request_payload,
        request_attributes=request_attributes,
        messages=messages,
        source_file=source_file,
        source_line=source_line,
        time_offset_ms=time_offset_ms,
    )
    message_fingerprints = build_message_fingerprints(normalized_messages)
    return (
        safe_int(row.get("sort_time_ms")),
        str(row["meta"].get("user_id", "")),
        str(row["meta"].get("request_id", "")),
        encode_prefix_hashes(build_sequence_hashes(message_fingerprints)),
        encode_roles([str(message.get("role", "")) for message in normalized_messages]),
        json.dumps(row, ensure_ascii=False, separators=(",", ":")),
    )


# Chunks are spilled as tab-separated lines: the sort keys first, then the
# already-serialized row, so the merge phase never has to re-parse JSON.
def _write_chunk(rows: list[tuple[int, int, str, str, str, str, str]], tmp_dir: Path, chunk_index: int) -> Path:
    rows.sort(key=lambda item: (item[0], item[1]))
    path = tmp_dir / f"chunk_{chunk_index:05d}.jsonl"
    with path.open("w", encoding="utf-8") as handle:
        for sort_time_ms, seq, user_id, request_id, sequence_hashes, roles, row_json in rows:
            handle.write(f"{sort_time_ms}\t{seq}\t{user_id}\t{request_id}\t{sequence_hashes}\t{roles}\t{row_json}\n")
    return path


def _iter_chunk_rows(path: Path) -> Iterator[tuple[int, int, str, str, str, str, str]]:
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            sort_text, seq_text, user_id, request_id, sequence_hashes, roles, row_json = line.rstrip("\n").split("\t", 6)
            yield int(sort_text), int(seq_text), user_id, request_id, sequence_hashes, roles, row_json


# Patch a placeholder meta field directly in the serialized row; the builder
# always emits "" / -1 / 0 placeholders, so a single textual replace is safe.
def _replace_json_field_once(row_json: str, *, key: str, value_text: str) -> str:
    target = f'"{key}":""'
    if target in row_json:
        return row_json.replace(target, f'"{key}":{value_text}', 1)
    numeric_target = None
    if key in {"parent_chat_id", "chat_id"}:
        numeric_target = f'"{key}":-1'
    elif key == "turn":
        numeric_target = f'"{key}":0'
    if numeric_target and numeric_target in row_json:
        return row_json.replace(numeric_target, f'"{key}":{value_text}', 1)
    raise ValueError(f"Unable to patch {key} in formatted row json")


def _apply_session_assignment_to_row_json(row_json: str, assignment) -> str:
    patched = _replace_json_field_once(
        row_json,
        key="session_id",
        value_text=json.dumps(assignment.session_id, ensure_ascii=False),
    )
    patched = _replace_json_field_once(
        patched,
        key="parent_request_id",
        value_text=json.dumps(assignment.parent_request_id, ensure_ascii=False),
    )
    patched = _replace_json_field_once(
        patched,
        key="parent_chat_id",
        value_text=str(assignment.parent_chat_id),
    )
    patched = _replace_json_field_once(
        patched,
        key="chat_id",
        value_text=str(assignment.chat_id),
    )
    return _replace_json_field_once(
        patched,
        key="turn",
        value_text=str(assignment.turn),
    )


class _TeeStream:
    def __init__(self, *streams):
        self._streams = [stream for stream in streams if stream is not None]

    def write(self, data):
        for stream in self._streams:
            stream.write(data)
        return len(data)

    def flush(self):
        for stream in self._streams:
            stream.flush()


@contextmanager
def _open_progress_stream(log_file: str | Path | None):
    if log_file is None:
        yield sys.stderr
        return
    path = Path(log_file)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        yield _TeeStream(sys.stderr, handle)


def _resolve_temp_root_dir(*, tmp_dir: str | Path | None, output_path: str | Path) -> Path:
    if tmp_dir is not None:
        root = Path(tmp_dir)
    else:
        root = Path(output_path).parent
    root.mkdir(parents=True, exist_ok=True)
    return root


# 128-bit BLAKE2b fingerprint of a token-id block; the digest covers the block
# length followed by the raw token ids.
def _block_digest(block: list[int]) -> bytes:
    digest = hashlib.blake2b(digest_size=16)
    digest.update(len(block).to_bytes(4, "little", signed=False))
    digest.update(array("I", block).tobytes())
    return digest.digest()


def _load_release_tokenizer(model_family: str) -> Tokenizer:
    resolved = Path(resolve_tokenizer_path(model_family=model_family))
    tokenizer_file = resolved / "tokenizer.json" if resolved.is_dir() else resolved
    return Tokenizer.from_file(str(tokenizer_file))


def _infer_window_start_ms_from_raw_rows(raw_path: Path) -> int:
    with raw_path.open("r", encoding="utf-8") as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                continue
            row = json.loads(stripped)
            meta = row.get("meta", {}) if isinstance(row.get("meta", {}), dict) else {}
            return safe_int(meta.get("request_ready_time_ms", row.get("sort_time_ms")))
    return 0


def _compute_release_row_core(row: dict, *, base_ms: int) -> dict:
    meta = row.get("meta", {}) if isinstance(row.get("meta", {}), dict) else {}
    ready_ms = safe_int(meta.get("request_ready_time_ms", row.get("sort_time_ms")))
    timestamp_ms = ready_ms - base_ms if ready_ms and base_ms else 0
    return {
        "chat_id": safe_int(meta.get("chat_id")),
        "parent_chat_id": safe_int(meta.get("parent_chat_id", -1), default=-1),
        "timestamp": round(timestamp_ms / 1000.0, 3),
        "input_length": safe_int(row.get("usage", {}).get("input_tokens")),
        "output_length": safe_int(row.get("usage", {}).get("output_tokens")),
        "type": "coder",
        "turn": safe_int(meta.get("turn")),
    }


# Split the formatted trace into roughly equal byte ranges, snapping each
# boundary forward to the next newline so no shard starts mid-line.
def _compute_release_segments(path: Path, jobs: int) -> list[tuple[int, int, int]]:
    total_size = path.stat().st_size
    if total_size <= 0:
        return [(0, 0, 0)]
    shard_count = max(1, min(jobs, total_size))
    boundaries = [0]
    with path.open("rb") as handle:
        for index in range(1, shard_count):
            target = total_size * index // shard_count
            handle.seek(target)
            handle.readline()
            boundary = handle.tell()
            if boundary > boundaries[-1]:
                boundaries.append(boundary)
    if boundaries[-1] != total_size:
        boundaries.append(total_size)
    segments: list[tuple[int, int, int]] = []
    for index, (start, end) in enumerate(zip(boundaries, boundaries[1:])):
        if end > start:
            segments.append((index, start, end))
    return segments or [(0, 0, total_size)]


def _build_release_shard(
    *,
    raw_input_path: str,
    shard_index: int,
    start_offset: int,
    end_offset: int,
    shard_output_path: str,
    block_size: int,
    base_ms: int,
) -> dict:
    input_path = Path(raw_input_path)
    output_path = Path(shard_output_path)
    tokenizer_cache: dict[str, Tokenizer] = {}
    row_count = 0
    with input_path.open("rb") as source, output_path.open("w", encoding="utf-8") as destination:
        source.seek(start_offset)
        while source.tell() < end_offset:
            line_bytes = source.readline()
            if not line_bytes:
                break
            stripped = line_bytes.strip()
            if not stripped:
                continue
            row = json.loads(stripped)
            meta = row.get("meta", {}) if isinstance(row.get("meta", {}), dict) else {}
            model_family = str(meta.get("model_family", "") or "glm5")
            tokenizer = tokenizer_cache.get(model_family)
            if tokenizer is None:
                tokenizer = _load_release_tokenizer(model_family)
                tokenizer_cache[model_family] = tokenizer
            token_ids = tokenizer.encode(str(row.get("canonical_prompt", ""))).ids
            digest_hexes = []
            for index in range(0, len(token_ids), block_size):
                block = token_ids[index:index + block_size]
                digest_hexes.append(_block_digest(block).hex())
            core = _compute_release_row_core(row, base_ms=base_ms)
            destination.write(json.dumps(core, ensure_ascii=False, separators=(",", ":")))
            destination.write("\t")
            destination.write(",".join(digest_hexes))
            destination.write("\n")
            row_count += 1
    return {
        "shard_index": shard_index,
        "shard_output_path": str(output_path),
        "row_count": row_count,
        "size_bytes": output_path.stat().st_size if output_path.exists() else 0,
    }


def export_release_ready_trace(
    *,
    raw_input_path: str | Path,
    release_output_path: str | Path,
    window_start_ms: int | None = None,
    block_size: int = 512,
    jobs: int | None = None,
    tmp_dir: str | Path | None = None,
    show_progress: bool = False,
    progress_stream=None,
    log_file: str | Path | None = None,
) -> dict:
    if progress_stream is None:
        # Re-enter with an owned progress stream when the caller did not supply one.
        with _open_progress_stream(log_file) as owned_progress_stream:
            return export_release_ready_trace(
                raw_input_path=raw_input_path,
                release_output_path=release_output_path,
                window_start_ms=window_start_ms,
                block_size=block_size,
                jobs=jobs,
                tmp_dir=tmp_dir,
                show_progress=show_progress,
                progress_stream=owned_progress_stream,
            )
    input_path = Path(raw_input_path)
    release_destination = Path(release_output_path)
    release_destination.parent.mkdir(parents=True, exist_ok=True)
    temp_root_dir = _resolve_temp_root_dir(tmp_dir=tmp_dir, output_path=release_destination)
    requested_jobs = jobs if jobs is not None else min(os.cpu_count() or 1, 16)
    shard_jobs = max(1, requested_jobs)
    base_ms = window_start_ms or _infer_window_start_ms_from_raw_rows(input_path)
    segments = _compute_release_segments(input_path, shard_jobs)
    next_block_id = 0
    block_ids_by_digest: dict[str, int] = {}
    row_count = 0
    with tempfile.TemporaryDirectory(dir=temp_root_dir) as temp_root:
        shard_root = Path(temp_root) / "release-shards"
        shard_root.mkdir(parents=True, exist_ok=True)
        shard_specs = [
            {
                "raw_input_path": str(input_path),
                "shard_index": shard_index,
                "start_offset": start_offset,
                "end_offset": end_offset,
                "shard_output_path": str(shard_root / f"shard_{shard_index:05d}.jsonl"),
                "block_size": block_size,
                "base_ms": base_ms,
            }
            for shard_index, start_offset, end_offset in segments
        ]
        shard_progress = tqdm(
            total=len(shard_specs),
            desc="Build release shards",
            unit="shard",
            dynamic_ncols=True,
            file=progress_stream or sys.stderr,
            disable=not show_progress,
        )
        shard_results: list[dict] = []
        try:
            if len(shard_specs) == 1:
                shard_results.append(_build_release_shard(**shard_specs[0]))
                if show_progress:
                    shard_progress.update(1)
            else:
                with ProcessPoolExecutor(max_workers=len(shard_specs)) as executor:
                    futures = [executor.submit(_build_release_shard, **spec) for spec in shard_specs]
                    for future in as_completed(futures):
                        shard_results.append(future.result())
                        if show_progress:
                            shard_progress.update(1)
        finally:
            if show_progress:
                shard_progress.close()
        shard_results.sort(key=lambda item: item["shard_index"])
        shard_paths = [Path(item["shard_output_path"]) for item in shard_results]
        finalize_progress = tqdm(
            total=sum(item["size_bytes"] for item in shard_results),
            desc="Finalize release trace",
            unit="B",
            unit_scale=True,
            dynamic_ncols=True,
            file=progress_stream or sys.stderr,
            disable=not show_progress,
        )
        try:
            with release_destination.open("w", encoding="utf-8") as destination:
                for shard_path in shard_paths:
                    with shard_path.open("r", encoding="utf-8") as source:
                        for line in source:
                            stripped = line.rstrip("\n")
                            if not stripped:
                                if show_progress:
                                    finalize_progress.update(len(line.encode("utf-8")))
                                continue
                            core_json, _, digest_text = stripped.partition("\t")
                            release_row = json.loads(core_json)
                            # Dense block ids are assigned in first-seen order during this
                            # sequential pass, so hash_ids are stable across shards.
                            hash_ids = []
                            if digest_text:
                                for digest_hex in digest_text.split(","):
                                    if not digest_hex:
                                        continue
                                    block_id = block_ids_by_digest.get(digest_hex)
                                    if block_id is None:
                                        block_id = next_block_id
                                        next_block_id += 1
                                        block_ids_by_digest[digest_hex] = block_id
                                    hash_ids.append(block_id)
                            release_row["hash_ids"] = hash_ids
                            destination.write(json.dumps(release_row, ensure_ascii=False))
                            destination.write("\n")
                            row_count += 1
                            if show_progress:
                                finalize_progress.update(len(line.encode("utf-8")))
                                finalize_progress.set_postfix(rows=row_count, unique_blocks=next_block_id)
        finally:
            if show_progress:
                finalize_progress.close()
    return {
        "release_output_path": str(release_destination),
        "release_row_count": row_count,
        "release_unique_block_count": next_block_id,
        "release_shard_count": len(segments),
    }


def format_and_sort_trace(
    *,
    input_dir: str | Path,
    output_path: str | Path,
    tmp_dir: str | Path | None = None,
    chunk_bytes: int = 128 * 1024 * 1024,
    start_time: str | None = None,
    end_time: str | None = None,
    truncate_to_window: bool = True,
    show_progress: bool = False,
    log_file: str | Path | None = None,
) -> dict:
    source_files = discover_source_files(input_dir)
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    temp_root_dir = _resolve_temp_root_dir(tmp_dir=tmp_dir, output_path=destination)
    time_offset_ms = infer_time_offset_ms(source_files[0]) if source_files else 0
    time_window = infer_time_window(source_files, start_time=start_time, end_time=end_time) if truncate_to_window else None
    total_input_bytes = sum(path.stat().st_size for path in source_files if path.suffix != ".zst")
    has_zst = any(path.suffix == ".zst" for path in source_files)
    with _open_progress_stream(log_file) as progress_stream, tempfile.TemporaryDirectory(dir=temp_root_dir) as temp_root:
        temp_raw_destination = Path(temp_root) / "formatted-raw.tmp.jsonl"
        chunk_root = Path(temp_root)
        chunk_paths: list[Path] = []
        chunk_rows: list[tuple[int, int, str, str, str, str, str]] = []
        chunk_size_bytes = 0
        total_rows = 0
        global_seq = 0
        min_sort_time_ms: int | None = None
        max_sort_time_ms: int | None = None
        user_scoped_rows = 0
        truncated_rows = 0
        filtered_rows = 0
        filtered_empty_messages_rows = 0
        filtered_empty_response_rows = 0
        scan_progress = tqdm(
            total=None if has_zst else total_input_bytes,
            desc="Scan raw trace",
            unit="B" if not has_zst else "line",
            unit_scale=not has_zst,
            dynamic_ncols=True,
            file=progress_stream,
            disable=not show_progress,
        )
        # Pass 1 (external sort): scan and filter raw rows, spilling time-sorted
        # chunks to disk whenever the in-memory buffer exceeds chunk_bytes.
        try:
            for source_file in source_files:
                with open_trace_text(source_file) as handle:
                    for source_line, line in enumerate(handle, start=1):
                        stripped = line.strip()
                        if not stripped:
                            if show_progress:
                                scan_progress.update(1 if has_zst else len(line.encode("utf-8")))
                            continue
                        raw = json.loads(stripped)
                        request_params, response_params, request_payload, attributes, messages = _extract_request_components(raw)
                        empty_messages = not isinstance(messages, list) or len(messages) == 0
                        empty_response_params = _has_empty_response_params(raw, response_params)
                        if empty_messages or empty_response_params:
                            filtered_rows += 1
                            if empty_messages:
                                filtered_empty_messages_rows += 1
                            if empty_response_params:
                                filtered_empty_response_rows += 1
                            if show_progress:
                                scan_progress.update(1 if has_zst else len(line.encode("utf-8")))
                                scan_progress.set_postfix(
                                    kept=total_rows,
                                    filtered=filtered_rows,
                                    truncated=truncated_rows,
                                    chunks=len(chunk_paths),
                                )
                            continue
                        if time_window is not None:
                            ready_time_ms = _extract_sort_time_ms(raw, attributes, time_offset_ms=time_offset_ms)
                            if ready_time_ms and (ready_time_ms < time_window.start_ms or ready_time_ms >= time_window.end_ms):
                                truncated_rows += 1
                                if show_progress:
                                    scan_progress.update(1 if has_zst else len(line.encode("utf-8")))
                                    scan_progress.set_postfix(
                                        kept=total_rows,
                                        filtered=filtered_rows,
                                        truncated=truncated_rows,
                                        chunks=len(chunk_paths),
                                    )
                                continue
                        sort_time_ms, user_id, request_id, sequence_hashes, roles, row_json = _build_unified_row_payload_from_components(
                            raw,
                            request_params=request_params,
                            response_params=response_params,
                            request_payload=request_payload,
                            request_attributes=attributes,
                            messages=messages,
                            source_file=source_file.name,
                            source_line=source_line,
                            time_offset_ms=time_offset_ms,
                        )
                        chunk_rows.append((sort_time_ms, global_seq, user_id, request_id, sequence_hashes, roles, row_json))
                        chunk_size_bytes += (
                            len(row_json.encode("utf-8"))
                            + len(user_id.encode("utf-8"))
                            + len(request_id.encode("utf-8"))
                            + len(sequence_hashes.encode("utf-8"))
                            + len(roles.encode("utf-8"))
                            + 64
                        )
                        total_rows += 1
                        global_seq += 1
                        min_sort_time_ms = sort_time_ms if min_sort_time_ms is None else min(min_sort_time_ms, sort_time_ms)
                        max_sort_time_ms = sort_time_ms if max_sort_time_ms is None else max(max_sort_time_ms, sort_time_ms)
                        if user_id:
                            user_scoped_rows += 1
                        if chunk_size_bytes >= chunk_bytes:
                            chunk_paths.append(_write_chunk(chunk_rows, chunk_root, len(chunk_paths)))
                            chunk_rows = []
                            chunk_size_bytes = 0
                        if show_progress:
                            scan_progress.update(1 if has_zst else len(line.encode("utf-8")))
                            scan_progress.set_postfix(
                                kept=total_rows,
                                filtered=filtered_rows,
                                truncated=truncated_rows,
                                chunks=len(chunk_paths),
                            )
        finally:
            if show_progress:
                scan_progress.close()
        if chunk_rows:
            chunk_paths.append(_write_chunk(chunk_rows, chunk_root, len(chunk_paths)))
        # Pass 2: k-way merge the sorted chunks and assign logical sessions in
        # global time order.
        iterators = [_iter_chunk_rows(path) for path in chunk_paths]
        sessionizer = LogicalSessionizer()
        merge_progress = tqdm(
            total=total_rows,
            desc="Merge formatted trace",
            unit="row",
            dynamic_ncols=True,
            file=progress_stream,
            disable=not show_progress,
        )
        try:
            with temp_raw_destination.open("w", encoding="utf-8") as output_handle:
                for _, _, user_id, request_id, sequence_hashes, roles, row_json in heapq.merge(
                    *iterators, key=lambda item: (item[0], item[1])
                ):
                    assignment = sessionizer.assign_precomputed(
                        user_id=user_id,
                        request_id=request_id,
                        sequence_hashes=decode_prefix_hashes(sequence_hashes),
                        roles=decode_roles(roles),
                    )
                    output_handle.write(_apply_session_assignment_to_row_json(row_json, assignment))
                    output_handle.write("\n")
                    if show_progress:
                        merge_progress.update(1)
                        merge_progress.set_postfix(rows=total_rows, chunks=len(chunk_paths))
        finally:
            if show_progress:
                merge_progress.close()
        shutil.move(str(temp_raw_destination), str(destination))
        return {
            "output_path": str(destination),
            "row_count": total_rows,
            "source_file_count": len(source_files),
            "chunk_count": len(chunk_paths),
            "min_sort_time_ms": min_sort_time_ms or 0,
            "max_sort_time_ms": max_sort_time_ms or 0,
            "rows_with_user_id": user_scoped_rows,
            "truncated_row_count": truncated_rows,
            "filtered_row_count": filtered_rows,
            "filtered_empty_messages_row_count": filtered_empty_messages_rows,
            "filtered_empty_response_params_row_count": filtered_empty_response_rows,
            "window_start_ms": time_window.start_ms if time_window is not None else None,
            "window_end_ms": time_window.end_ms if time_window is not None else None,
        }
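

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): how the two public entry points can be
# chained. The directory and file names below are placeholders rather than
# values used by the real pipeline, and this __main__ hook is a local
# convenience, not the project's CLI.
if __name__ == "__main__":
    raw_dir = Path("traces/raw")  # placeholder directory of .jsonl / .jsonl.zst source files
    formatted_path = Path("traces/out") / f"{default_formatted_name(raw_dir)}.jsonl"
    release_path = Path("traces/out") / f"{derive_output_label(raw_dir)}-release.jsonl"

    # Step 1: normalize, time-sort, and sessionize the raw trace.
    format_summary = format_and_sort_trace(
        input_dir=raw_dir,
        output_path=formatted_path,
        show_progress=True,
    )
    print(json.dumps(format_summary, indent=2))

    # Step 2: export the release-ready trace with hashed prompt blocks; the
    # window start defaults to the first formatted row's ready time.
    release_summary = export_release_ready_trace(
        raw_input_path=formatted_path,
        release_output_path=release_path,
        show_progress=True,
    )
    print(json.dumps(release_summary, indent=2))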