861 lines
35 KiB
Python
861 lines
35 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import heapq
|
|
import json
|
|
import os
|
|
import shutil
|
|
import sys
|
|
import subprocess
|
|
import tempfile
|
|
from array import array
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
from contextlib import contextmanager, nullcontext
|
|
from dataclasses import asdict
|
|
from pathlib import Path
|
|
from typing import Iterator, TextIO
|
|
|
|
from trace_analyzer.helpers import normalize_unicode_text, parse_jsonish, safe_int
|
|
from tokenizers import Tokenizer
|
|
from tqdm.auto import tqdm
|
|
from trace_model_meta import infer_model_family_from_request_model, resolve_tokenizer_path
|
|
|
|
from . import SCHEMA_VERSION
|
|
from .raw_parser import get_raw_adapter
|
|
from .sessionization import (
|
|
LogicalSessionizer,
|
|
build_message_fingerprints,
|
|
build_sequence_hashes,
|
|
decode_prefix_hashes,
|
|
decode_roles,
|
|
encode_prefix_hashes,
|
|
encode_roles,
|
|
extract_user_id,
|
|
)
|
|
from .time_windows import infer_time_offset_ms, infer_time_window, parse_time_to_ms
|
|
|
|
|
|
def _is_supported_trace_file(path: Path) -> bool:
|
|
return path.name.endswith(".jsonl") or path.name.endswith(".jsonl.zst")
|
|
|
|
|
|
def derive_trace_name(input_path: str | Path) -> str:
    """Derive the bare trace name from a file or directory path.

    Directories keep their own name; files drop a trailing ``.jsonl`` or
    ``.jsonl.zst`` extension, falling back to the plain stem otherwise.
    """
    path = Path(input_path)
    if path.is_dir():
        return path.name
    for suffix in (".jsonl.zst", ".jsonl"):
        if path.name.endswith(suffix):
            return path.name[: -len(suffix)]
    return path.stem
|
|
|
|
|
|
def default_formatted_name(input_path: str | Path) -> str:
    """Return the default "-formatted" output name for *input_path*.

    The suffix is not doubled when the derived name already carries it.
    """
    name = derive_trace_name(input_path)
    if name.endswith("-formatted"):
        return name
    return f"{name}-formatted"
|
|
|
|
|
|
def derive_output_label(input_path: str | Path, *, time_window=None) -> str:
    """Prefer the time window's non-empty label; fall back to the trace name."""
    label = getattr(time_window, "label", None) if time_window is not None else None
    if label:
        return str(label)
    return derive_trace_name(input_path)
|
|
|
|
|
|
def discover_source_files(input_dir: str | Path) -> list[Path]:
    """List the trace files to process under *input_dir*.

    A single file is validated and returned as-is. For a directory, files
    sharing a stem prefer the ``.jsonl.zst`` variant over the plain
    ``.jsonl`` one; results are ordered by stem.

    Raises:
        FileNotFoundError: when the path is missing, is a file with an
            unsupported extension, or is a directory with no trace files.
    """
    root = Path(input_dir)
    if not root.exists():
        raise FileNotFoundError(f"Input path does not exist: {root}")

    if root.is_file():
        if not _is_supported_trace_file(root):
            raise FileNotFoundError(f"Input file must be .jsonl or .jsonl.zst: {root}")
        return [root]

    chosen: dict[str, Path] = {}
    for candidate in sorted(root.iterdir()):
        if not candidate.is_file():
            continue
        name = candidate.name
        if name.endswith(".jsonl.zst"):
            # Compressed variant always wins over a plain .jsonl sibling.
            chosen[name[: -len(".jsonl.zst")]] = candidate
        elif name.endswith(".jsonl"):
            chosen.setdefault(candidate.stem, candidate)

    if not chosen:
        raise FileNotFoundError(f"No .jsonl or .jsonl.zst files found under {root}")
    return [chosen[stem] for stem in sorted(chosen)]
|
|
|
|
|
|
@contextmanager
def open_trace_text(path: str | Path) -> Iterator[TextIO]:
    """Yield a text handle for a trace file, transparently decompressing .zst.

    Compressed files are streamed through an external ``zstdcat`` process so
    the full file is never held in memory; plain files are opened directly.

    Raises:
        RuntimeError: when the zstdcat stream cannot be opened, or when
            zstdcat exits non-zero after the stream is consumed.
    """
    resolved = Path(path)
    if resolved.suffix == ".zst":
        proc = subprocess.Popen(
            ["zstdcat", str(resolved)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding="utf-8",
        )
        if proc.stdout is None:
            raise RuntimeError(f"Failed to stream {resolved}")
        try:
            yield proc.stdout
        finally:
            # Close our read end first so zstdcat can terminate, then drain
            # stderr and surface any decompression failure to the caller.
            stdout = proc.stdout
            stdout.close()
            stderr = proc.stderr.read() if proc.stderr else ""
            return_code = proc.wait()
            if return_code != 0:
                raise RuntimeError(f"zstdcat failed for {resolved}: {stderr.strip()}")
        return

    with resolved.open("r", encoding="utf-8") as handle:
        yield handle
|
|
|
|
|
|
def _normalize_time_ms(*, raw_time_ms: int, wall_clock_ms: int, time_offset_ms: int) -> int:
|
|
if not raw_time_ms:
|
|
return wall_clock_ms - time_offset_ms if wall_clock_ms and time_offset_ms else wall_clock_ms
|
|
if not wall_clock_ms or not time_offset_ms:
|
|
return raw_time_ms
|
|
|
|
delta_ms = wall_clock_ms - raw_time_ms
|
|
tolerance_ms = 10 * 60 * 1000
|
|
if abs(delta_ms - time_offset_ms) <= tolerance_ms:
|
|
return raw_time_ms
|
|
if abs(delta_ms) <= tolerance_ms:
|
|
return raw_time_ms - time_offset_ms
|
|
return raw_time_ms
|
|
|
|
|
|
def _extract_sort_time_ms(raw: dict, attributes: dict, *, time_offset_ms: int = 0) -> int:
    """Pick the best available timestamp (ms) for sorting a raw trace row.

    Preference order: backend ready-time attribute, the wall-clock "time"
    field, then the epoch-seconds "__time__" field; 0 when none is usable.
    """
    time_text = raw.get("time")
    wall_clock_ms = parse_time_to_ms(str(time_text)) if time_text else 0
    ready_ms = safe_int(attributes.get("x-dashscope-inner-requestreadytime"))
    if ready_ms:
        return _normalize_time_ms(raw_time_ms=ready_ms, wall_clock_ms=wall_clock_ms, time_offset_ms=time_offset_ms)
    if wall_clock_ms:
        return _normalize_time_ms(raw_time_ms=0, wall_clock_ms=wall_clock_ms, time_offset_ms=time_offset_ms)
    epoch_seconds = safe_int(raw.get("__time__"))
    return epoch_seconds * 1000 if epoch_seconds else 0
|
|
|
|
|
|
def _extract_response_message(response_payload: dict) -> dict:
|
|
output = response_payload.get("output", {}) if isinstance(response_payload, dict) else {}
|
|
if not isinstance(output, dict):
|
|
return {}
|
|
choices = output.get("choices", [])
|
|
if not isinstance(choices, list) or not choices:
|
|
return {}
|
|
message = choices[0].get("message", {})
|
|
return message if isinstance(message, dict) else {}
|
|
|
|
|
|
def _extract_usage(response_payload: dict) -> dict:
    """Normalize token-usage counters from a response payload.

    Falls back to ``output.usage`` when the top-level usage block is absent,
    and accepts both OpenAI-style (prompt/completion) and dashscope-style
    (input/output) counter names. All counters default to 0.
    """
    usage_payload = response_payload.get("usage", {}) if isinstance(response_payload, dict) else {}
    output_payload = response_payload.get("output", {}) if isinstance(response_payload, dict) else {}
    if (not isinstance(usage_payload, dict) or not usage_payload) and isinstance(output_payload, dict):
        usage_payload = output_payload.get("usage", {})
    if not isinstance(usage_payload, dict):
        # Malformed rows may carry a non-object usage; treat it as empty
        # instead of raising AttributeError below.
        usage_payload = {}
    output_details = parse_jsonish(usage_payload.get("output_tokens_details", {}))
    prompt_details = parse_jsonish(usage_payload.get("prompt_tokens_details", {}))
    return {
        "input_tokens": safe_int(usage_payload.get("input_tokens", usage_payload.get("prompt_tokens"))),
        "output_tokens": safe_int(usage_payload.get("output_tokens", usage_payload.get("completion_tokens"))),
        "total_tokens": safe_int(usage_payload.get("total_tokens")),
        "reasoning_tokens": safe_int(
            output_details.get("reasoning_tokens") if isinstance(output_details, dict) else 0
        ),
        "cached_tokens": safe_int(
            prompt_details.get("cached_tokens") if isinstance(prompt_details, dict) else 0
        ),
    }
|
|
|
|
|
|
def _extract_request_components(raw: dict) -> tuple[dict, dict | None, dict, dict, list]:
    """Parse the request/response JSON-ish blobs out of a raw trace row.

    Returns ``(request_params, response_params, request_payload,
    request_attributes, messages)``; each intermediate level degrades to an
    empty container when missing or malformed.
    """
    request_params = parse_jsonish(raw.get("request_params", {}))
    response_params = parse_jsonish(raw.get("response_params", {}))
    request_header = request_params.get("header", {}) if isinstance(request_params, dict) else {}
    request_attributes = request_header.get("attributes", {}) if isinstance(request_header, dict) else {}
    request_payload = request_params.get("payload", {}) if isinstance(request_params, dict) else {}
    request_input = request_payload.get("input", {}) if isinstance(request_payload, dict) else {}
    # Guard against malformed rows where "input" is not an object; the
    # original code raised AttributeError on e.g. a list here.
    messages = request_input.get("messages", []) if isinstance(request_input, dict) else []
    return request_params, response_params, request_payload, request_attributes, messages
|
|
|
|
|
|
def _build_unified_row_from_components(
    raw: dict,
    *,
    request_params: dict,
    response_params: dict | None,
    request_payload: dict,
    request_attributes: dict,
    messages: list,
    source_file: str,
    source_line: int,
    time_offset_ms: int = 0,
) -> dict:
    """Assemble the unified, schema-versioned row dict for one raw trace row.

    ``source_file`` and ``source_line`` are accepted for interface parity with
    the callers but are not referenced in this body.
    """
    adapter = get_raw_adapter(raw)
    request_parameters = request_payload.get("parameters", {}) if isinstance(request_payload, dict) else {}
    response_payload = response_params.get("payload", {}) if isinstance(response_params, dict) else {}
    response_header = response_params.get("header", {}) if isinstance(response_params, dict) else {}
    response_attributes = response_header.get("attributes", {}) if isinstance(response_header, dict) else {}

    sort_time_ms = _extract_sort_time_ms(raw, request_attributes, time_offset_ms=time_offset_ms)
    total_cost_time_ms = safe_int(raw.get("total_cost_time"))
    # End time is absolute only when a start time is known; otherwise it
    # degrades to the bare duration.
    request_end_time_ms = sort_time_ms + total_cost_time_ms if sort_time_ms else total_cost_time_ms

    declared_tools = request_parameters.get("tools", [])
    canonical_prompt = adapter.build_canonical_prompt(request_payload)
    usage = _extract_usage(response_payload)
    # Only dict-shaped entries are parsed/kept; other message shapes are dropped.
    message_events = [asdict(adapter.parse_message(message)) for message in messages if isinstance(message, dict)]
    tool_specs = [asdict(adapter.parse_tool(tool)) for tool in declared_tools if isinstance(tool, dict)]
    role_sequence = [event["role"] for event in message_events]
    user_id = extract_user_id(request_params)
    # Fall back to "glm5" when the request model does not map to a known family.
    model_family = infer_model_family_from_request_model(raw.get("request_model")) or "glm5"
    raw_messages = [message for message in messages if isinstance(message, dict)]
    backend_first_request_time_ms = safe_int(response_attributes.get("x-ds-backend-first-request-time"))
    backend_first_response_time_ms = safe_int(response_attributes.get("x-ds-backend-first-response-time"))

    return {
        "schema_version": SCHEMA_VERSION,
        "sort_time_ms": sort_time_ms,
        "meta": {
            "model_family": model_family,
            "request_id": normalize_unicode_text(str(raw.get("request_id", ""))),
            # session_id / parent_* / chat_id / turn are placeholders that are
            # string-patched later by _apply_session_assignment_to_row_json,
            # so their literal default values must not change.
            "session_id": "",
            "raw_session_id": normalize_unicode_text(str(raw.get("session_id", ""))),
            "user_id": user_id,
            "parent_request_id": "",
            "parent_chat_id": -1,
            "chat_id": -1,
            "turn": 0,
            "request_model": normalize_unicode_text(str(raw.get("request_model", ""))),
            "time": normalize_unicode_text(str(raw.get("time", ""))),
            "status_code": normalize_unicode_text(str(raw.get("status_code", ""))),
            "status_name": normalize_unicode_text(str(raw.get("status_name", ""))),
            "request_ready_time_ms": sort_time_ms,
            "request_end_time_ms": request_end_time_ms,
            "total_cost_time_ms": total_cost_time_ms,
            "backend_first_request_time_ms": backend_first_request_time_ms,
            "backend_first_response_time_ms": backend_first_response_time_ms,
        },
        "role_sequence": role_sequence,
        "message_events": message_events,
        "declared_tools": tool_specs,
        "usage": usage,
        "canonical_prompt": canonical_prompt,
        "response_message": _extract_response_message(response_payload),
        "raw_messages": raw_messages,
    }
|
|
|
|
|
|
def _has_empty_response_params(raw: dict, response_params) -> bool:
|
|
raw_value = raw.get("response_params")
|
|
if raw_value is None:
|
|
return True
|
|
if isinstance(raw_value, str) and raw_value.strip().lower() in {"", "none", "null"}:
|
|
return True
|
|
return response_params is None or (isinstance(response_params, dict) and not response_params)
|
|
|
|
|
|
def build_unified_row(raw: dict, *, source_file: str, source_line: int, time_offset_ms: int = 0) -> dict:
    """Parse a raw trace row and assemble its unified row dict."""
    components = _extract_request_components(raw)
    request_params, response_params, request_payload, request_attributes, messages = components
    return _build_unified_row_from_components(
        raw,
        request_params=request_params,
        response_params=response_params,
        request_payload=request_payload,
        request_attributes=request_attributes,
        messages=messages,
        source_file=source_file,
        source_line=source_line,
        time_offset_ms=time_offset_ms,
    )
|
|
|
|
|
|
def _build_unified_row_payload(
    raw: dict,
    *,
    source_file: str,
    source_line: int,
    time_offset_ms: int = 0,
) -> tuple[int, str, str, str, str, str]:
    """Parse a raw trace row and build its sortable chunk-row payload."""
    components = _extract_request_components(raw)
    return _build_unified_row_payload_from_components(
        raw,
        request_params=components[0],
        response_params=components[1],
        request_payload=components[2],
        request_attributes=components[3],
        messages=components[4],
        source_file=source_file,
        source_line=source_line,
        time_offset_ms=time_offset_ms,
    )
|
|
|
|
|
|
def _build_unified_row_payload_from_components(
    raw: dict,
    *,
    request_params: dict,
    response_params: dict | None,
    request_payload: dict,
    request_attributes: dict,
    messages: list,
    source_file: str,
    source_line: int,
    time_offset_ms: int = 0,
) -> tuple[int, str, str, str, str, str]:
    """Build the sortable chunk-row payload for one raw trace row.

    Returns ``(sort_time_ms, user_id, request_id, encoded prefix hashes,
    encoded roles, compact row JSON)``.
    """
    dict_messages = [entry for entry in messages if isinstance(entry, dict)]
    row = _build_unified_row_from_components(
        raw,
        request_params=request_params,
        response_params=response_params,
        request_payload=request_payload,
        request_attributes=request_attributes,
        messages=messages,
        source_file=source_file,
        source_line=source_line,
        time_offset_ms=time_offset_ms,
    )
    fingerprints = build_message_fingerprints(dict_messages)
    meta = row["meta"]
    roles = [str(entry.get("role", "")) for entry in dict_messages]
    return (
        safe_int(row.get("sort_time_ms")),
        str(meta.get("user_id", "")),
        str(meta.get("request_id", "")),
        encode_prefix_hashes(build_sequence_hashes(fingerprints)),
        encode_roles(roles),
        json.dumps(row, ensure_ascii=False, separators=(",", ":")),
    )
|
|
|
|
|
|
def _write_chunk(rows: list[tuple[int, int, str, str, str, str, str]], tmp_dir: Path, chunk_index: int) -> Path:
|
|
rows.sort(key=lambda item: (item[0], item[1]))
|
|
path = tmp_dir / f"chunk_{chunk_index:05d}.jsonl"
|
|
with path.open("w", encoding="utf-8") as handle:
|
|
for sort_time_ms, seq, user_id, request_id, sequence_hashes, roles, row_json in rows:
|
|
handle.write(f"{sort_time_ms}\t{seq}\t{user_id}\t{request_id}\t{sequence_hashes}\t{roles}\t{row_json}\n")
|
|
return path
|
|
|
|
|
|
def _iter_chunk_rows(path: Path) -> Iterator[tuple[int, int, str, str, str, str, str]]:
|
|
with path.open("r", encoding="utf-8") as handle:
|
|
for line in handle:
|
|
sort_text, seq_text, user_id, request_id, sequence_hashes, roles, row_json = line.rstrip("\n").split("\t", 6)
|
|
yield int(sort_text), int(seq_text), user_id, request_id, sequence_hashes, roles, row_json
|
|
|
|
|
|
def _replace_json_field_once(row_json: str, *, key: str, value_text: str) -> str:
|
|
target = f'"{key}":""'
|
|
if target in row_json:
|
|
return row_json.replace(target, f'"{key}":{value_text}', 1)
|
|
numeric_target = None
|
|
if key in {"parent_chat_id", "chat_id"}:
|
|
numeric_target = f'"{key}":-1'
|
|
elif key == "turn":
|
|
numeric_target = f'"{key}":0'
|
|
if numeric_target and numeric_target in row_json:
|
|
return row_json.replace(numeric_target, f'"{key}":{value_text}', 1)
|
|
raise ValueError(f"Unable to patch {key} in formatted row json")
|
|
|
|
|
|
def _apply_session_assignment_to_row_json(row_json: str, assignment) -> str:
    """Patch the sessionizer's assignment fields into compact row JSON.

    String fields are JSON-encoded; integer fields are written verbatim.
    """
    patches = (
        ("session_id", json.dumps(assignment.session_id, ensure_ascii=False)),
        ("parent_request_id", json.dumps(assignment.parent_request_id, ensure_ascii=False)),
        ("parent_chat_id", str(assignment.parent_chat_id)),
        ("chat_id", str(assignment.chat_id)),
        ("turn", str(assignment.turn)),
    )
    patched = row_json
    for key, value_text in patches:
        patched = _replace_json_field_once(patched, key=key, value_text=value_text)
    return patched
|
|
|
|
|
|
class _TeeStream:
|
|
def __init__(self, *streams):
|
|
self._streams = [stream for stream in streams if stream is not None]
|
|
|
|
def write(self, data):
|
|
for stream in self._streams:
|
|
stream.write(data)
|
|
return len(data)
|
|
|
|
def flush(self):
|
|
for stream in self._streams:
|
|
stream.flush()
|
|
|
|
|
|
@contextmanager
def _open_progress_stream(log_file: str | Path | None):
    """Yield a progress stream: stderr alone, or stderr teed into *log_file*."""
    if log_file is None:
        yield sys.stderr
        return

    log_path = Path(log_file)
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("w", encoding="utf-8") as log_handle:
        yield _TeeStream(sys.stderr, log_handle)
|
|
|
|
|
|
def _resolve_temp_root_dir(*, tmp_dir: str | Path | None, output_path: str | Path) -> Path:
|
|
if tmp_dir is not None:
|
|
root = Path(tmp_dir)
|
|
else:
|
|
root = Path(output_path).parent
|
|
root.mkdir(parents=True, exist_ok=True)
|
|
return root
|
|
|
|
|
|
def _block_digest(block: list[int]) -> bytes:
|
|
digest = hashlib.blake2b(digest_size=16)
|
|
digest.update(len(block).to_bytes(4, "little", signed=False))
|
|
digest.update(array("I", block).tobytes())
|
|
return digest.digest()
|
|
|
|
|
|
def _load_release_tokenizer(model_family: str) -> Tokenizer:
    """Load the release tokenizer for *model_family*.

    ``resolve_tokenizer_path`` may point at either a ``tokenizer.json`` file
    or a directory containing one.
    """
    tokenizer_path = Path(resolve_tokenizer_path(model_family=model_family))
    if tokenizer_path.is_dir():
        tokenizer_path = tokenizer_path / "tokenizer.json"
    return Tokenizer.from_file(str(tokenizer_path))
|
|
|
|
|
|
def _infer_window_start_ms_from_raw_rows(raw_path: Path) -> int:
    """Return the ready time (ms) of the first non-empty row in *raw_path*, or 0."""
    with raw_path.open("r", encoding="utf-8") as source:
        for raw_line in source:
            text = raw_line.strip()
            if not text:
                continue
            row = json.loads(text)
            meta = row.get("meta", {})
            if not isinstance(meta, dict):
                meta = {}
            return safe_int(meta.get("request_ready_time_ms", row.get("sort_time_ms")))
    return 0
|
|
|
|
|
|
def _compute_release_row_core(row: dict, *, base_ms: int) -> dict:
    """Project a formatted row down to the public release-trace fields.

    Timestamps are rebased to seconds relative to *base_ms* (0 when either
    side is missing).
    """
    meta = row.get("meta", {}) if isinstance(row.get("meta", {}), dict) else {}
    usage = row.get("usage", {})
    if not isinstance(usage, dict):
        # Be robust to malformed rows carrying a non-object usage field; the
        # original chained .get() raised AttributeError here.
        usage = {}
    ready_ms = safe_int(meta.get("request_ready_time_ms", row.get("sort_time_ms")))
    timestamp_ms = ready_ms - base_ms if ready_ms and base_ms else 0
    return {
        "chat_id": safe_int(meta.get("chat_id")),
        "parent_chat_id": safe_int(meta.get("parent_chat_id", -1), default=-1),
        "timestamp": round(timestamp_ms / 1000.0, 3),
        "input_length": safe_int(usage.get("input_tokens")),
        "output_length": safe_int(usage.get("output_tokens")),
        "type": "coder",
        "turn": safe_int(meta.get("turn")),
    }
|
|
|
|
|
|
def _compute_release_segments(path: Path, jobs: int) -> list[tuple[int, int, int]]:
|
|
total_size = path.stat().st_size
|
|
if total_size <= 0:
|
|
return [(0, 0, 0)]
|
|
|
|
shard_count = max(1, min(jobs, total_size))
|
|
boundaries = [0]
|
|
with path.open("rb") as handle:
|
|
for index in range(1, shard_count):
|
|
target = total_size * index // shard_count
|
|
handle.seek(target)
|
|
handle.readline()
|
|
boundary = handle.tell()
|
|
if boundary > boundaries[-1]:
|
|
boundaries.append(boundary)
|
|
if boundaries[-1] != total_size:
|
|
boundaries.append(total_size)
|
|
|
|
segments: list[tuple[int, int, int]] = []
|
|
for index, (start, end) in enumerate(zip(boundaries, boundaries[1:])):
|
|
if end > start:
|
|
segments.append((index, start, end))
|
|
return segments or [(0, 0, total_size)]
|
|
|
|
|
|
def _build_release_shard(
    *,
    raw_input_path: str,
    shard_index: int,
    start_offset: int,
    end_offset: int,
    shard_output_path: str,
    block_size: int,
    base_ms: int,
) -> dict:
    """Worker: convert one byte range of the formatted trace into a release shard.

    Each output line is the compact release-row JSON, a tab, then the
    comma-joined hex digests of the prompt's token-id blocks. Runs in a
    subprocess, so every argument and the returned summary are picklable
    primitives.
    """
    input_path = Path(raw_input_path)
    output_path = Path(shard_output_path)
    # One tokenizer per model family, loaded lazily inside this worker.
    tokenizer_cache: dict[str, Tokenizer] = {}
    row_count = 0

    with input_path.open("rb") as source, output_path.open("w", encoding="utf-8") as destination:
        source.seek(start_offset)
        while source.tell() < end_offset:
            line_bytes = source.readline()
            if not line_bytes:
                break
            stripped = line_bytes.strip()
            if not stripped:
                continue

            row = json.loads(stripped)
            meta = row.get("meta", {}) if isinstance(row.get("meta", {}), dict) else {}
            model_family = str(meta.get("model_family", "") or "glm5")
            tokenizer = tokenizer_cache.get(model_family)
            if tokenizer is None:
                tokenizer = _load_release_tokenizer(model_family)
                tokenizer_cache[model_family] = tokenizer

            # Hash the canonical prompt in fixed-size token blocks; digests are
            # renumbered into global hash ids during the merge phase.
            token_ids = tokenizer.encode(str(row.get("canonical_prompt", ""))).ids
            digest_hexes = []
            for index in range(0, len(token_ids), block_size):
                block = token_ids[index:index + block_size]
                digest_hexes.append(_block_digest(block).hex())

            core = _compute_release_row_core(row, base_ms=base_ms)
            destination.write(json.dumps(core, ensure_ascii=False, separators=(",", ":")))
            destination.write("\t")
            destination.write(",".join(digest_hexes))
            destination.write("\n")
            row_count += 1

    return {
        "shard_index": shard_index,
        "shard_output_path": str(output_path),
        "row_count": row_count,
        "size_bytes": output_path.stat().st_size if output_path.exists() else 0,
    }
|
|
|
|
|
|
def export_release_ready_trace(
    *,
    raw_input_path: str | Path,
    release_output_path: str | Path,
    window_start_ms: int | None = None,
    block_size: int = 512,
    jobs: int | None = None,
    tmp_dir: str | Path | None = None,
    show_progress: bool = False,
    progress_stream=None,
    log_file: str | Path | None = None,
) -> dict:
    """Export the anonymized release-ready trace from a formatted trace.

    The input is split into line-aligned byte ranges tokenized and hashed by
    worker processes; shard outputs are then concatenated in shard order
    while per-block digests are renumbered into dense integer ``hash_ids``
    (first occurrence wins).

    Returns a summary dict: output path, row count, unique block count,
    and shard count.
    """
    if progress_stream is None:
        # First entry: acquire an owned progress stream (stderr, optionally
        # teed to log_file) and re-invoke so the body below can assume one.
        with _open_progress_stream(log_file) as owned_progress_stream:
            return export_release_ready_trace(
                raw_input_path=raw_input_path,
                release_output_path=release_output_path,
                window_start_ms=window_start_ms,
                block_size=block_size,
                jobs=jobs,
                tmp_dir=tmp_dir,
                show_progress=show_progress,
                progress_stream=owned_progress_stream,
            )

    input_path = Path(raw_input_path)
    release_destination = Path(release_output_path)
    release_destination.parent.mkdir(parents=True, exist_ok=True)
    temp_root_dir = _resolve_temp_root_dir(tmp_dir=tmp_dir, output_path=release_destination)

    requested_jobs = jobs if jobs is not None else min(os.cpu_count() or 1, 16)
    shard_jobs = max(1, requested_jobs)
    # Timestamps are rebased against the window start, inferred from the
    # first row when not supplied by the caller.
    base_ms = window_start_ms or _infer_window_start_ms_from_raw_rows(input_path)
    segments = _compute_release_segments(input_path, shard_jobs)
    next_block_id = 0
    block_ids_by_digest: dict[str, int] = {}
    row_count = 0

    with tempfile.TemporaryDirectory(dir=temp_root_dir) as temp_root:
        shard_root = Path(temp_root) / "release-shards"
        shard_root.mkdir(parents=True, exist_ok=True)
        shard_specs = [
            {
                "raw_input_path": str(input_path),
                "shard_index": shard_index,
                "start_offset": start_offset,
                "end_offset": end_offset,
                "shard_output_path": str(shard_root / f"shard_{shard_index:05d}.jsonl"),
                "block_size": block_size,
                "base_ms": base_ms,
            }
            for shard_index, start_offset, end_offset in segments
        ]

        shard_progress = tqdm(
            total=len(shard_specs),
            desc="Build release shards",
            unit="shard",
            dynamic_ncols=True,
            file=progress_stream or sys.stderr,
            disable=not show_progress,
        )
        shard_results: list[dict] = []
        try:
            if len(shard_specs) == 1:
                # Single shard: run inline and skip the process-pool overhead.
                shard_results.append(_build_release_shard(**shard_specs[0]))
                if show_progress:
                    shard_progress.update(1)
            else:
                with ProcessPoolExecutor(max_workers=len(shard_specs)) as executor:
                    futures = [executor.submit(_build_release_shard, **spec) for spec in shard_specs]
                    for future in as_completed(futures):
                        shard_results.append(future.result())
                        if show_progress:
                            shard_progress.update(1)
        finally:
            if show_progress:
                shard_progress.close()

        # Concatenate shards in shard order so output rows keep input order.
        shard_results.sort(key=lambda item: item["shard_index"])
        shard_paths = [Path(item["shard_output_path"]) for item in shard_results]
        finalize_progress = tqdm(
            total=sum(item["size_bytes"] for item in shard_results),
            desc="Finalize release trace",
            unit="B",
            unit_scale=True,
            dynamic_ncols=True,
            file=progress_stream or sys.stderr,
            disable=not show_progress,
        )
        try:
            with release_destination.open("w", encoding="utf-8") as destination:
                for shard_path in shard_paths:
                    with shard_path.open("r", encoding="utf-8") as source:
                        for line in source:
                            stripped = line.rstrip("\n")
                            if not stripped:
                                if show_progress:
                                    finalize_progress.update(len(line.encode("utf-8")))
                                continue
                            core_json, _, digest_text = stripped.partition("\t")
                            release_row = json.loads(core_json)
                            hash_ids = []
                            if digest_text:
                                # Assign dense global ids to block digests in
                                # first-seen order across all shards.
                                for digest_hex in digest_text.split(","):
                                    if not digest_hex:
                                        continue
                                    block_id = block_ids_by_digest.get(digest_hex)
                                    if block_id is None:
                                        block_id = next_block_id
                                        next_block_id += 1
                                        block_ids_by_digest[digest_hex] = block_id
                                    hash_ids.append(block_id)
                            release_row["hash_ids"] = hash_ids
                            destination.write(json.dumps(release_row, ensure_ascii=False))
                            destination.write("\n")
                            row_count += 1
                            if show_progress:
                                finalize_progress.update(len(line.encode("utf-8")))
                                finalize_progress.set_postfix(rows=row_count, unique_blocks=next_block_id)
        finally:
            if show_progress:
                finalize_progress.close()

    return {
        "release_output_path": str(release_destination),
        "release_row_count": row_count,
        "release_unique_block_count": next_block_id,
        "release_shard_count": len(segments),
    }
|
|
|
|
|
|
def format_and_sort_trace(
    *,
    input_dir: str | Path,
    output_path: str | Path,
    tmp_dir: str | Path | None = None,
    chunk_bytes: int = 128 * 1024 * 1024,
    start_time: str | None = None,
    end_time: str | None = None,
    truncate_to_window: bool = True,
    show_progress: bool = False,
    log_file: str | Path | None = None,
) -> dict:
    """Format raw trace files into one sorted, sessionized JSONL trace.

    Phase 1 scans every source row, filters rows with no messages or no
    response params, optionally truncates to the inferred time window, and
    spills sorted chunks of roughly *chunk_bytes* to a temp directory.
    Phase 2 k-way merges the chunks by (sort_time_ms, sequence), assigns
    logical session ids in merged order, and atomically moves the result to
    *output_path*. Returns a summary dict of counts and window bounds.
    """
    source_files = discover_source_files(input_dir)
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    temp_root_dir = _resolve_temp_root_dir(tmp_dir=tmp_dir, output_path=destination)
    # The offset is inferred from the first source file only.
    time_offset_ms = infer_time_offset_ms(source_files[0]) if source_files else 0
    time_window = infer_time_window(source_files, start_time=start_time, end_time=end_time) if truncate_to_window else None
    # Byte totals are only meaningful for uncompressed inputs; with any .zst
    # present the progress bar counts lines instead.
    total_input_bytes = sum(path.stat().st_size for path in source_files if path.suffix != ".zst")
    has_zst = any(path.suffix == ".zst" for path in source_files)

    with _open_progress_stream(log_file) as progress_stream, tempfile.TemporaryDirectory(dir=temp_root_dir) as temp_root:
        temp_raw_destination = Path(temp_root) / "formatted-raw.tmp.jsonl"
        chunk_root = Path(temp_root)
        chunk_paths: list[Path] = []
        chunk_rows: list[tuple[int, int, str, str, str, str, str]] = []
        chunk_size_bytes = 0
        total_rows = 0
        global_seq = 0  # tie-breaker preserving input order for equal timestamps
        min_sort_time_ms: int | None = None
        max_sort_time_ms: int | None = None
        user_scoped_rows = 0
        truncated_rows = 0
        filtered_rows = 0
        filtered_empty_messages_rows = 0
        filtered_empty_response_rows = 0

        scan_progress = tqdm(
            total=None if has_zst else total_input_bytes,
            desc="Scan raw trace",
            unit="B" if not has_zst else "line",
            unit_scale=not has_zst,
            dynamic_ncols=True,
            file=progress_stream,
            disable=not show_progress,
        )
        try:
            for source_file in source_files:
                with open_trace_text(source_file) as handle:
                    for source_line, line in enumerate(handle, start=1):
                        stripped = line.strip()
                        if not stripped:
                            if show_progress:
                                scan_progress.update(1 if has_zst else len(line.encode("utf-8")))
                            continue
                        raw = json.loads(stripped)
                        request_params, response_params, request_payload, attributes, messages = _extract_request_components(raw)
                        # Drop rows with no messages and/or no response params.
                        empty_messages = not isinstance(messages, list) or len(messages) == 0
                        empty_response_params = _has_empty_response_params(raw, response_params)
                        if empty_messages or empty_response_params:
                            filtered_rows += 1
                            if empty_messages:
                                filtered_empty_messages_rows += 1
                            if empty_response_params:
                                filtered_empty_response_rows += 1
                            if show_progress:
                                scan_progress.update(1 if has_zst else len(line.encode("utf-8")))
                                scan_progress.set_postfix(
                                    kept=total_rows,
                                    filtered=filtered_rows,
                                    truncated=truncated_rows,
                                    chunks=len(chunk_paths),
                                )
                            continue
                        if time_window is not None:
                            # Rows with an unusable (0) timestamp are kept.
                            ready_time_ms = _extract_sort_time_ms(raw, attributes, time_offset_ms=time_offset_ms)
                            if ready_time_ms and (ready_time_ms < time_window.start_ms or ready_time_ms >= time_window.end_ms):
                                truncated_rows += 1
                                if show_progress:
                                    scan_progress.update(1 if has_zst else len(line.encode("utf-8")))
                                    scan_progress.set_postfix(
                                        kept=total_rows,
                                        filtered=filtered_rows,
                                        truncated=truncated_rows,
                                        chunks=len(chunk_paths),
                                    )
                                continue
                        sort_time_ms, user_id, request_id, sequence_hashes, roles, row_json = _build_unified_row_payload_from_components(
                            raw,
                            request_params=request_params,
                            response_params=response_params,
                            request_payload=request_payload,
                            request_attributes=attributes,
                            messages=messages,
                            source_file=source_file.name,
                            source_line=source_line,
                            time_offset_ms=time_offset_ms,
                        )
                        chunk_rows.append((sort_time_ms, global_seq, user_id, request_id, sequence_hashes, roles, row_json))
                        # Approximate in-memory chunk size; +64 covers the
                        # numeric fields and per-row overhead.
                        chunk_size_bytes += (
                            len(row_json.encode("utf-8"))
                            + len(user_id.encode("utf-8"))
                            + len(request_id.encode("utf-8"))
                            + len(sequence_hashes.encode("utf-8"))
                            + len(roles.encode("utf-8"))
                            + 64
                        )
                        total_rows += 1
                        global_seq += 1
                        min_sort_time_ms = sort_time_ms if min_sort_time_ms is None else min(min_sort_time_ms, sort_time_ms)
                        max_sort_time_ms = sort_time_ms if max_sort_time_ms is None else max(max_sort_time_ms, sort_time_ms)
                        if user_id:
                            user_scoped_rows += 1

                        if chunk_size_bytes >= chunk_bytes:
                            # Spill the sorted chunk to disk and start a new one.
                            chunk_paths.append(_write_chunk(chunk_rows, chunk_root, len(chunk_paths)))
                            chunk_rows = []
                            chunk_size_bytes = 0

                        if show_progress:
                            scan_progress.update(1 if has_zst else len(line.encode("utf-8")))
                            scan_progress.set_postfix(
                                kept=total_rows,
                                filtered=filtered_rows,
                                truncated=truncated_rows,
                                chunks=len(chunk_paths),
                            )
        finally:
            if show_progress:
                scan_progress.close()

        if chunk_rows:
            chunk_paths.append(_write_chunk(chunk_rows, chunk_root, len(chunk_paths)))

        # Phase 2: k-way merge of the sorted chunks; session assignment must
        # happen in merged (global time) order.
        iterators = [_iter_chunk_rows(path) for path in chunk_paths]
        sessionizer = LogicalSessionizer()
        merge_progress = tqdm(
            total=total_rows,
            desc="Merge formatted trace",
            unit="row",
            dynamic_ncols=True,
            file=progress_stream,
            disable=not show_progress,
        )
        try:
            with temp_raw_destination.open("w", encoding="utf-8") as output_handle:
                for _, _, user_id, request_id, sequence_hashes, roles, row_json in heapq.merge(
                    *iterators, key=lambda item: (item[0], item[1])
                ):
                    assignment = sessionizer.assign_precomputed(
                        user_id=user_id,
                        request_id=request_id,
                        sequence_hashes=decode_prefix_hashes(sequence_hashes),
                        roles=decode_roles(roles),
                    )
                    output_handle.write(_apply_session_assignment_to_row_json(row_json, assignment))
                    output_handle.write("\n")
                    if show_progress:
                        merge_progress.update(1)
                        merge_progress.set_postfix(rows=total_rows, chunks=len(chunk_paths))
        finally:
            if show_progress:
                merge_progress.close()

        # Atomically publish the finished trace.
        shutil.move(str(temp_raw_destination), str(destination))

    return {
        "output_path": str(destination),
        "row_count": total_rows,
        "source_file_count": len(source_files),
        "chunk_count": len(chunk_paths),
        "min_sort_time_ms": min_sort_time_ms or 0,
        "max_sort_time_ms": max_sort_time_ms or 0,
        "rows_with_user_id": user_scoped_rows,
        "truncated_row_count": truncated_rows,
        "filtered_row_count": filtered_rows,
        "filtered_empty_messages_row_count": filtered_empty_messages_rows,
        "filtered_empty_response_params_row_count": filtered_empty_response_rows,
        "window_start_ms": time_window.start_ms if time_window is not None else None,
        "window_end_ms": time_window.end_ms if time_window is not None else None,
    }
|