fix: tool-call parser, illegal characters in metadata fields, and /tmp disk-space exhaustion (route temp files next to the output)
This commit is contained in:
@@ -15,7 +15,7 @@ from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import Iterator, TextIO
|
||||
|
||||
from trace_analyzer.helpers import parse_jsonish, safe_int
|
||||
from trace_analyzer.helpers import normalize_unicode_text, parse_jsonish, safe_int
|
||||
from tokenizers import Tokenizer
|
||||
from tqdm.auto import tqdm
|
||||
from trace_model_meta import infer_model_family_from_request_model, resolve_tokenizer_path
|
||||
@@ -225,18 +225,18 @@ def _build_unified_row_from_components(
|
||||
"sort_time_ms": sort_time_ms,
|
||||
"meta": {
|
||||
"model_family": model_family,
|
||||
"request_id": str(raw.get("request_id", "")),
|
||||
"request_id": normalize_unicode_text(str(raw.get("request_id", ""))),
|
||||
"session_id": "",
|
||||
"raw_session_id": str(raw.get("session_id", "")),
|
||||
"raw_session_id": normalize_unicode_text(str(raw.get("session_id", ""))),
|
||||
"user_id": user_id,
|
||||
"parent_request_id": "",
|
||||
"parent_chat_id": -1,
|
||||
"chat_id": -1,
|
||||
"turn": 0,
|
||||
"request_model": str(raw.get("request_model", "")),
|
||||
"time": str(raw.get("time", "")),
|
||||
"status_code": str(raw.get("status_code", "")),
|
||||
"status_name": str(raw.get("status_name", "")),
|
||||
"request_model": normalize_unicode_text(str(raw.get("request_model", ""))),
|
||||
"time": normalize_unicode_text(str(raw.get("time", ""))),
|
||||
"status_code": normalize_unicode_text(str(raw.get("status_code", ""))),
|
||||
"status_name": normalize_unicode_text(str(raw.get("status_name", ""))),
|
||||
"request_ready_time_ms": sort_time_ms,
|
||||
"request_end_time_ms": request_end_time_ms,
|
||||
"total_cost_time_ms": total_cost_time_ms,
|
||||
@@ -417,6 +417,15 @@ def _open_progress_stream(log_file: str | Path | None):
|
||||
yield _TeeStream(sys.stderr, handle)
|
||||
|
||||
|
||||
def _resolve_temp_root_dir(*, tmp_dir: str | Path | None, output_path: str | Path) -> Path:
|
||||
if tmp_dir is not None:
|
||||
root = Path(tmp_dir)
|
||||
else:
|
||||
root = Path(output_path).parent
|
||||
root.mkdir(parents=True, exist_ok=True)
|
||||
return root
|
||||
|
||||
|
||||
def _block_digest(block: list[int]) -> bytes:
|
||||
digest = hashlib.blake2b(digest_size=16)
|
||||
digest.update(len(block).to_bytes(4, "little", signed=False))
|
||||
@@ -564,6 +573,7 @@ def export_release_ready_trace(
|
||||
input_path = Path(raw_input_path)
|
||||
release_destination = Path(release_output_path)
|
||||
release_destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
temp_root_dir = _resolve_temp_root_dir(tmp_dir=tmp_dir, output_path=release_destination)
|
||||
|
||||
requested_jobs = jobs if jobs is not None else min(os.cpu_count() or 1, 16)
|
||||
shard_jobs = max(1, requested_jobs)
|
||||
@@ -573,7 +583,7 @@ def export_release_ready_trace(
|
||||
block_ids_by_digest: dict[str, int] = {}
|
||||
row_count = 0
|
||||
|
||||
with tempfile.TemporaryDirectory(dir=tmp_dir) as temp_root:
|
||||
with tempfile.TemporaryDirectory(dir=temp_root_dir) as temp_root:
|
||||
shard_root = Path(temp_root) / "release-shards"
|
||||
shard_root.mkdir(parents=True, exist_ok=True)
|
||||
shard_specs = [
|
||||
@@ -682,12 +692,13 @@ def format_and_sort_trace(
|
||||
source_files = discover_source_files(input_dir)
|
||||
destination = Path(output_path)
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
temp_root_dir = _resolve_temp_root_dir(tmp_dir=tmp_dir, output_path=destination)
|
||||
time_offset_ms = infer_time_offset_ms(source_files[0]) if source_files else 0
|
||||
time_window = infer_time_window(source_files, start_time=start_time, end_time=end_time) if truncate_to_window else None
|
||||
total_input_bytes = sum(path.stat().st_size for path in source_files if path.suffix != ".zst")
|
||||
has_zst = any(path.suffix == ".zst" for path in source_files)
|
||||
|
||||
with _open_progress_stream(log_file) as progress_stream, tempfile.TemporaryDirectory(dir=tmp_dir) as temp_root:
|
||||
with _open_progress_stream(log_file) as progress_stream, tempfile.TemporaryDirectory(dir=temp_root_dir) as temp_root:
|
||||
temp_raw_destination = Path(temp_root) / "formatted-raw.tmp.jsonl"
|
||||
chunk_root = Path(temp_root)
|
||||
chunk_paths: list[Path] = []
|
||||
|
||||
Reference in New Issue
Block a user