Fix trace reuse and packaged model assets

This commit is contained in:
2026-05-08 21:24:29 +08:00
parent 152f01613b
commit 5a6b1acb49
6 changed files with 210 additions and 37 deletions

View File

@@ -6,14 +6,13 @@ import json
import os
import shutil
import sys
import subprocess
import tempfile
from array import array
from concurrent.futures import ProcessPoolExecutor, as_completed
from contextlib import contextmanager, nullcontext
from dataclasses import asdict
from pathlib import Path
from typing import Iterator, TextIO
from typing import Iterator
from trace_analyzer.helpers import normalize_unicode_text, parse_jsonish, safe_int
from tokenizers import Tokenizer
@@ -32,6 +31,7 @@ from .sessionization import (
encode_roles,
extract_user_id,
)
from .trace_io import open_trace_text
from .time_windows import infer_time_offset_ms, infer_time_window, parse_time_to_ms
@@ -88,34 +88,6 @@ def discover_source_files(input_dir: str | Path) -> list[Path]:
return files
@contextmanager
def open_trace_text(path: str | Path) -> Iterator[TextIO]:
    """Yield a UTF-8 text stream for a plain or zstd-compressed trace file.

    A path whose final suffix is ``.zst`` is decompressed on the fly by a
    ``zstdcat`` subprocess and that subprocess's stdout is yielded. Any other
    path is opened directly for reading.

    Args:
        path: Location of the trace file.

    Yields:
        A readable text handle for the trace contents.

    Raises:
        RuntimeError: If the ``zstdcat`` stdout pipe is unavailable, or if the
            caller consumed the stream to its end and ``zstdcat`` exited with
            a non-zero status.
    """
    resolved = Path(path)
    if resolved.suffix != ".zst":
        with resolved.open("r", encoding="utf-8") as handle:
            yield handle
        return

    proc = subprocess.Popen(
        ["zstdcat", str(resolved)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        encoding="utf-8",
    )
    stream = proc.stdout
    if stream is None:
        # Do not leave an orphaned child behind when bailing out.
        proc.kill()
        proc.wait()
        raise RuntimeError(f"Failed to stream {resolved}")

    completed = False
    try:
        yield stream
        completed = True
    finally:
        # The exit status is only meaningful when the caller read everything.
        # A caller that stops early (or raises) makes zstdcat die from a
        # broken pipe, which must not be reported as a decompression failure
        # nor mask the caller's own exception.
        exhausted = False
        if completed and not stream.closed:
            try:
                exhausted = stream.read(1) == ""
            except (OSError, ValueError):
                exhausted = False
        if not exhausted:
            proc.kill()
        stream.close()
        stderr = ""
        if proc.stderr is not None:
            try:
                stderr = proc.stderr.read()
            finally:
                proc.stderr.close()
        return_code = proc.wait()
        if exhausted and return_code != 0:
            raise RuntimeError(f"zstdcat failed for {resolved}: {stderr.strip()}")
def _normalize_time_ms(*, raw_time_ms: int, wall_clock_ms: int, time_offset_ms: int) -> int:
if not raw_time_ms:
return wall_clock_ms - time_offset_ms if wall_clock_ms and time_offset_ms else wall_clock_ms

View File

@@ -8,6 +8,8 @@ from pathlib import Path
from trace_analyzer.helpers import parse_jsonish, safe_int
from .trace_io import open_trace_text
WINDOW_RE = re.compile(r"(?P<day>\d{4})-(?P<start>\d{4})-(?P<end>\d{4})$")
UTC_PLUS_8 = timezone(timedelta(hours=8))
@@ -32,8 +34,17 @@ def parse_time_to_ms(value: str) -> int:
raise ValueError(f"Unsupported timestamp format: {value!r}")
def _trace_window_name(path: Path) -> str:
    """Return the file name of *path* with its trace extension removed.

    ``.jsonl.zst`` and ``.jsonl`` are stripped as whole extensions so the
    remainder can be matched against the window pattern; for any other file
    name the result is ``path.stem``.
    """
    filename = path.name
    # Longest extension first, otherwise ".jsonl.zst" would never be tried.
    for extension in (".jsonl.zst", ".jsonl"):
        if filename.endswith(extension):
            return filename[: len(filename) - len(extension)]
    return path.stem
def _read_first_timestamp(path: Path) -> str:
with path.open("r", encoding="utf-8") as handle:
with open_trace_text(path) as handle:
for line in handle:
stripped = line.strip()
if not stripped:
@@ -46,7 +57,7 @@ def _read_first_timestamp(path: Path) -> str:
def _read_first_timestamp_and_ready_ms(path: Path) -> tuple[str, int]:
with path.open("r", encoding="utf-8") as handle:
with open_trace_text(path) as handle:
for line in handle:
stripped = line.strip()
if not stripped:
@@ -90,8 +101,8 @@ def infer_time_window(
if not source_files:
return None
first_match = WINDOW_RE.match(source_files[0].stem)
last_match = WINDOW_RE.match(source_files[-1].stem)
first_match = WINDOW_RE.match(_trace_window_name(source_files[0]))
last_match = WINDOW_RE.match(_trace_window_name(source_files[-1]))
if first_match is None or last_match is None:
return None

View File

@@ -0,0 +1,33 @@
from __future__ import annotations
import subprocess
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator, TextIO
@contextmanager
def open_trace_text(path: str | Path) -> Iterator[TextIO]:
    """Yield a UTF-8 text stream for a plain or zstd-compressed trace file.

    A path whose final suffix is ``.zst`` is decompressed on the fly by a
    ``zstdcat`` subprocess and that subprocess's stdout is yielded. Any other
    path is opened directly for reading.

    Args:
        path: Location of the trace file.

    Yields:
        A readable text handle for the trace contents.

    Raises:
        RuntimeError: If the ``zstdcat`` stdout pipe is unavailable, or if the
            caller consumed the stream to its end and ``zstdcat`` exited with
            a non-zero status.
    """
    resolved = Path(path)
    if resolved.suffix != ".zst":
        with resolved.open("r", encoding="utf-8") as handle:
            yield handle
        return

    proc = subprocess.Popen(
        ["zstdcat", str(resolved)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        encoding="utf-8",
    )
    stream = proc.stdout
    if stream is None:
        # Do not leave an orphaned child behind when bailing out.
        proc.kill()
        proc.wait()
        raise RuntimeError(f"Failed to stream {resolved}")

    completed = False
    try:
        yield stream
        completed = True
    finally:
        # The exit status is only meaningful when the caller read everything.
        # A caller that stops early (or raises) makes zstdcat die from a
        # broken pipe, which must not be reported as a decompression failure
        # nor mask the caller's own exception.
        exhausted = False
        if completed and not stream.closed:
            try:
                exhausted = stream.read(1) == ""
            except (OSError, ValueError):
                exhausted = False
        if not exhausted:
            proc.kill()
        stream.close()
        stderr = ""
        if proc.stderr is not None:
            try:
                stderr = proc.stderr.read()
            finally:
                proc.stderr.close()
        return_code = proc.wait()
        if exhausted and return_code != 0:
            raise RuntimeError(f"zstdcat failed for {resolved}: {stderr.strip()}")