Fix trace reuse and packaged model assets

This commit is contained in:
2026-05-08 21:24:29 +08:00
parent 152f01613b
commit 5a6b1acb49
6 changed files with 210 additions and 37 deletions

View File

@@ -21,7 +21,10 @@ trace-formatter = "trace_formatter.cli:main"
trace-analyzer = "trace_analyzer.cli:main" trace-analyzer = "trace_analyzer.cli:main"
[tool.setuptools.packages.find] [tool.setuptools.packages.find]
include = ["trace_analyzer", "trace_formatter", "trace_model_meta"] include = ["trace_analyzer", "trace_formatter", "trace_model_meta*"]
[tool.setuptools.package-data]
trace_model_meta = ["**/*.json", "**/*.jinja", "**/*.py"]
[dependency-groups] [dependency-groups]
dev = [ dev = [

View File

@@ -1,5 +1,8 @@
import json import json
import shutil
import subprocess
import tempfile import tempfile
import tomllib
import unittest import unittest
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
@@ -655,6 +658,33 @@ class AliTracePipelineTest(unittest.TestCase):
self.assertEqual(Path(captured["dir"]), output_dir) self.assertEqual(Path(captured["dir"]), output_dir)
def test_format_and_sort_trace_supports_zstd_input_during_time_inference(self):
    """Smoke test: the formatter accepts a .jsonl.zst file as its input path."""
    # zstd binaries are an external dependency; skip rather than fail without them.
    if shutil.which("zstd") is None or shutil.which("zstdcat") is None:
        self.skipTest("zstd/zstdcat are required for .jsonl.zst formatter smoke test")
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        # File name encodes the trace window (MMDD-HHMM-HHMM) used for time inference.
        raw_path = root / "0417-1500-1530.jsonl"
        zst_path = root / "0417-1500-1530.jsonl.zst"
        output_path = root / "formatted" / "trace-raw.jsonl"
        raw_path.write_text(
            json.dumps(
                make_raw_row(
                    "req-zst",
                    utc_ms("2026-04-17 15:00:01.000"),
                    time_text="2026-04-17 15:00:01.000",
                )
            )
            + "\n",
            encoding="utf-8",
        )
        # Compress the raw trace; only the .zst variant is handed to the formatter.
        subprocess.run(["zstd", "-q", str(raw_path), "-o", str(zst_path)], check=True)
        stats = format_and_sort_trace(input_dir=zst_path, output_path=output_path, chunk_bytes=256)
        self.assertEqual(stats["row_count"], 1)
        formatted_rows = [json.loads(line) for line in output_path.read_text(encoding="utf-8").splitlines()]
        self.assertEqual([row["meta"]["request_id"] for row in formatted_rows], ["req-zst"])
def test_export_release_ready_trace_defaults_temp_dir_to_output_parent(self): def test_export_release_ready_trace_defaults_temp_dir_to_output_parent(self):
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
root = Path(temp_dir) root = Path(temp_dir)
@@ -855,3 +885,57 @@ class AliTracePipelineTest(unittest.TestCase):
(analysis_dir / "details" / "details_summary.json").stat().st_mtime_ns, (analysis_dir / "details" / "details_summary.json").stat().st_mtime_ns,
details_summary_mtime_ns, details_summary_mtime_ns,
) )
def test_trace_analyzer_rebuilds_when_same_output_gets_different_input(self):
    """Re-running analyze into the same dataset dir with a different input must rebuild outputs."""
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        formatted_root = root / "outputs" / "formatted"
        analysis_root = root / "outputs" / "analysis"

        def build_trace(label: str, request_id: str, ready: str) -> Path:
            # Produce a formatted + release-ready trace containing one synthetic request.
            raw_dir = root / label
            raw_dir.mkdir()
            raw_path = raw_dir / "0417-1500-1530.jsonl"
            raw_path.write_text(
                json.dumps(make_raw_row(request_id, utc_ms(ready), user_id=f"user-{request_id}")) + "\n",
                encoding="utf-8",
            )
            self.assertEqual(formatter_main(["format", str(raw_dir), "--output-root", str(formatted_root)]), 0)
            formatted_path = formatted_root / "041715-041715-raw.jsonl"
            # Rename so the two traces can coexist under the same formatted root.
            renamed_path = formatted_root / f"{label}-raw.jsonl"
            formatted_path.replace(renamed_path)
            self.assertEqual(formatter_main(["build-release", str(renamed_path), "--jobs", "1", "--block-size", "8"]), 0)
            return renamed_path

        first_path = build_trace("first", "req-first", "2026-04-17 15:00:01.000")
        second_path = build_trace("second", "req-second", "2026-04-17 15:00:02.000")
        # Both runs share the same dataset name, i.e. the same output directory.
        common_args = [
            "--output-root",
            str(analysis_root),
            "--dataset-name",
            "same-dataset",
            "--segment-mode",
            "bytes",
            "--block-size",
            "8",
        ]
        self.assertEqual(analyzer_main(["analyze", str(first_path), *common_args]), 0)
        analysis_dir = analysis_root / "same-dataset"
        self.assertIn("req-first", (analysis_dir / "features.csv").read_text(encoding="utf-8"))
        # Second run with a different input: outputs must reflect the new trace only.
        self.assertEqual(analyzer_main(["analyze", str(second_path), *common_args]), 0)
        features_text = (analysis_dir / "features.csv").read_text(encoding="utf-8")
        self.assertIn("req-second", features_text)
        self.assertNotIn("req-first", features_text)
        details_summary = json.loads((analysis_dir / "details" / "details_summary.json").read_text(encoding="utf-8"))
        self.assertTrue(str(details_summary["release_path"]).endswith("second.jsonl"))
def test_pyproject_includes_trace_model_meta_package_data(self):
    """Packaging guard: trace_model_meta data globs must be declared in pyproject.toml."""
    config = tomllib.loads(Path("pyproject.toml").read_text(encoding="utf-8"))
    data_globs = config["tool"]["setuptools"]["package-data"]
    self.assertIn("trace_model_meta", data_globs)
    for pattern in ("**/*.json", "**/*.jinja"):
        self.assertIn(pattern, data_globs["trace_model_meta"])

View File

@@ -14,6 +14,9 @@ from .reporting import write_reports
from .resume_advanced import collect_existing_detail_paths, run_advanced_from_existing from .resume_advanced import collect_existing_detail_paths, run_advanced_from_existing
from .study import parse_input_length_bucket_thresholds, run_study from .study import parse_input_length_bucket_thresholds, run_study
# Name of the sidecar file that records which inputs/options produced an analysis dir.
ANALYSIS_PROVENANCE_FILENAME = "analysis_provenance.json"
# Bump when the provenance payload shape changes so older files never match.
ANALYSIS_PROVENANCE_SCHEMA_VERSION = 1
def build_parser(): def build_parser():
parser = argparse.ArgumentParser(description="Analyze coding-agent trace patterns.") parser = argparse.ArgumentParser(description="Analyze coding-agent trace patterns.")
@@ -282,6 +285,70 @@ def _existing_detail_outputs(output_dir):
return collect_existing_detail_paths(output_dir) return collect_existing_detail_paths(output_dir)
def _file_fingerprint(path: str | Path) -> dict:
resolved = Path(path).resolve()
stat = resolved.stat()
return {
"path": str(resolved),
"size": stat.st_size,
"mtime_ns": stat.st_mtime_ns,
}
def _path_option(value: str | None) -> str | None:
if value is None:
return None
path = Path(value)
return str(path.resolve()) if path.exists() else str(value)
def _expected_analysis_provenance(args, release_input_path: Path, input_length_bucket_thresholds: list[int]) -> dict:
    """Build the provenance payload that must match on disk for outputs to be reused."""
    options = {
        "block_size": args.block_size,
        "segment_mode": args.segment_mode,
        "tokenizer_path": _path_option(args.tokenizer_path),
        "tokenizer_batch_size": args.tokenizer_batch_size,
        "model_family": args.model_family,
        "model_meta_dir": _path_option(args.model_meta_dir),
        "input_length_buckets": input_length_bucket_thresholds,
    }
    return {
        "schema_version": ANALYSIS_PROVENANCE_SCHEMA_VERSION,
        "raw_input": _file_fingerprint(args.input),
        "release_input": _file_fingerprint(release_input_path),
        "options": options,
    }
def _analysis_provenance_path(output_dir: Path) -> Path:
    """Location of the provenance sidecar inside an analysis output directory."""
    return output_dir.joinpath(ANALYSIS_PROVENANCE_FILENAME)
def _load_analysis_provenance(output_dir: Path) -> dict | None:
    """Read the stored provenance, or return None when absent, unreadable, or not a dict."""
    source = _analysis_provenance_path(output_dir)
    if not source.exists():
        return None
    try:
        payload = json.loads(source.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        # Treat a corrupt or vanished sidecar the same as a missing one.
        return None
    return payload if isinstance(payload, dict) else None
def _analysis_provenance_matches(output_dir: Path, expected: dict) -> bool:
    """True only when the stored provenance equals the expected payload exactly."""
    stored = _load_analysis_provenance(output_dir)
    return stored == expected
def _write_analysis_provenance(output_dir: Path, provenance: dict) -> None:
    """Atomically persist *provenance* into the analysis output directory.

    Writes to a ``.tmp`` sibling and renames into place so readers never observe
    a partially written file. Fix: the temp file is now removed if the write
    fails, instead of being left behind to litter the output directory.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    destination = _analysis_provenance_path(output_dir)
    temp_path = destination.with_suffix(destination.suffix + ".tmp")
    try:
        with temp_path.open("w", encoding="utf-8") as handle:
            # sort_keys keeps the file byte-stable so equality checks are meaningful.
            json.dump(provenance, handle, ensure_ascii=False, indent=2, sort_keys=True)
            handle.write("\n")
        temp_path.replace(destination)
    except BaseException:
        temp_path.unlink(missing_ok=True)
        raise
def _stage_message(progress, step: int, total_steps: int, message: str) -> None: def _stage_message(progress, step: int, total_steps: int, message: str) -> None:
tqdm.write(f"Stage {step}/{total_steps}: {message}") tqdm.write(f"Stage {step}/{total_steps}: {message}")
progress.update(1) progress.update(1)
@@ -301,6 +368,8 @@ def main(argv=None):
f"Release trace not found for raw trace {args.input}. " f"Release trace not found for raw trace {args.input}. "
"Run `python -m trace_formatter build-release <raw-trace>` first, or pass --release-input." "Run `python -m trace_formatter build-release <raw-trace>` first, or pass --release-input."
) )
expected_provenance = _expected_analysis_provenance(args, release_input_path, input_length_bucket_thresholds)
reuse_allowed = _analysis_provenance_matches(output_dir, expected_provenance)
total_steps = 4 total_steps = 4
progress = tqdm( progress = tqdm(
total=total_steps, total=total_steps,
@@ -310,7 +379,7 @@ def main(argv=None):
) )
try: try:
prepare_result = None prepare_result = None
reusable_base = _existing_base_outputs(output_dir) reusable_base = _existing_base_outputs(output_dir) if reuse_allowed else None
if reusable_base: if reusable_base:
_stage_message(progress, 1, total_steps, "reuse existing features.csv") _stage_message(progress, 1, total_steps, "reuse existing features.csv")
prepare_result = { prepare_result = {
@@ -320,7 +389,7 @@ def main(argv=None):
else: else:
_stage_message(progress, 1, total_steps, "prepare features.csv") _stage_message(progress, 1, total_steps, "prepare features.csv")
prepare_result = stream_prepare(args.input, output_dir, show_progress=True) prepare_result = stream_prepare(args.input, output_dir, show_progress=True)
reusable_details = _existing_detail_outputs(output_dir) reusable_details = _existing_detail_outputs(output_dir) if reuse_allowed else None
if reusable_details: if reusable_details:
_stage_message(progress, 2, total_steps, "reuse existing details/") _stage_message(progress, 2, total_steps, "reuse existing details/")
advanced_paths = reusable_details advanced_paths = reusable_details
@@ -362,6 +431,7 @@ def main(argv=None):
dataset_title=dataset_name, dataset_title=dataset_name,
show_progress=True, show_progress=True,
) )
_write_analysis_provenance(output_dir, expected_provenance)
finally: finally:
progress.close() progress.close()
print( print(

View File

@@ -6,14 +6,13 @@ import json
import os import os
import shutil import shutil
import sys import sys
import subprocess
import tempfile import tempfile
from array import array from array import array
from concurrent.futures import ProcessPoolExecutor, as_completed from concurrent.futures import ProcessPoolExecutor, as_completed
from contextlib import contextmanager, nullcontext from contextlib import contextmanager, nullcontext
from dataclasses import asdict from dataclasses import asdict
from pathlib import Path from pathlib import Path
from typing import Iterator, TextIO from typing import Iterator
from trace_analyzer.helpers import normalize_unicode_text, parse_jsonish, safe_int from trace_analyzer.helpers import normalize_unicode_text, parse_jsonish, safe_int
from tokenizers import Tokenizer from tokenizers import Tokenizer
@@ -32,6 +31,7 @@ from .sessionization import (
encode_roles, encode_roles,
extract_user_id, extract_user_id,
) )
from .trace_io import open_trace_text
from .time_windows import infer_time_offset_ms, infer_time_window, parse_time_to_ms from .time_windows import infer_time_offset_ms, infer_time_window, parse_time_to_ms
@@ -88,34 +88,6 @@ def discover_source_files(input_dir: str | Path) -> list[Path]:
return files return files
@contextmanager
def open_trace_text(path: str | Path) -> Iterator[TextIO]:
    """Yield a text handle for a trace file, streaming .zst files through zstdcat."""
    resolved = Path(path)
    if resolved.suffix == ".zst":
        # Decompress via an external zstdcat process and hand its stdout to the caller.
        proc = subprocess.Popen(
            ["zstdcat", str(resolved)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding="utf-8",
        )
        if proc.stdout is None:
            raise RuntimeError(f"Failed to stream {resolved}")
        try:
            yield proc.stdout
        finally:
            stdout = proc.stdout
            stdout.close()
            # Collect stderr before waiting so a failure message can be surfaced.
            stderr = proc.stderr.read() if proc.stderr else ""
            return_code = proc.wait()
            if return_code != 0:
                raise RuntimeError(f"zstdcat failed for {resolved}: {stderr.strip()}")
        # Early return: the generator must not fall through and yield a second time.
        return
    with resolved.open("r", encoding="utf-8") as handle:
        yield handle
def _normalize_time_ms(*, raw_time_ms: int, wall_clock_ms: int, time_offset_ms: int) -> int: def _normalize_time_ms(*, raw_time_ms: int, wall_clock_ms: int, time_offset_ms: int) -> int:
if not raw_time_ms: if not raw_time_ms:
return wall_clock_ms - time_offset_ms if wall_clock_ms and time_offset_ms else wall_clock_ms return wall_clock_ms - time_offset_ms if wall_clock_ms and time_offset_ms else wall_clock_ms

View File

@@ -8,6 +8,8 @@ from pathlib import Path
from trace_analyzer.helpers import parse_jsonish, safe_int from trace_analyzer.helpers import parse_jsonish, safe_int
from .trace_io import open_trace_text
WINDOW_RE = re.compile(r"(?P<day>\d{4})-(?P<start>\d{4})-(?P<end>\d{4})$") WINDOW_RE = re.compile(r"(?P<day>\d{4})-(?P<start>\d{4})-(?P<end>\d{4})$")
UTC_PLUS_8 = timezone(timedelta(hours=8)) UTC_PLUS_8 = timezone(timedelta(hours=8))
@@ -32,8 +34,17 @@ def parse_time_to_ms(value: str) -> int:
raise ValueError(f"Unsupported timestamp format: {value!r}") raise ValueError(f"Unsupported timestamp format: {value!r}")
def _trace_window_name(path: Path) -> str:
name = path.name
if name.endswith(".jsonl.zst"):
return name[: -len(".jsonl.zst")]
if name.endswith(".jsonl"):
return name[: -len(".jsonl")]
return path.stem
def _read_first_timestamp(path: Path) -> str: def _read_first_timestamp(path: Path) -> str:
with path.open("r", encoding="utf-8") as handle: with open_trace_text(path) as handle:
for line in handle: for line in handle:
stripped = line.strip() stripped = line.strip()
if not stripped: if not stripped:
@@ -46,7 +57,7 @@ def _read_first_timestamp(path: Path) -> str:
def _read_first_timestamp_and_ready_ms(path: Path) -> tuple[str, int]: def _read_first_timestamp_and_ready_ms(path: Path) -> tuple[str, int]:
with path.open("r", encoding="utf-8") as handle: with open_trace_text(path) as handle:
for line in handle: for line in handle:
stripped = line.strip() stripped = line.strip()
if not stripped: if not stripped:
@@ -90,8 +101,8 @@ def infer_time_window(
if not source_files: if not source_files:
return None return None
first_match = WINDOW_RE.match(source_files[0].stem) first_match = WINDOW_RE.match(_trace_window_name(source_files[0]))
last_match = WINDOW_RE.match(source_files[-1].stem) last_match = WINDOW_RE.match(_trace_window_name(source_files[-1]))
if first_match is None or last_match is None: if first_match is None or last_match is None:
return None return None

View File

@@ -0,0 +1,33 @@
from __future__ import annotations
import subprocess
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator, TextIO
@contextmanager
def open_trace_text(path: str | Path) -> Iterator[TextIO]:
    """Yield a text handle for a trace file, streaming .zst files through zstdcat.

    Plain files are opened directly as UTF-8 text. For ``.zst`` files an external
    ``zstdcat`` process is spawned and its stdout is yielded.

    Fixes over the previous version:
    - ``proc.stderr`` is now closed after being read (it previously leaked a pipe fd).
    - If the caller's ``with`` body raises, closing stdout makes zstdcat exit with a
      broken-pipe error; that nonzero exit no longer raises a RuntimeError that would
      mask the caller's original exception.

    Raises:
        RuntimeError: when zstdcat cannot be started or exits nonzero on a clean read.
    """
    resolved = Path(path)
    if resolved.suffix != ".zst":
        # Plain-text trace: no subprocess needed.
        with resolved.open("r", encoding="utf-8") as handle:
            yield handle
        return
    proc = subprocess.Popen(
        ["zstdcat", str(resolved)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        encoding="utf-8",
    )
    if proc.stdout is None:
        raise RuntimeError(f"Failed to stream {resolved}")
    body_failed = False
    try:
        yield proc.stdout
    except BaseException:
        # Remember that the caller aborted mid-stream so we don't mask their error.
        body_failed = True
        raise
    finally:
        proc.stdout.close()
        stderr = proc.stderr.read() if proc.stderr else ""
        if proc.stderr is not None:
            proc.stderr.close()
        return_code = proc.wait()
        if return_code != 0 and not body_failed:
            raise RuntimeError(f"zstdcat failed for {resolved}: {stderr.strip()}")