Fix trace reuse and packaged model assets
This commit is contained in:
@@ -21,7 +21,10 @@ trace-formatter = "trace_formatter.cli:main"
|
|||||||
trace-analyzer = "trace_analyzer.cli:main"
|
trace-analyzer = "trace_analyzer.cli:main"
|
||||||
|
|
||||||
[tool.setuptools.packages.find]
|
[tool.setuptools.packages.find]
|
||||||
include = ["trace_analyzer", "trace_formatter", "trace_model_meta"]
|
include = ["trace_analyzer", "trace_formatter", "trace_model_meta*"]
|
||||||
|
|
||||||
|
[tool.setuptools.package-data]
|
||||||
|
trace_model_meta = ["**/*.json", "**/*.jinja", "**/*.py"]
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
dev = [
|
dev = [
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
import json
|
import json
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import tomllib
|
||||||
import unittest
|
import unittest
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -655,6 +658,33 @@ class AliTracePipelineTest(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertEqual(Path(captured["dir"]), output_dir)
|
self.assertEqual(Path(captured["dir"]), output_dir)
|
||||||
|
|
||||||
|
def test_format_and_sort_trace_supports_zstd_input_during_time_inference(self):
    """Format a one-row ``.jsonl.zst`` trace and check the row is emitted.

    The fixture is compressed with the ``zstd`` command-line tool, so the
    test is skipped when ``zstd`` or ``zstdcat`` is not on ``PATH``.
    """
    if any(shutil.which(tool) is None for tool in ("zstd", "zstdcat")):
        self.skipTest("zstd/zstdcat are required for .jsonl.zst formatter smoke test")

    with tempfile.TemporaryDirectory() as scratch:
        workspace = Path(scratch)
        plain_trace = workspace / "0417-1500-1530.jsonl"
        compressed_trace = workspace / "0417-1500-1530.jsonl.zst"
        formatted_trace = workspace / "formatted" / "trace-raw.jsonl"

        # Single raw row whose textual timestamp matches its ready time.
        raw_row = make_raw_row(
            "req-zst",
            utc_ms("2026-04-17 15:00:01.000"),
            time_text="2026-04-17 15:00:01.000",
        )
        plain_trace.write_text(json.dumps(raw_row) + "\n", encoding="utf-8")
        compress_command = ["zstd", "-q", str(plain_trace), "-o", str(compressed_trace)]
        subprocess.run(compress_command, check=True)

        stats = format_and_sort_trace(
            input_dir=compressed_trace,
            output_path=formatted_trace,
            chunk_bytes=256,
        )

        self.assertEqual(stats["row_count"], 1)
        output_lines = formatted_trace.read_text(encoding="utf-8").splitlines()
        request_ids = [json.loads(line)["meta"]["request_id"] for line in output_lines]
        self.assertEqual(request_ids, ["req-zst"])
|
||||||
|
|
||||||
def test_export_release_ready_trace_defaults_temp_dir_to_output_parent(self):
|
def test_export_release_ready_trace_defaults_temp_dir_to_output_parent(self):
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
root = Path(temp_dir)
|
root = Path(temp_dir)
|
||||||
@@ -855,3 +885,57 @@ class AliTracePipelineTest(unittest.TestCase):
|
|||||||
(analysis_dir / "details" / "details_summary.json").stat().st_mtime_ns,
|
(analysis_dir / "details" / "details_summary.json").stat().st_mtime_ns,
|
||||||
details_summary_mtime_ns,
|
details_summary_mtime_ns,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_trace_analyzer_rebuilds_when_same_output_gets_different_input(self):
    """Analyze two different traces into the same dataset directory.

    After the second run the shared output directory must describe only
    the second trace: its request id appears in ``features.csv``, the first
    request id is gone, and the details summary points at the second
    release file.
    """
    with tempfile.TemporaryDirectory() as scratch:
        workspace = Path(scratch)
        outputs_root = workspace / "outputs"
        formatted_root = outputs_root / "formatted"
        analysis_root = outputs_root / "analysis"

        def build_trace(label: str, request_id: str, ready: str) -> Path:
            # Write a one-row raw trace, format it, give the formatted file
            # a per-label name, then build its release artifact.
            source_dir = workspace / label
            source_dir.mkdir()
            row = make_raw_row(request_id, utc_ms(ready), user_id=f"user-{request_id}")
            (source_dir / "0417-1500-1530.jsonl").write_text(
                json.dumps(row) + "\n",
                encoding="utf-8",
            )
            format_status = formatter_main(
                ["format", str(source_dir), "--output-root", str(formatted_root)]
            )
            self.assertEqual(format_status, 0)

            labelled_path = formatted_root / f"{label}-raw.jsonl"
            (formatted_root / "041715-041715-raw.jsonl").replace(labelled_path)
            release_status = formatter_main(
                ["build-release", str(labelled_path), "--jobs", "1", "--block-size", "8"]
            )
            self.assertEqual(release_status, 0)
            return labelled_path

        first_path = build_trace("first", "req-first", "2026-04-17 15:00:01.000")
        second_path = build_trace("second", "req-second", "2026-04-17 15:00:02.000")

        # Both analyzer runs share the dataset name and therefore the output directory.
        shared_options = [
            "--output-root",
            str(analysis_root),
            "--dataset-name",
            "same-dataset",
            "--segment-mode",
            "bytes",
            "--block-size",
            "8",
        ]
        analysis_dir = analysis_root / "same-dataset"
        features_csv = analysis_dir / "features.csv"

        first_status = analyzer_main(["analyze", str(first_path), *shared_options])
        self.assertEqual(first_status, 0)
        self.assertIn("req-first", features_csv.read_text(encoding="utf-8"))

        second_status = analyzer_main(["analyze", str(second_path), *shared_options])
        self.assertEqual(second_status, 0)

        rebuilt_features = features_csv.read_text(encoding="utf-8")
        self.assertIn("req-second", rebuilt_features)
        self.assertNotIn("req-first", rebuilt_features)

        summary_path = analysis_dir / "details" / "details_summary.json"
        summary = json.loads(summary_path.read_text(encoding="utf-8"))
        self.assertTrue(str(summary["release_path"]).endswith("second.jsonl"))
|
||||||
|
|
||||||
|
def test_pyproject_includes_trace_model_meta_package_data(self):
    """Check that pyproject.toml ships trace_model_meta's JSON and Jinja assets.

    The project file is located by walking up from this test module, so the
    test does not depend on the directory the runner was started from. If no
    ancestor contains a ``pyproject.toml``, the cwd-relative path is used.
    """
    pyproject_path = next(
        (
            ancestor / "pyproject.toml"
            for ancestor in Path(__file__).resolve().parents
            if (ancestor / "pyproject.toml").is_file()
        ),
        Path("pyproject.toml"),
    )
    pyproject = tomllib.loads(pyproject_path.read_text(encoding="utf-8"))
    package_data = pyproject["tool"]["setuptools"]["package-data"]

    self.assertIn("trace_model_meta", package_data)
    self.assertIn("**/*.json", package_data["trace_model_meta"])
    self.assertIn("**/*.jinja", package_data["trace_model_meta"])
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ from .reporting import write_reports
|
|||||||
from .resume_advanced import collect_existing_detail_paths, run_advanced_from_existing
|
from .resume_advanced import collect_existing_detail_paths, run_advanced_from_existing
|
||||||
from .study import parse_input_length_bucket_thresholds, run_study
|
from .study import parse_input_length_bucket_thresholds, run_study
|
||||||
|
|
||||||
|
ANALYSIS_PROVENANCE_FILENAME = "analysis_provenance.json"
|
||||||
|
ANALYSIS_PROVENANCE_SCHEMA_VERSION = 1
|
||||||
|
|
||||||
|
|
||||||
def build_parser():
|
def build_parser():
|
||||||
parser = argparse.ArgumentParser(description="Analyze coding-agent trace patterns.")
|
parser = argparse.ArgumentParser(description="Analyze coding-agent trace patterns.")
|
||||||
@@ -282,6 +285,70 @@ def _existing_detail_outputs(output_dir):
|
|||||||
return collect_existing_detail_paths(output_dir)
|
return collect_existing_detail_paths(output_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def _file_fingerprint(path: str | Path) -> dict:
|
||||||
|
resolved = Path(path).resolve()
|
||||||
|
stat = resolved.stat()
|
||||||
|
return {
|
||||||
|
"path": str(resolved),
|
||||||
|
"size": stat.st_size,
|
||||||
|
"mtime_ns": stat.st_mtime_ns,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _path_option(value: str | None) -> str | None:
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
path = Path(value)
|
||||||
|
return str(path.resolve()) if path.exists() else str(value)
|
||||||
|
|
||||||
|
|
||||||
|
def _expected_analysis_provenance(args, release_input_path: Path, input_length_bucket_thresholds: list[int]) -> dict:
    """Build the provenance record that the current invocation would produce.

    Args:
        args: Parsed CLI namespace; ``input``, ``block_size``, ``segment_mode``,
            ``tokenizer_path``, ``tokenizer_batch_size``, ``model_family`` and
            ``model_meta_dir`` are read from it.
        release_input_path: Release trace paired with the raw input.
        input_length_bucket_thresholds: Bucket thresholds recorded as-is.

    Returns:
        A dict holding the schema version, fingerprints of the raw and release
        inputs, and the options listed above.
    """
    raw_fingerprint = _file_fingerprint(args.input)
    release_fingerprint = _file_fingerprint(release_input_path)
    options = {
        "block_size": args.block_size,
        "segment_mode": args.segment_mode,
        "tokenizer_path": _path_option(args.tokenizer_path),
        "tokenizer_batch_size": args.tokenizer_batch_size,
        "model_family": args.model_family,
        "model_meta_dir": _path_option(args.model_meta_dir),
        "input_length_buckets": input_length_bucket_thresholds,
    }
    return {
        "schema_version": ANALYSIS_PROVENANCE_SCHEMA_VERSION,
        "raw_input": raw_fingerprint,
        "release_input": release_fingerprint,
        "options": options,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _analysis_provenance_path(output_dir: Path) -> Path:
    """Return the location of the provenance file inside *output_dir*."""
    return output_dir.joinpath(ANALYSIS_PROVENANCE_FILENAME)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_analysis_provenance(output_dir: Path) -> dict | None:
    """Read the provenance record stored in *output_dir*, if a usable one exists.

    Args:
        output_dir: Analysis output directory that may contain the record.

    Returns:
        The decoded JSON object, or ``None`` when the file is missing,
        unreadable, not valid UTF-8 or JSON, or does not decode to a dict.
    """
    path = _analysis_provenance_path(output_dir)
    try:
        with path.open("r", encoding="utf-8") as handle:
            data = json.load(handle)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError, so a corrupt record is
        # treated as "no record" instead of aborting the run.
        return None
    return data if isinstance(data, dict) else None
|
||||||
|
|
||||||
|
|
||||||
|
def _analysis_provenance_matches(output_dir: Path, expected: dict) -> bool:
    """Tell whether the record stored in *output_dir* equals *expected*."""
    recorded = _load_analysis_provenance(output_dir)
    return recorded == expected
|
||||||
|
|
||||||
|
|
||||||
|
def _write_analysis_provenance(output_dir: Path, provenance: dict) -> None:
    """Store *provenance* as JSON in *output_dir*.

    The record is written to a sibling ``.tmp`` file first and then moved
    over the final name with ``Path.replace``. *output_dir* is created when
    it does not exist yet.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    final_path = _analysis_provenance_path(output_dir)
    staging_path = final_path.with_suffix(f"{final_path.suffix}.tmp")
    with staging_path.open("w", encoding="utf-8") as stream:
        json.dump(provenance, stream, ensure_ascii=False, indent=2, sort_keys=True)
        stream.write("\n")
    staging_path.replace(final_path)
|
||||||
|
|
||||||
|
|
||||||
def _stage_message(progress, step: int, total_steps: int, message: str) -> None:
|
def _stage_message(progress, step: int, total_steps: int, message: str) -> None:
|
||||||
tqdm.write(f"Stage {step}/{total_steps}: {message}")
|
tqdm.write(f"Stage {step}/{total_steps}: {message}")
|
||||||
progress.update(1)
|
progress.update(1)
|
||||||
@@ -301,6 +368,8 @@ def main(argv=None):
|
|||||||
f"Release trace not found for raw trace {args.input}. "
|
f"Release trace not found for raw trace {args.input}. "
|
||||||
"Run `python -m trace_formatter build-release <raw-trace>` first, or pass --release-input."
|
"Run `python -m trace_formatter build-release <raw-trace>` first, or pass --release-input."
|
||||||
)
|
)
|
||||||
|
expected_provenance = _expected_analysis_provenance(args, release_input_path, input_length_bucket_thresholds)
|
||||||
|
reuse_allowed = _analysis_provenance_matches(output_dir, expected_provenance)
|
||||||
total_steps = 4
|
total_steps = 4
|
||||||
progress = tqdm(
|
progress = tqdm(
|
||||||
total=total_steps,
|
total=total_steps,
|
||||||
@@ -310,7 +379,7 @@ def main(argv=None):
|
|||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
prepare_result = None
|
prepare_result = None
|
||||||
reusable_base = _existing_base_outputs(output_dir)
|
reusable_base = _existing_base_outputs(output_dir) if reuse_allowed else None
|
||||||
if reusable_base:
|
if reusable_base:
|
||||||
_stage_message(progress, 1, total_steps, "reuse existing features.csv")
|
_stage_message(progress, 1, total_steps, "reuse existing features.csv")
|
||||||
prepare_result = {
|
prepare_result = {
|
||||||
@@ -320,7 +389,7 @@ def main(argv=None):
|
|||||||
else:
|
else:
|
||||||
_stage_message(progress, 1, total_steps, "prepare features.csv")
|
_stage_message(progress, 1, total_steps, "prepare features.csv")
|
||||||
prepare_result = stream_prepare(args.input, output_dir, show_progress=True)
|
prepare_result = stream_prepare(args.input, output_dir, show_progress=True)
|
||||||
reusable_details = _existing_detail_outputs(output_dir)
|
reusable_details = _existing_detail_outputs(output_dir) if reuse_allowed else None
|
||||||
if reusable_details:
|
if reusable_details:
|
||||||
_stage_message(progress, 2, total_steps, "reuse existing details/")
|
_stage_message(progress, 2, total_steps, "reuse existing details/")
|
||||||
advanced_paths = reusable_details
|
advanced_paths = reusable_details
|
||||||
@@ -362,6 +431,7 @@ def main(argv=None):
|
|||||||
dataset_title=dataset_name,
|
dataset_title=dataset_name,
|
||||||
show_progress=True,
|
show_progress=True,
|
||||||
)
|
)
|
||||||
|
_write_analysis_provenance(output_dir, expected_provenance)
|
||||||
finally:
|
finally:
|
||||||
progress.close()
|
progress.close()
|
||||||
print(
|
print(
|
||||||
|
|||||||
@@ -6,14 +6,13 @@ import json
|
|||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
import subprocess
|
|
||||||
import tempfile
|
import tempfile
|
||||||
from array import array
|
from array import array
|
||||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||||
from contextlib import contextmanager, nullcontext
|
from contextlib import contextmanager, nullcontext
|
||||||
from dataclasses import asdict
|
from dataclasses import asdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterator, TextIO
|
from typing import Iterator
|
||||||
|
|
||||||
from trace_analyzer.helpers import normalize_unicode_text, parse_jsonish, safe_int
|
from trace_analyzer.helpers import normalize_unicode_text, parse_jsonish, safe_int
|
||||||
from tokenizers import Tokenizer
|
from tokenizers import Tokenizer
|
||||||
@@ -32,6 +31,7 @@ from .sessionization import (
|
|||||||
encode_roles,
|
encode_roles,
|
||||||
extract_user_id,
|
extract_user_id,
|
||||||
)
|
)
|
||||||
|
from .trace_io import open_trace_text
|
||||||
from .time_windows import infer_time_offset_ms, infer_time_window, parse_time_to_ms
|
from .time_windows import infer_time_offset_ms, infer_time_window, parse_time_to_ms
|
||||||
|
|
||||||
|
|
||||||
@@ -88,34 +88,6 @@ def discover_source_files(input_dir: str | Path) -> list[Path]:
|
|||||||
return files
|
return files
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
|
||||||
def open_trace_text(path: str | Path) -> Iterator[TextIO]:
|
|
||||||
resolved = Path(path)
|
|
||||||
if resolved.suffix == ".zst":
|
|
||||||
proc = subprocess.Popen(
|
|
||||||
["zstdcat", str(resolved)],
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
text=True,
|
|
||||||
encoding="utf-8",
|
|
||||||
)
|
|
||||||
if proc.stdout is None:
|
|
||||||
raise RuntimeError(f"Failed to stream {resolved}")
|
|
||||||
try:
|
|
||||||
yield proc.stdout
|
|
||||||
finally:
|
|
||||||
stdout = proc.stdout
|
|
||||||
stdout.close()
|
|
||||||
stderr = proc.stderr.read() if proc.stderr else ""
|
|
||||||
return_code = proc.wait()
|
|
||||||
if return_code != 0:
|
|
||||||
raise RuntimeError(f"zstdcat failed for {resolved}: {stderr.strip()}")
|
|
||||||
return
|
|
||||||
|
|
||||||
with resolved.open("r", encoding="utf-8") as handle:
|
|
||||||
yield handle
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_time_ms(*, raw_time_ms: int, wall_clock_ms: int, time_offset_ms: int) -> int:
|
def _normalize_time_ms(*, raw_time_ms: int, wall_clock_ms: int, time_offset_ms: int) -> int:
|
||||||
if not raw_time_ms:
|
if not raw_time_ms:
|
||||||
return wall_clock_ms - time_offset_ms if wall_clock_ms and time_offset_ms else wall_clock_ms
|
return wall_clock_ms - time_offset_ms if wall_clock_ms and time_offset_ms else wall_clock_ms
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ from pathlib import Path
|
|||||||
|
|
||||||
from trace_analyzer.helpers import parse_jsonish, safe_int
|
from trace_analyzer.helpers import parse_jsonish, safe_int
|
||||||
|
|
||||||
|
from .trace_io import open_trace_text
|
||||||
|
|
||||||
WINDOW_RE = re.compile(r"(?P<day>\d{4})-(?P<start>\d{4})-(?P<end>\d{4})$")
|
WINDOW_RE = re.compile(r"(?P<day>\d{4})-(?P<start>\d{4})-(?P<end>\d{4})$")
|
||||||
UTC_PLUS_8 = timezone(timedelta(hours=8))
|
UTC_PLUS_8 = timezone(timedelta(hours=8))
|
||||||
|
|
||||||
@@ -32,8 +34,17 @@ def parse_time_to_ms(value: str) -> int:
|
|||||||
raise ValueError(f"Unsupported timestamp format: {value!r}")
|
raise ValueError(f"Unsupported timestamp format: {value!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _trace_window_name(path: Path) -> str:
    """Return the file name of *path* without its trace extension.

    ``.jsonl.zst`` and ``.jsonl`` are stripped as whole extensions; any other
    name falls back to ``path.stem``.
    """
    filename = path.name
    # The longer extension is listed first so ".jsonl.zst" is removed in one piece.
    for extension in (".jsonl.zst", ".jsonl"):
        if filename.endswith(extension):
            return filename[: len(filename) - len(extension)]
    return path.stem
|
||||||
|
|
||||||
|
|
||||||
def _read_first_timestamp(path: Path) -> str:
|
def _read_first_timestamp(path: Path) -> str:
|
||||||
with path.open("r", encoding="utf-8") as handle:
|
with open_trace_text(path) as handle:
|
||||||
for line in handle:
|
for line in handle:
|
||||||
stripped = line.strip()
|
stripped = line.strip()
|
||||||
if not stripped:
|
if not stripped:
|
||||||
@@ -46,7 +57,7 @@ def _read_first_timestamp(path: Path) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _read_first_timestamp_and_ready_ms(path: Path) -> tuple[str, int]:
|
def _read_first_timestamp_and_ready_ms(path: Path) -> tuple[str, int]:
|
||||||
with path.open("r", encoding="utf-8") as handle:
|
with open_trace_text(path) as handle:
|
||||||
for line in handle:
|
for line in handle:
|
||||||
stripped = line.strip()
|
stripped = line.strip()
|
||||||
if not stripped:
|
if not stripped:
|
||||||
@@ -90,8 +101,8 @@ def infer_time_window(
|
|||||||
if not source_files:
|
if not source_files:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
first_match = WINDOW_RE.match(source_files[0].stem)
|
first_match = WINDOW_RE.match(_trace_window_name(source_files[0]))
|
||||||
last_match = WINDOW_RE.match(source_files[-1].stem)
|
last_match = WINDOW_RE.match(_trace_window_name(source_files[-1]))
|
||||||
if first_match is None or last_match is None:
|
if first_match is None or last_match is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
33
trace_formatter/trace_io.py
Normal file
33
trace_formatter/trace_io.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterator, TextIO
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def open_trace_text(path: str | Path) -> Iterator[TextIO]:
|
||||||
|
resolved = Path(path)
|
||||||
|
if resolved.suffix == ".zst":
|
||||||
|
proc = subprocess.Popen(
|
||||||
|
["zstdcat", str(resolved)],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True,
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
if proc.stdout is None:
|
||||||
|
raise RuntimeError(f"Failed to stream {resolved}")
|
||||||
|
try:
|
||||||
|
yield proc.stdout
|
||||||
|
finally:
|
||||||
|
proc.stdout.close()
|
||||||
|
stderr = proc.stderr.read() if proc.stderr else ""
|
||||||
|
return_code = proc.wait()
|
||||||
|
if return_code != 0:
|
||||||
|
raise RuntimeError(f"zstdcat failed for {resolved}: {stderr.strip()}")
|
||||||
|
return
|
||||||
|
|
||||||
|
with resolved.open("r", encoding="utf-8") as handle:
|
||||||
|
yield handle
|
||||||
Reference in New Issue
Block a user