fix: tool-call parsing, illegal characters, and /tmp space usage
This commit is contained in:
@@ -3,11 +3,12 @@ import tempfile
|
||||
import unittest
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from trace_analyzer.cli import main as analyzer_main
|
||||
from trace_analyzer.parser import load_records
|
||||
from trace_formatter.cli import main as formatter_main
|
||||
from trace_formatter.formatting import build_unified_row, discover_source_files, format_and_sort_trace
|
||||
from trace_formatter.formatting import build_unified_row, discover_source_files, export_release_ready_trace, format_and_sort_trace
|
||||
|
||||
|
||||
def utc_ms(value: str) -> int:
|
||||
@@ -476,6 +477,74 @@ class AliTracePipelineTest(unittest.TestCase):
|
||||
formatted_rows = [json.loads(line) for line in output_path.read_text(encoding="utf-8").splitlines()]
|
||||
self.assertEqual([row["meta"]["request_id"] for row in formatted_rows], ["req-kept"])
|
||||
|
||||
def test_format_and_sort_trace_normalizes_invalid_surrogates_before_chunking(self):
    """Check that lone surrogates in a raw row are emitted as U+FFFD.

    A raw row whose request id and message content both contain the
    unpaired surrogate ``\\ud83c`` is formatted with a small chunk size
    (256 bytes). The test expects one output row in which the request id,
    the raw message content, the message event text length and the
    canonical prompt all reflect the replacement character.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        workspace = Path(temp_dir)
        raw_dir = workspace / "raw"
        raw_dir.mkdir()
        formatted_path = workspace / "formatted.jsonl"

        raw_row = make_raw_row(
            "req-\ud83c",
            utc_ms("2026-04-17 15:00:03.000"),
            messages=[{"role": "user", "content": "bad \ud83c content"}],
        )

        # json.dumps escapes non-ASCII by default, so the lone surrogate is
        # written as a plain "\ud83c" escape and the UTF-8 write succeeds.
        serialized = json.dumps(raw_row) + "\n"
        (raw_dir / "0417-1500-1530.jsonl").write_text(serialized, encoding="utf-8")

        stats = format_and_sort_trace(input_dir=raw_dir, output_path=formatted_path, chunk_bytes=256)

        self.assertEqual(stats["row_count"], 1)

        output_lines = formatted_path.read_text(encoding="utf-8").splitlines()
        first_row = [json.loads(line) for line in output_lines][0]

        self.assertEqual(first_row["meta"]["request_id"], "req-\uFFFD")
        self.assertEqual(first_row["message_events"][0]["text_len"], len("bad \uFFFD content"))
        self.assertEqual(first_row["raw_messages"][0]["content"], "bad \uFFFD content")
        self.assertIn("\uFFFD", first_row["canonical_prompt"])
|
||||
|
||||
def test_format_and_sort_trace_normalizes_nonstandard_glm_tool_call_shapes(self):
    """Check that dict-shaped ``tool_calls`` and ``tools`` still get rendered.

    The fixture supplies a single dict (rather than a list) for both the
    assistant message's ``tool_calls`` and the request's ``tools``
    parameter. The test expects one formatted row whose canonical prompt
    contains the tools section, the tool name, a tool call and the
    ``path`` argument key.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        workspace = Path(temp_dir)
        raw_dir = workspace / "raw"
        raw_dir.mkdir()
        formatted_path = workspace / "formatted.jsonl"

        # tool_calls is deliberately a bare dict instead of a list of calls.
        assistant_message = {
            "role": "assistant",
            "content": "calling tool",
            "tool_calls": {
                "id": "call-1",
                "type": "function",
                "arguments": "{\"path\":\"/tmp/a.txt\"}",
            },
        }
        raw_row = make_raw_row(
            "req-tool-call-shape",
            utc_ms("2026-04-17 15:00:04.000"),
            messages=[{"role": "user", "content": "hello"}, assistant_message],
        )

        # tools is likewise a bare dict; request_params is stored as a JSON
        # string, so decode it, patch it, and encode it again.
        tool_spec = {
            "type": "function",
            "name": "read_file",
            "parameters": {"type": "object", "properties": {"path": {"type": "string"}}},
        }
        decoded_params = json.loads(raw_row["request_params"])
        decoded_params["payload"]["parameters"]["tools"] = tool_spec
        raw_row["request_params"] = json.dumps(decoded_params)

        serialized = json.dumps(raw_row) + "\n"
        (raw_dir / "0417-1500-1530.jsonl").write_text(serialized, encoding="utf-8")

        stats = format_and_sort_trace(input_dir=raw_dir, output_path=formatted_path, chunk_bytes=256)

        self.assertEqual(stats["row_count"], 1)

        output_lines = formatted_path.read_text(encoding="utf-8").splitlines()
        prompt = [json.loads(line) for line in output_lines][0]["canonical_prompt"]

        self.assertIn("<tools>", prompt)
        self.assertIn("\"name\": \"read_file\"", prompt)
        self.assertIn("<tool_call>", prompt)
        self.assertIn("<arg_key>path</arg_key>", prompt)
|
||||
|
||||
def test_trace_formatter_cli_formats_one_raw_jsonl_file(self):
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
root = Path(temp_dir)
|
||||
@@ -564,6 +633,66 @@ class AliTracePipelineTest(unittest.TestCase):
|
||||
self.assertTrue(log_path.exists())
|
||||
self.assertIn("Scan raw trace", log_path.read_text(encoding="utf-8"))
|
||||
|
||||
def test_format_and_sort_trace_defaults_temp_dir_to_output_parent(self):
    """Check that formatting creates its scratch directory beside the output.

    ``tempfile.TemporaryDirectory`` is wrapped so the ``dir`` keyword it
    receives is recorded; the test expects that value to be the parent
    directory of ``output_path`` when no temp dir is passed explicitly.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        workspace = Path(temp_dir)
        raw_dir = workspace / "raw"
        raw_dir.mkdir()
        formatted_dir = workspace / "formatted"
        formatted_path = formatted_dir / "formatted.jsonl"

        raw_row = make_raw_row("req-1", utc_ms("2026-04-17 15:00:01.000"))
        (raw_dir / "0417-1500-1530.jsonl").write_text(json.dumps(raw_row) + "\n", encoding="utf-8")

        seen = {}
        # Grab the real class before patching so the wrapper can delegate to it.
        original_factory = tempfile.TemporaryDirectory

        def spy_temporary_directory(*args, **kwargs):
            # Only the keyword form of ``dir`` is recorded.
            seen["dir"] = kwargs.get("dir")
            return original_factory(*args, **kwargs)

        patcher = mock.patch(
            "trace_formatter.formatting.tempfile.TemporaryDirectory",
            side_effect=spy_temporary_directory,
        )
        with patcher:
            format_and_sort_trace(input_dir=raw_dir, output_path=formatted_path, chunk_bytes=256)

        self.assertEqual(Path(seen["dir"]), formatted_dir)
|
||||
|
||||
def test_export_release_ready_trace_defaults_temp_dir_to_output_parent(self):
    """Check that release export creates its scratch directory beside the output.

    ``tempfile.TemporaryDirectory`` is wrapped so the ``dir`` keyword it
    receives is recorded; the test expects that value to be the parent
    directory of ``release_output_path``.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        workspace = Path(temp_dir)
        source_path = workspace / "trace-raw.jsonl"
        release_dir = workspace / "release"
        release_path = release_dir / "trace.jsonl"

        # One already-formatted row is enough input for the export.
        meta = {
            "model_family": "glm5",
            "request_ready_time_ms": utc_ms("2026-04-17 15:00:01.000"),
            "chat_id": 0,
            "parent_chat_id": -1,
            "turn": 1,
        }
        formatted_row = {
            "schema_version": "2026.04.21",
            "sort_time_ms": utc_ms("2026-04-17 15:00:01.000"),
            "meta": meta,
            "canonical_prompt": "hello",
            "usage": {"input_tokens": 1, "output_tokens": 1},
        }
        source_path.write_text(json.dumps(formatted_row) + "\n", encoding="utf-8")

        seen = {}
        # Grab the real class before patching so the wrapper can delegate to it.
        original_factory = tempfile.TemporaryDirectory

        def spy_temporary_directory(*args, **kwargs):
            # Only the keyword form of ``dir`` is recorded.
            seen["dir"] = kwargs.get("dir")
            return original_factory(*args, **kwargs)

        patcher = mock.patch(
            "trace_formatter.formatting.tempfile.TemporaryDirectory",
            side_effect=spy_temporary_directory,
        )
        with patcher:
            export_release_ready_trace(
                raw_input_path=source_path,
                release_output_path=release_path,
                jobs=1,
                block_size=8,
            )

        self.assertEqual(Path(seen["dir"]), release_dir)
|
||||
|
||||
def test_format_and_sort_trace_infers_window_in_ready_time_scale_when_wall_clock_has_timezone_offset(self):
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
root = Path(temp_dir)
|
||||
|
||||
Reference in New Issue
Block a user