fix: tool-call parser, illegal character handling, and /tmp disk-space usage

This commit is contained in:
2026-04-22 01:19:09 +00:00
parent bce3fe1395
commit 152f01613b
4 changed files with 304 additions and 33 deletions

View File

@@ -3,11 +3,12 @@ import tempfile
import unittest
from datetime import datetime, timezone
from pathlib import Path
from unittest import mock
from trace_analyzer.cli import main as analyzer_main
from trace_analyzer.parser import load_records
from trace_formatter.cli import main as formatter_main
from trace_formatter.formatting import build_unified_row, discover_source_files, format_and_sort_trace
from trace_formatter.formatting import build_unified_row, discover_source_files, export_release_ready_trace, format_and_sort_trace
def utc_ms(value: str) -> int:
@@ -476,6 +477,74 @@ class AliTracePipelineTest(unittest.TestCase):
formatted_rows = [json.loads(line) for line in output_path.read_text(encoding="utf-8").splitlines()]
self.assertEqual([row["meta"]["request_id"] for row in formatted_rows], ["req-kept"])
def test_format_and_sort_trace_normalizes_invalid_surrogates_before_chunking(self):
    """Lone UTF-16 surrogates in ids/messages are replaced with U+FFFD end to end."""
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        raw_dir = root / "raw"
        raw_dir.mkdir()
        formatted_path = root / "formatted.jsonl"

        # Both the request id and the message body carry an unpaired surrogate.
        raw_row = make_raw_row(
            "req-\ud83c",
            utc_ms("2026-04-17 15:00:03.000"),
            messages=[{"role": "user", "content": "bad \ud83c content"}],
        )
        (raw_dir / "0417-1500-1530.jsonl").write_text(
            json.dumps(raw_row) + "\n", encoding="utf-8"
        )

        stats = format_and_sort_trace(
            input_dir=raw_dir, output_path=formatted_path, chunk_bytes=256
        )
        self.assertEqual(stats["row_count"], 1)

        rows = [
            json.loads(line)
            for line in formatted_path.read_text(encoding="utf-8").splitlines()
        ]
        first = rows[0]
        self.assertEqual(first["meta"]["request_id"], "req-\uFFFD")
        self.assertEqual(
            first["message_events"][0]["text_len"], len("bad \uFFFD content")
        )
        self.assertEqual(first["raw_messages"][0]["content"], "bad \uFFFD content")
        self.assertIn("\uFFFD", first["canonical_prompt"])
def test_format_and_sort_trace_normalizes_nonstandard_glm_tool_call_shapes(self):
    """Dict-shaped (non-list) tool_calls/tools still produce GLM tool markup."""
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        raw_dir = root / "raw"
        raw_dir.mkdir()
        formatted_path = root / "formatted.jsonl"

        raw_row = make_raw_row(
            "req-tool-call-shape",
            utc_ms("2026-04-17 15:00:04.000"),
            messages=[
                {"role": "user", "content": "hello"},
                {
                    "role": "assistant",
                    "content": "calling tool",
                    # Single dict instead of the standard list-of-dicts shape.
                    "tool_calls": {
                        "id": "call-1",
                        "type": "function",
                        "arguments": "{\"path\":\"/tmp/a.txt\"}",
                    },
                },
            ],
        )
        params = json.loads(raw_row["request_params"])
        # Tools given as one bare dict rather than a list of tool specs.
        params["payload"]["parameters"]["tools"] = {
            "type": "function",
            "name": "read_file",
            "parameters": {"type": "object", "properties": {"path": {"type": "string"}}},
        }
        raw_row["request_params"] = json.dumps(params)
        (raw_dir / "0417-1500-1530.jsonl").write_text(
            json.dumps(raw_row) + "\n", encoding="utf-8"
        )

        stats = format_and_sort_trace(
            input_dir=raw_dir, output_path=formatted_path, chunk_bytes=256
        )
        self.assertEqual(stats["row_count"], 1)

        rows = [
            json.loads(line)
            for line in formatted_path.read_text(encoding="utf-8").splitlines()
        ]
        prompt = rows[0]["canonical_prompt"]
        self.assertIn("<tools>", prompt)
        self.assertIn("\"name\": \"read_file\"", prompt)
        self.assertIn("<tool_call>", prompt)
        self.assertIn("<arg_key>path</arg_key>", prompt)
def test_trace_formatter_cli_formats_one_raw_jsonl_file(self):
with tempfile.TemporaryDirectory() as temp_dir:
root = Path(temp_dir)
@@ -564,6 +633,66 @@ class AliTracePipelineTest(unittest.TestCase):
self.assertTrue(log_path.exists())
self.assertIn("Scan raw trace", log_path.read_text(encoding="utf-8"))
def test_format_and_sort_trace_defaults_temp_dir_to_output_parent(self):
    """Without an explicit temp dir, scratch space lands beside the output file."""
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        raw_dir = root / "raw"
        raw_dir.mkdir()
        formatted_dir = root / "formatted"
        formatted_path = formatted_dir / "formatted.jsonl"
        row = make_raw_row("req-1", utc_ms("2026-04-17 15:00:01.000"))
        (raw_dir / "0417-1500-1530.jsonl").write_text(
            json.dumps(row) + "\n", encoding="utf-8"
        )

        seen = {}
        original_factory = tempfile.TemporaryDirectory

        def spying_factory(*args, **kwargs):
            # Record where the formatter asked for its scratch directory.
            seen["dir"] = kwargs.get("dir")
            return original_factory(*args, **kwargs)

        with mock.patch(
            "trace_formatter.formatting.tempfile.TemporaryDirectory",
            side_effect=spying_factory,
        ):
            format_and_sort_trace(
                input_dir=raw_dir, output_path=formatted_path, chunk_bytes=256
            )

        self.assertEqual(Path(seen["dir"]), formatted_dir)
def test_export_release_ready_trace_defaults_temp_dir_to_output_parent(self):
    """Release export also defaults its scratch dir to the output's parent."""
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        raw_input_path = root / "trace-raw.jsonl"
        release_dir = root / "release"
        release_path = release_dir / "trace.jsonl"

        # Minimal already-formatted row accepted by the release exporter.
        raw_row = {
            "schema_version": "2026.04.21",
            "sort_time_ms": utc_ms("2026-04-17 15:00:01.000"),
            "meta": {
                "model_family": "glm5",
                "request_ready_time_ms": utc_ms("2026-04-17 15:00:01.000"),
                "chat_id": 0,
                "parent_chat_id": -1,
                "turn": 1,
            },
            "canonical_prompt": "hello",
            "usage": {"input_tokens": 1, "output_tokens": 1},
        }
        raw_input_path.write_text(json.dumps(raw_row) + "\n", encoding="utf-8")

        seen = {}
        original_factory = tempfile.TemporaryDirectory

        def spying_factory(*args, **kwargs):
            # Record where the exporter asked for its scratch directory.
            seen["dir"] = kwargs.get("dir")
            return original_factory(*args, **kwargs)

        with mock.patch(
            "trace_formatter.formatting.tempfile.TemporaryDirectory",
            side_effect=spying_factory,
        ):
            export_release_ready_trace(
                raw_input_path=raw_input_path,
                release_output_path=release_path,
                jobs=1,
                block_size=8,
            )

        self.assertEqual(Path(seen["dir"]), release_dir)
def test_format_and_sort_trace_infers_window_in_ready_time_scale_when_wall_clock_has_timezone_offset(self):
with tempfile.TemporaryDirectory() as temp_dir:
root = Path(temp_dir)