fix: tool parser, illegal characters, and /tmp space usage

This commit is contained in:
2026-04-22 01:19:09 +00:00
parent bce3fe1395
commit 152f01613b
4 changed files with 304 additions and 33 deletions

View File

@@ -15,7 +15,7 @@ from dataclasses import asdict
from pathlib import Path
from typing import Iterator, TextIO
from trace_analyzer.helpers import parse_jsonish, safe_int
from trace_analyzer.helpers import normalize_unicode_text, parse_jsonish, safe_int
from tokenizers import Tokenizer
from tqdm.auto import tqdm
from trace_model_meta import infer_model_family_from_request_model, resolve_tokenizer_path
@@ -225,18 +225,18 @@ def _build_unified_row_from_components(
"sort_time_ms": sort_time_ms,
"meta": {
"model_family": model_family,
"request_id": str(raw.get("request_id", "")),
"request_id": normalize_unicode_text(str(raw.get("request_id", ""))),
"session_id": "",
"raw_session_id": str(raw.get("session_id", "")),
"raw_session_id": normalize_unicode_text(str(raw.get("session_id", ""))),
"user_id": user_id,
"parent_request_id": "",
"parent_chat_id": -1,
"chat_id": -1,
"turn": 0,
"request_model": str(raw.get("request_model", "")),
"time": str(raw.get("time", "")),
"status_code": str(raw.get("status_code", "")),
"status_name": str(raw.get("status_name", "")),
"request_model": normalize_unicode_text(str(raw.get("request_model", ""))),
"time": normalize_unicode_text(str(raw.get("time", ""))),
"status_code": normalize_unicode_text(str(raw.get("status_code", ""))),
"status_name": normalize_unicode_text(str(raw.get("status_name", ""))),
"request_ready_time_ms": sort_time_ms,
"request_end_time_ms": request_end_time_ms,
"total_cost_time_ms": total_cost_time_ms,
@@ -417,6 +417,15 @@ def _open_progress_stream(log_file: str | Path | None):
yield _TeeStream(sys.stderr, handle)
def _resolve_temp_root_dir(*, tmp_dir: str | Path | None, output_path: str | Path) -> Path:
if tmp_dir is not None:
root = Path(tmp_dir)
else:
root = Path(output_path).parent
root.mkdir(parents=True, exist_ok=True)
return root
def _block_digest(block: list[int]) -> bytes:
digest = hashlib.blake2b(digest_size=16)
digest.update(len(block).to_bytes(4, "little", signed=False))
@@ -564,6 +573,7 @@ def export_release_ready_trace(
input_path = Path(raw_input_path)
release_destination = Path(release_output_path)
release_destination.parent.mkdir(parents=True, exist_ok=True)
temp_root_dir = _resolve_temp_root_dir(tmp_dir=tmp_dir, output_path=release_destination)
requested_jobs = jobs if jobs is not None else min(os.cpu_count() or 1, 16)
shard_jobs = max(1, requested_jobs)
@@ -573,7 +583,7 @@ def export_release_ready_trace(
block_ids_by_digest: dict[str, int] = {}
row_count = 0
with tempfile.TemporaryDirectory(dir=tmp_dir) as temp_root:
with tempfile.TemporaryDirectory(dir=temp_root_dir) as temp_root:
shard_root = Path(temp_root) / "release-shards"
shard_root.mkdir(parents=True, exist_ok=True)
shard_specs = [
@@ -682,12 +692,13 @@ def format_and_sort_trace(
source_files = discover_source_files(input_dir)
destination = Path(output_path)
destination.parent.mkdir(parents=True, exist_ok=True)
temp_root_dir = _resolve_temp_root_dir(tmp_dir=tmp_dir, output_path=destination)
time_offset_ms = infer_time_offset_ms(source_files[0]) if source_files else 0
time_window = infer_time_window(source_files, start_time=start_time, end_time=end_time) if truncate_to_window else None
total_input_bytes = sum(path.stat().st_size for path in source_files if path.suffix != ".zst")
has_zst = any(path.suffix == ".zst" for path in source_files)
with _open_progress_stream(log_file) as progress_stream, tempfile.TemporaryDirectory(dir=tmp_dir) as temp_root:
with _open_progress_stream(log_file) as progress_stream, tempfile.TemporaryDirectory(dir=temp_root_dir) as temp_root:
temp_raw_destination = Path(temp_root) / "formatted-raw.tmp.jsonl"
chunk_root = Path(temp_root)
chunk_paths: list[Path] = []

View File

@@ -128,20 +128,33 @@ def _normalize_message_content_for_template(content, role=""):
# NOTE(review): this definition is shown as a diff hunk rendered without +/-
# markers or indentation, so lines from the pre-change and post-change versions
# are interleaved below. The "old"/"new" labels in these comments are inferred
# from the hunk header (20 -> 33 lines) and the duplicated statements; confirm
# against the repository before relying on them.
#
# Purpose: coerce a single tool-call entry into a dict that has a "function"
# dict whose "arguments" value is itself a dict.
# parse_jsonish is defined elsewhere; presumably it decodes JSON-like strings
# and returns other values unchanged -- TODO confirm.
def _normalize_tool_call_for_template(tool_call):
if not isinstance(tool_call, dict):
# Presumably the old (removed) behavior: a non-dict entry was returned as-is.
return tool_call
# Presumably the new (added) behavior: a non-dict entry is wrapped in a
# function-shaped dict with an empty name, and the parsed raw value is kept
# under the "__raw_tool_call__" key.
return {
"function": {
"name": "",
"arguments": {"__raw_tool_call__": parse_jsonish(tool_call)},
}
}
# Shallow copy, so keys assigned below do not modify the caller's dict.
normalized = dict(tool_call)
function = normalized.get("function")
# --- Presumably old (removed) logic starts here. ---
# A function dict was only built when a nested "function" dict existed or a
# top-level "name"/"arguments" key was present; otherwise the copy was returned
# without a "function" entry being added.
normalized_function = dict(function) if isinstance(function, dict) else None
if normalized_function is None and ("name" in normalized or "arguments" in normalized):
normalized_function = {}
if normalized_function is not None:
# Top-level "name"/"arguments" only fill in keys missing from the nested dict.
if "name" not in normalized_function and normalized.get("name"):
normalized_function["name"] = normalized["name"]
if "arguments" not in normalized_function and "arguments" in normalized:
normalized_function["arguments"] = normalized["arguments"]
arguments = parse_jsonish(normalized_function.get("arguments", {}))
# Parsed arguments that are not a dict are kept under "__raw_arguments__".
normalized_function["arguments"] = arguments if isinstance(arguments, dict) else {"__raw_arguments__": arguments}
normalized["function"] = normalized_function
# --- Presumably new (added) logic starts here. ---
# A function dict is now always built, starting from a copy of the nested
# "function" dict when there is one and from an empty dict otherwise.
normalized_function = dict(function) if isinstance(function, dict) else {}
# Name: first truthy value among the nested "name" and the top-level
# "name", "tool_name", "function_name" keys; falls back to "".
tool_name = (
normalized_function.get("name")
or normalized.get("name")
or normalized.get("tool_name")
or normalized.get("function_name")
or ""
)
# Arguments: the nested "arguments" value wins whenever that key is present
# (even if its value is empty); otherwise the top-level "arguments", then
# "parameters", then "args" key is used, defaulting to {}.
raw_arguments = (
normalized_function.get("arguments")
if "arguments" in normalized_function
else normalized.get("arguments", normalized.get("parameters", normalized.get("args", {})))
)
arguments = parse_jsonish(raw_arguments)
# The name is always stored as a str.
normalized_function["name"] = str(tool_name or "")
# Parsed arguments that are not a dict are kept under "__raw_arguments__".
normalized_function["arguments"] = arguments if isinstance(arguments, dict) else {"__raw_arguments__": arguments}
normalized["function"] = normalized_function
return normalized
@@ -150,11 +163,23 @@ def _normalize_tool_spec_for_template(tool):
return tool
normalized = dict(tool)
function = normalized.get("function")
if isinstance(function, dict):
normalized_function = dict(function)
parameters = parse_jsonish(normalized_function.get("parameters", {}))
normalized_function = dict(function) if isinstance(function, dict) else {}
tool_name = (
normalized_function.get("name")
or normalized.get("name")
or normalized.get("tool_name")
or normalized.get("function_name")
or ""
)
if tool_name or function is not None or "name" in normalized or "parameters" in normalized:
parameters = parse_jsonish(
normalized_function.get("parameters", normalized.get("parameters", normalized.get("args", {})))
)
normalized_function["name"] = str(tool_name or "")
if isinstance(parameters, dict):
normalized_function["parameters"] = parameters
else:
normalized_function["parameters"] = {"__raw_parameters__": parameters}
normalized["function"] = normalized_function
return normalized
@@ -165,7 +190,12 @@ def _normalize_qwen_message_for_template(message):
normalized_message = dict(message)
normalized_message["content"] = _stringify_message_content_for_template(message.get("content"))
normalized_tool_calls = []
for tool_call in message.get("tool_calls", []):
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, dict):
tool_calls = [tool_calls]
elif not isinstance(tool_calls, list):
tool_calls = [tool_calls]
for tool_call in tool_calls:
normalized_tool_call = _normalize_tool_call_for_template(tool_call)
if isinstance(normalized_tool_call, dict):
function = normalized_tool_call.get("function")
@@ -244,11 +274,21 @@ def build_glm5_canonical_prompt(payload):
message.get("content"),
role=str(message.get("role", "")),
)
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, dict):
tool_calls = [tool_calls]
elif not isinstance(tool_calls, list):
tool_calls = [tool_calls]
normalized_message["tool_calls"] = [
_normalize_tool_call_for_template(tool_call) for tool_call in message.get("tool_calls", [])
_normalize_tool_call_for_template(tool_call) for tool_call in tool_calls
]
messages.append(normalized_message)
tools = [_normalize_tool_spec_for_template(tool) for tool in parameters.get("tools", []) if isinstance(tool, dict)]
tools_payload = parameters.get("tools", [])
if isinstance(tools_payload, dict):
tools_payload = [tools_payload]
elif not isinstance(tools_payload, list):
tools_payload = [tools_payload]
tools = [_normalize_tool_spec_for_template(tool) for tool in tools_payload if isinstance(tool, dict)]
return _load_glm5_chat_template().render(
messages=messages,
tools=tools,
@@ -266,7 +306,12 @@ def build_qwen3_coder_canonical_prompt(payload):
]
if not messages:
messages = [{"role": "system", "content": ""}]
tools = [_normalize_tool_spec_for_template(tool) for tool in parameters.get("tools", []) if isinstance(tool, dict)]
tools_payload = parameters.get("tools", [])
if isinstance(tools_payload, dict):
tools_payload = [tools_payload]
elif not isinstance(tools_payload, list):
tools_payload = [tools_payload]
tools = [_normalize_tool_spec_for_template(tool) for tool in tools_payload if isinstance(tool, dict)]
return _load_qwen3_coder_chat_template().render(
messages=messages,
tools=tools,