fix: tool parser, illegal surrogate characters, and /tmp disk space
This commit is contained in:
@@ -2,17 +2,103 @@ import json
|
||||
from statistics import mean, median
|
||||
|
||||
|
||||
def _text_has_surrogates(value):
|
||||
for char in value:
|
||||
codepoint = ord(char)
|
||||
if 0xD800 <= codepoint <= 0xDFFF:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def normalize_unicode_text(value):
    """Return *value* with UTF-16 surrogate code points repaired.

    A well-formed high/low surrogate pair is fused into the supplementary
    character it encodes; any unpaired surrogate is replaced with U+FFFD.
    Non-string and empty inputs, and strings without surrogates, are
    returned unchanged (same object).
    """
    if not isinstance(value, str) or not value:
        return value
    # Fast path: leave surrogate-free strings untouched.
    if all(not 0xD800 <= ord(ch) <= 0xDFFF for ch in value):
        return value

    pieces = []
    position, size = 0, len(value)
    while position < size:
        code = ord(value[position])
        if 0xD800 <= code <= 0xDBFF:
            # High surrogate: fuse with an immediately following low surrogate.
            if position + 1 < size:
                trailing = ord(value[position + 1])
                if 0xDC00 <= trailing <= 0xDFFF:
                    fused = 0x10000 + ((code - 0xD800) << 10) + (trailing - 0xDC00)
                    pieces.append(chr(fused))
                    position += 2
                    continue
            pieces.append("\uFFFD")
        elif 0xDC00 <= code <= 0xDFFF:
            # Unpaired low surrogate.
            pieces.append("\uFFFD")
        else:
            pieces.append(value[position])
        position += 1
    return "".join(pieces)
|
||||
|
||||
|
||||
def normalize_unicode_value(value):
    """Recursively repair surrogate code points inside *value*.

    Strings go through normalize_unicode_text; lists, tuples and dicts are
    walked recursively (dict string keys are normalized as well).  Containers
    are copied lazily: when nothing inside changed, the original object is
    returned with its identity intact.
    """
    if isinstance(value, str):
        return normalize_unicode_text(value)

    if isinstance(value, (list, tuple)):
        copied = None  # lazily created mutable copy; None while unchanged
        for position, element in enumerate(value):
            fixed = normalize_unicode_value(element)
            if copied is None:
                if fixed is element:
                    continue
                # First change: snapshot the untouched prefix, then diverge.
                copied = list(value[:position])
            copied.append(fixed)
        if copied is None:
            return value
        return copied if isinstance(value, list) else tuple(copied)

    if isinstance(value, dict):
        copied = None
        for key, element in value.items():
            fixed_key = normalize_unicode_text(key) if isinstance(key, str) else key
            fixed = normalize_unicode_value(element)
            if copied is None:
                if fixed_key is key and fixed is element:
                    continue
                # First change: copy every entry before this key verbatim.
                copied = {}
                for prior_key, prior_element in value.items():
                    if prior_key == key:
                        break
                    copied[prior_key] = prior_element
            copied[fixed_key] = fixed
        return value if copied is None else copied

    return value
|
||||
|
||||
|
||||
def parse_jsonish(value):
    """Parse nested JSON strings until a non-string value is reached.

    A payload may itself be a JSON-encoded string, so strings are
    json.loads-ed repeatedly.  Decoding stops at the first non-string
    result, at a blank string, or at a string that is not valid JSON.
    Every returned value is passed through surrogate normalization so
    illegal lone surrogates never escape to callers.
    """
    if not isinstance(value, str):
        return normalize_unicode_value(value)

    current = value
    while isinstance(current, str):
        text = current.strip()
        # A blank string is not JSON; return it (surrogate-safe) as-is.
        if not text:
            return normalize_unicode_text(current)
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            # Not JSON: hand back the original string, surrogate-safe.
            return normalize_unicode_text(current)
        # Always normalize: json.loads can yield lone surrogates both from
        # \uDxxx escapes and from raw surrogates embedded in the text, and
        # normalize_unicode_value is a cheap identity no-op otherwise.
        # (The previous `"\\ud" in text` guard missed the raw-surrogate case.)
        current = normalize_unicode_value(parsed)
    return current
|
||||
|
||||
|
||||
@@ -75,4 +161,4 @@ def safe_div(numerator, denominator):
|
||||
|
||||
|
||||
def compact_json(data):
    """Serialize *data* to the most compact JSON string.

    No whitespace after separators and non-ASCII characters kept literal.
    The result is surrogate-normalized so lone UTF-16 surrogates inside
    *data* cannot leak into (or break encoding of) the output string.
    Note: the un-normalized duplicate return left by a bad merge has been
    removed; only the normalized path remains.
    """
    return normalize_unicode_text(
        json.dumps(data, ensure_ascii=False, separators=(",", ":"))
    )
|
||||
|
||||
Reference in New Issue
Block a user