fix: tool parser, illegal surrogate characters, and /tmp disk space
This commit is contained in:
@@ -2,17 +2,103 @@ import json
|
||||
from statistics import mean, median
|
||||
|
||||
|
||||
def _text_has_surrogates(value):
|
||||
for char in value:
|
||||
codepoint = ord(char)
|
||||
if 0xD800 <= codepoint <= 0xDFFF:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def normalize_unicode_text(value):
    """Return *value* with UTF-16 surrogate code points repaired.

    A well-formed high/low surrogate pair is fused into the supplementary
    character it encodes; any unpaired surrogate is replaced with U+FFFD.
    Non-string and empty inputs, and strings without surrogates, are
    returned unchanged (same object).
    """
    if not isinstance(value, str) or not value:
        return value
    # Fast path: leave surrogate-free strings untouched.
    if all(not 0xD800 <= ord(ch) <= 0xDFFF for ch in value):
        return value

    pieces = []
    position, size = 0, len(value)
    while position < size:
        code = ord(value[position])
        if 0xD800 <= code <= 0xDBFF:
            # High surrogate: fuse with an immediately following low surrogate.
            if position + 1 < size:
                trailing = ord(value[position + 1])
                if 0xDC00 <= trailing <= 0xDFFF:
                    fused = 0x10000 + ((code - 0xD800) << 10) + (trailing - 0xDC00)
                    pieces.append(chr(fused))
                    position += 2
                    continue
            pieces.append("\uFFFD")
        elif 0xDC00 <= code <= 0xDFFF:
            # Unpaired low surrogate.
            pieces.append("\uFFFD")
        else:
            pieces.append(value[position])
        position += 1
    return "".join(pieces)
|
||||
|
||||
|
||||
def normalize_unicode_value(value):
    """Recursively repair surrogate code points inside *value*.

    Strings go through normalize_unicode_text; lists, tuples and dicts are
    walked recursively (dict string keys are normalized as well).  Containers
    are copied lazily: when nothing inside changed, the original object is
    returned with its identity intact.
    """
    if isinstance(value, str):
        return normalize_unicode_text(value)

    if isinstance(value, (list, tuple)):
        copied = None  # lazily created mutable copy; None while unchanged
        for position, element in enumerate(value):
            fixed = normalize_unicode_value(element)
            if copied is None:
                if fixed is element:
                    continue
                # First change: snapshot the untouched prefix, then diverge.
                copied = list(value[:position])
            copied.append(fixed)
        if copied is None:
            return value
        return copied if isinstance(value, list) else tuple(copied)

    if isinstance(value, dict):
        copied = None
        for key, element in value.items():
            fixed_key = normalize_unicode_text(key) if isinstance(key, str) else key
            fixed = normalize_unicode_value(element)
            if copied is None:
                if fixed_key is key and fixed is element:
                    continue
                # First change: copy every entry before this key verbatim.
                copied = {}
                for prior_key, prior_element in value.items():
                    if prior_key == key:
                        break
                    copied[prior_key] = prior_element
            copied[fixed_key] = fixed
        return value if copied is None else copied

    return value
|
||||
|
||||
|
||||
def parse_jsonish(value):
    """Parse nested JSON strings until a non-string value is reached.

    A payload may itself be a JSON-encoded string, so strings are
    json.loads-ed repeatedly.  Decoding stops at the first non-string
    result, at a blank string, or at a string that is not valid JSON.
    Every returned value is passed through surrogate normalization so
    illegal lone surrogates never escape to callers.
    """
    if not isinstance(value, str):
        return normalize_unicode_value(value)

    current = value
    while isinstance(current, str):
        text = current.strip()
        # A blank string is not JSON; return it (surrogate-safe) as-is.
        if not text:
            return normalize_unicode_text(current)
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            # Not JSON: hand back the original string, surrogate-safe.
            return normalize_unicode_text(current)
        # Always normalize: json.loads can yield lone surrogates both from
        # \uDxxx escapes and from raw surrogates embedded in the text, and
        # normalize_unicode_value is a cheap identity no-op otherwise.
        # (The previous `"\\ud" in text` guard missed the raw-surrogate case.)
        current = normalize_unicode_value(parsed)
    return current
|
||||
|
||||
|
||||
@@ -75,4 +161,4 @@ def safe_div(numerator, denominator):
|
||||
|
||||
|
||||
def compact_json(data):
    """Serialize *data* to the most compact JSON string.

    No whitespace after separators and non-ASCII characters kept literal.
    The result is surrogate-normalized so lone UTF-16 surrogates inside
    *data* cannot leak into (or break encoding of) the output string.
    Note: the un-normalized duplicate return left by a bad merge has been
    removed; only the normalized path remains.
    """
    return normalize_unicode_text(
        json.dumps(data, ensure_ascii=False, separators=(",", ":"))
    )
|
||||
|
||||
Reference in New Issue
Block a user