core: Speed up json parse for large strings (#24036)

for a large string:
- old 4.657918874989264
- new 0.023724667000351474
This commit is contained in:
Nuno Campos 2024-07-09 20:26:50 +01:00 committed by GitHub
parent 160fc7f246
commit 859e434932
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -58,7 +58,7 @@ def parse_partial_json(s: str, *, strict: bool = False) -> Any:
pass
# Initialize variables.
new_s = ""
new_chars = []
stack = []
is_inside_string = False
escaped = False
@ -90,29 +90,27 @@ def parse_partial_json(s: str, *, strict: bool = False) -> Any:
return None
# Append the processed character to the new string.
new_s += char
new_chars.append(char)
# If we're still inside a string at the end of processing,
# we need to close the string.
if is_inside_string:
new_s += '"'
new_chars.append('"')
# Reverse the stack to get the closing characters.
stack.reverse()
# Try parsing ever-shorter prefixes until we succeed or run out of characters.
while new_s:
final_s = new_s
while new_chars:
# Close any remaining open structures in the reverse
# order that they were opened.
for closing_char in reversed(stack):
final_s += closing_char
# Attempt to parse the modified string as JSON.
try:
return json.loads(final_s, strict=strict)
return json.loads("".join(new_chars + stack), strict=strict)
except json.JSONDecodeError:
# If we still can't parse the string as JSON,
# try removing the last character
new_s = new_s[:-1]
new_chars.pop()
# If we got here, we ran out of characters to remove
# and still couldn't parse the string as JSON, so return the parse error
@ -120,6 +118,9 @@ def parse_partial_json(s: str, *, strict: bool = False) -> Any:
return json.loads(s, strict=strict)
_json_markdown_re = re.compile(r"```(json)?(.*)", re.DOTALL)
def parse_json_markdown(
json_string: str, *, parser: Callable[[str], Any] = parse_partial_json
) -> dict:
@ -136,7 +137,7 @@ def parse_json_markdown(
return _parse_json(json_string, parser=parser)
except json.JSONDecodeError:
# Try to find JSON string within triple backticks
match = re.search(r"```(json)?(.*)", json_string, re.DOTALL)
match = _json_markdown_re.search(json_string)
# If no match found, assume the entire string is a JSON string
if match is None:
@ -147,11 +148,14 @@ def parse_json_markdown(
return _parse_json(json_str, parser=parser)
_json_strip_chars = " \n\r\t`"
def _parse_json(
json_str: str, *, parser: Callable[[str], Any] = parse_partial_json
) -> dict:
# Strip whitespace and newlines from the start and end
json_str = json_str.strip().strip("`")
# Strip whitespace, newlines, and backticks from the start and end
json_str = json_str.strip(_json_strip_chars)
# handle newlines and other special characters inside the returned value
json_str = _custom_parser(json_str)