From 3eaf561561b3f562709670d69b70eb3b3605c222 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Fri, 7 Feb 2025 15:18:21 -0800 Subject: [PATCH] core: Handle unterminated escape character when parsing partial JSON (#29065) **Description** Currently, when parsing partial JSON, if a string ends with the escape character, the whole key/value pair is removed. For example: ``` >>> from langchain_core.utils.json import parse_partial_json >>> my_str = '{"foo": "bar", "baz": "qux\\' >>> >>> parse_partial_json(my_str) {'foo': 'bar'} ``` My expectation (and the behavior with this fix) is for `parse_partial_json()` to return: ``` >>> from langchain_core.utils.json import parse_partial_json >>> >>> my_str = '{"foo": "bar", "baz": "qux\\' >>> parse_partial_json(my_str) {'foo': 'bar', 'baz': 'qux'} ``` Notes: 1. It could be argued that the current behavior is still desired. 2. I have experienced this issue when streaming output from an LLM and a chunk happens to end with `\\` 3. I haven't included tests. Will do if the change is accepted. 4. 
This is especially troublesome when this function is used by https://github.com/langchain-ai/langchain/blob/187131c55c1b788da38124c6e7917151249746d6/libs/core/langchain_core/output_parsers/transform.py#L111 since, for example, if the received sequence of chunks is: `{"foo": "b`, `ar\\`, then the result of calling `self.parse_result()` the first time is: ``` {"foo": "b"} ``` and the second time: ``` {} ``` Co-authored-by: Erick Friis --- libs/core/langchain_core/utils/json.py | 2 ++ libs/core/tests/unit_tests/output_parsers/test_json.py | 1 + 2 files changed, 3 insertions(+) diff --git a/libs/core/langchain_core/utils/json.py b/libs/core/langchain_core/utils/json.py index 8aedfaf339b..472ef94b7a3 100644 --- a/libs/core/langchain_core/utils/json.py +++ b/libs/core/langchain_core/utils/json.py @@ -94,6 +94,8 @@ def parse_partial_json(s: str, *, strict: bool = False) -> Any: # If we're still inside a string at the end of processing, # we need to close the string. if is_inside_string: + if escaped: # Remove unterminated escape character + new_chars.pop() new_chars.append('"') # Reverse the stack to get the closing characters. diff --git a/libs/core/tests/unit_tests/output_parsers/test_json.py b/libs/core/tests/unit_tests/output_parsers/test_json.py index 108b9f2a9cf..4198a467143 100644 --- a/libs/core/tests/unit_tests/output_parsers/test_json.py +++ b/libs/core/tests/unit_tests/output_parsers/test_json.py @@ -242,6 +242,7 @@ TEST_CASES_PARTIAL = [ ('{"foo": "bar", "bar":', '{"foo": "bar"}'), ('{"foo": "bar", "bar"', '{"foo": "bar"}'), ('{"foo": "bar", ', '{"foo": "bar"}'), + ('{"foo":"bar\\', '{"foo": "bar"}'), ]