mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-06 07:04:01 +00:00
Make json output parser handle newlines inside markdown code blocks (#8682)
Update to #8528. Newlines and other special characters within markdown code blocks returned as `action_input` should be handled correctly (in particular, unescaped `"` => `\"` and `\n` => `\\n`) so they don't break JSON parsing. @baskaryan
This commit is contained in:
parent
ce3666c28b
commit
d56eff042a
@ -8,6 +8,36 @@ from typing import Any, List
|
|||||||
from langchain.schema import BaseOutputParser, OutputParserException
|
from langchain.schema import BaseOutputParser, OutputParserException
|
||||||
|
|
||||||
|
|
||||||
|
def _replace_new_line(match: re.Match[str]) -> str:
    """Escape raw control characters and bare quotes in a matched string value.

    Designed as a replacement callback for ``re.sub``. The match must expose
    three groups: group 1 is the text up to and including the opening quote,
    group 2 is the raw value, and group 3 is the closing quote. Only group 2
    is rewritten.

    Args:
        match: Match object providing the three groups described above.

    Returns:
        Group 1, the escaped value, and group 3 concatenated.
    """
    value = match.group(2)

    # Literal newlines, carriage returns and tabs are not allowed inside a
    # JSON string, so emit their two-character escape sequences instead.
    value = value.replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")

    # Scan left to right: a backslash followed by any character is an existing
    # escape sequence and is kept verbatim, so a quote that is already escaped
    # is not escaped a second time (which would terminate the string early).
    # Only a quote reached outside such a pair gets a backslash added.
    value = re.sub(
        r'\\.|"',
        lambda token: '\\"' if token.group(0) == '"' else token.group(0),
        value,
    )

    return match.group(1) + value + match.group(3)
|
||||||
|
|
||||||
|
|
||||||
|
def _custom_parser(multiline_string: str) -> str:
    """Escape special characters inside the ``action_input`` value.

    An LLM may return an ``action_input`` string that contains raw newlines,
    tabs or quotes, which are not valid inside a JSON string. The value is
    located with a regular expression and handed to ``_replace_new_line``,
    which substitutes the escaped forms.

    The value group is greedy and matches across lines, so it extends to the
    last double quote in the input.

    Args:
        multiline_string: Candidate JSON text. ``bytes`` and ``bytearray``
            inputs are decoded before processing.

    Returns:
        The text with the ``action_input`` value rewritten; input without an
        ``action_input`` key is returned as-is (after decoding, if needed).
    """
    text = multiline_string
    if isinstance(text, (bytes, bytearray)):
        text = text.decode()

    action_input_pattern = re.compile(
        r'("action_input"\:\s*")(.*)(")',
        re.DOTALL,
    )
    return action_input_pattern.sub(_replace_new_line, text)
|
||||||
|
|
||||||
|
|
||||||
def parse_json_markdown(json_string: str) -> dict:
|
def parse_json_markdown(json_string: str) -> dict:
|
||||||
"""
|
"""
|
||||||
Parse a JSON string from a Markdown string.
|
Parse a JSON string from a Markdown string.
|
||||||
@ -31,6 +61,9 @@ def parse_json_markdown(json_string: str) -> dict:
|
|||||||
# Strip whitespace and newlines from the start and end
|
# Strip whitespace and newlines from the start and end
|
||||||
json_str = json_str.strip()
|
json_str = json_str.strip()
|
||||||
|
|
||||||
|
# handle newlines and other special characters inside the returned value
|
||||||
|
json_str = _custom_parser(json_str)
|
||||||
|
|
||||||
# Parse the JSON string into a Python dictionary
|
# Parse the JSON string into a Python dictionary
|
||||||
parsed = json.loads(json_str)
|
parsed = json.loads(json_str)
|
||||||
|
|
||||||
|
@ -60,6 +60,13 @@ JSON_WITH_MARKDOWN_CODE_BLOCK = """```json
|
|||||||
}
|
}
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
|
JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES = """```json
|
||||||
|
{
|
||||||
|
"action": "Final Answer",
|
||||||
|
"action_input": "```bar\n<div id="1" class=\"value\">\n\ttext\n</div>```"
|
||||||
|
}
|
||||||
|
```"""
|
||||||
|
|
||||||
NO_TICKS = """{
|
NO_TICKS = """{
|
||||||
"foo": "bar"
|
"foo": "bar"
|
||||||
}"""
|
}"""
|
||||||
@ -114,6 +121,13 @@ def test_parse_json(json_string: str) -> None:
|
|||||||
assert parsed == {"foo": "bar"}
|
assert parsed == {"foo": "bar"}
|
||||||
|
|
||||||
|
|
||||||
def test_parse_json_with_code_block() -> None:
|
def test_parse_json_with_code_blocks() -> None:
|
||||||
parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK)
|
parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK)
|
||||||
assert parsed == {"foo": "```bar```"}
|
assert parsed == {"foo": "```bar```"}
|
||||||
|
|
||||||
|
parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES)
|
||||||
|
|
||||||
|
assert parsed == {
|
||||||
|
"action": "Final Answer",
|
||||||
|
"action_input": '```bar\n<div id="1" class="value">\n\ttext\n</div>```',
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user