mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-06 07:04:01 +00:00
Make json output parser handle newlines inside markdown code blocks (#8682)
Update to #8528. Newlines and other special characters within markdown code blocks returned as `action_input` should be handled correctly (in particular, unescaped `"` => `\"` and `\n` => `\\n`) so that they don't break JSON parsing. @baskaryan
This commit is contained in:
parent
ce3666c28b
commit
d56eff042a
@ -8,6 +8,36 @@ from typing import Any, List
|
||||
from langchain.schema import BaseOutputParser, OutputParserException
|
||||
|
||||
|
||||
def _replace_new_line(match: re.Match[str]) -> str:
|
||||
value = match.group(2)
|
||||
value = re.sub(r"\n", r"\\n", value)
|
||||
value = re.sub(r"\r", r"\\r", value)
|
||||
value = re.sub(r"\t", r"\\t", value)
|
||||
value = re.sub('"', r"\"", value)
|
||||
|
||||
return match.group(1) + value + match.group(3)
|
||||
|
||||
|
||||
def _custom_parser(multiline_string: str) -> str:
    """
    Escape special characters inside the `action_input` value of an LLM reply.

    The value following `"action_input":` may span several lines and contain
    unescaped newlines, tabs or quotes. Those characters are swapped for
    their escaped forms so the text can be parsed as JSON
    (newlines in JSON must be double-escaped: `\\n`).

    Bytes-like input is decoded to `str` before processing.
    """
    text = multiline_string
    if isinstance(text, (bytes, bytearray)):
        text = text.decode()

    # DOTALL lets `.` cross line breaks; the greedy `.*` extends the captured
    # value up to the last double quote in the text.
    action_input_pattern = re.compile(
        r'("action_input"\:\s*")(.*)(")',
        re.DOTALL,
    )
    return action_input_pattern.sub(_replace_new_line, text)
|
||||
|
||||
|
||||
def parse_json_markdown(json_string: str) -> dict:
|
||||
"""
|
||||
Parse a JSON string from a Markdown string.
|
||||
@ -31,6 +61,9 @@ def parse_json_markdown(json_string: str) -> dict:
|
||||
# Strip whitespace and newlines from the start and end
|
||||
json_str = json_str.strip()
|
||||
|
||||
# handle newlines and other special characters inside the returned value
|
||||
json_str = _custom_parser(json_str)
|
||||
|
||||
# Parse the JSON string into a Python dictionary
|
||||
parsed = json.loads(json_str)
|
||||
|
||||
|
@ -60,6 +60,13 @@ JSON_WITH_MARKDOWN_CODE_BLOCK = """```json
|
||||
}
|
||||
```"""
|
||||
|
||||
JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES = """```json
|
||||
{
|
||||
"action": "Final Answer",
|
||||
"action_input": "```bar\n<div id="1" class=\"value\">\n\ttext\n</div>```"
|
||||
}
|
||||
```"""
|
||||
|
||||
NO_TICKS = """{
|
||||
"foo": "bar"
|
||||
}"""
|
||||
@ -114,6 +121,13 @@ def test_parse_json(json_string: str) -> None:
|
||||
assert parsed == {"foo": "bar"}
|
||||
|
||||
|
||||
def test_parse_json_with_code_blocks() -> None:
    """Check parsing of markdown-fenced JSON whose values contain code fences.

    Covers a value that itself holds a fenced block, and an `action_input`
    value holding raw newlines, tabs and unescaped quotes.
    """
    parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK)
    assert parsed == {"foo": "```bar```"}

    parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES)

    # The raw newline, tab and quote characters must survive the round trip.
    assert parsed == {
        "action": "Final Answer",
        "action_input": '```bar\n<div id="1" class="value">\n\ttext\n</div>```',
    }
|
||||
|
Loading…
Reference in New Issue
Block a user