Move json and xml parsers to core (#15026)

<!-- Thank you for contributing to LangChain!

Please title your PR "<package>: <description>", where <package> is
whichever of langchain, community, core, experimental, etc. is being
modified.

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes if applicable,
  - **Dependencies:** any dependencies required for this change,
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->
This commit is contained in:
Nuno Campos
2023-12-21 12:36:56 -08:00
committed by GitHub
parent d5533b7081
commit 71076cceaf
10 changed files with 840 additions and 420 deletions

View File

@@ -11,6 +11,8 @@ EXPECTED_ALL = [
"StrOutputParser",
"BaseTransformOutputParser",
"BaseCumulativeTransformOutputParser",
"SimpleJsonOutputParser",
"XMLOutputParser",
]

View File

@@ -0,0 +1,488 @@
import json
from typing import Any, AsyncIterator, Iterator, Tuple
import pytest
from langchain_core.output_parsers.json import (
SimpleJsonOutputParser,
parse_json_markdown,
parse_partial_json,
)
GOOD_JSON = """```json
{
"foo": "bar"
}
```"""
JSON_WITH_NEW_LINES = """
```json
{
"foo": "bar"
}
```
"""
JSON_WITH_NEW_LINES_INSIDE = """```json
{
"foo": "bar"
}
```"""
JSON_WITH_NEW_LINES_EVERYWHERE = """
```json
{
"foo": "bar"
}
```
"""
TICKS_WITH_NEW_LINES_EVERYWHERE = """
```
{
"foo": "bar"
}
```
"""
JSON_WITH_MARKDOWN_CODE_BLOCK = """```json
{
"foo": "```bar```"
}
```"""
JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES = """```json
{
"action": "Final Answer",
"action_input": "```bar\n<div id="1" class=\"value\">\n\ttext\n</div>```"
}
```"""
JSON_WITH_UNESCAPED_QUOTES_IN_NESTED_JSON = """```json
{
"action": "Final Answer",
"action_input": "{"foo": "bar", "bar": "foo"}"
}
```"""
JSON_WITH_ESCAPED_QUOTES_IN_NESTED_JSON = """```json
{
"action": "Final Answer",
"action_input": "{\"foo\": \"bar\", \"bar\": \"foo\"}"
}
```"""
JSON_WITH_PYTHON_DICT = """```json
{
"action": "Final Answer",
"action_input": {"foo": "bar", "bar": "foo"}
}
```"""
JSON_WITH_ESCAPED_DOUBLE_QUOTES_IN_NESTED_JSON = """```json
{
"action": "Final Answer",
"action_input": "{\\"foo\\": \\"bar\\", \\"bar\\": \\"foo\\"}"
}
```"""
NO_TICKS = """{
"foo": "bar"
}"""
NO_TICKS_WHITE_SPACE = """
{
"foo": "bar"
}
"""
TEXT_BEFORE = """Thought: I need to use the search tool
Action:
```
{
"foo": "bar"
}
```"""
TEXT_AFTER = """```
{
"foo": "bar"
}
```
This should do the trick"""
TEXT_BEFORE_AND_AFTER = """Action: Testing
```
{
"foo": "bar"
}
```
This should do the trick"""
TEST_CASES = [
GOOD_JSON,
JSON_WITH_NEW_LINES,
JSON_WITH_NEW_LINES_INSIDE,
JSON_WITH_NEW_LINES_EVERYWHERE,
TICKS_WITH_NEW_LINES_EVERYWHERE,
NO_TICKS,
NO_TICKS_WHITE_SPACE,
TEXT_BEFORE,
TEXT_AFTER,
]
@pytest.mark.parametrize("json_string", TEST_CASES)
def test_parse_json(json_string: str) -> None:
parsed = parse_json_markdown(json_string)
assert parsed == {"foo": "bar"}
def test_parse_json_with_code_blocks() -> None:
parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK)
assert parsed == {"foo": "```bar```"}
parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES)
assert parsed == {
"action": "Final Answer",
"action_input": '```bar\n<div id="1" class="value">\n\ttext\n</div>```',
}
TEST_CASES_ESCAPED_QUOTES = [
JSON_WITH_UNESCAPED_QUOTES_IN_NESTED_JSON,
JSON_WITH_ESCAPED_QUOTES_IN_NESTED_JSON,
JSON_WITH_ESCAPED_DOUBLE_QUOTES_IN_NESTED_JSON,
]
@pytest.mark.parametrize("json_string", TEST_CASES_ESCAPED_QUOTES)
def test_parse_nested_json_with_escaped_quotes(json_string: str) -> None:
parsed = parse_json_markdown(json_string)
assert parsed == {
"action": "Final Answer",
"action_input": '{"foo": "bar", "bar": "foo"}',
}
def test_parse_json_with_python_dict() -> None:
parsed = parse_json_markdown(JSON_WITH_PYTHON_DICT)
assert parsed == {
"action": "Final Answer",
"action_input": {"foo": "bar", "bar": "foo"},
}
TEST_CASES_PARTIAL = [
('{"foo": "bar", "bar": "foo"}', '{"foo": "bar", "bar": "foo"}'),
('{"foo": "bar", "bar": "foo', '{"foo": "bar", "bar": "foo"}'),
('{"foo": "bar", "bar": "foo}', '{"foo": "bar", "bar": "foo}"}'),
('{"foo": "bar", "bar": "foo[', '{"foo": "bar", "bar": "foo["}'),
('{"foo": "bar", "bar": "foo\\"', '{"foo": "bar", "bar": "foo\\""}'),
]
@pytest.mark.parametrize("json_strings", TEST_CASES_PARTIAL)
def test_parse_partial_json(json_strings: Tuple[str, str]) -> None:
case, expected = json_strings
parsed = parse_partial_json(case)
assert parsed == json.loads(expected)
STREAMED_TOKENS = """
{
"
setup
":
"
Why
did
the
bears
start
a
band
called
Bears
Bears
Bears
?
"
,
"
punchline
":
"
Because
they
wanted
to
play
bear
-y
good
music
!
"
,
"
audience
":
[
"
Haha
"
,
"
So
funny
"
]
}
""".splitlines()
EXPECTED_STREAMED_JSON = [
{},
{"setup": ""},
{"setup": "Why"},
{"setup": "Why did"},
{"setup": "Why did the"},
{"setup": "Why did the bears"},
{"setup": "Why did the bears start"},
{"setup": "Why did the bears start a"},
{"setup": "Why did the bears start a band"},
{"setup": "Why did the bears start a band called"},
{"setup": "Why did the bears start a band called Bears"},
{"setup": "Why did the bears start a band called Bears Bears"},
{"setup": "Why did the bears start a band called Bears Bears Bears"},
{"setup": "Why did the bears start a band called Bears Bears Bears ?"},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because they",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because they wanted",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because they wanted to",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because they wanted to play",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because they wanted to play bear",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because they wanted to play bear -y",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because they wanted to play bear -y good",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because they wanted to play bear -y good music",
},
{
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"punchline": "Because they wanted to play bear -y good music !",
},
{
"punchline": "Because they wanted to play bear -y good music !",
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"audience": [],
},
{
"punchline": "Because they wanted to play bear -y good music !",
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"audience": [""],
},
{
"punchline": "Because they wanted to play bear -y good music !",
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"audience": ["Haha"],
},
{
"punchline": "Because they wanted to play bear -y good music !",
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"audience": ["Haha", ""],
},
{
"punchline": "Because they wanted to play bear -y good music !",
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"audience": ["Haha", "So"],
},
{
"punchline": "Because they wanted to play bear -y good music !",
"setup": "Why did the bears start a band called Bears Bears Bears ?",
"audience": ["Haha", "So funny"],
},
]
EXPECTED_STREAMED_JSON_DIFF = [
[{"op": "replace", "path": "", "value": {}}],
[{"op": "add", "path": "/setup", "value": ""}],
[{"op": "replace", "path": "/setup", "value": "Why"}],
[{"op": "replace", "path": "/setup", "value": "Why did"}],
[{"op": "replace", "path": "/setup", "value": "Why did the"}],
[{"op": "replace", "path": "/setup", "value": "Why did the bears"}],
[{"op": "replace", "path": "/setup", "value": "Why did the bears start"}],
[{"op": "replace", "path": "/setup", "value": "Why did the bears start a"}],
[{"op": "replace", "path": "/setup", "value": "Why did the bears start a band"}],
[
{
"op": "replace",
"path": "/setup",
"value": "Why did the bears start a band called",
}
],
[
{
"op": "replace",
"path": "/setup",
"value": "Why did the bears start a band called Bears",
}
],
[
{
"op": "replace",
"path": "/setup",
"value": "Why did the bears start a band called Bears Bears",
}
],
[
{
"op": "replace",
"path": "/setup",
"value": "Why did the bears start a band called Bears Bears Bears",
}
],
[
{
"op": "replace",
"path": "/setup",
"value": "Why did the bears start a band called Bears Bears Bears ?",
}
],
[{"op": "add", "path": "/punchline", "value": ""}],
[{"op": "replace", "path": "/punchline", "value": "Because"}],
[{"op": "replace", "path": "/punchline", "value": "Because they"}],
[{"op": "replace", "path": "/punchline", "value": "Because they wanted"}],
[{"op": "replace", "path": "/punchline", "value": "Because they wanted to"}],
[{"op": "replace", "path": "/punchline", "value": "Because they wanted to play"}],
[
{
"op": "replace",
"path": "/punchline",
"value": "Because they wanted to play bear",
}
],
[
{
"op": "replace",
"path": "/punchline",
"value": "Because they wanted to play bear -y",
}
],
[
{
"op": "replace",
"path": "/punchline",
"value": "Because they wanted to play bear -y good",
}
],
[
{
"op": "replace",
"path": "/punchline",
"value": "Because they wanted to play bear -y good music",
}
],
[
{
"op": "replace",
"path": "/punchline",
"value": "Because they wanted to play bear -y good music !",
}
],
[{"op": "add", "path": "/audience", "value": []}],
[{"op": "add", "path": "/audience/0", "value": ""}],
[{"op": "replace", "path": "/audience/0", "value": "Haha"}],
[{"op": "add", "path": "/audience/1", "value": ""}],
[{"op": "replace", "path": "/audience/1", "value": "So"}],
[{"op": "replace", "path": "/audience/1", "value": "So funny"}],
]
def test_partial_text_json_output_parser() -> None:
def input_iter(_: Any) -> Iterator[str]:
for token in STREAMED_TOKENS:
yield token
chain = input_iter | SimpleJsonOutputParser()
assert list(chain.stream(None)) == EXPECTED_STREAMED_JSON
def test_partial_text_json_output_parser_diff() -> None:
def input_iter(_: Any) -> Iterator[str]:
for token in STREAMED_TOKENS:
yield token
chain = input_iter | SimpleJsonOutputParser(diff=True)
assert list(chain.stream(None)) == EXPECTED_STREAMED_JSON_DIFF
async def test_partial_text_json_output_parser_async() -> None:
async def input_iter(_: Any) -> AsyncIterator[str]:
for token in STREAMED_TOKENS:
yield token
chain = input_iter | SimpleJsonOutputParser()
assert [p async for p in chain.astream(None)] == EXPECTED_STREAMED_JSON
async def test_partial_text_json_output_parser_diff_async() -> None:
async def input_iter(_: Any) -> AsyncIterator[str]:
for token in STREAMED_TOKENS:
yield token
chain = input_iter | SimpleJsonOutputParser(diff=True)
assert [p async for p in chain.astream(None)] == EXPECTED_STREAMED_JSON_DIFF

View File

@@ -0,0 +1,49 @@
"""Test XMLOutputParser"""
import pytest
from langchain_core.output_parsers.xml import XMLOutputParser
DEF_RESULT_ENCODING = """<?xml version="1.0" encoding="UTF-8"?>
<foo>
<bar>
<baz></baz>
<baz>slim.shady</baz>
</bar>
<baz>tag</baz>
</foo>"""
DEF_RESULT_EXPECTED = {
"foo": [
{"bar": [{"baz": None}, {"baz": "slim.shady"}]},
{"baz": "tag"},
],
}
@pytest.mark.parametrize(
"result",
[DEF_RESULT_ENCODING, DEF_RESULT_ENCODING[DEF_RESULT_ENCODING.find("\n") :]],
)
def test_xml_output_parser(result: str) -> None:
"""Test XMLOutputParser."""
xml_parser = XMLOutputParser()
xml_result = xml_parser.parse(result)
assert DEF_RESULT_EXPECTED == xml_result
assert list(xml_parser.transform(iter(result))) == [
{"foo": [{"bar": [{"baz": None}]}]},
{"foo": [{"bar": [{"baz": "slim.shady"}]}]},
{"foo": [{"baz": "tag"}]},
]
@pytest.mark.parametrize("result", ["foo></foo>", "<foo></foo", "foo></foo", "foofoo"])
def test_xml_output_parser_fail(result: str) -> None:
"""Test XMLOutputParser where complete output is not in XML format."""
xml_parser = XMLOutputParser()
with pytest.raises(ValueError) as e:
xml_parser.parse(result)
assert "Could not parse output" in str(e)