core[patch]: XMLOutputParser fix to handle changes to xml standard library (#19612)

Newest python micro releases broke streaming in the XMLOutputParser. This fixes the parsing code to work with trailing junk after the XML content.
This commit is contained in:
Eugene Yurtsev
2024-03-27 09:25:28 -04:00
committed by GitHub
parent 3a7d2cf443
commit 8ab7bb3166
2 changed files with 127 additions and 55 deletions

View File

@@ -1,10 +1,12 @@
"""Test XMLOutputParser"""
from typing import AsyncIterator, Iterable
import pytest
from langchain_core.exceptions import OutputParserException
from langchain_core.output_parsers.xml import XMLOutputParser
DEF_RESULT_ENCODING = """<?xml version="1.0" encoding="UTF-8"?>
DATA = """
<foo>
<bar>
<baz></baz>
@@ -13,6 +15,25 @@ DEF_RESULT_ENCODING = """<?xml version="1.0" encoding="UTF-8"?>
<baz>tag</baz>
</foo>"""
WITH_XML_HEADER = f"""<?xml version="1.0" encoding="UTF-8"?>
{DATA}"""
IN_XML_TAGS_WITH_XML_HEADER = f"""
```xml
{WITH_XML_HEADER}
```
"""
IN_XML_TAGS_WITH_HEADER_AND_TRAILING_JUNK = f"""
Some random text
```xml
{WITH_XML_HEADER}
```
More random text
"""
DEF_RESULT_EXPECTED = {
"foo": [
{"bar": [{"baz": None}, {"baz": "slim.shady"}]},
@@ -24,23 +45,13 @@ DEF_RESULT_EXPECTED = {
@pytest.mark.parametrize(
"result",
[
DEF_RESULT_ENCODING,
DEF_RESULT_ENCODING[DEF_RESULT_ENCODING.find("\n") :],
f"""
```xml
{DEF_RESULT_ENCODING}
```
""",
f"""
Some random text
```xml
{DEF_RESULT_ENCODING}
```
More random text
""",
DATA, # has no xml header
WITH_XML_HEADER,
IN_XML_TAGS_WITH_XML_HEADER,
IN_XML_TAGS_WITH_HEADER_AND_TRAILING_JUNK,
],
)
def test_xml_output_parser(result: str) -> None:
async def test_xml_output_parser(result: str) -> None:
"""Test XMLOutputParser."""
xml_parser = XMLOutputParser()
@@ -48,12 +59,23 @@ def test_xml_output_parser(result: str) -> None:
xml_result = xml_parser.parse(result)
assert DEF_RESULT_EXPECTED == xml_result
# TODO(Eugene): Fix this test for newer python version
# assert list(xml_parser.transform(iter(result))) == [
# {"foo": [{"bar": [{"baz": None}]}]},
# {"foo": [{"bar": [{"baz": "slim.shady"}]}]},
# {"foo": [{"baz": "tag"}]},
# ]
assert list(xml_parser.transform(iter(result))) == [
{"foo": [{"bar": [{"baz": None}]}]},
{"foo": [{"bar": [{"baz": "slim.shady"}]}]},
{"foo": [{"baz": "tag"}]},
]
async def _as_iter(iterable: Iterable[str]) -> AsyncIterator[str]:
for item in iterable:
yield item
chunks = [chunk async for chunk in xml_parser.atransform(_as_iter(result))]
assert list(chunks) == [
{"foo": [{"bar": [{"baz": None}]}]},
{"foo": [{"bar": [{"baz": "slim.shady"}]}]},
{"foo": [{"baz": "tag"}]},
]
@pytest.mark.parametrize("result", ["foo></foo>", "<foo></foo", "foo></foo", "foofoo"])