mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-15 03:57:19 +00:00
Add XMLOutputParser (#10051)
**Description** Adds new output parser, this time enabling the output of LLM to be of an XML format. Seems to be particularly useful together with Claude model. Addresses [issue 9820](https://github.com/langchain-ai/langchain/issues/9820). **Twitter handle** @deepsense_ai @matt_wosinski
This commit is contained in:
parent
d6df288380
commit
720f6dbaac
358
docs/extras/modules/model_io/output_parsers/xml.ipynb
Normal file
358
docs/extras/modules/model_io/output_parsers/xml.ipynb
Normal file
@ -0,0 +1,358 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "181b5b6d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# XML parser\n",
|
||||
"This output parser allows users to obtain results from LLM in the popular XML format. \n",
|
||||
"\n",
|
||||
"Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed XML. \n",
|
||||
"\n",
|
||||
"In the following example we use Claude model (https://docs.anthropic.com/claude/docs) which works really well with XML tags."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "3b10fc55",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"from langchain.llms import Anthropic\n",
|
||||
"from langchain.output_parsers import XMLOutputParser"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "909161d1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/mateusz/Documents/Projects/langchain/libs/langchain/langchain/llms/anthropic.py:170: UserWarning: This Anthropic LLM is deprecated. Please use `from langchain.chat_models import ChatAnthropic` instead\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = Anthropic(model=\"claude-2\", max_tokens_to_sample=512, temperature=0.1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "da312f86-0d2a-4aef-a09d-1e72bd0ea9b1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's start with the simple request to the model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "b03785af-69fc-40a1-a1be-c04ed6fade70",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Here is the shortened filmography for Tom Hanks enclosed in <movie> tags:\n",
|
||||
"\n",
|
||||
"<movie>Splash (1984)</movie>\n",
|
||||
"<movie>Big (1988)</movie> \n",
|
||||
"<movie>A League of Their Own (1992)</movie>\n",
|
||||
"<movie>Sleepless in Seattle (1993)</movie> \n",
|
||||
"<movie>Forrest Gump (1994)</movie>\n",
|
||||
"<movie>Apollo 13 (1995)</movie>\n",
|
||||
"<movie>Toy Story (1995)</movie>\n",
|
||||
"<movie>Saving Private Ryan (1998)</movie>\n",
|
||||
"<movie>Cast Away (2000)</movie>\n",
|
||||
"<movie>The Da Vinci Code (2006)</movie>\n",
|
||||
"<movie>Toy Story 3 (2010)</movie>\n",
|
||||
"<movie>Captain Phillips (2013)</movie>\n",
|
||||
"<movie>Bridge of Spies (2015)</movie>\n",
|
||||
"<movie>Toy Story 4 (2019)</movie>\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"actor_query = \"Generate the shortened filmography for Tom Hanks.\"\n",
|
||||
"output = model(\n",
|
||||
" f\"\"\"\n",
|
||||
"\n",
|
||||
"Human:\n",
|
||||
"{actor_query}\n",
|
||||
"Please enclose the movies in <movie></movie> tags\n",
|
||||
"Assistant:\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"print(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4db65781-3d54-4ba6-ae26-5b4ead47a4c8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we will use the XMLOutputParser in order to get the structured output."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "87ba8d11",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"<filmography>\n",
|
||||
" <movie>\n",
|
||||
" <title>Splash</title>\n",
|
||||
" <year>1984</year>\n",
|
||||
" </movie>\n",
|
||||
" \n",
|
||||
" <movie>\n",
|
||||
" <title>Big</title> \n",
|
||||
" <year>1988</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>A League of Their Own</title>\n",
|
||||
" <year>1992</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Sleepless in Seattle</title>\n",
|
||||
" <year>1993</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Forrest Gump</title>\n",
|
||||
" <year>1994</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Toy Story</title>\n",
|
||||
" <year>1995</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Apollo 13</title>\n",
|
||||
" <year>1995</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Saving Private Ryan</title>\n",
|
||||
" <year>1998</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Cast Away</title>\n",
|
||||
" <year>2000</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Catch Me If You Can</title>\n",
|
||||
" <year>2002</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>The Polar Express</title>\n",
|
||||
" <year>2004</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Charlie Wilson's War</title>\n",
|
||||
" <year>2007</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Toy Story 3</title>\n",
|
||||
" <year>2010</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Captain Phillips</title>\n",
|
||||
" <year>2013</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>Bridge of Spies</title>\n",
|
||||
" <year>2015</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>The Post</title>\n",
|
||||
" <year>2017</year>\n",
|
||||
" </movie>\n",
|
||||
"\n",
|
||||
" <movie>\n",
|
||||
" <title>A Beautiful Day in the Neighborhood</title> \n",
|
||||
" <year>2019</year>\n",
|
||||
" </movie>\n",
|
||||
"</filmography>\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"parser = XMLOutputParser()\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate(\n",
|
||||
" template=\"\"\"\n",
|
||||
" \n",
|
||||
" Human:\n",
|
||||
" {query}\n",
|
||||
" {format_instructions}\n",
|
||||
" Assistant:\"\"\",\n",
|
||||
" input_variables=[\"query\"],\n",
|
||||
" partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"_input = prompt.format_prompt(query=actor_query)\n",
|
||||
"\n",
|
||||
"output = model(_input.to_string())\n",
|
||||
"print(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1c4c47ee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And here parsed output is shown:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "4c864dc9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'filmography': [{'movie': [{'title': 'Splash'}, {'year': '1984'}]},\n",
|
||||
" {'movie': [{'title': 'Big'}, {'year': '1988'}]},\n",
|
||||
" {'movie': [{'title': 'A League of Their Own'}, {'year': '1992'}]},\n",
|
||||
" {'movie': [{'title': 'Sleepless in Seattle'}, {'year': '1993'}]},\n",
|
||||
" {'movie': [{'title': 'Forrest Gump'}, {'year': '1994'}]},\n",
|
||||
" {'movie': [{'title': 'Toy Story'}, {'year': '1995'}]},\n",
|
||||
" {'movie': [{'title': 'Apollo 13'}, {'year': '1995'}]},\n",
|
||||
" {'movie': [{'title': 'Saving Private Ryan'}, {'year': '1998'}]},\n",
|
||||
" {'movie': [{'title': 'Cast Away'}, {'year': '2000'}]},\n",
|
||||
" {'movie': [{'title': 'Catch Me If You Can'}, {'year': '2002'}]},\n",
|
||||
" {'movie': [{'title': 'The Polar Express'}, {'year': '2004'}]},\n",
|
||||
" {'movie': [{'title': \"Charlie Wilson's War\"}, {'year': '2007'}]},\n",
|
||||
" {'movie': [{'title': 'Toy Story 3'}, {'year': '2010'}]},\n",
|
||||
" {'movie': [{'title': 'Captain Phillips'}, {'year': '2013'}]},\n",
|
||||
" {'movie': [{'title': 'Bridge of Spies'}, {'year': '2015'}]},\n",
|
||||
" {'movie': [{'title': 'The Post'}, {'year': '2017'}]},\n",
|
||||
" {'movie': [{'title': 'A Beautiful Day in the Neighborhood'},\n",
|
||||
" {'year': '2019'}]}]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"parser.parse(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "327f5479-77e0-4549-8393-2cd7a286d491",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, let's add some tags to tailor the output to our needs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "b722a235",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'movies': [{'actor': [{'name': 'Tom Hanks'},\n",
|
||||
" {'film': [{'name': 'Splash'}, {'genre': 'Comedy'}]},\n",
|
||||
" {'film': [{'name': 'Big'}, {'genre': 'Comedy'}]},\n",
|
||||
" {'film': [{'name': 'A League of Their Own'}, {'genre': 'Drama'}]},\n",
|
||||
" {'film': [{'name': 'Sleepless in Seattle'}, {'genre': 'Romance'}]},\n",
|
||||
" {'film': [{'name': 'Forrest Gump'}, {'genre': 'Drama'}]},\n",
|
||||
" {'film': [{'name': 'Toy Story'}, {'genre': 'Animation'}]},\n",
|
||||
" {'film': [{'name': 'Apollo 13'}, {'genre': 'Drama'}]},\n",
|
||||
" {'film': [{'name': 'Saving Private Ryan'}, {'genre': 'War'}]},\n",
|
||||
" {'film': [{'name': 'Cast Away'}, {'genre': 'Adventure'}]},\n",
|
||||
" {'film': [{'name': 'The Green Mile'}, {'genre': 'Drama'}]}]}]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"parser = XMLOutputParser(tags=[\"movies\", \"actor\", \"film\", \"name\", \"genre\"])\n",
|
||||
"prompt = PromptTemplate(\n",
|
||||
" template=\"\"\"\n",
|
||||
" \n",
|
||||
" Human:\n",
|
||||
" {query}\n",
|
||||
" {format_instructions}\n",
|
||||
" Assistant:\"\"\",\n",
|
||||
" input_variables=[\"query\"],\n",
|
||||
" partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"_input = prompt.format_prompt(query=actor_query)\n",
|
||||
"\n",
|
||||
"output = model(_input.to_string())\n",
|
||||
"\n",
|
||||
"parser.parse(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "808a5df5-b11e-42a0-bd7a-6b95ca0c3eba",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -28,6 +28,7 @@ from langchain.output_parsers.regex import RegexParser
|
||||
from langchain.output_parsers.regex_dict import RegexDictParser
|
||||
from langchain.output_parsers.retry import RetryOutputParser, RetryWithErrorOutputParser
|
||||
from langchain.output_parsers.structured import ResponseSchema, StructuredOutputParser
|
||||
from langchain.output_parsers.xml import XMLOutputParser
|
||||
|
||||
__all__ = [
|
||||
"BooleanOutputParser",
|
||||
@ -46,4 +47,5 @@ __all__ = [
|
||||
"RetryOutputParser",
|
||||
"RetryWithErrorOutputParser",
|
||||
"StructuredOutputParser",
|
||||
"XMLOutputParser",
|
||||
]
|
||||
|
@ -25,3 +25,19 @@ Here is the output schema:
|
||||
```
|
||||
{schema}
|
||||
```"""
|
||||
|
||||
|
||||
XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file.
|
||||
1. Output should conform to the tags below.
|
||||
2. If tags are not given, make them on your own.
|
||||
3. Remember to always open and close all the tags.
|
||||
|
||||
As an example, for the tags ["foo", "bar", "baz"]:
|
||||
1. String "<foo>\n <bar>\n <baz></baz>\n </bar>\n</foo>" is a well-formatted instance of the schema.
|
||||
2. String "<foo>\n <bar>\n </foo>" is a badly-formatted instance.
|
||||
3. String "<foo>\n <tag>\n </tag>\n</foo>" is a badly-formatted instance.
|
||||
|
||||
Here are the output tags:
|
||||
```
|
||||
{tags}
|
||||
```"""
|
||||
|
45
libs/langchain/langchain/output_parsers/xml.py
Normal file
45
libs/langchain/langchain/output_parsers/xml.py
Normal file
@ -0,0 +1,45 @@
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain.output_parsers.format_instructions import XML_FORMAT_INSTRUCTIONS
|
||||
from langchain.schema import BaseOutputParser
|
||||
|
||||
|
||||
class XMLOutputParser(BaseOutputParser):
|
||||
"""Parse an output using xml format."""
|
||||
|
||||
tags: Optional[List[str]] = None
|
||||
encoding_matcher: re.Pattern = re.compile(
|
||||
r"<([^>]*encoding[^>]*)>\n(.*)", re.MULTILINE | re.DOTALL
|
||||
)
|
||||
|
||||
def get_format_instructions(self) -> str:
|
||||
return XML_FORMAT_INSTRUCTIONS.format(tags=self.tags)
|
||||
|
||||
def parse(self, text: str) -> Dict[str, List[Any]]:
|
||||
text = text.strip("`").strip("xml")
|
||||
encoding_match = self.encoding_matcher.search(text)
|
||||
if encoding_match:
|
||||
text = encoding_match.group(2)
|
||||
if (text.startswith("<") or text.startswith("\n<")) and (
|
||||
text.endswith(">") or text.endswith(">\n")
|
||||
):
|
||||
root = ET.fromstring(text)
|
||||
return self._root_to_dict(root)
|
||||
else:
|
||||
raise ValueError(f"Could not parse output: {text}")
|
||||
|
||||
def _root_to_dict(self, root: ET.Element) -> Dict[str, List[Any]]:
|
||||
"""Converts xml tree to python dictionary."""
|
||||
result: Dict[str, List[Any]] = {root.tag: []}
|
||||
for child in root:
|
||||
if len(child) == 0:
|
||||
result[root.tag].append({child.tag: child.text})
|
||||
else:
|
||||
result[root.tag].append(self._root_to_dict(child))
|
||||
return result
|
||||
|
||||
@property
|
||||
def _type(self) -> str:
|
||||
return "xml"
|
@ -0,0 +1,44 @@
|
||||
"""Test XMLOutputParser"""
|
||||
import pytest
|
||||
|
||||
from langchain.output_parsers.xml import XMLOutputParser
|
||||
|
||||
DEF_RESULT_ENCODING = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<foo>
|
||||
<bar>
|
||||
<baz></baz>
|
||||
<baz>slim.shady</baz>
|
||||
</bar>
|
||||
<baz>tag</baz>
|
||||
</foo>"""
|
||||
|
||||
DEF_RESULT_EXPECTED = {
|
||||
"foo": [
|
||||
{"bar": [{"baz": None}, {"baz": "slim.shady"}]},
|
||||
{"baz": "tag"},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"result",
|
||||
[DEF_RESULT_ENCODING, DEF_RESULT_ENCODING[DEF_RESULT_ENCODING.find("\n") :]],
|
||||
)
|
||||
def test_xml_output_parser(result: str) -> None:
|
||||
"""Test XMLOutputParser."""
|
||||
|
||||
xml_parser = XMLOutputParser()
|
||||
|
||||
xml_result = xml_parser.parse(result)
|
||||
assert DEF_RESULT_EXPECTED == xml_result
|
||||
|
||||
|
||||
@pytest.mark.parametrize("result", ["foo></foo>", "<foo></foo", "foo></foo", "foofoo"])
|
||||
def test_xml_output_parser_fail(result: str) -> None:
|
||||
"""Test XMLOutputParser where complete output is not in XML format."""
|
||||
|
||||
xml_parser = XMLOutputParser()
|
||||
|
||||
with pytest.raises(ValueError) as e:
|
||||
xml_parser.parse(result)
|
||||
assert "Could not parse output" in str(e)
|
Loading…
Reference in New Issue
Block a user