diff --git a/docs/extras/modules/model_io/output_parsers/xml.ipynb b/docs/extras/modules/model_io/output_parsers/xml.ipynb new file mode 100644 index 00000000000..d0ca341d42f --- /dev/null +++ b/docs/extras/modules/model_io/output_parsers/xml.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "181b5b6d", + "metadata": {}, + "source": [ + "# XML parser\n", + "This output parser allows users to obtain results from an LLM in the popular XML format. \n", + "\n", + "Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed XML. \n", + "\n", + "In the following example we use the Claude model (https://docs.anthropic.com/claude/docs), which works really well with XML tags." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3b10fc55", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import PromptTemplate\n", + "from langchain.llms import Anthropic\n", + "from langchain.output_parsers import XMLOutputParser" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "909161d1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mateusz/Documents/Projects/langchain/libs/langchain/langchain/llms/anthropic.py:170: UserWarning: This Anthropic LLM is deprecated. Please use `from langchain.chat_models import ChatAnthropic` instead\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "model = Anthropic(model=\"claude-2\", max_tokens_to_sample=512, temperature=0.1)" + ] + }, + { + "cell_type": "markdown", + "id": "da312f86-0d2a-4aef-a09d-1e72bd0ea9b1", + "metadata": {}, + "source": [ + "Let's start with a simple request to the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b03785af-69fc-40a1-a1be-c04ed6fade70", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Here is the shortened filmography for Tom Hanks enclosed in tags:\n", + "\n", + "Splash (1984)\n", + "Big (1988) \n", + "A League of Their Own (1992)\n", + "Sleepless in Seattle (1993) \n", + "Forrest Gump (1994)\n", + "Apollo 13 (1995)\n", + "Toy Story (1995)\n", + "Saving Private Ryan (1998)\n", + "Cast Away (2000)\n", + "The Da Vinci Code (2006)\n", + "Toy Story 3 (2010)\n", + "Captain Phillips (2013)\n", + "Bridge of Spies (2015)\n", + "Toy Story 4 (2019)\n" + ] + } + ], + "source": [ + "actor_query = \"Generate the shortened filmography for Tom Hanks.\"\n", + "output = model(\n", + " f\"\"\"\n", + "\n", + "Human:\n", + "{actor_query}\n", + "Please enclose the movies in tags\n", + "Assistant:\n", + "\"\"\"\n", + ")\n", + "print(output)" + ] + }, + { + "cell_type": "markdown", + "id": "4db65781-3d54-4ba6-ae26-5b4ead47a4c8", + "metadata": {}, + "source": [ + "Now we will use the XMLOutputParser in order to get the structured output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "87ba8d11", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + " \n", + " Splash\n", + " 1984\n", + " \n", + " \n", + " \n", + " Big \n", + " 1988\n", + " \n", + "\n", + " \n", + " A League of Their Own\n", + " 1992\n", + " \n", + "\n", + " \n", + " Sleepless in Seattle\n", + " 1993\n", + " \n", + "\n", + " \n", + " Forrest Gump\n", + " 1994\n", + " \n", + "\n", + " \n", + " Toy Story\n", + " 1995\n", + " \n", + "\n", + " \n", + " Apollo 13\n", + " 1995\n", + " \n", + "\n", + " \n", + " Saving Private Ryan\n", + " 1998\n", + " \n", + "\n", + " \n", + " Cast Away\n", + " 2000\n", + " \n", + "\n", + " \n", + " Catch Me If You Can\n", + " 2002\n", + " \n", + "\n", + " \n", + " The Polar Express\n", + " 2004\n", + " \n", + "\n", + " \n", + " Charlie Wilson's War\n", + " 2007\n", + " \n", + "\n", + " \n", + " Toy Story 3\n", + " 2010\n", + " \n", + "\n", + " \n", + " Captain Phillips\n", + " 2013\n", + " \n", + "\n", + " \n", + " Bridge of Spies\n", + " 2015\n", + " \n", + "\n", + " \n", + " The Post\n", + " 2017\n", + " \n", + "\n", + " \n", + " A Beautiful Day in the Neighborhood \n", + " 2019\n", + " \n", + "\n" + ] + } + ], + "source": [ + "parser = XMLOutputParser()\n", + "\n", + "prompt = PromptTemplate(\n", + " template=\"\"\"\n", + " \n", + " Human:\n", + " {query}\n", + " {format_instructions}\n", + " Assistant:\"\"\",\n", + " input_variables=[\"query\"],\n", + " partial_variables={\"format_instructions\": parser.get_format_instructions()},\n", + ")\n", + "\n", + "_input = prompt.format_prompt(query=actor_query)\n", + "\n", + "output = model(_input.to_string())\n", + "print(output)" + ] + }, + { + "cell_type": "markdown", + "id": "1c4c47ee", + "metadata": {}, + "source": [ + "And here parsed output is shown:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4c864dc9", + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/plain": [ + "{'filmography': [{'movie': [{'title': 'Splash'}, {'year': '1984'}]},\n", + " {'movie': [{'title': 'Big'}, {'year': '1988'}]},\n", + " {'movie': [{'title': 'A League of Their Own'}, {'year': '1992'}]},\n", + " {'movie': [{'title': 'Sleepless in Seattle'}, {'year': '1993'}]},\n", + " {'movie': [{'title': 'Forrest Gump'}, {'year': '1994'}]},\n", + " {'movie': [{'title': 'Toy Story'}, {'year': '1995'}]},\n", + " {'movie': [{'title': 'Apollo 13'}, {'year': '1995'}]},\n", + " {'movie': [{'title': 'Saving Private Ryan'}, {'year': '1998'}]},\n", + " {'movie': [{'title': 'Cast Away'}, {'year': '2000'}]},\n", + " {'movie': [{'title': 'Catch Me If You Can'}, {'year': '2002'}]},\n", + " {'movie': [{'title': 'The Polar Express'}, {'year': '2004'}]},\n", + " {'movie': [{'title': \"Charlie Wilson's War\"}, {'year': '2007'}]},\n", + " {'movie': [{'title': 'Toy Story 3'}, {'year': '2010'}]},\n", + " {'movie': [{'title': 'Captain Phillips'}, {'year': '2013'}]},\n", + " {'movie': [{'title': 'Bridge of Spies'}, {'year': '2015'}]},\n", + " {'movie': [{'title': 'The Post'}, {'year': '2017'}]},\n", + " {'movie': [{'title': 'A Beautiful Day in the Neighborhood'},\n", + " {'year': '2019'}]}]}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "parser.parse(output)" + ] + }, + { + "cell_type": "markdown", + "id": "327f5479-77e0-4549-8393-2cd7a286d491", + "metadata": {}, + "source": [ + "Finally, let's add some tags to tailor the output to our needs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b722a235", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'movies': [{'actor': [{'name': 'Tom Hanks'},\n", + " {'film': [{'name': 'Splash'}, {'genre': 'Comedy'}]},\n", + " {'film': [{'name': 'Big'}, {'genre': 'Comedy'}]},\n", + " {'film': [{'name': 'A League of Their Own'}, {'genre': 'Drama'}]},\n", + " {'film': [{'name': 'Sleepless in Seattle'}, {'genre': 'Romance'}]},\n", + " {'film': [{'name': 'Forrest Gump'}, {'genre': 'Drama'}]},\n", + " {'film': [{'name': 'Toy Story'}, {'genre': 'Animation'}]},\n", + " {'film': [{'name': 'Apollo 13'}, {'genre': 'Drama'}]},\n", + " {'film': [{'name': 'Saving Private Ryan'}, {'genre': 'War'}]},\n", + " {'film': [{'name': 'Cast Away'}, {'genre': 'Adventure'}]},\n", + " {'film': [{'name': 'The Green Mile'}, {'genre': 'Drama'}]}]}]}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "parser = XMLOutputParser(tags=[\"movies\", \"actor\", \"film\", \"name\", \"genre\"])\n", + "prompt = PromptTemplate(\n", + " template=\"\"\"\n", + " \n", + " Human:\n", + " {query}\n", + " {format_instructions}\n", + " Assistant:\"\"\",\n", + " input_variables=[\"query\"],\n", + " partial_variables={\"format_instructions\": parser.get_format_instructions()},\n", + ")\n", + "\n", + "\n", + "_input = prompt.format_prompt(query=actor_query)\n", + "\n", + "output = model(_input.to_string())\n", + "\n", + "parser.parse(output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "808a5df5-b11e-42a0-bd7a-6b95ca0c3eba", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/output_parsers/__init__.py b/libs/langchain/langchain/output_parsers/__init__.py index d3d49c6bd2f..a98ad9caec9 100644 --- a/libs/langchain/langchain/output_parsers/__init__.py +++ b/libs/langchain/langchain/output_parsers/__init__.py @@ -28,6 +28,7 @@ from langchain.output_parsers.regex import RegexParser from langchain.output_parsers.regex_dict import RegexDictParser from langchain.output_parsers.retry import RetryOutputParser, RetryWithErrorOutputParser from langchain.output_parsers.structured import ResponseSchema, StructuredOutputParser +from langchain.output_parsers.xml import XMLOutputParser __all__ = [ "BooleanOutputParser", @@ -46,4 +47,5 @@ __all__ = [ "RetryOutputParser", "RetryWithErrorOutputParser", "StructuredOutputParser", + "XMLOutputParser", ] diff --git a/libs/langchain/langchain/output_parsers/format_instructions.py b/libs/langchain/langchain/output_parsers/format_instructions.py index 11b782a1b76..a614cf3f0d6 100644 --- a/libs/langchain/langchain/output_parsers/format_instructions.py +++ b/libs/langchain/langchain/output_parsers/format_instructions.py @@ -25,3 +25,19 @@ Here is the output schema: ``` {schema} ```""" + + +XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file. +1. Output should conform to the tags below. +2. If tags are not given, make them on your own. +3. Remember to always open and close all the tags. + +As an example, for the tags ["foo", "bar", "baz"]: +1. String "\n \n \n \n" is a well-formatted instance of the schema. +2. String "\n \n " is a badly-formatted instance. +3. String "\n \n \n" is a badly-formatted instance. 
class XMLOutputParser(BaseOutputParser):
    """Parse an LLM completion that is expected to contain an XML document.

    The completion may be wrapped in a Markdown code fence (optionally tagged
    ``xml``) and may begin with an XML declaration that carries an
    ``encoding`` attribute; both are removed before the document is parsed
    with ``xml.etree.ElementTree``.
    """

    # Tag names interpolated into the format instructions. ``None`` is
    # rendered as-is; the instructions tell the model to invent tags then.
    tags: Optional[List[str]] = None
    # Matches a ``<... encoding ...>`` header followed by a newline; group 2
    # captures everything that comes after that header.
    # NOTE(review): the pattern is not anchored to ``<?xml``, so an ordinary
    # element whose tag or attributes contain "encoding" and is followed by a
    # newline would match as well -- confirm whether that can occur.
    encoding_matcher: re.Pattern = re.compile(
        r"<([^>]*encoding[^>]*)>\n(.*)", re.MULTILINE | re.DOTALL
    )

    def get_format_instructions(self) -> str:
        """Return the XML prompt instructions with ``self.tags`` filled in."""
        return XML_FORMAT_INSTRUCTIONS.format(tags=self.tags)

    def parse(self, text: str) -> Dict[str, List[Any]]:
        """Convert an XML completion into a nested dictionary.

        Args:
            text: Raw model output. Surrounding whitespace, a surrounding
                code fence (with or without the ``xml`` language tag) and a
                leading XML declaration are tolerated.

        Returns:
            ``{root_tag: [...]}`` as produced by ``_root_to_dict``.

        Raises:
            ValueError: If the cleaned text does not start with ``<`` and end
                with ``>``, or if it is not well-formed XML.
        """
        # Whitespace must go first, otherwise a trailing newline after the
        # closing fence keeps the backticks from being stripped.
        text = text.strip().strip("`")
        # Drop the fence's language tag. ``str.strip("xml")`` would instead
        # strip any run of the characters x/m/l from *both* ends.
        if text.startswith("xml"):
            text = text[len("xml") :]

        encoding_match = self.encoding_matcher.search(text)
        if encoding_match:
            # Keep only the document that follows the declaration line.
            text = encoding_match.group(2)

        text = text.strip()
        if not (text.startswith("<") and text.endswith(">")):
            raise ValueError(f"Could not parse output: {text}")

        try:
            root = ET.fromstring(text)
        except ET.ParseError as err:
            # ParseError derives from SyntaxError; surface malformed XML the
            # same way as every other unparseable completion.
            raise ValueError(f"Could not parse output: {text}") from err
        return self._root_to_dict(root)

    def _root_to_dict(self, root: ET.Element) -> Dict[str, List[Any]]:
        """Convert an element tree into ``{tag: [child, ...]}`` recursively.

        A child without sub-elements becomes ``{child.tag: child.text}``
        (``None`` when it has no text); a child with sub-elements is
        converted recursively. Element attributes, and the text of elements
        that have children, are not included in the result.
        """
        children: List[Any] = [
            {child.tag: child.text} if len(child) == 0 else self._root_to_dict(child)
            for child in root
        ]
        return {root.tag: children}

    @property
    def _type(self) -> str:
        """Identifier of this parser type."""
        return "xml"
"""Test XMLOutputParser"""
import pytest

from langchain.output_parsers.xml import XMLOutputParser

# Document with an XML declaration on its first line. The markup was rebuilt
# so that parsing it yields exactly DEF_RESULT_EXPECTED.
DEF_RESULT_ENCODING = """<?xml version="1.0" encoding="UTF-8"?>
<foo>
    <bar>
        <baz></baz>
        <baz>slim.shady</baz>
    </bar>
    <baz>tag</baz>
</foo>"""

DEF_RESULT_EXPECTED = {
    "foo": [
        {"bar": [{"baz": None}, {"baz": "slim.shady"}]},
        {"baz": "tag"},
    ],
}


@pytest.mark.parametrize(
    "result",
    # Second case drops the declaration line, leaving a leading newline.
    [DEF_RESULT_ENCODING, DEF_RESULT_ENCODING[DEF_RESULT_ENCODING.find("\n") :]],
)
def test_xml_output_parser(result: str) -> None:
    """Test XMLOutputParser with and without an XML declaration."""

    xml_parser = XMLOutputParser()

    xml_result = xml_parser.parse(result)
    assert DEF_RESULT_EXPECTED == xml_result


@pytest.mark.parametrize(
    "result", ["foo></foo>", "<foo></foo", "foo></foo", "foofoo"]
)
def test_xml_output_parser_fail(result: str) -> None:
    """Test XMLOutputParser where complete output is not in XML format."""

    xml_parser = XMLOutputParser()

    with pytest.raises(ValueError) as exc_info:
        xml_parser.parse(result)
    # Must sit outside the ``with`` block: statements after the raising call
    # inside it never run. The message lives on ``exc_info.value``.
    assert "Could not parse output" in str(exc_info.value)