Enhancement on feature/yaml output parser (#14674)

Building on my previously merged PR, I made some further improvements: * Added documentation to the existing Pydantic Parser notebook, with an example using LCEL and `with_retry()` on `OutputParserException`. * Added an additional output example to the prompt * More lenient parser in terms of LLM output format * Amended unit test FYI @hwchase17 --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
2025-06-30 10:23:30 +00:00 · 2024-01-02 01:49:58 +01:00 · 2024-01-02 01:49:58 +01:00 · 9d8468a576
commit 9d8468a576
parent ff10f30149
5 changed files with 153 additions and 8 deletions
--- a/docs/docs/modules/model_io/output_parsers/types/index.mdx
+++ b/docs/docs/modules/model_io/output_parsers/types/index.mdx
@ -25,6 +25,7 @@ This is a list of output parsers LangChain supports. The table below has various
 | [OutputFixing](./output_fixing)    |                    |                               | ✅         | `str \| Message`                 |                      | Wraps another output parser. If that output parser errors, then this will pass the error message and the bad output to an LLM and ask it to fix the output.                                                                                              |   |   |
 | [RetryWithError](./retry)  |                    |                               | ✅         | `str \| Message`                 |                      | Wraps another output parser. If that output parser errors, then this will pass the original inputs, the bad output, and the error message to an LLM and ask it to fix it. Compared to OutputFixingParser, this one also sends the original instructions. |   |   |
 | [Pydantic](./pydantic)        |                    | ✅                             |           | `str \| Message`                 | `pydantic.BaseModel` | Takes a user defined Pydantic model and returns data in that format.                                                                                                                                                                                     |   |   |
+| [YAML](./yaml)        |                    | ✅                             |           | `str \| Message`                 | `pydantic.BaseModel` | Takes a user defined Pydantic model and returns data in that format. Uses YAML to encode it.                                                                                                                                                                                    |   |   |
 | [PandasDataFrame](./pandas_dataframe) |                    | ✅                             |           | `str \| Message`                 | `dict`               | Useful for doing operations with pandas DataFrames.                                                                                                                                                                                                      |   |   |
 | [Enum](./enum)            |                    | ✅                             |           | `str \| Message`                 | `Enum`               | Parses response into one of the provided enum values.                                                                                                                                                                                                    |   |   |
 | [Datetime](./datetime)        |                    | ✅                             |           | `str \| Message`                 | `datetime.datetime`  | Parses response into a datetime string.                                                                                                                                                                                                                  |   |   |
--- a/docs/docs/modules/model_io/output_parsers/types/yaml.ipynb
+++ b/docs/docs/modules/model_io/output_parsers/types/yaml.ipynb
@ -0,0 +1,119 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "72b1b316",
+   "metadata": {},
+   "source": [
+    "# YAML parser\n",
+    "This output parser allows users to specify an arbitrary schema and query LLMs for outputs that conform to that schema, using YAML to format their response.\n",
+    "\n",
+    "Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed YAML. In the OpenAI family, DaVinci can do this reliably, but Curie's ability already drops off dramatically. \n",
+    "\n",
+    "You can optionally use Pydantic to declare your data model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "cd33369f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import List\n",
+    "\n",
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.output_parsers import YamlOutputParser\n",
+    "from langchain.prompts import PromptTemplate\n",
+    "from langchain_core.pydantic_v1 import BaseModel, Field"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "9b4d242f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = ChatOpenAI(temperature=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a1090014",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define your desired data structure.\n",
+    "class Joke(BaseModel):\n",
+    "    setup: str = Field(description=\"question to set up a joke\")\n",
+    "    punchline: str = Field(description=\"answer to resolve the joke\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "4ccf45a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Joke(setup=\"Why don't scientists trust atoms?\", punchline='Because they make up everything!')"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# And a query intended to prompt a language model to populate the data structure.\n",
+    "joke_query = \"Tell me a joke.\"\n",
+    "\n",
+    "# Set up a parser + inject instructions into the prompt template.\n",
+    "parser = YamlOutputParser(pydantic_object=Joke)\n",
+    "\n",
+    "prompt = PromptTemplate(\n",
+    "    template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
+    "    input_variables=[\"query\"],\n",
+    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
+    ")\n",
+    "\n",
+    "chain = prompt | model | parser\n",
+    "\n",
+    "chain.invoke({\"query\": joke_query})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4d12261",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/libs/langchain/langchain/output_parsers/format_instructions.py
+++ b/libs/langchain/langchain/output_parsers/format_instructions.py
@ -28,11 +28,12 @@ Here is the output schema:

 YAML_FORMAT_INSTRUCTIONS = """The output should be formatted as a YAML instance that conforms to the given JSON schema below.

-As an example, for the schema
+# Examples
+## Schema
 ```
-{{'title': 'Players', 'description': 'A list of players', 'type': 'array', 'items': {{'$ref': '#/definitions/Player'}}, 'definitions': {{'Player': {{'title': 'Player', 'type': 'object', 'properties': {{'name': {{'title': 'Name', 'description': 'Player name', 'type': 'string'}}, 'avg': {{'title': 'Avg', 'description': 'Batting average', 'type': 'number'}}}}, 'required': ['name', 'avg']}}}}}}
+{{"title": "Players", "description": "A list of players", "type": "array", "items": {{"$ref": "#/definitions/Player"}}, "definitions": {{"Player": {{"title": "Player", "type": "object", "properties": {{"name": {{"title": "Name", "description": "Player name", "type": "string"}}, "avg": {{"title": "Avg", "description": "Batting average", "type": "number"}}}}, "required": ["name", "avg"]}}}}}}
 ```
-a well formatted instance would be:
+## Well formatted instance
 ```
 - name: John Doe
  avg: 0.3
@ -40,12 +41,22 @@ a well formatted instance would be:
  avg: 1.4
 ```

+## Schema
+```
+{{"properties": {{"habit": {{ "description": "A common daily habit", "type": "string" }}, "sustainable_alternative": {{ "description": "An environmentally friendly alternative to the habit", "type": "string"}}}}, "required": ["habit", "sustainable_alternative"]}}
+```
+## Well formatted instance
+```
+habit: Using disposable water bottles for daily hydration.
+sustainable_alternative: Switch to a reusable water bottle to reduce plastic waste and decrease your environmental footprint.
+``` 
+
 Please follow the standard YAML formatting conventions with an indent of 2 spaces and make sure that the data types adhere strictly to the following JSON schema: 
 ```
 {schema}
 ```

-Make sure to always enclose the YAML output in triple backticks (```)"""
+Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!"""


 PANDAS_DATAFRAME_FORMAT_INSTRUCTIONS = """The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
--- a/libs/langchain/langchain/output_parsers/yaml.py
+++ b/libs/langchain/langchain/output_parsers/yaml.py
@ -30,6 +30,9 @@ class YamlOutputParser(BaseOutputParser[T]):
            yaml_str = ""
            if match:
                yaml_str = match.group("yaml")
+            else:
+                # If no backticks were present, try to parse the entire output as yaml.
+                yaml_str = text

            json_object = yaml.safe_load(yaml_str)
            return self.pydantic_object.parse_obj(json_object)
@ -37,7 +40,7 @@ class YamlOutputParser(BaseOutputParser[T]):
        except (yaml.YAMLError, ValidationError) as e:
            name = self.pydantic_object.__name__
            msg = f"Failed to parse {name} from completion {text}. Got: {e}"
-            raise OutputParserException(msg, llm_output=text)
+            raise OutputParserException(msg, llm_output=text) from e

    def get_format_instructions(self) -> str:
        schema = self.pydantic_object.schema()
--- a/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
@ -2,6 +2,7 @@
 from enum import Enum
 from typing import Optional

+import pytest
 from langchain_core.exceptions import OutputParserException
 from langchain_core.pydantic_v1 import BaseModel, Field

@ -39,6 +40,15 @@ for_new_lines: |
   escape_newline: 

 ```"""
+DEF_RESULT_NO_BACKTICKS = """
+action: Update
+action_input: The yamlOutputParser class is powerful
+additional_fields: null
+for_new_lines: |
+  not_escape_newline:
+   escape_newline: 
+
+"""

 # action 'update' with a lowercase 'u' to test schema validation failure.
 DEF_RESULT_FAIL = """```yaml
@ -55,16 +65,17 @@ DEF_EXPECTED_RESULT = TestModel(
 )


-def test_yaml_output_parser() -> None:
+@pytest.mark.parametrize("result", [DEF_RESULT, DEF_RESULT_NO_BACKTICKS])
+def test_yaml_output_parser(result: str) -> None:
    """Test yamlOutputParser."""

    yaml_parser: YamlOutputParser[TestModel] = YamlOutputParser(
        pydantic_object=TestModel
    )

-    result = yaml_parser.parse(DEF_RESULT)
+    model = yaml_parser.parse(result)
    print("parse_result:", result)
-    assert DEF_EXPECTED_RESULT == result
+    assert DEF_EXPECTED_RESULT == model


 def test_yaml_output_parser_fail() -> None: