feat: Yaml output parser (#14496)

## Description New YAML output parser as a drop-in replacement for the Pydantic output parser. YAML is a much more token-efficient format than JSON, proving to be **~35% faster and using the same percentage fewer completion tokens**. ☑️ Formatted ☑️ Linted ☑️ Tested (analogous to the existing `test_pydantic_parser.py`). The YAML parser excels in situations where a list of objects is required and the root object needs no key: ```python class Products(BaseModel): __root__: list[Product] ``` I ran the prompt `Generate 10 healthy, organic products` 10 times on each of two chains, one using the `PydanticOutputParser` and the other using the `YamlOutputParser`, with `Products` (see below) being the targeted model to be created. LLMs used were Fireworks' `llama-v2-34b-code-instruct` and OpenAI `gpt-3.5-turbo`. All runs succeeded without validation errors. ```python class Nutrition(BaseModel): sugar: int = Field(description="Sugar in grams") fat: float = Field(description="% of daily fat intake") class Product(BaseModel): name: str = Field(description="Product name") stats: Nutrition class Products(BaseModel): """A list of products""" products: list[Product] # Used `__root__` for the yaml chain ``` Stats after 10 runs each were as follows: ### JSON ø time: 7.75s ø tokens: 380.8 ### YAML ø time: 5.12s ø tokens: 242.2 Looking forward to feedback, tips and contributions!
2025-08-10 05:20:39 +00:00 · 2023-12-13 02:04:31 +01:00 · 2023-12-13 02:04:31 +01:00 · b4e3e47c92
commit b4e3e47c92
parent d31ff30df6
5 changed files with 164 additions and 0 deletions
--- a/libs/langchain/langchain/output_parsers/__init__.py
+++ b/libs/langchain/langchain/output_parsers/__init__.py
@ -36,6 +36,7 @@ from langchain.output_parsers.regex_dict import RegexDictParser
 from langchain.output_parsers.retry import RetryOutputParser, RetryWithErrorOutputParser
 from langchain.output_parsers.structured import ResponseSchema, StructuredOutputParser
 from langchain.output_parsers.xml import XMLOutputParser
 from langchain.output_parsers.yaml import YamlOutputParser
 __all__ = [
    "BooleanOutputParser",
@ -60,4 +61,5 @@ __all__ = [
    "JsonOutputToolsParser",
    "PydanticToolsParser",
    "JsonOutputKeyToolsParser",
    "YamlOutputParser",
 ]
--- a/libs/langchain/langchain/output_parsers/format_instructions.py
+++ b/libs/langchain/langchain/output_parsers/format_instructions.py
@ -26,6 +26,26 @@ Here is the output schema:
 {schema}
 ```"""
 # Prompt template telling the model to answer with YAML that conforms to a JSON schema.
 # The single `{schema}` placeholder is filled via `str.format` by
 # `YamlOutputParser.get_format_instructions`; every literal brace in the worked
 # example is therefore doubled (`{{` / `}}`) so `str.format` leaves it untouched.
 # NOTE(review): the worked example schema is written with single quotes, while the
 # schema substituted for `{schema}` is produced with `json.dumps` (double quotes) —
 # confirm whether the two should be made consistent.
 YAML_FORMAT_INSTRUCTIONS = """The output should be formatted as a YAML instance that conforms to the given JSON schema below.
 As an example, for the schema
 ```
 {{'title': 'Players', 'description': 'A list of players', 'type': 'array', 'items': {{'$ref': '#/definitions/Player'}}, 'definitions': {{'Player': {{'title': 'Player', 'type': 'object', 'properties': {{'name': {{'title': 'Name', 'description': 'Player name', 'type': 'string'}}, 'avg': {{'title': 'Avg', 'description': 'Batting average', 'type': 'number'}}}}, 'required': ['name', 'avg']}}}}}}
 ```
 a well formatted instance would be:
 ```
 - name: John Doe
  avg: 0.3
 - name: Jane Maxfield
  avg: 1.4
 ```
 Please follow the standard YAML formatting conventions with an indent of 2 spaces and make sure that the data types adhere strictly to the following JSON schema: 
 ```
 {schema}
 ```
 Make sure to always enclose the YAML output in triple backticks (```)"""
 XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file.
 1. Output should conform to the tags below. 
--- a/libs/langchain/langchain/output_parsers/yaml.py
+++ b/libs/langchain/langchain/output_parsers/yaml.py
@ -0,0 +1,58 @@
 import json
 import re
 from typing import Type, TypeVar
 import yaml
 from langchain_core.exceptions import OutputParserException
 from langchain_core.output_parsers import BaseOutputParser
 from langchain_core.pydantic_v1 import BaseModel, ValidationError
 from langchain.output_parsers.format_instructions import YAML_FORMAT_INSTRUCTIONS
 T = TypeVar("T", bound=BaseModel)
 class YamlOutputParser(BaseOutputParser[T]):
    """Parse YAML output using a pydantic model.

    The completion is searched for a triple-backtick block (optionally tagged
    ``yaml`` or ``yml``); its content is loaded with ``yaml.safe_load`` and
    validated against ``pydantic_object``.
    """

    pydantic_object: Type[T]
    """The pydantic model to parse."""
    pattern: re.Pattern = re.compile(
        r"^```(?:ya?ml)?(?P<yaml>[^`]*)", re.MULTILINE | re.DOTALL
    )
    """Regex pattern to match yaml code blocks
    within triple backticks with optional yaml or yml prefix."""

    def parse(self, text: str) -> T:
        """Parse a completion into an instance of ``pydantic_object``.

        Args:
            text: Raw completion text, ideally containing a fenced YAML block.

        Returns:
            The validated model instance.

        Raises:
            OutputParserException: If the YAML cannot be loaded or the loaded
                data fails validation. The underlying error is chained as
                ``__cause__``.
        """
        try:
            # Greedy search for 1st yaml candidate.
            match = re.search(self.pattern, text.strip())
            # Without a fenced block, try the whole completion as YAML instead
            # of parsing an empty string, which could never validate.
            yaml_str = match.group("yaml") if match else text
            json_object = yaml.safe_load(yaml_str)
            return self.pydantic_object.parse_obj(json_object)
        except (yaml.YAMLError, ValidationError) as e:
            name = self.pydantic_object.__name__
            msg = f"Failed to parse {name} from completion {text}. Got: {e}"
            # Chain the cause so the original YAML/validation error stays visible.
            raise OutputParserException(msg, llm_output=text) from e

    def get_format_instructions(self) -> str:
        """Return the prompt instructions with the model's JSON schema embedded.

        The top-level ``title`` and ``type`` entries are dropped from the
        schema as extraneous before it is serialized.
        """
        # `.schema()` can hand back a dict that pydantic caches on the model
        # class, so build a filtered copy rather than deleting keys in place.
        reduced_schema = {
            key: value
            for key, value in self.pydantic_object.schema().items()
            if key not in ("title", "type")
        }
        # json.dumps emits double-quoted keys and strings, keeping the schema
        # shown in the prompt well-formed.
        schema_str = json.dumps(reduced_schema)
        return YAML_FORMAT_INSTRUCTIONS.format(schema=schema_str)

    @property
    def _type(self) -> str:
        """Identifier string for this parser type."""
        return "yaml"
--- a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
@ -23,6 +23,7 @@ EXPECTED_ALL = [
    "JsonOutputToolsParser",
    "PydanticToolsParser",
    "JsonOutputKeyToolsParser",
    "YamlOutputParser",
 ]
--- a/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
@ -0,0 +1,83 @@
 """Test yamlOutputParser"""
 from enum import Enum
 from typing import Optional
 from langchain_core.exceptions import OutputParserException
 from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain.output_parsers.yaml import YamlOutputParser
 # Closed set of action names used as the type of `TestModel.action`.
 # Values are capitalized strings; the failing fixture uses a lowercase variant
 # to provoke a validation error.
 class Actions(Enum):
    SEARCH = "Search"
    CREATE = "Create"
    UPDATE = "Update"
    DELETE = "Delete"
 # Target pydantic model for the parser tests: an enum-typed field, a plain string,
 # an optional string defaulting to None, and a required string that holds
 # multi-line content.
 class TestModel(BaseModel):
    action: Actions = Field(description="Action to be performed")
    action_input: str = Field(description="Input to be used in the action")
    additional_fields: Optional[str] = Field(
        description="Additional fields", default=None
    )
    # Required (no default); populated from a YAML block scalar in DEF_RESULT.
    for_new_lines: str = Field(description="To be used to test newlines")
 # Prevent pytest from trying to run tests on TestModel
 TestModel.__test__ = False  # type: ignore[attr-defined]
 # Well-formed completion: a ```yaml fenced block with a `---` document marker and
 # a literal block scalar (`|`) whose nested indentation and trailing space must
 # survive parsing.
 DEF_RESULT = """```yaml
 ---
 action: Update
 action_input: The yamlOutputParser class is powerful
 additional_fields: null
 for_new_lines: |
  not_escape_newline:
   escape_newline: 
 ```"""
 # action 'update' with a lowercase 'u' to test schema validation failure.
 # NOTE(review): this sample also omits `for_new_lines`, which TestModel declares
 # without a default, so validation presumably fails on that as well — confirm
 # whether the fixture should isolate the enum error.
 DEF_RESULT_FAIL = """```yaml
 action: update
 action_input: The yamlOutputParser class is powerful
 additional_fields: null
 ```"""
 # Model instance that parsing DEF_RESULT is expected to produce.
 DEF_EXPECTED_RESULT = TestModel(
    action=Actions.UPDATE,
    action_input="The yamlOutputParser class is powerful",
    additional_fields=None,
    for_new_lines="not_escape_newline:\n escape_newline: \n",
 )
 def test_yaml_output_parser() -> None:
    """A well-formed fenced YAML completion parses into the expected model."""
    parser: YamlOutputParser[TestModel] = YamlOutputParser(pydantic_object=TestModel)
    parsed = parser.parse(DEF_RESULT)
    print("parse_result:", parsed)
    assert DEF_EXPECTED_RESULT == parsed
 def test_yaml_output_parser_fail() -> None:
    """A completion that violates the schema raises OutputParserException."""
    parser: YamlOutputParser[TestModel] = YamlOutputParser(pydantic_object=TestModel)
    caught: Optional[OutputParserException] = None
    try:
        parser.parse(DEF_RESULT_FAIL)
    except OutputParserException as exc:
        # Keep the exception so its message can be checked after the try block.
        caught = exc
    assert caught is not None, "Expected OutputParserException"
    print("parse_result:", caught)
    assert "Failed to parse TestModel from completion" in str(caught)