From b4e3e47c92563fb88455077b5a3f7b138eaf55a7 Mon Sep 17 00:00:00 2001
From: Thomas B <thobra@gmail.com>
Date: Wed, 13 Dec 2023 02:04:31 +0100
Subject: [PATCH] feat: Yaml output parser (#14496)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description
New YAML output parser as a drop-in replacement for the Pydantic output
parser. YAML is a much more token-efficient format than JSON; in the
benchmark below it was **~35% faster and used about the same percentage
fewer completion tokens**.

☑️ Formatted
☑️ Linted
☑️ Tested (analogous to the existing `test_pydantic_parser.py`)

The YAML parser excels in situations where a list of objects is
required, where the root object needs no key:
```python
class Products(BaseModel):
   __root__: list[Product]
```

I ran the prompt `Generate 10 healthy, organic products` 10 times on each
of two chains, one using the `PydanticOutputParser` and the other using
the `YamlOutputParser`, with `Products` (see below) as the target model
to be created.

LLMs used were Fireworks' `llama-v2-34b-code-instruct` and OpenAI
`gpt-3.5-turbo`. All runs succeeded without validation errors.

```python
class Nutrition(BaseModel):
    sugar: int = Field(description="Sugar in grams")
    fat: float = Field(description="% of daily fat intake")

class Product(BaseModel):
    name: str = Field(description="Product name")
    stats: Nutrition

class Products(BaseModel):
    """A list of products"""

    products: list[Product] # Used `__root__` for the yaml chain
```
Stats after 10 runs each were as follows:
### JSON
ø time: 7.75s
ø tokens: 380.8

### YAML
ø time: 5.12s
ø tokens: 242.2


Looking forward to feedback, tips and contributions!
---
 .../langchain/output_parsers/__init__.py      |  2 +
 .../output_parsers/format_instructions.py     | 20 +++++
 .../langchain/output_parsers/yaml.py          | 58 +++++++++++++
 .../unit_tests/output_parsers/test_imports.py |  1 +
 .../output_parsers/test_yaml_parser.py        | 83 +++++++++++++++++++
 5 files changed, 164 insertions(+)
 create mode 100644 libs/langchain/langchain/output_parsers/yaml.py
 create mode 100644 libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py

diff --git a/libs/langchain/langchain/output_parsers/__init__.py b/libs/langchain/langchain/output_parsers/__init__.py
index e37803b81d0..56e5a80b974 100644
--- a/libs/langchain/langchain/output_parsers/__init__.py
+++ b/libs/langchain/langchain/output_parsers/__init__.py
@@ -36,6 +36,7 @@ from langchain.output_parsers.regex_dict import RegexDictParser
 from langchain.output_parsers.retry import RetryOutputParser, RetryWithErrorOutputParser
 from langchain.output_parsers.structured import ResponseSchema, StructuredOutputParser
 from langchain.output_parsers.xml import XMLOutputParser
+from langchain.output_parsers.yaml import YamlOutputParser
 
 __all__ = [
     "BooleanOutputParser",
@@ -60,4 +61,5 @@ __all__ = [
     "JsonOutputToolsParser",
     "PydanticToolsParser",
     "JsonOutputKeyToolsParser",
+    "YamlOutputParser",
 ]
diff --git a/libs/langchain/langchain/output_parsers/format_instructions.py b/libs/langchain/langchain/output_parsers/format_instructions.py
index 91f2007b702..94ef87210fa 100644
--- a/libs/langchain/langchain/output_parsers/format_instructions.py
+++ b/libs/langchain/langchain/output_parsers/format_instructions.py
@@ -26,6 +26,26 @@ Here is the output schema:
 {schema}
 ```"""
 
+YAML_FORMAT_INSTRUCTIONS = """The output should be formatted as a YAML instance that conforms to the given JSON schema below.
+
+As an example, for the schema
+```
+{{'title': 'Players', 'description': 'A list of players', 'type': 'array', 'items': {{'$ref': '#/definitions/Player'}}, 'definitions': {{'Player': {{'title': 'Player', 'type': 'object', 'properties': {{'name': {{'title': 'Name', 'description': 'Player name', 'type': 'string'}}, 'avg': {{'title': 'Avg', 'description': 'Batting average', 'type': 'number'}}}}, 'required': ['name', 'avg']}}}}}}
+```
+a well formatted instance would be:
+```
+- name: John Doe
+  avg: 0.3
+- name: Jane Maxfield
+  avg: 1.4
+```
+
+Please follow the standard YAML formatting conventions with an indent of 2 spaces and make sure that the data types adhere strictly to the following JSON schema: 
+```
+{schema}
+```
+
+Make sure to always enclose the YAML output in triple backticks (```)"""  # str.format template: {schema} is substituted; doubled braces are literal.
 
 XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file.
 1. Output should conform to the tags below. 
diff --git a/libs/langchain/langchain/output_parsers/yaml.py b/libs/langchain/langchain/output_parsers/yaml.py
new file mode 100644
index 00000000000..69304c3e65b
--- /dev/null
+++ b/libs/langchain/langchain/output_parsers/yaml.py
@@ -0,0 +1,58 @@
+import json
+import re
+from typing import Type, TypeVar
+
+import yaml
+from langchain_core.exceptions import OutputParserException
+from langchain_core.output_parsers import BaseOutputParser
+from langchain_core.pydantic_v1 import BaseModel, ValidationError
+
+from langchain.output_parsers.format_instructions import YAML_FORMAT_INSTRUCTIONS
+
+T = TypeVar("T", bound=BaseModel)
+
+
+class YamlOutputParser(BaseOutputParser[T]):
+    """Parse YAML output using a pydantic model."""
+
+    pydantic_object: Type[T]
+    """The pydantic model to parse."""
+    pattern: re.Pattern = re.compile(
+        r"^```(?:ya?ml)?(?P<yaml>[^`]*)", re.MULTILINE | re.DOTALL
+    )
+    """Regex pattern to match yaml code blocks
+    within triple backticks with optional yaml or yml prefix."""
+
+    def parse(self, text: str) -> T:
+        """Parse `text` as YAML and validate it against `pydantic_object`."""
+        try:
+            # Take the first fenced block; if the model emitted no fences,
+            # treat the whole completion as the YAML document.
+            match = re.search(self.pattern, text.strip())
+            yaml_str = match.group("yaml") if match else text
+
+            json_object = yaml.safe_load(yaml_str)
+            return self.pydantic_object.parse_obj(json_object)
+        except (yaml.YAMLError, ValidationError) as e:
+            name = self.pydantic_object.__name__
+            msg = f"Failed to parse {name} from completion {text}. Got: {e}"
+            # Chain the cause so the YAML/validation traceback is preserved.
+            raise OutputParserException(msg, llm_output=text) from e
+
+    def get_format_instructions(self) -> str:
+        """Return the YAML prompt instructions with this model's JSON schema."""
+        # Build a new dict instead of deleting keys in place: pydantic v1
+        # caches the dict returned by `schema()`, so mutating it would leak
+        # the removal of "title"/"type" into every later `schema()` call.
+        schema = self.pydantic_object.schema()
+        extraneous = ("title", "type")
+        reduced_schema = {k: v for k, v in schema.items() if k not in extraneous}
+        # Ensure yaml in context is well-formed with double quotes.
+        schema_str = json.dumps(reduced_schema)
+
+        return YAML_FORMAT_INSTRUCTIONS.format(schema=schema_str)
+
+    @property
+    def _type(self) -> str:
+        """Identifier string for this parser type."""
+        return "yaml"
diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
index 1bd2bce22fc..a161448bbed 100644
--- a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
@@ -23,6 +23,7 @@ EXPECTED_ALL = [
     "JsonOutputToolsParser",
     "PydanticToolsParser",
     "JsonOutputKeyToolsParser",
+    "YamlOutputParser",
 ]
 
 
diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py b/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
new file mode 100644
index 00000000000..0fc9646fee7
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
@@ -0,0 +1,83 @@
+"""Test yamlOutputParser"""
+from enum import Enum
+from typing import Optional
+
+from langchain_core.exceptions import OutputParserException
+from langchain_core.pydantic_v1 import BaseModel, Field
+
+from langchain.output_parsers.yaml import YamlOutputParser
+
+
+class Actions(Enum):  # Values accepted by TestModel.action (case-sensitive).
+    SEARCH = "Search"
+    CREATE = "Create"
+    UPDATE = "Update"
+    DELETE = "Delete"
+
+
+class TestModel(BaseModel):
+    # Target model for the parser tests: an enum, a plain string, an optional
+    # string, and a multi-line string field.
+    action: Actions = Field(description="Action to be performed")
+    action_input: str = Field(description="Input to be used in the action")
+    additional_fields: Optional[str] = Field(None, description="Additional fields")
+    for_new_lines: str = Field(description="To be used to test newlines")
+
+
+# TestModel is a fixture, not a test class: tell pytest not to collect it.
+TestModel.__test__ = False  # type: ignore[attr-defined]
+
+
+DEF_RESULT = """```yaml
+---
+
+action: Update
+action_input: The yamlOutputParser class is powerful
+additional_fields: null
+for_new_lines: |
+  not_escape_newline:
+   escape_newline: 
+
+```"""  # NOTE: the trailing space after "escape_newline:" is intentional.
+
+# Lowercase "update" is not a valid Actions value, so schema validation must fail.
+DEF_RESULT_FAIL = """```yaml
+action: update
+action_input: The yamlOutputParser class is powerful
+additional_fields: null
+```"""  # NOTE(review): required "for_new_lines" is missing too -- confirm intent.
+
+DEF_EXPECTED_RESULT = TestModel(
+    action=Actions.UPDATE,
+    action_input="The yamlOutputParser class is powerful",
+    additional_fields=None,
+    for_new_lines="not_escape_newline:\n escape_newline: \n",
+)
+
+
+def test_yaml_output_parser() -> None:
+    """Parse a fenced YAML completion and compare against the expected model."""
+    parser: YamlOutputParser[TestModel] = YamlOutputParser(pydantic_object=TestModel)
+
+    # DEF_RESULT is wrapped in a ```yaml fence and starts with a `---` marker.
+    parsed = parser.parse(DEF_RESULT)
+    print("parse_result:", parsed)
+
+    # The comparison covers the enum member and the newlines in for_new_lines.
+    assert DEF_EXPECTED_RESULT == parsed
+
+
+def test_yaml_output_parser_fail() -> None:
+    """Test YamlOutputParser where completion result fails schema validation."""
+    parser: YamlOutputParser[TestModel] = YamlOutputParser(pydantic_object=TestModel)
+
+    error: Optional[OutputParserException] = None
+    try:
+        parser.parse(DEF_RESULT_FAIL)
+    except OutputParserException as exc:
+        error = exc
+
+    # Parsing must have raised, and the message must name the target model.
+    assert error is not None, "Expected OutputParserException"
+    print("parse_result:", error)
+    assert "Failed to parse TestModel from completion" in str(error)