From b4e3e47c92563fb88455077b5a3f7b138eaf55a7 Mon Sep 17 00:00:00 2001 From: Thomas B Date: Wed, 13 Dec 2023 02:04:31 +0100 Subject: [PATCH] feat: Yaml output parser (#14496) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description New YAML output parser as a drop-in replacement for the Pydantic output parser. YAML is a much more token-efficient format than JSON, proving to be **~35% faster and using the same percentage fewer completion tokens**. ☑️ Formatted ☑️ Linted ☑️ Tested (analogous to the existing `test_pydantic_parser.py`) The YAML parser excels in situations where a list of objects is required, where the root object needs no key: ```python class Products(BaseModel): __root__: list[Product] ``` I ran the prompt `Generate 10 healthy, organic products` 10 times on two chains, one using the `PydanticOutputParser`, the other using the `YamlOutputParser` with `Products` (see below) being the targeted model to be created. LLMs used were Fireworks' `llama-v2-34b-code-instruct` and OpenAI `gpt-3.5-turbo`. All runs succeeded without validation errors. ```python class Nutrition(BaseModel): sugar: int = Field(description="Sugar in grams") fat: float = Field(description="% of daily fat intake") class Product(BaseModel): name: str = Field(description="Product name") stats: Nutrition class Products(BaseModel): """A list of products""" products: list[Product] # Used `__root__` for the yaml chain ``` Stats after 10 runs each were as follows: ### JSON ø time: 7.75s ø tokens: 380.8 ### YAML ø time: 5.12s ø tokens: 242.2 Looking forward to feedback, tips and contributions! 
--- .../langchain/output_parsers/__init__.py | 2 + .../output_parsers/format_instructions.py | 20 +++++ .../langchain/output_parsers/yaml.py | 58 +++++++++++++ .../unit_tests/output_parsers/test_imports.py | 1 + .../output_parsers/test_yaml_parser.py | 83 +++++++++++++++++++ 5 files changed, 164 insertions(+) create mode 100644 libs/langchain/langchain/output_parsers/yaml.py create mode 100644 libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py diff --git a/libs/langchain/langchain/output_parsers/__init__.py b/libs/langchain/langchain/output_parsers/__init__.py index e37803b81d0..56e5a80b974 100644 --- a/libs/langchain/langchain/output_parsers/__init__.py +++ b/libs/langchain/langchain/output_parsers/__init__.py @@ -36,6 +36,7 @@ from langchain.output_parsers.regex_dict import RegexDictParser from langchain.output_parsers.retry import RetryOutputParser, RetryWithErrorOutputParser from langchain.output_parsers.structured import ResponseSchema, StructuredOutputParser from langchain.output_parsers.xml import XMLOutputParser +from langchain.output_parsers.yaml import YamlOutputParser __all__ = [ "BooleanOutputParser", @@ -60,4 +61,5 @@ __all__ = [ "JsonOutputToolsParser", "PydanticToolsParser", "JsonOutputKeyToolsParser", + "YamlOutputParser", ] diff --git a/libs/langchain/langchain/output_parsers/format_instructions.py b/libs/langchain/langchain/output_parsers/format_instructions.py index 91f2007b702..94ef87210fa 100644 --- a/libs/langchain/langchain/output_parsers/format_instructions.py +++ b/libs/langchain/langchain/output_parsers/format_instructions.py @@ -26,6 +26,26 @@ Here is the output schema: {schema} ```""" +YAML_FORMAT_INSTRUCTIONS = """The output should be formatted as a YAML instance that conforms to the given JSON schema below. 
+ +As an example, for the schema +``` +{{'title': 'Players', 'description': 'A list of players', 'type': 'array', 'items': {{'$ref': '#/definitions/Player'}}, 'definitions': {{'Player': {{'title': 'Player', 'type': 'object', 'properties': {{'name': {{'title': 'Name', 'description': 'Player name', 'type': 'string'}}, 'avg': {{'title': 'Avg', 'description': 'Batting average', 'type': 'number'}}}}, 'required': ['name', 'avg']}}}}}} +``` +a well formatted instance would be: +``` +- name: John Doe + avg: 0.3 +- name: Jane Maxfield + avg: 1.4 +``` + +Please follow the standard YAML formatting conventions with an indent of 2 spaces and make sure that the data types adhere strictly to the following JSON schema: +``` +{schema} +``` + +Make sure to always enclose the YAML output in triple backticks (```)""" XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file. 1. Output should conform to the tags below. diff --git a/libs/langchain/langchain/output_parsers/yaml.py b/libs/langchain/langchain/output_parsers/yaml.py new file mode 100644 index 00000000000..69304c3e65b --- /dev/null +++ b/libs/langchain/langchain/output_parsers/yaml.py @@ -0,0 +1,58 @@ +import json +import re +from typing import Type, TypeVar + +import yaml +from langchain_core.exceptions import OutputParserException +from langchain_core.output_parsers import BaseOutputParser +from langchain_core.pydantic_v1 import BaseModel, ValidationError + +from langchain.output_parsers.format_instructions import YAML_FORMAT_INSTRUCTIONS + +T = TypeVar("T", bound=BaseModel) + + +class YamlOutputParser(BaseOutputParser[T]): + """Parse YAML output using a pydantic model.""" + + pydantic_object: Type[T] + """The pydantic model to parse.""" + pattern: re.Pattern = re.compile( + r"^```(?:ya?ml)?(?P<yaml>[^`]*)", re.MULTILINE | re.DOTALL + ) + """Regex pattern to match yaml code blocks + within triple backticks with optional yaml or yml prefix.""" + + def parse(self, text: str) -> T: + try: + # Greedy search for 1st 
yaml candidate. + match = re.search(self.pattern, text.strip()) + yaml_str = "" + if match: + yaml_str = match.group("yaml") + + json_object = yaml.safe_load(yaml_str) + return self.pydantic_object.parse_obj(json_object) + + except (yaml.YAMLError, ValidationError) as e: + name = self.pydantic_object.__name__ + msg = f"Failed to parse {name} from completion {text}. Got: {e}" + raise OutputParserException(msg, llm_output=text) + + def get_format_instructions(self) -> str: + schema = self.pydantic_object.schema() + + # Remove extraneous fields. + reduced_schema = schema + if "title" in reduced_schema: + del reduced_schema["title"] + if "type" in reduced_schema: + del reduced_schema["type"] + # Ensure yaml in context is well-formed with double quotes. + schema_str = json.dumps(reduced_schema) + + return YAML_FORMAT_INSTRUCTIONS.format(schema=schema_str) + + @property + def _type(self) -> str: + return "yaml" diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py index 1bd2bce22fc..a161448bbed 100644 --- a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py +++ b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py @@ -23,6 +23,7 @@ EXPECTED_ALL = [ "JsonOutputToolsParser", "PydanticToolsParser", "JsonOutputKeyToolsParser", + "YamlOutputParser", ] diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py b/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py new file mode 100644 index 00000000000..0fc9646fee7 --- /dev/null +++ b/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py @@ -0,0 +1,83 @@ +"""Test yamlOutputParser""" +from enum import Enum +from typing import Optional + +from langchain_core.exceptions import OutputParserException +from langchain_core.pydantic_v1 import BaseModel, Field + +from langchain.output_parsers.yaml import YamlOutputParser + + +class Actions(Enum): + SEARCH = "Search" + CREATE = 
"Create" + UPDATE = "Update" + DELETE = "Delete" + + +class TestModel(BaseModel): + action: Actions = Field(description="Action to be performed") + action_input: str = Field(description="Input to be used in the action") + additional_fields: Optional[str] = Field( + description="Additional fields", default=None + ) + for_new_lines: str = Field(description="To be used to test newlines") + + +# Prevent pytest from trying to run tests on TestModel +TestModel.__test__ = False # type: ignore[attr-defined] + + +DEF_RESULT = """```yaml +--- + +action: Update +action_input: The yamlOutputParser class is powerful +additional_fields: null +for_new_lines: | + not_escape_newline: + escape_newline: + +```""" + +# action 'update' with a lowercase 'u' to test schema validation failure. +DEF_RESULT_FAIL = """```yaml +action: update +action_input: The yamlOutputParser class is powerful +additional_fields: null +```""" + +DEF_EXPECTED_RESULT = TestModel( + action=Actions.UPDATE, + action_input="The yamlOutputParser class is powerful", + additional_fields=None, + for_new_lines="not_escape_newline:\n escape_newline: \n", +) + + +def test_yaml_output_parser() -> None: + """Test yamlOutputParser.""" + + yaml_parser: YamlOutputParser[TestModel] = YamlOutputParser( + pydantic_object=TestModel + ) + + result = yaml_parser.parse(DEF_RESULT) + print("parse_result:", result) + assert DEF_EXPECTED_RESULT == result + + +def test_yaml_output_parser_fail() -> None: + """Test YamlOutputParser where completion result fails schema validation.""" + + yaml_parser: YamlOutputParser[TestModel] = YamlOutputParser( + pydantic_object=TestModel + ) + + try: + yaml_parser.parse(DEF_RESULT_FAIL) + except OutputParserException as e: + print("parse_result:", e) + assert "Failed to parse TestModel from completion" in str(e) + else: + assert False, "Expected OutputParserException"