diff --git a/libs/langchain/langchain/output_parsers/__init__.py b/libs/langchain/langchain/output_parsers/__init__.py
index e37803b81d0..56e5a80b974 100644
--- a/libs/langchain/langchain/output_parsers/__init__.py
+++ b/libs/langchain/langchain/output_parsers/__init__.py
@@ -36,6 +36,7 @@ from langchain.output_parsers.regex_dict import RegexDictParser
 from langchain.output_parsers.retry import RetryOutputParser, RetryWithErrorOutputParser
 from langchain.output_parsers.structured import ResponseSchema, StructuredOutputParser
 from langchain.output_parsers.xml import XMLOutputParser
+from langchain.output_parsers.yaml import YamlOutputParser
 
 __all__ = [
     "BooleanOutputParser",
@@ -60,4 +61,5 @@ __all__ = [
     "JsonOutputToolsParser",
     "PydanticToolsParser",
     "JsonOutputKeyToolsParser",
+    "YamlOutputParser",
 ]
diff --git a/libs/langchain/langchain/output_parsers/format_instructions.py b/libs/langchain/langchain/output_parsers/format_instructions.py
index 91f2007b702..94ef87210fa 100644
--- a/libs/langchain/langchain/output_parsers/format_instructions.py
+++ b/libs/langchain/langchain/output_parsers/format_instructions.py
@@ -26,6 +26,26 @@ Here is the output schema:
 ```
 {schema}
 ```"""
+YAML_FORMAT_INSTRUCTIONS = """The output should be formatted as a YAML instance that conforms to the given JSON schema below.
+
+As an example, for the schema
+```
+{{'title': 'Players', 'description': 'A list of players', 'type': 'array', 'items': {{'$ref': '#/definitions/Player'}}, 'definitions': {{'Player': {{'title': 'Player', 'type': 'object', 'properties': {{'name': {{'title': 'Name', 'description': 'Player name', 'type': 'string'}}, 'avg': {{'title': 'Avg', 'description': 'Batting average', 'type': 'number'}}}}, 'required': ['name', 'avg']}}}}}}
+```
+a well formatted instance would be:
+```
+- name: John Doe
+  avg: 0.3
+- name: Jane Maxfield
+  avg: 1.4
+```
+
+Please follow the standard YAML formatting conventions with an indent of 2 spaces and make sure that the data types adhere strictly to the following JSON schema:
+```
+{schema}
+```
+
+Make sure to always enclose the YAML output in triple backticks (```)"""
 
 XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file.
 1. Output should conform to the tags below.
diff --git a/libs/langchain/langchain/output_parsers/yaml.py b/libs/langchain/langchain/output_parsers/yaml.py
new file mode 100644
index 00000000000..69304c3e65b
--- /dev/null
+++ b/libs/langchain/langchain/output_parsers/yaml.py
@@ -0,0 +1,67 @@
+import json
+import re
+from typing import Type, TypeVar
+
+import yaml
+from langchain_core.exceptions import OutputParserException
+from langchain_core.output_parsers import BaseOutputParser
+from langchain_core.pydantic_v1 import BaseModel, ValidationError
+
+from langchain.output_parsers.format_instructions import YAML_FORMAT_INSTRUCTIONS
+
+T = TypeVar("T", bound=BaseModel)
+
+
+class YamlOutputParser(BaseOutputParser[T]):
+    """Parse YAML output using a pydantic model."""
+
+    pydantic_object: Type[T]
+    """The pydantic model to parse."""
+    pattern: re.Pattern = re.compile(
+        r"^```(?:ya?ml)?(?P<yaml>[^`]*)", re.MULTILINE | re.DOTALL
+    )
+    """Regex pattern to match yaml code blocks
+    within triple backticks with optional yaml or yml prefix."""
+
+    def parse(self, text: str) -> T:
+        """Parse the first fenced YAML block in ``text`` into the pydantic model.
+
+        If ``text`` contains no triple-backtick block, the whole text is
+        parsed as YAML instead.
+
+        Raises:
+            OutputParserException: If the YAML is malformed or does not
+                validate against ``pydantic_object``.
+        """
+        try:
+            # Greedy search for 1st yaml candidate.
+            match = re.search(self.pattern, text.strip())
+            # Without a fence, fall back to the raw completion rather than
+            # parsing an empty document.
+            yaml_str = match.group("yaml") if match else text
+
+            json_object = yaml.safe_load(yaml_str)
+            return self.pydantic_object.parse_obj(json_object)
+
+        except (yaml.YAMLError, ValidationError) as e:
+            name = self.pydantic_object.__name__
+            msg = f"Failed to parse {name} from completion {text}. Got: {e}"
+            raise OutputParserException(msg, llm_output=text) from e
+
+    def get_format_instructions(self) -> str:
+        """Return prompt instructions embedding the model's JSON schema."""
+        # Shallow-copy so deleting keys does not alter the dict returned by
+        # ``schema()``, which pydantic may cache on the model class.
+        reduced_schema = dict(self.pydantic_object.schema())
+
+        # Remove extraneous fields.
+        reduced_schema.pop("title", None)
+        reduced_schema.pop("type", None)
+        # Ensure yaml in context is well-formed with double quotes.
+        schema_str = json.dumps(reduced_schema)
+
+        return YAML_FORMAT_INSTRUCTIONS.format(schema=schema_str)
+
+    @property
+    def _type(self) -> str:
+        return "yaml"
diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
index 1bd2bce22fc..a161448bbed 100644
--- a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
@@ -23,6 +23,7 @@ EXPECTED_ALL = [
     "JsonOutputToolsParser",
     "PydanticToolsParser",
     "JsonOutputKeyToolsParser",
+    "YamlOutputParser",
 ]
 
 
diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py b/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
new file mode 100644
index 00000000000..0fc9646fee7
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
@@ -0,0 +1,83 @@
+"""Test yamlOutputParser"""
+from enum import Enum
+from typing import Optional
+
+from langchain_core.exceptions import OutputParserException
+from langchain_core.pydantic_v1 import BaseModel, Field
+
+from langchain.output_parsers.yaml import YamlOutputParser
+
+
+class Actions(Enum):
+    SEARCH = "Search"
+    CREATE = "Create"
+    UPDATE = "Update"
+    DELETE = "Delete"
+
+
+class TestModel(BaseModel):
+    action: Actions = Field(description="Action to be performed")
+    action_input: str = Field(description="Input to be used in the action")
+    additional_fields: Optional[str] = Field(
+        description="Additional fields", default=None
+    )
+    for_new_lines: str = Field(description="To be used to test newlines")
+
+
+# Prevent pytest from trying to run tests on TestModel
+TestModel.__test__ = False  # type: ignore[attr-defined]
+
+
+DEF_RESULT = """```yaml
+---
+
+action: Update
+action_input: The yamlOutputParser class is powerful
+additional_fields: null
+for_new_lines: |
+  not_escape_newline:
+   escape_newline: 
+
+```"""
+
+# action 'update' with a lowercase 'u' to test schema validation failure.
+DEF_RESULT_FAIL = """```yaml
+action: update
+action_input: The yamlOutputParser class is powerful
+additional_fields: null
+```"""
+
+DEF_EXPECTED_RESULT = TestModel(
+    action=Actions.UPDATE,
+    action_input="The yamlOutputParser class is powerful",
+    additional_fields=None,
+    for_new_lines="not_escape_newline:\n escape_newline: \n",
+)
+
+
+def test_yaml_output_parser() -> None:
+    """Test yamlOutputParser."""
+
+    yaml_parser: YamlOutputParser[TestModel] = YamlOutputParser(
+        pydantic_object=TestModel
+    )
+
+    result = yaml_parser.parse(DEF_RESULT)
+    print("parse_result:", result)
+    assert DEF_EXPECTED_RESULT == result
+
+
+def test_yaml_output_parser_fail() -> None:
+    """Test YamlOutputParser where completion result fails schema validation."""
+
+    yaml_parser: YamlOutputParser[TestModel] = YamlOutputParser(
+        pydantic_object=TestModel
+    )
+
+    try:
+        yaml_parser.parse(DEF_RESULT_FAIL)
+    except OutputParserException as e:
+        print("parse_result:", e)
+        assert "Failed to parse TestModel from completion" in str(e)
+    else:
+        assert False, "Expected OutputParserException"