Enhancement on feature/yaml output parser (#14674)

Building on my previously merged PR, I made some further improvements: * Added documentation to the existing Pydantic Parser notebook, with an example using LCEL and `with_retry()` on `OutputParserException`. * Added an additional output example to the prompt * More lenient parser in terms of LLM output format * Amended unit test FYI @hwchase17 --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
2025-09-22 11:00:37 +00:00 · 2024-01-02 01:49:58 +01:00
parent ff10f30149
commit 9d8468a576
5 changed files with 153 additions and 8 deletions
--- a/libs/langchain/langchain/output_parsers/format_instructions.py
+++ b/libs/langchain/langchain/output_parsers/format_instructions.py
@@ -28,11 +28,12 @@ Here is the output schema:

 YAML_FORMAT_INSTRUCTIONS = """The output should be formatted as a YAML instance that conforms to the given JSON schema below.

-As an example, for the schema
+# Examples
+## Schema
 ```
-{{'title': 'Players', 'description': 'A list of players', 'type': 'array', 'items': {{'$ref': '#/definitions/Player'}}, 'definitions': {{'Player': {{'title': 'Player', 'type': 'object', 'properties': {{'name': {{'title': 'Name', 'description': 'Player name', 'type': 'string'}}, 'avg': {{'title': 'Avg', 'description': 'Batting average', 'type': 'number'}}}}, 'required': ['name', 'avg']}}}}}}
+{{"title": "Players", "description": "A list of players", "type": "array", "items": {{"$ref": "#/definitions/Player"}}, "definitions": {{"Player": {{"title": "Player", "type": "object", "properties": {{"name": {{"title": "Name", "description": "Player name", "type": "string"}}, "avg": {{"title": "Avg", "description": "Batting average", "type": "number"}}}}, "required": ["name", "avg"]}}}}}}
 ```
-a well formatted instance would be:
+## Well formatted instance
 ```
 - name: John Doe
  avg: 0.3
@@ -40,12 +41,22 @@ a well formatted instance would be:
  avg: 1.4
 ```

+## Schema
+```
+{{"properties": {{"habit": {{ "description": "A common daily habit", "type": "string" }}, "sustainable_alternative": {{ "description": "An environmentally friendly alternative to the habit", "type": "string"}}}}, "required": ["habit", "sustainable_alternative"]}}
+```
+## Well formatted instance
+```
+habit: Using disposable water bottles for daily hydration.
+sustainable_alternative: Switch to a reusable water bottle to reduce plastic waste and decrease your environmental footprint.
+``` 
+
 Please follow the standard YAML formatting conventions with an indent of 2 spaces and make sure that the data types adhere strictly to the following JSON schema: 
 ```
 {schema}
 ```

-Make sure to always enclose the YAML output in triple backticks (```)"""
+Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!"""


 PANDAS_DATAFRAME_FORMAT_INSTRUCTIONS = """The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
--- a/libs/langchain/langchain/output_parsers/yaml.py
+++ b/libs/langchain/langchain/output_parsers/yaml.py
@@ -30,6 +30,9 @@ class YamlOutputParser(BaseOutputParser[T]):
            yaml_str = ""
            if match:
                yaml_str = match.group("yaml")
+            else:
+                # If no backticks were present, try to parse the entire output as yaml.
+                yaml_str = text

            json_object = yaml.safe_load(yaml_str)
            return self.pydantic_object.parse_obj(json_object)
@@ -37,7 +40,7 @@ class YamlOutputParser(BaseOutputParser[T]):
        except (yaml.YAMLError, ValidationError) as e:
            name = self.pydantic_object.__name__
            msg = f"Failed to parse {name} from completion {text}. Got: {e}"
-            raise OutputParserException(msg, llm_output=text)
+            raise OutputParserException(msg, llm_output=text) from e

    def get_format_instructions(self) -> str:
        schema = self.pydantic_object.schema()
--- a/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_yaml_parser.py
@@ -2,6 +2,7 @@
 from enum import Enum
 from typing import Optional

+import pytest
 from langchain_core.exceptions import OutputParserException
 from langchain_core.pydantic_v1 import BaseModel, Field

@@ -39,6 +40,15 @@ for_new_lines: |
   escape_newline: 

 ```"""
+DEF_RESULT_NO_BACKTICKS = """
+action: Update
+action_input: The yamlOutputParser class is powerful
+additional_fields: null
+for_new_lines: |
+  not_escape_newline:
+   escape_newline: 
+
+"""

 # action 'update' with a lowercase 'u' to test schema validation failure.
 DEF_RESULT_FAIL = """```yaml
@@ -55,16 +65,17 @@ DEF_EXPECTED_RESULT = TestModel(
 )


-def test_yaml_output_parser() -> None:
+@pytest.mark.parametrize("result", [DEF_RESULT, DEF_RESULT_NO_BACKTICKS])
+def test_yaml_output_parser(result: str) -> None:
    """Test yamlOutputParser."""

    yaml_parser: YamlOutputParser[TestModel] = YamlOutputParser(
        pydantic_object=TestModel
    )

-    result = yaml_parser.parse(DEF_RESULT)
+    model = yaml_parser.parse(result)
    print("parse_result:", result)
-    assert DEF_EXPECTED_RESULT == result
+    assert DEF_EXPECTED_RESULT == model


 def test_yaml_output_parser_fail() -> None: