From ec090745a6f41793b7c72e1710efdd0a00a8dbec Mon Sep 17 00:00:00 2001 From: Nuno Campos Date: Fri, 29 Dec 2023 09:59:21 -0800 Subject: [PATCH] Improve markdown list parser (#15295) - do not match text after - in the middle of a sentence --- libs/core/langchain_core/output_parsers/list.py | 6 +++--- libs/core/tests/unit_tests/output_parsers/test_json.py | 1 + .../tests/unit_tests/output_parsers/test_list_parser.py | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/libs/core/langchain_core/output_parsers/list.py b/libs/core/langchain_core/output_parsers/list.py index ac9f6a5f2bf..36b9f7e9b77 100644 --- a/libs/core/langchain_core/output_parsers/list.py +++ b/libs/core/langchain_core/output_parsers/list.py @@ -154,18 +154,18 @@ class NumberedListOutputParser(ListOutputParser): class MarkdownListOutputParser(ListOutputParser): """Parse a markdown list.""" - pattern = r"-\s([^\n]+)" + pattern = r"^\s*[-*]\s([^\n]+)$" def get_format_instructions(self) -> str: return "Your response should be a markdown list, " "eg: `- foo\n- bar\n- baz`" def parse(self, text: str) -> List[str]: """Parse the output of an LLM call.""" - return re.findall(self.pattern, text) + return re.findall(self.pattern, text, re.MULTILINE) def parse_iter(self, text: str) -> Iterator[re.Match]: """Parse the output of an LLM call.""" - return re.finditer(self.pattern, text) + return re.finditer(self.pattern, text, re.MULTILINE) @property def _type(self) -> str: diff --git a/libs/core/tests/unit_tests/output_parsers/test_json.py b/libs/core/tests/unit_tests/output_parsers/test_json.py index 8b2bc7d29a4..89362057445 100644 --- a/libs/core/tests/unit_tests/output_parsers/test_json.py +++ b/libs/core/tests/unit_tests/output_parsers/test_json.py @@ -147,6 +147,7 @@ TEST_CASES = [ NO_TICKS_WHITE_SPACE, TEXT_BEFORE, TEXT_AFTER, + TEXT_BEFORE_AND_AFTER, ] diff --git a/libs/core/tests/unit_tests/output_parsers/test_list_parser.py b/libs/core/tests/unit_tests/output_parsers/test_list_parser.py index 4e2f506264b..710f9e52cee 100644 --- a/libs/core/tests/unit_tests/output_parsers/test_list_parser.py +++ b/libs/core/tests/unit_tests/output_parsers/test_list_parser.py @@ -51,7 +51,7 @@ def test_numbered_list() -> None: "For example: \n\n1. foo\n\n2. bar\n\n3. baz" ) - text2 = "Items:\n\n1. apple\n\n2. banana\n\n3. cherry" + text2 = "Items:\n\n1. apple\n\n 2. banana\n\n3. cherry" text3 = "No items in the list." @@ -82,11 +82,11 @@ def test_numbered_list() -> None: def test_markdown_list() -> None: parser = MarkdownListOutputParser() text1 = ( - "Your response should be a numbered list with each item on a new line." + "Your response should be a numbered - not a list item - list with each item on a new line." # noqa: E501 "For example: \n- foo\n- bar\n- baz" ) - text2 = "Items:\n- apple\n- banana\n- cherry" + text2 = "Items:\n- apple\n - banana\n- cherry" text3 = "No items in the list."