From 898e2a5b516bff5227556f9a4c023aa6fa7ca19d Mon Sep 17 00:00:00 2001
From: Philippe Prados <github@prados.fr>
Date: Wed, 26 Feb 2025 13:49:46 +0100
Subject: [PATCH] Fix test_parser_with_table

---
 .../parsers/test_pdf_parsers.py               | 90 +++++++++----------
 1 file changed, 43 insertions(+), 47 deletions(-)

diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
index 8d1ef117494..3fd839f2589 100644
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -247,55 +247,9 @@ def test_parser_with_table(
     mode: str,
     extract_tables: str,
 ) -> None:
-    from PIL.Image import Image
 
     from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
 
-    def _std_assert_with_parser(parser: BaseBlobParser) -> None:
-        """Standard tests to verify that the given parser works.
-
-        Args:
-            parser (BaseBlobParser): The parser to test.
-        """
-        blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
-        doc_generator = parser.lazy_parse(blob)
-        docs = list(doc_generator)
-        tables = []
-        for doc in docs:
-            if extract_tables == "markdown":
-                pattern = (
-                    r"(?s)("
-                    r"(?:(?:[^\n]*\|)\n)"
-                    r"(?:\|(?:\s?:?---*:?\s?\|)+)\n"
-                    r"(?:(?:[^\n]*\|)\n)+"
-                    r")"
-                )
-            elif extract_tables == "html":
-                pattern = r"(?s)(<table[^>]*>(?:.*?)<\/table>)"
-            elif extract_tables == "csv":
-                pattern = (
-                    r"((?:(?:"
-                    r'(?:"(?:[^"]*(?:""[^"]*)*)"'
-                    r"|[^\n,]*),){2,}"
-                    r"(?:"
-                    r'(?:"(?:[^"]*(?:""[^"]*)*)"'
-                    r"|[^\n]*))\n){2,})"
-                )
-            else:
-                pattern = None
-            if pattern:
-                matches = re.findall(pattern, doc.page_content)
-                if matches:
-                    tables.extend(matches)
-        if extract_tables:
-            assert len(tables) >= 1
-        else:
-            assert not len(tables)
-
-    class EmptyImageBlobParser(BaseImageBlobParser):
-        def _analyze_image(self, img: Image) -> str:
-            return "![image](.)"
-
     parser_class = getattr(pdf_parsers, parser_factory)
 
     parser = parser_class(
@@ -304,4 +258,46 @@ def test_parser_with_table(
         images_parser=EmptyImageBlobParser(),
         **params,
     )
-    _std_assert_with_parser(parser)
+    _std_assert_table_with_parser(extract_tables, parser)
+
+
+def _std_assert_table_with_parser(extract_tables: str, parser: BaseBlobParser) -> None:
+    """Standard tests to verify that the given parser works.
+
+    Args:
+        parser (BaseBlobParser): The parser to test.
+    """
+    blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
+    doc_generator = parser.lazy_parse(blob)
+    docs = list(doc_generator)
+    tables = []
+    for doc in docs:
+        if extract_tables == "markdown":
+            pattern = (
+                r"(?s)("
+                r"(?:(?:[^\n]*\|)\n)"
+                r"(?:\|(?:\s?:?---*:?\s?\|)+)\n"
+                r"(?:(?:[^\n]*\|)\n)+"
+                r")"
+            )
+        elif extract_tables == "html":
+            pattern = r"(?s)(<table[^>]*>(?:.*?)<\/table>)"
+        elif extract_tables == "csv":
+            pattern = (
+                r"((?:(?:"
+                r'(?:"(?:[^"]*(?:""[^"]*)*)"'
+                r"|[^\n,]*),){2,}"
+                r"(?:"
+                r'(?:"(?:[^"]*(?:""[^"]*)*)"'
+                r"|[^\n]*))\n){2,})"
+            )
+        else:
+            pattern = None
+        if pattern:
+            matches = re.findall(pattern, doc.page_content)
+            if matches:
+                tables.extend(matches)
+    if extract_tables:
+        assert len(tables) >= 1
+    else:
+        assert not len(tables)