Fix test_parser_with_table

2025-06-26 00:23:25 +00:00 · 2025-02-26 13:49:46 +01:00 · 2025-02-26 13:49:46 +01:00 · 898e2a5b51
commit 898e2a5b51
parent be47099747
1 changed files with 43 additions and 47 deletions
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@ -247,55 +247,9 @@ def test_parser_with_table(
    mode: str,
    extract_tables: str,
 ) -> None:
    from PIL.Image import Image
    from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
    def _std_assert_with_parser(parser: BaseBlobParser) -> None:
        """Standard tests to verify that the given parser works.
        Args:
            parser (BaseBlobParser): The parser to test.
        """
        blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
        doc_generator = parser.lazy_parse(blob)
        docs = list(doc_generator)
        tables = []
        for doc in docs:
            if extract_tables == "markdown":
                pattern = (
                    r"(?s)("
                    r"(?:(?:[^\n]*\|)\n)"
                    r"(?:\|(?:\s?:?---*:?\s?\|)+)\n"
                    r"(?:(?:[^\n]*\|)\n)+"
                    r")"
                )
            elif extract_tables == "html":
                pattern = r"(?s)(<table[^>]*>(?:.*?)<\/table>)"
            elif extract_tables == "csv":
                pattern = (
                    r"((?:(?:"
                    r'(?:"(?:[^"]*(?:""[^"]*)*)"'
                    r"|[^\n,]*),){2,}"
                    r"(?:"
                    r'(?:"(?:[^"]*(?:""[^"]*)*)"'
                    r"|[^\n]*))\n){2,})"
                )
            else:
                pattern = None
            if pattern:
                matches = re.findall(pattern, doc.page_content)
                if matches:
                    tables.extend(matches)
        if extract_tables:
            assert len(tables) >= 1
        else:
            assert not len(tables)
    class EmptyImageBlobParser(BaseImageBlobParser):
        def _analyze_image(self, img: Image) -> str:
            return "![image](.)"
    parser_class = getattr(pdf_parsers, parser_factory)
    parser = parser_class(
@ -304,4 +258,46 @@ def test_parser_with_table(
        images_parser=EmptyImageBlobParser(),
        **params,
    )
-    _std_assert_with_parser(parser)
+    _std_assert_table_with_parser(extract_tables, parser)
 def _std_assert_table_with_parser(extract_tables: str, parser: BaseBlobParser) -> None:
    """Standard tests to verify that the given parser works.
    Args:
        parser (BaseBlobParser): The parser to test.
    """
    blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
    doc_generator = parser.lazy_parse(blob)
    docs = list(doc_generator)
    tables = []
    for doc in docs:
        if extract_tables == "markdown":
            pattern = (
                r"(?s)("
                r"(?:(?:[^\n]*\|)\n)"
                r"(?:\|(?:\s?:?---*:?\s?\|)+)\n"
                r"(?:(?:[^\n]*\|)\n)+"
                r")"
            )
        elif extract_tables == "html":
            pattern = r"(?s)(<table[^>]*>(?:.*?)<\/table>)"
        elif extract_tables == "csv":
            pattern = (
                r"((?:(?:"
                r'(?:"(?:[^"]*(?:""[^"]*)*)"'
                r"|[^\n,]*),){2,}"
                r"(?:"
                r'(?:"(?:[^"]*(?:""[^"]*)*)"'
                r"|[^\n]*))\n){2,})"
            )
        else:
            pattern = None
        if pattern:
            matches = re.findall(pattern, doc.page_content)
            if matches:
                tables.extend(matches)
    if extract_tables:
        assert len(tables) >= 1
    else:
        assert not len(tables)