From dd909d29146c5f765188ddeffcbbf8085fea9b09 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Thu, 13 Mar 2025 15:41:41 +0100 Subject: [PATCH] Fix revue --- .../parsers/test_pdf_parsers.py | 82 +++++++++++-------- .../document_loaders/test_pdf.py | 19 +++-- .../parsers/test_pdf_parsers.py | 21 +++-- 3 files changed, 71 insertions(+), 51 deletions(-) diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py index e49bbcc071c..c61b0074333 100644 --- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py @@ -9,7 +9,16 @@ import pytest import langchain_community.document_loaders.parsers as pdf_parsers from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob -from langchain_community.document_loaders.parsers import BaseImageBlobParser +from langchain_community.document_loaders.parsers import ( + BaseImageBlobParser, + PDFPlumberParser, +) +from langchain_community.document_loaders.parsers.pdf import ( + PDFMinerParser, + PyMuPDFParser, + PyPDFium2Parser, + PyPDFParser, +) if TYPE_CHECKING: from PIL.Image import Image @@ -102,25 +111,25 @@ class EmptyImageBlobParser(BaseImageBlobParser): [("single", EmptyImageBlobParser()), ("page", None)], ) @pytest.mark.parametrize( - "parser_factory,params", + "parser_class,params", [ - ("PDFMinerParser", {}), - ("PDFPlumberParser", {}), - ("PyMuPDFParser", {}), - ("PyPDFium2Parser", {}), - ("PyPDFParser", {"extraction_mode": "plain"}), - ("PyPDFParser", {"extraction_mode": "layout"}), + (PDFMinerParser, {}), + (PDFPlumberParser, {}), + (PyMuPDFParser, {}), + (PyPDFium2Parser, {}), + (PyPDFParser, {"extraction_mode": "plain"}), + (PyPDFParser, {"extraction_mode": "layout"}), ], ) @pytest.mark.requires("pillow") def test_mode_and_extract_images_variations( - parser_factory: str, + parser_class: Type, params: dict, mode: str, image_parser: BaseImageBlobParser, ) -> None: _test_matrix( - parser_factory, + parser_class, params, mode, image_parser, @@ -133,19 +142,19 @@ def test_mode_and_extract_images_variations( ["text", "markdown-img", "html-img"], ) @pytest.mark.parametrize( - "parser_factory,params", + "parser_class,params", [ - ("PDFMinerParser", {}), - ("PDFPlumberParser", {}), - ("PyMuPDFParser", {}), - ("PyPDFium2Parser", {}), - ("PyPDFParser", {"extraction_mode": "plain"}), - ("PyPDFParser", {"extraction_mode": "layout"}), + (PDFMinerParser, {}), + (PDFPlumberParser, {}), + (PyMuPDFParser, {}), + (PyPDFium2Parser, {}), + (PyPDFParser, {"extraction_mode": "plain"}), + (PyPDFParser, {"extraction_mode": "layout"}), ], ) @pytest.mark.requires("pillow") def test_mode_and_image_formats_variations( - parser_factory: str, + parser_class: Type, params: dict, images_inner_format: str, ) -> None: @@ -153,7 +162,7 @@ def test_mode_and_image_formats_variations( image_parser = EmptyImageBlobParser() _test_matrix( - parser_factory, + parser_class, params, mode, image_parser, @@ -162,7 +171,7 @@ def test_mode_and_image_formats_variations( def _test_matrix( - parser_factory: str, + parser_class: Type, params: dict, mode: str, image_parser: BaseImageBlobParser, @@ -214,8 +223,6 @@ def _test_matrix( assert len(docs) parser.password = old_password - parser_class = getattr(pdf_parsers, parser_factory) - parser = parser_class( mode=mode, images_parser=image_parser, @@ -235,30 +242,25 @@ def _test_matrix( ["markdown", "html", "csv", None], ) @pytest.mark.parametrize( - "parser_factory,params", + "parser_class,params", [ - ("PDFPlumberParser", {}), - ("PyMuPDFParser", {}), + (PDFPlumberParser, {}), + (PyMuPDFParser, {}), ], ) def test_parser_with_table( - parser_factory: str, + parser_class: Type, params: dict, mode: str, extract_tables: str, ) -> None: parser_class = getattr(pdf_parsers, parser_factory) - parser = parser_class( - mode=mode, - extract_tables=extract_tables, - images_parser=EmptyImageBlobParser(), - **params, - ) - _std_assert_table_with_parser(extract_tables, parser) + from PIL.Image import Image + from langchain_community.document_loaders.parsers.images import BaseImageBlobParser -def _std_assert_table_with_parser(extract_tables: str, parser: BaseBlobParser) -> None: + def _std_assert_with_parser(parser: BaseBlobParser) -> None: """Standard tests to verify that the given parser works. Args: @@ -298,3 +300,15 @@ def _std_assert_table_with_parser(extract_tables: str, parser: BaseBlobParser) - assert len(tables) >= 1 else: assert not len(tables) + + class EmptyImageBlobParser(BaseImageBlobParser): + def _analyze_image(self, img: Image) -> str: + return "![image](.)" + + parser = parser_class( + mode=mode, + extract_tables=extract_tables, + images_parser=EmptyImageBlobParser(), + **params, + ) + _std_assert_with_parser(parser) diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index dc30b15e77f..728ce4d11e1 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -8,7 +8,11 @@ import langchain_community.document_loaders as pdf_loaders from langchain_community.document_loaders import ( AmazonTextractPDFLoader, MathpixPDFLoader, + PDFMinerLoader, PDFMinerPDFasHTMLLoader, + PyMuPDFLoader, + PyPDFium2Loader, + PyPDFLoader, UnstructuredPDFLoader, ) @@ -164,20 +168,19 @@ def test_amazontextract_loader_failures() -> None: @pytest.mark.parametrize( - "parser_factory,params", + "loader_class,params", [ - ("PDFMinerLoader", {}), - ("PDFPlumberLoader", {}), - ("PyMuPDFLoader", {}), - ("PyPDFium2Loader", {}), - ("PyPDFLoader", {}), + (PDFMinerLoader, {}), + (PDFPlumberLoader, {}), + (PyMuPDFLoader, {}), + (PyPDFium2Loader, {}), + (PyPDFLoader, {}), ], ) def test_standard_parameters( - parser_factory: str, + loader_class: Type, params: dict, ) -> None: - loader_class = getattr(pdf_loaders, parser_factory) file_path = Path(__file__).parent.parent / "examples/hello.pdf" loader = loader_class(file_path) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py index 6dfaf9ccf40..a458b4de587 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py @@ -10,6 +10,10 @@ import langchain_community.document_loaders.parsers as pdf_parsers from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.parsers.pdf import ( + PDFMinerParser, + PyMuPDFParser, + PyPDFium2Parser, + PyPDFParser, _merge_text_and_extras, ) @@ -74,25 +78,24 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True) @pytest.mark.parametrize( - "parser_factory,require,params", + "parser_class,require,params", [ - ("PDFMinerParser", "pdfminer", {"splits_by_page": False}), - ("PDFPlumberParser", "pdfplumber", {}), - ("PyMuPDFParser", "pymupdf", {}), - ("PyPDFParser", "pypdf", {}), - ("PyPDFium2Parser", "pypdfium2", {}), + (PDFMinerParser, "pdfminer", {"splits_by_page": False}), + (PDFPlumberParser, "pdfplumber", {}), + (PyMuPDFParser, "pymupdf", {}), + (PyPDFParser, "pypdf", {}), + (PyPDFium2Parser, "pypdfium2", {}), ], ) def test_parsers( - parser_factory: str, + parser_class: Type, require: str, params: dict[str, Any], ) -> None: try: require = require.replace("-", "") importlib.import_module(require, package=None) - parser_class = getattr(pdf_parsers, parser_factory) parser = parser_class() _assert_with_parser(parser, **params) except ModuleNotFoundError: - pytest.skip(f"{parser_factory} skiped. Require '{require}'") + pytest.skip(f"{parser_class} skiped. Require '{require}'")