Fix test_parser_with_table

2025-08-18 00:51:18 +00:00 · 2025-02-26 14:07:36 +01:00 · 2025-02-26 14:07:36 +01:00 · 4f9bcf285b
commit 4f9bcf285b
parent 898e2a5b51
3 changed files with 26 additions and 18 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/images.py
+++ b/libs/community/langchain_community/document_loaders/parsers/images.py
@@ -48,7 +48,10 @@ class BaseImageBlobParser(BaseBlobParser):

            with blob.as_bytes_io() as buf:
                if blob.mimetype == "application/x-npy":
+                    try:
                        img = Img.fromarray(numpy.load(buf))
+                    except EOFError:
+                        return  # Ignore too small images
                else:
                    img = Img.open(buf)
                content = self._analyze_image(img)
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -1476,7 +1476,7 @@ class PDFPlumberParser(BaseBlobParser):
            raise ValueError("mode must be single or page")
        if extract_tables and extract_tables not in ["csv", "markdown", "html"]:
            raise ValueError("mode must be csv, markdown or html")
-        if not extract_images and not images_parser:
+        if extract_images and not images_parser:
            images_parser = RapidOCRBlobParser()
        self.password = password
        self.extract_images = extract_images
@@ -1554,6 +1554,10 @@ class PDFPlumberParser(BaseBlobParser):
                    elif isinstance(content, list):  # Table
                        page_text.append(_JOIN_TABLES + self._convert_table(content))
                    else:  # Image
+                        if self.images_parser:
+                            try:
+                                from PIL import Image as Img
+                                Img.fromarray(content) # Check if image is valid
                                image_bytes = io.BytesIO()
                                numpy.save(image_bytes, content)
                                blob = Blob.from_data(
@@ -1567,6 +1571,10 @@ class PDFPlumberParser(BaseBlobParser):
                                        blob, text_from_image, self.images_inner_format
                                    )
                                )
+                            except TypeError:
+                                pass
+                            except EOFError:
+                                pass

                all_text = _merge_text_and_extras(extras, "".join(page_text).strip())

--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -247,9 +247,6 @@ def test_parser_with_table(
    mode: str,
    extract_tables: str,
 ) -> None:
-
-    from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
-
    parser_class = getattr(pdf_parsers, parser_factory)

    parser = parser_class(