Fix test_parser_with_table

2025-08-17 00:17:47 +00:00 · 2025-02-26 14:07:36 +01:00 · 2025-02-26 14:07:36 +01:00 · 4f9bcf285b
commit 4f9bcf285b
parent 898e2a5b51
3 changed files with 26 additions and 18 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/images.py
+++ b/libs/community/langchain_community/document_loaders/parsers/images.py
@@ -48,7 +48,10 @@ class BaseImageBlobParser(BaseBlobParser):

            with blob.as_bytes_io() as buf:
                if blob.mimetype == "application/x-npy":
-                    img = Img.fromarray(numpy.load(buf))
+                    try:
+                        img = Img.fromarray(numpy.load(buf))
+                    except EOFError:
+                        return  # Ignore too small images
                else:
                    img = Img.open(buf)
                content = self._analyze_image(img)
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -1476,7 +1476,7 @@ class PDFPlumberParser(BaseBlobParser):
            raise ValueError("mode must be single or page")
        if extract_tables and extract_tables not in ["csv", "markdown", "html"]:
            raise ValueError("mode must be csv, markdown or html")
-        if not extract_images and not images_parser:
+        if extract_images and not images_parser:
            images_parser = RapidOCRBlobParser()
        self.password = password
        self.extract_images = extract_images
@@ -1554,19 +1554,27 @@ class PDFPlumberParser(BaseBlobParser):
                    elif isinstance(content, list):  # Table
                        page_text.append(_JOIN_TABLES + self._convert_table(content))
                    else:  # Image
-                        image_bytes = io.BytesIO()
-                        numpy.save(image_bytes, content)
-                        blob = Blob.from_data(
-                            image_bytes.getvalue(), mime_type="application/x-npy"
-                        )
-                        text_from_image = next(
-                            self.images_parser.lazy_parse(blob)  # type: ignore
-                        ).page_content
-                        extras.append(
-                            _format_inner_image(
-                                blob, text_from_image, self.images_inner_format
-                            )
-                        )
+                        if self.images_parser:
+                            try:
+                                from PIL import Image as Img
+                                Img.fromarray(content) # Check if image is valid
+                                image_bytes = io.BytesIO()
+                                numpy.save(image_bytes, content)
+                                blob = Blob.from_data(
+                                    image_bytes.getvalue(), mime_type="application/x-npy"
+                                )
+                                text_from_image = next(
+                                    self.images_parser.lazy_parse(blob)  # type: ignore
+                                ).page_content
+                                extras.append(
+                                    _format_inner_image(
+                                        blob, text_from_image, self.images_inner_format
+                                    )
+                                )
+                            except TypeError:
+                                pass
+                            except EOFError:
+                                pass

                all_text = _merge_text_and_extras(extras, "".join(page_text).strip())

--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -247,9 +247,6 @@ def test_parser_with_table(
    mode: str,
    extract_tables: str,
 ) -> None:
-
-    from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
-
    parser_class = getattr(pdf_parsers, parser_factory)

    parser = parser_class(