diff --git a/libs/community/langchain_community/document_loaders/parsers/images.py b/libs/community/langchain_community/document_loaders/parsers/images.py index b053b94d491..e8df541291f 100644 --- a/libs/community/langchain_community/document_loaders/parsers/images.py +++ b/libs/community/langchain_community/document_loaders/parsers/images.py @@ -48,7 +48,10 @@ class BaseImageBlobParser(BaseBlobParser): with blob.as_bytes_io() as buf: if blob.mimetype == "application/x-npy": - img = Img.fromarray(numpy.load(buf)) + try: + img = Img.fromarray(numpy.load(buf)) + except EOFError: + return # Ignore too small images else: img = Img.open(buf) content = self._analyze_image(img) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 28f2aa8f05f..e198d22752c 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -1476,7 +1476,7 @@ class PDFPlumberParser(BaseBlobParser): raise ValueError("mode must be single or page") if extract_tables and extract_tables not in ["csv", "markdown", "html"]: raise ValueError("mode must be csv, markdown or html") - if not extract_images and not images_parser: + if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.password = password self.extract_images = extract_images @@ -1554,19 +1554,27 @@ class PDFPlumberParser(BaseBlobParser): elif isinstance(content, list): # Table page_text.append(_JOIN_TABLES + self._convert_table(content)) else: # Image - image_bytes = io.BytesIO() - numpy.save(image_bytes, content) - blob = Blob.from_data( - image_bytes.getvalue(), mime_type="application/x-npy" - ) - text_from_image = next( - self.images_parser.lazy_parse(blob) # type: ignore - ).page_content - extras.append( - _format_inner_image( - blob, text_from_image, self.images_inner_format - ) - ) + if self.images_parser: + try: + from PIL import Image as Img + Img.fromarray(content) # Check if image is valid + image_bytes = io.BytesIO() + numpy.save(image_bytes, content) + blob = Blob.from_data( + image_bytes.getvalue(), mime_type="application/x-npy" + ) + text_from_image = next( + self.images_parser.lazy_parse(blob) # type: ignore + ).page_content + extras.append( + _format_inner_image( + blob, text_from_image, self.images_inner_format + ) + ) + except TypeError: + pass + except EOFError: + pass all_text = _merge_text_and_extras(extras, "".join(page_text).strip()) diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py index 3fd839f2589..e49bbcc071c 100644 --- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py @@ -247,9 +247,6 @@ def test_parser_with_table( mode: str, extract_tables: str, ) -> None: - - from langchain_community.document_loaders.parsers.images import BaseImageBlobParser - parser_class = getattr(pdf_parsers, parser_factory) parser = parser_class(