From d49df4871d98a8daf97d3b09caa8468242c9d0e3 Mon Sep 17 00:00:00 2001
From: Mohammad Mohtashim <45242107+keenborder786@users.noreply.github.com>
Date: Wed, 18 Dec 2024 21:45:48 +0500
Subject: [PATCH] [Community]: Image Extraction Fixed for `PDFPlumberParser`
 (#28491)

- **Description:** One-bit images were raising an error during image
extraction in `PDFPlumberParser`; this PR fixes that.
 - **Issue:** #28480

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
---
 .../document_loaders/parsers/pdf.py           | 28 ++++++++++++++++---
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index c603dde71eb..702d5998dd7 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -427,6 +427,13 @@ class PDFPlumberParser(BaseBlobParser):
             text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
             dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
         """
+        try:
+            import PIL  # noqa:F401
+        except ImportError:
+            raise ImportError(
+                "pillow package not found, please install it with"
+                " `pip install pillow`"
+            )
         self.text_kwargs = text_kwargs or {}
         self.dedupe = dedupe
         self.extract_images = extract_images
@@ -468,17 +475,30 @@ class PDFPlumberParser(BaseBlobParser):
 
     def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
         """Extract images from page and get the text with RapidOCR."""
+        from PIL import Image
+
         if not self.extract_images:
             return ""
 
         images = []
         for img in page.images:
             if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
-                images.append(
-                    np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
-                        img["stream"]["Height"], img["stream"]["Width"], -1
+                if img["stream"]["BitsPerComponent"] == 1:
+                    images.append(
+                        np.array(
+                            Image.frombytes(
+                                "1",
+                                (img["stream"]["Width"], img["stream"]["Height"]),
+                                img["stream"].get_data(),
+                            ).convert("L")
+                        )
+                    )
+                else:
+                    images.append(
+                        np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
+                            img["stream"]["Height"], img["stream"]["Width"], -1
+                        )
                     )
-                )
             elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
                 images.append(img["stream"].get_data())
             else: