[Community]: Image Extraction Fixed for PDFPlumberParser (#28491)

- **Description:** Extracting one-bit images (`BitsPerComponent == 1`) raised an error in `PDFPlumberParser`; this PR fixes it. - **Issue:** #28480 --------- Co-authored-by: Chester Curme <chester.curme@gmail.com>
2025-08-21 10:26:57 +00:00 · 2024-12-18 21:45:48 +05:00 · 2024-12-18 21:45:48 +05:00 · d49df4871d
commit d49df4871d
parent f723a8456e
1 changed files with 24 additions and 4 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@ -427,6 +427,13 @@ class PDFPlumberParser(BaseBlobParser):
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
        """
+        try:
+            import PIL  # noqa:F401
+        except ImportError:
+            raise ImportError(
+                "pillow package not found, please install it with"
+                " `pip install pillow`"
+            )
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images
@ -468,12 +475,25 @@ class PDFPlumberParser(BaseBlobParser):

    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from page and get the text with RapidOCR."""
+        from PIL import Image
+
        if not self.extract_images:
            return ""

        images = []
        for img in page.images:
            if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
+                if img["stream"]["BitsPerComponent"] == 1:
+                    images.append(
+                        np.array(
+                            Image.frombytes(
+                                "1",
+                                (img["stream"]["Width"], img["stream"]["Height"]),
+                                img["stream"].get_data(),
+                            ).convert("L")
+                        )
+                    )
+                else:
                    images.append(
                        np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
                            img["stream"]["Height"], img["stream"]["Width"], -1