[Community]: Image Extraction Fixed for PDFPlumberParser (#28491)

- **Description:** One-bit images were raising an error during image extraction; this PR fixes that for `PDFPlumberParser`. - **Issue:** #28480 --------- Co-authored-by: Chester Curme <chester.curme@gmail.com>
2025-08-23 03:22:38 +00:00 · 2024-12-18 21:45:48 +05:00 · 2024-12-18 21:45:48 +05:00 · d49df4871d
commit d49df4871d
parent f723a8456e
1 changed files with 24 additions and 4 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -427,6 +427,13 @@ class PDFPlumberParser(BaseBlobParser):
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
        """
        try:
            import PIL  # noqa:F401
        except ImportError:
            raise ImportError(
                "pillow package not found, please install it with"
                " `pip install pillow`"
            )
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images
@@ -468,17 +475,30 @@ class PDFPlumberParser(BaseBlobParser):
    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from page and get the text with RapidOCR."""
        from PIL import Image
        if not self.extract_images:
            return ""
        images = []
        for img in page.images:
            if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
-                images.append(
+                if img["stream"]["BitsPerComponent"] == 1:
-                    np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
+                    images.append(
-                        img["stream"]["Height"], img["stream"]["Width"], -1
+                        np.array(
                            Image.frombytes(
                                "1",
                                (img["stream"]["Width"], img["stream"]["Height"]),
                                img["stream"].get_data(),
                            ).convert("L")
                        )
                    )
                else:
                    images.append(
                        np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
                            img["stream"]["Height"], img["stream"]["Width"], -1
                        )
                    )
                )
            elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img["stream"].get_data())
            else: