From d49df4871d98a8daf97d3b09caa8468242c9d0e3 Mon Sep 17 00:00:00 2001 From: Mohammad Mohtashim <45242107+keenborder786@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:45:48 +0500 Subject: [PATCH] [Community]: Image Extraction Fixed for `PDFPlumberParser` (#28491) - **Description:** One-bit images were raising an error in `PDFPlumberParser`; this PR fixes that. - **Issue:** #28480 --------- Co-authored-by: Chester Curme --- .../document_loaders/parsers/pdf.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index c603dde71eb..702d5998dd7 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -427,6 +427,13 @@ class PDFPlumberParser(BaseBlobParser): text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` dedupe: Avoiding the error of duplicate characters if `dedupe=True`. 
""" + try: + import PIL # noqa:F401 + except ImportError: + raise ImportError( + "pillow package not found, please install it with" + " `pip install pillow`" + ) self.text_kwargs = text_kwargs or {} self.dedupe = dedupe self.extract_images = extract_images @@ -468,17 +475,30 @@ class PDFPlumberParser(BaseBlobParser): def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str: """Extract images from page and get the text with RapidOCR.""" + from PIL import Image + if not self.extract_images: return "" images = [] for img in page.images: if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS: - images.append( - np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape( - img["stream"]["Height"], img["stream"]["Width"], -1 + if img["stream"]["BitsPerComponent"] == 1: + images.append( + np.array( + Image.frombytes( + "1", + (img["stream"]["Width"], img["stream"]["Height"]), + img["stream"].get_data(), + ).convert("L") + ) + ) + else: + images.append( + np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape( + img["stream"]["Height"], img["stream"]["Width"], -1 + ) ) - ) elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS: images.append(img["stream"].get_data()) else: