[Community]: Image Extraction Fixed for PDFPlumberParser (#28491)

- **Description:** One-bit images were raising an error, which has been fixed
in this PR for `PDFPlumberParser`.
 - **Issue:** #28480

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
Mohammad Mohtashim 2024-12-18 21:45:48 +05:00 committed by GitHub
parent f723a8456e
commit d49df4871d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -427,6 +427,13 @@ class PDFPlumberParser(BaseBlobParser):
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
dedupe: Avoiding the error of duplicate characters if `dedupe=True`. dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
""" """
try:
import PIL # noqa:F401
except ImportError:
raise ImportError(
"pillow package not found, please install it with"
" `pip install pillow`"
)
self.text_kwargs = text_kwargs or {} self.text_kwargs = text_kwargs or {}
self.dedupe = dedupe self.dedupe = dedupe
self.extract_images = extract_images self.extract_images = extract_images
@ -468,17 +475,30 @@ class PDFPlumberParser(BaseBlobParser):
def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str: def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
"""Extract images from page and get the text with RapidOCR.""" """Extract images from page and get the text with RapidOCR."""
from PIL import Image
if not self.extract_images: if not self.extract_images:
return "" return ""
images = [] images = []
for img in page.images: for img in page.images:
if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS: if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
images.append( if img["stream"]["BitsPerComponent"] == 1:
np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape( images.append(
img["stream"]["Height"], img["stream"]["Width"], -1 np.array(
Image.frombytes(
"1",
(img["stream"]["Width"], img["stream"]["Height"]),
img["stream"].get_data(),
).convert("L")
)
)
else:
images.append(
np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
img["stream"]["Height"], img["stream"]["Width"], -1
)
) )
)
elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS: elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
images.append(img["stream"].get_data()) images.append(img["stream"].get_data())
else: else: