mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-06 21:20:33 +00:00
[Community]: Image Extraction Fixed for PDFPlumberParser
(#28491)
- **Description:** One-bit images raised an error during image extraction; this PR fixes that for `PDFPlumberParser`. - **Issue:** #28480 --------- Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
parent
f723a8456e
commit
d49df4871d
@ -427,6 +427,13 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
|
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
|
||||||
dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
|
dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
|
import PIL # noqa:F401
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"pillow package not found, please install it with"
|
||||||
|
" `pip install pillow`"
|
||||||
|
)
|
||||||
self.text_kwargs = text_kwargs or {}
|
self.text_kwargs = text_kwargs or {}
|
||||||
self.dedupe = dedupe
|
self.dedupe = dedupe
|
||||||
self.extract_images = extract_images
|
self.extract_images = extract_images
|
||||||
@ -468,17 +475,30 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
|
|
||||||
def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
|
def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
|
||||||
"""Extract images from page and get the text with RapidOCR."""
|
"""Extract images from page and get the text with RapidOCR."""
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
if not self.extract_images:
|
if not self.extract_images:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
images = []
|
images = []
|
||||||
for img in page.images:
|
for img in page.images:
|
||||||
if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
|
if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
|
||||||
images.append(
|
if img["stream"]["BitsPerComponent"] == 1:
|
||||||
np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
|
images.append(
|
||||||
img["stream"]["Height"], img["stream"]["Width"], -1
|
np.array(
|
||||||
|
Image.frombytes(
|
||||||
|
"1",
|
||||||
|
(img["stream"]["Width"], img["stream"]["Height"]),
|
||||||
|
img["stream"].get_data(),
|
||||||
|
).convert("L")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
images.append(
|
||||||
|
np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
|
||||||
|
img["stream"]["Height"], img["stream"]["Width"], -1
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
|
elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
|
||||||
images.append(img["stream"].get_data())
|
images.append(img["stream"].get_data())
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user