mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 22:03:52 +00:00
community[patch]: update PyPDFParser to take into account filters returned as arrays (#30489)
Image parsing triggers a bug because the object extracted for /Filter is sometimes an array and sometimes a string. Fixes [Issue 30098](https://github.com/langchain-ai/langchain/issues/30098)
This commit is contained in:
parent
422ba4cde5
commit
8e5d2a44ce
@ -428,6 +428,7 @@ class PyPDFParser(BaseBlobParser):
|
||||
"""
|
||||
if not self.images_parser:
|
||||
return ""
|
||||
import pypdf
|
||||
from PIL import Image
|
||||
|
||||
if "/XObject" not in cast(dict, page["/Resources"]).keys():
|
||||
@ -438,13 +439,18 @@ class PyPDFParser(BaseBlobParser):
|
||||
for obj in xObject:
|
||||
np_image: Any = None
|
||||
if xObject[obj]["/Subtype"] == "/Image":
|
||||
if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
|
||||
img_filter = (
|
||||
xObject[obj]["/Filter"][1:]
|
||||
if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject
|
||||
else xObject[obj]["/Filter"][0][1:]
|
||||
)
|
||||
if img_filter in _PDF_FILTER_WITHOUT_LOSS:
|
||||
height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
|
||||
|
||||
np_image = np.frombuffer(
|
||||
xObject[obj].get_data(), dtype=np.uint8
|
||||
).reshape(height, width, -1)
|
||||
elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
|
||||
elif img_filter in _PDF_FILTER_WITH_LOSS:
|
||||
np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
|
||||
|
||||
else:
|
||||
|
Loading…
Reference in New Issue
Block a user