community[patch]: update PyPDFParser to take into account filters returned as arrays (#30489)

Image parsing fails because the object extracted for the /Filter entry
is sometimes returned as an array and sometimes as a single name (string).

Fix [Issue
30098](https://github.com/langchain-ai/langchain/issues/30098)
This commit is contained in:
Philippe PRADOS 2025-03-26 19:16:54 +01:00 committed by GitHub
parent 422ba4cde5
commit 8e5d2a44ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -428,6 +428,7 @@ class PyPDFParser(BaseBlobParser):
"""
if not self.images_parser:
return ""
import pypdf
from PIL import Image
if "/XObject" not in cast(dict, page["/Resources"]).keys():
@ -438,13 +439,18 @@ class PyPDFParser(BaseBlobParser):
for obj in xObject:
np_image: Any = None
if xObject[obj]["/Subtype"] == "/Image":
if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
img_filter = (
xObject[obj]["/Filter"][1:]
if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject
else xObject[obj]["/Filter"][0][1:]
)
if img_filter in _PDF_FILTER_WITHOUT_LOSS:
height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
np_image = np.frombuffer(
xObject[obj].get_data(), dtype=np.uint8
).reshape(height, width, -1)
elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
elif img_filter in _PDF_FILTER_WITH_LOSS:
np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
else: