mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 14:18:52 +00:00
community[patch]: update PyPDFParser to take into account filters returned as arrays (#30489)
Image parsing fails because the value extracted for an image's /Filter entry is sometimes an array and sometimes a string. Fixes [Issue 30098](https://github.com/langchain-ai/langchain/issues/30098)
This commit is contained in:
parent
422ba4cde5
commit
8e5d2a44ce
@ -428,6 +428,7 @@ class PyPDFParser(BaseBlobParser):
|
|||||||
"""
|
"""
|
||||||
if not self.images_parser:
|
if not self.images_parser:
|
||||||
return ""
|
return ""
|
||||||
|
import pypdf
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
if "/XObject" not in cast(dict, page["/Resources"]).keys():
|
if "/XObject" not in cast(dict, page["/Resources"]).keys():
|
||||||
@ -438,13 +439,18 @@ class PyPDFParser(BaseBlobParser):
|
|||||||
for obj in xObject:
|
for obj in xObject:
|
||||||
np_image: Any = None
|
np_image: Any = None
|
||||||
if xObject[obj]["/Subtype"] == "/Image":
|
if xObject[obj]["/Subtype"] == "/Image":
|
||||||
if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
|
img_filter = (
|
||||||
|
xObject[obj]["/Filter"][1:]
|
||||||
|
if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject
|
||||||
|
else xObject[obj]["/Filter"][0][1:]
|
||||||
|
)
|
||||||
|
if img_filter in _PDF_FILTER_WITHOUT_LOSS:
|
||||||
height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
|
height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
|
||||||
|
|
||||||
np_image = np.frombuffer(
|
np_image = np.frombuffer(
|
||||||
xObject[obj].get_data(), dtype=np.uint8
|
xObject[obj].get_data(), dtype=np.uint8
|
||||||
).reshape(height, width, -1)
|
).reshape(height, width, -1)
|
||||||
elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
|
elif img_filter in _PDF_FILTER_WITH_LOSS:
|
||||||
np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
|
np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user