diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 2b53db28736..df4314ee661 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -428,6 +428,7 @@ class PyPDFParser(BaseBlobParser): """ if not self.images_parser: return "" + import pypdf from PIL import Image if "/XObject" not in cast(dict, page["/Resources"]).keys(): @@ -438,13 +439,18 @@ class PyPDFParser(BaseBlobParser): for obj in xObject: np_image: Any = None if xObject[obj]["/Subtype"] == "/Image": - if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS: + img_filter = ( + xObject[obj]["/Filter"][1:] + if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject + else xObject[obj]["/Filter"][0][1:] + ) + if img_filter in _PDF_FILTER_WITHOUT_LOSS: height, width = xObject[obj]["/Height"], xObject[obj]["/Width"] np_image = np.frombuffer( xObject[obj].get_data(), dtype=np.uint8 ).reshape(height, width, -1) - elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS: + elif img_filter in _PDF_FILTER_WITH_LOSS: np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data()))) else: