community[patch]: update PyPDFParser to take into account filters returned as arrays (#30489)

Image parsing fails because the object extracted for `/Filter` is sometimes returned as an array and sometimes as a string. Fixes [Issue 30098](https://github.com/langchain-ai/langchain/issues/30098).
2025-08-02 01:23:07 +00:00 · 2025-03-26 19:16:54 +01:00 · 2025-03-26 19:16:54 +01:00 · 8e5d2a44ce
commit 8e5d2a44ce
parent 422ba4cde5
1 changed files with 8 additions and 2 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@ -428,6 +428,7 @@ class PyPDFParser(BaseBlobParser):
        """
        if not self.images_parser:
            return ""
+        import pypdf
        from PIL import Image

        if "/XObject" not in cast(dict, page["/Resources"]).keys():
@ -438,13 +439,18 @@ class PyPDFParser(BaseBlobParser):
        for obj in xObject:
            np_image: Any = None
            if xObject[obj]["/Subtype"] == "/Image":
-                if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
+                img_filter = (
+                    xObject[obj]["/Filter"][1:]
+                    if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject
+                    else xObject[obj]["/Filter"][0][1:]
+                )
+                if img_filter in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]

                    np_image = np.frombuffer(
                        xObject[obj].get_data(), dtype=np.uint8
                    ).reshape(height, width, -1)
-                elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
+                elif img_filter in _PDF_FILTER_WITH_LOSS:
                    np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))

                else: