community[minor]: 03 - Refactoring PyPDF parser (#29330)

This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on updating the PyPDF parser. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970).
2025-08-31 18:38:48 +00:00 · 2025-01-31 16:05:07 +01:00
parent b7e3e337b1
commit ceda8bc050
8 changed files with 1379 additions and 168 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -18,6 +18,7 @@ from typing import (
    Optional,
    Sequence,
    Union,
+    cast,
 )
 from urllib.parse import urlparse

@@ -240,86 +241,226 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:


 class PyPDFParser(BaseBlobParser):
-    """Load `PDF` using `pypdf`"""
+    """Parse a blob from a PDF using `pypdf` library.
+
+    This class provides methods to parse a blob from a PDF document, supporting various
+    configurations such as handling password-protected PDFs and extracting images.
+    It integrates the 'pypdf' library for PDF processing and offers synchronous blob
+    parsing.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pypdf
+
+        Load a blob from a PDF file:
+
+        .. code-block:: python
+
+            from langchain_core.documents.base import Blob
+
+            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
+
+        Instantiate the parser:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders.parsers import PyPDFParser
+
+            parser = PyPDFParser(
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                # extract_images = True,
+                # images_parser = TesseractBlobParser(),
+            )
+
+        Lazily parse the blob:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = parser.lazy_parse(blob)
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """

    def __init__(
        self,
        password: Optional[Union[str, bytes]] = None,
        extract_images: bool = False,
        *,
-        extraction_mode: str = "plain",
+        mode: Literal["single", "page"] = "page",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        extraction_mode: Literal["plain", "layout"] = "plain",
        extraction_kwargs: Optional[dict[str, Any]] = None,
    ):
-        self.password = password
+        """Initialize a parser based on PyPDF.
+
+        Args:
+            password: Optional password for opening encrypted PDFs.
+            extract_images: Whether to extract images from the PDF.
+            mode: The extraction mode, either "single" for the entire document or "page"
+                for page-wise extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link, w/ link
+                pointing to (`![body](#)`)
+                - "html-img" = wrap the content as the `alt` text of an `<img>` tag and
+                link to (`<img alt="{body}" src="#"/>`)
+            extraction_mode: “plain” for legacy functionality, “layout” extract text
+                in a fixed width format that closely adheres to the rendered layout in
+                the source pdf.
+            extraction_kwargs: Optional additional parameters for the extraction
+                process.
+
+        Raises:
+            ValueError: If the `mode` is not "single" or "page".
+        """
+        super().__init__()
+        if mode not in ["single", "page"]:
+            raise ValueError("mode must be single or page")
        self.extract_images = extract_images
+        if extract_images and not images_parser:
+            images_parser = RapidOCRBlobParser()
+        self.images_parser = images_parser
+        self.images_inner_format = images_inner_format
+        self.password = password
+        self.mode = mode
+        self.pages_delimiter = pages_delimiter
        self.extraction_mode = extraction_mode
        self.extraction_kwargs = extraction_kwargs or {}

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """Lazily parse the blob."""
+        """
+        Lazily parse the blob.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+
+        Args:
+            blob: The blob to parse.
+
+        Raises:
+            ImportError: If the `pypdf` package is not found.
+
+        Yields:
+            An iterator over the parsed documents.
+        """
        try:
            import pypdf
        except ImportError:
            raise ImportError(
-                "`pypdf` package not found, please install it with "
-                "`pip install pypdf`"
+                "pypdf package not found, please install it with `pip install pypdf`"
            )

        def _extract_text_from_page(page: pypdf.PageObject) -> str:
-            """Extract text from image given the version of pypdf."""
+            """
+            Extract text from a page, given the version of pypdf.
+
+            Args:
+                page: The page object to extract text from.
+
+            Returns:
+                str: The extracted text.
+            """
            if pypdf.__version__.startswith("3"):
                return page.extract_text()
            else:
                return page.extract_text(
-                    extraction_mode=self.extraction_mode,  # type: ignore[arg-type]
-                    **self.extraction_kwargs,  # type: ignore[arg-type]
+                    extraction_mode=self.extraction_mode,
+                    **self.extraction_kwargs,
                )

        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)

-            yield from [
-                Document(
-                    page_content=_extract_text_from_page(page=page)
-                    + self._extract_images_from_page(page),
-                    metadata={
-                        "source": blob.source,
-                        "page": page_number,
-                        "page_label": pdf_reader.page_labels[page_number],
-                    },
-                    # type: ignore[attr-defined]
+            doc_metadata = _purge_metadata(
+                {"producer": "PyPDF", "creator": "PyPDF", "creationdate": ""}
+                | cast(dict, pdf_reader.metadata or {})
+                | {
+                    "source": blob.source,
+                    "total_pages": len(pdf_reader.pages),
+                }
+            )
+            single_texts = []
+            for page_number, page in enumerate(pdf_reader.pages):
+                text_from_page = _extract_text_from_page(page=page)
+                images_from_page = self.extract_images_from_page(page)
+                all_text = _merge_text_and_extras(
+                    [images_from_page], text_from_page
+                ).strip()
+                if self.mode == "page":
+                    yield Document(
+                        page_content=all_text,
+                        metadata=_validate_metadata(
+                            doc_metadata
+                            | {
+                                "page": page_number,
+                                "page_label": pdf_reader.page_labels[page_number],
+                            }
+                        ),
+                    )
+                else:
+                    single_texts.append(all_text)
+            if self.mode == "single":
+                yield Document(
+                    page_content=self.pages_delimiter.join(single_texts),
+                    metadata=_validate_metadata(doc_metadata),
                )
-                for page_number, page in enumerate(pdf_reader.pages)
-            ]

-    def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
-        """Extract images from page and get the text with RapidOCR."""
-        if not self.extract_images or "/XObject" not in page["/Resources"].keys():  # type: ignore[attr-defined]
+    def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
+        """Extract images from a PDF page and get the text using `images_parser`.
+
+        Args:
+            page: The page object from which to extract images.
+
+        Returns:
+            str: The extracted text from the images on the page.
+        """
+        if not self.images_parser:
+            return ""
+        from PIL import Image
+
+        if "/XObject" not in cast(dict, page["/Resources"]).keys():
            return ""

-        xObject = page["/Resources"]["/XObject"].get_object()  # type: ignore
+        xObject = page["/Resources"]["/XObject"].get_object()  # type: ignore[index]
        images = []
        for obj in xObject:
+            np_image: Any = None
            if xObject[obj]["/Subtype"] == "/Image":
                if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]

-                    images.append(
-                        np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
-                            height, width, -1
-                        )
-                    )
+                    np_image = np.frombuffer(
+                        xObject[obj].get_data(), dtype=np.uint8
+                    ).reshape(height, width, -1)
                elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
-                    images.append(xObject[obj].get_data())
-                elif (
-                    isinstance(xObject[obj]["/Filter"], list)
-                    and xObject[obj]["/Filter"]
-                    and xObject[obj]["/Filter"][0][1:] in _PDF_FILTER_WITH_LOSS
-                ):
-                    images.append(xObject[obj].get_data())
+                    np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
+
                else:
-                    warnings.warn("Unknown PDF Filter!")
-        return extract_from_images_with_rapidocr(images)
+                    logger.warning("Unknown PDF Filter!")
+                if np_image is not None:
+                    image_bytes = io.BytesIO()
+                    Image.fromarray(np_image).save(image_bytes, format="PNG")
+                    blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
+                    image_text = next(self.images_parser.lazy_parse(blob)).page_content
+                    images.append(
+                        _format_inner_image(blob, image_text, self.images_inner_format)
+                    )
+        return _FORMAT_IMAGE_STR.format(
+            image_text=_JOIN_IMAGES.join(filter(None, images))
+        )


 class PDFMinerParser(BaseBlobParser):
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -184,64 +184,56 @@ class OnlinePDFLoader(BasePDFLoader):


 class PyPDFLoader(BasePDFLoader):
-    """PyPDFLoader document loader integration
+    """Load and parse a PDF file using 'pypdf' library.

-    Setup:
-        Install ``langchain-community``.
+    This class provides methods to load and parse PDF documents, supporting various
+    configurations such as handling password-protected files, extracting images, and
+    defining extraction mode. It integrates the `pypdf` library for PDF processing and
+    offers both synchronous and asynchronous document loading.
+
+    Examples:
+        Setup:

        .. code-block:: bash

-            pip install -U langchain-community
+            pip install -U langchain-community pypdf
+
+        Instantiate the loader:

-    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import PyPDFLoader

            loader = PyPDFLoader(
                file_path = "./example_data/layout-parser-paper.pdf",
-                password = "my-password",
-                extract_images = True,
                # headers = None
-                # extraction_mode = "plain",
-                # extraction_kwargs = None,
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                # extract_images = True,
+                # images_parser = RapidOCRBlobParser(),
            )

-    Lazy load:
+        Lazy load documents:
+
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

-            # async variant:
-            # docs_lazy = await loader.alazy_load()
-
            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

-        .. code-block:: python
+        Load documents asynchronously:

-            LayoutParser : A Uniﬁed Toolkit for Deep
-            Learning Based Document Image Analysis
-            Zejiang Shen1( ), R
-            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}
-
-    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
-
-        .. code-block:: python
-
-            LayoutParser : A Uniﬁed Toolkit for Deep
-            Learning Based Document Image Analysis
-            Zejiang Shen1( ), R
-            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}
-    """  # noqa: E501
+    """

    def __init__(
        self,
@@ -250,20 +242,50 @@ class PyPDFLoader(BasePDFLoader):
        headers: Optional[dict] = None,
        extract_images: bool = False,
        *,
-        extraction_mode: str = "plain",
+        mode: Literal["single", "page"] = "page",
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        extraction_mode: Literal["plain", "layout"] = "plain",
        extraction_kwargs: Optional[dict] = None,
    ) -> None:
-        """Initialize with a file path."""
-        try:
-            import pypdf  # noqa:F401
-        except ImportError:
-            raise ImportError(
-                "pypdf package not found, please install it with `pip install pypdf`"
-            )
+        """Initialize with a file path.
+
+        Args:
+            file_path: The path to the PDF file to be loaded.
+            headers: Optional headers to use for GET request to download a file from a
+              web path.
+            password: Optional password for opening encrypted PDFs.
+            mode: The extraction mode, either "single" for the entire document or "page"
+                for page-wise extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
+            extract_images: Whether to extract images from the PDF.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link, w/ link
+                pointing to (`![body](#)`)
+                - "html-img" = wrap the content as the `alt` text of an `<img>` tag and
+                link to (`<img alt="{body}" src="#"/>`)
+            extraction_mode: “plain” for legacy functionality, “layout” extract text
+                in a fixed width format that closely adheres to the rendered layout in
+                the source pdf
+            extraction_kwargs: Optional additional parameters for the extraction
+                process.
+
+        Returns:
+            This method does not directly return data. Use the `load`, `lazy_load` or
+            `aload` methods to retrieve parsed documents with content and metadata.
+        """
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFParser(
            password=password,
+            mode=mode,
            extract_images=extract_images,
+            images_parser=images_parser,
+            images_inner_format=images_inner_format,
+            pages_delimiter=pages_delimiter,
            extraction_mode=extraction_mode,
            extraction_kwargs=extraction_kwargs,
        )
@@ -271,12 +293,18 @@ class PyPDFLoader(BasePDFLoader):
    def lazy_load(
        self,
    ) -> Iterator[Document]:
-        """Lazy load given path as pages."""
+        """
+        Lazy load given path as pages.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+        """
        if self.web_path:
-            blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)  # type: ignore[attr-defined]
+            blob = Blob.from_data(  # type: ignore[attr-defined]
+                open(self.file_path, "rb").read(), path=self.web_path
+            )
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
-        yield from self.parser.parse(blob)
+        yield from self.parser.lazy_parse(blob)


 class PyPDFium2Loader(BasePDFLoader):
@@ -305,9 +333,56 @@ class PyPDFium2Loader(BasePDFLoader):


 class PyPDFDirectoryLoader(BaseLoader):
-    """Load a directory with `PDF` files using `pypdf` and chunks at character level.
+    """Load and parse a directory of PDF files using 'pypdf' library.

-    Loader also stores page numbers in metadata.
+    This class provides methods to load and parse multiple PDF documents in a directory,
+    supporting options for recursive search, handling password-protected files,
+    extracting images, and defining extraction modes. It integrates the `pypdf` library
+    for PDF processing and offers synchronous document loading.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pypdf
+
+        Instantiate the loader:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders import PyPDFDirectoryLoader
+
+            loader = PyPDFDirectoryLoader(
+                path = "./example_data/",
+                glob = "**/[!.]*.pdf",
+                silent_errors = False,
+                load_hidden = False,
+                recursive = False,
+                extract_images = False,
+                password = None,
+                mode = "page",
+                images_parser = None,
+                headers = None,
+                extraction_mode = "plain",
+                # extraction_kwargs = None,
+            )
+
+        Load documents:
+
+        .. code-block:: python
+
+            docs = loader.load()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        Load documents asynchronously:
+
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
    """

    def __init__(
@@ -318,16 +393,53 @@ class PyPDFDirectoryLoader(BaseLoader):
        load_hidden: bool = False,
        recursive: bool = False,
        extract_images: bool = False,
+        *,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "page",
+        images_parser: Optional[BaseImageBlobParser] = None,
+        headers: Optional[dict] = None,
+        extraction_mode: Literal["plain", "layout"] = "plain",
+        extraction_kwargs: Optional[dict] = None,
    ):
+        """Initialize with a directory path.
+
+        Args:
+            path: The path to the directory containing PDF files to be loaded.
+            glob: The glob pattern to match files in the directory.
+            silent_errors: Whether to log errors instead of raising them.
+            load_hidden: Whether to include hidden files in the search.
+            recursive: Whether to search subdirectories recursively.
+            extract_images: Whether to extract images from PDFs.
+            password: Optional password for opening encrypted PDFs.
+            mode: The extraction mode, either "single" for extracting the entire
+                document or "page" for page-wise extraction.
+            images_parser: Optional image blob parser.
+            headers: Optional headers to use for GET request to download a file from a
+              web path.
+            extraction_mode: “plain” for legacy functionality, “layout” for
+              experimental layout mode functionality
+            extraction_kwargs: Optional additional parameters for the extraction
+              process.
+
+        Returns:
+            This method does not directly return data. Use the `load` method to
+            retrieve parsed documents with content and metadata.
+        """
+        self.password = password
+        self.mode = mode
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.recursive = recursive
        self.silent_errors = silent_errors
        self.extract_images = extract_images
+        self.images_parser = images_parser
+        self.headers = headers
+        self.extraction_mode = extraction_mode
+        self.extraction_kwargs = extraction_kwargs

    @staticmethod
-    def _is_visible(path: Path) -> bool:
+    def _is_visible(path: PurePath) -> bool:
        return not any(part.startswith(".") for part in path.parts)

    def load(self) -> list[Document]:
@@ -338,7 +450,16 @@ class PyPDFDirectoryLoader(BaseLoader):
            if i.is_file():
                if self._is_visible(i.relative_to(p)) or self.load_hidden:
                    try:
-                        loader = PyPDFLoader(str(i), extract_images=self.extract_images)
+                        loader = PyPDFLoader(
+                            str(i),
+                            password=self.password,
+                            mode=self.mode,
+                            extract_images=self.extract_images,
+                            images_parser=self.images_parser,
+                            headers=self.headers,
+                            extraction_mode=self.extraction_mode,
+                            extraction_kwargs=self.extraction_kwargs,
+                        )
                        sub_docs = loader.load()
                        for doc in sub_docs:
                            doc.metadata["source"] = str(i)
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -14,7 +14,6 @@ from langchain_community.document_loaders.parsers import (
    PDFMinerParser,
    PDFPlumberParser,
    PyPDFium2Parser,
-    PyPDFParser,
 )

 if TYPE_CHECKING:
@@ -98,11 +97,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
        assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]


-def test_pypdf_parser() -> None:
-    """Test PyPDF parser."""
-    _assert_with_parser(PyPDFParser())
-
-
 def test_pdfminer_parser() -> None:
    """Test PDFMiner parser."""
    # Does not follow defaults to split by page.
@@ -122,11 +116,6 @@ def test_pdfplumber_parser() -> None:
    _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)


-def test_extract_images_text_from_pdf_pypdfparser() -> None:
-    """Test extract image from pdf and recognize text with rapid ocr - PyPDFParser"""
-    _assert_with_parser(PyPDFParser(extract_images=True))
-
-
 def test_extract_images_text_from_pdf_pdfminerparser() -> None:
    """Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
    _assert_with_parser(PDFMinerParser(extract_images=True))
@@ -150,6 +139,8 @@ class EmptyImageBlobParser(BaseImageBlobParser):
    "parser_factory,params",
    [
        ("PyMuPDFParser", {}),
+        ("PyPDFParser", {"extraction_mode": "plain"}),
+        ("PyPDFParser", {"extraction_mode": "layout"}),
    ],
 )
@pytest.mark.requires("pillow")
@@ -176,6 +167,8 @@ def test_mode_and_extract_images_variations(
    "parser_factory,params",
    [
        ("PyMuPDFParser", {}),
+        ("PyPDFParser", {"extraction_mode": "plain"}),
+        ("PyPDFParser", {"extraction_mode": "layout"}),
    ],
 )
@pytest.mark.requires("pillow")
--- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@@ -212,6 +212,7 @@ def test_amazontextract_loader_failures() -> None:
    "parser_factory,params",
    [
        ("PyMuPDFLoader", {}),
+        ("PyPDFLoader", {}),
    ],
 )
 def test_standard_parameters(
@@ -229,12 +230,10 @@ def test_standard_parameters(
    loader = loader_class(
        file_path,
        mode="page",
-        page_delimiter="---",
+        pages_delimiter="---",
        images_parser=None,
        images_inner_format="text",
        password=None,
-        extract_tables=None,
-        extract_tables_settings=None,
    )
    docs = loader.load()
    assert len(docs) == 16
--- a/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -12,7 +12,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers.pdf import (
    PDFMinerParser,
    PyPDFium2Parser,
-    PyPDFParser,
    _merge_text_and_extras,
 )

@@ -76,12 +75,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
        assert int(metadata["page"]) == 0


-@pytest.mark.requires("pypdf")
-def test_pypdf_parser() -> None:
-    """Test PyPDF parser."""
-    _assert_with_parser(PyPDFParser())
-
-
@pytest.mark.requires("pdfminer")
 def test_pdfminer_parser() -> None:
    """Test PDFMiner parser."""
@@ -100,6 +93,7 @@ def test_pypdfium2_parser() -> None:
    "parser_factory,require,params",
    [
        ("PyMuPDFParser", "pymupdf", {}),
+        ("PyPDFParser", "pypdf", {}),
    ],
 )
 def test_parsers(
--- a/libs/community/tests/unit_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_pdf.py
@@ -65,7 +65,8 @@ def test_pypdf_loader_with_layout() -> None:
    expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
    cleaned_first_page = re.sub(r"\x00", "", first_page)
    cleaned_expected = re.sub(r"\x00", "", expected)
-    assert cleaned_first_page == cleaned_expected
+
+    assert cleaned_first_page == cleaned_expected.strip()


@pytest.mark.requires("pypdf")