community[minor]: 05 - Refactoring PyPDFium2 parser (#29625)

This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on updating the PyPDFium2 parser. For more details, see https://github.com/langchain-ai/langchain/pull/28970.
2025-06-25 08:03:39 +00:00 · 2025-02-08 03:31:12 +01:00 · 2025-02-08 03:31:12 +01:00 · beb75b2150
commit beb75b2150
parent 723031d548
6 changed files with 1281 additions and 126 deletions
--- a/docs/docs/integrations/document_loaders/pypdfium2.ipynb
+++ b/docs/docs/integrations/document_loaders/pypdfium2.ipynb
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@ -1158,50 +1158,216 @@ class PyMuPDFParser(BaseBlobParser):
 class PyPDFium2Parser(BaseBlobParser):
-    """Parse `PDF` with `PyPDFium2`."""
+    """Parse a blob from a PDF using `PyPDFium2` library.
-    def __init__(self, extract_images: bool = False) -> None:
+    This class provides methods to parse a blob from a PDF document, supporting various
-        """Initialize the parser."""
+    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyPDFium2' library for PDF processing and offers synchronous
    blob parsing.
    Examples:
        Setup:
        .. code-block:: bash
            pip install -U langchain-community pypdfium2
        Load a blob from a PDF file:
        .. code-block:: python
            from langchain_core.documents.base import Blob
            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
        Instantiate the parser:
        .. code-block:: python
            from langchain_community.document_loaders.parsers import PyPDFium2Parser
            parser = PyPDFium2Parser(
                # password=None,
                mode="page",
                pages_delimiter="\n\f",
                # extract_images = True,
                # images_parser = RapidOCRBlobParser(),
            )
        Lazily parse the blob:
        .. code-block:: python
            docs = []
            docs_lazy = parser.lazy_parse(blob)
            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    """
    # PyPDFium2 is not thread safe.
    # See https://pypdfium2.readthedocs.io/en/stable/python_api.html#thread-incompatibility
    _lock = threading.Lock()
    def __init__(
        self,
        extract_images: bool = False,
        *,
        password: Optional[str] = None,
        mode: Literal["single", "page"] = "page",
        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
        images_parser: Optional[BaseImageBlobParser] = None,
        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
    ) -> None:
        """Initialize a parser based on PyPDFium2.

        Args:
            extract_images: Whether to extract images from the PDF. When true and
                no `images_parser` is given, a `RapidOCRBlobParser` is created.
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body](#)`)
                - "html-img" = wrap the content as the `alt` text of an `<img>` tag and
                link to (`<img alt="{body}" src="#"/>`)

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
        """
        super().__init__()
        if mode not in ["single", "page"]:
            raise ValueError("mode must be single or page")
        self.extract_images = extract_images
        # Fall back to RapidOCR when images are requested without an explicit parser.
        if extract_images and not images_parser:
            images_parser = RapidOCRBlobParser()
        self.images_parser = images_parser
        self.images_inner_format = images_inner_format
        self.password = password
        self.mode = mode
        self.pages_delimiter = pages_delimiter
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob.

        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pypdfium2` package is not found.

        Yields:
            One document per page in "page" mode, or a single document whose
            pages are joined with `pages_delimiter` in "single" mode.
        """
        try:
            import pypdfium2
        except ImportError as err:
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            ) from err

        # pypdfium2 is really finicky with respect to closing things,
        # if done incorrectly creates seg faults.
        #
        # NOTE: the class-level lock is acquired inside a generator, so it stays
        # held while the generator is suspended at a `yield`. It is a plain
        # (non-reentrant) `threading.Lock`, so a second `lazy_parse` generator
        # advanced before this one is exhausted will block.
        with PyPDFium2Parser._lock:
            with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
                pdf_reader = None
                try:
                    pdf_reader = pypdfium2.PdfDocument(
                        file_path, password=self.password, autoclose=True
                    )
                    full_content = []

                    doc_metadata = _purge_metadata(pdf_reader.get_metadata_dict())
                    doc_metadata["source"] = blob.source
                    doc_metadata["total_pages"] = len(pdf_reader)

                    for page_number, page in enumerate(pdf_reader):
                        text_page = page.get_textpage()
                        text_from_page = "\n".join(
                            text_page.get_text_range().splitlines()
                        )  # Replace \r\n
                        text_page.close()
                        image_from_page = self._extract_images_from_page(page)
                        all_text = _merge_text_and_extras(
                            [image_from_page], text_from_page
                        ).strip()
                        page.close()
                        if self.mode == "page":
                            # For legacy compatibility, add the last '\n'.
                            # `all_text` was just stripped, so it can never
                            # already end with a newline.
                            all_text += "\n"
                            yield Document(
                                page_content=all_text,
                                metadata=_validate_metadata(
                                    {
                                        **doc_metadata,
                                        "page": page_number,
                                    }
                                ),
                            )
                        else:
                            full_content.append(all_text)
                    if self.mode == "single":
                        yield Document(
                            page_content=self.pages_delimiter.join(full_content),
                            metadata=_validate_metadata(doc_metadata),
                        )
                finally:
                    # Compare against None explicitly: the document supports
                    # `len()`, so a zero-page document would be falsy and would
                    # otherwise be left unclosed.
                    if pdf_reader is not None:
                        pdf_reader.close()
    def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
        """Extract the images of a PDF page and convert them to text.

        Each image object on the page is rendered to a numpy array, serialised
        with `numpy.save` into a blob and handed to `self.images_parser`. The
        text of the first document the parser yields is formatted according to
        `self.images_inner_format`.

        Args:
            page: The page object from which to extract images.

        Returns:
            str: The formatted text of the images on the page, or an empty string
            when no image parser is configured or the page has no image objects.
        """
        if not self.images_parser:
            return ""
        import pypdfium2.raw as pdfium_c

        images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))
        if not images:
            return ""
        str_images = []
        for image in images:
            try:
                # Render once and reuse the array for both the size check and
                # the serialisation.
                np_image = image.get_bitmap().to_numpy()
                if np_image.size < 3:
                    continue
                image_bytes = io.BytesIO()
                numpy.save(image_bytes, np_image)
                blob = Blob.from_data(
                    image_bytes.getvalue(), mime_type="application/x-npy"
                )
                text_from_image = next(self.images_parser.lazy_parse(blob)).page_content
                str_images.append(
                    _format_inner_image(blob, text_from_image, self.images_inner_format)
                )
            finally:
                # Close every image object, including the ones skipped above.
                image.close()
        return _FORMAT_IMAGE_STR.format(image_text=_JOIN_IMAGES.join(str_images))
 class PDFPlumberParser(BaseBlobParser):
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@ -308,25 +308,116 @@ class PyPDFLoader(BasePDFLoader):
 class PyPDFium2Loader(BasePDFLoader):
-    """Load `PDF` using `pypdfium2` and chunks at character level."""
+    """Load and parse a PDF file using the `pypdfium2` library.
    This class provides methods to load and parse PDF documents, supporting various
    configurations such as handling password-protected files, extracting images, and
    defining extraction mode.
    It integrates the `pypdfium2` library for PDF processing and offers both
    synchronous and asynchronous document loading.
    Examples:
        Setup:
        .. code-block:: bash
            pip install -U langchain-community pypdfium2
        Instantiate the loader:
        .. code-block:: python
            from langchain_community.document_loaders import PyPDFium2Loader
            loader = PyPDFium2Loader(
                file_path = "./example_data/layout-parser-paper.pdf",
                # headers = None,
                # password = None,
                mode = "single",
                pages_delimiter = "\n\f",
                # extract_images = True,
                # images_parser = RapidOCRBlobParser(),
            )
        Lazy load documents:
        .. code-block:: python
            docs = []
            docs_lazy = loader.lazy_load()
            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
        Load documents asynchronously:
        .. code-block:: python
            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    """
    def __init__(
        self,
        file_path: Union[str, PurePath],
        *,
-        headers: Optional[dict] = None,
+        mode: Literal["single", "page"] = "page",
        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
        password: Optional[str] = None,
        extract_images: bool = False,
        images_parser: Optional[BaseImageBlobParser] = None,
        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
        headers: Optional[dict] = None,
    ):
-        """Initialize with a file path."""
+        """Initialize with a file path.
        Args:
            file_path: The path to the PDF file to be loaded.
            headers: Optional headers to use for GET request to download a file from a
              web path.
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body](#)`)
                - "html-img" = wrap the content as the `alt` text of an `<img>` tag and
                link to (`<img alt="{body}" src="#"/>`)
        Returns:
            This class does not directly return data. Use the `load`, `lazy_load` or
            `aload` methods to retrieve parsed documents with content and metadata.
        """
        super().__init__(file_path, headers=headers)
-        self.parser = PyPDFium2Parser(extract_images=extract_images)
+        self.parser = PyPDFium2Parser(
            mode=mode,
            password=password,
            extract_images=extract_images,
            images_parser=images_parser,
            images_inner_format=images_inner_format,
            pages_delimiter=pages_delimiter,
        )
    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """
        Lazy load given path as pages.

        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Yields:
            The documents produced by the configured `PyPDFium2Parser`.
        """
        if self.web_path:
            # Read the file through a context manager so the handle is closed
            # right after the bytes are loaded.
            with open(self.file_path, "rb") as pdf_file:
                blob = Blob.from_data(  # type: ignore[attr-defined]
                    pdf_file.read(), path=self.web_path
                )
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        # NOTE(review): `parse` (not `lazy_parse`) is kept from the original;
        # presumably it materialises all documents before the first yield - confirm.
        yield from self.parser.parse(blob)
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@ -12,7 +12,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
    BaseImageBlobParser,
    PDFPlumberParser,
    PyPDFium2Parser,
 )
 if TYPE_CHECKING:
@ -96,12 +95,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
        assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
 def test_pypdfium2_parser() -> None:
    """Test PyPDFium2 parser."""
    # Does not follow defaults to split by page.
    _assert_with_parser(PyPDFium2Parser())
 def test_pdfplumber_parser() -> None:
    """Test PDFPlumber parser."""
    _assert_with_parser(PDFPlumberParser())
@ -109,11 +102,6 @@ def test_pdfplumber_parser() -> None:
    _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
 def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
    """Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser"""  # noqa: E501
    _assert_with_parser(PyPDFium2Parser(extract_images=True))
 class EmptyImageBlobParser(BaseImageBlobParser):
    def _analyze_image(self, img: "Image") -> str:
        return "Hello world"
@ -128,6 +116,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
    [
        ("PDFMinerParser", {}),
        ("PyMuPDFParser", {}),
        ("PyPDFium2Parser", {}),
        ("PyPDFParser", {"extraction_mode": "plain"}),
        ("PyPDFParser", {"extraction_mode": "layout"}),
    ],
@ -157,6 +146,7 @@ def test_mode_and_extract_images_variations(
    [
        ("PDFMinerParser", {}),
        ("PyMuPDFParser", {}),
        ("PyPDFium2Parser", {}),
        ("PyPDFParser", {"extraction_mode": "plain"}),
        ("PyPDFParser", {"extraction_mode": "layout"}),
    ],
--- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@ -9,7 +9,6 @@ from langchain_community.document_loaders import (
    AmazonTextractPDFLoader,
    MathpixPDFLoader,
    PDFMinerPDFasHTMLLoader,
    PyPDFium2Loader,
    UnstructuredPDFLoader,
 )
@ -56,21 +55,6 @@ def test_pdfminer_pdf_as_html_loader() -> None:
    assert len(docs) == 1
 def test_pypdfium2_loader() -> None:
    """Test PyPDFium2Loader."""
    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
    loader = PyPDFium2Loader(file_path)
    docs = loader.load()
    assert len(docs) == 1
    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
    loader = PyPDFium2Loader(file_path)
    docs = loader.load()
    assert len(docs) == 16
@pytest.mark.skipif(
    not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
 )
@ -184,6 +168,7 @@ def test_amazontextract_loader_failures() -> None:
    [
        ("PDFMinerLoader", {}),
        ("PyMuPDFLoader", {}),
        ("PyPDFium2Loader", {}),
        ("PyPDFLoader", {}),
    ],
 )
@ -206,8 +191,6 @@ def test_standard_parameters(
        images_parser=None,
        images_inner_format="text",
        password=None,
        extract_tables=None,
        extract_tables_settings=None,
    )
    docs = loader.load()
    assert len(docs) == 16
--- a/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf_parsers.py
@ -9,10 +9,7 @@ import pytest
 import langchain_community.document_loaders.parsers as pdf_parsers
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
-from langchain_community.document_loaders.parsers.pdf import (
+from langchain_community.document_loaders.parsers.pdf import _merge_text_and_extras
    PyPDFium2Parser,
    _merge_text_and_extras,
 )
 _THIS_DIR = Path(__file__).parents[3]
@ -74,19 +71,13 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
        assert int(metadata["page"]) == 0
@pytest.mark.requires("pypdfium2")
 def test_pypdfium2_parser() -> None:
    """Test PyPDFium2 parser."""
    # Does not follow defaults to split by page.
    _assert_with_parser(PyPDFium2Parser())
@pytest.mark.parametrize(
    "parser_factory,require,params",
    [
        ("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
        ("PyMuPDFParser", "pymupdf", {}),
        ("PyPDFParser", "pypdf", {}),
        ("PyPDFium2Parser", "pypdfium2", {}),
    ],
 )
 def test_parsers(