mirror of https://github.com/hwchase17/langchain.git
synced 2025-06-24 07:35:18 +00:00

Refactor PDFPlumber

This commit is contained in:
parent 78c54fccf3
commit 2b7ffd6a7f
@@ -6,7 +6,7 @@
 "source": [
 "# PDFMinerLoader\n",
 "\n",
-"This notebook provides a quick overview for getting started with `PDFMiner` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFMinerLoader.html).\n",
+"This sample provides a quick overview for getting started with `PDFMiner` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all PDFMinerLoader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFMinerLoader.html).\n",
 "\n",
 " \n",
 "\n",
File diff suppressed because it is too large
@@ -6,7 +6,7 @@
 "source": [
 "# PyMuPDFLoader\n",
 "\n",
-"This notebook provides a quick overview for getting started with `PyMuPDF` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html).\n",
+"This sample provides a quick overview for getting started with `PyMuPDF` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all PyMuPDFLoader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html).\n",
 "\n",
 " \n",
 "\n",
@@ -6,7 +6,7 @@
 "source": [
 "# PyPDFium2Loader\n",
 "\n",
-"This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all DocumentLoader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html).\n",
+"This sample provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all PyPDFium2Loader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html).\n",
 "\n",
 " \n",
 "\n",
@@ -6,7 +6,7 @@
 "source": [
 "# PyPDFLoader\n",
 "\n",
-"This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all DocumentLoader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
+"This sample provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all PyPDFLoader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
 "\n",
 " \n",
 "\n",
@@ -129,6 +129,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
 
     The standard keys are:
     - source
+    - page (if mode='page')
     - total_page
     - creationdate
     - creator
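
For orientation, an illustrative sketch (not part of the commit) of the per-document metadata these standard keys describe, with hypothetical values; note the counter appears as `total_pages` in the parser code below:

    # Hypothetical metadata for the first page of a 16-page file,
    # parsed with mode="page".
    metadata = {
        "source": "./example_data/layout-parser-paper.pdf",
        "page": 0,  # present only when mode="page"
        "total_pages": 16,
        "creationdate": "2021-06-22T01:27:10+00:00",
        "creator": "LaTeX with hyperref",
    }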
@@ -453,7 +454,9 @@ class PyPDFParser(BaseBlobParser):
                 image_bytes = io.BytesIO()
                 Image.fromarray(np_image).save(image_bytes, format="PNG")
                 blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
-                image_text = next(self.images_parser.lazy_parse(blob)).page_content
+                image_text = next(
+                    self.images_parser.lazy_parse(blob)  # type: ignore
+                ).page_content
                 images.append(
                     _format_inner_image(blob, image_text, self.images_inner_format)
                 )
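
The pattern being reformatted here (and in the PDFMiner, PyMuPDF, and PyPDFium2 hunks below) is the same in each parser: render the image to bytes, wrap them in a Blob, and take the first Document the image parser yields. A standalone sketch, assuming Pillow and the optional rapidocr-onnxruntime dependency are installed:

    import io

    import numpy as np
    from PIL import Image
    from langchain_core.documents.base import Blob
    from langchain_community.document_loaders.parsers.images import (
        RapidOCRBlobParser,
    )

    # Stand-in for an image extracted from a PDF page.
    np_image = np.zeros((32, 32, 3), dtype=np.uint8)

    image_bytes = io.BytesIO()
    Image.fromarray(np_image).save(image_bytes, format="PNG")
    blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")

    # lazy_parse() yields Documents lazily; next(...) takes the first one.
    image_text = next(RapidOCRBlobParser().lazy_parse(blob)).page_content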
@@ -748,7 +751,7 @@ class PDFMinerParser(BaseBlobParser):
                     blob = Blob.from_path(Path(tempdir) / filename)
                     blob.metadata["source"] = "#"
                     image_text = next(
-                        self.images_parser.lazy_parse(blob)
+                        self.images_parser.lazy_parse(blob)  # type: ignore
                     ).page_content
 
                     text_io.write(
@@ -1101,7 +1104,9 @@ class PyMuPDFParser(BaseBlobParser):
                     blob = Blob.from_data(
                         image_bytes.getvalue(), mime_type="application/x-npy"
                     )
-                    image_text = next(self.images_parser.lazy_parse(blob)).page_content
+                    image_text = next(
+                        self.images_parser.lazy_parse(blob)  # type: ignore
+                    ).page_content
 
                     images.append(
                         _format_inner_image(blob, image_text, self.images_inner_format)
@@ -1191,8 +1196,6 @@ class PyPDFium2Parser(BaseBlobParser):
                 # password=None,
                 mode="page",
                 pages_delimiter="\n\f",
-                # extract_images = True,
-                # images_to_text = convert_images_to_text_with_tesseract(),
             )
 
         Lazily parse the blob:
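
The two commented-out options are dropped from the example, consistent with image OCR now being configured through an image blob parser. A sketch of the replacement configuration, assuming PyPDFium2Parser accepts the same `images_parser` keyword that this commit documents for PDFPlumberParser:

    from langchain_community.document_loaders.parsers import PyPDFium2Parser
    from langchain_community.document_loaders.parsers.images import (
        RapidOCRBlobParser,
    )

    parser = PyPDFium2Parser(
        mode="page",
        pages_delimiter="\n\f",
        # replaces the removed extract_images / images_to_text options
        images_parser=RapidOCRBlobParser(),
    )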
@@ -1362,7 +1365,9 @@ class PyPDFium2Parser(BaseBlobParser):
                     continue
                 numpy.save(image_bytes, image.get_bitmap().to_numpy())
                 blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy")
-                text_from_image = next(self.images_parser.lazy_parse(blob)).page_content
+                text_from_image = next(
+                    self.images_parser.lazy_parse(blob)  # type: ignore
+                ).page_content
                 str_images.append(
                     _format_inner_image(blob, text_from_image, self.images_inner_format)
                 )
@@ -1370,98 +1375,561 @@ class PyPDFium2Parser(BaseBlobParser):
         return _FORMAT_IMAGE_STR.format(image_text=_JOIN_IMAGES.join(str_images))
 
 
+# The legacy PDFPlumberParser used keys with upper case.
+# This is not aligned with the new convention, which requires the key to be in
+# lower case.
+class _PDFPlumberParserMetadata(dict):
+    _warning_keys: set[str] = set()
+
+    def __init__(self, d: dict[str, Any]):
+        super().__init__({k.lower(): v for k, v in d.items()})
+        self._pdf_metadata_keys = set(d.keys())
+
+    def _lower(self, k: object) -> object:
+        assert isinstance(k, str)
+        if k in self._pdf_metadata_keys:
+            lk = k.lower()
+            if lk != k:
+                if k not in _PDFPlumberParserMetadata._warning_keys:
+                    _PDFPlumberParserMetadata._warning_keys.add(str(k))
+                    logger.warning(
+                        'The key "%s" with uppercase is deprecated. '
+                        "Update your code and vectorstore.",
+                        k,
+                    )
+            return lk
+        else:
+            return k
+
+    def __contains__(self, k: object) -> bool:
+        return super().__contains__(self._lower(k))
+
+    def __delitem__(self, k: object) -> None:
+        super().__delitem__(self._lower(k))
+
+    def __getitem__(self, k: object) -> Any:
+        return super().__getitem__(self._lower(k))
+
+    def get(self, k: object, default: Any = None) -> Any:
+        return super().get(self._lower(str(k)), default)
+
+    def __setitem__(self, k: object, v: Any) -> None:
+        super().__setitem__(self._lower(str(k)), v)
+
+
 class PDFPlumberParser(BaseBlobParser):
-    """Parse `PDF` with `PDFPlumber`."""
+    """Parse a blob from a PDF using `pdfplumber` library.
+
+    This class provides methods to parse a blob from a PDF document, supporting various
+    configurations such as handling password-protected PDFs, extracting images, and
+    defining extraction mode.
+    It integrates the 'pdfplumber' library for PDF processing and offers synchronous
+    blob parsing.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pdfplumber
+
+        Load a blob from a PDF file:
+
+        .. code-block:: python
+
+            from langchain_core.documents.base import Blob
+
+            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
+
+        Instantiate the parser:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders.parsers import PDFPlumberParser
+
+            parser = PDFPlumberParser(
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                # extract_tables="markdown",
+            )
+
+        Lazily parse the blob:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = parser.lazy_parse(blob)
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """
 
     def __init__(
         self,
         text_kwargs: Optional[Mapping[str, Any]] = None,
         dedupe: bool = False,
         extract_images: bool = False,
+        *,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "page",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
+        extract_tables_settings: Optional[dict[str, Any]] = None,
     ) -> None:
         """Initialize the parser.
+
         Args:
+            password: Optional password for opening encrypted PDFs.
+            mode: The extraction mode, either "single" for the entire document or
+                "page" for page-wise extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
+            extract_images: Whether to extract images from the PDF.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link, w/ link
+                    pointing to (`![body)(#)`]
+                - "html-img" = wrap the content as the `alt` text of an tag and link to
+                    (`<img alt="{body}" src="#"/>`)
+            extract_tables: Whether to extract tables from the PDF in a specific
+                format, such as "csv", "markdown" or "html".
             text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
-            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
+            dedupe: Avoiding the error of duplicate characters if `dedupe=True`
+            extract_tables_settings: Optional dictionary of settings for customizing
+                table extraction.
+
+        Returns:
+            This method does not directly return data. Use the `parse` or `lazy_parse`
+            methods to retrieve parsed documents with content and metadata.
+
+        Raises:
+            ValueError: If the `mode` is not "single" or "page".
+            ValueError: If the `extract_tables` is not "csv", "markdown" or "html".
+
         """
-        try:
-            import PIL  # noqa:F401
-        except ImportError:
-            raise ImportError(
-                "pillow package not found, please install it with `pip install pillow`"
-            )
-        self.text_kwargs = text_kwargs or {}
-        self.dedupe = dedupe
-        self.extract_images = extract_images
+        super().__init__()
+        if mode not in ["single", "page"]:
+            raise ValueError("mode must be single or page")
+        if extract_tables and extract_tables not in ["csv", "markdown", "html"]:
+            raise ValueError("mode must be csv, markdown or html")
+        if not extract_images and not images_parser:
+            images_parser = RapidOCRBlobParser()
+        self.password = password
+        self.extract_images = extract_images
+        self.images_parser = images_parser
+        self.images_inner_format = images_inner_format
+        self.mode = mode
+        self.pages_delimiter = pages_delimiter
+        self.dedupe = dedupe
+        self.text_kwargs = text_kwargs or {}
+        self.extract_tables = extract_tables
+        self.extract_tables_settings = extract_tables_settings or {
+            "vertical_strategy": "lines",
+            "horizontal_strategy": "lines",
+            "snap_y_tolerance": 5,
+            "intersection_x_tolerance": 15,
+        }
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """Lazily parse the blob."""
-        import pdfplumber
+        """
+        Lazily parse the blob.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+
+        Args:
+            blob: The blob to parse.
+
+        Raises:
+            ImportError: If the `pdfplumber` package is not found.
+
+        Yield:
+            An iterator over the parsed documents.
+        """
+        try:
+            import pdfplumber
+        except ImportError:
+            raise ImportError(
+                "pdfplumber package not found, please install it "
+                "with `pip install pdfplumber`"
+            )
 
         with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
-            doc = pdfplumber.open(file_path)  # open document
+            doc = pdfplumber.open(file_path, password=self.password)  # open document
+            from pdfplumber.utils import geometry  # import WordExtractor, TextMap
 
-            yield from [
-                Document(
-                    page_content=self._process_page_content(page)
-                    + "\n"
-                    + self._extract_images_from_page(page),
-                    metadata=dict(
-                        {
-                            "source": blob.source,  # type: ignore[attr-defined]
-                            "file_path": blob.source,  # type: ignore[attr-defined]
-                            "page": page.page_number - 1,
-                            "total_pages": len(doc.pages),
-                        },
-                        **{
-                            k: doc.metadata[k]
-                            for k in doc.metadata
-                            if type(doc.metadata[k]) in [str, int]
-                        },
-                    ),
-                )
-                for page in doc.pages
-            ]
+            contents = []
+            doc_metadata = _purge_metadata(
+                (
+                    doc.metadata
+                    | {
+                        "source": blob.source,
+                        "file_path": blob.source,
+                        "total_pages": len(doc.pages),
+                    }
+                )
+            )
+            for page in doc.pages:
+                tables_bbox: list[tuple[float, float, float, float]] = (
+                    self._extract_tables_bbox_from_page(page)
+                )
+                tables_content = self._extract_tables_from_page(page)
+                images_bbox = [geometry.obj_to_bbox(image) for image in page.images]
+                image_from_page = self._extract_images_from_page(page)
+                page_text = []
+                extras = []
+                for content in self._split_page_content(
+                    page,
+                    tables_bbox,
+                    tables_content,
+                    images_bbox,
+                    image_from_page,
+                ):
+                    if isinstance(content, str):  # Text
+                        page_text.append(content)
+                    elif isinstance(content, list):  # Table
+                        page_text.append(_JOIN_TABLES + self._convert_table(content))
+                    else:  # Image
+                        image_bytes = io.BytesIO()
+                        numpy.save(image_bytes, content)
+                        blob = Blob.from_data(
+                            image_bytes.getvalue(), mime_type="application/x-npy"
+                        )
+                        text_from_image = next(
+                            self.images_parser.lazy_parse(blob)  # type: ignore
+                        ).page_content
+                        extras.append(
+                            _format_inner_image(
+                                blob, text_from_image, self.images_inner_format
+                            )
+                        )
+
+                all_text = _merge_text_and_extras(extras, "".join(page_text).strip())
+
+                if self.mode == "page":
+                    # For legacy compatibility, add the last '\n'
+                    if not all_text.endswith("\n"):
+                        all_text += "\n"
+                    yield Document(
+                        page_content=all_text,
+                        metadata=_validate_metadata(
+                            _PDFPlumberParserMetadata(
+                                doc_metadata
+                                | {
+                                    "page": page.page_number - 1,
+                                }
+                            )
+                        ),
+                    )
+                else:
+                    contents.append(all_text)
+                # "tables_as_html": [self._convert_table_to_html(table)
+                #                    for
+                #                    table in tables_content],
+                # "images": images_content,
+                # tables_as_html.extend([self._convert_table(table)
+                #                        for
+                #                        table in tables_content])
+            if self.mode == "single":
+                yield Document(
+                    page_content=self.pages_delimiter.join(contents),
+                    metadata=_validate_metadata(
+                        _PDFPlumberParserMetadata(doc_metadata)
+                    ),
+                )
 
     def _process_page_content(self, page: pdfplumber.page.Page) -> str:
-        """Process the page content based on dedupe."""
+        """Process the page content based on dedupe.
+
+        Args:
+            page: The PDF page to process.
+
+        Returns:
+            The extracted text from the page.
+        """
         if self.dedupe:
             return page.dedupe_chars().extract_text(**self.text_kwargs)
         return page.extract_text(**self.text_kwargs)
 
-    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
-        """Extract images from page and get the text with RapidOCR."""
+    def _split_page_content(
+        self,
+        page: pdfplumber.page.Page,
+        tables_bbox: list[tuple[float, float, float, float]],
+        tables_content: list[list[list[Any]]],
+        images_bbox: list[tuple[float, float, float, float]],
+        images_content: list[np.ndarray],
+        **kwargs: Any,
+    ) -> Iterator[Union[str, list[list[str]], np.ndarray]]:
+        """Split the page content into text, tables, and images.
+
+        Args:
+            page: The PDF page to process.
+            tables_bbox: Bounding boxes of tables on the page.
+            tables_content: Content of tables on the page.
+            images_bbox: Bounding boxes of images on the page.
+            images_content: Content of images on the page.
+            **kwargs: Additional keyword arguments.
+
+        Yields:
+            An iterator over the split content (text, tables, images).
+        """
+        from pdfplumber.utils import (
+            geometry,
+            text,
+        )
+
+        # Iterate over words. If a word is in a table,
+        # yield the accumulated text, and the table.
+        # If the word is in a previously seen table, ignore it.
+        # Finish with the accumulated text.
+        kwargs.update(
+            {
+                "keep_blank_chars": True,
+                # "use_text_flow": True,
+                "presorted": True,
+                "layout_bbox": kwargs.get("layout_bbox")
+                # or geometry.objects_to_bbox(page.chars),
+                or page.cropbox,
+            }
+        )
+        chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars
+
+        extractor = text.WordExtractor(
+            **{k: kwargs[k] for k in text.WORD_EXTRACTOR_KWARGS if k in kwargs}
+        )
+        wordmap = extractor.extract_wordmap(chars)
+        extract_wordmaps: list[Any] = []
+        used_arrays = [False] * len(tables_bbox)
+        for word, o in wordmap.tuples:
+            # print(f"  Try with '{word['text']}' ...")
+            is_table = False
+            word_bbox = geometry.obj_to_bbox(word)
+            for i, table_bbox in enumerate(tables_bbox):
+                if geometry.get_bbox_overlap(word_bbox, table_bbox):
+                    # Found a word in a table
+                    # print("  Find in an array")
+                    is_table = True
+                    if not used_arrays[i]:
+                        # First time I see a word in this array
+                        # Yield the previous part
+                        if extract_wordmaps:
+                            new_wordmap = text.WordMap(tuples=extract_wordmaps)
+                            new_textmap = new_wordmap.to_textmap(
+                                **{
+                                    k: kwargs[k]
+                                    for k in text.TEXTMAP_KWARGS
+                                    if k in kwargs
+                                }
+                            )
+                            # print(f"yield {new_textmap.to_string()}")
+                            yield new_textmap.to_string()
+                            extract_wordmaps.clear()
+                        # and yield the table
+                        used_arrays[i] = True
+                        # print(f"yield table {i}")
+                        yield tables_content[i]
+                    break
+            if not is_table:
+                # print(f'  Add {word["text"]}')
+                extract_wordmaps.append((word, o))
+        if extract_wordmaps:
+            # Text after the array ?
+            new_wordmap = text.WordMap(tuples=extract_wordmaps)
+            new_textmap = new_wordmap.to_textmap(
+                **{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs}
+            )
+            # print(f"yield {new_textmap.to_string()}")
+            yield new_textmap.to_string()
+        # Add images
+        for content in images_content:
+            yield content
+
+    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> list[np.ndarray]:
+        """Extract images from a PDF page.
+
+        Args:
+            page: The PDF page to extract images from.
+
+        Returns:
+            A list of extracted images as numpy arrays.
+        """
         from PIL import Image
 
-        if not self.extract_images:
-            return ""
+        if not self.images_parser:
+            return []
+
         images = []
         for img in page.images:
-            if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
-                if img["stream"]["BitsPerComponent"] == 1:
-                    images.append(
-                        np.array(
-                            Image.frombytes(
-                                "1",
-                                (img["stream"]["Width"], img["stream"]["Height"]),
-                                img["stream"].get_data(),
-                            ).convert("L")
-                        )
-                    )
-                else:
-                    images.append(
-                        np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
-                            img["stream"]["Height"], img["stream"]["Width"], -1
-                        )
-                    )
-            elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
-                images.append(img["stream"].get_data())
-            else:
-                warnings.warn("Unknown PDF Filter!")
-
-        return extract_from_images_with_rapidocr(images)
+            if "Filter" in img["stream"]:
+                if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
+                    images.append(
+                        np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
+                            img["stream"]["Height"], img["stream"]["Width"], -1
+                        )
+                    )
+                elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
+                    buf = np.frombuffer(img["stream"].get_data(), dtype=np.uint8)
+                    images.append(
+                        np.array(Image.open(io.BytesIO(buf.tobytes())))  # type: ignore
+                    )
+                else:
+                    logger.warning("Unknown PDF Filter!")
+
+        return images
+
+    def _extract_tables_bbox_from_page(
+        self,
+        page: pdfplumber.page.Page,
+    ) -> list[tuple]:
+        """Extract bounding boxes of tables from a PDF page.
+
+        Args:
+            page: The PDF page to extract table bounding boxes from.
+
+        Returns:
+            A list of bounding boxes for tables on the page.
+        """
+        if not self.extract_tables:
+            return []
+        from pdfplumber.table import TableSettings
+
+        table_settings = self.extract_tables_settings
+        tset = TableSettings.resolve(table_settings)
+        return [table.bbox for table in page.find_tables(tset)]
+
+    def _extract_tables_from_page(
+        self,
+        page: pdfplumber.page.Page,
+    ) -> list[list[list[Any]]]:
+        """Extract tables from a PDF page.
+
+        Args:
+            page: The PDF page to extract tables from.
+
+        Returns:
+            A list of tables, where each table is a list of rows, and each row is a
+            list of cell values.
+        """
+        if not self.extract_tables:
+            return []
+        table_settings = self.extract_tables_settings
+        tables_list = page.extract_tables(table_settings)
+        return tables_list
+
+    def _convert_table(self, table: list[list[str]]) -> str:
+        """Convert a table to the specified format.
+
+        Args:
+            table: The table to convert.
+
+        Returns:
+            The table content as a string in the specified format.
+        """
+        format = self.extract_tables
+        if format is None:
+            return ""
+        if format == "markdown":
+            return self._convert_table_to_markdown(table)
+        elif format == "html":
+            return self._convert_table_to_html(table)
+        elif format == "csv":
+            return self._convert_table_to_csv(table)
+        else:
+            raise ValueError(f"Unknown table format: {format}")
+
+    def _convert_table_to_csv(self, table: list[list[str]]) -> str:
+        """Convert a table to CSV format.
+
+        Args:
+            table: The table to convert.
+
+        Returns:
+            The table content as a string in CSV format.
+        """
+        if not table:
+            return ""
+
+        output = ["\n\n"]
+
+        # skip first row in details if header is part of the table
+        # j = 0 if self.header.external else 1
+
+        # iterate over detail rows
+        for row in table:
+            line = ""
+            for i, cell in enumerate(row):
+                # output None cells with empty string
+                cell = "" if cell is None else cell.replace("\n", " ")
+                line += cell + ","
+            output.append(line)
+        return "\n".join(output) + "\n\n"
+
+    def _convert_table_to_html(self, table: list[list[str]]) -> str:
+        """
+        Convert table content as a string in HTML format.
+        If clean is true, markdown syntax is removed from cell content.
+
+        Args:
+            table: The table to convert.
+
+        Returns:
+            The table content as a string in HTML format.
+        """
+        if not len(table):
+            return ""
+        output = "<table>\n"
+        clean = True
+
+        # iterate over detail rows
+        for row in table:
+            line = "<tr>"
+            for i, cell in enumerate(row):
+                # output None cells with empty string
+                cell = "" if cell is None else cell.replace("\n", " ")
+                if clean:  # remove sensitive syntax
+                    cell = html.escape(cell.replace("-", "-"))
+                line += "<td>" + cell + "</td>"
+            line += "</tr>\n"
+            output += line
+        return output + "</table>\n"
+
+    def _convert_table_to_markdown(self, table: list[list[str]]) -> str:
+        """Convert table content as a string in Github-markdown format.
+
+        Args:
+            table: The table to convert.
+
+        Returns:
+            The table content as a string in Markdown format.
+        """
+        clean = False
+        if not table:
+            return ""
+        col_count = len(table[0])
+
+        output = "|" + "|".join("" for i in range(col_count)) + "|\n"
+        output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
+
+        # skip first row in details if header is part of the table
+        # j = 0 if self.header.external else 1
+
+        # iterate over detail rows
+        for row in table:
+            line = "|"
+            for i, cell in enumerate(row):
+                # output None cells with empty string
+                cell = "" if cell is None else cell.replace("\n", " ")
+                if clean:  # remove sensitive syntax
+                    cell = html.escape(cell.replace("-", "-"))
+                line += cell + "|"
+            line += "\n"
+            output += line
+        return output + "\n"
+
+
 class AmazonTextractPDFParser(BaseBlobParser):
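
Taken together, the new pieces can be exercised as below. This is an illustrative sketch, not part of the commit; the file path is a placeholder, and `_PDFPlumberParserMetadata` is private, imported here only to show the key-normalization behavior:

    from langchain_core.documents.base import Blob
    from langchain_community.document_loaders.parsers import PDFPlumberParser
    from langchain_community.document_loaders.parsers.pdf import (
        _PDFPlumberParserMetadata,
    )

    # Case-insensitive metadata: legacy uppercase keys from pdfplumber are
    # lower-cased once; uppercase lookups still resolve but log a one-time
    # deprecation warning.
    md = _PDFPlumberParserMetadata({"CreationDate": "2021-06-22", "source": "x.pdf"})
    assert md["creationdate"] == md["CreationDate"] == "2021-06-22"

    # Table-aware parsing: tables detected by pdfplumber's "lines" strategy
    # are re-emitted inline as GitHub-style markdown.
    blob = Blob.from_path("./example_data/layout-parser-paper.pdf")  # placeholder
    parser = PDFPlumberParser(mode="page", extract_tables="markdown")
    for doc in parser.lazy_parse(blob):
        print(doc.metadata["page"], doc.page_content[:80])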
@@ -1013,7 +1013,59 @@ class MathpixPDFLoader(BasePDFLoader):
 
 
 class PDFPlumberLoader(BasePDFLoader):
-    """Load `PDF` files using `pdfplumber`."""
+    """Load and parse a PDF file using 'pdfplumber' library.
+
+    This class provides methods to load and parse PDF documents, supporting various
+    configurations such as handling password-protected files, extracting images, and
+    defining extraction mode. It integrates the `pdfplumber` library for PDF processing
+    and offers both synchronous and asynchronous document loading.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pdfplumber
+
+        Instantiate the loader:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders import PDFPlumberLoader
+
+            loader = PDFPlumberLoader(
+                file_path = "./example_data/layout-parser-paper.pdf",
+                # headers = None
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                images_inner_format = "text",
+                # extract_tables = None,
+                # extract_tables_settings = None,
+                # text_kwargs = {"use_text_flow": False, "keep_blank_chars": False},
+                # dedupe = False,
+            )
+
+        Lazy load documents:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        Load documents asynchronously:
+
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """
 
     def __init__(
         self,
@@ -1022,34 +1074,78 @@ class PDFPlumberLoader(BasePDFLoader):
         dedupe: bool = False,
         headers: Optional[dict] = None,
         extract_images: bool = False,
+        *,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "page",
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
+        extract_tables_settings: Optional[dict[str, Any]] = None,
     ) -> None:
-        """Initialize with a file path."""
-        try:
-            import pdfplumber  # noqa:F401
-        except ImportError:
-            raise ImportError(
-                "pdfplumber package not found, please install it with "
-                "`pip install pdfplumber`"
-            )
+        """Initialize with a file path.
+
+        Args:
+            file_path: The path to the PDF file to be loaded.
+            headers: Optional headers to use for GET request to download a file from a
+                web path.
+            password: Optional password for opening encrypted PDFs.
+            mode: The extraction mode, either "single" for the entire document or
+                "page" for page-wise extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
+            extract_images: Whether to extract images from the PDF.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link, w/ link
+                    pointing to (`![body)(#)`]
+                - "html-img" = wrap the content as the `alt` text of an tag and link to
+                    (`<img alt="{body}" src="#"/>`)
+            extract_tables: Whether to extract tables in a specific format, such as
+                "csv", "markdown", or "html".
+            extract_tables_settings: Optional dictionary of settings for customizing
+                table extraction.
+            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
+            dedupe: Avoiding the error of duplicate characters if `dedupe=True`
+
+        Returns:
+            This method does not directly return data. Use the `load`, `lazy_load`,
+            or `aload` methods to retrieve parsed documents with content and metadata.
+
+        Raises:
+            ImportError: If the `pdfplumber` package is not installed.
+        """
         super().__init__(file_path, headers=headers)
-        self.text_kwargs = text_kwargs or {}
-        self.dedupe = dedupe
-        self.extract_images = extract_images
-
-    def load(self) -> list[Document]:
-        """Load file."""
-
-        parser = PDFPlumberParser(
-            text_kwargs=self.text_kwargs,
-            dedupe=self.dedupe,
-            extract_images=self.extract_images,
-        )
+        self.parser = PDFPlumberParser(
+            password=password,
+            mode=mode,
+            pages_delimiter=pages_delimiter,
+            extract_images=extract_images,
+            images_parser=images_parser,
+            images_inner_format=images_inner_format,
+            extract_tables=extract_tables,
+            text_kwargs=text_kwargs,
+            extract_tables_settings=extract_tables_settings,
+            dedupe=dedupe,
+        )
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """
+        Lazy load given path as pages.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+        """
         if self.web_path:
-            blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)  # type: ignore[attr-defined]
+            blob = Blob.from_data(  # type: ignore[attr-defined]
+                open(self.file_path, "rb").read(), path=self.web_path
+            )
         else:
             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
-        return parser.parse(blob)
+        yield from self.parser.lazy_parse(blob)
 
 
 class AmazonTextractPDFLoader(BasePDFLoader):
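
A minimal end-to-end sketch of the refactored loader (illustrative, not part of the commit; the path is a placeholder):

    from langchain_community.document_loaders import PDFPlumberLoader

    loader = PDFPlumberLoader(
        "./example_data/layout-parser-paper.pdf",  # placeholder path
        mode="single",           # one Document for the whole file
        pages_delimiter="\n\f",  # pages joined with a form feed
    )

    # lazy_load() now streams Documents through PDFPlumberParser.lazy_parse()
    # instead of materializing the whole list, as load() previously did.
    for doc in loader.lazy_load():
        print(doc.metadata["source"], len(doc.page_content))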
@@ -9,10 +9,7 @@ import pytest
 import langchain_community.document_loaders.parsers as pdf_parsers
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
-from langchain_community.document_loaders.parsers import (
-    BaseImageBlobParser,
-    PDFPlumberParser,
-)
+from langchain_community.document_loaders.parsers import BaseImageBlobParser
 
 if TYPE_CHECKING:
     from PIL.Image import Image
@@ -95,13 +92,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
     assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
 
 
-def test_pdfplumber_parser() -> None:
-    """Test PDFPlumber parser."""
-    _assert_with_parser(PDFPlumberParser())
-    _assert_with_duplicate_parser(PDFPlumberParser())
-    _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
-
-
 class EmptyImageBlobParser(BaseImageBlobParser):
     def _analyze_image(self, img: "Image") -> str:
        return "Hello world"
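
The dedicated test removed above is not lost: the hunks below fold PDFPlumberParser into the suite's existing parametrized matrices. A sketch of that pattern, with a hypothetical test name and body (the real assertions live in the suite's helpers):

    import pytest

    import langchain_community.document_loaders.parsers as pdf_parsers


    @pytest.mark.parametrize(
        "parser_factory,params",
        [
            ("PDFMinerParser", {}),
            ("PDFPlumberParser", {}),  # the row these hunks add
            ("PyMuPDFParser", {}),
        ],
    )
    def test_parser_smoke(parser_factory: str, params: dict) -> None:
        # Resolve the parser class by name from the parsers module,
        # mirroring how the suite instantiates each parametrized parser.
        parser_class = getattr(pdf_parsers, parser_factory)
        parser = parser_class(**params)
        assert parser is not None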
@@ -115,6 +105,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
     "parser_factory,params",
     [
         ("PDFMinerParser", {}),
+        ("PDFPlumberParser", {}),
         ("PyMuPDFParser", {}),
         ("PyPDFium2Parser", {}),
         ("PyPDFParser", {"extraction_mode": "plain"}),
@@ -145,6 +136,7 @@ def test_mode_and_extract_images_variations(
     "parser_factory,params",
     [
         ("PDFMinerParser", {}),
+        ("PDFPlumberParser", {}),
         ("PyMuPDFParser", {}),
         ("PyPDFium2Parser", {}),
         ("PyPDFParser", {"extraction_mode": "plain"}),
@@ -245,6 +237,7 @@ def _test_matrix(
 @pytest.mark.parametrize(
     "parser_factory,params",
     [
+        ("PDFPlumberParser", {}),
         ("PyMuPDFParser", {}),
     ],
 )
@@ -167,6 +167,7 @@ def test_amazontextract_loader_failures() -> None:
     "parser_factory,params",
     [
         ("PDFMinerLoader", {}),
+        ("PDFPlumberLoader", {}),
         ("PyMuPDFLoader", {}),
         ("PyPDFium2Loader", {}),
         ("PyPDFLoader", {}),
@@ -9,7 +9,9 @@ import pytest
 import langchain_community.document_loaders.parsers as pdf_parsers
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
-from langchain_community.document_loaders.parsers.pdf import _merge_text_and_extras
+from langchain_community.document_loaders.parsers.pdf import (
+    _merge_text_and_extras,
+)
 
 _THIS_DIR = Path(__file__).parents[3]
 
@@ -75,6 +77,7 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
     "parser_factory,require,params",
     [
         ("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
+        ("PDFPlumberParser", "pdfplumber", {}),
         ("PyMuPDFParser", "pymupdf", {}),
         ("PyPDFParser", "pypdf", {}),
         ("PyPDFium2Parser", "pypdfium2", {}),