Merge remote-tracking branch 'origin/pprados/06-pdfplumber' into pprados/06-pdfplumber

2025-08-16 16:11:02 +00:00 · 2025-03-07 14:45:44 +01:00 · 2025-03-07 14:45:44 +01:00 · 89903c87ee
commit 89903c87ee
parent 76b3d6bdb8 1bc4c91244
1 changed files with 117 additions and 128 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@ -10,10 +10,6 @@ import warnings
 from datetime import datetime
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from urllib.parse import urlparse
-import numpy
-import numpy as np
 from typing import (
    TYPE_CHECKING,
    Any,
@ -27,6 +23,11 @@ from typing import (
    Union,
    cast,
 )
+from urllib.parse import urlparse
+import numpy
+import numpy as np
+from langchain_core.documents import Document
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@ -34,7 +35,6 @@ from langchain_community.document_loaders.parsers.images import (
    BaseImageBlobParser,
    RapidOCRBlobParser,
 )
-from langchain_core.documents import Document
 if TYPE_CHECKING:
    import pdfplumber
@ -1266,8 +1266,8 @@ class PyPDFium2Parser(BaseBlobParser):
        self.pages_delimiter = pages_delimiter
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """
+        """Lazily parse the blob.
-        Lazily parse the blob.
+
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.
@ -1469,7 +1469,6 @@ class PDFPlumberParser(BaseBlobParser):
        Raises:
            ValueError: If the `mode` is not "single" or "page".
            ValueError: If the `extract_tables` is not "csv", "markdown" or "html".
        """
        super().__init__()
        if mode not in ["single", "page"]:
@ -1495,10 +1494,7 @@ class PDFPlumberParser(BaseBlobParser):
        }
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """
+        """Lazily parse the blob.
-        Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.
        Args:
            blob: The blob to parse.
@ -1534,8 +1530,8 @@ class PDFPlumberParser(BaseBlobParser):
            # the metadatas.
            doc_metadata = (
-                    doc.metadata |  # Legacy metdata with...
+                doc.metadata  # Legacy metdata with...
-                    _purge_metadata(
+                | _purge_metadata(
                    (
                        doc.metadata  # Add parser metdata
                        | {  # with more keys
@ -1696,23 +1692,19 @@ class PDFPlumberParser(BaseBlobParser):
                            )
                            yield new_textmap.to_string()
                            extract_wordmaps.clear()
-                        # and yield the table
+                        # And yield the table
                        used_arrays[i] = True
                        # print(f"yield table {i}")
                        yield tables_content[i]
                    break
            if not is_table:
                # print(f'  Add {word["text"]}')
                extract_wordmaps.append((word, o))
        if extract_wordmaps:
            # Text after the array ?
            new_wordmap = text.WordMap(tuples=extract_wordmaps)
            new_textmap = new_wordmap.to_textmap(
                **{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs}
            )
            # print(f"yield {new_textmap.to_string()}")
            yield new_textmap.to_string()
-        # Add images-
+        # Add images
        for content in images_content:
            yield content
@ -1882,7 +1874,6 @@ class PDFPlumberParser(BaseBlobParser):
        output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
        # skip first row in details if header is part of the table
        # iterate over detail rows
        for row in table:
            line = "|"
@ -2013,8 +2004,7 @@ class AmazonTextractPDFParser(BaseBlobParser):
        the blob.data is taken
        """
-        url_parse_result = urlparse(
+        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
-            str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
        # Either call with S3 path (multi-page) or with bytes (single-page)
        if (
            url_parse_result
@ -2060,8 +2050,7 @@ class DocumentIntelligenceParser(BaseBlobParser):
        self.client = client
        self.model = model
-    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
+    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
-        Document]:  # type: ignore[valid-type]
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])