Merge legacy and standard metadata keys in pdf parser.

This commit is contained in:
Philippe Prados 2025-03-07 14:45:37 +01:00
parent fa47539b60
commit 76b3d6bdb8

View File

@ -10,6 +10,10 @@ import warnings
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from urllib.parse import urlparse
import numpy
import numpy as np
from typing import ( from typing import (
TYPE_CHECKING, TYPE_CHECKING,
Any, Any,
@ -23,11 +27,6 @@ from typing import (
Union, Union,
cast, cast,
) )
from urllib.parse import urlparse
import numpy
import numpy as np
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
@ -35,6 +34,7 @@ from langchain_community.document_loaders.parsers.images import (
BaseImageBlobParser, BaseImageBlobParser,
RapidOCRBlobParser, RapidOCRBlobParser,
) )
from langchain_core.documents import Document
if TYPE_CHECKING: if TYPE_CHECKING:
import pdfplumber import pdfplumber
@ -62,7 +62,7 @@ _PDF_FILTER_WITHOUT_LOSS = [
def extract_from_images_with_rapidocr( def extract_from_images_with_rapidocr(
images: Sequence[Union[Iterable[np.ndarray], bytes]], images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str: ) -> str:
"""Extract text from images with RapidOCR. """Extract text from images with RapidOCR.
@ -201,7 +201,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
""" """
def _recurs_merge_text_and_extras( def _recurs_merge_text_and_extras(
extras: list[str], text_from_page: str, recurs: bool extras: list[str], text_from_page: str, recurs: bool
) -> Optional[str]: ) -> Optional[str]:
if extras: if extras:
for delim in _PARAGRAPH_DELIMITER: for delim in _PARAGRAPH_DELIMITER:
@ -221,7 +221,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
if str_extras: if str_extras:
all_extras = delim + str_extras all_extras = delim + str_extras
all_text = ( all_text = (
text_from_page[:pos] + all_extras + text_from_page[pos:] text_from_page[:pos] + all_extras + text_from_page[pos:]
) )
break break
else: else:
@ -291,16 +291,16 @@ class PyPDFParser(BaseBlobParser):
""" """
def __init__( def __init__(
self, self,
password: Optional[Union[str, bytes]] = None, password: Optional[Union[str, bytes]] = None,
extract_images: bool = False, extract_images: bool = False,
*, *,
mode: Literal["single", "page"] = "page", mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extraction_mode: Literal["plain", "layout"] = "plain", extraction_mode: Literal["plain", "layout"] = "plain",
extraction_kwargs: Optional[dict[str, Any]] = None, extraction_kwargs: Optional[dict[str, Any]] = None,
): ):
"""Initialize a parser based on PyPDF. """Initialize a parser based on PyPDF.
@ -519,15 +519,15 @@ class PDFMinerParser(BaseBlobParser):
_warn_concatenate_pages = False _warn_concatenate_pages = False
def __init__( def __init__(
self, self,
extract_images: bool = False, extract_images: bool = False,
*, *,
password: Optional[str] = None, password: Optional[str] = None,
mode: Literal["single", "page"] = "single", mode: Literal["single", "page"] = "single",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
concatenate_pages: Optional[bool] = None, concatenate_pages: Optional[bool] = None,
): ):
"""Initialize a parser based on PDFMiner. """Initialize a parser based on PDFMiner.
@ -629,10 +629,10 @@ class PDFMinerParser(BaseBlobParser):
return obj return obj
def _get_metadata( def _get_metadata(
self, self,
fp: BinaryIO, fp: BinaryIO,
password: str = "", password: str = "",
caching: bool = True, caching: bool = True,
) -> dict[str, Any]: ) -> dict[str, Any]:
""" """
Extract metadata from a PDF file. Extract metadata from a PDF file.
@ -726,10 +726,10 @@ class PDFMinerParser(BaseBlobParser):
class Visitor(PDFLayoutAnalyzer): class Visitor(PDFLayoutAnalyzer):
def __init__( def __init__(
self, self,
rsrcmgr: PDFResourceManager, rsrcmgr: PDFResourceManager,
pageno: int = 1, pageno: int = 1,
laparams: Optional[LAParams] = None, laparams: Optional[LAParams] = None,
) -> None: ) -> None:
super().__init__(rsrcmgr, pageno=pageno, laparams=laparams) super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
@ -855,17 +855,17 @@ class PyMuPDFParser(BaseBlobParser):
_lock = threading.Lock() _lock = threading.Lock()
def __init__( def __init__(
self, self,
text_kwargs: Optional[dict[str, Any]] = None, text_kwargs: Optional[dict[str, Any]] = None,
extract_images: bool = False, extract_images: bool = False,
*, *,
password: Optional[str] = None, password: Optional[str] = None,
mode: Literal["single", "page"] = "page", mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
extract_tables_settings: Optional[dict[str, Any]] = None, extract_tables_settings: Optional[dict[str, Any]] = None,
) -> None: ) -> None:
"""Initialize a parser based on PyMuPDF. """Initialize a parser based on PyMuPDF.
@ -921,11 +921,11 @@ class PyMuPDFParser(BaseBlobParser):
) )
def _lazy_parse( def _lazy_parse(
self, self,
blob: Blob, blob: Blob,
# text-kwargs is present for backwards compatibility. # text-kwargs is present for backwards compatibility.
# Users should not use it directly. # Users should not use it directly.
text_kwargs: Optional[dict[str, Any]] = None, text_kwargs: Optional[dict[str, Any]] = None,
) -> Iterator[Document]: # type: ignore[valid-type] ) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob. """Lazily parse the blob.
Insert image, if possible, between two paragraphs. Insert image, if possible, between two paragraphs.
@ -1014,10 +1014,10 @@ class PyMuPDFParser(BaseBlobParser):
) )
def _get_page_content( def _get_page_content(
self, self,
doc: pymupdf.Document, doc: pymupdf.Document,
page: pymupdf.Page, page: pymupdf.Page,
text_kwargs: dict[str, Any], text_kwargs: dict[str, Any],
) -> str: ) -> str:
"""Get the text of the page using PyMuPDF and RapidOCR and issue a warning """Get the text of the page using PyMuPDF and RapidOCR and issue a warning
if it is empty. if it is empty.
@ -1075,7 +1075,7 @@ class PyMuPDFParser(BaseBlobParser):
return metadata return metadata
def _extract_images_from_page( def _extract_images_from_page(
self, doc: pymupdf.Document, page: pymupdf.Page self, doc: pymupdf.Document, page: pymupdf.Page
) -> str: ) -> str:
"""Extract images from a PDF page and get the text using images_to_text. """Extract images from a PDF page and get the text using images_to_text.
@ -1216,14 +1216,14 @@ class PyPDFium2Parser(BaseBlobParser):
_lock = threading.Lock() _lock = threading.Lock()
def __init__( def __init__(
self, self,
extract_images: bool = False, extract_images: bool = False,
*, *,
password: Optional[str] = None, password: Optional[str] = None,
mode: Literal["single", "page"] = "page", mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
) -> None: ) -> None:
"""Initialize a parser based on PyPDFium2. """Initialize a parser based on PyPDFium2.
@ -1426,18 +1426,18 @@ class PDFPlumberParser(BaseBlobParser):
""" """
def __init__( def __init__(
self, self,
text_kwargs: Optional[Mapping[str, Any]] = None, text_kwargs: Optional[Mapping[str, Any]] = None,
dedupe: bool = False, dedupe: bool = False,
extract_images: bool = False, extract_images: bool = False,
*, *,
password: Optional[str] = None, password: Optional[str] = None,
mode: Literal["single", "page"] = "page", mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extract_tables: Optional[Literal["csv", "markdown", "html"]] = None, extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
extract_tables_settings: Optional[dict[str, Any]] = None, extract_tables_settings: Optional[dict[str, Any]] = None,
) -> None: ) -> None:
"""Initialize the parser. """Initialize the parser.
@ -1522,15 +1522,29 @@ class PDFPlumberParser(BaseBlobParser):
from pdfplumber.utils import geometry from pdfplumber.utils import geometry
contents = [] contents = []
doc_metadata = doc.metadata | _purge_metadata( # The legacy version, use CreationDate, Creator, etc.
( # The new 'standard' version must use lower case key.
doc.metadata # These next lines merge the legacy keys and standard keys
| { # in the same dictionary.
"source": blob.source, # - The CreationDate is duplicate to `creationdate` with iso format.
"file_path": blob.source, # - The Creator is duplicate to 'creator', etc.
"total_pages": len(doc.pages), # With this strategy, the legacy code can continue to use CreationDate
} # or Creator. The new code, can use `creationdate` or `creator`.
) # _purge_metadata() converts and normalizes the names and formats of
# the metadata.
doc_metadata = (
doc.metadata | # Legacy metadata with...
_purge_metadata(
(
doc.metadata # Add parser metadata
| { # with more keys
"source": blob.source,
"file_path": blob.source,
"total_pages": len(doc.pages),
}
)
)
) )
for page in doc.pages: for page in doc.pages:
@ -1543,11 +1557,11 @@ class PDFPlumberParser(BaseBlobParser):
page_text = [] page_text = []
extras = [] extras = []
for content in self._split_page_content( for content in self._split_page_content(
page, page,
tables_bbox, tables_bbox,
tables_content, tables_content,
images_bbox, images_bbox,
image_from_page, image_from_page,
): ):
if isinstance(content, str): # Text if isinstance(content, str): # Text
page_text.append(content) page_text.append(content)
@ -1615,13 +1629,13 @@ class PDFPlumberParser(BaseBlobParser):
return page.extract_text(**self.text_kwargs) return page.extract_text(**self.text_kwargs)
def _split_page_content( def _split_page_content(
self, self,
page: pdfplumber.page.Page, page: pdfplumber.page.Page,
tables_bbox: list[tuple[float, float, float, float]], tables_bbox: list[tuple[float, float, float, float]],
tables_content: list[list[list[Any]]], tables_content: list[list[list[Any]]],
images_bbox: list[tuple[float, float, float, float]], images_bbox: list[tuple[float, float, float, float]],
images_content: list[np.ndarray], images_content: list[np.ndarray],
**kwargs: Any, **kwargs: Any,
) -> Iterator[Union[str, list[list[str]], np.ndarray]]: ) -> Iterator[Union[str, list[list[str]], np.ndarray]]:
"""Split the page content into text, tables, and images. """Split the page content into text, tables, and images.
@ -1736,8 +1750,8 @@ class PDFPlumberParser(BaseBlobParser):
return images return images
def _extract_tables_bbox_from_page( def _extract_tables_bbox_from_page(
self, self,
page: pdfplumber.page.Page, page: pdfplumber.page.Page,
) -> list[tuple[float, float, float, float]]: ) -> list[tuple[float, float, float, float]]:
"""Extract bounding boxes of tables from a PDF page. """Extract bounding boxes of tables from a PDF page.
@ -1756,8 +1770,8 @@ class PDFPlumberParser(BaseBlobParser):
return [table.bbox for table in page.find_tables(tset)] return [table.bbox for table in page.find_tables(tset)]
def _extract_tables_from_page( def _extract_tables_from_page(
self, self,
page: pdfplumber.page.Page, page: pdfplumber.page.Page,
) -> list[list[list[Any]]]: ) -> list[list[list[Any]]]:
"""Extract tables from a PDF page. """Extract tables from a PDF page.
@ -1930,11 +1944,11 @@ class AmazonTextractPDFParser(BaseBlobParser):
""" """
def __init__( def __init__(
self, self,
textract_features: Optional[Sequence[int]] = None, textract_features: Optional[Sequence[int]] = None,
client: Optional[Any] = None, client: Optional[Any] = None,
*, *,
linearization_config: Optional[TextLinearizationConfig] = None, linearization_config: Optional[TextLinearizationConfig] = None,
) -> None: ) -> None:
"""Initializes the parser. """Initializes the parser.
@ -1999,12 +2013,13 @@ class AmazonTextractPDFParser(BaseBlobParser):
the blob.data is taken the blob.data is taken
""" """
url_parse_result = urlparse(str(blob.path)) if blob.path else None # type: ignore[attr-defined] url_parse_result = urlparse(
str(blob.path)) if blob.path else None # type: ignore[attr-defined]
# Either call with S3 path (multi-page) or with bytes (single-page) # Either call with S3 path (multi-page) or with bytes (single-page)
if ( if (
url_parse_result url_parse_result
and url_parse_result.scheme == "s3" and url_parse_result.scheme == "s3"
and url_parse_result.netloc and url_parse_result.netloc
): ):
textract_response_json = self.tc.call_textract( textract_response_json = self.tc.call_textract(
input_document=str(blob.path), # type: ignore[attr-defined] input_document=str(blob.path), # type: ignore[attr-defined]
@ -2045,7 +2060,8 @@ class DocumentIntelligenceParser(BaseBlobParser):
self.client = client self.client = client
self.model = model self.model = model
def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: # type: ignore[valid-type] def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
Document]: # type: ignore[valid-type]
for p in result.pages: for p in result.pages:
content = " ".join([line.content for line in p.lines]) content = " ".join([line.content for line in p.lines])