Merge legacy and standard metadata keys in the PDF parser.

Philippe Prados 2025-03-07 14:45:37 +01:00
parent fa47539b60
commit 76b3d6bdb8


@@ -10,6 +10,10 @@ import warnings
 from datetime import datetime
 from pathlib import Path
 from tempfile import TemporaryDirectory
+from urllib.parse import urlparse
+
+import numpy
+import numpy as np
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -23,11 +27,6 @@ from typing import (
     Union,
     cast,
 )
-from urllib.parse import urlparse
-
-import numpy
-import numpy as np
-from langchain_core.documents import Document

 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -35,6 +34,7 @@ from langchain_community.document_loaders.parsers.images import (
     BaseImageBlobParser,
     RapidOCRBlobParser,
 )
+from langchain_core.documents import Document

 if TYPE_CHECKING:
     import pdfplumber
@@ -1522,16 +1522,30 @@ class PDFPlumberParser(BaseBlobParser):
         from pdfplumber.utils import geometry

         contents = []
-        doc_metadata = doc.metadata | _purge_metadata(
+
+        # The legacy version uses CreationDate, Creator, etc.
+        # The new 'standard' version must use lower-case keys.
+        # The next statement merges the legacy keys and the standard keys
+        # into the same dictionary:
+        # - CreationDate is duplicated as `creationdate` in ISO format.
+        # - Creator is duplicated as `creator`, etc.
+        # With this strategy, legacy code can continue to use CreationDate
+        # or Creator, while new code can use `creationdate` or `creator`.
+        # _purge_metadata() converts and normalizes the metadata names
+        # and formats.
+        doc_metadata = (
+            doc.metadata  # Legacy metadata with...
+            | _purge_metadata(
                 (
-                    doc.metadata
-                    | {
+                    doc.metadata  # Add parser metadata
+                    | {  # with more keys
                         "source": blob.source,
                         "file_path": blob.source,
                         "total_pages": len(doc.pages),
                     }
                 )
             )
+        )

         for page in doc.pages:
             tables_bbox: list[tuple[float, float, float, float]] = (
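
To make the comment in this hunk concrete, here is a minimal, self-contained sketch of the merge strategy. The `_normalize` helper is hypothetical; it only mimics what `_purge_metadata()` is described as doing (lower-casing keys and converting PDF date strings to ISO format):

    from datetime import datetime
    from typing import Any

    def _normalize(metadata: dict[str, Any]) -> dict[str, Any]:
        # Hypothetical stand-in for _purge_metadata(): lower-case every key
        # and rewrite PDF "D:YYYYMMDDhhmmss..." timestamps as ISO-8601 strings.
        out: dict[str, Any] = {}
        for key, value in metadata.items():
            if isinstance(value, str) and value.startswith("D:"):
                value = datetime.strptime(value[2:16], "%Y%m%d%H%M%S").isoformat()
            out[key.lower()] = value
        return out

    legacy = {"CreationDate": "D:20250307144537+01'00'", "Creator": "LaTeX"}
    merged = legacy | _normalize(legacy)  # same shape as doc.metadata | _purge_metadata(...)
    assert merged["Creator"] == "LaTeX"                      # legacy key still works
    assert merged["creationdate"] == "2025-03-07T14:45:37"   # new standard key

Because the right-hand operand of `dict.__or__` wins on collisions, the normalized values take precedence whenever a key already happens to be lower-case.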
@@ -1999,7 +2013,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
             the blob.data is taken
         """
-        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
+        url_parse_result = urlparse(
+            str(blob.path)) if blob.path else None  # type: ignore[attr-defined]

         # Either call with S3 path (multi-page) or with bytes (single-page)
         if (
             url_parse_result
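
For reference, a small sketch of how the scheme returned by `urlparse` separates the two call modes mentioned in the comment; the `s3://` URI is made up for illustration:

    from urllib.parse import urlparse

    # An S3 URI carries a scheme and a bucket, so Textract is called
    # with the path (multi-page mode):
    result = urlparse("s3://my-bucket/reports/scan.pdf")
    assert result.scheme == "s3" and result.netloc == "my-bucket"

    # A plain local path has no scheme, so the parser falls back to the
    # single-page call with blob.data:
    assert urlparse("/tmp/scan.pdf").scheme == ""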
@@ -2045,7 +2060,8 @@ class DocumentIntelligenceParser(BaseBlobParser):
         self.client = client
         self.model = model

-    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
+    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
+        Document]:  # type: ignore[valid-type]
         for p in result.pages:
             content = " ".join([line.content for line in p.lines])