Merge legacy and standard metadata keys in pdf parser.

Philippe Prados 2025-03-07 14:45:37 +01:00
parent fa47539b60
commit 76b3d6bdb8


@@ -10,6 +10,10 @@ import warnings
from datetime import datetime
from pathlib import Path
from tempfile import TemporaryDirectory
from urllib.parse import urlparse
import numpy
import numpy as np
from typing import (
TYPE_CHECKING,
Any,
@@ -23,11 +27,6 @@ from typing import (
Union,
cast,
)
from urllib.parse import urlparse
import numpy
import numpy as np
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
@@ -35,6 +34,7 @@ from langchain_community.document_loaders.parsers.images import (
BaseImageBlobParser,
RapidOCRBlobParser,
)
from langchain_core.documents import Document
if TYPE_CHECKING:
import pdfplumber
@@ -1522,16 +1522,30 @@ class PDFPlumberParser(BaseBlobParser):
from pdfplumber.utils import geometry
contents = []
doc_metadata = doc.metadata | _purge_metadata(
# The legacy version uses CreationDate, Creator, etc.
# The new 'standard' version must use lowercase keys.
# The next line merges the legacy keys and the standard keys
# into the same dictionary.
# - CreationDate is duplicated as `creationdate` in ISO format.
# - Creator is duplicated as 'creator', etc.
# With this strategy, legacy code can continue to use CreationDate
# or Creator, while new code can use `creationdate` or `creator`.
# _purge_metadata() converts and normalizes the metadata
# names and formats.
doc_metadata = (
doc.metadata |  # Legacy metadata with...
_purge_metadata(
(
doc.metadata
| {
doc.metadata  # Add parser metadata
| { # with more keys
"source": blob.source,
"file_path": blob.source,
"total_pages": len(doc.pages),
}
)
)
)
for page in doc.pages:
tables_bbox: list[tuple[float, float, float, float]] = (
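For illustration, here is a minimal sketch of the merging strategy described in the comments above, assuming a simplified `_purge_metadata_sketch` helper that only lowercases keys and converts the PDF creation date to ISO format (the real `_purge_metadata()` performs more normalization). The dict union keeps the legacy keys while adding their normalized counterparts.

```python
from datetime import datetime
from typing import Any


def _purge_metadata_sketch(meta: dict[str, Any]) -> dict[str, Any]:
    """Hypothetical stand-in for _purge_metadata(): lowercase keys, ISO dates."""
    out: dict[str, Any] = {}
    for key, value in meta.items():
        new_key = key.lower()
        if new_key == "creationdate" and isinstance(value, str):
            # PDF dates look like "D:20250307144537+01'00'"; keep the parsing simple here.
            value = datetime.strptime(value[2:16], "%Y%m%d%H%M%S").isoformat()
        out[new_key] = value
    return out


legacy = {"CreationDate": "D:20250307144537+01'00'", "Creator": "LaTeX"}
merged = legacy | _purge_metadata_sketch(legacy | {"total_pages": 3})

assert merged["Creator"] == "LaTeX"                     # legacy key still works
assert merged["creator"] == "LaTeX"                     # normalized key added
assert merged["creationdate"].startswith("2025-03-07")  # ISO-formatted duplicate
```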
@@ -1999,7 +2013,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
the blob.data is taken
"""
url_parse_result = urlparse(str(blob.path)) if blob.path else None # type: ignore[attr-defined]
url_parse_result = urlparse(
str(blob.path)) if blob.path else None # type: ignore[attr-defined]
# Either call with S3 path (multi-page) or with bytes (single-page)
if (
url_parse_result
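For context, a minimal sketch of the dispatch hinted at by the comment above, assuming only the URL scheme needs to be checked (the real parser also inspects the bucket and key); the helper name is hypothetical. `urlparse` splits the source path so an `s3://` location can be routed to the multi-page call, while anything else falls back to raw bytes.

```python
from typing import Optional
from urllib.parse import urlparse


def _is_s3_source(path: Optional[str]) -> bool:
    """Hypothetical helper: True when the blob points at an S3 object."""
    if not path:
        return False
    parsed = urlparse(str(path))
    return parsed.scheme == "s3" and bool(parsed.netloc)


assert _is_s3_source("s3://my-bucket/report.pdf") is True   # multi-page call
assert _is_s3_source("/tmp/report.pdf") is False            # single-page, send bytes
assert _is_s3_source(None) is False
```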
@@ -2045,7 +2060,8 @@ class DocumentIntelligenceParser(BaseBlobParser):
self.client = client
self.model = model
def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: # type: ignore[valid-type]
def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
Document]: # type: ignore[valid-type]
for p in result.pages:
content = " ".join([line.content for line in p.lines])
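For context, a minimal sketch of what this loop produces, assuming a simplified page structure that mirrors the `pages[*].lines[*].content` attributes used above (the metadata key is an assumption for illustration only): one Document per page whose content is the joined line text.

```python
from dataclasses import dataclass

from langchain_core.documents import Document


@dataclass
class _Line:
    """Hypothetical stand-in for a Document Intelligence line result."""
    content: str


pages = [[_Line("Hello"), _Line("world")], [_Line("Second"), _Line("page")]]

docs = [
    Document(
        page_content=" ".join(line.content for line in lines),
        metadata={"page": page_number},  # assumed key, for illustration only
    )
    for page_number, lines in enumerate(pages)
]

assert docs[0].page_content == "Hello world"
assert docs[1].metadata["page"] == 1
```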