Merge legacy and standard metadata keys in the PDF parser.

Philippe Prados 2025-03-07 14:45:37 +01:00
parent fa47539b60
commit 76b3d6bdb8


@@ -10,6 +10,10 @@ import warnings
 from datetime import datetime
 from pathlib import Path
 from tempfile import TemporaryDirectory
+from urllib.parse import urlparse
+
+import numpy
+import numpy as np
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -23,11 +27,6 @@ from typing import (
     Union,
     cast,
 )
-from urllib.parse import urlparse
-
-import numpy
-import numpy as np
-from langchain_core.documents import Document

 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -35,6 +34,7 @@ from langchain_community.document_loaders.parsers.images import (
     BaseImageBlobParser,
     RapidOCRBlobParser,
 )
+from langchain_core.documents import Document

 if TYPE_CHECKING:
     import pdfplumber
@@ -1522,16 +1522,30 @@ class PDFPlumberParser(BaseBlobParser):
         from pdfplumber.utils import geometry

         contents = []
-        doc_metadata = doc.metadata | _purge_metadata(
+
+        # The legacy version uses CreationDate, Creator, etc.
+        # The new 'standard' version must use lower-case keys.
+        # The next statement merges the legacy keys and the standard keys
+        # into the same dictionary:
+        # - CreationDate is duplicated as `creationdate` in ISO format.
+        # - Creator is duplicated as `creator`, etc.
+        # With this strategy, legacy code can continue to use CreationDate
+        # or Creator, while new code can use `creationdate` or `creator`.
+        # _purge_metadata() converts and normalizes the metadata names
+        # and formats.
+        doc_metadata = (
+            doc.metadata  # Legacy metadata with...
+            | _purge_metadata(
                 (
-                    doc.metadata
-                    | {
+                    doc.metadata  # Add parser metadata
+                    | {  # with more keys
                         "source": blob.source,
                         "file_path": blob.source,
                         "total_pages": len(doc.pages),
                     }
                 )
             )
+        )

         for page in doc.pages:
             tables_bbox: list[tuple[float, float, float, float]] = (
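
To make the comment in this hunk concrete, here is a minimal, self-contained sketch of the merge strategy. The `_normalize` helper is hypothetical; it only mimics what `_purge_metadata()` is described as doing (lower-casing keys and converting PDF date strings to ISO format):

    from datetime import datetime
    from typing import Any

    def _normalize(metadata: dict[str, Any]) -> dict[str, Any]:
        # Hypothetical stand-in for _purge_metadata(): lower-case every key
        # and rewrite PDF "D:YYYYMMDDhhmmss..." timestamps as ISO-8601 strings.
        out: dict[str, Any] = {}
        for key, value in metadata.items():
            if isinstance(value, str) and value.startswith("D:"):
                value = datetime.strptime(value[2:16], "%Y%m%d%H%M%S").isoformat()
            out[key.lower()] = value
        return out

    legacy = {"CreationDate": "D:20250307144537+01'00'", "Creator": "LaTeX"}
    merged = legacy | _normalize(legacy)  # same shape as doc.metadata | _purge_metadata(...)
    assert merged["Creator"] == "LaTeX"                      # legacy key still works
    assert merged["creationdate"] == "2025-03-07T14:45:37"   # new standard key

Because the right-hand operand of `dict.__or__` wins on collisions, the normalized values take precedence whenever a key already happens to be lower-case.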
@@ -1999,7 +2013,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
             the blob.data is taken
         """
-        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
+        url_parse_result = urlparse(
+            str(blob.path)) if blob.path else None  # type: ignore[attr-defined]

         # Either call with S3 path (multi-page) or with bytes (single-page)
         if (
             url_parse_result
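
For reference, a small sketch of how the scheme returned by `urlparse` separates the two call modes mentioned in the comment; the `s3://` URI is made up for illustration:

    from urllib.parse import urlparse

    # An S3 URI carries a scheme and a bucket, so Textract is called
    # with the path (multi-page mode):
    result = urlparse("s3://my-bucket/reports/scan.pdf")
    assert result.scheme == "s3" and result.netloc == "my-bucket"

    # A plain local path has no scheme, so the parser falls back to the
    # single-page call with blob.data:
    assert urlparse("/tmp/scan.pdf").scheme == ""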
@@ -2045,7 +2060,8 @@ class DocumentIntelligenceParser(BaseBlobParser):
         self.client = client
         self.model = model

-    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
+    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
+        Document]:  # type: ignore[valid-type]
         for p in result.pages:
             content = " ".join([line.content for line in p.lines])