Mirror of https://github.com/hwchase17/langchain.git
commit 76b3d6bdb8 (parent fa47539b60)

    Merge legacy and standard metadata keys in pdf parser.
@@ -10,6 +10,10 @@ import warnings
 from datetime import datetime
 from pathlib import Path
 from tempfile import TemporaryDirectory
+from urllib.parse import urlparse
+
+import numpy
+import numpy as np
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -23,11 +27,6 @@ from typing import (
     Union,
     cast,
 )
-from urllib.parse import urlparse
-
-import numpy
-import numpy as np
-from langchain_core.documents import Document
 
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -35,6 +34,7 @@ from langchain_community.document_loaders.parsers.images import (
     BaseImageBlobParser,
     RapidOCRBlobParser,
 )
+from langchain_core.documents import Document
 
 if TYPE_CHECKING:
     import pdfplumber
@@ -1522,16 +1522,30 @@ class PDFPlumberParser(BaseBlobParser):
         from pdfplumber.utils import geometry
 
         contents = []
-        doc_metadata = doc.metadata | _purge_metadata(
-            (
-                doc.metadata
-                | {
+        # The legacy version uses CreationDate, Creator, etc., while the
+        # new 'standard' version must use lower-case keys.
+        # The next statement merges the legacy keys and the standard keys
+        # into the same dictionary:
+        # - CreationDate is duplicated as `creationdate`, in ISO format.
+        # - Creator is duplicated as `creator`, etc.
+        # With this strategy, legacy code can continue to use CreationDate
+        # or Creator, and new code can use `creationdate` or `creator`.
+        # _purge_metadata() converts and normalizes the metadata names
+        # and formats.
+        doc_metadata = (
+            doc.metadata |  # Legacy metadata with...
+            _purge_metadata(
+                doc.metadata  # Add parser metadata
+                | {  # with more keys
                     "source": blob.source,
                     "file_path": blob.source,
                     "total_pages": len(doc.pages),
                 }
             )
         )
 
         for page in doc.pages:
             tables_bbox: list[tuple[float, float, float, float]] = (
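The merge leans on Python's dict union operator, where keys from the right-hand operand win on conflict; since the normalized keys are lower-case, they never collide with the legacy CamelCase keys, so both spellings survive. A minimal sketch of the idea, using a simplified, hypothetical stand-in for _purge_metadata() (the real helper normalizes more fields than shown); names and values are illustrative:

from datetime import datetime
from typing import Any


def _normalize(metadata: dict[str, Any]) -> dict[str, Any]:
    # Hypothetical stand-in for _purge_metadata(): lower-cases key names and
    # converts PDF-style dates ("D:YYYYMMDDhhmmss") to ISO 8601 strings.
    result: dict[str, Any] = {}
    for key, value in metadata.items():
        if isinstance(value, str) and value.startswith("D:"):
            value = datetime.strptime(value[2:16], "%Y%m%d%H%M%S").isoformat()
        result[key.lower()] = value
    return result


legacy = {"CreationDate": "D:20240101120000", "Creator": "LaTeX"}
# Dict union keeps both spellings: the legacy keys survive on the left,
# and the normalized keys from the right operand are added alongside them.
merged = legacy | _normalize(legacy | {"source": "example.pdf"})
print(merged["CreationDate"])  # D:20240101120000    (legacy consumers)
print(merged["creationdate"])  # 2024-01-01T12:00:00 (new consumers)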
@@ -1999,7 +2013,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
             the blob.data is taken
         """
 
-        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
+        url_parse_result = urlparse(
+            str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
         # Either call with S3 path (multi-page) or with bytes (single-page)
         if (
             url_parse_result
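The rewrapped line feeds the dispatch described in the comment: an s3:// URL lets Textract fetch the document from the bucket (multi-page), otherwise the raw bytes must travel with the request (single-page). A minimal sketch of that check under illustrative names; only urlparse() is the real API used above:

from typing import Optional
from urllib.parse import urlparse


def describe_source(path: Optional[str]) -> str:
    # Same shape as the parser's check: only parse the path if one exists.
    url_parse_result = urlparse(str(path)) if path else None
    if url_parse_result and url_parse_result.scheme == "s3":
        # Multi-page case: Textract can read the object from the bucket.
        bucket = url_parse_result.netloc
        key = url_parse_result.path.lstrip("/")
        return f"s3 bucket={bucket} key={key}"
    # Single-page case: the document bytes are sent directly.
    return "local bytes"


print(describe_source("s3://my-bucket/doc.pdf"))  # s3 bucket=my-bucket key=doc.pdf
print(describe_source("/tmp/doc.pdf"))            # local bytes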
@@ -2045,7 +2060,8 @@ class DocumentIntelligenceParser(BaseBlobParser):
         self.client = client
         self.model = model
 
-    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
+    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
+        Document]:  # type: ignore[valid-type]
         for p in result.pages:
             content = " ".join([line.content for line in p.lines])
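For context, _generate_docs() is a lazy generator: one Document per analyzed page, with the page text rebuilt by joining the recognized lines. A self-contained sketch of that pattern, using hypothetical dataclass stand-ins for the Azure result objects (the real page and line fields come from the SDK):

from dataclasses import dataclass
from typing import Iterator

from langchain_core.documents import Document


@dataclass
class Line:  # stand-in for an SDK line object
    content: str


@dataclass
class Page:  # stand-in for an SDK page object
    page_number: int
    lines: list[Line]


def generate_docs(source: str, pages: list[Page]) -> Iterator[Document]:
    for p in pages:
        # One Document per page; the page text is the lines joined by spaces.
        content = " ".join(line.content for line in p.lines)
        yield Document(
            page_content=content,
            metadata={"source": source, "page": p.page_number},
        )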