Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-24 15:43:54 +00:00
Merge legacy and standard metadata keys in the PDF parser.
parent fa47539b60
commit 76b3d6bdb8
@@ -10,6 +10,10 @@ import warnings
 from datetime import datetime
 from pathlib import Path
 from tempfile import TemporaryDirectory
+from urllib.parse import urlparse
+
+import numpy
+import numpy as np
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -23,11 +27,6 @@ from typing import (
     Union,
     cast,
 )
-from urllib.parse import urlparse
-
-import numpy
-import numpy as np
-from langchain_core.documents import Document
 
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -35,6 +34,7 @@ from langchain_community.document_loaders.parsers.images import (
     BaseImageBlobParser,
     RapidOCRBlobParser,
 )
+from langchain_core.documents import Document
 
 if TYPE_CHECKING:
     import pdfplumber
@@ -1522,16 +1522,30 @@ class PDFPlumberParser(BaseBlobParser):
         from pdfplumber.utils import geometry
 
         contents = []
-        doc_metadata = doc.metadata | _purge_metadata(
+        # The legacy version uses CreationDate, Creator, etc.
+        # The new 'standard' version must use lower-case keys.
+        # The next statement merges the legacy keys and the standard keys
+        # into the same dictionary:
+        # - CreationDate is duplicated as `creationdate`, in ISO format.
+        # - Creator is duplicated as `creator`, etc.
+        # With this strategy, legacy code can continue to use CreationDate
+        # or Creator, while new code can use `creationdate` or `creator`.
+        # _purge_metadata() converts and normalizes the names and formats
+        # of the metadata.
+
+        doc_metadata = (
+            doc.metadata |  # Legacy metadata with...
+            _purge_metadata(
             (
-                doc.metadata
-                | {
+                doc.metadata  # Add parser metadata
+                | {  # with more keys
                     "source": blob.source,
                     "file_path": blob.source,
                     "total_pages": len(doc.pages),
                 }
             )
         )
+        )
 
         for page in doc.pages:
             tables_bbox: list[tuple[float, float, float, float]] = (
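The effect of the merged doc_metadata is easier to see in isolation. Below is a minimal, self-contained sketch of the strategy described in the comments above; `normalize_keys` is a hypothetical stand-in for the real _purge_metadata(), which handles more key names, date formats, and value types. Note that dict union (`|`) lets the right-hand operand win on duplicate keys, so keys that are already lower case pick up the normalized values.

from datetime import datetime
from typing import Any


def normalize_keys(metadata: dict[str, Any]) -> dict[str, Any]:
    """Lower-case keys; convert PDF 'D:YYYYMMDDHHmmSS' dates to ISO format."""
    result: dict[str, Any] = {}
    for key, value in metadata.items():
        if isinstance(value, str) and value.startswith("D:"):
            # e.g. "D:20240101120000" -> "2024-01-01T12:00:00"
            value = datetime.strptime(value[2:16], "%Y%m%d%H%M%S").isoformat()
        result[key.lower()] = value
    return result


legacy = {"CreationDate": "D:20240101120000", "Creator": "LaTeX"}
doc_metadata = legacy | normalize_keys(legacy | {"total_pages": 2})

assert doc_metadata["Creator"] == "LaTeX"  # legacy key still readable
assert doc_metadata["creator"] == "LaTeX"  # standard key available too
assert doc_metadata["creationdate"] == "2024-01-01T12:00:00"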
@@ -1999,7 +2013,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
         the blob.data is taken
         """
 
-        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
+        url_parse_result = urlparse(
+            str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
         # Either call with S3 path (multi-page) or with bytes (single-page)
         if (
             url_parse_result
@@ -2045,7 +2060,8 @@ class DocumentIntelligenceParser(BaseBlobParser):
         self.client = client
         self.model = model
 
-    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
+    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
+        Document]:  # type: ignore[valid-type]
         for p in result.pages:
             content = " ".join([line.content for line in p.lines])