Merge legacy and standard metadata keys in pdf parser.

Repository: mirror of https://github.com/hwchase17/langchain.git
Commit: 76b3d6bdb8 (parent: fa47539b60)
@@ -10,6 +10,10 @@ import warnings
 from datetime import datetime
 from pathlib import Path
 from tempfile import TemporaryDirectory
+from urllib.parse import urlparse
+
+import numpy
+import numpy as np
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -23,11 +27,6 @@ from typing import (
     Union,
     cast,
 )
-from urllib.parse import urlparse
-
-import numpy
-import numpy as np
-from langchain_core.documents import Document
 
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -35,6 +34,7 @@ from langchain_community.document_loaders.parsers.images import (
     BaseImageBlobParser,
     RapidOCRBlobParser,
 )
+from langchain_core.documents import Document
 
 if TYPE_CHECKING:
     import pdfplumber
@@ -62,7 +62,7 @@ _PDF_FILTER_WITHOUT_LOSS = [
 
 
 def extract_from_images_with_rapidocr(
     images: Sequence[Union[Iterable[np.ndarray], bytes]],
 ) -> str:
     """Extract text from images with RapidOCR.
 
@@ -201,7 +201,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
     """
 
     def _recurs_merge_text_and_extras(
         extras: list[str], text_from_page: str, recurs: bool
     ) -> Optional[str]:
         if extras:
             for delim in _PARAGRAPH_DELIMITER:
@@ -221,7 +221,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
                         if str_extras:
                             all_extras = delim + str_extras
                         all_text = (
                             text_from_page[:pos] + all_extras + text_from_page[pos:]
                         )
                     break
             else:
@@ -291,16 +291,16 @@ class PyPDFParser(BaseBlobParser):
     """
 
     def __init__(
         self,
         password: Optional[Union[str, bytes]] = None,
         extract_images: bool = False,
         *,
         mode: Literal["single", "page"] = "page",
         pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         images_parser: Optional[BaseImageBlobParser] = None,
         images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
         extraction_mode: Literal["plain", "layout"] = "plain",
         extraction_kwargs: Optional[dict[str, Any]] = None,
     ):
         """Initialize a parser based on PyPDF.
 
@@ -519,15 +519,15 @@ class PDFMinerParser(BaseBlobParser):
     _warn_concatenate_pages = False
 
     def __init__(
         self,
         extract_images: bool = False,
         *,
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "single",
         pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         images_parser: Optional[BaseImageBlobParser] = None,
         images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
         concatenate_pages: Optional[bool] = None,
     ):
         """Initialize a parser based on PDFMiner.
 
@@ -629,10 +629,10 @@ class PDFMinerParser(BaseBlobParser):
         return obj
 
     def _get_metadata(
         self,
         fp: BinaryIO,
         password: str = "",
         caching: bool = True,
     ) -> dict[str, Any]:
         """
         Extract metadata from a PDF file.
@@ -726,10 +726,10 @@ class PDFMinerParser(BaseBlobParser):
 
         class Visitor(PDFLayoutAnalyzer):
             def __init__(
                 self,
                 rsrcmgr: PDFResourceManager,
                 pageno: int = 1,
                 laparams: Optional[LAParams] = None,
             ) -> None:
                 super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
 
@@ -855,17 +855,17 @@ class PyMuPDFParser(BaseBlobParser):
     _lock = threading.Lock()
 
     def __init__(
         self,
         text_kwargs: Optional[dict[str, Any]] = None,
         extract_images: bool = False,
         *,
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "page",
         pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         images_parser: Optional[BaseImageBlobParser] = None,
         images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,
     ) -> None:
         """Initialize a parser based on PyMuPDF.
 
@@ -921,11 +921,11 @@ class PyMuPDFParser(BaseBlobParser):
         )
 
     def _lazy_parse(
         self,
         blob: Blob,
         # text-kwargs is present for backwards compatibility.
         # Users should not use it directly.
         text_kwargs: Optional[dict[str, Any]] = None,
     ) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob.
         Insert image, if possible, between two paragraphs.
@@ -1014,10 +1014,10 @@ class PyMuPDFParser(BaseBlobParser):
         )
 
     def _get_page_content(
         self,
         doc: pymupdf.Document,
         page: pymupdf.Page,
         text_kwargs: dict[str, Any],
     ) -> str:
         """Get the text of the page using PyMuPDF and RapidOCR and issue a warning
         if it is empty.
@@ -1075,7 +1075,7 @@ class PyMuPDFParser(BaseBlobParser):
         return metadata
 
     def _extract_images_from_page(
         self, doc: pymupdf.Document, page: pymupdf.Page
     ) -> str:
         """Extract images from a PDF page and get the text using images_to_text.
 
@@ -1216,14 +1216,14 @@ class PyPDFium2Parser(BaseBlobParser):
     _lock = threading.Lock()
 
     def __init__(
         self,
         extract_images: bool = False,
         *,
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "page",
         pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         images_parser: Optional[BaseImageBlobParser] = None,
         images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
     ) -> None:
         """Initialize a parser based on PyPDFium2.
 
@@ -1426,18 +1426,18 @@ class PDFPlumberParser(BaseBlobParser):
     """
 
     def __init__(
         self,
         text_kwargs: Optional[Mapping[str, Any]] = None,
         dedupe: bool = False,
         extract_images: bool = False,
         *,
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "page",
         pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         images_parser: Optional[BaseImageBlobParser] = None,
         images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
         extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,
     ) -> None:
         """Initialize the parser.
 
@@ -1522,15 +1522,29 @@ class PDFPlumberParser(BaseBlobParser):
         from pdfplumber.utils import geometry
 
         contents = []
-        doc_metadata = doc.metadata | _purge_metadata(
-            (
-                doc.metadata
-                | {
-                    "source": blob.source,
-                    "file_path": blob.source,
-                    "total_pages": len(doc.pages),
-                }
-            )
+        # The legacy version uses CreationDate, Creator, etc.
+        # The new 'standard' version must use lower-case keys.
+        # The next statement merges the legacy keys and the standard keys
+        # into the same dictionary.
+        # - CreationDate is duplicated as `creationdate`, in ISO format.
+        # - Creator is duplicated as `creator`, etc.
+        # With this strategy, legacy code can continue to use CreationDate
+        # or Creator, while new code can use `creationdate` or `creator`.
+        # _purge_metadata() converts and normalizes the metadata names
+        # and formats.
+
+        doc_metadata = (
+            doc.metadata |  # Legacy metadata with...
+            _purge_metadata(
+                (
+                    doc.metadata  # Add parser metadata
+                    | {  # with more keys
+                        "source": blob.source,
+                        "file_path": blob.source,
+                        "total_pages": len(doc.pages),
+                    }
+                )
+            )
         )
 
         for page in doc.pages:
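To illustrate the strategy described in the comments above: a minimal, self-contained sketch of the merge semantics. The helper below is a hypothetical stand-in for _purge_metadata() (assumed here to lower-case keys and convert PDF "D:" date strings to ISO 8601), not the library's actual implementation.

    from datetime import datetime
    from typing import Any


    def purge_metadata_sketch(metadata: dict[str, Any]) -> dict[str, Any]:
        """Hypothetical stand-in: lower-case the keys, normalize PDF dates."""
        purged: dict[str, Any] = {}
        for key, value in metadata.items():
            if isinstance(value, str) and value.startswith("D:"):
                try:
                    # "D:20240101120000..." -> "2024-01-01T12:00:00"
                    value = datetime.strptime(value[2:16], "%Y%m%d%H%M%S").isoformat()
                except ValueError:
                    pass  # keep the raw value if the date does not parse
            purged[key.lower()] = value
        return purged


    legacy = {"CreationDate": "D:20240101120000", "Creator": "LaTeX"}
    merged = legacy | purge_metadata_sketch(legacy | {"total_pages": 3})
    # Legacy consumers still read merged["CreationDate"] == "D:20240101120000";
    # new consumers can read merged["creationdate"] == "2024-01-01T12:00:00".

Because dict union keeps both spellings, neither side of the API has to change.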
@@ -1543,11 +1557,11 @@ class PDFPlumberParser(BaseBlobParser):
             page_text = []
             extras = []
             for content in self._split_page_content(
                 page,
                 tables_bbox,
                 tables_content,
                 images_bbox,
                 image_from_page,
             ):
                 if isinstance(content, str):  # Text
                     page_text.append(content)
@@ -1615,13 +1629,13 @@ class PDFPlumberParser(BaseBlobParser):
         return page.extract_text(**self.text_kwargs)
 
     def _split_page_content(
         self,
         page: pdfplumber.page.Page,
         tables_bbox: list[tuple[float, float, float, float]],
         tables_content: list[list[list[Any]]],
         images_bbox: list[tuple[float, float, float, float]],
         images_content: list[np.ndarray],
         **kwargs: Any,
     ) -> Iterator[Union[str, list[list[str]], np.ndarray]]:
         """Split the page content into text, tables, and images.
 
@@ -1736,8 +1750,8 @@ class PDFPlumberParser(BaseBlobParser):
         return images
 
     def _extract_tables_bbox_from_page(
         self,
         page: pdfplumber.page.Page,
     ) -> list[tuple[float, float, float, float]]:
         """Extract bounding boxes of tables from a PDF page.
 
@@ -1756,8 +1770,8 @@ class PDFPlumberParser(BaseBlobParser):
         return [table.bbox for table in page.find_tables(tset)]
 
     def _extract_tables_from_page(
         self,
         page: pdfplumber.page.Page,
     ) -> list[list[list[Any]]]:
         """Extract tables from a PDF page.
 
@@ -1930,11 +1944,11 @@ class AmazonTextractPDFParser(BaseBlobParser):
     """
 
     def __init__(
         self,
         textract_features: Optional[Sequence[int]] = None,
         client: Optional[Any] = None,
         *,
         linearization_config: Optional[TextLinearizationConfig] = None,
     ) -> None:
         """Initializes the parser.
 
@@ -1999,12 +2013,13 @@ class AmazonTextractPDFParser(BaseBlobParser):
             the blob.data is taken
         """
 
-        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
+        url_parse_result = urlparse(
+            str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
         # Either call with S3 path (multi-page) or with bytes (single-page)
         if (
             url_parse_result
             and url_parse_result.scheme == "s3"
             and url_parse_result.netloc
         ):
             textract_response_json = self.tc.call_textract(
                 input_document=str(blob.path),  # type: ignore[attr-defined]
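The context above dispatches on the blob's path: only an s3:// URL with a bucket is passed to Textract by reference (multi-page); anything else is submitted as bytes (single-page). A small sketch of that check, with an illustrative helper name that is not part of the library:

    from typing import Optional
    from urllib.parse import urlparse


    def is_s3_input(path: Optional[str]) -> bool:
        # Mirrors the condition in the hunk above: an "s3" scheme plus a
        # non-empty netloc (the bucket) selects the by-reference call path.
        if not path:
            return False
        parsed = urlparse(str(path))
        return parsed.scheme == "s3" and bool(parsed.netloc)


    assert is_s3_input("s3://my-bucket/doc.pdf")  # sent to Textract by reference
    assert not is_s3_input("/tmp/doc.pdf")        # read locally, sent as bytes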
@@ -2045,7 +2060,8 @@ class DocumentIntelligenceParser(BaseBlobParser):
         self.client = client
         self.model = model
 
-    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
+    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
+            Document]:  # type: ignore[valid-type]
         for p in result.pages:
             content = " ".join([line.content for line in p.lines])
 