Merge legacy and standard metadata keys in pdf parser.

This commit is contained in:
Philippe Prados 2025-03-07 14:45:37 +01:00
parent fa47539b60
commit 76b3d6bdb8

View File

@ -10,6 +10,10 @@ import warnings
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from urllib.parse import urlparse
import numpy
import numpy as np
from typing import ( from typing import (
TYPE_CHECKING, TYPE_CHECKING,
Any, Any,
@ -23,11 +27,6 @@ from typing import (
Union, Union,
cast, cast,
) )
from urllib.parse import urlparse
import numpy
import numpy as np
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
@ -35,6 +34,7 @@ from langchain_community.document_loaders.parsers.images import (
BaseImageBlobParser, BaseImageBlobParser,
RapidOCRBlobParser, RapidOCRBlobParser,
) )
from langchain_core.documents import Document
if TYPE_CHECKING: if TYPE_CHECKING:
import pdfplumber import pdfplumber
@ -62,7 +62,7 @@ _PDF_FILTER_WITHOUT_LOSS = [
def extract_from_images_with_rapidocr( def extract_from_images_with_rapidocr(
images: Sequence[Union[Iterable[np.ndarray], bytes]], images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str: ) -> str:
"""Extract text from images with RapidOCR. """Extract text from images with RapidOCR.
@ -201,7 +201,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
""" """
def _recurs_merge_text_and_extras( def _recurs_merge_text_and_extras(
extras: list[str], text_from_page: str, recurs: bool extras: list[str], text_from_page: str, recurs: bool
) -> Optional[str]: ) -> Optional[str]:
if extras: if extras:
for delim in _PARAGRAPH_DELIMITER: for delim in _PARAGRAPH_DELIMITER:
@ -221,7 +221,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
if str_extras: if str_extras:
all_extras = delim + str_extras all_extras = delim + str_extras
all_text = ( all_text = (
text_from_page[:pos] + all_extras + text_from_page[pos:] text_from_page[:pos] + all_extras + text_from_page[pos:]
) )
break break
else: else:
@ -291,16 +291,16 @@ class PyPDFParser(BaseBlobParser):
""" """
def __init__( def __init__(
self, self,
password: Optional[Union[str, bytes]] = None, password: Optional[Union[str, bytes]] = None,
extract_images: bool = False, extract_images: bool = False,
*, *,
mode: Literal["single", "page"] = "page", mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extraction_mode: Literal["plain", "layout"] = "plain", extraction_mode: Literal["plain", "layout"] = "plain",
extraction_kwargs: Optional[dict[str, Any]] = None, extraction_kwargs: Optional[dict[str, Any]] = None,
): ):
"""Initialize a parser based on PyPDF. """Initialize a parser based on PyPDF.
@ -519,15 +519,15 @@ class PDFMinerParser(BaseBlobParser):
_warn_concatenate_pages = False _warn_concatenate_pages = False
def __init__( def __init__(
self, self,
extract_images: bool = False, extract_images: bool = False,
*, *,
password: Optional[str] = None, password: Optional[str] = None,
mode: Literal["single", "page"] = "single", mode: Literal["single", "page"] = "single",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
concatenate_pages: Optional[bool] = None, concatenate_pages: Optional[bool] = None,
): ):
"""Initialize a parser based on PDFMiner. """Initialize a parser based on PDFMiner.
@ -629,10 +629,10 @@ class PDFMinerParser(BaseBlobParser):
return obj return obj
def _get_metadata( def _get_metadata(
self, self,
fp: BinaryIO, fp: BinaryIO,
password: str = "", password: str = "",
caching: bool = True, caching: bool = True,
) -> dict[str, Any]: ) -> dict[str, Any]:
""" """
Extract metadata from a PDF file. Extract metadata from a PDF file.
@ -726,10 +726,10 @@ class PDFMinerParser(BaseBlobParser):
class Visitor(PDFLayoutAnalyzer): class Visitor(PDFLayoutAnalyzer):
def __init__( def __init__(
self, self,
rsrcmgr: PDFResourceManager, rsrcmgr: PDFResourceManager,
pageno: int = 1, pageno: int = 1,
laparams: Optional[LAParams] = None, laparams: Optional[LAParams] = None,
) -> None: ) -> None:
super().__init__(rsrcmgr, pageno=pageno, laparams=laparams) super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
@ -855,17 +855,17 @@ class PyMuPDFParser(BaseBlobParser):
_lock = threading.Lock() _lock = threading.Lock()
def __init__( def __init__(
self, self,
text_kwargs: Optional[dict[str, Any]] = None, text_kwargs: Optional[dict[str, Any]] = None,
extract_images: bool = False, extract_images: bool = False,
*, *,
password: Optional[str] = None, password: Optional[str] = None,
mode: Literal["single", "page"] = "page", mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
extract_tables_settings: Optional[dict[str, Any]] = None, extract_tables_settings: Optional[dict[str, Any]] = None,
) -> None: ) -> None:
"""Initialize a parser based on PyMuPDF. """Initialize a parser based on PyMuPDF.
@ -921,11 +921,11 @@ class PyMuPDFParser(BaseBlobParser):
) )
def _lazy_parse( def _lazy_parse(
self, self,
blob: Blob, blob: Blob,
# text-kwargs is present for backwards compatibility. # text-kwargs is present for backwards compatibility.
# Users should not use it directly. # Users should not use it directly.
text_kwargs: Optional[dict[str, Any]] = None, text_kwargs: Optional[dict[str, Any]] = None,
) -> Iterator[Document]: # type: ignore[valid-type] ) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob. """Lazily parse the blob.
Insert image, if possible, between two paragraphs. Insert image, if possible, between two paragraphs.
@ -1014,10 +1014,10 @@ class PyMuPDFParser(BaseBlobParser):
) )
def _get_page_content( def _get_page_content(
self, self,
doc: pymupdf.Document, doc: pymupdf.Document,
page: pymupdf.Page, page: pymupdf.Page,
text_kwargs: dict[str, Any], text_kwargs: dict[str, Any],
) -> str: ) -> str:
"""Get the text of the page using PyMuPDF and RapidOCR and issue a warning """Get the text of the page using PyMuPDF and RapidOCR and issue a warning
if it is empty. if it is empty.
@ -1075,7 +1075,7 @@ class PyMuPDFParser(BaseBlobParser):
return metadata return metadata
def _extract_images_from_page( def _extract_images_from_page(
self, doc: pymupdf.Document, page: pymupdf.Page self, doc: pymupdf.Document, page: pymupdf.Page
) -> str: ) -> str:
"""Extract images from a PDF page and get the text using images_to_text. """Extract images from a PDF page and get the text using images_to_text.
@ -1216,14 +1216,14 @@ class PyPDFium2Parser(BaseBlobParser):
_lock = threading.Lock() _lock = threading.Lock()
def __init__( def __init__(
self, self,
extract_images: bool = False, extract_images: bool = False,
*, *,
password: Optional[str] = None, password: Optional[str] = None,
mode: Literal["single", "page"] = "page", mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
) -> None: ) -> None:
"""Initialize a parser based on PyPDFium2. """Initialize a parser based on PyPDFium2.
@ -1426,18 +1426,18 @@ class PDFPlumberParser(BaseBlobParser):
""" """
def __init__( def __init__(
self, self,
text_kwargs: Optional[Mapping[str, Any]] = None, text_kwargs: Optional[Mapping[str, Any]] = None,
dedupe: bool = False, dedupe: bool = False,
extract_images: bool = False, extract_images: bool = False,
*, *,
password: Optional[str] = None, password: Optional[str] = None,
mode: Literal["single", "page"] = "page", mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extract_tables: Optional[Literal["csv", "markdown", "html"]] = None, extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
extract_tables_settings: Optional[dict[str, Any]] = None, extract_tables_settings: Optional[dict[str, Any]] = None,
) -> None: ) -> None:
"""Initialize the parser. """Initialize the parser.
@ -1522,15 +1522,29 @@ class PDFPlumberParser(BaseBlobParser):
from pdfplumber.utils import geometry from pdfplumber.utils import geometry
contents = [] contents = []
doc_metadata = doc.metadata | _purge_metadata( # The legacy version, use CreationDate, Creator, etc.
( # The new 'standard' version must use lower case key.
doc.metadata # These next lines merge the legacy keys and standard keys
| { # in the same dictionary.
"source": blob.source, # - The CreationDate is duplicate to `creationdate` with iso format.
"file_path": blob.source, # - The Creator is duplicate to 'creator', etc.
"total_pages": len(doc.pages), # With this strategy, the legacy code can continue to use CreationDate
} # or Creator. The new code, can use `creationdate` or `creator`.
) # _purge_metadata() converts and normalizes the names and formats of
# the metadata.
doc_metadata = (
doc.metadata | # Legacy metadata with...
_purge_metadata(
(
doc.metadata # Add parser metadata
| { # with more keys
"source": blob.source,
"file_path": blob.source,
"total_pages": len(doc.pages),
}
)
)
) )
for page in doc.pages: for page in doc.pages:
@ -1543,11 +1557,11 @@ class PDFPlumberParser(BaseBlobParser):
page_text = [] page_text = []
extras = [] extras = []
for content in self._split_page_content( for content in self._split_page_content(
page, page,
tables_bbox, tables_bbox,
tables_content, tables_content,
images_bbox, images_bbox,
image_from_page, image_from_page,
): ):
if isinstance(content, str): # Text if isinstance(content, str): # Text
page_text.append(content) page_text.append(content)
@ -1615,13 +1629,13 @@ class PDFPlumberParser(BaseBlobParser):
return page.extract_text(**self.text_kwargs) return page.extract_text(**self.text_kwargs)
def _split_page_content( def _split_page_content(
self, self,
page: pdfplumber.page.Page, page: pdfplumber.page.Page,
tables_bbox: list[tuple[float, float, float, float]], tables_bbox: list[tuple[float, float, float, float]],
tables_content: list[list[list[Any]]], tables_content: list[list[list[Any]]],
images_bbox: list[tuple[float, float, float, float]], images_bbox: list[tuple[float, float, float, float]],
images_content: list[np.ndarray], images_content: list[np.ndarray],
**kwargs: Any, **kwargs: Any,
) -> Iterator[Union[str, list[list[str]], np.ndarray]]: ) -> Iterator[Union[str, list[list[str]], np.ndarray]]:
"""Split the page content into text, tables, and images. """Split the page content into text, tables, and images.
@ -1736,8 +1750,8 @@ class PDFPlumberParser(BaseBlobParser):
return images return images
def _extract_tables_bbox_from_page( def _extract_tables_bbox_from_page(
self, self,
page: pdfplumber.page.Page, page: pdfplumber.page.Page,
) -> list[tuple[float, float, float, float]]: ) -> list[tuple[float, float, float, float]]:
"""Extract bounding boxes of tables from a PDF page. """Extract bounding boxes of tables from a PDF page.
@ -1756,8 +1770,8 @@ class PDFPlumberParser(BaseBlobParser):
return [table.bbox for table in page.find_tables(tset)] return [table.bbox for table in page.find_tables(tset)]
def _extract_tables_from_page( def _extract_tables_from_page(
self, self,
page: pdfplumber.page.Page, page: pdfplumber.page.Page,
) -> list[list[list[Any]]]: ) -> list[list[list[Any]]]:
"""Extract tables from a PDF page. """Extract tables from a PDF page.
@ -1930,11 +1944,11 @@ class AmazonTextractPDFParser(BaseBlobParser):
""" """
def __init__( def __init__(
self, self,
textract_features: Optional[Sequence[int]] = None, textract_features: Optional[Sequence[int]] = None,
client: Optional[Any] = None, client: Optional[Any] = None,
*, *,
linearization_config: Optional[TextLinearizationConfig] = None, linearization_config: Optional[TextLinearizationConfig] = None,
) -> None: ) -> None:
"""Initializes the parser. """Initializes the parser.
@ -1999,12 +2013,13 @@ class AmazonTextractPDFParser(BaseBlobParser):
the blob.data is taken the blob.data is taken
""" """
url_parse_result = urlparse(str(blob.path)) if blob.path else None # type: ignore[attr-defined] url_parse_result = urlparse(
str(blob.path)) if blob.path else None # type: ignore[attr-defined]
# Either call with S3 path (multi-page) or with bytes (single-page) # Either call with S3 path (multi-page) or with bytes (single-page)
if ( if (
url_parse_result url_parse_result
and url_parse_result.scheme == "s3" and url_parse_result.scheme == "s3"
and url_parse_result.netloc and url_parse_result.netloc
): ):
textract_response_json = self.tc.call_textract( textract_response_json = self.tc.call_textract(
input_document=str(blob.path), # type: ignore[attr-defined] input_document=str(blob.path), # type: ignore[attr-defined]
@ -2045,7 +2060,8 @@ class DocumentIntelligenceParser(BaseBlobParser):
self.client = client self.client = client
self.model = model self.model = model
def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: # type: ignore[valid-type] def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
Document]: # type: ignore[valid-type]
for p in result.pages: for p in result.pages:
content = " ".join([line.content for line in p.lines]) content = " ".join([line.content for line in p.lines])