From 76b3d6bdb8231a77a391001692f9af7474c3a737 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Fri, 7 Mar 2025 14:45:37 +0100 Subject: [PATCH] Merge legacy and standard metadata keys in pdf parser. --- .../document_loaders/parsers/pdf.py | 238 ++++++++++-------- 1 file changed, 127 insertions(+), 111 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index a6c94673942..51c601f18fb 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -10,6 +10,10 @@ import warnings from datetime import datetime from pathlib import Path from tempfile import TemporaryDirectory +from urllib.parse import urlparse + +import numpy +import numpy as np from typing import ( TYPE_CHECKING, Any, @@ -23,11 +27,6 @@ from typing import ( Union, cast, ) -from urllib.parse import urlparse - -import numpy -import numpy as np -from langchain_core.documents import Document from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob @@ -35,6 +34,7 @@ from langchain_community.document_loaders.parsers.images import ( BaseImageBlobParser, RapidOCRBlobParser, ) +from langchain_core.documents import Document if TYPE_CHECKING: import pdfplumber @@ -62,7 +62,7 @@ _PDF_FILTER_WITHOUT_LOSS = [ def extract_from_images_with_rapidocr( - images: Sequence[Union[Iterable[np.ndarray], bytes]], + images: Sequence[Union[Iterable[np.ndarray], bytes]], ) -> str: """Extract text from images with RapidOCR. @@ -201,7 +201,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str: """ def _recurs_merge_text_and_extras( - extras: list[str], text_from_page: str, recurs: bool + extras: list[str], text_from_page: str, recurs: bool ) -> Optional[str]: if extras: for delim in _PARAGRAPH_DELIMITER: @@ -221,7 +221,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str: if str_extras: all_extras = delim + str_extras all_text = ( - text_from_page[:pos] + all_extras + text_from_page[pos:] + text_from_page[:pos] + all_extras + text_from_page[pos:] ) break else: @@ -291,16 +291,16 @@ class PyPDFParser(BaseBlobParser): """ def __init__( - self, - password: Optional[Union[str, bytes]] = None, - extract_images: bool = False, - *, - mode: Literal["single", "page"] = "page", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, - images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", - extraction_mode: Literal["plain", "layout"] = "plain", - extraction_kwargs: Optional[dict[str, Any]] = None, + self, + password: Optional[Union[str, bytes]] = None, + extract_images: bool = False, + *, + mode: Literal["single", "page"] = "page", + pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + images_parser: Optional[BaseImageBlobParser] = None, + images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", + extraction_mode: Literal["plain", "layout"] = "plain", + extraction_kwargs: Optional[dict[str, Any]] = None, ): """Initialize a parser based on PyPDF. 
@@ -519,15 +519,15 @@ class PDFMinerParser(BaseBlobParser): _warn_concatenate_pages = False def __init__( - self, - extract_images: bool = False, - *, - password: Optional[str] = None, - mode: Literal["single", "page"] = "single", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, - images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", - concatenate_pages: Optional[bool] = None, + self, + extract_images: bool = False, + *, + password: Optional[str] = None, + mode: Literal["single", "page"] = "single", + pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + images_parser: Optional[BaseImageBlobParser] = None, + images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", + concatenate_pages: Optional[bool] = None, ): """Initialize a parser based on PDFMiner. @@ -629,10 +629,10 @@ class PDFMinerParser(BaseBlobParser): return obj def _get_metadata( - self, - fp: BinaryIO, - password: str = "", - caching: bool = True, + self, + fp: BinaryIO, + password: str = "", + caching: bool = True, ) -> dict[str, Any]: """ Extract metadata from a PDF file. @@ -726,10 +726,10 @@ class PDFMinerParser(BaseBlobParser): class Visitor(PDFLayoutAnalyzer): def __init__( - self, - rsrcmgr: PDFResourceManager, - pageno: int = 1, - laparams: Optional[LAParams] = None, + self, + rsrcmgr: PDFResourceManager, + pageno: int = 1, + laparams: Optional[LAParams] = None, ) -> None: super().__init__(rsrcmgr, pageno=pageno, laparams=laparams) @@ -855,17 +855,17 @@ class PyMuPDFParser(BaseBlobParser): _lock = threading.Lock() def __init__( - self, - text_kwargs: Optional[dict[str, Any]] = None, - extract_images: bool = False, - *, - password: Optional[str] = None, - mode: Literal["single", "page"] = "page", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, - images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", - extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, - extract_tables_settings: Optional[dict[str, Any]] = None, + self, + text_kwargs: Optional[dict[str, Any]] = None, + extract_images: bool = False, + *, + password: Optional[str] = None, + mode: Literal["single", "page"] = "page", + pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + images_parser: Optional[BaseImageBlobParser] = None, + images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", + extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, + extract_tables_settings: Optional[dict[str, Any]] = None, ) -> None: """Initialize a parser based on PyMuPDF. @@ -921,11 +921,11 @@ class PyMuPDFParser(BaseBlobParser): ) def _lazy_parse( - self, - blob: Blob, - # text-kwargs is present for backwards compatibility. - # Users should not use it directly. - text_kwargs: Optional[dict[str, Any]] = None, + self, + blob: Blob, + # text-kwargs is present for backwards compatibility. + # Users should not use it directly. + text_kwargs: Optional[dict[str, Any]] = None, ) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob. Insert image, if possible, between two paragraphs. @@ -1014,10 +1014,10 @@ class PyMuPDFParser(BaseBlobParser): ) def _get_page_content( - self, - doc: pymupdf.Document, - page: pymupdf.Page, - text_kwargs: dict[str, Any], + self, + doc: pymupdf.Document, + page: pymupdf.Page, + text_kwargs: dict[str, Any], ) -> str: """Get the text of the page using PyMuPDF and RapidOCR and issue a warning if it is empty. 
@@ -1075,7 +1075,7 @@ class PyMuPDFParser(BaseBlobParser):
         return metadata
 
     def _extract_images_from_page(
-        self, doc: pymupdf.Document, page: pymupdf.Page
+        self, doc: pymupdf.Document, page: pymupdf.Page
     ) -> str:
         """Extract images from a PDF page and get the text using images_to_text.
 
@@ -1216,14 +1216,14 @@ class PyPDFium2Parser(BaseBlobParser):
     _lock = threading.Lock()
 
     def __init__(
-        self,
-        extract_images: bool = False,
-        *,
-        password: Optional[str] = None,
-        mode: Literal["single", "page"] = "page",
-        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
-        images_parser: Optional[BaseImageBlobParser] = None,
-        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        self,
+        extract_images: bool = False,
+        *,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "page",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
     ) -> None:
         """Initialize a parser based on PyPDFium2.
 
@@ -1426,18 +1426,18 @@ class PDFPlumberParser(BaseBlobParser):
     """
 
     def __init__(
-        self,
-        text_kwargs: Optional[Mapping[str, Any]] = None,
-        dedupe: bool = False,
-        extract_images: bool = False,
-        *,
-        password: Optional[str] = None,
-        mode: Literal["single", "page"] = "page",
-        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
-        images_parser: Optional[BaseImageBlobParser] = None,
-        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
-        extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
-        extract_tables_settings: Optional[dict[str, Any]] = None,
+        self,
+        text_kwargs: Optional[Mapping[str, Any]] = None,
+        dedupe: bool = False,
+        extract_images: bool = False,
+        *,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "page",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
+        extract_tables_settings: Optional[dict[str, Any]] = None,
     ) -> None:
         """Initialize the parser.
 
@@ -1522,15 +1522,29 @@ class PDFPlumberParser(BaseBlobParser):
         from pdfplumber.utils import geometry
 
         contents = []
-        doc_metadata = doc.metadata | _purge_metadata(
-            (
-                doc.metadata
-                | {
-                    "source": blob.source,
-                    "file_path": blob.source,
-                    "total_pages": len(doc.pages),
-                }
-            )
-        )
+        # The legacy metadata uses keys such as CreationDate, Creator, etc.,
+        # while the new "standard" metadata must use lower-case keys.
+        # The next expression merges the legacy keys and the standard keys
+        # into the same dictionary:
+        # - CreationDate is duplicated as `creationdate`, in ISO 8601 format.
+        # - Creator is duplicated as `creator`, etc.
+        # With this strategy, legacy code can continue to use CreationDate
+        # or Creator, while new code can use `creationdate` or `creator`.
+        # _purge_metadata() converts and normalizes the names and the
+        # formats of the metadata.
+
+        doc_metadata = (
+            doc.metadata  # the legacy metadata, with the original keys...
+            | _purge_metadata(  # ...merged with the normalized metadata
+                (
+                    doc.metadata
+                    | {  # the parser also adds a few keys of its own
+                        "source": blob.source,
+                        "file_path": blob.source,
+                        "total_pages": len(doc.pages),
+                    }
+                )
+            )
+        )
 
         for page in doc.pages:
@@ -1543,11 +1557,11 @@
             page_text = []
             extras = []
             for content in self._split_page_content(
-                page,
-                tables_bbox,
-                tables_content,
-                images_bbox,
-                image_from_page,
+                page,
+                tables_bbox,
+                tables_content,
+                images_bbox,
+                image_from_page,
             ):
                 if isinstance(content, str):  # Text
                     page_text.append(content)
@@ -1615,13 +1629,13 @@ class PDFPlumberParser(BaseBlobParser):
         return page.extract_text(**self.text_kwargs)
 
     def _split_page_content(
-        self,
-        page: pdfplumber.page.Page,
-        tables_bbox: list[tuple[float, float, float, float]],
-        tables_content: list[list[list[Any]]],
-        images_bbox: list[tuple[float, float, float, float]],
-        images_content: list[np.ndarray],
-        **kwargs: Any,
+        self,
+        page: pdfplumber.page.Page,
+        tables_bbox: list[tuple[float, float, float, float]],
+        tables_content: list[list[list[Any]]],
+        images_bbox: list[tuple[float, float, float, float]],
+        images_content: list[np.ndarray],
+        **kwargs: Any,
     ) -> Iterator[Union[str, list[list[str]], np.ndarray]]:
         """Split the page content into text, tables, and images.
 
@@ -1736,8 +1750,8 @@ class PDFPlumberParser(BaseBlobParser):
         return images
 
     def _extract_tables_bbox_from_page(
-        self,
-        page: pdfplumber.page.Page,
+        self,
+        page: pdfplumber.page.Page,
     ) -> list[tuple[float, float, float, float]]:
         """Extract bounding boxes of tables from a PDF page.
 
@@ -1756,8 +1770,8 @@ class PDFPlumberParser(BaseBlobParser):
         return [table.bbox for table in page.find_tables(tset)]
 
     def _extract_tables_from_page(
-        self,
-        page: pdfplumber.page.Page,
+        self,
+        page: pdfplumber.page.Page,
     ) -> list[list[list[Any]]]:
         """Extract tables from a PDF page.
 
@@ -1930,11 +1944,11 @@ class AmazonTextractPDFParser(BaseBlobParser):
     """
 
     def __init__(
-        self,
-        textract_features: Optional[Sequence[int]] = None,
-        client: Optional[Any] = None,
-        *,
-        linearization_config: Optional[TextLinearizationConfig] = None,
+        self,
+        textract_features: Optional[Sequence[int]] = None,
+        client: Optional[Any] = None,
+        *,
+        linearization_config: Optional[TextLinearizationConfig] = None,
    ) -> None:
         """Initializes the parser.
 
@@ -1999,12 +2013,14 @@ class AmazonTextractPDFParser(BaseBlobParser):
             the blob.data is taken
 
        """
-        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
+        url_parse_result = (
+            urlparse(str(blob.path)) if blob.path else None
+        )  # type: ignore[attr-defined]
         # Either call with S3 path (multi-page) or with bytes (single-page)
         if (
-            url_parse_result
-            and url_parse_result.scheme == "s3"
-            and url_parse_result.netloc
+            url_parse_result
+            and url_parse_result.scheme == "s3"
+            and url_parse_result.netloc
         ):
             textract_response_json = self.tc.call_textract(
                 input_document=str(blob.path),  # type: ignore[attr-defined]
@@ -2045,7 +2061,9 @@ class DocumentIntelligenceParser(BaseBlobParser):
         self.client = client
         self.model = model
 
-    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
+    def _generate_docs(
+        self, blob: Blob, result: Any
+    ) -> Iterator[Document]:  # type: ignore[valid-type]
         for p in result.pages:
             content = " ".join([line.content for line in p.lines])
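
Note (illustration, not part of the patch): the doc_metadata expression above
keeps the legacy keys while adding normalized copies. Below is a minimal,
self-contained sketch of the resulting behaviour. purge_metadata_stub() is an
invented, simplified stand-in for the real _purge_metadata() helper in pdf.py
(which validates values and normalizes more keys), and the sample values are
made up for the example:

    from datetime import datetime

    def purge_metadata_stub(metadata: dict) -> dict:
        """Lower-case the keys and convert PDF date strings to ISO 8601."""
        out = {}
        for key, value in metadata.items():
            key = key.lower()
            if key in ("creationdate", "moddate"):
                # Parse the date part of a PDF date string such as
                # "D:20250307144537+01'00'" (offset ignored in this sketch).
                value = datetime.strptime(value[2:16], "%Y%m%d%H%M%S").isoformat()
            out[key] = value
        return out

    legacy = {"CreationDate": "D:20250307144537+01'00'", "Creator": "LaTeX"}
    merged = legacy | purge_metadata_stub(
        legacy | {"source": "sample.pdf", "total_pages": 2}
    )
    assert merged["Creator"] == merged["creator"] == "LaTeX"
    print(merged["CreationDate"])  # legacy key and format are preserved
    print(merged["creationdate"])  # normalized copy: "2025-03-07T14:45:37"

Because the legacy dictionary is the left operand of `|`, the lower-cased keys
from _purge_metadata() live alongside the original mixed-case keys, so existing
callers keep seeing the same values they saw before this patch.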