diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index eb707adcf3e..cabbe76200b 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -454,9 +454,7 @@ class PyPDFParser(BaseBlobParser): image_bytes = io.BytesIO() Image.fromarray(np_image).save(image_bytes, format="PNG") blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png") - image_text = next( - self.images_parser.lazy_parse(blob) # type: ignore - ).page_content + image_text = next(self.images_parser.lazy_parse(blob)).page_content images.append( _format_inner_image(blob, image_text, self.images_inner_format) ) @@ -751,7 +749,7 @@ class PDFMinerParser(BaseBlobParser): blob = Blob.from_path(Path(tempdir) / filename) blob.metadata["source"] = "#" image_text = next( - self.images_parser.lazy_parse(blob) # type: ignore + self.images_parser.lazy_parse(blob) ).page_content text_io.write( @@ -1104,9 +1102,7 @@ class PyMuPDFParser(BaseBlobParser): blob = Blob.from_data( image_bytes.getvalue(), mime_type="application/x-npy" ) - image_text = next( - self.images_parser.lazy_parse(blob) # type: ignore - ).page_content + image_text = next(self.images_parser.lazy_parse(blob)).page_content images.append( _format_inner_image(blob, image_text, self.images_inner_format) @@ -1196,6 +1192,8 @@ class PyPDFium2Parser(BaseBlobParser): # password=None, mode="page", pages_delimiter="\n\f", + # extract_images = True, + # images_to_text = convert_images_to_text_with_tesseract(), ) Lazily parse the blob: @@ -1365,9 +1363,7 @@ class PyPDFium2Parser(BaseBlobParser): continue numpy.save(image_bytes, image.get_bitmap().to_numpy()) blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy") - text_from_image = next( - self.images_parser.lazy_parse(blob) # type: ignore - ).page_content + text_from_image = 
next(self.images_parser.lazy_parse(blob)).page_content str_images.append( _format_inner_image(blob, text_from_image, self.images_inner_format) ) @@ -1410,6 +1406,7 @@ class PDFPlumberParser(BaseBlobParser): mode = "single", pages_delimiter = "\n\f", # extract_tables="markdown", + metadata_format="standard", ) Lazily parse the blob: @@ -1438,6 +1435,7 @@ class PDFPlumberParser(BaseBlobParser): images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extract_tables: Optional[Literal["csv", "markdown", "html"]] = None, extract_tables_settings: Optional[dict[str, Any]] = None, + metadata_format: Literal["legacy", "standard"] = "legacy", ) -> None: """Initialize the parser. @@ -1461,6 +1459,8 @@ class PDFPlumberParser(BaseBlobParser): dedupe: Avoiding the error of duplicate characters if `dedupe=True` extract_tables_settings: Optional dictionary of settings for customizing table extraction. + metadata_format: Use CamelCase keys with 'legacy' + and lower keys with 'standard'. Returns: This method does not directly return data. Use the `parse` or `lazy_parse` @@ -1492,6 +1492,19 @@ class PDFPlumberParser(BaseBlobParser): "snap_y_tolerance": 5, "intersection_x_tolerance": 15, } + if metadata_format == "legacy": + warnings.warn( + "The default value 'legacy' use some CamelCase keys. " + "It's will be deprecated in the next major version." + ) + + self.metadata_format = metadata_format + + def _validate_metadata(self, metadata: dict[str, Any]) -> dict[str, Any]: + if self.metadata_format == "legacy": + return metadata + else: + return _validate_metadata(metadata) def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob. @@ -1520,18 +1533,17 @@ class PDFPlumberParser(BaseBlobParser): contents = [] # The legacy version, use CreationDate, Creator, etc. # The new 'standard' version must use lower case key. - # This next line, merge the legecy keys and standard keys - # in the same dictionary. 
- # - The CreationDate is duplicate to `creationdate` with iso format. - # - The Creator is duplicate to 'creator', etc. - # With this strategy, the legacy code can continue to use CreationDate - # or Creator. The new code, can use `creationdate` or `creator`. - # _purge_metadata() convert and normalize the name and format of - # the metadatas. - - doc_metadata = ( - doc.metadata # Legacy metdata with... - | _purge_metadata( + if self.metadata_format == "legacy": + doc_metadata = ( + doc.metadata # Add parser metadata + | { # with more keys + "source": blob.source, + "file_path": blob.source, + "total_pages": len(doc.pages), + } + ) + else: + doc_metadata = _purge_metadata( ( doc.metadata # Add parser metdata | { # with more keys @@ -1541,7 +1553,6 @@ class PDFPlumberParser(BaseBlobParser): } ) ) - ) for page in doc.pages: tables_bbox: list[tuple[float, float, float, float]] = ( @@ -1596,7 +1607,7 @@ class PDFPlumberParser(BaseBlobParser): all_text += "\n" yield Document( page_content=all_text, - metadata=_validate_metadata( + metadata=self._validate_metadata( doc_metadata | { "page": page.page_number - 1, @@ -1608,7 +1619,7 @@ class PDFPlumberParser(BaseBlobParser): if self.mode == "single": yield Document( page_content=self.pages_delimiter.join(contents), - metadata=_validate_metadata(doc_metadata), + metadata=self._validate_metadata(doc_metadata), ) def _process_page_content(self, page: pdfplumber.page.Page) -> str: diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index cddea3da8f4..38f0fbcebfc 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -1044,6 +1044,7 @@ class PDFPlumberLoader(BasePDFLoader): # extract_tables_settings = None, # text_kwargs = {"use_text_flow": False, "keep_blank_chars": False}, # dedupe = False, + metadata_format="standard", ) Lazy load documents: @@ -1082,6 +1083,7 @@ 
class PDFPlumberLoader(BasePDFLoader): pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, extract_tables: Optional[Literal["csv", "markdown", "html"]] = None, extract_tables_settings: Optional[dict[str, Any]] = None, + metadata_format: Literal["legacy", "standard"] = "legacy", ) -> None: """Initialize with a file path. @@ -1108,6 +1110,8 @@ class PDFPlumberLoader(BasePDFLoader): table extraction. text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` dedupe: Avoiding the error of duplicate characters if `dedupe=True` + metadata_format: Use CamelCase keys with 'legacy' + and lower keys with 'standard'. Returns: This method does not directly return data. Use the `load`, `lazy_load`, @@ -1129,6 +1133,7 @@ class PDFPlumberLoader(BasePDFLoader): text_kwargs=text_kwargs, extract_tables_settings=extract_tables_settings, dedupe=dedupe, + metadata_format=metadata_format, ) def lazy_load( diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py index c61b0074333..439feaac822 100644 --- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py @@ -2,19 +2,18 @@ import re from pathlib import Path -from typing import TYPE_CHECKING, Iterator +from typing import TYPE_CHECKING, Iterator, Type import pytest -import langchain_community.document_loaders.parsers as pdf_parsers from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.parsers import ( BaseImageBlobParser, - PDFPlumberParser, ) from langchain_community.document_loaders.parsers.pdf import ( PDFMinerParser, + PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser, PyPDFParser, @@ -114,7 +113,7 @@ class EmptyImageBlobParser(BaseImageBlobParser): 
"parser_class,params", [ (PDFMinerParser, {}), - (PDFPlumberParser, {}), + (PDFPlumberParser, {"metadata_format": "standard"}), (PyMuPDFParser, {}), (PyPDFium2Parser, {}), (PyPDFParser, {"extraction_mode": "plain"}), @@ -145,7 +144,7 @@ def test_mode_and_extract_images_variations( "parser_class,params", [ (PDFMinerParser, {}), - (PDFPlumberParser, {}), + (PDFPlumberParser, {"metadata_format": "standard"}), (PyMuPDFParser, {}), (PyPDFium2Parser, {}), (PyPDFParser, {"extraction_mode": "plain"}), @@ -254,52 +253,50 @@ def test_parser_with_table( mode: str, extract_tables: str, ) -> None: - parser_class = getattr(pdf_parsers, parser_factory) - from PIL.Image import Image from langchain_community.document_loaders.parsers.images import BaseImageBlobParser def _std_assert_with_parser(parser: BaseBlobParser) -> None: - """Standard tests to verify that the given parser works. + """Standard tests to verify that the given parser works. - Args: - parser (BaseBlobParser): The parser to test. - """ - blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF) - doc_generator = parser.lazy_parse(blob) - docs = list(doc_generator) - tables = [] - for doc in docs: - if extract_tables == "markdown": - pattern = ( - r"(?s)(" - r"(?:(?:[^\n]*\|)\n)" - r"(?:\|(?:\s?:?---*:?\s?\|)+)\n" - r"(?:(?:[^\n]*\|)\n)+" - r")" - ) - elif extract_tables == "html": - pattern = r"(?s)(