diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py
index 07681a67671..2ec7a684be6 100644
--- a/libs/langchain/langchain/document_loaders/parsers/pdf.py
+++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py
@@ -1,11 +1,16 @@
 """Module contains common parsers for PDFs."""
-from typing import Any, Iterator, Mapping, Optional, Sequence, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Iterator, Mapping, Optional, Sequence, Union
 from urllib.parse import urlparse
 
 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.schema import Document
 
+if TYPE_CHECKING:
+    import pdfplumber.page
+
 
 class PyPDFParser(BaseBlobParser):
     """Load `PDF` using `pypdf` and chunk at character level."""
@@ -116,13 +121,17 @@ class PyPDFium2Parser(BaseBlobParser):
 class PDFPlumberParser(BaseBlobParser):
     """Parse `PDF` with `PDFPlumber`."""
 
-    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
+    def __init__(
+        self, text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False
+    ) -> None:
         """Initialize the parser.
 
         Args:
             text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
+            dedupe: Avoid duplicate characters if `dedupe=True`.
         """
         self.text_kwargs = text_kwargs or {}
+        self.dedupe = dedupe
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
@@ -133,7 +142,7 @@ class PDFPlumberParser(BaseBlobParser):
 
             yield from [
                 Document(
-                    page_content=page.extract_text(**self.text_kwargs),
+                    page_content=self._process_page_content(page),
                     metadata=dict(
                         {
                             "source": blob.source,
@@ -151,6 +160,12 @@ class PDFPlumberParser(BaseBlobParser):
                 for page in doc.pages
             ]
 
+    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
+        """Process the page content based on dedupe."""
+        if self.dedupe:
+            return page.dedupe_chars().extract_text(**self.text_kwargs)
+        return page.extract_text(**self.text_kwargs)
+
 
 class AmazonTextractPDFParser(BaseBlobParser):
     """Send `PDF` files to `Amazon Textract` and parse them.
diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py
index d907494d458..801a426a76b 100644
--- a/libs/langchain/langchain/document_loaders/pdf.py
+++ b/libs/langchain/langchain/document_loaders/pdf.py
@@ -437,7 +437,10 @@ class PDFPlumberLoader(BasePDFLoader):
     """Load `PDF` files using `pdfplumber`."""
 
     def __init__(
-        self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
+        self,
+        file_path: str,
+        text_kwargs: Optional[Mapping[str, Any]] = None,
+        dedupe: bool = False,
     ) -> None:
         """Initialize with a file path."""
         try:
@@ -450,11 +453,12 @@ class PDFPlumberLoader(BasePDFLoader):
 
         super().__init__(file_path)
         self.text_kwargs = text_kwargs or {}
+        self.dedupe = dedupe
 
     def load(self) -> List[Document]:
         """Load file."""
-        parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
+        parser = PDFPlumberParser(text_kwargs=self.text_kwargs, dedupe=self.dedupe)
         blob = Blob.from_path(self.file_path)
         return parser.parse(blob)
 
 
diff --git a/libs/langchain/tests/data.py b/libs/langchain/tests/data.py
index 228a9b212e1..c3b240bbc57 100644
--- a/libs/langchain/tests/data.py
+++ b/libs/langchain/tests/data.py
@@ -8,3 +8,4 @@ _EXAMPLES_DIR = _THIS_DIR / "integration_tests" / "examples"
 # Paths to test PDF files
 HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"
 LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"
+DUPLICATE_CHARS = _EXAMPLES_DIR / "duplicate-chars.pdf"
diff --git a/libs/langchain/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/langchain/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
index 7b76e0f721f..408498c126a 100644
--- a/libs/langchain/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/langchain/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -19,6 +19,10 @@ LAYOUT_PARSER_PAPER_PDF = (
     Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
 )
 
+DUPLICATE_CHARS = (
+    Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
+)
+
 
 def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
     """Standard tests to verify that the given parser works.
@@ -59,6 +63,25 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
     assert metadata["page"] == 0
 
 
+def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None:
+    """PDFPlumber tests to verify that duplicate characters appear or not.
+    Args:
+        parser (BaseBlobParser): The parser to test.
+        dedupe: Avoid duplicate characters if `dedupe=True`.
+    """
+    blob = Blob.from_path(DUPLICATE_CHARS)
+    doc_generator = parser.lazy_parse(blob)
+    assert isinstance(doc_generator, Iterator)
+    docs = list(doc_generator)
+
+    if dedupe:
+        # use dedupe to avoid duplicate characters.
+        assert "1000 Series" == docs[0].page_content.split("\n")[0]
+    else:
+        # duplicate characters will appear in doc if not dedupe
+        assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
+
+
 def test_pymupdf_loader() -> None:
     """Test PyMuPDF loader."""
     _assert_with_parser(PyMuPDFParser())
@@ -84,3 +107,5 @@ def test_pypdfium2_parser() -> None:
 def test_pdfplumber_parser() -> None:
     """Test PDFPlumber parser."""
     _assert_with_parser(PDFPlumberParser())
+    _assert_with_duplicate_parser(PDFPlumberParser())
+    _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
diff --git a/libs/langchain/tests/integration_tests/examples/duplicate-chars.pdf b/libs/langchain/tests/integration_tests/examples/duplicate-chars.pdf
new file mode 100644
index 00000000000..47467cd035d
Binary files /dev/null and b/libs/langchain/tests/integration_tests/examples/duplicate-chars.pdf differ