community[minor]: 04 - Refactoring PDFMiner parser (#29526)

This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once. This specific part focuses on updating the XXX
parser.

For more details, see [PR
28970](https://github.com/langchain-ai/langchain/pull/28970).

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Philippe PRADOS
2025-02-06 03:08:27 +01:00
committed by GitHub
parent 4460d20ba9
commit 6ff0d5c807
8 changed files with 2559 additions and 773 deletions

View File

@@ -11,7 +11,6 @@ from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
BaseImageBlobParser,
PDFMinerParser,
PDFPlumberParser,
PyPDFium2Parser,
)
@@ -97,12 +96,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
# Does not follow defaults to split by page.
@@ -116,11 +109,6 @@ def test_pdfplumber_parser() -> None:
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
def test_extract_images_text_from_pdf_pdfminerparser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
_assert_with_parser(PDFMinerParser(extract_images=True))
def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser""" # noqa: E501
_assert_with_parser(PyPDFium2Parser(extract_images=True))
@@ -138,6 +126,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
@pytest.mark.parametrize(
"parser_factory,params",
[
("PDFMinerParser", {}),
("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}),
@@ -166,6 +155,7 @@ def test_mode_and_extract_images_variations(
@pytest.mark.parametrize(
"parser_factory,params",
[
("PDFMinerParser", {}),
("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}),

View File

@@ -8,7 +8,6 @@ import langchain_community.document_loaders as pdf_loaders
from langchain_community.document_loaders import (
AmazonTextractPDFLoader,
MathpixPDFLoader,
PDFMinerLoader,
PDFMinerPDFasHTMLLoader,
PyPDFium2Loader,
UnstructuredPDFLoader,
@@ -42,34 +41,6 @@ def test_unstructured_pdf_loader_default_mode() -> None:
assert len(docs) == 1
def test_pdfminer_loader() -> None:
"""Test PDFMiner loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(file_path)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(file_path)
docs = loader.load()
assert len(docs) == 1
# Verify that concatenating pages parameter works
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(file_path, concatenate_pages=True)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(file_path, concatenate_pages=False)
docs = loader.load()
assert len(docs) == 16
def test_pdfminer_pdf_as_html_loader() -> None:
"""Test PDFMinerPDFasHTMLLoader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
@@ -211,6 +182,7 @@ def test_amazontextract_loader_failures() -> None:
@pytest.mark.parametrize(
"parser_factory,params",
[
("PDFMinerLoader", {}),
("PyMuPDFLoader", {}),
("PyPDFLoader", {}),
],
@@ -234,6 +206,8 @@ def test_standard_parameters(
images_parser=None,
images_inner_format="text",
password=None,
extract_tables=None,
extract_tables_settings=None,
)
docs = loader.load()
assert len(docs) == 16

View File

@@ -10,7 +10,6 @@ import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyPDFium2Parser,
_merge_text_and_extras,
)
@@ -75,13 +74,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
assert int(metadata["page"]) == 0
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
@@ -92,6 +84,7 @@ def test_pypdfium2_parser() -> None:
@pytest.mark.parametrize(
"parser_factory,require,params",
[
("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
("PyMuPDFParser", "pymupdf", {}),
("PyPDFParser", "pypdf", {}),
],