mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-09 06:53:59 +00:00
community[minor]: 04 - Refactoring PDFMiner parser (#29526)
This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on updating the PDFMiner parser. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970). --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
@@ -11,7 +11,6 @@ from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers import (
|
||||
BaseImageBlobParser,
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
PyPDFium2Parser,
|
||||
)
|
||||
@@ -97,12 +96,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
|
||||
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
|
||||
|
||||
|
||||
def test_pdfminer_parser() -> None:
|
||||
"""Test PDFMiner parser."""
|
||||
# Does not follow defaults to split by page.
|
||||
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
|
||||
|
||||
|
||||
def test_pypdfium2_parser() -> None:
|
||||
"""Test PyPDFium2 parser."""
|
||||
# Does not follow defaults to split by page.
|
||||
@@ -116,11 +109,6 @@ def test_pdfplumber_parser() -> None:
|
||||
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
|
||||
|
||||
|
||||
def test_extract_images_text_from_pdf_pdfminerparser() -> None:
|
||||
"""Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
|
||||
_assert_with_parser(PDFMinerParser(extract_images=True))
|
||||
|
||||
|
||||
def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
|
||||
"""Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser""" # noqa: E501
|
||||
_assert_with_parser(PyPDFium2Parser(extract_images=True))
|
||||
@@ -138,6 +126,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
|
||||
@pytest.mark.parametrize(
|
||||
"parser_factory,params",
|
||||
[
|
||||
("PDFMinerParser", {}),
|
||||
("PyMuPDFParser", {}),
|
||||
("PyPDFParser", {"extraction_mode": "plain"}),
|
||||
("PyPDFParser", {"extraction_mode": "layout"}),
|
||||
@@ -166,6 +155,7 @@ def test_mode_and_extract_images_variations(
|
||||
@pytest.mark.parametrize(
|
||||
"parser_factory,params",
|
||||
[
|
||||
("PDFMinerParser", {}),
|
||||
("PyMuPDFParser", {}),
|
||||
("PyPDFParser", {"extraction_mode": "plain"}),
|
||||
("PyPDFParser", {"extraction_mode": "layout"}),
|
||||
|
@@ -8,7 +8,6 @@ import langchain_community.document_loaders as pdf_loaders
|
||||
from langchain_community.document_loaders import (
|
||||
AmazonTextractPDFLoader,
|
||||
MathpixPDFLoader,
|
||||
PDFMinerLoader,
|
||||
PDFMinerPDFasHTMLLoader,
|
||||
PyPDFium2Loader,
|
||||
UnstructuredPDFLoader,
|
||||
@@ -42,34 +41,6 @@ def test_unstructured_pdf_loader_default_mode() -> None:
|
||||
assert len(docs) == 1
|
||||
|
||||
|
||||
def test_pdfminer_loader() -> None:
|
||||
"""Test PDFMiner loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
# Verify that concatenating pages parameter works
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(file_path, concatenate_pages=True)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(file_path, concatenate_pages=False)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
|
||||
|
||||
def test_pdfminer_pdf_as_html_loader() -> None:
|
||||
"""Test PDFMinerPDFasHTMLLoader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
@@ -211,6 +182,7 @@ def test_amazontextract_loader_failures() -> None:
|
||||
@pytest.mark.parametrize(
|
||||
"parser_factory,params",
|
||||
[
|
||||
("PDFMinerLoader", {}),
|
||||
("PyMuPDFLoader", {}),
|
||||
("PyPDFLoader", {}),
|
||||
],
|
||||
@@ -234,6 +206,8 @@ def test_standard_parameters(
|
||||
images_parser=None,
|
||||
images_inner_format="text",
|
||||
password=None,
|
||||
extract_tables=None,
|
||||
extract_tables_settings=None,
|
||||
)
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
|
@@ -10,7 +10,6 @@ import langchain_community.document_loaders.parsers as pdf_parsers
|
||||
from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PyPDFium2Parser,
|
||||
_merge_text_and_extras,
|
||||
)
|
||||
@@ -75,13 +74,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
|
||||
assert int(metadata["page"]) == 0
|
||||
|
||||
|
||||
@pytest.mark.requires("pdfminer")
|
||||
def test_pdfminer_parser() -> None:
|
||||
"""Test PDFMiner parser."""
|
||||
# Does not follow defaults to split by page.
|
||||
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
|
||||
|
||||
|
||||
@pytest.mark.requires("pypdfium2")
|
||||
def test_pypdfium2_parser() -> None:
|
||||
"""Test PyPDFium2 parser."""
|
||||
@@ -92,6 +84,7 @@ def test_pypdfium2_parser() -> None:
|
||||
@pytest.mark.parametrize(
|
||||
"parser_factory,require,params",
|
||||
[
|
||||
("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
|
||||
("PyMuPDFParser", "pymupdf", {}),
|
||||
("PyPDFParser", "pypdf", {}),
|
||||
],
|
||||
|
Reference in New Issue
Block a user