mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-26 05:48:40 +00:00
Clean up tests for pdf parsers (#4595)
# Organize tests for PDF parsers: clean up the tests for PDF parsers, remove duplicate tests, and convert them to unit tests.
This commit is contained in:
@@ -7,6 +7,8 @@ from langchain.document_loaders.base import BaseBlobParser
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
)
|
||||
from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF
|
||||
@@ -62,3 +64,16 @@ def test_pdfminer_parser() -> None:
|
||||
"""Test PDFMiner parser."""
|
||||
# Does not follow defaults to split by page.
|
||||
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
|
||||
|
||||
|
||||
@pytest.mark.requires("fitz") # package is PyMuPDF
|
||||
def test_pymupdf_loader() -> None:
|
||||
"""Test PyMuPDF parser."""
|
||||
_assert_with_parser(PyMuPDFParser())
|
||||
|
||||
|
||||
@pytest.mark.requires("pypdfium2")
|
||||
def test_pypdfium2_parser() -> None:
|
||||
"""Test PyPDFium2 parser."""
|
||||
# No splits_by_page override is passed, so the helper's default applies.
|
||||
_assert_with_parser(PyPDFium2Parser())
|
||||
|
Reference in New Issue
Block a user