Clean up tests for pdf parsers (#4595)

# Organize tests for pdf parsers

Clean up tests for pdf parsers, remove duplicate tests, convert to unit
tests.
This commit is contained in:
Eugene Yurtsev
2023-05-15 14:21:05 -04:00
committed by GitHub
parent 70fd7cda14
commit 09587a3201
4 changed files with 353 additions and 278 deletions

View File

@@ -7,6 +7,8 @@ from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import (
PDFMinerParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
)
from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF
@@ -62,3 +64,16 @@ def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
@pytest.mark.requires("fitz") # package is PyMuPDF
def test_pymupdf_loader() -> None:
"""Test PyMuPDF loader."""
_assert_with_parser(PyMuPDFParser())
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PyPDFium2Parser())