mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 16:13:25 +00:00
community[patch]: add to pypdf tests and run in CI (#26663)
This commit is contained in:
parent
4d3d62c249
commit
f91bdd12d2
@ -1,4 +1,3 @@
|
|||||||
import re
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Sequence, Union
|
from typing import Sequence, Union
|
||||||
|
|
||||||
@ -11,7 +10,6 @@ from langchain_community.document_loaders import (
|
|||||||
PDFMinerPDFasHTMLLoader,
|
PDFMinerPDFasHTMLLoader,
|
||||||
PyMuPDFLoader,
|
PyMuPDFLoader,
|
||||||
PyPDFium2Loader,
|
PyPDFium2Loader,
|
||||||
PyPDFLoader,
|
|
||||||
UnstructuredPDFLoader,
|
UnstructuredPDFLoader,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -86,37 +84,6 @@ def test_pdfminer_pdf_as_html_loader() -> None:
|
|||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_pypdf_loader() -> None:
|
|
||||||
"""Test PyPDFLoader."""
|
|
||||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
|
||||||
loader = PyPDFLoader(str(file_path))
|
|
||||||
docs = loader.load()
|
|
||||||
|
|
||||||
assert len(docs) == 1
|
|
||||||
|
|
||||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
|
||||||
loader = PyPDFLoader(str(file_path))
|
|
||||||
|
|
||||||
docs = loader.load()
|
|
||||||
assert len(docs) == 16
|
|
||||||
|
|
||||||
|
|
||||||
def test_pypdf_loader_with_layout() -> None:
|
|
||||||
"""Test PyPDFLoader with layout mode."""
|
|
||||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
|
||||||
loader = PyPDFLoader(str(file_path), extraction_mode="layout")
|
|
||||||
|
|
||||||
docs = loader.load()
|
|
||||||
first_page = docs[0].page_content
|
|
||||||
|
|
||||||
expected = (
|
|
||||||
Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt"
|
|
||||||
).read_text(encoding="utf-8")
|
|
||||||
cleaned_first_page = re.sub(r"\x00", "", first_page)
|
|
||||||
cleaned_expected = re.sub(r"\x00", "", expected)
|
|
||||||
assert cleaned_first_page == cleaned_expected
|
|
||||||
|
|
||||||
|
|
||||||
def test_pypdfium2_loader() -> None:
|
def test_pypdfium2_loader() -> None:
|
||||||
"""Test PyPDFium2Loader."""
|
"""Test PyPDFium2Loader."""
|
||||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||||
|
62
libs/community/tests/unit_tests/document_loaders/test_pdf.py
Normal file
62
libs/community/tests/unit_tests/document_loaders/test_pdf.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain_community.document_loaders import PyPDFLoader
|
||||||
|
|
||||||
|
path_to_simple_pdf = (
|
||||||
|
Path(__file__).parent.parent.parent / "integration_tests/examples/hello.pdf"
|
||||||
|
)
|
||||||
|
path_to_layout_pdf = (
|
||||||
|
Path(__file__).parent.parent
|
||||||
|
/ "document_loaders/sample_documents/layout-parser-paper.pdf"
|
||||||
|
)
|
||||||
|
path_to_layout_pdf_txt = (
|
||||||
|
Path(__file__).parent.parent.parent
|
||||||
|
/ "integration_tests/examples/layout-parser-paper-page-1.txt"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("pypdf")
|
||||||
|
def test_pypdf_loader() -> None:
|
||||||
|
"""Test PyPDFLoader."""
|
||||||
|
loader = PyPDFLoader(str(path_to_simple_pdf))
|
||||||
|
docs = loader.load()
|
||||||
|
|
||||||
|
assert len(docs) == 1
|
||||||
|
|
||||||
|
loader = PyPDFLoader(str(path_to_layout_pdf))
|
||||||
|
|
||||||
|
docs = loader.load()
|
||||||
|
assert len(docs) == 16
|
||||||
|
for page, doc in enumerate(docs):
|
||||||
|
assert doc.metadata["page"] == page
|
||||||
|
assert doc.metadata["source"].endswith("layout-parser-paper.pdf")
|
||||||
|
assert len(doc.page_content) > 10
|
||||||
|
|
||||||
|
first_page = docs[0].page_content
|
||||||
|
for expected in ["LayoutParser", "A Unified Toolkit"]:
|
||||||
|
assert expected in first_page
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("pypdf")
|
||||||
|
def test_pypdf_loader_with_layout() -> None:
|
||||||
|
"""Test PyPDFLoader with layout mode."""
|
||||||
|
loader = PyPDFLoader(str(path_to_layout_pdf), extraction_mode="layout")
|
||||||
|
|
||||||
|
docs = loader.load()
|
||||||
|
assert len(docs) == 16
|
||||||
|
for page, doc in enumerate(docs):
|
||||||
|
assert doc.metadata["page"] == page
|
||||||
|
assert doc.metadata["source"].endswith("layout-parser-paper.pdf")
|
||||||
|
assert len(doc.page_content) > 10
|
||||||
|
|
||||||
|
first_page = docs[0].page_content
|
||||||
|
for expected in ["LayoutParser", "A Unified Toolkit"]:
|
||||||
|
assert expected in first_page
|
||||||
|
|
||||||
|
expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
|
||||||
|
cleaned_first_page = re.sub(r"\x00", "", first_page)
|
||||||
|
cleaned_expected = re.sub(r"\x00", "", expected)
|
||||||
|
assert cleaned_first_page == cleaned_expected
|
Loading…
Reference in New Issue
Block a user