Harrison/unstructured page number (#6464)

Co-authored-by: Reza Sanaie <reza@sanaie.ca>
This commit is contained in:
Harrison Chase
2023-06-19 22:31:43 -07:00
committed by GitHub
parent b82ddf9cfb
commit 9eec7c3206
3 changed files with 51 additions and 4 deletions

View File

@@ -11,7 +11,25 @@ from langchain.document_loaders import (
)
def test_unstructured_pdf_loader() -> None:
def test_unstructured_pdf_loader_elements_mode() -> None:
    """Load hello.pdf in "elements" mode and expect exactly two documents."""
    # The examples directory sits two levels above this test module.
    base_dir = Path(__file__).parent.parent
    pdf_path = base_dir / "examples/hello.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path), mode="elements").load()
    assert len(documents) == 2
def test_unstructured_pdf_loader_paged_mode() -> None:
    """Load layout-parser-paper.pdf in "paged" mode and expect 16 documents."""
    # The examples directory sits two levels above this test module.
    base_dir = Path(__file__).parent.parent
    pdf_path = base_dir / "examples/layout-parser-paper.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path), mode="paged").load()
    assert len(documents) == 16
def test_unstructured_pdf_loader_default_mode() -> None:
"""Test unstructured loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = UnstructuredPDFLoader(str(file_path))