unstructured[patch]: add to integration tests (#26666)

- Extend the tests on parsed content;
- Add tests for async + lazy loading;
- Add a test for `strategy="hi_res"`.
This commit is contained in:
ccurme 2024-09-19 13:43:34 -04:00 committed by GitHub
parent 28dd6564db
commit 7d49ee9741
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,8 +1,9 @@
import os
from pathlib import Path
from typing import Callable
from typing import Callable, List
import pytest
from langchain_core.documents import Document
from langchain_unstructured import UnstructuredLoader
@ -13,6 +14,51 @@ EXAMPLE_DOCS_DIRECTORY = str(
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
def _check_docs_content(docs: List[Document]) -> None:
    """Validate documents parsed from ``layout-parser-paper.pdf``.

    The checks run in this order:

    * every document's ``filename`` metadata is ``layout-parser-paper.pdf``;
    * exactly 16 documents have the category ``PageBreak``;
    * every document with non-empty content carries all expected metadata
      keys, and every document with empty content is a ``PageBreak``;
    * the truthy ``page_number`` values are exactly 1 through 16;
    * there are at least 32 documents;
    * the text on page 1 contains the paper title;
    * both ``NarrativeText`` and ``Title`` categories are present.

    Args:
        docs: Documents produced by the loader under test.

    Raises:
        AssertionError: If any of the checks above fails.
    """
    assert all(
        doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
    )

    page_break_count = sum(
        doc.metadata.get("category") == "PageBreak" for doc in docs
    )
    assert page_break_count == 16, (  # 16 page doc
        f"expected 16 page breaks, found {page_break_count}"
    )

    expected_metadata_keys = [
        "source",
        "languages",
        "page_number",
        "category",
        "coordinates",
        "element_id",
    ]
    for doc in docs:
        if doc.page_content:
            missing_keys = [
                key for key in expected_metadata_keys if key not in doc.metadata
            ]
            assert not missing_keys, f"metadata is missing keys: {missing_keys}"
        else:
            # Only page-break elements are allowed to have empty content.
            assert doc.metadata.get("category") == "PageBreak"

    # Documents without a (truthy) page number are ignored here.
    page_numbers = {
        page_number
        for doc in docs
        if (page_number := doc.metadata.get("page_number"))
    }
    assert page_numbers == set(range(1, 17))

    assert len(docs) >= 32  # (16 pages * (>=1 element per page) + 16 page breaks)

    page_1_content = "".join(
        f" {doc.page_content}"
        for doc in docs
        if doc.metadata.get("page_number") == 1
    )
    assert (
        "LayoutParser: A Unified Toolkit for Deep Learning "
        "Based Document Image Analysis"
    ) in page_1_content

    categories = {doc.metadata.get("category") for doc in docs}
    assert "NarrativeText" in categories
    assert "Title" in categories
# -- Local partition --
@ -27,10 +73,24 @@ def test_loader_partitions_locally() -> None:
include_page_breaks=True,
).load()
assert all(
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
_check_docs_content(docs)
@pytest.mark.local
async def test_loader_partitions_locally_async_lazy() -> None:
    """Check local partitioning when documents are read via ``alazy_load``.

    Builds a loader for ``layout-parser-paper.pdf`` with the ``fast``
    strategy and page breaks included, collects every document yielded by
    the async lazy iterator, and validates them with ``_check_docs_content``.
    """
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=file_path,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    )

    # Nothing may read `docs` before this point: it only exists once the
    # async iterator has been drained.
    docs = [doc async for doc in loader.alazy_load()]

    _check_docs_content(docs)
@pytest.mark.local
@ -79,16 +139,30 @@ def test_loader_partitions_via_api() -> None:
# Unstructured kwargs
strategy="fast",
include_page_breaks=True,
coordinates=True,
)
docs = loader.load()
assert len(docs) > 1
assert any(doc.metadata.get("category") == "PageBreak" for doc in docs)
assert all(
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
_check_docs_content(docs)
async def test_loader_partitions_via_api_async_lazy() -> None:
    """Check API partitioning when documents are read via ``alazy_load``.

    Builds a loader for ``layout-parser-paper.pdf`` with
    ``partition_via_api=True``, the ``fast`` strategy, page breaks and
    coordinates enabled, collects every document yielded by the async lazy
    iterator, and validates them with ``_check_docs_content``.
    """
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=file_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
        coordinates=True,
    )

    # Nothing may read `docs` before this point: it only exists once the
    # async iterator has been drained.
    docs = [doc async for doc in loader.alazy_load()]

    _check_docs_content(docs)
def test_loader_partitions_multiple_via_api() -> None:
@ -124,6 +198,22 @@ def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
loader.load()
def test_loader_partitions_via_api_hi_res() -> None:
    """Check that API partitioning with ``strategy="hi_res"`` finds rich elements.

    Loads ``layout-parser-paper.pdf`` through the API and asserts that both
    ``Table`` and ``Image`` appear among the returned document categories.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    hi_res_loader = UnstructuredLoader(
        file_path=pdf_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="hi_res",
    )

    seen_categories = {
        doc.metadata.get("category") for doc in hi_res_loader.load()
    }

    assert "Table" in seen_categories
    assert "Image" in seen_categories
# -- fixtures ---