mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-03 03:38:06 +00:00
unstructured[patch]: add to integration tests (#26666)
- Add to tests on parsed content; - Add tests for async + lazy loading; - Add a test for `strategy="hi_res"`.
This commit is contained in:
parent
28dd6564db
commit
7d49ee9741
@ -1,8 +1,9 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
from typing import Callable, List
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
|
||||
@ -13,6 +14,51 @@ EXAMPLE_DOCS_DIRECTORY = str(
|
||||
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
|
||||
|
||||
|
||||
def _check_docs_content(docs: List[Document]) -> None:
    """Assert that ``docs`` look like a full parse of ``layout-parser-paper.pdf``.

    The expectations match the 16-page example PDF loaded with page breaks
    included: at least one content element per page plus one ``PageBreak``
    element per page.

    Args:
        docs: Documents produced by the loader under test.

    Raises:
        AssertionError: If any expectation about the parsed content fails.
    """
    expected_filename = "layout-parser-paper.pdf"
    expected_pages = 16  # 16 page doc
    expected_metadata_keys = (
        "source",
        "languages",
        "page_number",
        "category",
        "coordinates",
        "element_id",
    )

    assert all(doc.metadata.get("filename") == expected_filename for doc in docs)

    page_break_count = sum(
        doc.metadata.get("category") == "PageBreak" for doc in docs
    )
    assert (
        page_break_count == expected_pages
    ), f"expected {expected_pages} page breaks, got {page_break_count}"

    for doc in docs:
        if doc.page_content:
            missing = [key for key in expected_metadata_keys if key not in doc.metadata]
            assert not missing, f"missing metadata keys: {missing}"
        else:
            # Only page breaks are allowed to carry no text.
            assert doc.metadata.get("category") == "PageBreak"

    # Elements without a (truthy) page number are ignored here.
    page_numbers = {
        doc.metadata["page_number"]
        for doc in docs
        if doc.metadata.get("page_number")
    }
    assert page_numbers == set(range(1, expected_pages + 1))

    # 16 pages * (>=1 element per page) + 16 page breaks
    assert len(docs) >= 2 * expected_pages

    page_1_content = " ".join(
        doc.page_content for doc in docs if doc.metadata.get("page_number") == 1
    )
    assert (
        "LayoutParser: A Unified Toolkit for Deep Learning "
        "Based Document Image Analysis"
    ) in page_1_content

    categories = {doc.metadata.get("category") for doc in docs}
    assert "NarrativeText" in categories
    assert "Title" in categories
|
||||
|
||||
|
||||
# -- Local partition --
|
||||
|
||||
|
||||
@ -27,10 +73,24 @@ def test_loader_partitions_locally() -> None:
|
||||
include_page_breaks=True,
|
||||
).load()
|
||||
|
||||
assert all(
|
||||
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
|
||||
_check_docs_content(docs)
|
||||
|
||||
|
||||
@pytest.mark.local
async def test_loader_partitions_locally_async_lazy() -> None:
    """Local partitioning through ``alazy_load`` yields the expected content."""
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    loader = UnstructuredLoader(
        file_path=file_path,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    )

    # Collect the lazily produced documents before validating them; nothing
    # may touch `docs` before this point.
    docs = [doc async for doc in loader.alazy_load()]

    _check_docs_content(docs)
|
||||
|
||||
|
||||
@pytest.mark.local
|
||||
@ -79,16 +139,30 @@ def test_loader_partitions_via_api() -> None:
|
||||
# Unstructured kwargs
|
||||
strategy="fast",
|
||||
include_page_breaks=True,
|
||||
coordinates=True,
|
||||
)
|
||||
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) > 1
|
||||
assert any(doc.metadata.get("category") == "PageBreak" for doc in docs)
|
||||
assert all(
|
||||
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
|
||||
_check_docs_content(docs)
|
||||
|
||||
|
||||
async def test_loader_partitions_via_api_async_lazy() -> None:
    """API partitioning through ``alazy_load`` yields the expected content."""
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=file_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
        coordinates=True,
    )

    # Collect the lazily produced documents before validating them; nothing
    # may touch `docs` before this point.
    docs = [doc async for doc in loader.alazy_load()]

    _check_docs_content(docs)
|
||||
|
||||
|
||||
def test_loader_partitions_multiple_via_api() -> None:
|
||||
@ -124,6 +198,22 @@ def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
|
||||
loader.load()
|
||||
|
||||
|
||||
def test_loader_partitions_via_api_hi_res() -> None:
    """API partitioning with ``strategy="hi_res"`` reports tables and images."""
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=file_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="hi_res",
    )

    docs = loader.load()

    categories = {doc.metadata.get("category") for doc in docs}
    assert "Table" in categories
    assert "Image" in categories
|
||||
|
||||
|
||||
# -- fixtures ---
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user