mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 04:07:54 +00:00
unstructured[patch]: add to integration tests (#26666)
- Extend the existing tests with checks on parsed content; - Add tests for async + lazy loading; - Add a test for `strategy="hi_res"`.
This commit is contained in:
parent
28dd6564db
commit
7d49ee9741
@ -1,8 +1,9 @@
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Callable
|
from typing import Callable, List
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
from langchain_unstructured import UnstructuredLoader
|
from langchain_unstructured import UnstructuredLoader
|
||||||
|
|
||||||
@ -13,6 +14,51 @@ EXAMPLE_DOCS_DIRECTORY = str(
|
|||||||
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
|
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
|
||||||
|
|
||||||
|
|
||||||
|
def _check_docs_content(docs: List[Document]) -> None:
    """Assert that ``docs`` look like a full parse of ``layout-parser-paper.pdf``.

    Shared by the loader tests so every loading path is held to the same
    expectations. The checks expect page breaks to be present in ``docs``.

    Args:
        docs: Documents produced by loading ``layout-parser-paper.pdf``.

    Raises:
        AssertionError: If any expectation about the parsed content fails.
    """
    expected_page_count = 16  # 16 page doc

    assert all(
        doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
    )
    assert (
        sum(doc.metadata.get("category") == "PageBreak" for doc in docs)
        == expected_page_count
    )

    expected_metadata_keys = [
        "source",
        "languages",
        "page_number",
        "category",
        "coordinates",
        "element_id",
    ]
    for doc in docs:
        if doc.page_content:
            missing = [
                key for key in expected_metadata_keys if key not in doc.metadata
            ]
            assert not missing, f"missing metadata keys: {missing}"
        else:
            # Only page breaks are allowed to carry no text.
            assert doc.metadata.get("category") == "PageBreak"

    # Elements without a (truthy) page number are ignored here.
    page_numbers = {
        page_number
        for doc in docs
        if (page_number := doc.metadata.get("page_number"))
    }
    assert page_numbers == set(range(1, expected_page_count + 1))
    # (16 pages * (>=1 element per page) + 16 page breaks)
    assert len(docs) >= 2 * expected_page_count

    page_1_content = " ".join(
        doc.page_content for doc in docs if doc.metadata.get("page_number") == 1
    )
    assert (
        "LayoutParser: A Unified Toolkit for Deep Learning "
        "Based Document Image Analysis"
    ) in page_1_content

    categories = {doc.metadata.get("category") for doc in docs}
    assert "NarrativeText" in categories
    assert "Title" in categories
|
||||||
|
|
||||||
|
|
||||||
# -- Local partition --
|
# -- Local partition --
|
||||||
|
|
||||||
|
|
||||||
@ -27,10 +73,24 @@ def test_loader_partitions_locally() -> None:
|
|||||||
include_page_breaks=True,
|
include_page_breaks=True,
|
||||||
).load()
|
).load()
|
||||||
|
|
||||||
assert all(
|
_check_docs_content(docs)
|
||||||
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
|
|
||||||
|
|
||||||
|
@pytest.mark.local
|
||||||
|
async def test_loader_partitions_locally_async_lazy() -> None:
|
||||||
|
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
||||||
|
|
||||||
|
loader = UnstructuredLoader(
|
||||||
|
file_path=file_path,
|
||||||
|
# Unstructured kwargs
|
||||||
|
strategy="fast",
|
||||||
|
include_page_breaks=True,
|
||||||
)
|
)
|
||||||
assert any(doc.metadata.get("category") == "PageBreak" for doc in docs)
|
docs = []
|
||||||
|
async for doc in loader.alazy_load():
|
||||||
|
docs.append(doc)
|
||||||
|
|
||||||
|
_check_docs_content(docs)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.local
|
@pytest.mark.local
|
||||||
@ -79,16 +139,30 @@ def test_loader_partitions_via_api() -> None:
|
|||||||
# Unstructured kwargs
|
# Unstructured kwargs
|
||||||
strategy="fast",
|
strategy="fast",
|
||||||
include_page_breaks=True,
|
include_page_breaks=True,
|
||||||
|
coordinates=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
|
||||||
assert len(docs) > 1
|
_check_docs_content(docs)
|
||||||
assert any(doc.metadata.get("category") == "PageBreak" for doc in docs)
|
|
||||||
assert all(
|
|
||||||
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
|
async def test_loader_partitions_via_api_async_lazy() -> None:
|
||||||
|
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
||||||
|
loader = UnstructuredLoader(
|
||||||
|
file_path=file_path,
|
||||||
|
partition_via_api=True,
|
||||||
|
# Unstructured kwargs
|
||||||
|
strategy="fast",
|
||||||
|
include_page_breaks=True,
|
||||||
|
coordinates=True,
|
||||||
)
|
)
|
||||||
assert docs[0].metadata.get("element_id") is not None
|
|
||||||
|
docs = []
|
||||||
|
async for doc in loader.alazy_load():
|
||||||
|
docs.append(doc)
|
||||||
|
|
||||||
|
_check_docs_content(docs)
|
||||||
|
|
||||||
|
|
||||||
def test_loader_partitions_multiple_via_api() -> None:
|
def test_loader_partitions_multiple_via_api() -> None:
|
||||||
@ -124,6 +198,22 @@ def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
|
|||||||
loader.load()
|
loader.load()
|
||||||
|
|
||||||
|
|
||||||
|
def test_loader_partitions_via_api_hi_res() -> None:
    """Check that API partitioning with ``strategy="hi_res"`` finds tables and images."""
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=file_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="hi_res",
    )

    docs = loader.load()

    categories = {doc.metadata.get("category") for doc in docs}
    assert "Table" in categories
    assert "Image" in categories
|
||||||
|
|
||||||
|
|
||||||
# -- fixtures ---
|
# -- fixtures ---
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user