mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-10 05:20:39 +00:00
#### Update (2): A single `UnstructuredLoader` is added to handle both local and API partitioning. This loader also handles single or multiple documents. #### Changes in `community`: Changes here do not affect users. In the initial process of using the SDK for the API Loaders, the Loaders in community were refactored. Other changes include: The `UnstructuredBaseLoader` has a new check to see if both `mode="paged"` and `chunking_strategy="by_page"` are set. It also now has `Element.element_id` added to the `Document.metadata`. `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader` now both directly inherit from `UnstructuredBaseLoader`, initialize their `file_path`/`file` attributes respectively, and implement their own `_post_process_elements` methods. -------- #### Update: New SDK Loaders in a [partner package](https://python.langchain.com/v0.1/docs/contributing/integrations/#partner-package-in-langchain-repo) are introduced to prevent breaking changes for users (see discussion below). ##### TODO: - [x] Test docstring examples -------- - **Description:** UnstructuredAPIFileIOLoader and UnstructuredAPIFileLoader calls to the Unstructured API are now made using the unstructured-client SDK. - **New Dependencies:** unstructured-client - [x] **Add tests and docs**: If you're adding a new integration, please include - [x] a test for the integration, preferably unit tests that do not rely on network access, - [x] update the description in `docs/docs/integrations/providers/unstructured.mdx` - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. 
- Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. TODO: - [x] Update https://python.langchain.com/v0.1/docs/integrations/document_loaders/unstructured_file/#unstructured-api - `langchain/docs/docs/integrations/document_loaders/unstructured_file.ipynb` - The description here needs to indicate that users should install `unstructured-client` instead of `unstructured`. Read over closely to look for any other changes that need to be made. - [x] Update the `lazy_load` method in `UnstructuredBaseLoader` to handle json responses from the API instead of just lists of elements. - This method may need to be overwritten by the API loaders instead of changing it in the `UnstructuredBaseLoader`. - [x] Update the documentation links in the class docstrings (the Unstructured documents have moved) - [x] Update Document.metadata to include `element_id` (see thread [here](https://unstructuredw-kbe4326.slack.com/archives/C044N0YV08G/p1718187499818419)) --------- Signed-off-by: ChengZi <chen.zhang@zilliz.com> Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Co-authored-by: ChengZi <chen.zhang@zilliz.com>
136 lines
3.6 KiB
Python
136 lines
3.6 KiB
Python
import os
|
|
from pathlib import Path
|
|
from typing import Callable
|
|
|
|
import pytest
|
|
|
|
from langchain_unstructured import UnstructuredLoader
|
|
|
|
# Directory holding the shared example documents: climb five `.parent` steps
# from this file, then descend into the `community` integration-test examples.
EXAMPLE_DOCS_DIRECTORY = str(
    Path(__file__).parent.parent.parent.parent.parent.joinpath(
        "community/tests/integration_tests/examples/"
    )
)
# Read once at import time; None when the environment variable is not set.
UNSTRUCTURED_API_KEY = os.environ.get("UNSTRUCTURED_API_KEY")
|
|
|
|
|
|
# -- Local partition --
|
|
|
|
|
|
@pytest.mark.local
def test_loader_partitions_locally() -> None:
    """Load the example PDF without the API and check filename and page breaks.

    Every returned document must carry the source filename in its metadata,
    and at least one document must have the ``PageBreak`` category (the
    loader is created with ``include_page_breaks=True``).
    """
    pdf_name = "layout-parser-paper.pdf"
    loader = UnstructuredLoader(
        file_path=os.path.join(EXAMPLE_DOCS_DIRECTORY, pdf_name),
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    )
    documents = loader.load()

    # No document may report a filename other than the one that was loaded.
    wrong_filename = [
        document
        for document in documents
        if document.metadata.get("filename") != pdf_name
    ]
    assert not wrong_filename

    categories = [document.metadata.get("category") for document in documents]
    assert "PageBreak" in categories
|
|
|
|
|
|
@pytest.mark.local
def test_loader_partition_ignores_invalid_arg() -> None:
    """Check that an extra ``mode`` kwarg does not break local partitioning.

    The load must still yield more than one document, each tagged with the
    source filename.
    """
    pdf_name = "layout-parser-paper.pdf"
    loader = UnstructuredLoader(
        file_path=os.path.join(EXAMPLE_DOCS_DIRECTORY, pdf_name),
        # Unstructured kwargs
        strategy="fast",
        # mode is no longer a valid argument and is ignored when partitioning locally
        mode="single",
    )
    documents = loader.load()

    assert len(documents) > 1
    wrong_filename = [
        document
        for document in documents
        if document.metadata.get("filename") != pdf_name
    ]
    assert not wrong_filename
|
|
|
|
|
|
@pytest.mark.local
def test_loader_partitions_locally_and_applies_post_processors(
    get_post_processor: Callable[[str], str],
) -> None:
    """Check that a supplied post-processor is applied to the loaded text.

    Args:
        get_post_processor: Fixture-provided callable that is handed to the
            loader via ``post_processors``.
    """
    source_pdf = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    documents = UnstructuredLoader(
        file_path=source_pdf,
        post_processors=[get_post_processor],
        strategy="fast",
    ).load()

    assert len(documents) > 1
    # The first document's text must end with the expected marker.
    first_text = documents[0].page_content
    assert first_text.endswith("THE END!")
|
|
|
|
|
|
# -- API partition --
|
|
|
|
|
|
def test_loader_partitions_via_api() -> None:
    """Load the example PDF with ``partition_via_api=True`` and check metadata.

    Note that no ``api_key`` is passed explicitly in this test. The result
    must contain more than one document, include a ``PageBreak`` element,
    tag every document with the source filename, and expose an
    ``element_id`` on the first document.
    """
    pdf_name = "layout-parser-paper.pdf"
    documents = UnstructuredLoader(
        file_path=os.path.join(EXAMPLE_DOCS_DIRECTORY, pdf_name),
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    ).load()

    assert len(documents) > 1

    categories = [document.metadata.get("category") for document in documents]
    assert "PageBreak" in categories

    wrong_filename = [
        document
        for document in documents
        if document.metadata.get("filename") != pdf_name
    ]
    assert not wrong_filename

    first_metadata = documents[0].metadata
    assert first_metadata.get("element_id") is not None
|
|
|
|
|
|
def test_loader_partitions_multiple_via_api() -> None:
    """Load two example files in one API-backed loader and check their order.

    The first returned document must come from the PDF and the last one from
    the email file, matching the order of the paths given to the loader.
    """
    first_name = "layout-parser-paper.pdf"
    last_name = "fake-email-attachment.eml"
    file_paths = [
        os.path.join(EXAMPLE_DOCS_DIRECTORY, name) for name in (first_name, last_name)
    ]

    documents = UnstructuredLoader(
        file_path=file_paths,
        api_key=UNSTRUCTURED_API_KEY,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
    ).load()

    assert len(documents) > 1
    leading, trailing = documents[0], documents[-1]
    assert leading.metadata.get("filename") == first_name
    assert trailing.metadata.get("filename") == last_name
|
|
|
|
|
|
def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
    """Check that an extra ``mode`` kwarg makes the API-backed load fail.

    The loader is built without error; the ``TypeError`` is expected from
    ``load()``, with a message naming the unexpected ``mode`` keyword.
    """
    loader_options = {
        "file_path": os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"),
        "api_key": UNSTRUCTURED_API_KEY,
        "partition_via_api": True,
        "mode": "elements",
    }
    loader = UnstructuredLoader(**loader_options)

    expected_message = "unexpected keyword argument 'mode'"
    with pytest.raises(TypeError, match=expected_message):
        loader.load()
|
|
|
|
|
|
# -- fixtures ---
|
|
|
|
|
|
@pytest.fixture()
def get_post_processor() -> Callable[[str], str]:
    """Provide a text post-processor that appends ``"THE END!"`` to its input.

    Returns:
        A callable taking a string and returning it with the marker appended.
    """
    end_marker = "THE END!"

    def add_end_marker(text: str) -> str:
        return text + end_marker

    return add_end_marker
|