unstructured, community, initialize langchain-unstructured package (#22779)

#### Update (2): 
A single `UnstructuredLoader` is added to handle both local and api
partitioning. This loader also handles single or multiple documents.

#### Changes in `community`:
Changes here do not affect users. In the initial process of using the
SDK for the API Loaders, the Loaders in community were refactored.
Other changes include:
The `UnstructuredBaseLoader` has a new check that raises an error if both
`mode="paged"` and `chunking_strategy="by_page"` are set. It also now adds
`Element.element_id` to the `Document.metadata`.
`UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader` now both
directly inherit from `UnstructuredBaseLoader`, initialize their
`file_path`/`file` attributes respectively, and implement their own
`_post_process_elements` methods.

--------
#### Update:
New SDK Loaders in a [partner
package](https://python.langchain.com/v0.1/docs/contributing/integrations/#partner-package-in-langchain-repo)
are introduced to prevent breaking changes for users (see discussion
below).

##### TODO:
- [x] Test docstring examples
--------
- **Description:** UnstructuredAPIFileIOLoader and
UnstructuredAPIFileLoader calls to the unstructured api are now made
using the unstructured-client sdk.
- **New Dependencies:** unstructured-client

- [x] **Add tests and docs**: If you're adding a new integration, please
include
- [x] a test for the integration, preferably unit tests that do not rely
on network access,
- [x] update the description in
`docs/docs/integrations/providers/unstructured.mdx`
- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

TODO:
- [x] Update
https://python.langchain.com/v0.1/docs/integrations/document_loaders/unstructured_file/#unstructured-api
-
`langchain/docs/docs/integrations/document_loaders/unstructured_file.ipynb`
- The description here needs to indicate that users should install
`unstructured-client` instead of `unstructured`. Read over closely to
look for any other changes that need to be made.
- [x] Update the `lazy_load` method in `UnstructuredBaseLoader` to
handle json responses from the API instead of just lists of elements.
- This method may need to be overwritten by the API loaders instead of
changing it in the `UnstructuredBaseLoader`.
- [x] Update the documentation links in the class docstrings (the
Unstructured documents have moved)
- [x] Update Document.metadata to include `element_id` (see thread
[here](https://unstructuredw-kbe4326.slack.com/archives/C044N0YV08G/p1718187499818419))

---------

Signed-off-by: ChengZi <chen.zhang@zilliz.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
Co-authored-by: ChengZi <chen.zhang@zilliz.com>
This commit is contained in:
John
2024-07-24 19:21:20 -04:00
committed by GitHub
parent 2394807033
commit d59c656ea5
23 changed files with 5929 additions and 347 deletions

View File

@@ -1,14 +1,23 @@
"""Loader that uses unstructured to load files."""
import collections
from __future__ import annotations
import logging
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
from typing import IO, Any, Callable, Iterator, List, Optional, Sequence, Union
from langchain_core._api.deprecation import deprecated
from langchain_core.documents import Document
from typing_extensions import TypeAlias
from langchain_community.document_loaders.base import BaseLoader
Element: TypeAlias = Any
logger = logging.getLogger(__file__)
def satisfies_min_unstructured_version(min_version: str) -> bool:
"""Check if the installed `Unstructured` version exceeds the minimum version
@@ -41,8 +50,8 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
def __init__(
self,
mode: str = "single",
post_processors: Optional[List[Callable]] = None,
mode: str = "single", # deprecated
post_processors: Optional[List[Callable[[str], str]]] = None,
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
@@ -53,32 +62,41 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
# `single` - elements are combined into one (default)
# `elements` - maintain individual elements
# `paged` - elements are combined by page
_valid_modes = {"single", "elements", "paged"}
if mode not in _valid_modes:
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
)
self.mode = mode
if not satisfies_min_unstructured_version("0.5.4"):
if "strategy" in unstructured_kwargs:
unstructured_kwargs.pop("strategy")
self._check_if_both_mode_and_chunking_strategy_are_by_page(
mode, unstructured_kwargs
)
self.mode = mode
self.unstructured_kwargs = unstructured_kwargs
self.post_processors = post_processors or []
@abstractmethod
def _get_elements(self) -> List:
def _get_elements(self) -> List[Element]:
"""Get elements."""
@abstractmethod
def _get_metadata(self) -> dict:
"""Get metadata."""
def _get_metadata(self) -> dict[str, Any]:
"""Get file_path metadata if available."""
def _post_process_elements(self, elements: list) -> list:
"""Applies post processing functions to extracted unstructured elements.
Post processing functions are str -> str callables are passed
in using the post_processors kwarg when the loader is instantiated."""
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
"""Apply post processing functions to extracted unstructured elements.
Post processing functions are str -> str callables passed
in using the post_processors kwarg when the loader is instantiated.
"""
for element in elements:
for post_processor in self.post_processors:
element.apply(post_processor)
@@ -97,18 +115,25 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
metadata.update(element.metadata.to_dict())
if hasattr(element, "category"):
metadata["category"] = element.category
if element.to_dict().get("element_id"):
metadata["element_id"] = element.to_dict().get("element_id")
yield Document(page_content=str(element), metadata=metadata)
elif self.mode == "paged":
text_dict: Dict[int, str] = {}
meta_dict: Dict[int, Dict] = {}
logger.warning(
"`mode='paged'` is deprecated in favor of the 'by_page' chunking"
" strategy. Learn more about chunking here:"
" https://docs.unstructured.io/open-source/core-functionality/chunking"
)
text_dict: dict[int, str] = {}
meta_dict: dict[int, dict[str, Any]] = {}
for idx, element in enumerate(elements):
for element in elements:
metadata = self._get_metadata()
if hasattr(element, "metadata"):
metadata.update(element.metadata.to_dict())
page_number = metadata.get("page_number", 1)
# Check if this page_number already exists in docs_dict
# Check if this page_number already exists in text_dict
if page_number not in text_dict:
# If not, create new entry with initial text and metadata
text_dict[page_number] = str(element) + "\n\n"
@@ -128,18 +153,37 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
else:
raise ValueError(f"mode of {self.mode} not supported.")
def _check_if_both_mode_and_chunking_strategy_are_by_page(
    self, mode: str, unstructured_kwargs: dict[str, Any]
) -> None:
    """Reject configurations that ask for page grouping in two ways at once.

    Args:
        mode: The loader mode requested by the caller.
        unstructured_kwargs: Extra keyword arguments for unstructured; only the
            ``chunking_strategy`` key is inspected.

    Raises:
        ValueError: If ``mode`` is ``"paged"`` and ``chunking_strategy`` is
            ``"by_page"``.
    """
    # Any mode other than "paged" cannot conflict with the chunking strategy.
    if mode != "paged":
        return
    if unstructured_kwargs.get("chunking_strategy") != "by_page":
        return
    raise ValueError(
        "Only one of `chunking_strategy='by_page'` or `mode='paged'` may be"
        " set. `chunking_strategy` is preferred."
    )
@deprecated(
since="0.2.8",
removal="0.4.0",
alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredFileLoader(UnstructuredBaseLoader):
"""Load files using `Unstructured`.
The file loader uses the
unstructured partition function and will automatically detect the file
type. You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
The file loader uses the unstructured partition function and will automatically
detect the file type. You can run the loader in different modes: "single",
"elements", and "paged". The default "single" mode will return a single langchain
Document object. If you use "elements" mode, the unstructured library will split
the document into elements such as Title and NarrativeText and return those as
individual langchain Document objects. In addition to these post-processing modes
(which are specific to the LangChain Loaders), Unstructured has its own "chunking"
parameters for post-processing elements into more useful chunks for uses cases such
as Retrieval Augmented Generation (RAG). You can pass in additional unstructured
kwargs to configure different unstructured settings.
Examples
--------
@@ -152,24 +196,27 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
"""
def __init__(
self,
file_path: Union[str, List[str], Path, List[Path], None],
file_path: Union[str, List[str], Path, List[Path]],
*,
mode: str = "single",
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
self.file_path = file_path
super().__init__(mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
def _get_elements(self) -> List[Element]:
from unstructured.partition.auto import partition
if isinstance(self.file_path, list):
elements = []
elements: List[Element] = []
for file in self.file_path:
if isinstance(file, Path):
file = str(file)
@@ -180,35 +227,33 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
self.file_path = str(self.file_path)
return partition(filename=self.file_path, **self.unstructured_kwargs)
def _get_metadata(self) -> dict:
def _get_metadata(self) -> dict[str, Any]:
return {"source": self.file_path}
def get_elements_from_api(
file_path: Union[str, List[str], Path, List[Path], None] = None,
file: Union[IO, Sequence[IO], None] = None,
api_url: str = "https://api.unstructured.io/general/v0/general",
file: Union[IO[bytes], Sequence[IO[bytes]], None] = None,
api_url: str = "https://api.unstructuredapp.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
) -> List:
) -> List[Element]:
"""Retrieve a list of elements from the `Unstructured API`."""
if is_list := isinstance(file_path, list):
file_path = [str(path) for path in file_path]
if isinstance(file, collections.abc.Sequence) or is_list:
if isinstance(file, Sequence) or is_list:
from unstructured.partition.api import partition_multiple_via_api
_doc_elements = partition_multiple_via_api(
filenames=file_path,
files=file,
filenames=file_path, # type: ignore
files=file, # type: ignore
api_key=api_key,
api_url=api_url,
**unstructured_kwargs,
)
elements = []
for _elements in _doc_elements:
elements.extend(_elements)
return elements
else:
from unstructured.partition.api import partition_via_api
@@ -222,59 +267,69 @@ def get_elements_from_api(
)
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
@deprecated(
since="0.2.8",
removal="0.4.0",
alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredAPIFileLoader(UnstructuredBaseLoader):
"""Load files using `Unstructured` API.
By default, the loader makes a call to the hosted Unstructured API.
If you are running the unstructured API locally, you can change the
API rule by passing in the url parameter when you initialize the loader.
The hosted Unstructured API requires an API key. See
https://www.unstructured.io/api-key/ if you need to generate a key.
By default, the loader makes a call to the hosted Unstructured API. If you are
running the unstructured API locally, you can change the API rule by passing in the
url parameter when you initialize the loader. The hosted Unstructured API requires
an API key. See the links below to learn more about our API offerings and get an
API key.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
You can run the loader in different modes: "single", "elements", and "paged". The
default "single" mode will return a single langchain Document object. If you use
"elements" mode, the unstructured library will split the document into elements such
as Title and NarrativeText and return those as individual langchain Document
objects. In addition to these post-processing modes (which are specific to the
LangChain Loaders), Unstructured has its own "chunking" parameters for
post-processing elements into more useful chunks for uses cases such as Retrieval
Augmented Generation (RAG). You can pass in additional unstructured kwargs to
configure different unstructured settings.
Examples
```python
from langchain_community.document_loaders import UnstructuredAPIFileLoader
loader = UnstructuredFileAPILoader(
loader = UnstructuredAPIFileLoader(
"example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
https://www.unstructured.io/api-key/
https://github.com/Unstructured-IO/unstructured-api
https://docs.unstructured.io/api-reference/api-services/sdk
https://docs.unstructured.io/api-reference/api-services/overview
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
"""
def __init__(
self,
file_path: Union[str, List[str], None] = "",
file_path: Union[str, List[str]],
*,
mode: str = "single",
url: str = "https://api.unstructured.io/general/v0/general",
url: str = "https://api.unstructuredapp.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
validate_unstructured_version(min_unstructured_version="0.10.15")
self.file_path = file_path
self.url = url
self.api_key = api_key
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
super().__init__(mode=mode, **unstructured_kwargs)
def _get_metadata(self) -> dict:
def _get_metadata(self) -> dict[str, Any]:
return {"source": self.file_path}
def _get_elements(self) -> List:
def _get_elements(self) -> List[Element]:
return get_elements_from_api(
file_path=self.file_path,
api_key=self.api_key,
@@ -282,18 +337,36 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
**self.unstructured_kwargs,
)
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
"""Apply post processing functions to extracted unstructured elements.
Post processing functions are str -> str callables passed
in using the post_processors kwarg when the loader is instantiated.
"""
for element in elements:
for post_processor in self.post_processors:
element.apply(post_processor)
return elements
@deprecated(
since="0.2.8",
removal="0.4.0",
alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
"""Load files using `Unstructured`.
"""Load file-like objects opened in read mode using `Unstructured`.
The file loader
uses the unstructured partition function and will automatically detect the file
type. You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
The file loader uses the unstructured partition function and will automatically
detect the file type. You can run the loader in different modes: "single",
"elements", and "paged". The default "single" mode will return a single langchain
Document object. If you use "elements" mode, the unstructured library will split
the document into elements such as Title and NarrativeText and return those as
individual langchain Document objects. In addition to these post-processing modes
(which are specific to the LangChain Loaders), Unstructured has its own "chunking"
parameters for post-processing elements into more useful chunks for uses cases
such as Retrieval Augmented Generation (RAG). You can pass in additional
unstructured kwargs to configure different unstructured settings.
Examples
--------
@@ -308,12 +381,14 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
"""
def __init__(
self,
file: Union[IO, Sequence[IO]],
file: IO[bytes],
*,
mode: str = "single",
**unstructured_kwargs: Any,
):
@@ -321,72 +396,114 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
self.file = file
super().__init__(mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
def _get_elements(self) -> List[Element]:
from unstructured.partition.auto import partition
return partition(file=self.file, **self.unstructured_kwargs)
def _get_metadata(self) -> dict:
def _get_metadata(self) -> dict[str, Any]:
return {}
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
"""Apply post processing functions to extracted unstructured elements.
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
"""Load files using `Unstructured` API.
Post processing functions are str -> str callables passed
in using the post_processors kwarg when the loader is instantiated.
"""
for element in elements:
for post_processor in self.post_processors:
element.apply(post_processor)
return elements
By default, the loader makes a call to the hosted Unstructured API.
If you are running the unstructured API locally, you can change the
API rule by passing in the url parameter when you initialize the loader.
The hosted Unstructured API requires an API key. See
https://www.unstructured.io/api-key/ if you need to generate a key.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
@deprecated(
since="0.2.8",
removal="0.4.0",
alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredAPIFileIOLoader(UnstructuredBaseLoader):
"""Send file-like objects with `unstructured-client` sdk to the Unstructured API.
By default, the loader makes a call to the hosted Unstructured API. If you are
running the unstructured API locally, you can change the API rule by passing in the
url parameter when you initialize the loader. The hosted Unstructured API requires
an API key. See the links below to learn more about our API offerings and get an
API key.
You can run the loader in different modes: "single", "elements", and "paged". The
default "single" mode will return a single langchain Document object. If you use
"elements" mode, the unstructured library will split the document into elements
such as Title and NarrativeText and return those as individual langchain Document
objects. In addition to these post-processing modes (which are specific to the
LangChain Loaders), Unstructured has its own "chunking" parameters for
post-processing elements into more useful chunks for uses cases such as Retrieval
Augmented Generation (RAG). You can pass in additional unstructured kwargs to
configure different unstructured settings.
Examples
--------
from langchain_community.document_loaders import UnstructuredAPIFileLoader
with open("example.pdf", "rb") as f:
loader = UnstructuredFileAPILoader(
loader = UnstructuredAPIFileIOLoader(
f, mode="elements", strategy="fast", api_key="MY_API_KEY",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
https://www.unstructured.io/api-key/
https://github.com/Unstructured-IO/unstructured-api
https://docs.unstructured.io/api-reference/api-services/sdk
https://docs.unstructured.io/api-reference/api-services/overview
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
"""
def __init__(
self,
file: Union[IO, Sequence[IO]],
file: Union[IO[bytes], Sequence[IO[bytes]]],
*,
mode: str = "single",
url: str = "https://api.unstructured.io/general/v0/general",
url: str = "https://api.unstructuredapp.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
if isinstance(file, collections.abc.Sequence):
if isinstance(file, Sequence):
validate_unstructured_version(min_unstructured_version="0.6.3")
if file:
validate_unstructured_version(min_unstructured_version="0.6.2")
validate_unstructured_version(min_unstructured_version="0.6.2")
self.file = file
self.url = url
self.api_key = api_key
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key
super().__init__(file=file, mode=mode, **unstructured_kwargs)
super().__init__(mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
return get_elements_from_api(
file=self.file,
api_key=self.api_key,
api_url=self.url,
**self.unstructured_kwargs,
)
def _get_elements(self) -> List[Element]:
if self.unstructured_kwargs.get("metadata_filename"):
return get_elements_from_api(
file=self.file,
file_path=self.unstructured_kwargs.pop("metadata_filename"),
api_key=self.api_key,
api_url=self.url,
**self.unstructured_kwargs,
)
else:
raise ValueError(
"If partitioning a file via api,"
" metadata_filename must be specified as well.",
)
def _get_metadata(self) -> dict[str, Any]:
return {}
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
"""Apply post processing functions to extracted unstructured elements.
Post processing functions are str -> str callables passed
in using the post_processors kwarg when the loader is instantiated.
"""
for element in elements:
for post_processor in self.post_processors:
element.apply(post_processor)
return elements

1
libs/partners/unstructured/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
__pycache__

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 LangChain, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,66 @@
.PHONY: all format lint test tests integration_tests docker_tests help extended_tests
# Default target executed when no arguments are given to make.
all: help
# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/
integration_test integration_tests: TEST_FILE = tests/integration_tests/
# unit tests are run with the --disable-socket flag to prevent network calls
test tests:
poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE)
# integration tests are run without the --disable-socket flag to allow network calls
integration_test:
poetry run pytest $(TEST_FILE)
# skip tests marked as local in CI
integration_tests:
poetry run pytest $(TEST_FILE) -m "not local"
######################
# LINTING AND FORMATTING
######################
# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/unstructured --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_unstructured
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test
lint lint_diff lint_package lint_tests:
poetry run ruff .
poetry run ruff format $(PYTHON_FILES) --diff
poetry run ruff --select I $(PYTHON_FILES)
mkdir -p $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)
format format_diff:
poetry run ruff format $(PYTHON_FILES)
poetry run ruff --select I --fix $(PYTHON_FILES)
spell_check:
poetry run codespell --toml pyproject.toml
spell_fix:
poetry run codespell --toml pyproject.toml -w
check_imports: $(shell find langchain_unstructured -name '*.py')
poetry run python ./scripts/check_imports.py $^
######################
# HELP
######################
help:
@echo '----'
@echo 'check_imports - check imports'
@echo 'format - run code formatters'
@echo 'lint - run linters'
@echo 'test - run unit tests'
@echo 'tests - run unit tests'
@echo 'test TEST_FILE=<test_file> - run all tests in file'

View File

@@ -0,0 +1,71 @@
# langchain-unstructured
This package contains the LangChain integration with Unstructured
## Installation
```bash
pip install -U langchain-unstructured
```
And you should configure credentials by setting the following environment variables:
```bash
export UNSTRUCTURED_API_KEY="your-api-key"
```
## Loaders
Partition and load files either via the Unstructured API (using the
`unstructured-client` SDK) or locally using the `unstructured` library.
API:
To partition via the Unstructured API, `pip install unstructured-client`, set
`partition_via_api=True`, and define `api_key`. If you are running the Unstructured API
locally, you can change the API URL by defining `server_url` when you initialize the
loader. The hosted Unstructured API requires an API key. See the links below to
learn more about our API offerings and get an API key.
Local:
By default the file loader uses the Unstructured `partition` function and will
automatically detect the file type.
In addition to document-specific partition parameters, Unstructured has a rich set
of "chunking" parameters for post-processing elements into more useful text segments
for use cases such as Retrieval Augmented Generation (RAG). You can pass additional
Unstructured kwargs to the loader to configure different unstructured settings.
Setup:
```bash
pip install -U langchain-unstructured
pip install -U unstructured-client
export UNSTRUCTURED_API_KEY="your-api-key"
```
Instantiate:
```python
from langchain_unstructured import UnstructuredLoader
loader = UnstructuredLoader(
file_path = ["example.pdf", "fake.pdf"],
api_key=UNSTRUCTURED_API_KEY,
partition_via_api=True,
chunking_strategy="by_title",
strategy="fast",
)
```
Load:
```python
docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)
```
References
----------
https://docs.unstructured.io/api-reference/api-services/sdk
https://docs.unstructured.io/api-reference/api-services/overview
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking

View File

@@ -0,0 +1,15 @@
from importlib import metadata
from langchain_unstructured.document_loaders import UnstructuredLoader
try:
__version__ = metadata.version(__package__)
except metadata.PackageNotFoundError:
# Case where package metadata is not available.
__version__ = ""
del metadata # optional, avoids polluting the results of dir(__package__)
__all__ = [
"UnstructuredLoader",
"__version__",
]

View File

@@ -0,0 +1,280 @@
"""Unstructured document loader."""
from __future__ import annotations
import json
import logging
import os
from pathlib import Path
from typing import IO, Any, Callable, Iterator, Optional, cast
from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from typing_extensions import TypeAlias
from unstructured_client import UnstructuredClient # type: ignore
from unstructured_client.models import operations, shared # type: ignore
Element: TypeAlias = Any
logger = logging.getLogger(__file__)
_DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general"
class UnstructuredLoader(BaseLoader):
    """Load one or many files as documents using Unstructured.

    Files are partitioned either through the Unstructured API, using the
    `unstructured-client` sdk, or locally using the `unstructured` library.

    API:
    To partition through the Unstructured API, set `partition_via_api=True` and
    define `api_key` (or export `UNSTRUCTURED_API_KEY`). If you are running the
    unstructured API locally, point the loader at it by defining `server_url`
    (or exporting `UNSTRUCTURED_URL`) when you initialize the loader. The hosted
    Unstructured API requires an API key. See the links below to learn more about
    the API offerings and get an API key.

    Local:
    With `partition_via_api=False` (the default) files are partitioned locally,
    which requires the `unstructured` package. You can install it with
    `pip install unstructured`. The loader uses the Unstructured `partition`
    function, which automatically detects the file type.

    In addition to document specific partition parameters, Unstructured has a rich
    set of "chunking" parameters for post-processing elements into more useful text
    segments for use cases such as Retrieval Augmented Generation (RAG). Any
    additional keyword arguments given to the loader are forwarded as Unstructured
    settings.

    Setup:
        .. code-block:: bash

            pip install -U langchain-unstructured
            export UNSTRUCTURED_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_unstructured import UnstructuredLoader

            loader = UnstructuredLoader(
                file_path = ["example.pdf", "fake.pdf"],
                api_key=UNSTRUCTURED_API_KEY,
                partition_via_api=True,
                chunking_strategy="by_title",
                strategy="fast",
            )

    Load:
        .. code-block:: python

            docs = loader.load()

            print(docs[0].page_content[:100])
            print(docs[0].metadata)

    References
    ----------
    https://docs.unstructured.io/api-reference/api-services/sdk
    https://docs.unstructured.io/api-reference/api-services/overview
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
    """

    def __init__(
        self,
        file_path: Optional[str | Path | list[str] | list[Path]] = None,
        *,
        file: Optional[IO[bytes] | list[IO[bytes]]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        # SDK parameters
        api_key: Optional[str] = None,
        client: Optional[UnstructuredClient] = None,
        server_url: Optional[str] = None,
        **kwargs: Any,
    ):
        """Store the inputs and resolve the SDK client.

        Args:
            file_path: A single path or a list of paths to load.
            file: A single binary file object or a list of them; mutually
                exclusive with `file_path`.
            partition_via_api: Partition through the API when True, locally
                otherwise.
            post_processors: str -> str callables applied to element text.
            api_key: API key; falls back to `UNSTRUCTURED_API_KEY`.
            client: Pre-built SDK client; cannot be combined with `api_key` or
                `server_url`.
            server_url: API endpoint; falls back to `UNSTRUCTURED_URL`, then to
                the module default.
            **kwargs: Additional Unstructured settings forwarded per file.

        Raises:
            ValueError: If both `file_path` and `file` are given, or if a custom
                `client` is combined with `api_key`/`server_url`.
        """
        if file is not None and file_path is not None:
            raise ValueError("file_path and file cannot be defined simultaneously.")

        if client is not None:
            # A caller-supplied client already carries its own credentials/URL.
            conflicting = [
                name
                for name, given in (("api_key", api_key), ("server_url", server_url))
                if given is not None
            ]
            if conflicting:
                raise ValueError(
                    "if you are passing a custom `client`, you cannot also pass these "
                    f"params: {', '.join(conflicting)}."
                )

        self.client = (
            client
            if client
            else UnstructuredClient(
                api_key_auth=api_key or os.getenv("UNSTRUCTURED_API_KEY"),
                server_url=(
                    server_url or os.getenv("UNSTRUCTURED_URL") or _DEFAULT_URL
                ),
            )
        )

        self.file_path = file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        self.unstructured_kwargs = kwargs

    def lazy_load(self) -> Iterator[Document]:
        """Yield documents for every configured file, one file at a time."""
        # Normalise the three input shapes into (file object, file path) pairs.
        if isinstance(self.file, list):
            targets = ((file_obj, None) for file_obj in self.file)
        elif isinstance(self.file_path, list):
            targets = ((None, path) for path in self.file_path)
        else:
            # Neither attribute is a list: a single file or a single path.
            targets = iter([(self.file, self.file_path)])

        for file_obj, path in targets:
            single_loader = _SingleDocumentLoader(
                file=file_obj,
                file_path=path,
                partition_via_api=self.partition_via_api,
                post_processors=self.post_processors,
                # SDK parameters
                client=self.client,
                **self.unstructured_kwargs,
            )
            yield from single_loader.lazy_load()
class _SingleDocumentLoader(BaseLoader):
    """Provides loader functionality for individual document/file objects.

    Encapsulates partitioning individual file objects (file or file_path) either
    locally or via the Unstructured API.
    """

    def __init__(
        self,
        file_path: Optional[str | Path] = None,
        *,
        client: UnstructuredClient,
        file: Optional[IO[bytes]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        # SDK parameters
        **kwargs: Any,
    ):
        """Initialize loader.

        Args:
            file_path: Path of the file to partition; a `Path` is stored as `str`.
            client: SDK client used when partitioning via the API.
            file: Binary file object to partition instead of `file_path`.
            partition_via_api: Partition through the API when True, locally
                otherwise.
            post_processors: str -> str callables applied to each element's text.
            **kwargs: Extra Unstructured settings, forwarded to the local
                `partition` call or to the SDK `PartitionParameters`.
        """
        self.file_path = str(file_path) if isinstance(file_path, Path) else file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        # SDK parameters
        self.client = client
        self.unstructured_kwargs = kwargs

    def lazy_load(self) -> Iterator[Document]:
        """Partition the file and yield one `Document` per element.

        Each document's metadata combines the source path (when known), the
        element's own metadata, its category (or type) and its element id.
        """
        elements_json = (
            self._post_process_elements_json(self._elements_json)
            if self.post_processors
            else self._elements_json
        )
        for element in elements_json:
            metadata = self._get_metadata()
            # An element without a "metadata" entry contributes nothing here
            # rather than raising TypeError from dict.update(None).
            metadata.update(element.get("metadata") or {})
            metadata["category"] = element.get("category") or element.get("type")
            metadata["element_id"] = element.get("element_id")
            yield Document(
                page_content=cast(str, element.get("text")), metadata=metadata
            )

    @property
    def _elements_json(self) -> list[dict[str, Any]]:
        """Get elements as a list of dictionaries from local partition or via API."""
        if self.partition_via_api:
            return self._elements_via_api

        return self._convert_elements_to_dicts(self._elements_via_local)

    @property
    def _elements_via_local(self) -> list[Element]:
        """Partition the file locally with the `unstructured` library.

        Raises:
            ImportError: If the `unstructured` package is not installed.
            ValueError: If a file object is given without `metadata_filename`.
        """
        try:
            from unstructured.partition.auto import partition  # type: ignore
        except ImportError as e:
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from e

        # Same "file was provided" test as `_file_content` uses.
        if (
            self.file is not None
            and self.unstructured_kwargs.get("metadata_filename") is None
        ):
            raise ValueError(
                "If partitioning a fileIO object, metadata_filename must be specified"
                " as well.",
            )

        return partition(
            file=self.file, filename=self.file_path, **self.unstructured_kwargs
        )  # type: ignore

    @property
    def _elements_via_api(self) -> list[dict[str, Any]]:
        """Retrieve a list of element dicts from the API using the SDK client.

        Raises:
            ValueError: If the API responds with a status code other than 200.
        """
        client = self.client
        req = self._sdk_partition_request
        response = client.general.partition(req)  # type: ignore
        if response.status_code == 200:
            return json.loads(response.raw_response.text)
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    @property
    def _file_content(self) -> bytes:
        """Get content from either file or file_path.

        Raises:
            ValueError: If neither `file` nor `file_path` is set.
        """
        if self.file is not None:
            return self.file.read()
        elif self.file_path:
            with open(self.file_path, "rb") as f:
                return f.read()
        raise ValueError("file or file_path must be defined.")

    @property
    def _sdk_partition_request(self) -> operations.PartitionRequest:
        """Build the SDK partition request from the file content and kwargs."""
        # NOTE(review): when only `file` is given, `file_path` is None and
        # `file_name` becomes the string "None" -- confirm the API copes with it.
        return operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=self._file_content, file_name=str(self.file_path)
                ),
                **self.unstructured_kwargs,
            ),
        )

    def _convert_elements_to_dicts(
        self, elements: list[Element]
    ) -> list[dict[str, Any]]:
        """Serialise each element through its `to_dict()` method."""
        return [element.to_dict() for element in elements]

    def _get_metadata(self) -> dict[str, Any]:
        """Get file_path metadata if available."""
        return {"source": self.file_path} if self.file_path else {}

    def _post_process_elements_json(
        self, elements_json: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Apply post processing functions to extracted unstructured elements.

        Post processing functions are str -> str callables passed
        in using the post_processors kwarg when the loader is instantiated.
        The element dicts are modified in place and the same list is returned.
        """
        if self.post_processors:
            for element in elements_json:
                for post_processor in self.post_processors:
                    element["text"] = post_processor(str(element.get("text")))
        return elements_json

4419
libs/partners/unstructured/poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,97 @@
[tool.poetry]
name = "langchain-unstructured"
version = "0.1.0"
description = "An integration package connecting Unstructured and LangChain"
authors = []
readme = "README.md"
repository = "https://github.com/langchain-ai/langchain"
license = "MIT"
[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/unstructured"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-unstructured%3D%3D0%22&expanded=true"
[tool.poetry.dependencies]
python = ">=3.9,<4.0"
langchain-core = "^0.2.23"
unstructured-client = { version = "^0.24.1" }
unstructured = { version = "^0.15.0", optional = true, python = "<3.13", extras = [
"all-docs",
] }
[tool.poetry.extras]
local = ["unstructured"]
[tool.poetry.group.test]
optional = true
[tool.poetry.group.test.dependencies]
pytest = "^7.4.3"
pytest-asyncio = "^0.23.2"
pytest-socket = "^0.7.0"
langchain-core = { path = "../../core", develop = true }
[tool.poetry.group.codespell]
optional = true
[tool.poetry.group.codespell.dependencies]
codespell = "^2.2.6"
[tool.poetry.group.test_integration]
optional = true
[tool.poetry.group.test_integration.dependencies]
[tool.poetry.group.lint]
optional = true
[tool.poetry.group.lint.dependencies]
ruff = "^0.1.8"
[tool.poetry.group.typing.dependencies]
mypy = "^1.7.1"
unstructured = { version = "^0.15.0", python = "<3.13", extras = ["all-docs"] }
langchain-core = { path = "../../core", develop = true }
[tool.poetry.group.dev]
optional = true
[tool.poetry.group.dev.dependencies]
langchain-core = { path = "../../core", develop = true }
[tool.ruff.lint]
select = [
"E", # pycodestyle
"F", # pyflakes
"I", # isort
"T201", # print
]
[tool.mypy]
disallow_untyped_defs = "True"
[tool.coverage.run]
omit = ["tests/*"]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.pytest.ini_options]
# --strict-markers will raise errors on unknown marks.
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
#
# https://docs.pytest.org/en/7.1.x/reference/reference.html
# --strict-config any warnings encountered while parsing the `pytest`
# section of the configuration file raise errors.
#
# https://github.com/tophat/syrupy
# --snapshot-warn-unused Prints a warning on unused snapshots rather than fail the test suite.
addopts = "--strict-markers --strict-config --durations=5"
# Registering custom markers.
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
markers = [
"compile: mark placeholder test used to compile integration tests without running them",
"local: mark tests as requiring a local install, which isn't compatible with CI currently",
]
asyncio_mode = "auto"

View File

@@ -0,0 +1,17 @@
import sys
import traceback
from importlib.machinery import SourceFileLoader
def check_imports(files: list[str]) -> bool:
    """Try to load every given Python file and report whether any failed.

    For each file that raises while being loaded, the file name and the
    traceback are printed. Loading continues with the remaining files.

    Args:
        files: Paths of the Python source files to load.

    Returns:
        True if at least one file failed to load, False otherwise.
    """
    has_failure = False
    for file in files:
        try:
            SourceFileLoader("x", file).load_module()
        except Exception:
            # Any error raised while importing counts as a failure; keep going
            # so every broken file is reported in a single run.
            has_failure = True
            print(file)  # noqa: T201
            traceback.print_exc()
            print()  # noqa: T201
    return has_failure


if __name__ == "__main__":
    # Exit status 1 signals that at least one file could not be imported.
    sys.exit(1 if check_imports(sys.argv[1:]) else 0)

View File

@@ -0,0 +1,27 @@
#!/bin/bash
#
# Report tracked files in a Git repository that import pydantic directly,
# i.e. contain a line starting with "import pydantic" or "from pydantic".
#
# Usage: ./scripts/check_pydantic.sh /path/to/repository

# Exactly one argument (the repository path) is required.
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 /path/to/repository"
  exit 1
fi

repo_dir="$1"

# Collect every tracked line that matches either import form.
matches=$(git -C "$repo_dir" grep -E '^import pydantic|^from pydantic')

# Nothing matched: there is nothing to report.
if [ -z "$matches" ]; then
  exit 0
fi

echo "ERROR: The following lines need to be updated:"
echo "$matches"
echo "Please replace the code with an import from langchain_core.pydantic_v1."
echo "For example, replace 'from pydantic import BaseModel'"
echo "with 'from langchain_core.pydantic_v1 import BaseModel'"
exit 1

View File

@@ -0,0 +1,18 @@
#!/bin/bash
set -eu

# Number of forbidden import families found so far.
errors=0

# This package must not import from langchain, langchain_experimental, or
# langchain_community. git grep prints the offending lines and succeeds only
# when it found at least one match.
for package in langchain langchain_experimental langchain_community; do
  if git --no-pager grep "^from ${package}\." .; then
    errors=$((errors+1))
  fi
done

# Exit non-zero when at least one forbidden import was found.
if [ "$errors" -eq 0 ]; then
  exit 0
fi
exit 1

View File

@@ -0,0 +1,7 @@
import pytest
@pytest.mark.compile
def test_placeholder() -> None:
    """Placeholder so integration tests can be compiled without real tests."""
    return None

View File

@@ -0,0 +1,135 @@
import os
from pathlib import Path
from typing import Callable
import pytest
from langchain_unstructured import UnstructuredLoader
# Directory holding the example documents, located under the community
# package's integration tests and resolved relative to this file.
EXAMPLE_DOCS_DIRECTORY = str(
Path(__file__).parent.parent.parent.parent.parent
/ "community/tests/integration_tests/examples/"
)
# API key taken from the environment; None when UNSTRUCTURED_API_KEY is unset.
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
# -- Local partition --
@pytest.mark.local
def test_loader_partitions_locally() -> None:
    """Local partitioning keeps the filename and emits page-break elements."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=pdf_path,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    )

    docs = loader.load()

    for doc in docs:
        assert doc.metadata.get("filename") == "layout-parser-paper.pdf"
    categories = [doc.metadata.get("category") for doc in docs]
    assert "PageBreak" in categories
@pytest.mark.local
def test_loader_partition_ignores_invalid_arg() -> None:
    """An unsupported `mode` kwarg does not break local partitioning."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=pdf_path,
        # Unstructured kwargs
        strategy="fast",
        # mode is no longer a valid argument and is ignored when partitioning locally
        mode="single",
    )

    docs = loader.load()

    assert len(docs) > 1
    for doc in docs:
        assert doc.metadata.get("filename") == "layout-parser-paper.pdf"
@pytest.mark.local
def test_loader_partitions_locally_and_applies_post_processors(
    get_post_processor: Callable[[str], str],
) -> None:
    """Post-processors are applied to the text of locally partitioned elements."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    docs = UnstructuredLoader(
        file_path=pdf_path,
        post_processors=[get_post_processor],
        strategy="fast",
    ).load()

    assert len(docs) > 1
    first_doc = docs[0]
    assert first_doc.page_content.endswith("THE END!")
# -- API partition --
def test_loader_partitions_via_api() -> None:
    """API partitioning returns elements carrying filename, category and id."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    docs = UnstructuredLoader(
        file_path=pdf_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    ).load()

    assert len(docs) > 1
    categories = [doc.metadata.get("category") for doc in docs]
    assert "PageBreak" in categories
    for doc in docs:
        assert doc.metadata.get("filename") == "layout-parser-paper.pdf"
    first_metadata = docs[0].metadata
    assert first_metadata.get("element_id") is not None
def test_loader_partitions_multiple_via_api() -> None:
    """Several paths are partitioned via the API in the order they were given."""
    file_names = ("layout-parser-paper.pdf", "fake-email-attachment.eml")
    file_paths = [os.path.join(EXAMPLE_DOCS_DIRECTORY, name) for name in file_names]

    docs = UnstructuredLoader(
        file_path=file_paths,
        api_key=UNSTRUCTURED_API_KEY,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
    ).load()

    assert len(docs) > 1
    first_doc, last_doc = docs[0], docs[-1]
    assert first_doc.metadata.get("filename") == "layout-parser-paper.pdf"
    assert last_doc.metadata.get("filename") == "fake-email-attachment.eml"
def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
    """An unknown kwarg such as `mode` makes API partitioning raise TypeError."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    invalid_kwargs = {"mode": "elements"}
    loader = UnstructuredLoader(
        file_path=pdf_path,
        api_key=UNSTRUCTURED_API_KEY,
        partition_via_api=True,
        **invalid_kwargs,
    )

    # The error only surfaces once loading builds the request.
    with pytest.raises(TypeError, match="unexpected keyword argument 'mode'"):
        loader.load()
# -- fixtures ---
@pytest.fixture()
def get_post_processor() -> Callable[[str], str]:
    """Provide a post-processor that appends "THE END!" to the given text."""

    def append_the_end(text: str) -> str:
        suffix = "THE END!"
        return text + suffix

    return append_the_end

View File

@@ -0,0 +1,178 @@
from pathlib import Path
from typing import Any, Callable
from unittest import mock
from unittest.mock import Mock, mock_open, patch
import pytest
from unstructured.documents.elements import Text # type: ignore
from langchain_unstructured.document_loaders import (
_SingleDocumentLoader, # type: ignore
)
# Directory holding the example documents, located under the community
# package's integration tests and resolved relative to this file.
EXAMPLE_DOCS_DIRECTORY = str(
Path(__file__).parent.parent.parent.parent.parent
/ "community/tests/integration_tests/examples/"
)
# --- _SingleDocumentLoader._file_content ---
def test_it_gets_content_from_file() -> None:
    """`_file_content` reads the bytes from the supplied file object."""
    file_obj = Mock()
    file_obj.read.return_value = b"content from file"
    loader = _SingleDocumentLoader(
        client=Mock(), file=file_obj, metadata_filename="fake.txt"
    )

    assert loader._file_content == b"content from file"  # type: ignore

    file_obj.read.assert_called_once()
@patch("builtins.open", new_callable=mock_open, read_data=b"content from file_path")
def test_it_gets_content_from_file_path(mock_file: Mock) -> None:
    """`_file_content` opens `file_path` in binary mode and reads it once."""
    loader = _SingleDocumentLoader(client=Mock(), file_path="dummy_path")

    assert loader._file_content == b"content from file_path"  # type: ignore

    mock_file.assert_called_once_with("dummy_path", "rb")
    # Calling the patched `open` again hands back the shared file handle.
    opened_handle = mock_file()
    opened_handle.read.assert_called_once()
def test_it_raises_value_error_without_file_or_file_path() -> None:
    """`_file_content` raises ValueError when no input was configured."""
    loader = _SingleDocumentLoader(client=Mock())

    with pytest.raises(ValueError) as excinfo:
        loader._file_content  # type: ignore

    message = str(excinfo.value)
    assert message == "file or file_path must be defined."
# --- _SingleDocumentLoader._elements_json ---
def test_it_calls_elements_via_api_with_valid_args() -> None:
    """`_elements_json` returns the API elements when `partition_via_api=True`."""
    api_elements = [{"element": "data"}]
    patcher = patch.object(
        _SingleDocumentLoader, "_elements_via_api", new_callable=mock.PropertyMock
    )

    with patcher as mock_elements_via_api:
        mock_elements_via_api.return_value = api_elements
        # Minimum required args for self._elements_via_api to be called:
        loader = _SingleDocumentLoader(
            client=Mock(),
            partition_via_api=True,
            api_key="some_key",
        )

        result = loader._elements_json  # type: ignore

    mock_elements_via_api.assert_called_once()
    assert result == [{"element": "data"}]
@patch.object(_SingleDocumentLoader, "_convert_elements_to_dicts")
def test_it_partitions_locally_by_default(mock_convert_elements_to_dicts: Mock) -> None:
    """Without `partition_via_api`, `_elements_json` uses the local partition."""
    mock_convert_elements_to_dicts.return_value = [{}]
    patcher = patch.object(
        _SingleDocumentLoader, "_elements_via_local", new_callable=mock.PropertyMock
    )

    with patcher as mock_elements_via_local:
        mock_elements_via_local.return_value = [{}]
        # Only the client is passed, so `partition_via_api` keeps its default.
        loader = _SingleDocumentLoader(client=Mock())

        result = loader._elements_json  # type: ignore

    mock_elements_via_local.assert_called_once_with()
    mock_convert_elements_to_dicts.assert_called_once_with([{}])
    assert result == [{}]
def test_it_partitions_locally_and_logs_warning_with_partition_via_api_False(
    caplog: pytest.LogCaptureFixture,
) -> None:
    """With `partition_via_api=False` the locally partitioned elements are used.

    `_elements_via_local` is a property, so it is patched with a `PropertyMock`;
    a plain mock would be returned as-is and its `return_value` never consulted.
    No assertion is made on `caplog`: the loader code under test shows no
    logging call to verify.
    """
    with patch.object(
        _SingleDocumentLoader, "_elements_via_local", new_callable=mock.PropertyMock
    ) as mock_get_elements_locally:
        mock_get_elements_locally.return_value = [Text("Mock text element.")]
        loader = _SingleDocumentLoader(
            client=Mock(), partition_via_api=False, api_key="some_key"
        )

        result = loader._elements_json  # type: ignore

    mock_get_elements_locally.assert_called_once_with()
    # The real `_convert_elements_to_dicts` runs, so the text must survive it.
    assert [element["text"] for element in result] == ["Mock text element."]
# -- fixtures -------------------------------
@pytest.fixture()
def get_post_processor() -> Callable[[str], str]:
    """Provide a post-processor that appends "THE END!" to the given text."""

    def append_the_end(text: str) -> str:
        suffix = "THE END!"
        return text + suffix

    return append_the_end
@pytest.fixture()
def fake_json_response() -> list[dict[str, Any]]:
    """Provide two element dicts shaped like an API response for a single PDF."""
    return [
        {
            "type": "Title",
            "element_id": "b7f58c2fd9c15949a55a62eb84e39575",
            # The second literal starts with a space so the implicit string
            # concatenation does not fuse "Document" and "Image".
            "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document"
            " Image Analysis",
            "metadata": {
                "languages": ["eng"],
                "page_number": 1,
                "filename": "layout-parser-paper.pdf",
                "filetype": "application/pdf",
            },
        },
        {
            "type": "UncategorizedText",
            "element_id": "e1c4facddf1f2eb1d0db5be34ad0de18",
            "text": "1 2 0 2",
            "metadata": {
                "languages": ["eng"],
                "page_number": 1,
                "parent_id": "b7f58c2fd9c15949a55a62eb84e39575",
                "filename": "layout-parser-paper.pdf",
                "filetype": "application/pdf",
            },
        },
    ]
@pytest.fixture()
def fake_multiple_docs_json_response() -> list[dict[str, Any]]:
    """Provide element dicts that originate from two different source files."""
    pdf_title: dict[str, Any] = {
        "type": "Title",
        "element_id": "b7f58c2fd9c15949a55a62eb84e39575",
        "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document"
        " Image Analysis",
        "metadata": {
            "languages": ["eng"],
            "page_number": 1,
            "filename": "layout-parser-paper.pdf",
            "filetype": "application/pdf",
        },
    }
    chat_message: dict[str, Any] = {
        "type": "NarrativeText",
        "element_id": "3c4ac9e7f55f1e3dbd87d3a9364642fe",
        "text": "6/29/23, 12:16\u202fam - User 4: This message was deleted",
        "metadata": {
            "filename": "whatsapp_chat.txt",
            "languages": ["eng"],
            "filetype": "text/plain",
        },
    }
    return [pdf_title, chat_message]

View File

@@ -0,0 +1,10 @@
from langchain_unstructured import __all__
# Names the package's `__all__` is expected to contain; compared
# order-insensitively by the test below.
EXPECTED_ALL = [
"UnstructuredLoader",
"__version__",
]
def test_all_imports() -> None:
    """The package's `__all__` holds exactly the expected names, in any order."""
    exported = sorted(__all__)
    expected = sorted(EXPECTED_ALL)
    assert expected == exported