unstructured: mv to external repo (#26923)

This commit is contained in:
Erick Friis 2024-09-30 17:38:21 -07:00 committed by GitHub
parent 7ecd720120
commit 35f6393144
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 2 additions and 5717 deletions

View File

@ -294,7 +294,6 @@ jobs:
VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
UPSTAGE_API_KEY: ${{ secrets.UPSTAGE_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
UNSTRUCTURED_API_KEY: ${{ secrets.UNSTRUCTURED_API_KEY }}
run: make integration_tests
working-directory: ${{ inputs.working-directory }}

View File

@ -1 +0,0 @@
__pycache__

View File

@ -1,21 +0,0 @@
MIT License
Copyright (c) 2024 LangChain, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,69 +0,0 @@
# Every target in this file is a command rather than a file it produces, so all
# of them are declared phony; otherwise a stray file or directory with the same
# name (e.g. `tests`) would make the target look up to date.
# `docker_tests` and `extended_tests` have no rule here; they are kept from the
# original declaration.
.PHONY: all format format_diff lint lint_diff lint_package lint_tests \
        test tests test_watch integration_test integration_tests \
        spell_check spell_fix check_imports help docker_tests extended_tests

# Default target executed when no arguments are given to make.
all: help

# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/

integration_test integration_tests: TEST_FILE = tests/integration_tests/

# unit tests are run with the --disable-socket flag to prevent network calls
test tests:
	poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE)

test_watch:
	poetry run ptw --snapshot-update --now . -- -vv $(TEST_FILE)

# integration tests are run without the --disable-socket flag to allow network calls
integration_test:
	poetry run pytest $(TEST_FILE)

# skip tests marked as local in CI
integration_tests:
	poetry run pytest $(TEST_FILE) -m "not local"

######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/unstructured --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_unstructured
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test

# PYTHON_FILES is empty when lint_diff/format_diff find no changed files. The
# guards skip the per-file tools in that case so they are never invoked without
# any path arguments.
lint lint_diff lint_package lint_tests:
	poetry run ruff check .
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff check --select I $(PYTHON_FILES)
	[ "$(PYTHON_FILES)" = "" ] || { mkdir -p $(MYPY_CACHE) && poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE); }

format format_diff:
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES)
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff check --select I --fix $(PYTHON_FILES)

spell_check:
	poetry run codespell --toml pyproject.toml

spell_fix:
	poetry run codespell --toml pyproject.toml -w

# The package sources are listed as prerequisites so that $^ expands to every
# module path handed to the import checker.
check_imports: $(shell find langchain_unstructured -name '*.py')
	poetry run python ./scripts/check_imports.py $^

######################
# HELP
######################

help:
	@echo '----'
	@echo 'check_imports                - check imports'
	@echo 'format                       - run code formatters'
	@echo 'format_diff                  - run code formatters on files changed vs master'
	@echo 'lint                         - run linters'
	@echo 'lint_diff                    - run linters on files changed vs master'
	@echo 'spell_check                  - run codespell'
	@echo 'spell_fix                    - run codespell and write fixes'
	@echo 'test                         - run unit tests'
	@echo 'tests                        - run unit tests'
	@echo 'test TEST_FILE=<test_file>   - run all tests in file'
	@echo 'test_watch                   - re-run unit tests on file changes'
	@echo 'integration_tests            - run integration tests not marked local'

View File

@ -1,71 +1,3 @@
# langchain-unstructured
This package has moved!
This package contains the LangChain integration with Unstructured
## Installation
```bash
pip install -U langchain-unstructured
```
And you should configure credentials by setting the following environment variables:
```bash
export UNSTRUCTURED_API_KEY="your-api-key"
```
## Loaders
Partition and load files using either the `unstructured-client` sdk and the
Unstructured API or locally using the `unstructured` library.
API:
To partition via the Unstructured API `pip install unstructured-client` and set
`partition_via_api=True` and define `api_key`. If you are running the unstructured API
locally, you can change the API URL by defining `url` when you initialize the
loader. The hosted Unstructured API requires an API key. See the links below to
learn more about our API offerings and get an API key.
Local:
By default the file loader uses the Unstructured `partition` function and will
automatically detect the file type.
In addition to document specific partition parameters, Unstructured has a rich set
of "chunking" parameters for post-processing elements into more useful text segments
for use cases such as Retrieval Augmented Generation (RAG). You can pass additional
Unstructured kwargs to the loader to configure different unstructured settings.
Setup:
```bash
pip install -U langchain-unstructured
pip install -U unstructured-client
export UNSTRUCTURED_API_KEY="your-api-key"
```
Instantiate:
```python
from langchain_unstructured import UnstructuredLoader
loader = UnstructuredLoader(
file_path = ["example.pdf", "fake.pdf"],
api_key=UNSTRUCTURED_API_KEY,
partition_via_api=True,
chunking_strategy="by_title",
strategy="fast",
)
```
Load:
```python
docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)
```
References
----------
https://docs.unstructured.io/api-reference/api-services/sdk
https://docs.unstructured.io/api-reference/api-services/overview
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
https://github.com/langchain-ai/langchain-unstructured/tree/main/libs/unstructured

View File

@ -1,15 +0,0 @@
from importlib import metadata
from langchain_unstructured.document_loaders import UnstructuredLoader
try:
    # Ask the installed distribution metadata for this package's version.
    __version__ = metadata.version(__package__)
except metadata.PackageNotFoundError:
    # No distribution metadata could be found; expose an empty version string.
    __version__ = ""
# Optional cleanup: keeps `metadata` out of the results of dir(__package__).
del metadata

# Public names exported by the package.
__all__ = ["UnstructuredLoader", "__version__"]

View File

@ -1,307 +0,0 @@
"""Unstructured document loader."""
from __future__ import annotations
import json
import logging
import os
from pathlib import Path
from typing import IO, Any, Callable, Iterator, Optional, cast
from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from typing_extensions import TypeAlias
from unstructured_client import UnstructuredClient # type: ignore
from unstructured_client.models import operations, shared # type: ignore
Element: TypeAlias = Any
logger = logging.getLogger(__file__)
_DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general"
class UnstructuredLoader(BaseLoader):
    """Unstructured document loader interface.

    Setup:
        Install ``langchain-unstructured`` and set environment variable ``UNSTRUCTURED_API_KEY``.

        .. code-block:: bash

            pip install -U langchain-unstructured
            export UNSTRUCTURED_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_unstructured import UnstructuredLoader

            loader = UnstructuredLoader(
                file_path = ["example.pdf", "fake.pdf"],
                api_key=UNSTRUCTURED_API_KEY,
                partition_via_api=True,
                chunking_strategy="by_title",
                strategy="fast",
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            1 2 0 2
            {'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            1 2 0 2
            {'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}

    Load URL:
        .. code-block:: python

            loader = UnstructuredLoader(web_url="https://www.example.com/")
            docs = loader.load()
            print(docs[0])

        .. code-block:: none

            page_content='Example Domain' metadata={'category_depth': 0, 'languages': ['eng'], 'filetype': 'text/html', 'url': 'https://www.example.com/', 'category': 'Title', 'element_id': 'fdaa78d856f9d143aeeed85bf23f58f8'}

        .. code-block:: python

            print(docs[1])

        .. code-block:: none

            page_content='This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.' metadata={'languages': ['eng'], 'parent_id': 'fdaa78d856f9d143aeeed85bf23f58f8', 'filetype': 'text/html', 'url': 'https://www.example.com/', 'category': 'NarrativeText', 'element_id': '3652b8458b0688639f973fe36253c992'}

    References
    ----------
    https://docs.unstructured.io/api-reference/api-services/sdk
    https://docs.unstructured.io/api-reference/api-services/overview
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
    """  # noqa: E501

    def __init__(
        self,
        file_path: Optional[str | Path | list[str] | list[Path]] = None,
        *,
        file: Optional[IO[bytes] | list[IO[bytes]]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        # SDK parameters
        api_key: Optional[str] = None,
        client: Optional[UnstructuredClient] = None,
        url: Optional[str] = None,
        web_url: Optional[str] = None,
        **kwargs: Any,
    ):
        """Store the loader configuration and resolve the SDK client.

        Args:
            file_path: A single path or a list of paths to load.
            file: A single binary file object or a list of them. Mutually
                exclusive with ``file_path``.
            partition_via_api: Passed through to each per-document loader,
                which uses the SDK client when true and local partitioning
                otherwise.
            post_processors: ``str -> str`` callables passed through to each
                per-document loader.
            api_key: API key for the SDK client. Falls back to the
                ``UNSTRUCTURED_API_KEY`` environment variable, then to ``""``.
            client: A pre-built SDK client. Cannot be combined with
                ``api_key`` or ``url``.
            url: Server URL for the SDK client. Falls back to the
                ``UNSTRUCTURED_URL`` environment variable, then to the
                module default URL.
            web_url: When set, stored under the ``"url"`` key of the kwargs
                forwarded to partitioning.
            **kwargs: Extra keyword arguments forwarded to partitioning.

        Raises:
            ValueError: If both ``file_path`` and ``file`` are given, or if
                ``client`` is combined with ``api_key`` and/or ``url``.
        """
        if file_path is not None and file is not None:
            raise ValueError("file_path and file cannot be defined simultaneously.")

        if client is not None:
            # A caller-supplied client already carries its own credentials and
            # server URL, so passing those separately would be ambiguous.
            conflicting = [
                name
                for name, given in (("api_key", api_key), ("url", url))
                if given is not None
            ]
            if conflicting:
                raise ValueError(
                    "if you are passing a custom `client`, you cannot also pass these "
                    f"params: {', '.join(conflicting)}."
                )

        # Explicit argument first, then environment, then the built-in default.
        resolved_key = api_key or os.getenv("UNSTRUCTURED_API_KEY") or ""
        resolved_url = url or os.getenv("UNSTRUCTURED_URL") or _DEFAULT_URL

        if client:
            self.client = client
        else:
            self.client = UnstructuredClient(
                api_key_auth=resolved_key, server_url=resolved_url
            )

        self.file_path = file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors

        self.unstructured_kwargs = kwargs
        if web_url:
            self.unstructured_kwargs["url"] = web_url

    def lazy_load(self) -> Iterator[Document]:
        """Yield documents for every configured file, file path or URL.

        Each source is handed to its own ``_SingleDocumentLoader``. A list
        passed as ``file`` takes precedence over a list passed as
        ``file_path``. When neither is a list, one loader is created from
        ``file`` and ``file_path`` as they are.
        """
        if isinstance(self.file, list):
            sources = ((handle, None) for handle in self.file)
        elif isinstance(self.file_path, list):
            sources = ((None, path) for path in self.file_path)
        else:
            # Neither attribute is a list: delegate once with both values.
            sources = iter([(self.file, self.file_path)])

        for handle, path in sources:
            single_loader = _SingleDocumentLoader(
                file=handle,
                file_path=path,
                partition_via_api=self.partition_via_api,
                post_processors=self.post_processors,
                # SDK parameters
                client=self.client,
                **self.unstructured_kwargs,
            )
            yield from single_loader.lazy_load()
class _SingleDocumentLoader(BaseLoader):
    """Provides loader functionality for individual document/file objects.

    Encapsulates partitioning individual file objects (file or file_path) either
    locally or via the Unstructured API.
    """

    def __init__(
        self,
        file_path: Optional[str | Path] = None,
        *,
        client: UnstructuredClient,
        file: Optional[IO[bytes]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        **kwargs: Any,
    ):
        """Initialize loader.

        Args:
            file_path: Path of the file to partition. ``Path`` objects are
                converted to ``str``.
            client: SDK client used when ``partition_via_api`` is true.
            file: Binary file object to partition.
            partition_via_api: Partition through the SDK client when true,
                locally otherwise.
            post_processors: ``str -> str`` callables applied in order to the
                text of every element.
            **kwargs: Extra keyword arguments forwarded to the local
                ``partition`` call or to the SDK partition parameters.
        """
        self.file_path = str(file_path) if isinstance(file_path, Path) else file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        # SDK parameters
        self.client = client
        self.unstructured_kwargs = kwargs

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per partitioned element.

        The metadata of each document starts from the ``source`` entry (when a
        file path is known), is extended with the element's own metadata, and
        finally gets ``category`` and ``element_id`` entries.
        """
        elements_json = (
            self._post_process_elements_json(self._elements_json)
            if self.post_processors
            else self._elements_json
        )
        for element in elements_json:
            metadata = self._get_metadata()
            # An element without a "metadata" entry contributes nothing here
            # instead of raising a TypeError from dict.update(None).
            metadata.update(element.get("metadata") or {})
            # Fall back to "type" when the element has no "category" entry.
            metadata.update(
                {"category": element.get("category") or element.get("type")}
            )
            metadata.update({"element_id": element.get("element_id")})
            yield Document(
                page_content=cast(str, element.get("text")), metadata=metadata
            )

    @property
    def _elements_json(self) -> list[dict[str, Any]]:
        """Get elements as a list of dictionaries from local partition or via API."""
        if self.partition_via_api:
            return self._elements_via_api

        return self._convert_elements_to_dicts(self._elements_via_local)

    @property
    def _elements_via_local(self) -> list[Element]:
        """Partition the file or file path locally with ``unstructured``.

        Raises:
            ImportError: If the ``unstructured`` package cannot be imported.
            ValueError: If a file object is given without a
                ``metadata_filename`` keyword argument.
        """
        try:
            from unstructured.partition.auto import partition  # type: ignore
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from exc

        if self.file and self.unstructured_kwargs.get("metadata_filename") is None:
            raise ValueError(
                "If partitioning a fileIO object, metadata_filename must be specified"
                " as well.",
            )

        return partition(
            file=self.file, filename=self.file_path, **self.unstructured_kwargs
        )  # type: ignore

    @property
    def _elements_via_api(self) -> list[dict[str, Any]]:
        """Retrieve a list of element dicts from the API using the SDK client.

        Raises:
            ValueError: If the response status code is not 200.
        """
        client = self.client
        req = self._sdk_partition_request
        response = client.general.partition(req)  # type: ignore

        if response.status_code == 200:
            return json.loads(response.raw_response.text)
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    @property
    def _file_content(self) -> bytes:
        """Get content from either file or file_path.

        The file object takes precedence over the path when both are set.

        Raises:
            ValueError: If neither ``file`` nor ``file_path`` is set.
        """
        if self.file is not None:
            return self.file.read()
        elif self.file_path:
            with open(self.file_path, "rb") as f:
                return f.read()
        raise ValueError("file or file_path must be defined.")

    @property
    def _sdk_partition_request(self) -> operations.PartitionRequest:
        """Build the SDK partition request from the file content and kwargs."""
        # NOTE(review): when only `file` is given, `file_path` is None and the
        # request is sent with the literal file name "None" - confirm whether
        # the API relies on the file name.
        return operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=self._file_content, file_name=str(self.file_path)
                ),
                **self.unstructured_kwargs,
            ),
        )

    def _convert_elements_to_dicts(
        self, elements: list[Element]
    ) -> list[dict[str, Any]]:
        """Convert each element to a dictionary via its ``to_dict`` method."""
        return [element.to_dict() for element in elements]

    def _get_metadata(self) -> dict[str, Any]:
        """Get file_path metadata if available."""
        return {"source": self.file_path} if self.file_path else {}

    def _post_process_elements_json(
        self, elements_json: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Apply post processing functions to extracted unstructured elements.

        Post processing functions are str -> str callables passed
        in using the post_processors kwarg when the loader is instantiated.
        The element dictionaries are modified in place and the same list is
        returned.
        """
        if self.post_processors:
            for element in elements_json:
                for post_processor in self.post_processors:
                    element["text"] = post_processor(str(element.get("text")))
        return elements_json

File diff suppressed because it is too large Load Diff

View File

@ -1,101 +0,0 @@
[tool.poetry]
name = "langchain-unstructured"
version = "0.1.5"
description = "An integration package connecting Unstructured and LangChain"
authors = []
readme = "README.md"
repository = "https://github.com/langchain-ai/langchain"
license = "MIT"
[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/unstructured"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-unstructured%3D%3D0%22&expanded=true"
[tool.poetry.dependencies]
python = ">=3.9,<4.0"
langchain-core = "^0.3"
unstructured-client = { version = "^0.25.0" }
unstructured = { version = "^0.15.7", optional = true, python = "<3.13", extras = [
"all-docs",
] }
[tool.poetry.extras]
local = ["unstructured"]
[tool.poetry.group.test]
optional = true
[tool.poetry.group.test.dependencies]
pytest = "^7.4.3"
pytest-asyncio = "^0.23.2"
pytest-socket = "^0.7.0"
langchain-core = { path = "../../core", develop = true }
# TODO: hack to fix 3.9 builds
cffi = [{version = "<1.17.1", python="<3.10"}, {version = "*", python=">=3.10"}]
[tool.poetry.group.codespell]
optional = true
[tool.poetry.group.codespell.dependencies]
codespell = "^2.2.6"
[tool.poetry.group.test_integration]
optional = true
[tool.poetry.group.test_integration.dependencies]
[tool.poetry.group.lint]
optional = true
[tool.poetry.group.lint.dependencies]
ruff = "^0.5"
# TODO: hack to fix 3.9 builds
cffi = [{version = "<1.17.1", python="<3.10"}, {version = "*", python=">=3.10"}]
[tool.poetry.group.typing.dependencies]
mypy = "^1.7.1"
unstructured = { version = "^0.15.7", python = "<3.13", extras = ["all-docs"] }
langchain-core = { path = "../../core", develop = true }
[tool.poetry.group.dev]
optional = true
[tool.poetry.group.dev.dependencies]
langchain-core = { path = "../../core", develop = true }
[tool.ruff.lint]
select = [
"E", # pycodestyle
"F", # pyflakes
"I", # isort
"T201", # print
]
[tool.mypy]
disallow_untyped_defs = "True"
[tool.coverage.run]
omit = ["tests/*"]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.pytest.ini_options]
# --strict-markers will raise errors on unknown marks.
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
#
# https://docs.pytest.org/en/7.1.x/reference/reference.html
# --strict-config any warnings encountered while parsing the `pytest`
# section of the configuration file raise errors.
#
# https://github.com/tophat/syrupy
# --snapshot-warn-unused Prints a warning on unused snapshots rather than fail the test suite.
addopts = "--strict-markers --strict-config --durations=5"
# Registering custom markers.
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
markers = [
"compile: mark placeholder test used to compile integration tests without running them",
"local: mark tests as requiring a local install, which isn't compatible with CI currently",
]
asyncio_mode = "auto"

View File

@ -1,17 +0,0 @@
import sys
import traceback
from importlib.machinery import SourceFileLoader
def find_import_failures(files):
    """Try to load every given Python source file and collect the failures.

    Each file is loaded with ``SourceFileLoader`` under the module name "x".
    For every file that raises during loading, the path and the traceback are
    printed.

    Args:
        files: Iterable of paths to Python source files.

    Returns:
        The list of paths that failed to load, in input order.
    """
    failed = []
    for file in files:
        try:
            SourceFileLoader("x", file).load_module()
        except Exception:
            # Record the failure. The original flag assignment was misspelled,
            # so the exit status below could never become non-zero.
            failed.append(file)
            print(file)  # noqa: T201
            traceback.print_exc()
            print()  # noqa: T201
    return failed


if __name__ == "__main__":
    # Exit with status 1 when at least one file could not be imported.
    sys.exit(1 if find_import_failures(sys.argv[1:]) else 0)

View File

@ -1,18 +0,0 @@
#!/bin/bash

set -eu

# Counts how many of the forbidden import patterns produced a match.
errors=0

# The package must not import from langchain, langchain_experimental, or
# langchain_community. `git grep` prints any offending lines and succeeds only
# when it finds a match.
for package in langchain langchain_experimental langchain_community; do
  if git --no-pager grep "^from ${package}\." .; then
    errors=$((errors+1))
  fi
done

# Exit 0 only when none of the patterns matched.
if [ "$errors" -eq 0 ]; then
  exit 0
fi
exit 1

View File

@ -1,7 +0,0 @@
import pytest
@pytest.mark.compile
def test_placeholder() -> None:
    """No-op test carrying the ``compile`` marker.

    Used for compiling integration tests without running any real tests.
    """

View File

@ -1,236 +0,0 @@
import os
from pathlib import Path
from typing import Callable, List
import pytest
from langchain_core.documents import Document
from langchain_unstructured import UnstructuredLoader
# Example documents live in the community package's integration test tree,
# five directory levels above this file.
EXAMPLE_DOCS_DIRECTORY = str(
    Path(__file__).parent.parent.parent.parent.parent.joinpath(
        "community/tests/integration_tests/examples/"
    )
)

# API key read from the environment; None when the variable is not set.
UNSTRUCTURED_API_KEY = os.environ.get("UNSTRUCTURED_API_KEY")
def _check_docs_content(docs: List[Document]) -> None:
assert all(
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
)
assert (
sum(doc.metadata.get("category") == "PageBreak" for doc in docs) == 16
) # 16 page doc
expected_metadata_keys = [
"source",
"languages",
"page_number",
"category",
"coordinates",
"element_id",
]
for doc in docs:
if doc.page_content:
for key in expected_metadata_keys:
assert key in doc.metadata
else:
assert doc.metadata.get("category") == "PageBreak"
page_numbers = []
for doc in docs:
if page_number := doc.metadata.get("page_number"):
page_numbers.append(page_number)
assert set(page_numbers) == set(range(1, 17))
assert len(docs) >= 32 # (16 pages * (>=1 element per page) + 16 page breaks)
page_1_content = ""
for doc in docs:
if doc.metadata.get("page_number") == 1:
page_1_content += f" {doc.page_content}"
assert (
"LayoutParser: A Unified Toolkit for Deep Learning "
"Based Document Image Analysis"
) in page_1_content
categories = set(doc.metadata.get("category") for doc in docs)
assert "NarrativeText" in categories
assert "Title" in categories
# -- Local partition --
@pytest.mark.local
def test_loader_partitions_locally() -> None:
    """Partition the example PDF locally and check the resulting documents."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=pdf_path,
        # keyword arguments forwarded to partitioning
        strategy="fast",
        include_page_breaks=True,
    )

    loaded = loader.load()

    _check_docs_content(loaded)
@pytest.mark.local
async def test_loader_partitions_locally_async_lazy() -> None:
    """Partition the example PDF locally through the async lazy iterator."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=pdf_path,
        # keyword arguments forwarded to partitioning
        strategy="fast",
        include_page_breaks=True,
    )

    collected = [doc async for doc in loader.alazy_load()]

    _check_docs_content(collected)
@pytest.mark.local
def test_loader_partition_ignores_invalid_arg() -> None:
    """Local partitioning still succeeds when the obsolete `mode` kwarg is passed."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=pdf_path,
        strategy="fast",
        # mode is no longer a valid argument and is ignored when partitioning locally
        mode="single",
    )

    loaded = loader.load()

    assert len(loaded) > 1
    filenames = [doc.metadata.get("filename") for doc in loaded]
    assert all(name == "layout-parser-paper.pdf" for name in filenames)
@pytest.mark.local
def test_loader_partitions_locally_and_applies_post_processors(
    get_post_processor: Callable[[str], str],
) -> None:
    """Post processors passed to the loader are applied to the element text."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    loaded = UnstructuredLoader(
        file_path=pdf_path,
        post_processors=[get_post_processor],
        strategy="fast",
    ).load()

    assert len(loaded) > 1
    first_doc = loaded[0]
    assert first_doc.page_content.endswith("THE END!")
@pytest.mark.local
def test_url_loader() -> None:
    """Every document loaded from a web URL has content and HTML metadata."""
    target_url = "https://www.example.com/"
    loader = UnstructuredLoader(web_url=target_url)

    for loaded_doc in loader.load():
        assert loaded_doc.page_content
        meta = loaded_doc.metadata
        assert meta["filetype"] == "text/html"
        assert meta["url"] == "https://www.example.com/"
        assert meta["category"]
# -- API partition --
def test_loader_partitions_via_api() -> None:
    """Partition the example PDF through the API and check the documents."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    loaded = UnstructuredLoader(
        file_path=pdf_path,
        partition_via_api=True,
        # keyword arguments forwarded to partitioning
        strategy="fast",
        include_page_breaks=True,
        coordinates=True,
    ).load()

    _check_docs_content(loaded)
async def test_loader_partitions_via_api_async_lazy() -> None:
    """Partition the example PDF through the API using the async lazy iterator."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=pdf_path,
        partition_via_api=True,
        # keyword arguments forwarded to partitioning
        strategy="fast",
        include_page_breaks=True,
        coordinates=True,
    )

    collected = [doc async for doc in loader.alazy_load()]

    _check_docs_content(collected)
def test_loader_partitions_multiple_via_api() -> None:
    """A list of paths is partitioned through the API in the order given."""
    names = ("layout-parser-paper.pdf", "fake-email-attachment.eml")
    paths = [os.path.join(EXAMPLE_DOCS_DIRECTORY, name) for name in names]

    loaded = UnstructuredLoader(
        file_path=paths,
        api_key=UNSTRUCTURED_API_KEY,
        partition_via_api=True,
        # keyword arguments forwarded to partitioning
        strategy="fast",
    ).load()

    assert len(loaded) > 1
    first_doc, last_doc = loaded[0], loaded[-1]
    assert first_doc.metadata.get("filename") == "layout-parser-paper.pdf"
    assert last_doc.metadata.get("filename") == "fake-email-attachment.eml"
def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
    """Passing the unsupported `mode` kwarg fails with TypeError in API mode."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=pdf_path,
        api_key=UNSTRUCTURED_API_KEY,
        partition_via_api=True,
        mode="elements",
    )

    expected_message = "unexpected keyword argument 'mode'"
    with pytest.raises(TypeError, match=expected_message):
        loader.load()
def test_loader_partitions_via_api_hi_res() -> None:
    """The hi_res strategy yields both Table and Image elements for the PDF."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    loaded = UnstructuredLoader(
        file_path=pdf_path,
        partition_via_api=True,
        # keyword arguments forwarded to partitioning
        strategy="hi_res",
    ).load()

    seen_categories = {doc.metadata.get("category") for doc in loaded}
    assert "Table" in seen_categories
    assert "Image" in seen_categories
# -- fixtures ---
@pytest.fixture()
def get_post_processor() -> Callable[[str], str]:
    """Provide a post processor that appends "THE END!" to its input text."""

    def append_the_end(text: str) -> str:
        suffix = "THE END!"
        return text + suffix

    return append_the_end

View File

@ -1,218 +0,0 @@
from pathlib import Path
from typing import Any, Callable
from unittest import mock
from unittest.mock import Mock, mock_open, patch
import pytest
from unstructured.documents.elements import Text # type: ignore
from langchain_unstructured.document_loaders import (
UnstructuredLoader,
_SingleDocumentLoader, # type: ignore
)
# Example documents live in the community package's integration test tree,
# five directory levels above this file.
EXAMPLE_DOCS_DIRECTORY = str(
    Path(__file__).parent.parent.parent.parent.parent.joinpath(
        "community/tests/integration_tests/examples/"
    )
)
# --- UnstructuredLoader.__init__() ---
def test_it_initializes_with_file_path(monkeypatch: pytest.MonkeyPatch) -> None:
    """With no API key in the environment the loader gets default settings."""
    monkeypatch.delenv("UNSTRUCTURED_API_KEY", raising=False)

    loader = UnstructuredLoader(file_path="dummy_path")

    assert loader.file_path == "dummy_path"
    assert loader.file is None
    assert loader.partition_via_api is False
    assert loader.post_processors is None
    assert loader.unstructured_kwargs == {}

    # A client is always created and passed to _SingleDocumentLoader, but it's not
    # used unless partition_via_api=True
    assert loader.client is not None
    sdk_config = loader.client.sdk_configuration
    assert sdk_config.security.api_key_auth == ""  # type: ignore
    assert sdk_config.server_url == "https://api.unstructuredapp.io"
def test_it_initializes_with_env_api_key(monkeypatch: pytest.MonkeyPatch) -> None:
    """The API key is taken from the environment when not passed explicitly."""
    monkeypatch.setenv("UNSTRUCTURED_API_KEY", "FAKE_API_KEY")

    loader = UnstructuredLoader(file_path="dummy_path")

    assert loader.file_path == "dummy_path"
    assert loader.file is None
    assert loader.partition_via_api is False
    assert loader.post_processors is None
    assert loader.unstructured_kwargs == {}

    assert loader.client is not None
    sdk_config = loader.client.sdk_configuration
    assert sdk_config.security.api_key_auth == "FAKE_API_KEY"  # type: ignore
    assert sdk_config.server_url == "https://api.unstructuredapp.io"
# --- _SingleDocumentLoader._file_content ---
def test_it_gets_content_from_file() -> None:
    """`_file_content` reads the bytes from the file object exactly once."""
    fake_file = Mock()
    fake_file.read.return_value = b"content from file"
    loader = _SingleDocumentLoader(
        client=Mock(), file=fake_file, metadata_filename="fake.txt"
    )

    data = loader._file_content  # type: ignore

    assert data == b"content from file"
    fake_file.read.assert_called_once()
@patch("builtins.open", new_callable=mock_open, read_data=b"content from file_path")
def test_it_gets_content_from_file_path(mock_file: Mock) -> None:
    """`_file_content` opens the path in binary mode and reads it once."""
    loader = _SingleDocumentLoader(client=Mock(), file_path="dummy_path")

    data = loader._file_content  # type: ignore

    assert data == b"content from file_path"
    mock_file.assert_called_once_with("dummy_path", "rb")
    # Calling the mock again returns the same handle that the loader read from.
    opened_handle = mock_file()
    opened_handle.read.assert_called_once()
def test_it_raises_value_error_without_file_or_file_path() -> None:
    """`_file_content` raises ValueError when no file or path was given."""
    loader = _SingleDocumentLoader(client=Mock())

    with pytest.raises(ValueError) as exc_info:
        loader._file_content  # type: ignore

    message = str(exc_info.value)
    assert message == "file or file_path must be defined."
# --- _SingleDocumentLoader._elements_json ---
def test_it_calls_elements_via_api_with_valid_args() -> None:
    """`_elements_json` returns the API elements when partition_via_api is true."""
    api_elements = [{"element": "data"}]
    with patch.object(
        _SingleDocumentLoader, "_elements_via_api", new_callable=mock.PropertyMock
    ) as api_property:
        api_property.return_value = api_elements
        loader = _SingleDocumentLoader(
            client=Mock(),
            # Minimum required args for self._elements_via_api to be called:
            partition_via_api=True,
            api_key="some_key",
        )

        elements_json = loader._elements_json  # type: ignore

    api_property.assert_called_once()
    assert elements_json == [{"element": "data"}]
@patch.object(_SingleDocumentLoader, "_convert_elements_to_dicts")
def test_it_partitions_locally_by_default(mock_convert_elements_to_dicts: Mock) -> None:
    """Without partition_via_api, `_elements_json` uses the local elements."""
    mock_convert_elements_to_dicts.return_value = [{}]
    with patch.object(
        _SingleDocumentLoader, "_elements_via_local", new_callable=mock.PropertyMock
    ) as local_property:
        local_property.return_value = [{}]
        # No partition_via_api argument is passed, so the local path is taken.
        loader = _SingleDocumentLoader(client=Mock())

        elements_json = loader._elements_json  # type: ignore

    local_property.assert_called_once_with()
    mock_convert_elements_to_dicts.assert_called_once_with([{}])
    assert elements_json == [{}]
def test_it_partitions_locally_and_logs_warning_with_partition_via_api_False(
    caplog: pytest.LogCaptureFixture,
) -> None:
    """With partition_via_api=False the loader partitions locally, not via the API.

    `_elements_via_local` is a property, so it is patched with a PropertyMock;
    a plain mock would be returned as-is and its `return_value` never used.
    The original test made no assertions at all.

    NOTE(review): the test name mentions a logged warning, but the loader code
    under test emits none, so `caplog` is only kept for signature
    compatibility - confirm whether a warning is expected.
    """
    client = Mock()
    with patch.object(
        _SingleDocumentLoader, "_elements_via_local", new_callable=mock.PropertyMock
    ) as mock_get_elements_locally:
        mock_get_elements_locally.return_value = [Text("Mock text element.")]
        loader = _SingleDocumentLoader(
            client=client, partition_via_api=False, api_key="some_key"
        )

        elements_json = loader._elements_json  # type: ignore

    mock_get_elements_locally.assert_called_once_with()
    # The SDK client must stay untouched when partitioning locally.
    client.general.partition.assert_not_called()
    assert [element["text"] for element in elements_json] == ["Mock text element."]
# -- fixtures -------------------------------
@pytest.fixture()
def get_post_processor() -> Callable[[str], str]:
    """Provide a post processor that appends "THE END!" to its input text."""

    def append_the_end(text: str) -> str:
        suffix = "THE END!"
        return text + suffix

    return append_the_end
@pytest.fixture()
def fake_json_response() -> list[dict[str, Any]]:
    """Provide two fake element dicts (a title and a text element) for one PDF."""
    return [
        {
            "type": "Title",
            "element_id": "b7f58c2fd9c15949a55a62eb84e39575",
            # The second literal starts with a space; without it the implicit
            # concatenation produced "DocumentImage Analysis".
            "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document"
            " Image Analysis",
            "metadata": {
                "languages": ["eng"],
                "page_number": 1,
                "filename": "layout-parser-paper.pdf",
                "filetype": "application/pdf",
            },
        },
        {
            "type": "UncategorizedText",
            "element_id": "e1c4facddf1f2eb1d0db5be34ad0de18",
            "text": "1 2 0 2",
            "metadata": {
                "languages": ["eng"],
                "page_number": 1,
                "parent_id": "b7f58c2fd9c15949a55a62eb84e39575",
                "filename": "layout-parser-paper.pdf",
                "filetype": "application/pdf",
            },
        },
    ]
@pytest.fixture()
def fake_multiple_docs_json_response() -> list[dict[str, Any]]:
    """Provide fake element dicts that come from two different source files."""
    pdf_title_element: dict[str, Any] = {
        "type": "Title",
        "element_id": "b7f58c2fd9c15949a55a62eb84e39575",
        "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document"
        " Image Analysis",
        "metadata": {
            "languages": ["eng"],
            "page_number": 1,
            "filename": "layout-parser-paper.pdf",
            "filetype": "application/pdf",
        },
    }
    chat_text_element: dict[str, Any] = {
        "type": "NarrativeText",
        "element_id": "3c4ac9e7f55f1e3dbd87d3a9364642fe",
        "text": "6/29/23, 12:16\u202fam - User 4: This message was deleted",
        "metadata": {
            "filename": "whatsapp_chat.txt",
            "languages": ["eng"],
            "filetype": "text/plain",
        },
    }
    return [pdf_title_element, chat_text_element]

View File

@ -1,10 +0,0 @@
from langchain_unstructured import __all__
# Names the package is expected to export through ``__all__``.
EXPECTED_ALL = ["UnstructuredLoader", "__version__"]


def test_all_imports() -> None:
    """The package's ``__all__`` holds exactly the expected names, in any order."""
    expected_names = sorted(EXPECTED_ALL)
    exported_names = sorted(__all__)
    assert expected_names == exported_names