unstructured, community, initialize langchain-unstructured package (#22779)

#### Update (2): 
A single `UnstructuredLoader` is added to handle both local and api
partitioning. This loader also handles single or multiple documents.

#### Changes in `community`:
Changes here do not affect users. In the initial process of using the
SDK for the API Loaders, the Loaders in community were refactored.
Other changes include:
The `UnstructuredBaseLoader` has a new check to see if both
`mode="paged"` and `chunking_strategy="by_page"` are set. It also now has
`Element.element_id` added to the `Document.metadata`.
`UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader` now both
directly inherit from `UnstructuredBaseLoader`, initialize their
`file_path`/`file` attributes respectively, and implement their own
`_post_process_elements` methods.

--------
#### Update:
New SDK Loaders in a [partner
package](https://python.langchain.com/v0.1/docs/contributing/integrations/#partner-package-in-langchain-repo)
are introduced to prevent breaking changes for users (see discussion
below).

##### TODO:
- [x] Test docstring examples
--------
- **Description:** UnstructuredAPIFileIOLoader and
UnstructuredAPIFileLoader calls to the unstructured api are now made
using the unstructured-client sdk.
- **New Dependencies:** unstructured-client

- [x] **Add tests and docs**: If you're adding a new integration, please
include
- [x] a test for the integration, preferably unit tests that do not rely
on network access,
- [x] update the description in
`docs/docs/integrations/providers/unstructured.mdx`
- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

TODO:
- [x] Update
https://python.langchain.com/v0.1/docs/integrations/document_loaders/unstructured_file/#unstructured-api
-
`langchain/docs/docs/integrations/document_loaders/unstructured_file.ipynb`
- The description here needs to indicate that users should install
`unstructured-client` instead of `unstructured`. Read over closely to
look for any other changes that need to be made.
- [x] Update the `lazy_load` method in `UnstructuredBaseLoader` to
handle json responses from the API instead of just lists of elements.
- This method may need to be overwritten by the API loaders instead of
changing it in the `UnstructuredBaseLoader`.
- [x] Update the documentation links in the class docstrings (the
Unstructured documents have moved)
- [x] Update Document.metadata to include `element_id` (see thread
[here](https://unstructuredw-kbe4326.slack.com/archives/C044N0YV08G/p1718187499818419))

---------

Signed-off-by: ChengZi <chen.zhang@zilliz.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
Co-authored-by: ChengZi <chen.zhang@zilliz.com>
This commit is contained in:
John
2024-07-24 19:21:20 -04:00
committed by GitHub
parent 2394807033
commit d59c656ea5
23 changed files with 5929 additions and 347 deletions

View File

@@ -0,0 +1,7 @@
import pytest
@pytest.mark.compile
def test_placeholder() -> None:
    """Let the integration test suite be compiled without running real tests."""
    return None

View File

@@ -0,0 +1,135 @@
import os
from pathlib import Path
from typing import Callable
import pytest
from langchain_unstructured import UnstructuredLoader
# Sample documents are looked up under community/tests/integration_tests/examples,
# resolved against the fifth ancestor of this file's path.
EXAMPLE_DOCS_DIRECTORY = str(
    Path(__file__).parent.parent.parent.parent.parent.joinpath(
        "community/tests/integration_tests/examples/"
    )
)
# Read once at import time; None when the environment variable is not set.
UNSTRUCTURED_API_KEY = os.environ.get("UNSTRUCTURED_API_KEY")
# -- Local partition --
@pytest.mark.local
def test_loader_partitions_locally() -> None:
    """Load a PDF without the API and check filename metadata and page breaks."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    # Everything other than `file_path` is an Unstructured kwarg.
    loader = UnstructuredLoader(
        file_path=pdf_path,
        strategy="fast",
        include_page_breaks=True,
    )

    documents = loader.load()

    for document in documents:
        assert document.metadata.get("filename") == "layout-parser-paper.pdf"
    categories = [document.metadata.get("category") for document in documents]
    assert "PageBreak" in categories
@pytest.mark.local
def test_loader_partition_ignores_invalid_arg() -> None:
    """Load a PDF locally while passing the no-longer-valid `mode` argument."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=pdf_path,
        # Unstructured kwargs
        strategy="fast",
        # mode is no longer a valid argument and is ignored when partitioning locally
        mode="single",
    )

    documents = loader.load()

    assert len(documents) > 1
    for document in documents:
        assert document.metadata.get("filename") == "layout-parser-paper.pdf"
@pytest.mark.local
def test_loader_partitions_locally_and_applies_post_processors(
    get_post_processor: Callable[[str], str],
) -> None:
    """Load a PDF locally and check the post-processor output on the first document.

    Args:
        get_post_processor: fixture providing the text post-processor to apply.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    documents = UnstructuredLoader(
        file_path=pdf_path,
        post_processors=[get_post_processor],
        strategy="fast",
    ).load()

    assert len(documents) > 1
    first_document = documents[0]
    assert first_document.page_content.endswith("THE END!")
# -- API partition --
def test_loader_partitions_via_api() -> None:
    """Load a PDF with `partition_via_api=True` and check the resulting metadata."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    # NOTE(review): no `api_key` is passed here, unlike the other API tests in this
    # file; presumably the loader obtains the key some other way -- confirm.
    documents = UnstructuredLoader(
        file_path=pdf_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    ).load()

    assert len(documents) > 1
    categories = [document.metadata.get("category") for document in documents]
    assert "PageBreak" in categories
    for document in documents:
        assert document.metadata.get("filename") == "layout-parser-paper.pdf"
    assert documents[0].metadata.get("element_id") is not None
def test_loader_partitions_multiple_via_api() -> None:
    """Load two files in one API-backed loader and check first/last filenames."""
    example_names = ("layout-parser-paper.pdf", "fake-email-attachment.eml")
    paths = [os.path.join(EXAMPLE_DOCS_DIRECTORY, name) for name in example_names]

    documents = UnstructuredLoader(
        file_path=paths,
        api_key=UNSTRUCTURED_API_KEY,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
    ).load()

    assert len(documents) > 1
    first_document, last_document = documents[0], documents[-1]
    assert first_document.metadata.get("filename") == "layout-parser-paper.pdf"
    assert last_document.metadata.get("filename") == "fake-email-attachment.eml"
def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
    """Expect a TypeError naming `mode` when loading via the API with `mode` set."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader_kwargs = {
        "file_path": pdf_path,
        "api_key": UNSTRUCTURED_API_KEY,
        "partition_via_api": True,
        "mode": "elements",
    }
    loader = UnstructuredLoader(**loader_kwargs)

    # Constructing the loader is expected to succeed; only `load()` is guarded.
    with pytest.raises(TypeError, match="unexpected keyword argument 'mode'"):
        loader.load()
# -- fixtures ---
@pytest.fixture()
def get_post_processor() -> Callable[[str], str]:
    """Provide a post-processor that appends ``"THE END!"`` to its input text."""

    def append_the_end(text: str) -> str:
        """Return *text* followed by the marker suffix."""
        suffixed = text + "THE END!"
        return suffixed

    return append_the_end

View File

@@ -0,0 +1,178 @@
from pathlib import Path
from typing import Any, Callable
from unittest import mock
from unittest.mock import Mock, mock_open, patch
import pytest
from unstructured.documents.elements import Text # type: ignore
from langchain_unstructured.document_loaders import (
_SingleDocumentLoader, # type: ignore
)
# Sample documents are looked up under community/tests/integration_tests/examples,
# resolved against the fifth ancestor of this file's path.
EXAMPLE_DOCS_DIRECTORY = str(
    Path(__file__).parent.parent.parent.parent.parent.joinpath(
        "community/tests/integration_tests/examples/"
    )
)
# --- _SingleDocumentLoader._file_content ---
def test_it_gets_content_from_file() -> None:
    """`_file_content` returns the bytes read from `file`, reading exactly once."""
    expected_bytes = b"content from file"
    # Dotted keyword configures the child mock: `fake_file.read()` returns the bytes.
    fake_file = Mock(**{"read.return_value": expected_bytes})
    loader = _SingleDocumentLoader(
        client=Mock(),
        file=fake_file,
        metadata_filename="fake.txt",
    )

    actual_bytes = loader._file_content  # type: ignore

    assert actual_bytes == b"content from file"
    fake_file.read.assert_called_once()
@patch("builtins.open", new_callable=mock_open, read_data=b"content from file_path")
def test_it_gets_content_from_file_path(mock_file: Mock) -> None:
    """`_file_content` opens `file_path` in binary mode and reads it once.

    Args:
        mock_file: the `mock_open` replacement for `builtins.open`.
    """
    loader = _SingleDocumentLoader(client=Mock(), file_path="dummy_path")

    actual_bytes = loader._file_content  # type: ignore

    assert actual_bytes == b"content from file_path"
    mock_file.assert_called_once_with("dummy_path", "rb")
    # `mock_open` hands out its file handle as the mock's return value.
    opened_handle = mock_file.return_value
    opened_handle.read.assert_called_once()
def test_it_raises_value_error_without_file_or_file_path() -> None:
    """`_file_content` raises ValueError when neither `file` nor `file_path` is given."""
    loader = _SingleDocumentLoader(client=Mock())

    with pytest.raises(ValueError) as exc_info:
        loader._file_content  # type: ignore

    message = str(exc_info.value)
    assert message == "file or file_path must be defined."
# --- _SingleDocumentLoader._elements_json ---
def test_it_calls_elements_via_api_with_valid_args() -> None:
    """`_elements_json` returns `_elements_via_api` when `partition_via_api=True`."""
    api_elements_patch = patch.object(
        _SingleDocumentLoader, "_elements_via_api", new_callable=mock.PropertyMock
    )
    with api_elements_patch as api_elements_property:
        api_elements_property.return_value = [{"element": "data"}]
        # Minimum required args for self._elements_via_api to be called:
        loader = _SingleDocumentLoader(
            client=Mock(),
            partition_via_api=True,
            api_key="some_key",
        )

        elements = loader._elements_json  # type: ignore

    api_elements_property.assert_called_once()
    assert elements == [{"element": "data"}]
@patch.object(_SingleDocumentLoader, "_convert_elements_to_dicts")
def test_it_partitions_locally_by_default(mock_convert_elements_to_dicts: Mock) -> None:
    """`_elements_json` converts `_elements_via_local` when no API option is given.

    Args:
        mock_convert_elements_to_dicts: patched `_convert_elements_to_dicts` method.
    """
    mock_convert_elements_to_dicts.return_value = [{}]
    local_elements_patch = patch.object(
        _SingleDocumentLoader, "_elements_via_local", new_callable=mock.PropertyMock
    )
    with local_elements_patch as local_elements_property:
        local_elements_property.return_value = [{}]
        # Only `client` is supplied, so no API-related option is set on the loader.
        loader = _SingleDocumentLoader(client=Mock())

        elements = loader._elements_json  # type: ignore

    local_elements_property.assert_called_once_with()
    mock_convert_elements_to_dicts.assert_called_once_with([{}])
    assert elements == [{}]
def test_it_partitions_locally_and_logs_warning_with_partition_via_api_False(
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Partition locally when `partition_via_api=False`, even if `api_key` is given.

    Args:
        caplog: pytest log-capture fixture; kept in the signature for the warning
            check that the test name refers to.
    """
    # `_elements_via_local` is patched as a property (`PropertyMock`) elsewhere in
    # this file. A plain mock would be handed back as-is on attribute access, so the
    # configured `return_value` would never reach the loader.
    with patch.object(
        _SingleDocumentLoader, "_elements_via_local", new_callable=mock.PropertyMock
    ) as mock_get_elements_locally:
        mock_get_elements_locally.return_value = [Text("Mock text element.")]
        loader = _SingleDocumentLoader(
            client=Mock(), partition_via_api=False, api_key="some_key"
        )

        _ = loader._elements_json  # type: ignore

    # The local partitioning path must have been used exactly once.
    mock_get_elements_locally.assert_called_once_with()
    # NOTE(review): the test name promises a logged warning, but the expected
    # message is not visible from this file; add a `caplog` assertion once the
    # message is confirmed.
# -- fixtures -------------------------------
@pytest.fixture()
def get_post_processor() -> Callable[[str], str]:
    """Provide a post-processor that appends ``"THE END!"`` to its input text."""

    def append_the_end(text: str) -> str:
        """Return *text* followed by the marker suffix."""
        suffixed = text + "THE END!"
        return suffixed

    return append_the_end
@pytest.fixture()
def fake_json_response() -> list[dict[str, Any]]:
    """Provide two element dicts for one PDF: a title and an uncategorized text.

    Returns:
        A list of element dicts, each with `type`, `element_id`, `text` and
        `metadata` keys. The second element's `parent_id` is the first element's
        `element_id`.
    """
    return [
        {
            "type": "Title",
            "element_id": "b7f58c2fd9c15949a55a62eb84e39575",
            # Adjacent literals are concatenated verbatim, so the second piece
            # must carry the separating space itself.
            "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document"
            " Image Analysis",
            "metadata": {
                "languages": ["eng"],
                "page_number": 1,
                "filename": "layout-parser-paper.pdf",
                "filetype": "application/pdf",
            },
        },
        {
            "type": "UncategorizedText",
            "element_id": "e1c4facddf1f2eb1d0db5be34ad0de18",
            "text": "1 2 0 2",
            "metadata": {
                "languages": ["eng"],
                "page_number": 1,
                "parent_id": "b7f58c2fd9c15949a55a62eb84e39575",
                "filename": "layout-parser-paper.pdf",
                "filetype": "application/pdf",
            },
        },
    ]
@pytest.fixture()
def fake_multiple_docs_json_response() -> list[dict[str, Any]]:
    """Provide two element dicts that come from two different files.

    Returns:
        A list holding a title element for a PDF followed by a narrative-text
        element for a plain-text file.
    """
    pdf_title_element: dict[str, Any] = {
        "type": "Title",
        "element_id": "b7f58c2fd9c15949a55a62eb84e39575",
        "text": (
            "LayoutParser: A Unified Toolkit for Deep Learning Based "
            "Document Image Analysis"
        ),
        "metadata": {
            "languages": ["eng"],
            "page_number": 1,
            "filename": "layout-parser-paper.pdf",
            "filetype": "application/pdf",
        },
    }
    chat_text_element: dict[str, Any] = {
        "type": "NarrativeText",
        "element_id": "3c4ac9e7f55f1e3dbd87d3a9364642fe",
        "text": "6/29/23, 12:16\u202fam - User 4: This message was deleted",
        "metadata": {
            "filename": "whatsapp_chat.txt",
            "languages": ["eng"],
            "filetype": "text/plain",
        },
    }
    return [pdf_title_element, chat_text_element]

View File

@@ -0,0 +1,10 @@
from langchain_unstructured import __all__
# Names the package is expected to list in its `__all__`.
EXPECTED_ALL = ["UnstructuredLoader", "__version__"]
def test_all_imports() -> None:
    """The package's `__all__` holds exactly the expected names, in any order."""
    exported_names = sorted(__all__)
    expected_names = sorted(EXPECTED_ALL)
    assert expected_names == exported_names