mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-03 11:47:49 +00:00
feat: batch multiple files in a single Unstructured API request (#4525)
### Submit Multiple Files to the Unstructured API Enables batching multiple files into a single Unstructured API request. Support for requests with multiple files was added to both `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`. Note that if you submit multiple files in "single" mode, the results will be concatenated into a single document. We recommend using this feature in "elements" mode. ### Testing The following should load both documents, using two of the example docs from the integration tests folder. ```python from langchain.document_loaders import UnstructuredAPIFileLoader file_paths = ["examples/layout-parser-paper.pdf", "examples/whatsapp_chat.txt"] loader = UnstructuredAPIFileLoader( file_path=file_paths, api_key="FAKE_API_KEY", strategy="fast", mode="elements", ) docs = loader.load() ```
This commit is contained in:
parent
0c3de0a0b3
commit
bf3f554357
@ -287,10 +287,118 @@
|
||||
"docs[:5]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b066cb5a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Unstructured API\n",
|
||||
"\n",
|
||||
"If you want to get up and running with less setup, you can simply run `pip install unstructured` and use `UnstructuredAPIFileLoader` or `UnstructuredAPIFileIOLoader`. That will process your document using the hosted Unstructured API. Note that currently (as of 11 May 2023) the Unstructured API is open, but it will soon require an API key. The [Unstructured documentation](https://unstructured-io.github.io/) page will have instructions on how to generate an API key once keys are available. Check out the instructions [here](https://github.com/Unstructured-IO/unstructured-api#dizzy-instructions-for-using-the-docker-image) if you’d like to self-host the Unstructured API or run it locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b50c70bc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import UnstructuredAPIFileLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "12b6d2cf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"filenames = [\"example_data/fake.docx\", \"example_data/fake-email.eml\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "39a9894d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredAPIFileLoader(\n",
|
||||
" file_path=filenames[0],\n",
|
||||
" api_key=\"FAKE_API_KEY\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "386eb63c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "94158999",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also batch multiple files through the Unstructured API in a single API call using `UnstructuredAPIFileLoader`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "79a18e7e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredAPIFileLoader(\n",
|
||||
" file_path=filenames,\n",
|
||||
" api_key=\"FAKE_API_KEY\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "a3d7c846",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='Lorem ipsum dolor sit amet.\\n\\nThis is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', metadata={'source': ['example_data/fake.docx', 'example_data/fake-email.eml']})"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f52b04cb",
|
||||
"id": "0e510495",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
@ -312,7 +420,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.8.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -23,7 +23,7 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
||||
|
||||
is_ppt = detect_filetype(self.file_path) == FileType.PPT
|
||||
except ImportError:
|
||||
_, extension = os.path.splitext(self.file_path)
|
||||
_, extension = os.path.splitext(str(self.file_path))
|
||||
is_ppt = extension == ".ppt"
|
||||
|
||||
if is_ppt and unstructured_version < (0, 4, 11):
|
||||
|
@ -1,6 +1,7 @@
|
||||
"""Loader that uses unstructured to load files."""
|
||||
import collections
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import IO, Any, List
|
||||
from typing import IO, Any, List, Sequence, Union
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
@ -92,7 +93,10 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
"""Loader that uses unstructured to load files."""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||
self,
|
||||
file_path: Union[str, List[str]],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
@ -107,12 +111,48 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
return {"source": self.file_path}
|
||||
|
||||
|
||||
def get_elements_from_api(
    file_path: Union[str, List[str], None] = None,
    file: Union[IO, Sequence[IO], None] = None,
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
    **unstructured_kwargs: Any,
) -> List:
    """Fetch partitioned elements from the Unstructured API.

    When ``file_path`` is a list or ``file`` is a sequence, everything is sent
    through ``partition_multiple_via_api`` and the per-entry results are merged
    into one flat list. Otherwise the input is forwarded to ``partition_via_api``
    and its result is returned unchanged.

    Args:
        file_path: A single path, or a list of paths to submit together.
        file: A single file object, or a sequence of file objects.
        api_url: Endpoint of the Unstructured API.
        api_key: API key forwarded to the partition call.
        **unstructured_kwargs: Extra keyword arguments passed straight through
            to the underlying partition function.

    Returns:
        A flat list of elements.
    """
    is_batch = isinstance(file_path, list) or isinstance(
        file, collections.abc.Sequence
    )

    if not is_batch:
        # Imported lazily so `unstructured` is only needed when this is called.
        from unstructured.partition.api import partition_via_api

        return partition_via_api(
            filename=file_path,
            file=file,
            api_key=api_key,
            api_url=api_url,
            **unstructured_kwargs,
        )

    from unstructured.partition.api import partition_multiple_via_api

    per_entry_elements = partition_multiple_via_api(
        filenames=file_path,
        files=file,
        api_key=api_key,
        api_url=api_url,
        **unstructured_kwargs,
    )

    # The batched call yields one collection of elements per entry; merge them
    # in order into a single flat list.
    return [element for group in per_entry_elements for element in group]
|
||||
|
||||
|
||||
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
||||
"""Loader that uses the unstructured web API to load files."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, List[str]] = "",
|
||||
mode: str = "single",
|
||||
url: str = "https://api.unstructured.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
@ -120,23 +160,22 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
|
||||
min_unstructured_version = "0.6.2"
|
||||
if not satisfies_min_unstructured_version(min_unstructured_version):
|
||||
raise ValueError(
|
||||
"Partitioning via API is only supported in "
|
||||
f"unstructured>={min_unstructured_version}."
|
||||
)
|
||||
if isinstance(file_path, str):
|
||||
validate_unstructured_version(min_unstructured_version="0.6.2")
|
||||
else:
|
||||
validate_unstructured_version(min_unstructured_version="0.6.3")
|
||||
|
||||
self.url = url
|
||||
self.api_key = api_key
|
||||
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
def _get_metadata(self) -> dict:
|
||||
return {"source": self.file_path}
|
||||
|
||||
return partition_via_api(
|
||||
filename=self.file_path,
|
||||
def _get_elements(self) -> List:
|
||||
return get_elements_from_api(
|
||||
file_path=self.file_path,
|
||||
api_key=self.api_key,
|
||||
api_url=self.url,
|
||||
**self.unstructured_kwargs,
|
||||
@ -146,7 +185,12 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
||||
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
||||
"""Loader that uses unstructured to load file IO objects."""
|
||||
|
||||
def __init__(self, file: IO, mode: str = "single", **unstructured_kwargs: Any):
|
||||
def __init__(
|
||||
self,
|
||||
file: Union[IO, Sequence[IO]],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
self.file = file
|
||||
super().__init__(mode=mode, **unstructured_kwargs)
|
||||
@ -165,7 +209,7 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file: IO,
|
||||
file: Union[IO, Sequence[IO]],
|
||||
mode: str = "single",
|
||||
url: str = "https://api.unstructured.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
@ -173,21 +217,18 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
|
||||
min_unstructured_version = "0.6.2"
|
||||
if not satisfies_min_unstructured_version(min_unstructured_version):
|
||||
raise ValueError(
|
||||
"Partitioning via API is only supported in "
|
||||
f"unstructured>={min_unstructured_version}."
|
||||
)
|
||||
if isinstance(file, collections.abc.Sequence):
|
||||
validate_unstructured_version(min_unstructured_version="0.6.3")
|
||||
if file:
|
||||
validate_unstructured_version(min_unstructured_version="0.6.2")
|
||||
|
||||
self.url = url
|
||||
self.api_key = api_key
|
||||
|
||||
super().__init__(file=file, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
return partition_via_api(
|
||||
return get_elements_from_api(
|
||||
file=self.file,
|
||||
api_key=self.api_key,
|
||||
api_url=self.url,
|
||||
|
@ -82,7 +82,7 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
|
||||
|
||||
is_doc = detect_filetype(self.file_path) == FileType.DOC
|
||||
except ImportError:
|
||||
_, extension = os.path.splitext(self.file_path)
|
||||
_, extension = os.path.splitext(str(self.file_path))
|
||||
is_doc = extension == ".doc"
|
||||
|
||||
if is_doc and unstructured_version < (0, 4, 11):
|
||||
|
@ -0,0 +1,82 @@
|
||||
import os
|
||||
from contextlib import ExitStack
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import (
|
||||
UnstructuredAPIFileIOLoader,
|
||||
UnstructuredAPIFileLoader,
|
||||
)
|
||||
|
||||
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
|
||||
|
||||
|
||||
def test_unstructured_api_file_loader() -> None:
    """Load one PDF by path through the API loader in "elements" mode."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    docs = UnstructuredAPIFileLoader(
        file_path=pdf_path,
        api_key="FAKE_API_KEY",
        strategy="fast",
        mode="elements",
    ).load()

    # "elements" mode should produce more than one document for the PDF.
    assert len(docs) > 1
|
||||
|
||||
|
||||
def test_unstructured_api_file_loader_multiple_files() -> None:
    """Load two example files by path in a single batched API loader call."""
    example_names = ("layout-parser-paper.pdf", "whatsapp_chat.txt")
    # The loader is given a list of paths, so build a list.
    batch_paths = [os.path.join(EXAMPLE_DOCS_DIRECTORY, name) for name in example_names]

    docs = UnstructuredAPIFileLoader(
        file_path=batch_paths,
        api_key="FAKE_API_KEY",
        strategy="fast",
        mode="elements",
    ).load()

    assert len(docs) > 1
|
||||
|
||||
|
||||
def test_unstructured_api_file_io_loader() -> None:
    """Load one PDF from an open binary file object through the API IO loader."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    # Keep the handle open for the whole load; the loader reads from it.
    with open(pdf_path, "rb") as pdf_file:
        docs = UnstructuredAPIFileIOLoader(
            file=pdf_file,
            api_key="FAKE_API_KEY",
            strategy="fast",
            mode="elements",
            file_filename=pdf_path,
        ).load()

    assert len(docs) > 1
|
||||
|
||||
|
||||
def test_unstructured_api_file_loader_io_multiple_files() -> None:
    """Load two example files from open file objects in one batched call."""
    example_names = ("layout-parser-paper.pdf", "whatsapp_chat.txt")
    batch_paths = [os.path.join(EXAMPLE_DOCS_DIRECTORY, name) for name in example_names]

    # ExitStack closes every opened handle once the block exits.
    with ExitStack() as stack:
        handles = []
        for path in batch_paths:
            handles.append(stack.enter_context(open(path, "rb")))

        docs = UnstructuredAPIFileIOLoader(
            file=handles,  # type: ignore
            api_key="FAKE_API_KEY",
            strategy="fast",
            mode="elements",
            file_filenames=batch_paths,
        ).load()

    assert len(docs) > 1
|
Loading…
Reference in New Issue
Block a user