diff --git a/docs/docs/integrations/document_loaders/unstructured_file.ipynb b/docs/docs/integrations/document_loaders/unstructured_file.ipynb index 113e42bf2bb..0d26030ab7e 100644 --- a/docs/docs/integrations/document_loaders/unstructured_file.ipynb +++ b/docs/docs/integrations/document_loaders/unstructured_file.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "2886982e", "metadata": {}, "outputs": [], @@ -100,6 +100,54 @@ "docs[0].page_content[:400]" ] }, + { + "cell_type": "markdown", + "id": "b4ab0a79", + "metadata": {}, + "source": [ + "### Load list of files" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "092d9a0b", + "metadata": {}, + "outputs": [], + "source": [ + "files = [\"./example_data/whatsapp_chat.txt\", \"./example_data/layout-parser-paper.pdf\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f841c4f8", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredFileLoader(files)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "993c240b", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ce4ff07", + "metadata": {}, + "outputs": [], + "source": [ + "docs[0].page_content[:400]" + ] + }, { "cell_type": "markdown", "id": "7874d01d", @@ -495,7 +543,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.9.0" } }, "nbformat": 4, diff --git a/libs/community/langchain_community/document_loaders/unstructured.py b/libs/community/langchain_community/document_loaders/unstructured.py index 9d8223ff860..b7ee7717056 100644 --- a/libs/community/langchain_community/document_loaders/unstructured.py +++ b/libs/community/langchain_community/document_loaders/unstructured.py @@ -170,7 +170,13 @@ class UnstructuredFileLoader(UnstructuredBaseLoader): def 
_get_elements(self) -> List: from unstructured.partition.auto import partition - return partition(filename=self.file_path, **self.unstructured_kwargs) + if isinstance(self.file_path, list): + elements = [] + for file in self.file_path: + elements.extend(partition(filename=file, **self.unstructured_kwargs)) + return elements + else: + return partition(filename=self.file_path, **self.unstructured_kwargs) def _get_metadata(self) -> dict: return {"source": self.file_path} diff --git a/libs/community/tests/integration_tests/document_loaders/test_unstructured.py b/libs/community/tests/integration_tests/document_loaders/test_unstructured.py index bb1d809ca5d..5bdd30f2c2e 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_unstructured.py +++ b/libs/community/tests/integration_tests/document_loaders/test_unstructured.py @@ -28,6 +28,23 @@ def test_unstructured_loader_with_post_processor() -> None: assert docs[0].page_content.endswith("THE END!") +def test_unstructured_file_loader_multiple_files() -> None: + """Test that UnstructuredFileLoader loads a list of file paths.""" + file_paths = [ + os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"), + os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt"), + ] + + loader = UnstructuredFileLoader( + file_path=file_paths, + strategy="fast", + mode="elements", + ) + docs = loader.load() + + assert len(docs) > 1 + + def test_unstructured_api_file_loader() -> None: """Test unstructured loader.""" file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")