community[patch]: Load list of files using UnstructuredFileLoader (#16216)

- **Description:** Updated the `_get_elements()` method of the
`UnstructuredFileLoader` class to check whether `self.file_path`
is a single file or a list of files. If it is a list, the method iterates
over the file paths, calls the `partition` function for each one,
and extends the `elements` list with the results. If `self.file_path` is
not a list, it calls the `partition` function as before.
  
  - **Issue:** Fixed #15607
  - **Dependencies:** NA
  - **Twitter handle:** NA

Co-authored-by: H161961 <Raunak.Raunak@Honeywell.com>
This commit is contained in:
Raunak 2024-01-24 09:07:37 +05:30 committed by GitHub
parent 019b6ebe8d
commit 476bf8b763
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 74 additions and 3 deletions

View File

@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "2886982e",
"metadata": {},
"outputs": [],
@ -100,6 +100,54 @@
"docs[0].page_content[:400]"
]
},
{
"cell_type": "markdown",
"id": "b4ab0a79",
"metadata": {},
"source": [
"### Load list of files"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "092d9a0b",
"metadata": {},
"outputs": [],
"source": [
"files = [\"./example_data/whatsapp_chat.txt\", \"./example_data/layout-parser-paper.pdf\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f841c4f8",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredFileLoader(files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "993c240b",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ce4ff07",
"metadata": {},
"outputs": [],
"source": [
"docs[0].page_content[:400]"
]
},
{
"cell_type": "markdown",
"id": "7874d01d",
@ -495,7 +543,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.9.0"
}
},
"nbformat": 4,

View File

@ -170,7 +170,13 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
def _get_elements(self) -> List:
    """Partition the configured file(s) into unstructured elements.

    ``self.file_path`` may be a single path or a list of paths. When it is
    a list, each file is partitioned in order and the results are
    concatenated into one flat list; otherwise the single path is handed
    to ``partition`` directly.

    Returns:
        The elements returned by ``partition`` for the file(s).
    """
    # Imported inside the method rather than at module level.
    from unstructured.partition.auto import partition

    # The list check must run before any call to ``partition``: an
    # unconditional early return here would pass the whole list as
    # ``filename`` and leave the per-file branch unreachable.
    if isinstance(self.file_path, list):
        elements: List = []
        for path in self.file_path:
            elements.extend(partition(filename=path, **self.unstructured_kwargs))
        return elements

    return partition(filename=self.file_path, **self.unstructured_kwargs)
def _get_metadata(self) -> dict:
    """Return base metadata carrying ``self.file_path`` under ``"source"``.

    The value is passed through unchanged, so it is whatever was stored on
    the loader (a single path or a list of paths).
    """
    source = self.file_path
    return {"source": source}

View File

@ -28,6 +28,23 @@ def test_unstructured_loader_with_post_processor() -> None:
assert docs[0].page_content.endswith("THE END!")
def test_unstructured_file_loader_multiple_files() -> None:
    """Load a PDF and a text file through one loader and expect several documents."""
    example_names = ("layout-parser-paper.pdf", "whatsapp_chat.txt")
    file_paths = [
        os.path.join(EXAMPLE_DOCS_DIRECTORY, example_name)
        for example_name in example_names
    ]

    loader = UnstructuredFileLoader(
        file_path=file_paths,
        strategy="fast",
        mode="elements",
    )
    loaded_docs = loader.load()

    assert len(loaded_docs) > 1
def test_unstructured_api_file_loader() -> None:
"""Test unstructured loader."""
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")