mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-07 12:06:43 +00:00
community[patch]: Load list of files using UnstructuredFileLoader (#16216)
- **Description:** Updated the `_get_elements()` method of the `UnstructuredFileLoader` class to check whether `self.file_path` is a single file path or a list of file paths. If it is a list, the method iterates over the file paths, calls the partition function for each one, and appends the results to the elements list. If `self.file_path` is not a list, it calls the partition function as before. - **Issue:** Fixes #15607 - **Dependencies:** NA - **Twitter handle:** NA Co-authored-by: H161961 <Raunak.Raunak@Honeywell.com>
This commit is contained in:
parent
019b6ebe8d
commit
476bf8b763
@ -12,7 +12,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
"id": "2886982e",
|
"id": "2886982e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -100,6 +100,54 @@
|
|||||||
"docs[0].page_content[:400]"
|
"docs[0].page_content[:400]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "b4ab0a79",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Load list of files"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "092d9a0b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"files = [\"./example_data/whatsapp_chat.txt\", \"./example_data/layout-parser-paper.pdf\"]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f841c4f8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredFileLoader(files)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "993c240b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5ce4ff07",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs[0].page_content[:400]"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "7874d01d",
|
"id": "7874d01d",
|
||||||
@ -495,7 +543,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.9.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -170,7 +170,13 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
|||||||
def _get_elements(self) -> List:
|
def _get_elements(self) -> List:
|
||||||
from unstructured.partition.auto import partition
|
from unstructured.partition.auto import partition
|
||||||
|
|
||||||
return partition(filename=self.file_path, **self.unstructured_kwargs)
|
if isinstance(self.file_path, list):
|
||||||
|
elements = []
|
||||||
|
for file in self.file_path:
|
||||||
|
elements.extend(partition(filename=file, **self.unstructured_kwargs))
|
||||||
|
return elements
|
||||||
|
else:
|
||||||
|
return partition(filename=self.file_path, **self.unstructured_kwargs)
|
||||||
|
|
||||||
def _get_metadata(self) -> dict:
|
def _get_metadata(self) -> dict:
|
||||||
return {"source": self.file_path}
|
return {"source": self.file_path}
|
||||||
|
@ -28,6 +28,23 @@ def test_unstructured_loader_with_post_processor() -> None:
|
|||||||
assert docs[0].page_content.endswith("THE END!")
|
assert docs[0].page_content.endswith("THE END!")
|
||||||
|
|
||||||
|
|
||||||
|
def test_unstructured_file_loader_multiple_files() -> None:
|
||||||
|
"""Test unstructured loader."""
|
||||||
|
file_paths = [
|
||||||
|
os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"),
|
||||||
|
os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt"),
|
||||||
|
]
|
||||||
|
|
||||||
|
loader = UnstructuredFileLoader(
|
||||||
|
file_path=file_paths,
|
||||||
|
strategy="fast",
|
||||||
|
mode="elements",
|
||||||
|
)
|
||||||
|
docs = loader.load()
|
||||||
|
|
||||||
|
assert len(docs) > 1
|
||||||
|
|
||||||
|
|
||||||
def test_unstructured_api_file_loader() -> None:
|
def test_unstructured_api_file_loader() -> None:
|
||||||
"""Test unstructured loader."""
|
"""Test unstructured loader."""
|
||||||
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
||||||
|
Loading…
Reference in New Issue
Block a user