mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 06:14:37 +00:00
community[patch]: support unstructured_kwargs for s3 loader (#15473)
Fixes https://github.com/langchain-ai/langchain/issues/15472

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
b901649032
commit
be2adb1083
@ -2,7 +2,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import TYPE_CHECKING, Callable, List, Optional, Union
|
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union
|
||||||
|
|
||||||
from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader
|
from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader
|
||||||
|
|
||||||
@ -29,6 +29,7 @@ class S3FileLoader(UnstructuredBaseLoader):
|
|||||||
boto_config: Optional[botocore.client.Config] = None,
|
boto_config: Optional[botocore.client.Config] = None,
|
||||||
mode: str = "single",
|
mode: str = "single",
|
||||||
post_processors: Optional[List[Callable]] = None,
|
post_processors: Optional[List[Callable]] = None,
|
||||||
|
**unstructured_kwargs: Any,
|
||||||
):
|
):
|
||||||
"""Initialize with bucket and key name.
|
"""Initialize with bucket and key name.
|
||||||
|
|
||||||
@ -85,11 +86,13 @@ class S3FileLoader(UnstructuredBaseLoader):
|
|||||||
the client will be the result of calling ``merge()`` on the
|
the client will be the result of calling ``merge()`` on the
|
||||||
default config with the config provided to this call.
|
default config with the config provided to this call.
|
||||||
:param mode: Mode in which to read the file. Valid options are: single,
|
:param mode: Mode in which to read the file. Valid options are: single,
|
||||||
paged and elements
|
paged and elements.
|
||||||
:param post_processors: Post processing functions to be applied to
|
:param post_processors: Post processing functions to be applied to
|
||||||
extracted elements
|
extracted elements.
|
||||||
|
:param **unstructured_kwargs: Arbitrary additional kwargs to pass in when
|
||||||
|
calling `partition`
|
||||||
"""
|
"""
|
||||||
super().__init__(mode, post_processors)
|
super().__init__(mode, post_processors, **unstructured_kwargs)
|
||||||
self.bucket = bucket
|
self.bucket = bucket
|
||||||
self.key = key
|
self.key = key
|
||||||
self.region_name = region_name
|
self.region_name = region_name
|
||||||
@ -129,7 +132,7 @@ class S3FileLoader(UnstructuredBaseLoader):
|
|||||||
file_path = f"{temp_dir}/{self.key}"
|
file_path = f"{temp_dir}/{self.key}"
|
||||||
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||||
s3.download_file(self.bucket, self.key, file_path)
|
s3.download_file(self.bucket, self.key, file_path)
|
||||||
return partition(filename=file_path)
|
return partition(filename=file_path, **self.unstructured_kwargs)
|
||||||
|
|
||||||
def _get_metadata(self) -> dict:
|
def _get_metadata(self) -> dict:
|
||||||
return {"source": f"s3://{self.bucket}/{self.key}"}
|
return {"source": f"s3://{self.bucket}/{self.key}"}
|
||||||
|
Loading…
Reference in New Issue
Block a user