mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-28 17:38:36 +00:00
community[minor]: S3FileLoader to expose mode and post_processors arguments of unstructured loader (#19270)
**Description:** Update s3_file.py so that S3FileLoader accepts the **mode** and **post_processors** arguments and passes them to its base class **UnstructuredBaseLoader**, in order to include more metadata about the files from the S3 bucket, such as *'page_number'* and *'languages'*. **Issue:** NA **Dependencies:** None **Twitter handle:** preak95 --------- Co-authored-by: ccurme <chester.curme@gmail.com> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent
560e2182d8
commit
6ea3e57a63
@ -2,7 +2,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import TYPE_CHECKING, List, Optional, Union
|
from typing import TYPE_CHECKING, Callable, List, Optional, Union
|
||||||
|
|
||||||
from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader
|
from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader
|
||||||
|
|
||||||
@ -27,6 +27,8 @@ class S3FileLoader(UnstructuredBaseLoader):
|
|||||||
aws_secret_access_key: Optional[str] = None,
|
aws_secret_access_key: Optional[str] = None,
|
||||||
aws_session_token: Optional[str] = None,
|
aws_session_token: Optional[str] = None,
|
||||||
boto_config: Optional[botocore.client.Config] = None,
|
boto_config: Optional[botocore.client.Config] = None,
|
||||||
|
mode: str = "single",
|
||||||
|
post_processors: Optional[List[Callable]] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with bucket and key name.
|
"""Initialize with bucket and key name.
|
||||||
|
|
||||||
@ -82,8 +84,12 @@ class S3FileLoader(UnstructuredBaseLoader):
|
|||||||
object is set on the session, the config object used when creating
|
object is set on the session, the config object used when creating
|
||||||
the client will be the result of calling ``merge()`` on the
|
the client will be the result of calling ``merge()`` on the
|
||||||
default config with the config provided to this call.
|
default config with the config provided to this call.
|
||||||
|
:param mode: Mode in which to read the file. Valid options are: single,
|
||||||
|
paged and elements
|
||||||
|
:param post_processors: Post processing functions to be applied to
|
||||||
|
extracted elements
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__(mode, post_processors)
|
||||||
self.bucket = bucket
|
self.bucket = bucket
|
||||||
self.key = key
|
self.key = key
|
||||||
self.region_name = region_name
|
self.region_name = region_name
|
||||||
|
Loading…
Reference in New Issue
Block a user