community[minor]: S3FileLoader to expose mode and post_processors arguments of unstructured loader (#19270)

**Description:** Update s3_file.py so that **S3FileLoader** accepts the **mode** and
**post_processors** arguments and passes them to the base class
**UnstructuredBaseLoader**. This lets loaded documents include more metadata
about the files from the S3 bucket, such as *'page_number'*, *'languages'*, etc.

**Issue:** NA
**Dependencies:** None
**Twitter handle:** preak95

---------

Co-authored-by: ccurme <chester.curme@gmail.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
preak95 2024-03-25 12:26:55 +05:30 committed by GitHub
parent 560e2182d8
commit 6ea3e57a63
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import os import os
import tempfile import tempfile
from typing import TYPE_CHECKING, List, Optional, Union from typing import TYPE_CHECKING, Callable, List, Optional, Union
from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader
@ -27,6 +27,8 @@ class S3FileLoader(UnstructuredBaseLoader):
aws_secret_access_key: Optional[str] = None, aws_secret_access_key: Optional[str] = None,
aws_session_token: Optional[str] = None, aws_session_token: Optional[str] = None,
boto_config: Optional[botocore.client.Config] = None, boto_config: Optional[botocore.client.Config] = None,
mode: str = "single",
post_processors: Optional[List[Callable]] = None,
): ):
"""Initialize with bucket and key name. """Initialize with bucket and key name.
@ -82,8 +84,12 @@ class S3FileLoader(UnstructuredBaseLoader):
object is set on the session, the config object used when creating object is set on the session, the config object used when creating
the client will be the result of calling ``merge()`` on the the client will be the result of calling ``merge()`` on the
default config with the config provided to this call. default config with the config provided to this call.
:param mode: Mode in which to read the file. Valid options are: single,
paged and elements
:param post_processors: Post processing functions to be applied to
extracted elements
""" """
super().__init__() super().__init__(mode, post_processors)
self.bucket = bucket self.bucket = bucket
self.key = key self.key = key
self.region_name = region_name self.region_name = region_name