From 6ea3e57a637f8d27ec3608016ead00a1756efe28 Mon Sep 17 00:00:00 2001 From: preak95 Date: Mon, 25 Mar 2024 12:26:55 +0530 Subject: [PATCH] community[minor]: S3FileLoader to expose the mode and post_processors arguments of unstructured loader (#19270) **Description:** Update s3_file.py so that S3FileLoader accepts the **mode** and **post_processors** arguments and forwards them to the base class **UnstructuredBaseLoader**, in order to include more metadata about the files from the S3 bucket, such as *'page_number'* and *'languages'*. **Issue:** NA **Dependencies:** None **Twitter handle:** preak95 --------- Co-authored-by: ccurme Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> --- .../langchain_community/document_loaders/s3_file.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/s3_file.py b/libs/community/langchain_community/document_loaders/s3_file.py index eaca0761f2b..59b3164993a 100644 --- a/libs/community/langchain_community/document_loaders/s3_file.py +++ b/libs/community/langchain_community/document_loaders/s3_file.py @@ -2,7 +2,7 @@ from __future__ import annotations import os import tempfile -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, Callable, List, Optional, Union from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader @@ -27,6 +27,8 @@ class S3FileLoader(UnstructuredBaseLoader): aws_secret_access_key: Optional[str] = None, aws_session_token: Optional[str] = None, boto_config: Optional[botocore.client.Config] = None, + mode: str = "single", + post_processors: Optional[List[Callable]] = None, ): """Initialize with bucket and key name. @@ -82,8 +84,12 @@ class S3FileLoader(UnstructuredBaseLoader): object is set on the session, the config object used when creating the client will be the result of calling ``merge()`` on the default config with the config provided to this call. + :param mode: Mode in which to read the file. 
Valid options are: single, + paged and elements + :param post_processors: Post processing functions to be applied to + extracted elements """ - super().__init__() + super().__init__(mode, post_processors) self.bucket = bucket self.key = key self.region_name = region_name