diff --git a/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb b/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb index eb21a2a3d19..683408995b4 100644 --- a/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb +++ b/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb @@ -90,7 +90,7 @@ { "data": { "text/plain": [ - "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]" + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]" ] }, "execution_count": 6, diff --git a/docs/extras/integrations/document_loaders/aws_s3_file.ipynb b/docs/extras/integrations/document_loaders/aws_s3_file.ipynb index ecf20098565..4646e4101e1 100644 --- a/docs/extras/integrations/document_loaders/aws_s3_file.ipynb +++ b/docs/extras/integrations/document_loaders/aws_s3_file.ipynb @@ -53,7 +53,7 @@ { "data": { "text/plain": [ - "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]" + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]" ] }, "execution_count": 9, @@ -96,3 +96,4 @@ "nbformat": 4, "nbformat_minor": 5 } + diff --git a/libs/langchain/langchain/document_loaders/s3_file.py b/libs/langchain/langchain/document_loaders/s3_file.py index 509b1ea1eec..2e0b56700e8 100644 --- a/libs/langchain/langchain/document_loaders/s3_file.py +++ b/libs/langchain/langchain/document_loaders/s3_file.py @@ -2,12 +2,10 @@ import os import tempfile from typing import List -from langchain.docstore.document import Document -from langchain.document_loaders.base import BaseLoader -from langchain.document_loaders.unstructured import UnstructuredFileLoader +from langchain.document_loaders.unstructured import UnstructuredBaseLoader -class S3FileLoader(BaseLoader): +class S3FileLoader(UnstructuredBaseLoader): """Load from `Amazon AWS S3` file.""" def __init__(self, bucket: str, key: str): @@ -17,11 +15,14 @@ class S3FileLoader(BaseLoader): bucket: The name of the S3 bucket. key: The key of the S3 object. """ + super().__init__() self.bucket = bucket self.key = key - def load(self) -> List[Document]: - """Load documents.""" + def _get_elements(self) -> List: + """Get elements.""" + from unstructured.partition.auto import partition + try: import boto3 except ImportError: @@ -34,5 +35,7 @@ class S3FileLoader(BaseLoader): file_path = f"{temp_dir}/{self.key}" os.makedirs(os.path.dirname(file_path), exist_ok=True) s3.download_file(self.bucket, self.key, file_path) - loader = UnstructuredFileLoader(file_path) - return loader.load() + return partition(filename=file_path) + + def _get_metadata(self) -> dict: + return {"source": f"s3://{self.bucket}/{self.key}"}