diff --git a/docs/modules/document_loaders/examples/azure_blob_storage_container.ipynb b/docs/modules/document_loaders/examples/azure_blob_storage_container.ipynb new file mode 100644 index 00000000000..d843352f646 --- /dev/null +++ b/docs/modules/document_loaders/examples/azure_blob_storage_container.ipynb @@ -0,0 +1,135 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "a634365e", + "metadata": {}, + "source": [ + "# Azure Blob Storage Container\n", + "\n", + "This covers how to load document objects from a container on Azure Blob Storage." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2f0cd6a5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import AzureBlobStorageContainerLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "49815096", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install azure-storage-blob" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "321cc7f1", + "metadata": {}, + "outputs": [], + "source": [ + "loader = AzureBlobStorageContainerLoader(conn_str=\"\", container=\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2b11d155", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpaa9xl6ch/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "0690c40a", + "metadata": {}, + "source": [ + "## Specifying a prefix\n", + "You can also specify a prefix for more fine-grained control over what files to load."
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72d44781", + "metadata": {}, + "outputs": [], + "source": [ + "loader = AzureBlobStorageContainerLoader(conn_str=\"\", container=\"\", prefix=\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2d3c32db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885dc280", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/azure_blob_storage_file.ipynb b/docs/modules/document_loaders/examples/azure_blob_storage_file.ipynb new file mode 100644 index 00000000000..a7d3448717a --- /dev/null +++ b/docs/modules/document_loaders/examples/azure_blob_storage_file.ipynb @@ -0,0 +1,95 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "66a7777e", + "metadata": {}, + "source": [ + "# Azure Blob Storage File\n", + "\n", + "This covers how to load document objects from an Azure Blob Storage file."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9ec8a3b3", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import AzureBlobStorageFileLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "43128d8d", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install azure-storage-blob" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "35d6809a", + "metadata": {}, + "outputs": [], + "source": [ + "loader = AzureBlobStorageFileLoader(conn_str='', container='', blob_name='')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "efd6be84", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93689594", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index fb4f22ff311..ceac2b9be8c 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -2,6 +2,12 @@ from langchain.document_loaders.airbyte_json import AirbyteJSONLoader from langchain.document_loaders.azlyrics import AZLyricsLoader +from 
class AzureBlobStorageFileLoader(BaseLoader):
    """Load documents from a single blob in Azure Blob Storage.

    The blob is downloaded into a temporary directory and then parsed with
    ``UnstructuredFileLoader``.
    """

    def __init__(self, conn_str: str, container: str, blob_name: str):
        """Initialize with connection string, container and blob name.

        Args:
            conn_str: Connection string passed to
                ``BlobClient.from_connection_string``.
            container: Name of the container that holds the blob.
            blob_name: Name of the blob to download and load.
        """
        self.conn_str = conn_str
        self.container = container
        self.blob = blob_name

    def load(self) -> List[Document]:
        """Download the blob and load it as documents.

        Returns:
            The documents produced by ``UnstructuredFileLoader`` for the
            downloaded file.

        Raises:
            ValueError: If the ``azure-storage-blob`` package is not installed.
        """
        try:
            from azure.storage.blob import BlobClient
        except ImportError as exc:
            # ValueError (not ImportError) is kept so existing callers'
            # exception handlers keep matching.
            raise ValueError(
                "Could not import azure storage blob python package. "
                "Please install it with `pip install azure-storage-blob`."
            ) from exc

        client = BlobClient.from_connection_string(
            conn_str=self.conn_str, container_name=self.container, blob_name=self.blob
        )

        with tempfile.TemporaryDirectory() as temp_dir:
            # Plain string formatting is deliberate: os.path.join would drop
            # temp_dir entirely if the blob name started with a slash.
            file_path = f"{temp_dir}/{self.container}/{self.blob}"
            # Blob names may contain "/" segments, so create the parent
            # directories before opening the target file.
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            # `with client` closes the client's connections once the download
            # finishes, even if the download raises.
            with client, open(file_path, "wb") as file:
                client.download_blob().readinto(file)
            # The file is closed (and therefore flushed) before it is parsed,
            # and parsing happens while the temporary directory still exists.
            loader = UnstructuredFileLoader(file_path)
            return loader.load()


class AzureBlobStorageContainerLoader(BaseLoader):
    """Load documents from the blobs of an Azure Blob Storage container.

    Every blob whose name starts with ``prefix`` is loaded through
    ``AzureBlobStorageFileLoader`` and the results are concatenated.
    """

    def __init__(self, conn_str: str, container: str, prefix: str = ""):
        """Initialize with connection string, container and blob prefix.

        Args:
            conn_str: Connection string passed to
                ``ContainerClient.from_connection_string``.
            container: Name of the container to read from.
            prefix: Only blobs whose names start with this string are loaded.
        """
        self.conn_str = conn_str
        self.container = container
        self.prefix = prefix

    def load(self) -> List[Document]:
        """Load the documents of every matching blob in the container.

        Returns:
            The documents of all listed blobs, in listing order.

        Raises:
            ValueError: If the ``azure-storage-blob`` package is not installed.
        """
        try:
            from azure.storage.blob import ContainerClient
        except ImportError as exc:
            # ValueError (not ImportError) is kept so existing callers'
            # exception handlers keep matching.
            raise ValueError(
                "Could not import azure storage blob python package. "
                "Please install it with `pip install azure-storage-blob`."
            ) from exc

        container = ContainerClient.from_connection_string(
            conn_str=self.conn_str, container_name=self.container
        )
        docs: List[Document] = []
        # The listing is iterated inside the `with` block so the client is
        # still open while results are fetched, and is closed afterwards.
        with container:
            for blob in container.list_blobs(name_starts_with=self.prefix):
                loader = AzureBlobStorageFileLoader(
                    self.conn_str, self.container, blob.name  # type: ignore
                )
                docs.extend(loader.load())
        return docs