From fd781c89cc554ee3b99c2c0a6be3562725401648 Mon Sep 17 00:00:00 2001 From: Samuel Kemp Date: Sat, 2 Dec 2023 03:25:55 +0000 Subject: [PATCH] langchain[minor]: add azure ai data document loader (#13404) This PR adds an "Azure AI data" document loader, which allows Azure AI users to load their registered data assets as a document object in langchain. --------- Co-authored-by: Bagatur --- .../document_loaders/azure_ai_data.ipynb | 174 ++++++++++++++++++ .../langchain/document_loaders/__init__.py | 4 + .../document_loaders/azure_ai_data.py | 43 +++++ .../document_loaders/test_imports.py | 1 + 4 files changed, 222 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/azure_ai_data.ipynb create mode 100644 libs/langchain/langchain/document_loaders/azure_ai_data.py diff --git a/docs/docs/integrations/document_loaders/azure_ai_data.ipynb b/docs/docs/integrations/document_loaders/azure_ai_data.ipynb new file mode 100644 index 00000000000..93ab36edbb0 --- /dev/null +++ b/docs/docs/integrations/document_loaders/azure_ai_data.ipynb @@ -0,0 +1,174 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a634365e", + "metadata": {}, + "source": [ + "# Azure AI Data\n", + "\n", + ">[Azure AI Studio](https://ai.azure.com/) provides the capability to upload data assets to cloud storage and register existing data assets from the following sources:\n", + "\n", + "- Microsoft OneLake\n", + "- Azure Blob Storage\n", + "- Azure Data Lake gen 2\n", + "\n", + "The benefit of this approach over `AzureBlobStorageContainerLoader` and `AzureBlobStorageFileLoader` is that authentication is handled seamlessly to cloud storage. You can use either *identity-based* data access control to the data or *credential-based* (e.g. SAS token, account key). 
In the case of credential-based data access you do not need to specify secrets in your code or set up key vaults - the system handles that for you.\n", + "\n", + "This notebook covers how to load document objects from a data asset in AI Studio." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49815096", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install azureml-fsspec azure-ai-generative" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2f0cd6a5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from azure.ai.resources.client import AIClient\n", + "from azure.identity import DefaultAzureCredential\n", + "from langchain.document_loaders import AzureAIDataLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08d40b11-e87a-426e-a6b0-89f24e47ce2c", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a connection to your project\n", + "client = AIClient(\n", + " credential=DefaultAzureCredential(),\n", + " subscription_id=\"\",\n", + " resource_group_name=\"\",\n", + " project_name=\"\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "321cc7f1", + "metadata": {}, + "outputs": [], + "source": [ + "# get the latest version of your data asset\n", + "data_asset = client.data.get(name=\"\", label=\"latest\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25d91cea-c5f2-4a53-ac19-442810451ec6", + "metadata": {}, + "outputs": [], + "source": [ + "# load the data asset\n", + "loader = AzureAIDataLoader(url=data_asset.path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2b11d155", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpaa9xl6ch/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "0690c40a", + "metadata": {}, + "source": [ + "## Specifying a glob pattern\n", + "You can also specify a glob pattern for more fine-grained control over what files to load. In the example below, only files with a `pdf` extension will be loaded." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72d44781", + "metadata": {}, + "outputs": [], + "source": [ + "loader = AzureAIDataLoader(url=data_asset.path, glob=\"*.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2d3c32db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885dc280", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py index 96cfd9e1b54..119496f9c66 100644 --- a/libs/langchain/langchain/document_loaders/__init__.py +++ b/libs/langchain/langchain/document_loaders/__init__.py @@ -34,6 +34,9 @@ from langchain.document_loaders.arxiv import ArxivLoader from langchain.document_loaders.assemblyai import
AssemblyAIAudioTranscriptLoader
 from langchain.document_loaders.async_html import AsyncHtmlLoader
 from langchain.document_loaders.azlyrics import AZLyricsLoader
+from langchain.document_loaders.azure_ai_data import (
+    AzureAIDataLoader,
+)
 from langchain.document_loaders.azure_blob_storage_container import (
     AzureBlobStorageContainerLoader,
 )
@@ -226,6 +229,7 @@ __all__ = [
     "ArxivLoader",
     "AssemblyAIAudioTranscriptLoader",
     "AsyncHtmlLoader",
+    "AzureAIDataLoader",
     "AzureBlobStorageContainerLoader",
     "AzureBlobStorageFileLoader",
     "BSHTMLLoader",
diff --git a/libs/langchain/langchain/document_loaders/azure_ai_data.py b/libs/langchain/langchain/document_loaders/azure_ai_data.py
new file mode 100644
index 00000000000..f9d08875ba2
--- /dev/null
+++ b/libs/langchain/langchain/document_loaders/azure_ai_data.py
@@ -0,0 +1,49 @@
+from typing import Iterator, List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileIOLoader
+
+
+class AzureAIDataLoader(BaseLoader):
+    """Load documents from an Azure AI data asset or storage location."""
+
+    def __init__(self, url: str, glob: Optional[str] = None):
+        """Initialize with the URL to a data asset or storage location."""
+        self.url = url
+        """URL to the data asset or storage location."""
+        self.glob_pattern = glob
+        """Optional glob pattern to select files. Defaults to None."""
+
+    def load(self) -> List[Document]:
+        """Load all documents eagerly by draining ``lazy_load`` into a list."""
+        return list(self.lazy_load())
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazily yield the documents parsed from each selected remote path.
+
+        Raises:
+            ImportError: If the ``azureml-fsspec`` package cannot be imported.
+        """
+        # Imported here, not at module top, so a missing azureml-fsspec only
+        # fails when loading is attempted.
+        try:
+            from azureml.fsspec import AzureMachineLearningFileSystem
+        except ImportError as exc:
+            raise ImportError(
+                "Could not import azureml-fsspec package. "
+                "Please install it with `pip install azureml-fsspec`."
+            ) from exc
+
+        fs = AzureMachineLearningFileSystem(self.url)
+
+        # With a glob pattern only matching paths are loaded; otherwise every
+        # path returned by the filesystem listing is loaded.
+        if self.glob_pattern:
+            remote_paths_list = fs.glob(self.glob_pattern)
+        else:
+            remote_paths_list = fs.ls()
+
+        for remote_path in remote_paths_list:
+            with fs.open(remote_path) as f:
+                loader = UnstructuredFileIOLoader(file=f)
+                yield from loader.load()
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_imports.py b/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
index 9f35b895e46..db754275234 100644
--- a/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
@@ -22,6 +22,7 @@ EXPECTED_ALL = [
     "ArxivLoader",
     "AssemblyAIAudioTranscriptLoader",
     "AsyncHtmlLoader",
+    "AzureAIDataLoader",
     "AzureBlobStorageContainerLoader",
     "AzureBlobStorageFileLoader",
     "BSHTMLLoader",