mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 06:53:16 +00:00
langchain[minor]: add azure ai data document loader (#13404)
This PR adds an "Azure AI data" document loader, which allows Azure AI users to load their registered data assets as a document object in langchain. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
@@ -34,6 +34,9 @@ from langchain.document_loaders.arxiv import ArxivLoader
|
||||
from langchain.document_loaders.assemblyai import AssemblyAIAudioTranscriptLoader
|
||||
from langchain.document_loaders.async_html import AsyncHtmlLoader
|
||||
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
||||
from langchain.document_loaders.azure_ai_data import (
|
||||
AzureAIDataLoader,
|
||||
)
|
||||
from langchain.document_loaders.azure_blob_storage_container import (
|
||||
AzureBlobStorageContainerLoader,
|
||||
)
|
||||
@@ -226,6 +229,7 @@ __all__ = [
|
||||
"ArxivLoader",
|
||||
"AssemblyAIAudioTranscriptLoader",
|
||||
"AsyncHtmlLoader",
|
||||
"AzureAIDataLoader",
|
||||
"AzureBlobStorageContainerLoader",
|
||||
"AzureBlobStorageFileLoader",
|
||||
"BSHTMLLoader",
|
||||
|
43
libs/langchain/langchain/document_loaders/azure_ai_data.py
Normal file
43
libs/langchain/langchain/document_loaders/azure_ai_data.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileIOLoader
|
||||
|
||||
|
||||
class AzureAIDataLoader(BaseLoader):
    """Load documents from an Azure AI data asset or storage location.

    Files are enumerated through ``azureml.fsspec.AzureMachineLearningFileSystem``
    and each one is parsed with ``UnstructuredFileIOLoader``.
    """

    def __init__(self, url: str, glob: Optional[str] = None):
        """Initialize with URL to a data asset or storage location.

        Args:
            url: URL to the data asset or storage location.
            glob: Optional glob pattern used to select files. When omitted
                (or empty), the file system's default listing is used.
        """
        self.url = url
        """URL to the data asset or storage location."""
        self.glob_pattern = glob
        """Optional glob pattern to select files. Defaults to None."""

    def load(self) -> List[Document]:
        """Load all documents eagerly.

        Returns:
            A list holding every document produced by ``lazy_load``.
        """
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield documents, one remote file at a time.

        Yields:
            The documents ``UnstructuredFileIOLoader`` produces for each
            remote path.

        Raises:
            ImportError: If the ``azureml-fsspec`` package is not installed.
        """
        # Imported lazily so that the dependency is only required when the
        # loader is actually used.
        try:
            from azureml.fsspec import AzureMachineLearningFileSystem
        except ImportError as exc:
            # The trailing space keeps the two implicitly concatenated
            # literals from running together in the rendered message.
            raise ImportError(
                "Could not import azureml-fsspec package. "
                "Please install it with `pip install azureml-fsspec`."
            ) from exc

        fs = AzureMachineLearningFileSystem(self.url)

        # A truthy glob pattern narrows the selection; otherwise fall back to
        # the file system's plain listing.
        if self.glob_pattern:
            remote_paths_list = fs.glob(self.glob_pattern)
        else:
            remote_paths_list = fs.ls()

        for remote_path in remote_paths_list:
            # The handle is kept open only while this file is being parsed.
            with fs.open(remote_path) as f:
                loader = UnstructuredFileIOLoader(file=f)
                yield from loader.load()
|
@@ -22,6 +22,7 @@ EXPECTED_ALL = [
|
||||
"ArxivLoader",
|
||||
"AssemblyAIAudioTranscriptLoader",
|
||||
"AsyncHtmlLoader",
|
||||
"AzureAIDataLoader",
|
||||
"AzureBlobStorageContainerLoader",
|
||||
"AzureBlobStorageFileLoader",
|
||||
"BSHTMLLoader",
|
||||
|
Reference in New Issue
Block a user