Add SharePoint Loader (#4284)

- Added a loader (`SharePointLoader`) that can pull documents (`pdf`,
`docx`, `doc`) from the [SharePoint Document
Library](https://support.microsoft.com/en-us/office/what-is-a-document-library-3b5976dd-65cf-4c9e-bf5a-713c10ca2872).
- Added a base loader (`O365BaseLoader`) to be used by all loaders that
use the [O365](https://github.com/O365/python-o365) package.
- Refactored `OneDriveLoader` to use the new `O365BaseLoader`.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
José Ferraz Neto
2023-08-21 11:49:07 -03:00
committed by GitHub
parent bb4f7936f9
commit f116e10d53
7 changed files with 426 additions and 183 deletions

View File

@@ -147,6 +147,7 @@ from langchain.document_loaders.rst import UnstructuredRSTLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.sharepoint import SharePointLoader
from langchain.document_loaders.sitemap import SitemapLoader
from langchain.document_loaders.slack_directory import SlackDirectoryLoader
from langchain.document_loaders.snowflake_loader import SnowflakeLoader
@@ -316,6 +317,7 @@ __all__ = [
"S3FileLoader",
"SRTLoader",
"SeleniumURLLoader",
"SharePointLoader",
"SitemapLoader",
"SlackDirectoryLoader",
"SnowflakeLoader",

View File

@@ -0,0 +1,182 @@
"""Base class for all loaders that uses O365 Package"""
from __future__ import annotations
import logging
import os
import tempfile
from abc import abstractmethod
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader
from langchain.document_loaders.blob_loaders.schema import Blob
from langchain.pydantic_v1 import BaseModel, BaseSettings, Field, FilePath, SecretStr
if TYPE_CHECKING:
from O365 import Account
from O365.drive import Drive, Folder
logger = logging.getLogger(__name__)
CHUNK_SIZE = 1024 * 1024 * 5
class _O365Settings(BaseSettings):
    """Credentials for the Office365 API client.

    Both fields are required and are populated from the ``O365_CLIENT_ID`` and
    ``O365_CLIENT_SECRET`` environment variables (or from a ``.env`` file).
    """

    client_id: str = Field(..., env="O365_CLIENT_ID")
    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")

    class Config:
        env_prefix = ""
        # Previously misspelled as `case_sentive`, which is not a recognised
        # Config option and was therefore silently ignored.
        case_sensitive = False
        env_file = ".env"
class _O365TokenStorage(BaseSettings):
    """Settings describing where the O365 token file lives on disk."""

    # Defaults to ~/.credentials/o365_token.txt.
    token_path: FilePath = Field(Path.home() / ".credentials" / "o365_token.txt")
class _FileType(str, Enum):
DOC = "doc"
DOCX = "docx"
PDF = "pdf"
def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
    """Map each requested file type to its MIME type.

    Args:
        file_types: File types to look up; each one is matched on its ``value``.

    Returns:
        A dict keyed by file-type value (``"doc"``, ``"docx"``, ``"pdf"``) with
        the corresponding MIME type. Values that are not recognised are left
        out of the result.
    """
    known_mime_types = {
        "doc": "application/msword",
        "docx": (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ),
        "pdf": "application/pdf",
    }
    return {
        kind.value: known_mime_types[kind.value]
        for kind in file_types
        if kind.value in known_mime_types
    }
class O365BaseLoader(BaseLoader, BaseModel):
    """Base class for loaders built on the O365 package.

    Subclasses declare which file types they support (``_file_types``) and
    which scopes they need (``_scopes``). This class provides authentication
    and helpers that download matching files into a temporary directory and
    yield them as blobs.
    """

    settings: _O365Settings = Field(default_factory=_O365Settings)
    """Settings for the Office365 API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    chunk_size: Union[int, str] = CHUNK_SIZE
    """Number of bytes to retrieve from each api call to the server. int or 'auto'."""

    @property
    @abstractmethod
    def _file_types(self) -> Sequence[_FileType]:
        """Return supported file types."""

    @property
    def _fetch_mime_types(self) -> Dict[str, str]:
        """Return a dict of supported file types to corresponding mime types."""
        return fetch_mime_types(self._file_types)

    @property
    @abstractmethod
    def _scopes(self) -> List[str]:
        """Return required scopes."""

    def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
        """Lazily load the supported files found directly in a folder.

        Only items that are files and whose MIME type is one of the supported
        types are downloaded; everything else (including sub-folders) is
        skipped.

        Args:
            folder: The O365 Folder whose items are scanned.

        Yields:
            Blob instances for the files downloaded into a temporary directory.
            The directory is removed once the generator finishes, so blobs
            should be consumed as they are yielded.
        """
        # Build the lookup once instead of once per item.
        allowed_mime_types = set(self._fetch_mime_types.values())
        items = folder.get_items()
        with tempfile.TemporaryDirectory() as temp_dir:
            for file in items:
                if file.is_file and file.mime_type in allowed_mime_types:
                    file.download(to_path=temp_dir, chunk_size=self.chunk_size)
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()

    def _load_from_object_ids(
        self, drive: Drive, object_ids: List[str]
    ) -> Iterable[Blob]:
        """Lazily load files specified by their object_ids from a drive.

        Object ids that do not resolve to an item are logged and skipped.
        Resolved items are downloaded only when they are files with a supported
        MIME type.

        Args:
            drive: The O365 Drive the object ids are looked up in.
            object_ids: Identifiers of the items to load.

        Yields:
            Blob instances for the files downloaded into a temporary directory.
            The directory is removed once the generator finishes, so blobs
            should be consumed as they are yielded.
        """
        # Build the lookup once instead of once per item.
        allowed_mime_types = set(self._fetch_mime_types.values())
        with tempfile.TemporaryDirectory() as temp_dir:
            for object_id in object_ids:
                file = drive.get_item(object_id)
                if not file:
                    # Use the module logger; the message previously lacked the
                    # space between "with" and "object_id".
                    logger.warning(
                        "There isn't a file with object_id %s in drive %s.",
                        object_id,
                        drive,
                    )
                    continue
                if file.is_file and file.mime_type in allowed_mime_types:
                    file.download(to_path=temp_dir, chunk_size=self.chunk_size)
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()

    def _auth(self) -> Account:
        """Build and authenticate the O365 account used by the loader.

        When ``auth_with_token`` is set, the token backend points at the file
        given by ``_O365TokenStorage.token_path``; otherwise it uses the
        ``~/.credentials`` directory with the backend's default file name.

        Returns:
            The Account object after ``authenticate()`` has been called.

        Raises:
            ImportError: If the O365 package is not installed.
        """
        try:
            from O365 import Account, FileSystemTokenBackend
        except ImportError as e:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            ) from e

        if self.auth_with_token:
            token_path = _O365TokenStorage().token_path
            token_backend = FileSystemTokenBackend(
                token_path=token_path.parent, token_filename=token_path.name
            )
        else:
            token_backend = FileSystemTokenBackend(
                token_path=Path.home() / ".credentials"
            )

        account = Account(
            credentials=(
                self.settings.client_id,
                self.settings.client_secret.get_secret_value(),
            ),
            scopes=self._scopes,
            token_backend=token_backend,
            raise_http_errors=False,
        )
        # NOTE(review): the result of authenticate() is not checked here.
        account.authenticate()
        return account

View File

@@ -2,129 +2,49 @@
from __future__ import annotations
import logging
import os
import tempfile
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
from typing import TYPE_CHECKING, Iterator, List, Optional, Sequence, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.onedrive_file import OneDriveFileLoader
from langchain.pydantic_v1 import BaseModel, BaseSettings, Field, FilePath, SecretStr
from langchain.document_loaders.base_o365 import (
O365BaseLoader,
_FileType,
)
from langchain.document_loaders.parsers.registry import get_parser
from langchain.pydantic_v1 import Field
if TYPE_CHECKING:
from O365 import Account
from O365.drive import Drive, Folder
SCOPES = ["offline_access", "Files.Read.All"]
logger = logging.getLogger(__name__)
class _OneDriveSettings(BaseSettings):
    """Credentials for the OneDrive API client.

    Both fields are required and are populated from the ``O365_CLIENT_ID`` and
    ``O365_CLIENT_SECRET`` environment variables (or from a ``.env`` file).
    """

    client_id: str = Field(..., env="O365_CLIENT_ID")
    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")

    class Config:
        env_prefix = ""
        # Previously misspelled as `case_sentive`, which is not a recognised
        # Config option and was therefore silently ignored.
        case_sensitive = False
        env_file = ".env"
class _OneDriveTokenStorage(BaseSettings):
    """Settings describing where the O365 token file lives on disk."""

    # Defaults to ~/.credentials/o365_token.txt.
    token_path: FilePath = Field(
        Path.home() / ".credentials" / "o365_token.txt",
    )
class _FileType(str, Enum):
DOC = "doc"
DOCX = "docx"
PDF = "pdf"
class _SupportedFileTypes(BaseModel):
    """A validated list of file types with a helper to get their MIME types."""

    file_types: List[_FileType]

    def fetch_mime_types(self) -> Dict[str, str]:
        """Map each configured file type to its MIME type.

        Returns:
            A dict keyed by file-type value (``"doc"``, ``"docx"``, ``"pdf"``)
            with the corresponding MIME type. Unrecognised values are omitted.
        """
        known_mime_types = {
            "doc": "application/msword",
            "docx": (
                "application/vnd.openxmlformats-officedocument"
                ".wordprocessingml.document"
            ),
            "pdf": "application/pdf",
        }
        return {
            kind.value: known_mime_types[kind.value]
            for kind in self.file_types
            if kind.value in known_mime_types
        }
class OneDriveLoader(BaseLoader, BaseModel):
class OneDriveLoader(O365BaseLoader):
"""Load from `Microsoft OneDrive`."""
settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
""" The settings for the OneDrive API client."""
drive_id: str = Field(...)
""" The ID of the OneDrive drive to load data from."""
folder_path: Optional[str] = None
""" The path to the folder to load data from."""
object_ids: Optional[List[str]] = None
""" The IDs of the objects to load data from."""
auth_with_token: bool = False
""" Whether to authenticate with a token or not. Defaults to False."""
def _auth(self) -> Type[Account]:
"""
Authenticates the OneDrive API client using the specified
authentication method and returns the Account object.
@property
def _file_types(self) -> Sequence[_FileType]:
    """Return the file types this loader supports: doc, docx and pdf."""
    supported = (_FileType.DOC, _FileType.DOCX, _FileType.PDF)
    return supported
Returns:
Type[Account]: The authenticated Account object.
"""
try:
from O365 import FileSystemTokenBackend
except ImportError:
raise ImportError(
"O365 package not found, please install it with `pip install o365`"
)
if self.auth_with_token:
token_storage = _OneDriveTokenStorage()
token_path = token_storage.token_path
token_backend = FileSystemTokenBackend(
token_path=token_path.parent, token_filename=token_path.name
)
account = Account(
credentials=(
self.settings.client_id,
self.settings.client_secret.get_secret_value(),
),
scopes=SCOPES,
token_backend=token_backend,
**{"raise_http_errors": False},
)
else:
token_backend = FileSystemTokenBackend(
token_path=Path.home() / ".credentials"
)
account = Account(
credentials=(
self.settings.client_id,
self.settings.client_secret.get_secret_value(),
),
scopes=SCOPES,
token_backend=token_backend,
**{"raise_http_errors": False},
)
# make the auth
account.authenticate()
return account
@property
def _scopes(self) -> List[str]:
"""Return required scopes."""
return ["offline_access", "Files.Read.All"]
def _get_folder_from_path(self, drive: Type[Drive]) -> Union[Folder, Drive]:
def _get_folder_from_path(self, drive: Drive) -> Union[Folder, Drive]:
"""
Returns the folder or drive object located at the
specified path relative to the given drive.
Args:
drive (Type[Drive]): The root drive from which the folder path is relative.
drive (Drive): The root drive from which the folder path is relative.
Returns:
Union[Folder, Drive]: The folder or drive object
@@ -151,90 +71,26 @@ class OneDriveLoader(BaseLoader, BaseModel):
raise FileNotFoundError("Path {} not exist.".format(self.folder_path))
return subfolder_drive
def _load_from_folder(self, folder: Type[Folder]) -> List[Document]:
    """Load every supported document found directly in a folder.

    Items that are not files, or whose MIME type is not doc/docx/pdf, are
    skipped.

    Args:
        folder (Type[Folder]): The folder object to load the documents from.

    Returns:
        List[Document]: The documents produced by ``OneDriveFileLoader`` for
        each matching file, in the order the folder returned them.
    """
    supported = _SupportedFileTypes(file_types=["doc", "docx", "pdf"])
    accepted_mime_types = list(supported.fetch_mime_types().values())
    entries = folder.get_items()
    documents = []
    with tempfile.TemporaryDirectory() as temp_dir:
        os.makedirs(os.path.dirname(f"{temp_dir}"), exist_ok=True)
        for entry in entries:
            if not entry.is_file:
                continue
            if entry.mime_type not in accepted_mime_types:
                continue
            documents.extend(OneDriveFileLoader(file=entry).load())
    return documents
def _load_from_object_ids(self, drive: Type[Drive]) -> List[Document]:
    """Load the supported documents identified by ``self.object_ids``.

    Ids that do not resolve to an item are logged and skipped, as are items
    that are not files or whose MIME type is not doc/docx/pdf. When
    ``self.object_ids`` is empty or unset, a single lookup with an empty id
    is performed.

    Args:
        drive (Type[Drive]): The OneDrive drive object
            to load the documents from.

    Returns:
        List[Document]: The documents produced by ``OneDriveFileLoader`` for
        each matching file.
    """
    supported = _SupportedFileTypes(file_types=["doc", "docx", "pdf"])
    accepted_mime_types = list(supported.fetch_mime_types().values())
    documents = []
    with tempfile.TemporaryDirectory() as temp_dir:
        os.makedirs(os.path.dirname(f"{temp_dir}"), exist_ok=True)
        requested_ids = self.object_ids if self.object_ids else [""]
        for object_id in requested_ids:
            item = drive.get_item(object_id)
            if not item:
                logger.warning(
                    "There isn't a file with "
                    f"object_id {object_id} in drive {drive}."
                )
                continue
            if item.is_file and item.mime_type in accepted_mime_types:
                documents.extend(OneDriveFileLoader(file=item).load())
    return documents
def lazy_load(self) -> Iterator[Document]:
    """Yield documents one at a time from the configured OneDrive drive.

    Files under ``folder_path`` (if set) are parsed first, followed by the
    files named in ``object_ids`` (if set).

    Raises:
        ImportError: If the O365 package is not installed.
        ValueError: If ``drive_id`` does not resolve to a Drive.
    """
    try:
        from O365.drive import Drive
    except ImportError:
        raise ImportError(
            "O365 package not found, please install it with `pip install o365`"
        )

    account = self._auth()
    drive = account.storage().get_drive(self.drive_id)
    if not isinstance(drive, Drive):
        raise ValueError(f"There isn't a Drive with id {self.drive_id}.")

    parser = get_parser("default")

    if self.folder_path:
        source_folder = self._get_folder_from_path(drive)
        for folder_blob in self._load_from_folder(source_folder):
            yield from parser.lazy_parse(folder_blob)

    if self.object_ids:
        for object_blob in self._load_from_object_ids(drive, self.object_ids):
            yield from parser.lazy_parse(object_blob)
def load(self) -> List[Document]:
"""
Loads all supported document files from the specified OneDrive drive
and return a list of Document objects.
Returns:
List[Document]: A list of Document objects
representing the loaded documents.
Raises:
ValueError: If the specified drive ID
does not correspond to a drive in the OneDrive storage.
"""
account = self._auth()
storage = account.storage()
drive = storage.get_drive(self.drive_id)
docs: List[Document] = []
if not drive:
raise ValueError(f"There isn't a drive with id {self.drive_id}.")
if self.folder_path:
folder = self._get_folder_from_path(drive=drive)
docs.extend(self._load_from_folder(folder=folder))
elif self.object_ids:
docs.extend(self._load_from_object_ids(drive=drive))
return docs
"""Load all documents."""
return list(self.lazy_load())

View File

@@ -0,0 +1,34 @@
from typing import Iterator
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document
class MsWordParser(BaseBlobParser):
    """Parse Microsoft Word blobs (doc and docx MIME types) via `unstructured`."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse a Word blob into a single Document.

        Args:
            blob: The blob to parse; its ``mimetype`` selects the partitioner.

        Yields:
            One Document whose content is the string form of every partitioned
            element joined by blank lines, with the blob source as metadata.

        Raises:
            ImportError: If `unstructured` cannot be imported.
            ValueError: If the blob's MIME type is neither doc nor docx.
        """
        try:
            from unstructured.partition.doc import partition_doc
            from unstructured.partition.docx import partition_docx
        except ImportError as e:
            raise ImportError(
                "Could not import unstructured, please install with `pip install "
                "unstructured`."
            ) from e

        partitioners = {
            "application/msword": partition_doc,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
                partition_docx
            ),
        }
        partition = partitioners.get(blob.mimetype)
        if partition is None:
            raise ValueError("This blob type is not supported for this parser.")

        with blob.as_bytes_io() as word_document:
            elements = partition(file=word_document)
            page_content = "\n\n".join(str(element) for element in elements)
            yield Document(
                page_content=page_content,
                metadata={"source": blob.source},
            )

View File

@@ -1,6 +1,7 @@
"""Module includes a registry of default parser configurations."""
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain.document_loaders.parsers.msword import MsWordParser
from langchain.document_loaders.parsers.pdf import PyMuPDFParser
from langchain.document_loaders.parsers.txt import TextParser
@@ -11,6 +12,10 @@ def _get_default_parser() -> BaseBlobParser:
handlers={
"application/pdf": PyMuPDFParser(),
"text/plain": TextParser(),
"application/msword": MsWordParser(),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
MsWordParser()
),
},
fallback_parser=None,
)

View File

@@ -0,0 +1,59 @@
"""Loader that loads data from Sharepoint Document Library"""
from __future__ import annotations
from typing import Iterator, List, Optional, Sequence
from langchain.docstore.document import Document
from langchain.document_loaders.base_o365 import (
O365BaseLoader,
_FileType,
)
from langchain.document_loaders.parsers.registry import get_parser
from langchain.pydantic_v1 import Field
class SharePointLoader(O365BaseLoader):
    """Load from `SharePoint`."""

    document_library_id: str = Field(...)
    """ The ID of the SharePoint document library to load data from."""
    folder_path: Optional[str] = None
    """ The path to the folder to load data from."""
    object_ids: Optional[List[str]] = None
    """ The IDs of the objects to load data from."""

    @property
    def _file_types(self) -> Sequence[_FileType]:
        """Return the file types this loader supports: doc, docx and pdf."""
        supported = (_FileType.DOC, _FileType.DOCX, _FileType.PDF)
        return supported

    @property
    def _scopes(self) -> List[str]:
        """Return the scopes this loader requires."""
        required = ["sharepoint", "basic"]
        return required

    def lazy_load(self) -> Iterator[Document]:
        """Yield documents one at a time from the document library.

        Files under ``folder_path`` (if set) are parsed first, followed by the
        files named in ``object_ids`` (if set).

        Raises:
            ImportError: If the O365 package is not installed.
            ValueError: If ``document_library_id`` does not resolve to a Drive,
                or ``folder_path`` does not resolve to a Folder.
        """
        try:
            from O365.drive import Drive, Folder
        except ImportError:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            )

        account = self._auth()
        library = account.storage().get_drive(self.document_library_id)
        if not isinstance(library, Drive):
            raise ValueError(f"There isn't a Drive with id {self.document_library_id}.")

        parser = get_parser("default")

        if self.folder_path:
            source_folder = library.get_item_by_path(self.folder_path)
            if not isinstance(source_folder, Folder):
                raise ValueError(f"There isn't a folder with path {self.folder_path}.")
            for folder_blob in self._load_from_folder(source_folder):
                yield from parser.lazy_parse(folder_blob)

        if self.object_ids:
            for object_blob in self._load_from_object_ids(library, self.object_ids):
                yield from parser.lazy_parse(object_blob)

    def load(self) -> List[Document]:
        """Load all documents eagerly by draining ``lazy_load``."""
        documents = list(self.lazy_load())
        return documents