mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-12 12:59:07 +00:00
Add SharePoint Loader (#4284)
- Added a loader (`SharePointLoader`) that can pull documents (`pdf`, `docx`, `doc`) from the [SharePoint Document Library](https://support.microsoft.com/en-us/office/what-is-a-document-library-3b5976dd-65cf-4c9e-bf5a-713c10ca2872). - Added a Base Loader (`O365BaseLoader`) to be used for all Loaders that use [O365](https://github.com/O365/python-o365) Package - Code refactoring on `OneDriveLoader` to use the new `O365BaseLoader`. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
@@ -147,6 +147,7 @@ from langchain.document_loaders.rst import UnstructuredRSTLoader
|
||||
from langchain.document_loaders.rtf import UnstructuredRTFLoader
|
||||
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
||||
from langchain.document_loaders.s3_file import S3FileLoader
|
||||
from langchain.document_loaders.sharepoint import SharePointLoader
|
||||
from langchain.document_loaders.sitemap import SitemapLoader
|
||||
from langchain.document_loaders.slack_directory import SlackDirectoryLoader
|
||||
from langchain.document_loaders.snowflake_loader import SnowflakeLoader
|
||||
@@ -316,6 +317,7 @@ __all__ = [
|
||||
"S3FileLoader",
|
||||
"SRTLoader",
|
||||
"SeleniumURLLoader",
|
||||
"SharePointLoader",
|
||||
"SitemapLoader",
|
||||
"SlackDirectoryLoader",
|
||||
"SnowflakeLoader",
|
||||
|
182
libs/langchain/langchain/document_loaders/base_o365.py
Normal file
182
libs/langchain/langchain/document_loaders/base_o365.py
Normal file
@@ -0,0 +1,182 @@
|
||||
"""Base class for all loaders that uses O365 Package"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from abc import abstractmethod
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union
|
||||
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader
|
||||
from langchain.document_loaders.blob_loaders.schema import Blob
|
||||
from langchain.pydantic_v1 import BaseModel, BaseSettings, Field, FilePath, SecretStr
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from O365 import Account
|
||||
from O365.drive import Drive, Folder
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
CHUNK_SIZE = 1024 * 1024 * 5
|
||||
|
||||
|
||||
class _O365Settings(BaseSettings):
    """Settings for the Office365 API client, read from env vars / .env file."""

    client_id: str = Field(..., env="O365_CLIENT_ID")
    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")

    class Config:
        env_prefix = ""
        # Fixed typo: was `case_sentive`, which pydantic silently ignored as an
        # unknown option. (pydantic BaseSettings is case-insensitive by default,
        # so runtime behavior is unchanged; the option is now actually applied.)
        case_sensitive = False
        env_file = ".env"
|
||||
|
||||
|
||||
class _O365TokenStorage(BaseSettings):
    # Filesystem location of the cached O365 auth token, used when
    # auth_with_token=True. Defaults to ~/.credentials/o365_token.txt.
    token_path: FilePath = Path.home() / ".credentials" / "o365_token.txt"
|
||||
|
||||
|
||||
class _FileType(str, Enum):
    # File extensions the O365-based loaders know how to download and parse.
    # Values double as the keys of the extension -> MIME-type mapping built by
    # fetch_mime_types().
    DOC = "doc"
    DOCX = "docx"
    PDF = "pdf"
|
||||
|
||||
|
||||
# Single source of truth for extension -> MIME type; replaces the original
# if/elif chain, which repeated each extension string twice.
_MIME_TYPES: Dict[str, str] = {
    "doc": "application/msword",
    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # noqa: E501
    "pdf": "application/pdf",
}


def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
    """Return a mapping of file extension to MIME type for ``file_types``.

    Args:
        file_types: File types (by extension) to include in the mapping.

    Returns:
        Dict mapping each requested extension to its MIME type. Unknown
        file types are silently skipped, matching the original behavior
        of the if/elif chain.
    """
    return {
        file_type.value: _MIME_TYPES[file_type.value]
        for file_type in file_types
        if file_type.value in _MIME_TYPES
    }
|
||||
|
||||
|
||||
class O365BaseLoader(BaseLoader, BaseModel):
    """Base class for all loaders that use the O365 package.

    Subclasses declare the file types they support (``_file_types``) and the
    OAuth scopes they require (``_scopes``); this base class provides
    authentication and lazy download of matching files as ``Blob`` objects.
    """

    settings: _O365Settings = Field(default_factory=_O365Settings)
    """Settings for the Office365 API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    chunk_size: Union[int, str] = CHUNK_SIZE
    """Number of bytes to retrieve from each api call to the server. int or 'auto'."""

    @property
    @abstractmethod
    def _file_types(self) -> Sequence[_FileType]:
        """Return supported file types."""

    @property
    def _fetch_mime_types(self) -> Dict[str, str]:
        """Return a dict of supported file types to corresponding mime types."""
        return fetch_mime_types(self._file_types)

    @property
    @abstractmethod
    def _scopes(self) -> List[str]:
        """Return required scopes."""

    def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
        """Lazily load all files of the supported MIME types from a folder.

        Args:
            folder: The Folder instance from which the files are to be loaded.

        Yields:
            Blob instances, binary representations of the files downloaded
            from the folder.
        """
        file_mime_types = self._fetch_mime_types
        items = folder.get_items()
        # NOTE: the original also called
        # `os.makedirs(os.path.dirname(temp_dir), exist_ok=True)` here, which
        # creates the *parent* of a directory TemporaryDirectory already
        # created -- a guaranteed no-op, removed.
        with tempfile.TemporaryDirectory() as temp_dir:
            for file in items:
                if file.is_file:
                    if file.mime_type in list(file_mime_types.values()):
                        file.download(to_path=temp_dir, chunk_size=self.chunk_size)
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()

    def _load_from_object_ids(
        self, drive: Drive, object_ids: List[str]
    ) -> Iterable[Blob]:
        """Lazily load files specified by their object_ids from a drive.

        Args:
            drive: The Drive instance from which the files are to be loaded.
            object_ids: A list of object_id strings, each uniquely identifying
                a file in the drive. Unknown ids are logged and skipped.

        Yields:
            Blob instances, binary representations of the files downloaded
            from the drive using the specified object_ids.
        """
        file_mime_types = self._fetch_mime_types
        with tempfile.TemporaryDirectory() as temp_dir:
            for object_id in object_ids:
                file = drive.get_item(object_id)
                if not file:
                    # Fixed: use the module logger (was the root `logging`
                    # module) and restore the missing space between the two
                    # concatenated message fragments ("withobject_id").
                    logger.warning(
                        "There isn't a file with "
                        f"object_id {object_id} in drive {drive}."
                    )
                    continue
                if file.is_file:
                    if file.mime_type in list(file_mime_types.values()):
                        file.download(to_path=temp_dir, chunk_size=self.chunk_size)
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()

    def _auth(self) -> Account:
        """Authenticate the O365 API client.

        Returns:
            The authenticated Account object.

        Raises:
            ImportError: If the O365 package is not installed.
        """
        try:
            from O365 import Account, FileSystemTokenBackend
        except ImportError:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            )
        # Only the token backend differs between the two auth modes; the
        # original duplicated the entire Account(...) construction verbatim
        # in both branches.
        if self.auth_with_token:
            token_path = _O365TokenStorage().token_path
            token_backend = FileSystemTokenBackend(
                token_path=token_path.parent, token_filename=token_path.name
            )
        else:
            token_backend = FileSystemTokenBackend(
                token_path=Path.home() / ".credentials"
            )
        account = Account(
            credentials=(
                self.settings.client_id,
                self.settings.client_secret.get_secret_value(),
            ),
            scopes=self._scopes,
            token_backend=token_backend,
            raise_http_errors=False,
        )
        # make the auth
        account.authenticate()
        return account
|
@@ -2,129 +2,49 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
|
||||
from typing import TYPE_CHECKING, Iterator, List, Optional, Sequence, Union
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.onedrive_file import OneDriveFileLoader
|
||||
from langchain.pydantic_v1 import BaseModel, BaseSettings, Field, FilePath, SecretStr
|
||||
from langchain.document_loaders.base_o365 import (
|
||||
O365BaseLoader,
|
||||
_FileType,
|
||||
)
|
||||
from langchain.document_loaders.parsers.registry import get_parser
|
||||
from langchain.pydantic_v1 import Field
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from O365 import Account
|
||||
from O365.drive import Drive, Folder
|
||||
|
||||
SCOPES = ["offline_access", "Files.Read.All"]
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class _OneDriveSettings(BaseSettings):
|
||||
client_id: str = Field(..., env="O365_CLIENT_ID")
|
||||
client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")
|
||||
|
||||
class Config:
|
||||
env_prefix = ""
|
||||
case_sentive = False
|
||||
env_file = ".env"
|
||||
|
||||
|
||||
class _OneDriveTokenStorage(BaseSettings):
|
||||
token_path: FilePath = Field(Path.home() / ".credentials" / "o365_token.txt")
|
||||
|
||||
|
||||
class _FileType(str, Enum):
|
||||
DOC = "doc"
|
||||
DOCX = "docx"
|
||||
PDF = "pdf"
|
||||
|
||||
|
||||
class _SupportedFileTypes(BaseModel):
|
||||
file_types: List[_FileType]
|
||||
|
||||
def fetch_mime_types(self) -> Dict[str, str]:
|
||||
mime_types_mapping = {}
|
||||
for file_type in self.file_types:
|
||||
if file_type.value == "doc":
|
||||
mime_types_mapping[file_type.value] = "application/msword"
|
||||
elif file_type.value == "docx":
|
||||
mime_types_mapping[
|
||||
file_type.value
|
||||
] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" # noqa: E501
|
||||
elif file_type.value == "pdf":
|
||||
mime_types_mapping[file_type.value] = "application/pdf"
|
||||
return mime_types_mapping
|
||||
|
||||
|
||||
class OneDriveLoader(BaseLoader, BaseModel):
|
||||
class OneDriveLoader(O365BaseLoader):
|
||||
"""Load from `Microsoft OneDrive`."""
|
||||
|
||||
settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
|
||||
""" The settings for the OneDrive API client."""
|
||||
drive_id: str = Field(...)
|
||||
""" The ID of the OneDrive drive to load data from."""
|
||||
folder_path: Optional[str] = None
|
||||
""" The path to the folder to load data from."""
|
||||
object_ids: Optional[List[str]] = None
|
||||
""" The IDs of the objects to load data from."""
|
||||
auth_with_token: bool = False
|
||||
""" Whether to authenticate with a token or not. Defaults to False."""
|
||||
|
||||
def _auth(self) -> Type[Account]:
|
||||
"""
|
||||
Authenticates the OneDrive API client using the specified
|
||||
authentication method and returns the Account object.
|
||||
@property
|
||||
def _file_types(self) -> Sequence[_FileType]:
|
||||
"""Return supported file types."""
|
||||
return _FileType.DOC, _FileType.DOCX, _FileType.PDF
|
||||
|
||||
Returns:
|
||||
Type[Account]: The authenticated Account object.
|
||||
"""
|
||||
try:
|
||||
from O365 import FileSystemTokenBackend
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"O365 package not found, please install it with `pip install o365`"
|
||||
)
|
||||
if self.auth_with_token:
|
||||
token_storage = _OneDriveTokenStorage()
|
||||
token_path = token_storage.token_path
|
||||
token_backend = FileSystemTokenBackend(
|
||||
token_path=token_path.parent, token_filename=token_path.name
|
||||
)
|
||||
account = Account(
|
||||
credentials=(
|
||||
self.settings.client_id,
|
||||
self.settings.client_secret.get_secret_value(),
|
||||
),
|
||||
scopes=SCOPES,
|
||||
token_backend=token_backend,
|
||||
**{"raise_http_errors": False},
|
||||
)
|
||||
else:
|
||||
token_backend = FileSystemTokenBackend(
|
||||
token_path=Path.home() / ".credentials"
|
||||
)
|
||||
account = Account(
|
||||
credentials=(
|
||||
self.settings.client_id,
|
||||
self.settings.client_secret.get_secret_value(),
|
||||
),
|
||||
scopes=SCOPES,
|
||||
token_backend=token_backend,
|
||||
**{"raise_http_errors": False},
|
||||
)
|
||||
# make the auth
|
||||
account.authenticate()
|
||||
return account
|
||||
@property
|
||||
def _scopes(self) -> List[str]:
|
||||
"""Return required scopes."""
|
||||
return ["offline_access", "Files.Read.All"]
|
||||
|
||||
def _get_folder_from_path(self, drive: Type[Drive]) -> Union[Folder, Drive]:
|
||||
def _get_folder_from_path(self, drive: Drive) -> Union[Folder, Drive]:
|
||||
"""
|
||||
Returns the folder or drive object located at the
|
||||
specified path relative to the given drive.
|
||||
|
||||
Args:
|
||||
drive (Type[Drive]): The root drive from which the folder path is relative.
|
||||
drive (Drive): The root drive from which the folder path is relative.
|
||||
|
||||
Returns:
|
||||
Union[Folder, Drive]: The folder or drive object
|
||||
@@ -151,90 +71,26 @@ class OneDriveLoader(BaseLoader, BaseModel):
|
||||
raise FileNotFoundError("Path {} not exist.".format(self.folder_path))
|
||||
return subfolder_drive
|
||||
|
||||
def _load_from_folder(self, folder: Type[Folder]) -> List[Document]:
|
||||
"""
|
||||
Loads all supported document files from the specified folder
|
||||
and returns a list of Document objects.
|
||||
|
||||
Args:
|
||||
folder (Type[Folder]): The folder object to load the documents from.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of Document objects representing
|
||||
the loaded documents.
|
||||
|
||||
"""
|
||||
docs = []
|
||||
file_types = _SupportedFileTypes(file_types=["doc", "docx", "pdf"])
|
||||
file_mime_types = file_types.fetch_mime_types()
|
||||
items = folder.get_items()
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
file_path = f"{temp_dir}"
|
||||
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||
for file in items:
|
||||
if file.is_file:
|
||||
if file.mime_type in list(file_mime_types.values()):
|
||||
loader = OneDriveFileLoader(file=file)
|
||||
docs.extend(loader.load())
|
||||
return docs
|
||||
|
||||
def _load_from_object_ids(self, drive: Type[Drive]) -> List[Document]:
|
||||
"""
|
||||
Loads all supported document files from the specified OneDrive
|
||||
drive based on their object IDs and returns a list
|
||||
of Document objects.
|
||||
|
||||
Args:
|
||||
drive (Type[Drive]): The OneDrive drive object
|
||||
to load the documents from.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of Document objects representing
|
||||
the loaded documents.
|
||||
"""
|
||||
docs = []
|
||||
file_types = _SupportedFileTypes(file_types=["doc", "docx", "pdf"])
|
||||
file_mime_types = file_types.fetch_mime_types()
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
file_path = f"{temp_dir}"
|
||||
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||
|
||||
for object_id in self.object_ids if self.object_ids else [""]:
|
||||
file = drive.get_item(object_id)
|
||||
if not file:
|
||||
logger.warning(
|
||||
"There isn't a file with "
|
||||
f"object_id {object_id} in drive {drive}."
|
||||
)
|
||||
continue
|
||||
if file.is_file:
|
||||
if file.mime_type in list(file_mime_types.values()):
|
||||
loader = OneDriveFileLoader(file=file)
|
||||
docs.extend(loader.load())
|
||||
return docs
|
||||
def lazy_load(self) -> Iterator[Document]:
|
||||
"""Load documents lazily. Use this when working at a large scale."""
|
||||
try:
|
||||
from O365.drive import Drive
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"O365 package not found, please install it with `pip install o365`"
|
||||
)
|
||||
drive = self._auth().storage().get_drive(self.drive_id)
|
||||
if not isinstance(drive, Drive):
|
||||
raise ValueError(f"There isn't a Drive with id {self.drive_id}.")
|
||||
blob_parser = get_parser("default")
|
||||
if self.folder_path:
|
||||
folder = self._get_folder_from_path(drive)
|
||||
for blob in self._load_from_folder(folder):
|
||||
yield from blob_parser.lazy_parse(blob)
|
||||
if self.object_ids:
|
||||
for blob in self._load_from_object_ids(drive, self.object_ids):
|
||||
yield from blob_parser.lazy_parse(blob)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""
|
||||
Loads all supported document files from the specified OneDrive drive
|
||||
and return a list of Document objects.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of Document objects
|
||||
representing the loaded documents.
|
||||
|
||||
Raises:
|
||||
ValueError: If the specified drive ID
|
||||
does not correspond to a drive in the OneDrive storage.
|
||||
"""
|
||||
account = self._auth()
|
||||
storage = account.storage()
|
||||
drive = storage.get_drive(self.drive_id)
|
||||
docs: List[Document] = []
|
||||
if not drive:
|
||||
raise ValueError(f"There isn't a drive with id {self.drive_id}.")
|
||||
if self.folder_path:
|
||||
folder = self._get_folder_from_path(drive=drive)
|
||||
docs.extend(self._load_from_folder(folder=folder))
|
||||
elif self.object_ids:
|
||||
docs.extend(self._load_from_object_ids(drive=drive))
|
||||
return docs
|
||||
"""Load all documents."""
|
||||
return list(self.lazy_load())
|
||||
|
34
libs/langchain/langchain/document_loaders/parsers/msword.py
Normal file
34
libs/langchain/langchain/document_loaders/parsers/msword.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from typing import Iterator
|
||||
|
||||
from langchain.document_loaders.base import BaseBlobParser
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.schema import Document
|
||||
|
||||
|
||||
class MsWordParser(BaseBlobParser):
    """Parse Microsoft Word blobs (.doc / .docx) into Documents."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse ``blob`` into a single Document.

        Args:
            blob: Blob whose mimetype must be ``application/msword`` (.doc)
                or the OOXML word-processing mimetype (.docx).

        Yields:
            One Document with the text of all partitioned elements joined by
            blank lines and ``source`` metadata taken from the blob.

        Raises:
            ImportError: If ``unstructured`` is not installed.
            ValueError: If the blob's mimetype is not a supported Word type.
        """
        try:
            from unstructured.partition.doc import partition_doc
            from unstructured.partition.docx import partition_docx
        except ImportError as e:
            raise ImportError(
                "Could not import unstructured, please install with `pip install "
                "unstructured`."
            ) from e

        mime_type_parser = {
            "application/msword": partition_doc,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
                partition_docx
            ),
        }
        # Check membership against the dispatch table itself instead of a
        # second, hand-maintained tuple of the same mimetypes (the original
        # duplicated the list, inviting drift between check and dispatch).
        if blob.mimetype not in mime_type_parser:
            raise ValueError("This blob type is not supported for this parser.")
        with blob.as_bytes_io() as word_document:
            elements = mime_type_parser[blob.mimetype](file=word_document)
            text = "\n\n".join([str(el) for el in elements])
            metadata = {"source": blob.source}
            yield Document(page_content=text, metadata=metadata)
|
@@ -1,6 +1,7 @@
|
||||
"""Module includes a registry of default parser configurations."""
|
||||
from langchain.document_loaders.base import BaseBlobParser
|
||||
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
|
||||
from langchain.document_loaders.parsers.msword import MsWordParser
|
||||
from langchain.document_loaders.parsers.pdf import PyMuPDFParser
|
||||
from langchain.document_loaders.parsers.txt import TextParser
|
||||
|
||||
@@ -11,6 +12,10 @@ def _get_default_parser() -> BaseBlobParser:
|
||||
handlers={
|
||||
"application/pdf": PyMuPDFParser(),
|
||||
"text/plain": TextParser(),
|
||||
"application/msword": MsWordParser(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
|
||||
MsWordParser()
|
||||
),
|
||||
},
|
||||
fallback_parser=None,
|
||||
)
|
||||
|
59
libs/langchain/langchain/document_loaders/sharepoint.py
Normal file
59
libs/langchain/langchain/document_loaders/sharepoint.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Loader that loads data from Sharepoint Document Library"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterator, List, Optional, Sequence
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base_o365 import (
|
||||
O365BaseLoader,
|
||||
_FileType,
|
||||
)
|
||||
from langchain.document_loaders.parsers.registry import get_parser
|
||||
from langchain.pydantic_v1 import Field
|
||||
|
||||
|
||||
class SharePointLoader(O365BaseLoader):
    """Load documents from a `SharePoint` document library.

    Authentication and blob download are inherited from ``O365BaseLoader``;
    parsing is delegated to the default blob parser registry.
    """

    document_library_id: str = Field(...)
    """ The ID of the SharePoint document library to load data from."""
    folder_path: Optional[str] = None
    """ The path to the folder to load data from."""
    object_ids: Optional[List[str]] = None
    """ The IDs of the objects to load data from."""

    @property
    def _file_types(self) -> Sequence[_FileType]:
        """Return supported file types."""
        return _FileType.DOC, _FileType.DOCX, _FileType.PDF

    @property
    def _scopes(self) -> List[str]:
        """Return required scopes."""
        return ["sharepoint", "basic"]

    def lazy_load(self) -> Iterator[Document]:
        """Load documents lazily. Use this when working at a large scale.

        Raises:
            ImportError: If the ``O365`` package is not installed.
            ValueError: If ``document_library_id`` does not resolve to a
                Drive, or ``folder_path`` does not resolve to a Folder.
        """
        try:
            from O365.drive import Drive, Folder
        except ImportError:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            )
        drive = self._auth().storage().get_drive(self.document_library_id)
        # get_drive may return a non-Drive error object rather than raising,
        # hence the explicit isinstance check.
        if not isinstance(drive, Drive):
            raise ValueError(f"There isn't a Drive with id {self.document_library_id}.")
        blob_parser = get_parser("default")
        if self.folder_path:
            target_folder = drive.get_item_by_path(self.folder_path)
            if not isinstance(target_folder, Folder):
                raise ValueError(f"There isn't a folder with path {self.folder_path}.")
            for blob in self._load_from_folder(target_folder):
                yield from blob_parser.lazy_parse(blob)
        # NOTE: folder_path and object_ids are not mutually exclusive; if both
        # are set, documents from both sources are yielded.
        if self.object_ids:
            for blob in self._load_from_object_ids(drive, self.object_ids):
                yield from blob_parser.lazy_parse(blob)

    def load(self) -> List[Document]:
        """Load all documents."""
        return list(self.lazy_load())
|
Reference in New Issue
Block a user