Add SharePoint Loader (#4284)

- Added a loader (`SharePointLoader`) that can pull documents (`pdf`, `docx`, `doc`) from the [SharePoint Document Library](https://support.microsoft.com/en-us/office/what-is-a-document-library-3b5976dd-65cf-4c9e-bf5a-713c10ca2872). - Added a Base Loader (`O365BaseLoader`) to be used for all Loaders that use [O365](https://github.com/O365/python-o365) Package - Code refactoring on `OneDriveLoader` to use the new `O365BaseLoader`. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
2025-09-12 12:59:07 +00:00 · 2023-08-21 11:49:07 -03:00
parent bb4f7936f9
commit f116e10d53
7 changed files with 426 additions and 183 deletions
--- a/libs/langchain/langchain/document_loaders/init.py
+++ b/libs/langchain/langchain/document_loaders/init.py
@@ -147,6 +147,7 @@ from langchain.document_loaders.rst import UnstructuredRSTLoader
 from langchain.document_loaders.rtf import UnstructuredRTFLoader
 from langchain.document_loaders.s3_directory import S3DirectoryLoader
 from langchain.document_loaders.s3_file import S3FileLoader
+from langchain.document_loaders.sharepoint import SharePointLoader
 from langchain.document_loaders.sitemap import SitemapLoader
 from langchain.document_loaders.slack_directory import SlackDirectoryLoader
 from langchain.document_loaders.snowflake_loader import SnowflakeLoader
@@ -316,6 +317,7 @@ __all__ = [
    "S3FileLoader",
    "SRTLoader",
    "SeleniumURLLoader",
+    "SharePointLoader",
    "SitemapLoader",
    "SlackDirectoryLoader",
    "SnowflakeLoader",
--- a/libs/langchain/langchain/document_loaders/base_o365.py
+++ b/libs/langchain/langchain/document_loaders/base_o365.py
@@ -0,0 +1,182 @@
+"""Base class for all loaders that use the O365 package."""
+from __future__ import annotations
+
+import logging
+import os
+import tempfile
+from abc import abstractmethod
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader
+from langchain.document_loaders.blob_loaders.schema import Blob
+from langchain.pydantic_v1 import BaseModel, BaseSettings, Field, FilePath, SecretStr
+
+if TYPE_CHECKING:
+    from O365 import Account
+    from O365.drive import Drive, Folder
+
+logger = logging.getLogger(__name__)
+
+CHUNK_SIZE = 1024 * 1024 * 5
+
+
+class _O365Settings(BaseSettings):
+    """Office 365 client credentials loaded from env vars or a `.env` file."""
+
+    client_id: str = Field(..., env="O365_CLIENT_ID")
+    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")
+
+    class Config:
+        env_prefix = ""
+        # Spelling fixed: the original `case_sentive` is not a recognised
+        # option name, so the intended setting was never actually applied.
+        case_sensitive = False
+        env_file = ".env"
+
+
+class _O365TokenStorage(BaseSettings):
+    """Filesystem location of the O365 token file used by token-based auth."""
+
+    token_path: FilePath = Path.home().joinpath(".credentials", "o365_token.txt")
+
+
+class _FileType(str, Enum):
+    """File types the O365 loaders can select, valued by file extension.
+
+    `fetch_mime_types` compares these string values to pick a MIME type.
+    """
+
+    DOC = "doc"
+    DOCX = "docx"
+    PDF = "pdf"
+
+
+def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
+    """Map the extension of each given file type to its MIME type.
+
+    Args:
+        file_types: File types to look up.
+
+    Returns:
+        A dict keyed by extension ("doc", "docx", "pdf"), in the order the
+        types were given. Types without a known MIME type are left out.
+    """
+    known_mime_types = {
+        "doc": "application/msword",
+        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # noqa: E501
+        "pdf": "application/pdf",
+    }
+    extensions = (file_type.value for file_type in file_types)
+    return {
+        extension: known_mime_types[extension]
+        for extension in extensions
+        if extension in known_mime_types
+    }
+
+
+class O365BaseLoader(BaseLoader, BaseModel):
+    """Base class for loaders that fetch files through the O365 package.
+
+    Subclasses declare the file types they accept (`_file_types`) and the
+    scopes to request (`_scopes`). This class handles authentication and
+    downloads the matching files, exposing them as blobs.
+    """
+
+    settings: _O365Settings = Field(default_factory=_O365Settings)
+    """Settings for the Office365 API client."""
+    auth_with_token: bool = False
+    """Whether to authenticate with a token or not. Defaults to False."""
+    chunk_size: Union[int, str] = CHUNK_SIZE
+    """Number of bytes to retrieve from each api call to the server. int or 'auto'."""
+
+    @property
+    @abstractmethod
+    def _file_types(self) -> Sequence[_FileType]:
+        """Return supported file types."""
+
+    @property
+    def _fetch_mime_types(self) -> Dict[str, str]:
+        """Return a dict of supported file types to corresponding mime types."""
+        return fetch_mime_types(self._file_types)
+
+    @property
+    @abstractmethod
+    def _scopes(self) -> List[str]:
+        """Return required scopes."""
+
+    def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
+        """Lazily load the files of a folder whose MIME type is supported.
+
+        Matching files are downloaded into a temporary directory that exists
+        only while this generator is being consumed.
+        NOTE(review): the yielded blobs presumably point at files inside that
+        directory, so each blob should be used before advancing the iterator
+        - confirm against `FileSystemBlobLoader`.
+
+        Args:
+            folder: The Folder instance whose items are to be loaded.
+
+        Yields:
+            Blob instances for the downloaded files.
+        """
+        # Build the lookup once instead of re-creating a list for every item.
+        allowed_mime_types = set(self._fetch_mime_types.values())
+        items = folder.get_items()
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Guard kept from the original code; the parent of a freshly
+            # created temporary directory normally exists already.
+            os.makedirs(os.path.dirname(temp_dir), exist_ok=True)
+            for file in items:
+                if file.is_file and file.mime_type in allowed_mime_types:
+                    file.download(to_path=temp_dir, chunk_size=self.chunk_size)
+            loader = FileSystemBlobLoader(path=temp_dir)
+            yield from loader.yield_blobs()
+
+    def _load_from_object_ids(
+        self, drive: Drive, object_ids: List[str]
+    ) -> Iterable[Blob]:
+        """Lazily load files specified by their object_ids from a drive.
+
+        Ids that do not resolve to an item are logged and skipped. Items that
+        are not files, or whose MIME type is not supported, are ignored.
+        Matching files are downloaded into a temporary directory that exists
+        only while this generator is being consumed.
+
+        Args:
+            drive: The Drive instance from which the files are to be loaded.
+            object_ids: A list of object_id strings, each identifying one item
+                in the drive.
+
+        Yields:
+            Blob instances for the downloaded files.
+        """
+        allowed_mime_types = set(self._fetch_mime_types.values())
+        with tempfile.TemporaryDirectory() as temp_dir:
+            for object_id in object_ids:
+                file = drive.get_item(object_id)
+                if not file:
+                    # Use the module logger (not the root `logging` module)
+                    # and keep the space between "with" and "object_id".
+                    logger.warning(
+                        "There isn't a file with object_id %s in drive %s.",
+                        object_id,
+                        drive,
+                    )
+                    continue
+                if file.is_file and file.mime_type in allowed_mime_types:
+                    file.download(to_path=temp_dir, chunk_size=self.chunk_size)
+            loader = FileSystemBlobLoader(path=temp_dir)
+            yield from loader.yield_blobs()
+
+    def _auth(self) -> Account:
+        """Build the O365 Account used by the loader.
+
+        With `auth_with_token` set, the token backend points at the file given
+        by `_O365TokenStorage().token_path` and `account.authenticate()` is
+        not called. Otherwise the token backend uses `~/.credentials` and
+        `account.authenticate()` is called before returning.
+
+        Returns:
+            The Account object.
+
+        Raises:
+            ImportError: If the O365 package is not installed.
+        """
+        try:
+            from O365 import Account, FileSystemTokenBackend
+        except ImportError as e:
+            raise ImportError(
+                "O365 package not found, please install it with `pip install o365`"
+            ) from e
+        if self.auth_with_token:
+            token_path = _O365TokenStorage().token_path
+            token_backend = FileSystemTokenBackend(
+                token_path=token_path.parent, token_filename=token_path.name
+            )
+        else:
+            token_backend = FileSystemTokenBackend(
+                token_path=Path.home() / ".credentials"
+            )
+        # Both modes share the same Account setup; only the backend differs.
+        account = Account(
+            credentials=(
+                self.settings.client_id,
+                self.settings.client_secret.get_secret_value(),
+            ),
+            scopes=self._scopes,
+            token_backend=token_backend,
+            raise_http_errors=False,
+        )
+        if not self.auth_with_token:
+            # make the auth
+            account.authenticate()
+        return account
--- a/libs/langchain/langchain/document_loaders/onedrive.py
+++ b/libs/langchain/langchain/document_loaders/onedrive.py
@@ -2,129 +2,49 @@
 from __future__ import annotations

 import logging
-import os
-import tempfile
-from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Iterator, List, Optional, Sequence, Union

 from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.onedrive_file import OneDriveFileLoader
-from langchain.pydantic_v1 import BaseModel, BaseSettings, Field, FilePath, SecretStr
+from langchain.document_loaders.base_o365 import (
+    O365BaseLoader,
+    _FileType,
+)
+from langchain.document_loaders.parsers.registry import get_parser
+from langchain.pydantic_v1 import Field

 if TYPE_CHECKING:
-    from O365 import Account
    from O365.drive import Drive, Folder

-SCOPES = ["offline_access", "Files.Read.All"]
 logger = logging.getLogger(__name__)


-class _OneDriveSettings(BaseSettings):
-    client_id: str = Field(..., env="O365_CLIENT_ID")
-    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")
-
-    class Config:
-        env_prefix = ""
-        case_sentive = False
-        env_file = ".env"
-
-
-class _OneDriveTokenStorage(BaseSettings):
-    token_path: FilePath = Field(Path.home() / ".credentials" / "o365_token.txt")
-
-
-class _FileType(str, Enum):
-    DOC = "doc"
-    DOCX = "docx"
-    PDF = "pdf"
-
-
-class _SupportedFileTypes(BaseModel):
-    file_types: List[_FileType]
-
-    def fetch_mime_types(self) -> Dict[str, str]:
-        mime_types_mapping = {}
-        for file_type in self.file_types:
-            if file_type.value == "doc":
-                mime_types_mapping[file_type.value] = "application/msword"
-            elif file_type.value == "docx":
-                mime_types_mapping[
-                    file_type.value
-                ] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"  # noqa: E501
-            elif file_type.value == "pdf":
-                mime_types_mapping[file_type.value] = "application/pdf"
-        return mime_types_mapping
-
-
-class OneDriveLoader(BaseLoader, BaseModel):
+class OneDriveLoader(O365BaseLoader):
    """Load from `Microsoft OneDrive`."""

-    settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
-    """ The settings for the OneDrive API client."""
    drive_id: str = Field(...)
    """ The ID of the OneDrive drive to load data from."""
    folder_path: Optional[str] = None
    """ The path to the folder to load data from."""
    object_ids: Optional[List[str]] = None
    """ The IDs of the objects to load data from."""
-    auth_with_token: bool = False
-    """ Whether to authenticate with a token or not. Defaults to False."""

-    def _auth(self) -> Type[Account]:
-        """
-        Authenticates the OneDrive API client using the specified
-        authentication method and returns the Account object.
+    @property
+    def _file_types(self) -> Sequence[_FileType]:
+        """Return the file types this loader accepts: doc, docx and pdf."""
+        return (_FileType.DOC, _FileType.DOCX, _FileType.PDF)

-        Returns:
-            Type[Account]: The authenticated Account object.
-        """
-        try:
-            from O365 import FileSystemTokenBackend
-        except ImportError:
-            raise ImportError(
-                "O365 package not found, please install it with `pip install o365`"
-            )
-        if self.auth_with_token:
-            token_storage = _OneDriveTokenStorage()
-            token_path = token_storage.token_path
-            token_backend = FileSystemTokenBackend(
-                token_path=token_path.parent, token_filename=token_path.name
-            )
-            account = Account(
-                credentials=(
-                    self.settings.client_id,
-                    self.settings.client_secret.get_secret_value(),
-                ),
-                scopes=SCOPES,
-                token_backend=token_backend,
-                **{"raise_http_errors": False},
-            )
-        else:
-            token_backend = FileSystemTokenBackend(
-                token_path=Path.home() / ".credentials"
-            )
-            account = Account(
-                credentials=(
-                    self.settings.client_id,
-                    self.settings.client_secret.get_secret_value(),
-                ),
-                scopes=SCOPES,
-                token_backend=token_backend,
-                **{"raise_http_errors": False},
-            )
-            # make the auth
-            account.authenticate()
-        return account
+    @property
+    def _scopes(self) -> List[str]:
+        """Return the scopes requested when authenticating for OneDrive."""
+        required_scopes = ["offline_access", "Files.Read.All"]
+        return required_scopes

-    def _get_folder_from_path(self, drive: Type[Drive]) -> Union[Folder, Drive]:
+    def _get_folder_from_path(self, drive: Drive) -> Union[Folder, Drive]:
        """
        Returns the folder or drive object located at the
        specified path relative to the given drive.

        Args:
-            drive (Type[Drive]): The root drive from which the folder path is relative.
+            drive (Drive): The root drive from which the folder path is relative.

        Returns:
            Union[Folder, Drive]: The folder or drive object
@@ -151,90 +71,26 @@ class OneDriveLoader(BaseLoader, BaseModel):
                raise FileNotFoundError("Path {} not exist.".format(self.folder_path))
        return subfolder_drive

-    def _load_from_folder(self, folder: Type[Folder]) -> List[Document]:
-        """
-        Loads all supported document files from the specified folder
-        and returns a list of Document objects.
-
-        Args:
-            folder (Type[Folder]): The folder object to load the documents from.
-
-        Returns:
-            List[Document]: A list of Document objects representing
-            the loaded documents.
-
-        """
-        docs = []
-        file_types = _SupportedFileTypes(file_types=["doc", "docx", "pdf"])
-        file_mime_types = file_types.fetch_mime_types()
-        items = folder.get_items()
-        with tempfile.TemporaryDirectory() as temp_dir:
-            file_path = f"{temp_dir}"
-            os.makedirs(os.path.dirname(file_path), exist_ok=True)
-            for file in items:
-                if file.is_file:
-                    if file.mime_type in list(file_mime_types.values()):
-                        loader = OneDriveFileLoader(file=file)
-                        docs.extend(loader.load())
-        return docs
-
-    def _load_from_object_ids(self, drive: Type[Drive]) -> List[Document]:
-        """
-        Loads all supported document files from the specified OneDrive
-        drive based on their object IDs and returns a list
-        of Document objects.
-
-        Args:
-            drive (Type[Drive]): The OneDrive drive object
-            to load the documents from.
-
-        Returns:
-            List[Document]: A list of Document objects representing
-            the loaded documents.
-        """
-        docs = []
-        file_types = _SupportedFileTypes(file_types=["doc", "docx", "pdf"])
-        file_mime_types = file_types.fetch_mime_types()
-        with tempfile.TemporaryDirectory() as temp_dir:
-            file_path = f"{temp_dir}"
-            os.makedirs(os.path.dirname(file_path), exist_ok=True)
-
-            for object_id in self.object_ids if self.object_ids else [""]:
-                file = drive.get_item(object_id)
-                if not file:
-                    logger.warning(
-                        "There isn't a file with "
-                        f"object_id {object_id} in drive {drive}."
-                    )
-                    continue
-                if file.is_file:
-                    if file.mime_type in list(file_mime_types.values()):
-                        loader = OneDriveFileLoader(file=file)
-                        docs.extend(loader.load())
-        return docs
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazily yield documents parsed from the configured OneDrive drive.
+
+        Files under `folder_path` (when set) are handled first, then the files
+        listed in `object_ids` (when set). Each downloaded blob is parsed with
+        the "default" parser from the registry.
+
+        Raises:
+            ImportError: If the O365 package is not installed.
+            ValueError: If `drive_id` does not resolve to a Drive.
+        """
+        try:
+            from O365.drive import Drive
+        except ImportError:
+            raise ImportError(
+                "O365 package not found, please install it with `pip install o365`"
+            )
+        storage = self._auth().storage()
+        drive = storage.get_drive(self.drive_id)
+        if not isinstance(drive, Drive):
+            raise ValueError(f"There isn't a Drive with id {self.drive_id}.")
+        parser = get_parser("default")
+
+        def pending_blobs() -> Iterator:
+            # Folder contents first, then explicitly requested objects.
+            if self.folder_path:
+                yield from self._load_from_folder(self._get_folder_from_path(drive))
+            if self.object_ids:
+                yield from self._load_from_object_ids(drive, self.object_ids)
+
+        for blob in pending_blobs():
+            yield from parser.lazy_parse(blob)

    def load(self) -> List[Document]:
-        """
-        Loads all supported document files from the specified OneDrive drive
-        and return a list of Document objects.
-
-        Returns:
-            List[Document]: A list of Document objects
-            representing the loaded documents.
-
-        Raises:
-            ValueError: If the specified drive ID
-            does not correspond to a drive in the OneDrive storage.
-        """
-        account = self._auth()
-        storage = account.storage()
-        drive = storage.get_drive(self.drive_id)
-        docs: List[Document] = []
-        if not drive:
-            raise ValueError(f"There isn't a drive with id {self.drive_id}.")
-        if self.folder_path:
-            folder = self._get_folder_from_path(drive=drive)
-            docs.extend(self._load_from_folder(folder=folder))
-        elif self.object_ids:
-            docs.extend(self._load_from_object_ids(drive=drive))
-        return docs
+        """Load all documents."""
+        return list(self.lazy_load())
--- a/libs/langchain/langchain/document_loaders/parsers/msword.py
+++ b/libs/langchain/langchain/document_loaders/parsers/msword.py
@@ -0,0 +1,34 @@
+from typing import Iterator
+
+from langchain.document_loaders.base import BaseBlobParser
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.schema import Document
+
+
+class MsWordParser(BaseBlobParser):
+    """Parse Microsoft Word blobs (doc / docx) with `unstructured`."""
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Yield a single Document holding the text of the Word file in `blob`.
+
+        The partitioner is chosen from the blob's MIME type, and the string
+        forms of the resulting elements are joined with blank lines.
+
+        Raises:
+            ImportError: If `unstructured` cannot be imported.
+            ValueError: If the blob's MIME type is neither doc nor docx.
+        """
+        try:
+            from unstructured.partition.doc import partition_doc
+            from unstructured.partition.docx import partition_docx
+        except ImportError as e:
+            raise ImportError(
+                "Could not import unstructured, please install with `pip install "
+                "unstructured`."
+            ) from e
+
+        partitioners = {
+            "application/msword": partition_doc,
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
+                partition_docx
+            ),
+        }
+        partition = partitioners.get(blob.mimetype)
+        if partition is None:
+            raise ValueError("This blob type is not supported for this parser.")
+        with blob.as_bytes_io() as word_document:
+            page_content = "\n\n".join(
+                str(element) for element in partition(file=word_document)
+            )
+            yield Document(page_content=page_content, metadata={"source": blob.source})
--- a/libs/langchain/langchain/document_loaders/parsers/registry.py
+++ b/libs/langchain/langchain/document_loaders/parsers/registry.py
@@ -1,6 +1,7 @@
 """Module includes a registry of default parser configurations."""
 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
+from langchain.document_loaders.parsers.msword import MsWordParser
 from langchain.document_loaders.parsers.pdf import PyMuPDFParser
 from langchain.document_loaders.parsers.txt import TextParser

@@ -11,6 +12,10 @@ def _get_default_parser() -> BaseBlobParser:
        handlers={
            "application/pdf": PyMuPDFParser(),
            "text/plain": TextParser(),
+            "application/msword": MsWordParser(),
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
+                MsWordParser()
+            ),
        },
        fallback_parser=None,
    )
--- a/libs/langchain/langchain/document_loaders/sharepoint.py
+++ b/libs/langchain/langchain/document_loaders/sharepoint.py
@@ -0,0 +1,59 @@
+"""Loader that loads data from a SharePoint Document Library."""
+from __future__ import annotations
+
+from typing import Iterator, List, Optional, Sequence
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base_o365 import (
+    O365BaseLoader,
+    _FileType,
+)
+from langchain.document_loaders.parsers.registry import get_parser
+from langchain.pydantic_v1 import Field
+
+
+class SharePointLoader(O365BaseLoader):
+    """Load documents from a `SharePoint` document library."""
+
+    document_library_id: str = Field(...)
+    """ The ID of the SharePoint document library to load data from."""
+    folder_path: Optional[str] = None
+    """ The path to the folder to load data from."""
+    object_ids: Optional[List[str]] = None
+    """ The IDs of the objects to load data from."""
+
+    @property
+    def _file_types(self) -> Sequence[_FileType]:
+        """Return the file types this loader accepts: doc, docx and pdf."""
+        return (_FileType.DOC, _FileType.DOCX, _FileType.PDF)
+
+    @property
+    def _scopes(self) -> List[str]:
+        """Return the scopes requested when authenticating for SharePoint."""
+        required_scopes = ["sharepoint", "basic"]
+        return required_scopes
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazily yield documents parsed from the configured document library.
+
+        Files under `folder_path` (when set) are handled first, then the files
+        listed in `object_ids` (when set). Each downloaded blob is parsed with
+        the "default" parser from the registry.
+
+        Raises:
+            ImportError: If the O365 package is not installed.
+            ValueError: If `document_library_id` does not resolve to a Drive,
+                or `folder_path` does not resolve to a Folder.
+        """
+        try:
+            from O365.drive import Drive, Folder
+        except ImportError:
+            raise ImportError(
+                "O365 package not found, please install it with `pip install o365`"
+            )
+        storage = self._auth().storage()
+        drive = storage.get_drive(self.document_library_id)
+        if not isinstance(drive, Drive):
+            raise ValueError(f"There isn't a Drive with id {self.document_library_id}.")
+        parser = get_parser("default")
+
+        def pending_blobs() -> Iterator:
+            # Folder contents first, then explicitly requested objects.
+            if self.folder_path:
+                target_folder = drive.get_item_by_path(self.folder_path)
+                if not isinstance(target_folder, Folder):
+                    raise ValueError(
+                        f"There isn't a folder with path {self.folder_path}."
+                    )
+                yield from self._load_from_folder(target_folder)
+            if self.object_ids:
+                yield from self._load_from_object_ids(drive, self.object_ids)
+
+        for blob in pending_blobs():
+            yield from parser.lazy_parse(blob)
+
+    def load(self) -> List[Document]:
+        """Load every document eagerly by exhausting `lazy_load`."""
+        return list(self.lazy_load())