community: Allow non-default parsers in SharePointLoader and OneDriveLoader (#27716)

## What does this PR do?

### Currently `O365BaseLoader` (and consequently both derived loaders) is limited to `pdf`, `doc`, and `docx` files.

- **Solution: here we introduce a _handlers_ attribute that allows custom handlers to be passed in. This is done in _dict_ form:**

**Example:**

```python
from langchain_community.document_loaders.parsers.documentloader_adapter import DocumentLoaderAsParser
# PR for DocumentLoaderAsParser here: https://github.com/langchain-ai/langchain/pull/27749
from langchain_community.document_loaders.excel import UnstructuredExcelLoader

xlsx_parser = DocumentLoaderAsParser(UnstructuredExcelLoader, mode="paged")

# create dictionary mapping file types to handlers (parsers)
handlers = {
    "doc": MsWordParser(),
    "pdf": PDFMinerParser(),
    "txt": TextParser(),
    "xlsx": xlsx_parser,
}

loader = SharePointLoader(
    document_library_id="...",
    handlers=handlers,  # pass handlers to SharePointLoader
)
documents = loader.load()

# works the same in OneDriveLoader
loader = OneDriveLoader(
    document_library_id="...",
    handlers=handlers,
)
```

This dictionary is then passed to `MimeTypeBasedParser`, the same as in the [current implementation](https://github.com/langchain-ai/langchain/blob/5a2cfb49e0/libs/community/langchain_community/document_loaders/parsers/registry.py#L13).

### Currently `SharePointLoader` and `OneDriveLoader` are separate loaders that both inherit from `O365BaseLoader`

However, both of these implement the same functionality. The only differences are:

- `SharePointLoader` requires the argument `document_library_id`, whereas `OneDriveLoader` requires `drive_id`. These are just different names for the same thing.
- `SharePointLoader` implements significantly more features.
- **Solution: `OneDriveLoader` is replaced with an empty shell that just renames `drive_id` to `document_library_id` and inherits from `SharePointLoader`.**

**Dependencies:** None

**Twitter handle:** @martintriska1

If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
2025-09-07 22:11:51 +00:00 · 2024-11-06 23:44:34 +01:00
parent 482c168b3e
commit 90189f5639
5 changed files with 229 additions and 138 deletions
--- a/libs/community/langchain_community/document_loaders/sharepoint.py
+++ b/libs/community/langchain_community/document_loaders/sharepoint.py
@@ -4,7 +4,7 @@ from __future__ import annotations

 import json
 from pathlib import Path
-from typing import Any, Iterator, List, Optional, Sequence
+from typing import Any, Dict, Iterator, List, Optional

 import requests  # type: ignore
 from langchain_core.document_loaders import BaseLoader
@@ -13,9 +13,7 @@ from pydantic import Field

 from langchain_community.document_loaders.base_o365 import (
    O365BaseLoader,
-    _FileType,
 )
-from langchain_community.document_loaders.parsers.registry import get_parser


 class SharePointLoader(O365BaseLoader, BaseLoader):
@@ -36,14 +34,6 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
    load_extended_metadata: Optional[bool] = False
    """ Whether to load extended metadata. Size, Owner and full_path."""

-    @property
-    def _file_types(self) -> Sequence[_FileType]:
-        """Return supported file types.
-        Returns:
-            A sequence of supported file types.
-        """
-        return _FileType.DOC, _FileType.DOCX, _FileType.PDF
-
    @property
    def _scopes(self) -> List[str]:
        """Return required scopes.
@@ -67,7 +57,6 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
        drive = self._auth().storage().get_drive(self.document_library_id)
        if not isinstance(drive, Drive):
            raise ValueError(f"There isn't a Drive with id {self.document_library_id}.")
-        blob_parser = get_parser("default")
        if self.folder_path:
            target_folder = drive.get_item_by_path(self.folder_path)
            if not isinstance(target_folder, Folder):
@@ -79,7 +68,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
                if self.load_extended_metadata is True:
                    extended_metadata = self.get_extended_metadata(file_id)
                    extended_metadata.update({"source_full_url": target_folder.web_url})
-                for parsed_blob in blob_parser.lazy_parse(blob):
+                for parsed_blob in self._blob_parser.lazy_parse(blob):
                    if self.load_auth is True:
                        parsed_blob.metadata["authorized_identities"] = auth_identities
                    if self.load_extended_metadata is True:
@@ -96,7 +85,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
                if self.load_extended_metadata is True:
                    extended_metadata = self.get_extended_metadata(file_id)
                    extended_metadata.update({"source_full_url": target_folder.web_url})
-                for parsed_blob in blob_parser.lazy_parse(blob):
+                for parsed_blob in self._blob_parser.lazy_parse(blob):
                    if self.load_auth is True:
                        parsed_blob.metadata["authorized_identities"] = auth_identities
                    if self.load_extended_metadata is True:
@@ -109,7 +98,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
                    auth_identities = self.authorized_identities(file_id)
                if self.load_extended_metadata is True:
                    extended_metadata = self.get_extended_metadata(file_id)
-                for parsed_blob in blob_parser.lazy_parse(blob):
+                for parsed_blob in self._blob_parser.lazy_parse(blob):
                    if self.load_auth is True:
                        parsed_blob.metadata["authorized_identities"] = auth_identities
                    if self.load_extended_metadata is True:
@@ -126,7 +115,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
                    auth_identities = self.authorized_identities(file_id)
                if self.load_extended_metadata is True:
                    extended_metadata = self.get_extended_metadata(file_id)
-                for blob_part in blob_parser.lazy_parse(blob):
+                for blob_part in self._blob_parser.lazy_parse(blob):
                    blob_part.metadata.update(blob.metadata)
                    if self.load_auth is True:
                        blob_part.metadata["authorized_identities"] = auth_identities
@@ -182,7 +171,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
        data = json.loads(s)
        return data

-    def get_extended_metadata(self, file_id: str) -> dict:
+    def get_extended_metadata(self, file_id: str) -> Dict:
        """
        Retrieve extended metadata for a file in SharePoint.
        As of today, following fields are supported in the extended metadata: