diff --git a/docs/docs/integrations/document_loaders/microsoft_onedrive.ipynb b/docs/docs/integrations/document_loaders/microsoft_onedrive.ipynb index b42c141f8fe..20feef0f9cc 100644 --- a/docs/docs/integrations/document_loaders/microsoft_onedrive.ipynb +++ b/docs/docs/integrations/document_loaders/microsoft_onedrive.ipynb @@ -8,7 +8,7 @@ "\n", ">[Microsoft OneDrive](https://en.wikipedia.org/wiki/OneDrive) (formerly `SkyDrive`) is a file hosting service operated by Microsoft.\n", "\n", - "This notebook covers how to load documents from `OneDrive`. Currently, only docx, doc, and pdf files are supported.\n", + "This notebook covers how to load documents from `OneDrive`. By default the document loader loads `pdf`, `doc`, `docx` and `txt` files. You can load other file types by providing appropriate parsers (see more below).\n", "\n", "## Prerequisites\n", "1. Register an application with the [Microsoft identity platform](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) instructions.\n", @@ -77,15 +77,64 @@ "\n", "loader = OneDriveLoader(drive_id=\"YOUR DRIVE ID\", object_ids=[\"ID_1\", \"ID_2\"], auth_with_token=True)\n", "documents = loader.load()\n", - "```\n" + "```\n", + "\n", + "#### πŸ“‘ Choosing supported file types and preffered parsers\n", + "By default `OneDriveLoader` loads file types defined in [`document_loaders/parsers/registry`](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/document_loaders/parsers/registry.py#L10-L22) using the default parsers (see below).\n", + "```python\n", + "def _get_default_parser() -> BaseBlobParser:\n", + " \"\"\"Get default mime-type based parser.\"\"\"\n", + " return MimeTypeBasedParser(\n", + " handlers={\n", + " \"application/pdf\": PyMuPDFParser(),\n", + " \"text/plain\": TextParser(),\n", + " \"application/msword\": MsWordParser(),\n", + " \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\": (\n", + " 
MsWordParser()\n", + " ),\n", + " },\n", + " fallback_parser=None,\n", + " )\n", + "```\n", + "You can override this behavior by passing `handlers` argument to `OneDriveLoader`. \n", + "Pass a dictionary mapping either file extensions (like `\"doc\"`, `\"pdf\"`, etc.) \n", + "or MIME types (like `\"application/pdf\"`, `\"text/plain\"`, etc.) to parsers. \n", + "Note that you must use either file extensions or MIME types exclusively and \n", + "cannot mix them.\n", + "\n", + "Do not include the leading dot for file extensions.\n", + "\n", + "```python\n", + "# using file extensions:\n", + "handlers = {\n", + " \"doc\": MsWordParser(),\n", + " \"pdf\": PDFMinerParser(),\n", + " \"mp3\": OpenAIWhisperParser()\n", + "}\n", + "\n", + "# using MIME types:\n", + "handlers = {\n", + " \"application/msword\": MsWordParser(),\n", + " \"application/pdf\": PDFMinerParser(),\n", + " \"audio/mpeg\": OpenAIWhisperParser()\n", + "}\n", + "\n", + "loader = OneDriveLoader(document_library_id=\"...\",\n", + " handlers=handlers # pass handlers to OneDriveLoader\n", + " )\n", + "```\n", + "In case multiple file extensions map to the same MIME type, the last dictionary item will\n", + "apply.\n", + "Example:\n", + "```python\n", + "# 'jpg' and 'jpeg' both map to 'image/jpeg' MIME type. 
SecondParser() will be used \n", + "# to parse all jpg/jpeg files.\n", + "handlers = {\n", + " \"jpg\": FirstParser(),\n", + " \"jpeg\": SecondParser()\n", + "}\n", + "```" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb b/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb index 930346675df..b49abe39f51 100644 --- a/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb +++ b/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb @@ -9,7 +9,7 @@ "\n", "> [Microsoft SharePoint](https://en.wikipedia.org/wiki/SharePoint) is a website-based collaboration system that uses workflow applications, β€œlist” databases, and other web parts and security features to empower business teams to work together developed by Microsoft.\n", "\n", - "This notebook covers how to load documents from the [SharePoint Document Library](https://support.microsoft.com/en-us/office/what-is-a-document-library-3b5976dd-65cf-4c9e-bf5a-713c10ca2872). Currently, only docx, doc, and pdf files are supported.\n", + "This notebook covers how to load documents from the [SharePoint Document Library](https://support.microsoft.com/en-us/office/what-is-a-document-library-3b5976dd-65cf-4c9e-bf5a-713c10ca2872). By default the document loader loads `pdf`, `doc`, `docx` and `txt` files. You can load other file types by providing appropriate parsers (see more below).\n", "\n", "## Prerequisites\n", "1. 
Register an application with the [Microsoft identity platform](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) instructions.\n", @@ -100,7 +100,63 @@ "\n", "loader = SharePointLoader(document_library_id=\"YOUR DOCUMENT LIBRARY ID\", object_ids=[\"ID_1\", \"ID_2\"], auth_with_token=True)\n", "documents = loader.load()\n", - "```\n" + "```\n", + "\n", + "#### πŸ“‘ Choosing supported file types and preffered parsers\n", + "By default `SharePointLoader` loads file types defined in [`document_loaders/parsers/registry`](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/document_loaders/parsers/registry.py#L10-L22) using the default parsers (see below).\n", + "```python\n", + "def _get_default_parser() -> BaseBlobParser:\n", + " \"\"\"Get default mime-type based parser.\"\"\"\n", + " return MimeTypeBasedParser(\n", + " handlers={\n", + " \"application/pdf\": PyMuPDFParser(),\n", + " \"text/plain\": TextParser(),\n", + " \"application/msword\": MsWordParser(),\n", + " \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\": (\n", + " MsWordParser()\n", + " ),\n", + " },\n", + " fallback_parser=None,\n", + " )\n", + "```\n", + "You can override this behavior by passing `handlers` argument to `SharePointLoader`. \n", + "Pass a dictionary mapping either file extensions (like `\"doc\"`, `\"pdf\"`, etc.) \n", + "or MIME types (like `\"application/pdf\"`, `\"text/plain\"`, etc.) to parsers. 
\n", + "Note that you must use either file extensions or MIME types exclusively and \n", + "cannot mix them.\n", + "\n", + "Do not include the leading dot for file extensions.\n", + "\n", + "```python\n", + "# using file extensions:\n", + "handlers = {\n", + " \"doc\": MsWordParser(),\n", + " \"pdf\": PDFMinerParser(),\n", + " \"mp3\": OpenAIWhisperParser()\n", + "}\n", + "\n", + "# using MIME types:\n", + "handlers = {\n", + " \"application/msword\": MsWordParser(),\n", + " \"application/pdf\": PDFMinerParser(),\n", + " \"audio/mpeg\": OpenAIWhisperParser()\n", + "}\n", + "\n", + "loader = SharePointLoader(document_library_id=\"...\",\n", + " handlers=handlers # pass handlers to SharePointLoader\n", + " )\n", + "```\n", + "In case multiple file extensions map to the same MIME type, the last dictionary item will\n", + "apply.\n", + "Example:\n", + "```python\n", + "# 'jpg' and 'jpeg' both map to 'image/jpeg' MIME type. SecondParser() will be used \n", + "# to parse all jpg/jpeg files.\n", + "handlers = {\n", + " \"jpg\": FirstParser(),\n", + " \"jpeg\": SecondParser()\n", + "}\n", + "```" ] } ], diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py index 44002842bf2..5f89d0794fc 100644 --- a/libs/community/langchain_community/document_loaders/base_o365.py +++ b/libs/community/langchain_community/document_loaders/base_o365.py @@ -3,26 +3,29 @@ from __future__ import annotations import logging +import mimetypes import os import tempfile from abc import abstractmethod -from enum import Enum from pathlib import Path, PurePath -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Sequence, Union +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union from pydantic import ( BaseModel, Field, FilePath, + PrivateAttr, SecretStr, ) from pydantic_settings import BaseSettings, SettingsConfigDict -from langchain_community.document_loaders.base 
def fetch_mime_types(file_types: Sequence[str]) -> Dict[str, str]:
    """Map file extensions to their MIME types.

    Args:
        file_types: File extensions without the leading dot (e.g. ``"pdf"``).

    Returns:
        Dict mapping each extension to its guessed MIME type.

    Raises:
        ValueError: If the MIME type for an extension cannot be guessed.
    """
    mime_types_mapping = {}
    for ext in file_types:
        # guess_type needs a filename; a dummy stem is enough to resolve the
        # extension-based lookup.
        mime_type, _ = mimetypes.guess_type(f"file.{ext}")
        if mime_type:
            mime_types_mapping[ext] = mime_type
        else:
            raise ValueError(f"Unknown mimetype of extension {ext}")
    return mime_types_mapping


def fetch_extensions(mime_types: Sequence[str]) -> Dict[str, str]:
    """Map MIME types to file extensions.

    Inverse of :func:`fetch_mime_types`: for each MIME type, guess a file
    extension and return a dict keyed by extension (without the leading dot).

    NOTE(review): ``mimetypes.guess_extension`` may return different
    extensions for the same MIME type across platforms/registry state.

    Args:
        mime_types: MIME type strings (e.g. ``"application/pdf"``).

    Returns:
        Dict mapping each guessed extension (no leading dot) to its MIME type.

    Raises:
        ValueError: If no extension is known for a MIME type.
    """
    mime_types_mapping = {}
    for mime_type in mime_types:
        ext = mimetypes.guess_extension(mime_type)
        if ext:
            mime_types_mapping[ext[1:]] = mime_type  # ignore leading `.`
        else:
            raise ValueError(f"Unknown mimetype {mime_type}")
    return mime_types_mapping
retrieve from each api call to the server. int or 'auto'.""" recursive: bool = False """Should the loader recursively load subfolders?""" + handlers: Optional[Dict[str, Any]] = {} + """ + Provide custom handlers for MimeTypeBasedParser. - @property - @abstractmethod - def _file_types(self) -> Sequence[_FileType]: - """Return supported file types.""" + Pass a dictionary mapping either file extensions (like "doc", "pdf", etc.) + or MIME types (like "application/pdf", "text/plain", etc.) to parsers. + Note that you must use either file extensions or MIME types exclusively and + cannot mix them. + + Do not include the leading dot for file extensions. + + Example using file extensions: + ```python + handlers = { + "doc": MsWordParser(), + "pdf": PDFMinerParser(), + "txt": TextParser() + } + ``` + + Example using MIME types: + ```python + handlers = { + "application/msword": MsWordParser(), + "application/pdf": PDFMinerParser(), + "text/plain": TextParser() + } + ``` + """ + + _blob_parser: BaseBlobParser = PrivateAttr() + _file_types: Sequence[str] = PrivateAttr() + _mime_types: Dict[str, str] = PrivateAttr() + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + if self.handlers: + handler_keys = list(self.handlers.keys()) + try: + # assume handlers.keys() are file extensions + self._mime_types = fetch_mime_types(handler_keys) + self._file_types = list(set(handler_keys)) + mime_handlers = { + self._mime_types[extension]: handler + for extension, handler in self.handlers.items() + } + except ValueError: + try: + # assume handlers.keys() are mime types + self._mime_types = fetch_extensions(handler_keys) + self._file_types = list(set(self._mime_types.keys())) + mime_handlers = self.handlers + except ValueError: + raise ValueError( + "`handlers` keys must be either file extensions or mimetypes.\n" + f"{handler_keys} could not be interpreted as either.\n" + "File extensions and mimetypes cannot mix. 
" + "Use either one or the other" + ) + + self._blob_parser = MimeTypeBasedParser( + handlers=mime_handlers, fallback_parser=None + ) + else: + self._blob_parser = get_parser("default") + if not isinstance(self._blob_parser, MimeTypeBasedParser): + raise TypeError( + 'get_parser("default) was supposed to return MimeTypeBasedParser.' + f"It returned {type(self._blob_parser)}" + ) + self._mime_types = fetch_extensions(list(self._blob_parser.handlers.keys())) @property def _fetch_mime_types(self) -> Dict[str, str]: """Return a dict of supported file types to corresponding mime types.""" - return fetch_mime_types(self._file_types) + return self._mime_types @property @abstractmethod diff --git a/libs/community/langchain_community/document_loaders/onedrive.py b/libs/community/langchain_community/document_loaders/onedrive.py index ecc1d232bfe..e0369233c22 100644 --- a/libs/community/langchain_community/document_loaders/onedrive.py +++ b/libs/community/langchain_community/document_loaders/onedrive.py @@ -1,94 +1,19 @@ -"""Loads data from OneDrive""" +from typing import Any -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING, Iterator, List, Optional, Sequence, Union - -from langchain_core.documents import Document from pydantic import Field -from langchain_community.document_loaders.base_o365 import ( - O365BaseLoader, - _FileType, -) -from langchain_community.document_loaders.parsers.registry import get_parser - -if TYPE_CHECKING: - from O365.drive import Drive, Folder - -logger = logging.getLogger(__name__) +from langchain_community.document_loaders import SharePointLoader -class OneDriveLoader(O365BaseLoader): - """Load from `Microsoft OneDrive`.""" +class OneDriveLoader(SharePointLoader): + """ + Load documents from Microsoft OneDrive. + Uses `SharePointLoader` under the hood. + """ drive_id: str = Field(...) 
- """ The ID of the OneDrive drive to load data from.""" - folder_path: Optional[str] = None - """ The path to the folder to load data from.""" - object_ids: Optional[List[str]] = None - """ The IDs of the objects to load data from.""" + """The ID of the OneDrive drive to load data from.""" - @property - def _file_types(self) -> Sequence[_FileType]: - """Return supported file types.""" - return _FileType.DOC, _FileType.DOCX, _FileType.PDF - - @property - def _scopes(self) -> List[str]: - """Return required scopes.""" - return ["offline_access", "Files.Read.All"] - - def _get_folder_from_path(self, drive: Drive) -> Union[Folder, Drive]: - """ - Returns the folder or drive object located at the - specified path relative to the given drive. - - Args: - drive (Drive): The root drive from which the folder path is relative. - - Returns: - Union[Folder, Drive]: The folder or drive object - located at the specified path. - - Raises: - FileNotFoundError: If the path does not exist. - """ - - subfolder_drive = drive - if self.folder_path is None: - return subfolder_drive - - subfolders = [f for f in self.folder_path.split("/") if f != ""] - if len(subfolders) == 0: - return subfolder_drive - - items = subfolder_drive.get_items() - for subfolder in subfolders: - try: - subfolder_drive = list(filter(lambda x: subfolder in x.name, items))[0] - items = subfolder_drive.get_items() - except (IndexError, AttributeError): - raise FileNotFoundError("Path {} not exist.".format(self.folder_path)) - return subfolder_drive - - def lazy_load(self) -> Iterator[Document]: - """Load documents lazily. 
Use this when working at a large scale.""" - try: - from O365.drive import Drive - except ImportError: - raise ImportError( - "O365 package not found, please install it with `pip install o365`" - ) - drive = self._auth().storage().get_drive(self.drive_id) - if not isinstance(drive, Drive): - raise ValueError(f"There isn't a Drive with id {self.drive_id}.") - blob_parser = get_parser("default") - if self.folder_path: - folder = self._get_folder_from_path(drive) - for blob in self._load_from_folder(folder): - yield from blob_parser.lazy_parse(blob) - if self.object_ids: - for blob in self._load_from_object_ids(drive, self.object_ids): - yield from blob_parser.lazy_parse(blob) + def __init__(self, **kwargs: Any) -> None: + kwargs["document_library_id"] = kwargs["drive_id"] + super().__init__(**kwargs) diff --git a/libs/community/langchain_community/document_loaders/sharepoint.py b/libs/community/langchain_community/document_loaders/sharepoint.py index 06426a7038f..6d5a820248e 100644 --- a/libs/community/langchain_community/document_loaders/sharepoint.py +++ b/libs/community/langchain_community/document_loaders/sharepoint.py @@ -4,7 +4,7 @@ from __future__ import annotations import json from pathlib import Path -from typing import Any, Iterator, List, Optional, Sequence +from typing import Any, Dict, Iterator, List, Optional import requests # type: ignore from langchain_core.document_loaders import BaseLoader @@ -13,9 +13,7 @@ from pydantic import Field from langchain_community.document_loaders.base_o365 import ( O365BaseLoader, - _FileType, ) -from langchain_community.document_loaders.parsers.registry import get_parser class SharePointLoader(O365BaseLoader, BaseLoader): @@ -36,14 +34,6 @@ class SharePointLoader(O365BaseLoader, BaseLoader): load_extended_metadata: Optional[bool] = False """ Whether to load extended metadata. Size, Owner and full_path.""" - @property - def _file_types(self) -> Sequence[_FileType]: - """Return supported file types. 
- Returns: - A sequence of supported file types. - """ - return _FileType.DOC, _FileType.DOCX, _FileType.PDF - @property def _scopes(self) -> List[str]: """Return required scopes. @@ -67,7 +57,6 @@ class SharePointLoader(O365BaseLoader, BaseLoader): drive = self._auth().storage().get_drive(self.document_library_id) if not isinstance(drive, Drive): raise ValueError(f"There isn't a Drive with id {self.document_library_id}.") - blob_parser = get_parser("default") if self.folder_path: target_folder = drive.get_item_by_path(self.folder_path) if not isinstance(target_folder, Folder): @@ -79,7 +68,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader): if self.load_extended_metadata is True: extended_metadata = self.get_extended_metadata(file_id) extended_metadata.update({"source_full_url": target_folder.web_url}) - for parsed_blob in blob_parser.lazy_parse(blob): + for parsed_blob in self._blob_parser.lazy_parse(blob): if self.load_auth is True: parsed_blob.metadata["authorized_identities"] = auth_identities if self.load_extended_metadata is True: @@ -96,7 +85,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader): if self.load_extended_metadata is True: extended_metadata = self.get_extended_metadata(file_id) extended_metadata.update({"source_full_url": target_folder.web_url}) - for parsed_blob in blob_parser.lazy_parse(blob): + for parsed_blob in self._blob_parser.lazy_parse(blob): if self.load_auth is True: parsed_blob.metadata["authorized_identities"] = auth_identities if self.load_extended_metadata is True: @@ -109,7 +98,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader): auth_identities = self.authorized_identities(file_id) if self.load_extended_metadata is True: extended_metadata = self.get_extended_metadata(file_id) - for parsed_blob in blob_parser.lazy_parse(blob): + for parsed_blob in self._blob_parser.lazy_parse(blob): if self.load_auth is True: parsed_blob.metadata["authorized_identities"] = auth_identities if self.load_extended_metadata is True: @@ 
-126,7 +115,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader): auth_identities = self.authorized_identities(file_id) if self.load_extended_metadata is True: extended_metadata = self.get_extended_metadata(file_id) - for blob_part in blob_parser.lazy_parse(blob): + for blob_part in self._blob_parser.lazy_parse(blob): blob_part.metadata.update(blob.metadata) if self.load_auth is True: blob_part.metadata["authorized_identities"] = auth_identities @@ -182,7 +171,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader): data = json.loads(s) return data - def get_extended_metadata(self, file_id: str) -> dict: + def get_extended_metadata(self, file_id: str) -> Dict: """ Retrieve extended metadata for a file in SharePoint. As of today, following fields are supported in the extended metadata: