mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 12:18:24 +00:00
community: Allow other than default parsers in SharePointLoader and OneDriveLoader (#27716)
## What this PR does?
### Currently `O365BaseLoader` (and consequently both derived loaders)
are limited to `pdf`, `doc`, `docx` files.
- **Solution: here we introduce _handlers_ attribute that allows for
custom handlers to be passed in. This is done in _dict_ form:**
**Example:**
```python
from langchain_community.document_loaders.parsers.documentloader_adapter import DocumentLoaderAsParser
# PR for DocumentLoaderAsParser here: https://github.com/langchain-ai/langchain/pull/27749
from langchain_community.document_loaders.excel import UnstructuredExcelLoader
xlsx_parser = DocumentLoaderAsParser(UnstructuredExcelLoader, mode="paged")
# create dictionary mapping file types to handlers (parsers)
handlers = {
"doc": MsWordParser(),
"pdf": PDFMinerParser(),
"txt": TextParser(),
"xlsx": xlsx_parser
}
loader = SharePointLoader(document_library_id="...",
handlers=handlers # pass handlers to SharePointLoader
)
documents = loader.load()
# works the same in OneDriveLoader
loader = OneDriveLoader(document_library_id="...",
handlers=handlers
)
```
This dictionary is then passed to `MimeTypeBasedParser` same as in the
[current
implementation](5a2cfb49e0/libs/community/langchain_community/document_loaders/parsers/registry.py (L13)
).
### Currently `SharePointLoader` and `OneDriveLoader` are separate
loaders that both inherit from `O365BaseLoader`
However both of these implement the same functionality. The only
differences are:
- `SharePointLoader` requires argument `document_library_id` whereas
`OneDriveLoader` requires `drive_id`. These are just different names for
the same thing.
- `SharePointLoader` implements significantly more features.
- **Solution: `OneDriveLoader` is replaced with an empty shell just
renaming `drive_id` to `document_library_id` and inheriting from
`SharePointLoader`**
**Dependencies:** None
**Twitter handle:** @martintriska1
If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
This commit is contained in:
parent
482c168b3e
commit
90189f5639
@ -8,7 +8,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
">[Microsoft OneDrive](https://en.wikipedia.org/wiki/OneDrive) (formerly `SkyDrive`) is a file hosting service operated by Microsoft.\n",
|
">[Microsoft OneDrive](https://en.wikipedia.org/wiki/OneDrive) (formerly `SkyDrive`) is a file hosting service operated by Microsoft.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This notebook covers how to load documents from `OneDrive`. Currently, only docx, doc, and pdf files are supported.\n",
|
"This notebook covers how to load documents from `OneDrive`. By default the document loader loads `pdf`, `doc`, `docx` and `txt` files. You can load other file types by providing appropriate parsers (see more below).\n",
|
||||||
"\n",
|
"\n",
|
||||||
"## Prerequisites\n",
|
"## Prerequisites\n",
|
||||||
"1. Register an application with the [Microsoft identity platform](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) instructions.\n",
|
"1. Register an application with the [Microsoft identity platform](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) instructions.\n",
|
||||||
@ -77,15 +77,64 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"loader = OneDriveLoader(drive_id=\"YOUR DRIVE ID\", object_ids=[\"ID_1\", \"ID_2\"], auth_with_token=True)\n",
|
"loader = OneDriveLoader(drive_id=\"YOUR DRIVE ID\", object_ids=[\"ID_1\", \"ID_2\"], auth_with_token=True)\n",
|
||||||
"documents = loader.load()\n",
|
"documents = loader.load()\n",
|
||||||
"```\n"
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"#### 📑 Choosing supported file types and preferred parsers\n",
|
||||||
|
"By default `OneDriveLoader` loads file types defined in [`document_loaders/parsers/registry`](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/document_loaders/parsers/registry.py#L10-L22) using the default parsers (see below).\n",
|
||||||
|
"```python\n",
|
||||||
|
"def _get_default_parser() -> BaseBlobParser:\n",
|
||||||
|
" \"\"\"Get default mime-type based parser.\"\"\"\n",
|
||||||
|
" return MimeTypeBasedParser(\n",
|
||||||
|
" handlers={\n",
|
||||||
|
" \"application/pdf\": PyMuPDFParser(),\n",
|
||||||
|
" \"text/plain\": TextParser(),\n",
|
||||||
|
" \"application/msword\": MsWordParser(),\n",
|
||||||
|
" \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\": (\n",
|
||||||
|
" MsWordParser()\n",
|
||||||
|
" ),\n",
|
||||||
|
" },\n",
|
||||||
|
" fallback_parser=None,\n",
|
||||||
|
" )\n",
|
||||||
|
"```\n",
|
||||||
|
"You can override this behavior by passing `handlers` argument to `OneDriveLoader`. \n",
|
||||||
|
"Pass a dictionary mapping either file extensions (like `\"doc\"`, `\"pdf\"`, etc.) \n",
|
||||||
|
"or MIME types (like `\"application/pdf\"`, `\"text/plain\"`, etc.) to parsers. \n",
|
||||||
|
"Note that you must use either file extensions or MIME types exclusively and \n",
|
||||||
|
"cannot mix them.\n",
|
||||||
|
"\n",
|
||||||
|
"Do not include the leading dot for file extensions.\n",
|
||||||
|
"\n",
|
||||||
|
"```python\n",
|
||||||
|
"# using file extensions:\n",
|
||||||
|
"handlers = {\n",
|
||||||
|
" \"doc\": MsWordParser(),\n",
|
||||||
|
" \"pdf\": PDFMinerParser(),\n",
|
||||||
|
" \"mp3\": OpenAIWhisperParser()\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"# using MIME types:\n",
|
||||||
|
"handlers = {\n",
|
||||||
|
" \"application/msword\": MsWordParser(),\n",
|
||||||
|
" \"application/pdf\": PDFMinerParser(),\n",
|
||||||
|
" \"audio/mpeg\": OpenAIWhisperParser()\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"loader = OneDriveLoader(document_library_id=\"...\",\n",
|
||||||
|
" handlers=handlers # pass handlers to OneDriveLoader\n",
|
||||||
|
" )\n",
|
||||||
|
"```\n",
|
||||||
|
"In case multiple file extensions map to the same MIME type, the last dictionary item will\n",
|
||||||
|
"apply.\n",
|
||||||
|
"Example:\n",
|
||||||
|
"```python\n",
|
||||||
|
"# 'jpg' and 'jpeg' both map to 'image/jpeg' MIME type. SecondParser() will be used \n",
|
||||||
|
"# to parse all jpg/jpeg files.\n",
|
||||||
|
"handlers = {\n",
|
||||||
|
" \"jpg\": FirstParser(),\n",
|
||||||
|
" \"jpeg\": SecondParser()\n",
|
||||||
|
"}\n",
|
||||||
|
"```"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"> [Microsoft SharePoint](https://en.wikipedia.org/wiki/SharePoint) is a website-based collaboration system that uses workflow applications, “list” databases, and other web parts and security features to empower business teams to work together developed by Microsoft.\n",
|
"> [Microsoft SharePoint](https://en.wikipedia.org/wiki/SharePoint) is a website-based collaboration system that uses workflow applications, “list” databases, and other web parts and security features to empower business teams to work together developed by Microsoft.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This notebook covers how to load documents from the [SharePoint Document Library](https://support.microsoft.com/en-us/office/what-is-a-document-library-3b5976dd-65cf-4c9e-bf5a-713c10ca2872). Currently, only docx, doc, and pdf files are supported.\n",
|
"This notebook covers how to load documents from the [SharePoint Document Library](https://support.microsoft.com/en-us/office/what-is-a-document-library-3b5976dd-65cf-4c9e-bf5a-713c10ca2872). By default the document loader loads `pdf`, `doc`, `docx` and `txt` files. You can load other file types by providing appropriate parsers (see more below).\n",
|
||||||
"\n",
|
"\n",
|
||||||
"## Prerequisites\n",
|
"## Prerequisites\n",
|
||||||
"1. Register an application with the [Microsoft identity platform](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) instructions.\n",
|
"1. Register an application with the [Microsoft identity platform](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) instructions.\n",
|
||||||
@ -100,7 +100,63 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"loader = SharePointLoader(document_library_id=\"YOUR DOCUMENT LIBRARY ID\", object_ids=[\"ID_1\", \"ID_2\"], auth_with_token=True)\n",
|
"loader = SharePointLoader(document_library_id=\"YOUR DOCUMENT LIBRARY ID\", object_ids=[\"ID_1\", \"ID_2\"], auth_with_token=True)\n",
|
||||||
"documents = loader.load()\n",
|
"documents = loader.load()\n",
|
||||||
"```\n"
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"#### 📑 Choosing supported file types and preferred parsers\n",
|
||||||
|
"By default `SharePointLoader` loads file types defined in [`document_loaders/parsers/registry`](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/document_loaders/parsers/registry.py#L10-L22) using the default parsers (see below).\n",
|
||||||
|
"```python\n",
|
||||||
|
"def _get_default_parser() -> BaseBlobParser:\n",
|
||||||
|
" \"\"\"Get default mime-type based parser.\"\"\"\n",
|
||||||
|
" return MimeTypeBasedParser(\n",
|
||||||
|
" handlers={\n",
|
||||||
|
" \"application/pdf\": PyMuPDFParser(),\n",
|
||||||
|
" \"text/plain\": TextParser(),\n",
|
||||||
|
" \"application/msword\": MsWordParser(),\n",
|
||||||
|
" \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\": (\n",
|
||||||
|
" MsWordParser()\n",
|
||||||
|
" ),\n",
|
||||||
|
" },\n",
|
||||||
|
" fallback_parser=None,\n",
|
||||||
|
" )\n",
|
||||||
|
"```\n",
|
||||||
|
"You can override this behavior by passing `handlers` argument to `SharePointLoader`. \n",
|
||||||
|
"Pass a dictionary mapping either file extensions (like `\"doc\"`, `\"pdf\"`, etc.) \n",
|
||||||
|
"or MIME types (like `\"application/pdf\"`, `\"text/plain\"`, etc.) to parsers. \n",
|
||||||
|
"Note that you must use either file extensions or MIME types exclusively and \n",
|
||||||
|
"cannot mix them.\n",
|
||||||
|
"\n",
|
||||||
|
"Do not include the leading dot for file extensions.\n",
|
||||||
|
"\n",
|
||||||
|
"```python\n",
|
||||||
|
"# using file extensions:\n",
|
||||||
|
"handlers = {\n",
|
||||||
|
" \"doc\": MsWordParser(),\n",
|
||||||
|
" \"pdf\": PDFMinerParser(),\n",
|
||||||
|
" \"mp3\": OpenAIWhisperParser()\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"# using MIME types:\n",
|
||||||
|
"handlers = {\n",
|
||||||
|
" \"application/msword\": MsWordParser(),\n",
|
||||||
|
" \"application/pdf\": PDFMinerParser(),\n",
|
||||||
|
" \"audio/mpeg\": OpenAIWhisperParser()\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"loader = SharePointLoader(document_library_id=\"...\",\n",
|
||||||
|
" handlers=handlers # pass handlers to SharePointLoader\n",
|
||||||
|
" )\n",
|
||||||
|
"```\n",
|
||||||
|
"In case multiple file extensions map to the same MIME type, the last dictionary item will\n",
|
||||||
|
"apply.\n",
|
||||||
|
"Example:\n",
|
||||||
|
"```python\n",
|
||||||
|
"# 'jpg' and 'jpeg' both map to 'image/jpeg' MIME type. SecondParser() will be used \n",
|
||||||
|
"# to parse all jpg/jpeg files.\n",
|
||||||
|
"handlers = {\n",
|
||||||
|
" \"jpg\": FirstParser(),\n",
|
||||||
|
" \"jpeg\": SecondParser()\n",
|
||||||
|
"}\n",
|
||||||
|
"```"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -3,26 +3,29 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from enum import Enum
|
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Sequence, Union
|
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
|
||||||
|
|
||||||
from pydantic import (
|
from pydantic import (
|
||||||
BaseModel,
|
BaseModel,
|
||||||
Field,
|
Field,
|
||||||
FilePath,
|
FilePath,
|
||||||
|
PrivateAttr,
|
||||||
SecretStr,
|
SecretStr,
|
||||||
)
|
)
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
from langchain_community.document_loaders.base import BaseLoader
|
from langchain_community.document_loaders.base import BaseBlobParser, BaseLoader
|
||||||
from langchain_community.document_loaders.blob_loaders.file_system import (
|
from langchain_community.document_loaders.blob_loaders.file_system import (
|
||||||
FileSystemBlobLoader,
|
FileSystemBlobLoader,
|
||||||
)
|
)
|
||||||
from langchain_community.document_loaders.blob_loaders.schema import Blob
|
from langchain_community.document_loaders.blob_loaders.schema import Blob
|
||||||
|
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
|
||||||
|
from langchain_community.document_loaders.parsers.registry import get_parser
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from O365 import Account
|
from O365 import Account
|
||||||
@ -46,24 +49,27 @@ class _O365TokenStorage(BaseSettings):
|
|||||||
token_path: FilePath = Path.home() / ".credentials" / "o365_token.txt"
|
token_path: FilePath = Path.home() / ".credentials" / "o365_token.txt"
|
||||||
|
|
||||||
|
|
||||||
class _FileType(str, Enum):
|
def fetch_mime_types(file_types: Sequence[str]) -> Dict[str, str]:
|
||||||
DOC = "doc"
|
|
||||||
DOCX = "docx"
|
|
||||||
PDF = "pdf"
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
|
|
||||||
"""Fetch the mime types for the specified file types."""
|
"""Fetch the mime types for the specified file types."""
|
||||||
mime_types_mapping = {}
|
mime_types_mapping = {}
|
||||||
for file_type in file_types:
|
for ext in file_types:
|
||||||
if file_type.value == "doc":
|
mime_type, _ = mimetypes.guess_type(f"file.{ext}")
|
||||||
mime_types_mapping[file_type.value] = "application/msword"
|
if mime_type:
|
||||||
elif file_type.value == "docx":
|
mime_types_mapping[ext] = mime_type
|
||||||
mime_types_mapping[file_type.value] = (
|
else:
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" # noqa: E501
|
raise ValueError(f"Unknown mimetype of extension {ext}")
|
||||||
)
|
return mime_types_mapping
|
||||||
elif file_type.value == "pdf":
|
|
||||||
mime_types_mapping[file_type.value] = "application/pdf"
|
|
||||||
|
def fetch_extensions(mime_types: Sequence[str]) -> Dict[str, str]:
|
||||||
|
"""Fetch the file extensions for the specified mime types."""
|
||||||
|
mime_types_mapping = {}
|
||||||
|
for mime_type in mime_types:
|
||||||
|
ext = mimetypes.guess_extension(mime_type)
|
||||||
|
if ext:
|
||||||
|
mime_types_mapping[ext[1:]] = mime_type # ignore leading `.`
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown mimetype {mime_type}")
|
||||||
return mime_types_mapping
|
return mime_types_mapping
|
||||||
|
|
||||||
|
|
||||||
@ -78,16 +84,82 @@ class O365BaseLoader(BaseLoader, BaseModel):
|
|||||||
"""Number of bytes to retrieve from each api call to the server. int or 'auto'."""
|
"""Number of bytes to retrieve from each api call to the server. int or 'auto'."""
|
||||||
recursive: bool = False
|
recursive: bool = False
|
||||||
"""Should the loader recursively load subfolders?"""
|
"""Should the loader recursively load subfolders?"""
|
||||||
|
handlers: Optional[Dict[str, Any]] = {}
|
||||||
|
"""
|
||||||
|
Provide custom handlers for MimeTypeBasedParser.
|
||||||
|
|
||||||
@property
|
Pass a dictionary mapping either file extensions (like "doc", "pdf", etc.)
|
||||||
@abstractmethod
|
or MIME types (like "application/pdf", "text/plain", etc.) to parsers.
|
||||||
def _file_types(self) -> Sequence[_FileType]:
|
Note that you must use either file extensions or MIME types exclusively and
|
||||||
"""Return supported file types."""
|
cannot mix them.
|
||||||
|
|
||||||
|
Do not include the leading dot for file extensions.
|
||||||
|
|
||||||
|
Example using file extensions:
|
||||||
|
```python
|
||||||
|
handlers = {
|
||||||
|
"doc": MsWordParser(),
|
||||||
|
"pdf": PDFMinerParser(),
|
||||||
|
"txt": TextParser()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Example using MIME types:
|
||||||
|
```python
|
||||||
|
handlers = {
|
||||||
|
"application/msword": MsWordParser(),
|
||||||
|
"application/pdf": PDFMinerParser(),
|
||||||
|
"text/plain": TextParser()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
_blob_parser: BaseBlobParser = PrivateAttr()
|
||||||
|
_file_types: Sequence[str] = PrivateAttr()
|
||||||
|
_mime_types: Dict[str, str] = PrivateAttr()
|
||||||
|
|
||||||
|
def __init__(self, **kwargs: Any) -> None:
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
if self.handlers:
|
||||||
|
handler_keys = list(self.handlers.keys())
|
||||||
|
try:
|
||||||
|
# assume handlers.keys() are file extensions
|
||||||
|
self._mime_types = fetch_mime_types(handler_keys)
|
||||||
|
self._file_types = list(set(handler_keys))
|
||||||
|
mime_handlers = {
|
||||||
|
self._mime_types[extension]: handler
|
||||||
|
for extension, handler in self.handlers.items()
|
||||||
|
}
|
||||||
|
except ValueError:
|
||||||
|
try:
|
||||||
|
# assume handlers.keys() are mime types
|
||||||
|
self._mime_types = fetch_extensions(handler_keys)
|
||||||
|
self._file_types = list(set(self._mime_types.keys()))
|
||||||
|
mime_handlers = self.handlers
|
||||||
|
except ValueError:
|
||||||
|
raise ValueError(
|
||||||
|
"`handlers` keys must be either file extensions or mimetypes.\n"
|
||||||
|
f"{handler_keys} could not be interpreted as either.\n"
|
||||||
|
"File extensions and mimetypes cannot mix. "
|
||||||
|
"Use either one or the other"
|
||||||
|
)
|
||||||
|
|
||||||
|
self._blob_parser = MimeTypeBasedParser(
|
||||||
|
handlers=mime_handlers, fallback_parser=None
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self._blob_parser = get_parser("default")
|
||||||
|
if not isinstance(self._blob_parser, MimeTypeBasedParser):
|
||||||
|
raise TypeError(
|
||||||
|
'get_parser("default") was supposed to return MimeTypeBasedParser.'
|
||||||
|
f"It returned {type(self._blob_parser)}"
|
||||||
|
)
|
||||||
|
self._mime_types = fetch_extensions(list(self._blob_parser.handlers.keys()))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _fetch_mime_types(self) -> Dict[str, str]:
|
def _fetch_mime_types(self) -> Dict[str, str]:
|
||||||
"""Return a dict of supported file types to corresponding mime types."""
|
"""Return a dict of supported file types to corresponding mime types."""
|
||||||
return fetch_mime_types(self._file_types)
|
return self._mime_types
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
@ -1,94 +1,19 @@
|
|||||||
"""Loads data from OneDrive"""
|
from typing import Any
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from typing import TYPE_CHECKING, Iterator, List, Optional, Sequence, Union
|
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
|
|
||||||
from langchain_community.document_loaders.base_o365 import (
|
from langchain_community.document_loaders import SharePointLoader
|
||||||
O365BaseLoader,
|
|
||||||
_FileType,
|
|
||||||
)
|
|
||||||
from langchain_community.document_loaders.parsers.registry import get_parser
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from O365.drive import Drive, Folder
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class OneDriveLoader(O365BaseLoader):
|
class OneDriveLoader(SharePointLoader):
|
||||||
"""Load from `Microsoft OneDrive`."""
|
"""
|
||||||
|
Load documents from Microsoft OneDrive.
|
||||||
|
Uses `SharePointLoader` under the hood.
|
||||||
|
"""
|
||||||
|
|
||||||
drive_id: str = Field(...)
|
drive_id: str = Field(...)
|
||||||
""" The ID of the OneDrive drive to load data from."""
|
"""The ID of the OneDrive drive to load data from."""
|
||||||
folder_path: Optional[str] = None
|
|
||||||
""" The path to the folder to load data from."""
|
|
||||||
object_ids: Optional[List[str]] = None
|
|
||||||
""" The IDs of the objects to load data from."""
|
|
||||||
|
|
||||||
@property
|
def __init__(self, **kwargs: Any) -> None:
|
||||||
def _file_types(self) -> Sequence[_FileType]:
|
kwargs["document_library_id"] = kwargs["drive_id"]
|
||||||
"""Return supported file types."""
|
super().__init__(**kwargs)
|
||||||
return _FileType.DOC, _FileType.DOCX, _FileType.PDF
|
|
||||||
|
|
||||||
@property
|
|
||||||
def _scopes(self) -> List[str]:
|
|
||||||
"""Return required scopes."""
|
|
||||||
return ["offline_access", "Files.Read.All"]
|
|
||||||
|
|
||||||
def _get_folder_from_path(self, drive: Drive) -> Union[Folder, Drive]:
|
|
||||||
"""
|
|
||||||
Returns the folder or drive object located at the
|
|
||||||
specified path relative to the given drive.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
drive (Drive): The root drive from which the folder path is relative.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Union[Folder, Drive]: The folder or drive object
|
|
||||||
located at the specified path.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
FileNotFoundError: If the path does not exist.
|
|
||||||
"""
|
|
||||||
|
|
||||||
subfolder_drive = drive
|
|
||||||
if self.folder_path is None:
|
|
||||||
return subfolder_drive
|
|
||||||
|
|
||||||
subfolders = [f for f in self.folder_path.split("/") if f != ""]
|
|
||||||
if len(subfolders) == 0:
|
|
||||||
return subfolder_drive
|
|
||||||
|
|
||||||
items = subfolder_drive.get_items()
|
|
||||||
for subfolder in subfolders:
|
|
||||||
try:
|
|
||||||
subfolder_drive = list(filter(lambda x: subfolder in x.name, items))[0]
|
|
||||||
items = subfolder_drive.get_items()
|
|
||||||
except (IndexError, AttributeError):
|
|
||||||
raise FileNotFoundError("Path {} not exist.".format(self.folder_path))
|
|
||||||
return subfolder_drive
|
|
||||||
|
|
||||||
def lazy_load(self) -> Iterator[Document]:
|
|
||||||
"""Load documents lazily. Use this when working at a large scale."""
|
|
||||||
try:
|
|
||||||
from O365.drive import Drive
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError(
|
|
||||||
"O365 package not found, please install it with `pip install o365`"
|
|
||||||
)
|
|
||||||
drive = self._auth().storage().get_drive(self.drive_id)
|
|
||||||
if not isinstance(drive, Drive):
|
|
||||||
raise ValueError(f"There isn't a Drive with id {self.drive_id}.")
|
|
||||||
blob_parser = get_parser("default")
|
|
||||||
if self.folder_path:
|
|
||||||
folder = self._get_folder_from_path(drive)
|
|
||||||
for blob in self._load_from_folder(folder):
|
|
||||||
yield from blob_parser.lazy_parse(blob)
|
|
||||||
if self.object_ids:
|
|
||||||
for blob in self._load_from_object_ids(drive, self.object_ids):
|
|
||||||
yield from blob_parser.lazy_parse(blob)
|
|
||||||
|
@ -4,7 +4,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Iterator, List, Optional, Sequence
|
from typing import Any, Dict, Iterator, List, Optional
|
||||||
|
|
||||||
import requests # type: ignore
|
import requests # type: ignore
|
||||||
from langchain_core.document_loaders import BaseLoader
|
from langchain_core.document_loaders import BaseLoader
|
||||||
@ -13,9 +13,7 @@ from pydantic import Field
|
|||||||
|
|
||||||
from langchain_community.document_loaders.base_o365 import (
|
from langchain_community.document_loaders.base_o365 import (
|
||||||
O365BaseLoader,
|
O365BaseLoader,
|
||||||
_FileType,
|
|
||||||
)
|
)
|
||||||
from langchain_community.document_loaders.parsers.registry import get_parser
|
|
||||||
|
|
||||||
|
|
||||||
class SharePointLoader(O365BaseLoader, BaseLoader):
|
class SharePointLoader(O365BaseLoader, BaseLoader):
|
||||||
@ -36,14 +34,6 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
|
|||||||
load_extended_metadata: Optional[bool] = False
|
load_extended_metadata: Optional[bool] = False
|
||||||
""" Whether to load extended metadata. Size, Owner and full_path."""
|
""" Whether to load extended metadata. Size, Owner and full_path."""
|
||||||
|
|
||||||
@property
|
|
||||||
def _file_types(self) -> Sequence[_FileType]:
|
|
||||||
"""Return supported file types.
|
|
||||||
Returns:
|
|
||||||
A sequence of supported file types.
|
|
||||||
"""
|
|
||||||
return _FileType.DOC, _FileType.DOCX, _FileType.PDF
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _scopes(self) -> List[str]:
|
def _scopes(self) -> List[str]:
|
||||||
"""Return required scopes.
|
"""Return required scopes.
|
||||||
@ -67,7 +57,6 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
|
|||||||
drive = self._auth().storage().get_drive(self.document_library_id)
|
drive = self._auth().storage().get_drive(self.document_library_id)
|
||||||
if not isinstance(drive, Drive):
|
if not isinstance(drive, Drive):
|
||||||
raise ValueError(f"There isn't a Drive with id {self.document_library_id}.")
|
raise ValueError(f"There isn't a Drive with id {self.document_library_id}.")
|
||||||
blob_parser = get_parser("default")
|
|
||||||
if self.folder_path:
|
if self.folder_path:
|
||||||
target_folder = drive.get_item_by_path(self.folder_path)
|
target_folder = drive.get_item_by_path(self.folder_path)
|
||||||
if not isinstance(target_folder, Folder):
|
if not isinstance(target_folder, Folder):
|
||||||
@ -79,7 +68,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
|
|||||||
if self.load_extended_metadata is True:
|
if self.load_extended_metadata is True:
|
||||||
extended_metadata = self.get_extended_metadata(file_id)
|
extended_metadata = self.get_extended_metadata(file_id)
|
||||||
extended_metadata.update({"source_full_url": target_folder.web_url})
|
extended_metadata.update({"source_full_url": target_folder.web_url})
|
||||||
for parsed_blob in blob_parser.lazy_parse(blob):
|
for parsed_blob in self._blob_parser.lazy_parse(blob):
|
||||||
if self.load_auth is True:
|
if self.load_auth is True:
|
||||||
parsed_blob.metadata["authorized_identities"] = auth_identities
|
parsed_blob.metadata["authorized_identities"] = auth_identities
|
||||||
if self.load_extended_metadata is True:
|
if self.load_extended_metadata is True:
|
||||||
@ -96,7 +85,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
|
|||||||
if self.load_extended_metadata is True:
|
if self.load_extended_metadata is True:
|
||||||
extended_metadata = self.get_extended_metadata(file_id)
|
extended_metadata = self.get_extended_metadata(file_id)
|
||||||
extended_metadata.update({"source_full_url": target_folder.web_url})
|
extended_metadata.update({"source_full_url": target_folder.web_url})
|
||||||
for parsed_blob in blob_parser.lazy_parse(blob):
|
for parsed_blob in self._blob_parser.lazy_parse(blob):
|
||||||
if self.load_auth is True:
|
if self.load_auth is True:
|
||||||
parsed_blob.metadata["authorized_identities"] = auth_identities
|
parsed_blob.metadata["authorized_identities"] = auth_identities
|
||||||
if self.load_extended_metadata is True:
|
if self.load_extended_metadata is True:
|
||||||
@ -109,7 +98,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
|
|||||||
auth_identities = self.authorized_identities(file_id)
|
auth_identities = self.authorized_identities(file_id)
|
||||||
if self.load_extended_metadata is True:
|
if self.load_extended_metadata is True:
|
||||||
extended_metadata = self.get_extended_metadata(file_id)
|
extended_metadata = self.get_extended_metadata(file_id)
|
||||||
for parsed_blob in blob_parser.lazy_parse(blob):
|
for parsed_blob in self._blob_parser.lazy_parse(blob):
|
||||||
if self.load_auth is True:
|
if self.load_auth is True:
|
||||||
parsed_blob.metadata["authorized_identities"] = auth_identities
|
parsed_blob.metadata["authorized_identities"] = auth_identities
|
||||||
if self.load_extended_metadata is True:
|
if self.load_extended_metadata is True:
|
||||||
@ -126,7 +115,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
|
|||||||
auth_identities = self.authorized_identities(file_id)
|
auth_identities = self.authorized_identities(file_id)
|
||||||
if self.load_extended_metadata is True:
|
if self.load_extended_metadata is True:
|
||||||
extended_metadata = self.get_extended_metadata(file_id)
|
extended_metadata = self.get_extended_metadata(file_id)
|
||||||
for blob_part in blob_parser.lazy_parse(blob):
|
for blob_part in self._blob_parser.lazy_parse(blob):
|
||||||
blob_part.metadata.update(blob.metadata)
|
blob_part.metadata.update(blob.metadata)
|
||||||
if self.load_auth is True:
|
if self.load_auth is True:
|
||||||
blob_part.metadata["authorized_identities"] = auth_identities
|
blob_part.metadata["authorized_identities"] = auth_identities
|
||||||
@ -182,7 +171,7 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
|
|||||||
data = json.loads(s)
|
data = json.loads(s)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def get_extended_metadata(self, file_id: str) -> dict:
|
def get_extended_metadata(self, file_id: str) -> Dict:
|
||||||
"""
|
"""
|
||||||
Retrieve extended metadata for a file in SharePoint.
|
Retrieve extended metadata for a file in SharePoint.
|
||||||
As of today, following fields are supported in the extended metadata:
|
As of today, following fields are supported in the extended metadata:
|
||||||
|
Loading…
Reference in New Issue
Block a user