mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-06 03:27:55 +00:00
Support loading files from Dropbox (#8271)
## Description This commit introduces the `DropboxLoader` class, a new document loader that allows loading files from Dropbox into the application. The loader relies on a Dropbox app, which requires creating an app on Dropbox, obtaining the necessary scope permissions, and generating an access token. Additionally, the dropbox Python package is required. The `DropboxLoader` class is designed to be used as a document loader for processing various file types, including text files, PDFs, and Dropbox Paper files. ## Dependencies `pip install dropbox` and `pip install unstructured` for PDF reading. ## Tag maintainer @rlancemartin, @eyurtsev (from Data Loaders). I'd appreciate some feedback here 🙏 . ## Social Networks https://github.com/rubenbarragan https://www.linkedin.com/in/rgbarragan/ https://twitter.com/RubenBarraganP --------- Co-authored-by: Ruben Barragan <rbarragan@Rubens-MacBook-Air.local>
This commit is contained in:
parent
41bb3a6f9b
commit
ef6332ead6
File diff suppressed because one or more lines are too long
@ -38,6 +38,7 @@ from langchain.document_loaders.diffbot import DiffbotLoader
|
||||
from langchain.document_loaders.directory import DirectoryLoader
|
||||
from langchain.document_loaders.discord import DiscordChatLoader
|
||||
from langchain.document_loaders.docugami import DocugamiLoader
|
||||
from langchain.document_loaders.dropbox import DropboxLoader
|
||||
from langchain.document_loaders.duckdb_loader import DuckDBLoader
|
||||
from langchain.document_loaders.email import (
|
||||
OutlookMessageLoader,
|
||||
@ -194,6 +195,7 @@ __all__ = [
|
||||
"DiscordChatLoader",
|
||||
"DocugamiLoader",
|
||||
"Docx2txtLoader",
|
||||
"DropboxLoader",
|
||||
"DuckDBLoader",
|
||||
"EmbaasBlobLoader",
|
||||
"EmbaasLoader",
|
||||
|
172
libs/langchain/langchain/document_loaders/dropbox.py
Normal file
172
libs/langchain/langchain/document_loaders/dropbox.py
Normal file
@ -0,0 +1,172 @@
|
||||
"""Loads data from Dropbox."""
|
||||
|
||||
# Prerequisites:
|
||||
# 1. Create a Dropbox app.
|
||||
# 2. Give the app these scope permissions: `files.metadata.read`
|
||||
# and `files.content.read`.
|
||||
# 3. Generate access token: https://www.dropbox.com/developers/apps/create.
|
||||
# 4. `pip install dropbox` (requires `pip install unstructured` for PDF filetype).
|
||||
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pydantic import BaseModel, root_validator
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class DropboxLoader(BaseLoader, BaseModel):
    """Load files from Dropbox.

    In addition to common files such as text and PDF files, it also supports
    *Dropbox Paper* files.

    Exactly one of ``dropbox_folder_path`` or ``dropbox_file_paths`` must be
    provided; this is enforced by ``validate_inputs``.
    """

    dropbox_access_token: str
    """Dropbox access token."""
    dropbox_folder_path: Optional[str] = None
    """The folder path to load from."""
    dropbox_file_paths: Optional[List[str]] = None
    """The file paths to load from."""
    recursive: bool = False
    """Flag to indicate whether to load files recursively from subfolders."""

    @root_validator
    def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Validate that either folder_path or file_paths is set, but not both.

        Args:
            values: Field values collected by pydantic.

        Returns:
            The unchanged ``values`` mapping.

        Raises:
            ValueError: If both inputs are given, or if no folder path is
                given and the file path list is missing or empty.
        """
        folder_path = values.get("dropbox_folder_path")
        file_paths = values.get("dropbox_file_paths")

        if folder_path is not None and file_paths is not None:
            raise ValueError("Cannot specify both folder_path and file_paths")
        # An empty list of file paths counts as "not specified".
        if folder_path is None and not file_paths:
            raise ValueError("Must specify either folder_path or file_paths")

        return values

    def _create_dropbox_client(self) -> Any:
        """Create a Dropbox client and verify the access token.

        Returns:
            A ``dropbox.Dropbox`` client built from ``dropbox_access_token``.

        Raises:
            ImportError: If the ``dropbox`` package is not installed.
            ValueError: If Dropbox rejects the access token.
        """
        try:
            from dropbox import Dropbox, exceptions
        except ImportError as ex:
            raise ImportError("You must run `pip install dropbox`") from ex

        try:
            dbx = Dropbox(self.dropbox_access_token)
            # Cheap authenticated call so a bad token fails here, not mid-load.
            dbx.users_get_current_account()
        except exceptions.AuthError as ex:
            raise ValueError(
                "Invalid Dropbox access token. Please verify your token and try again."
            ) from ex
        return dbx

    def _load_documents_from_folder(self, folder_path: str) -> List[Document]:
        """Load documents from a Dropbox folder.

        Args:
            folder_path: Dropbox path of the folder to list. Subfolders are
                traversed only when ``recursive`` is true.

        Returns:
            One document per file that could be loaded; files that cannot be
            turned into a document are skipped.

        Raises:
            ImportError: If the ``dropbox`` package is not installed.
            ValueError: If the folder cannot be listed.
        """
        dbx = self._create_dropbox_client()

        try:
            from dropbox import exceptions
            from dropbox.files import FileMetadata
        except ImportError as ex:
            raise ImportError("You must run `pip install dropbox`") from ex

        try:
            results = dbx.files_list_folder(folder_path, recursive=self.recursive)
            entries = list(results.entries)
            # The listing is paginated: keep following the cursor, otherwise
            # only the first page of a large folder would be loaded.
            while results.has_more:
                results = dbx.files_list_folder_continue(results.cursor)
                entries.extend(results.entries)
        except exceptions.ApiError as ex:
            raise ValueError(
                f"Could not list files in the folder: {folder_path}. "
                "Please verify the folder path and try again."
            ) from ex

        documents = []
        for entry in entries:
            # Folder and deleted entries are not loadable files.
            if not isinstance(entry, FileMetadata):
                continue
            # Reuse the already validated client instead of one per file.
            doc = self._load_file_from_path(entry.path_display, dbx=dbx)
            if doc is not None:
                documents.append(doc)
        return documents

    def _load_file_from_path(
        self, file_path: str, dbx: Optional[Any] = None
    ) -> Optional[Document]:
        """Load a file from a Dropbox path.

        Args:
            file_path: Dropbox path of the file to load.
            dbx: Optional existing Dropbox client to reuse. When omitted, a
                new client is created (and its token verified) for this call.

        Returns:
            A document with the file's text, or ``None`` when the file can be
            neither downloaded nor exported, or its content is not UTF-8 text
            and cannot be parsed as a PDF.

        Raises:
            ImportError: If the ``dropbox`` package is not installed.
            ValueError: If Dropbox reports an API error for ``file_path``.
        """
        if dbx is None:
            dbx = self._create_dropbox_client()

        try:
            from dropbox import exceptions
        except ImportError as ex:
            raise ImportError("You must run `pip install dropbox`") from ex

        try:
            file_metadata = dbx.files_get_metadata(file_path)

            # getattr: metadata for non-file entries (e.g. folders) may not
            # carry these attributes; treat such entries as not loadable.
            if getattr(file_metadata, "is_downloadable", False):
                _, response = dbx.files_download(file_path)
            # Some types such as Paper, need to be exported.
            elif getattr(file_metadata, "export_info", None):
                _, response = dbx.files_export(file_path, "markdown")
            else:
                # Previously this fell through with `response` unbound.
                print(
                    f"File {file_path} can be neither downloaded nor exported. "
                    "Skipping."
                )
                return None
        except exceptions.ApiError as ex:
            raise ValueError(
                f"Could not load file: {file_path}. Please verify the file path "
                "and try again."
            ) from ex

        content = response.content
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            # Binary content: only PDFs get a second chance through a parser.
            if os.path.splitext(file_path)[1].lower() == ".pdf":
                return self._load_pdf_content(file_path, content)
            print(f"File {file_path} could not be decoded as text. Skipping.")
            return None

        metadata = {
            "source": f"dropbox://{file_path}",
            "title": os.path.basename(file_path),
        }
        return Document(page_content=text, metadata=metadata)

    def _load_pdf_content(self, file_path: str, content: bytes) -> Optional[Document]:
        """Parse downloaded PDF bytes with ``UnstructuredPDFLoader``.

        Args:
            file_path: Dropbox path of the PDF; used only in error messages.
            content: Raw bytes of the PDF file.

        Returns:
            The first document produced by the PDF loader, or ``None`` when
            parsing fails or yields no documents.
        """
        from langchain.document_loaders import UnstructuredPDFLoader

        # The PDF loader takes a file path, so stage the bytes on disk; the
        # context manager removes the temporary directory once parsing is done.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_pdf = Path(temp_dir) / "tmp.pdf"
            with open(temp_pdf, mode="wb") as f:
                f.write(content)

            try:
                docs = UnstructuredPDFLoader(str(temp_pdf)).load()
            except Exception as pdf_ex:
                # Best effort: one unparsable PDF must not abort the whole load.
                print(f"Error while trying to parse PDF {file_path}: {pdf_ex}")
                return None

        return docs[0] if docs else None

    def _load_documents_from_paths(self) -> List[Document]:
        """Load documents from a list of Dropbox file paths.

        Returns:
            One document per path in ``dropbox_file_paths`` that could be
            loaded; paths that yield no document are skipped.

        Raises:
            ValueError: If ``dropbox_file_paths`` is unset or empty.
        """
        if not self.dropbox_file_paths:
            raise ValueError("file_paths must be set")

        # Validate the token once and share the client across all files.
        dbx = self._create_dropbox_client()
        documents = []
        for file_path in self.dropbox_file_paths:
            doc = self._load_file_from_path(file_path, dbx=dbx)
            if doc is not None:
                documents.append(doc)
        return documents

    def load(self) -> List[Document]:
        """Load documents from the configured folder or file paths.

        Returns:
            The documents loaded from ``dropbox_folder_path`` when it is set,
            otherwise from ``dropbox_file_paths``.
        """
        if self.dropbox_folder_path is not None:
            return self._load_documents_from_folder(self.dropbox_folder_path)
        return self._load_documents_from_paths()
|
Loading…
Reference in New Issue
Block a user