mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-30 19:49:09 +00:00
Harrison/rec gd (#3054)
Co-authored-by: Benjamin Scholtz <BenSchZA@users.noreply.github.com>
This commit is contained in:
parent
eee2f23a79
commit
5107fac656
@ -44,7 +44,11 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"loader = GoogleDriveLoader(folder_id=\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\")"
|
"loader = GoogleDriveLoader(\n",
|
||||||
|
" folder_id=\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\",\n",
|
||||||
|
" # Optional: configure whether to recursively fetch files from subfolders. Defaults to False.\n",
|
||||||
|
" recursive=False\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
# https://cloud.google.com/iam/docs/service-accounts-create
|
# https://cloud.google.com/iam/docs/service-accounts-create
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
from pydantic import BaseModel, root_validator, validator
|
from pydantic import BaseModel, root_validator, validator
|
||||||
|
|
||||||
@ -29,6 +29,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
folder_id: Optional[str] = None
|
folder_id: Optional[str] = None
|
||||||
document_ids: Optional[List[str]] = None
|
document_ids: Optional[List[str]] = None
|
||||||
file_ids: Optional[List[str]] = None
|
file_ids: Optional[List[str]] = None
|
||||||
|
recursive: bool = False
|
||||||
|
|
||||||
@root_validator
|
@root_validator
|
||||||
def validate_folder_id_or_document_ids(
|
def validate_folder_id_or_document_ids(
|
||||||
@ -170,35 +171,49 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
}
|
}
|
||||||
return Document(page_content=text, metadata=metadata)
|
return Document(page_content=text, metadata=metadata)
|
||||||
|
|
||||||
def _load_documents_from_folder(self) -> List[Document]:
|
def _load_documents_from_folder(self, folder_id: str) -> List[Document]:
|
||||||
"""Load documents from a folder."""
|
"""Load documents from a folder."""
|
||||||
from googleapiclient.discovery import build
|
from googleapiclient.discovery import build
|
||||||
|
|
||||||
creds = self._load_credentials()
|
creds = self._load_credentials()
|
||||||
service = build("drive", "v3", credentials=creds)
|
service = build("drive", "v3", credentials=creds)
|
||||||
|
files = self._fetch_files_recursive(service, folder_id)
|
||||||
|
returns = []
|
||||||
|
for file in files:
|
||||||
|
if file["mimeType"] == "application/vnd.google-apps.document":
|
||||||
|
returns.append(self._load_document_from_id(file["id"])) # type: ignore
|
||||||
|
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
|
||||||
|
returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore
|
||||||
|
elif file["mimeType"] == "application/pdf":
|
||||||
|
returns.extend(self._load_file_from_id(file["id"])) # type: ignore
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return returns
|
||||||
|
|
||||||
|
def _fetch_files_recursive(
|
||||||
|
self, service: Any, folder_id: str
|
||||||
|
) -> List[Dict[str, Union[str, List[str]]]]:
|
||||||
|
"""Fetch all files and subfolders recursively."""
|
||||||
results = (
|
results = (
|
||||||
service.files()
|
service.files()
|
||||||
.list(
|
.list(
|
||||||
q=f"'{self.folder_id}' in parents",
|
q=f"'{folder_id}' in parents",
|
||||||
pageSize=1000,
|
pageSize=1000,
|
||||||
includeItemsFromAllDrives=True,
|
includeItemsFromAllDrives=True,
|
||||||
supportsAllDrives=True,
|
supportsAllDrives=True,
|
||||||
fields="nextPageToken, files(id, name, mimeType)",
|
fields="nextPageToken, files(id, name, mimeType, parents)",
|
||||||
)
|
)
|
||||||
.execute()
|
.execute()
|
||||||
)
|
)
|
||||||
items = results.get("files", [])
|
files = results.get("files", [])
|
||||||
returns = []
|
returns = []
|
||||||
for item in items:
|
for file in files:
|
||||||
if item["mimeType"] == "application/vnd.google-apps.document":
|
if file["mimeType"] == "application/vnd.google-apps.folder":
|
||||||
returns.append(self._load_document_from_id(item["id"]))
|
if self.recursive:
|
||||||
elif item["mimeType"] == "application/vnd.google-apps.spreadsheet":
|
returns.extend(self._fetch_files_recursive(service, file["id"]))
|
||||||
returns.extend(self._load_sheet_from_id(item["id"]))
|
|
||||||
elif item["mimeType"] == "application/pdf":
|
|
||||||
returns.extend(self._load_file_from_id(item["id"]))
|
|
||||||
else:
|
else:
|
||||||
pass
|
returns.append(file)
|
||||||
|
|
||||||
return returns
|
return returns
|
||||||
|
|
||||||
@ -256,7 +271,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load documents."""
|
"""Load documents."""
|
||||||
if self.folder_id:
|
if self.folder_id:
|
||||||
return self._load_documents_from_folder()
|
return self._load_documents_from_folder(self.folder_id)
|
||||||
elif self.document_ids:
|
elif self.document_ids:
|
||||||
return self._load_documents_from_ids()
|
return self._load_documents_from_ids()
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user