mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-13 16:36:06 +00:00
Add titles to metadatas in gdrive loader (#2260)
I noticed the Google Drive loader does not include the "title" metadata for Google Docs and PDFs. This just adds that info to match what the loader already provides for Google Sheets.
This commit is contained in:
parent
4b59bb55c7
commit
64f44c6483
@ -148,6 +148,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
creds = self._load_credentials()
|
||||
service = build("drive", "v3", credentials=creds)
|
||||
|
||||
file = service.files().get(fileId=id).execute()
|
||||
request = service.files().export_media(fileId=id, mimeType="text/plain")
|
||||
fh = BytesIO()
|
||||
downloader = MediaIoBaseDownload(fh, request)
|
||||
@ -163,7 +164,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
print("An error occurred: {}".format(e))
|
||||
|
||||
text = fh.getvalue().decode("utf-8")
|
||||
metadata = {"source": f"https://docs.google.com/document/d/{id}/edit"}
|
||||
metadata = {
|
||||
"source": f"https://docs.google.com/document/d/{id}/edit",
|
||||
"title": f"{file.get('name')}",
|
||||
}
|
||||
return Document(page_content=text, metadata=metadata)
|
||||
|
||||
def _load_documents_from_folder(self) -> List[Document]:
|
||||
@ -213,6 +217,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
creds = self._load_credentials()
|
||||
service = build("drive", "v3", credentials=creds)
|
||||
|
||||
file = service.files().get(fileId=id).execute()
|
||||
request = service.files().get_media(fileId=id)
|
||||
fh = BytesIO()
|
||||
downloader = MediaIoBaseDownload(fh, request)
|
||||
@ -230,6 +235,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
page_content=page.extract_text(),
|
||||
metadata={
|
||||
"source": f"https://drive.google.com/file/d/{id}/view",
|
||||
"title": f"{file.get('name')}",
|
||||
"page": i,
|
||||
},
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user