mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-14 00:47:27 +00:00
Add titles to metadatas in gdrive loader (#2260)
I noticed the Google Drive loader does not include the "title" metadata for Google Docs and PDFs. This just adds that info so they match what is already done for Sheets.
This commit is contained in:
parent
4b59bb55c7
commit
64f44c6483
@ -148,6 +148,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
creds = self._load_credentials()
|
creds = self._load_credentials()
|
||||||
service = build("drive", "v3", credentials=creds)
|
service = build("drive", "v3", credentials=creds)
|
||||||
|
|
||||||
|
file = service.files().get(fileId=id).execute()
|
||||||
request = service.files().export_media(fileId=id, mimeType="text/plain")
|
request = service.files().export_media(fileId=id, mimeType="text/plain")
|
||||||
fh = BytesIO()
|
fh = BytesIO()
|
||||||
downloader = MediaIoBaseDownload(fh, request)
|
downloader = MediaIoBaseDownload(fh, request)
|
||||||
@ -163,7 +164,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
print("An error occurred: {}".format(e))
|
print("An error occurred: {}".format(e))
|
||||||
|
|
||||||
text = fh.getvalue().decode("utf-8")
|
text = fh.getvalue().decode("utf-8")
|
||||||
metadata = {"source": f"https://docs.google.com/document/d/{id}/edit"}
|
metadata = {
|
||||||
|
"source": f"https://docs.google.com/document/d/{id}/edit",
|
||||||
|
"title": f"{file.get('name')}",
|
||||||
|
}
|
||||||
return Document(page_content=text, metadata=metadata)
|
return Document(page_content=text, metadata=metadata)
|
||||||
|
|
||||||
def _load_documents_from_folder(self) -> List[Document]:
|
def _load_documents_from_folder(self) -> List[Document]:
|
||||||
@ -213,6 +217,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
creds = self._load_credentials()
|
creds = self._load_credentials()
|
||||||
service = build("drive", "v3", credentials=creds)
|
service = build("drive", "v3", credentials=creds)
|
||||||
|
|
||||||
|
file = service.files().get(fileId=id).execute()
|
||||||
request = service.files().get_media(fileId=id)
|
request = service.files().get_media(fileId=id)
|
||||||
fh = BytesIO()
|
fh = BytesIO()
|
||||||
downloader = MediaIoBaseDownload(fh, request)
|
downloader = MediaIoBaseDownload(fh, request)
|
||||||
@ -230,6 +235,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
page_content=page.extract_text(),
|
page_content=page.extract_text(),
|
||||||
metadata={
|
metadata={
|
||||||
"source": f"https://drive.google.com/file/d/{id}/view",
|
"source": f"https://drive.google.com/file/d/{id}/view",
|
||||||
|
"title": f"{file.get('name')}",
|
||||||
"page": i,
|
"page": i,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user