mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 14:49:29 +00:00
community: [bugfix] fix source path for office files in O365 (#28260)
# What problem are we fixing? Currently, documents loaded using `O365BaseLoader` fetch their source from `file.web_url` (where `file` is `<class 'O365.drive.File'>`). This works well for `.pdf` documents. Unfortunately, Office documents (`.xlsx`, `.docx`, ...) pass their `web_url` in the following format: `https://sharepoint_address/sites/path/to/library/root/Doc.aspx?sourcedoc=%XXXXXXXX-1111-1111-XXXX-XXXXXXXXXX%7D&file=filename.xlsx&action=default&mobileredirect=true`. This obfuscates the path to the file. This PR uses the parent folder's path and the file name to reconstruct the actual location of the file. Knowing the file's location can be crucial for some RAG applications (the path to the file can carry information we don't want to lose). @vbarda Could you please look at this one? I'm @-mentioning you since we've already closed some PRs together :-) Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
534b8f4364
commit
75bc6bb191
@ -5,7 +5,9 @@ from __future__ import annotations
|
|||||||
import logging
|
import logging
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import urllib
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
|
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
|
||||||
@ -186,9 +188,18 @@ class O365BaseLoader(BaseLoader, BaseModel):
|
|||||||
for file in items:
|
for file in items:
|
||||||
if file.is_file:
|
if file.is_file:
|
||||||
if file.mime_type in list(file_mime_types.values()):
|
if file.mime_type in list(file_mime_types.values()):
|
||||||
|
source = file.web_url
|
||||||
|
if re.search(
|
||||||
|
r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
|
||||||
|
):
|
||||||
|
source = (
|
||||||
|
file._parent.web_url
|
||||||
|
+ "/"
|
||||||
|
+ urllib.parse.quote(file.name)
|
||||||
|
)
|
||||||
file.download(to_path=temp_dir, chunk_size=self.chunk_size)
|
file.download(to_path=temp_dir, chunk_size=self.chunk_size)
|
||||||
metadata_dict[file.name] = {
|
metadata_dict[file.name] = {
|
||||||
"source": file.web_url,
|
"source": source,
|
||||||
"mime_type": file.mime_type,
|
"mime_type": file.mime_type,
|
||||||
"created": str(file.created),
|
"created": str(file.created),
|
||||||
"modified": str(file.modified),
|
"modified": str(file.modified),
|
||||||
@ -241,9 +252,18 @@ class O365BaseLoader(BaseLoader, BaseModel):
|
|||||||
continue
|
continue
|
||||||
if file.is_file:
|
if file.is_file:
|
||||||
if file.mime_type in list(file_mime_types.values()):
|
if file.mime_type in list(file_mime_types.values()):
|
||||||
|
source = file.web_url
|
||||||
|
if re.search(
|
||||||
|
r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
|
||||||
|
):
|
||||||
|
source = (
|
||||||
|
file._parent.web_url
|
||||||
|
+ "/"
|
||||||
|
+ urllib.parse.quote(file.name)
|
||||||
|
)
|
||||||
file.download(to_path=temp_dir, chunk_size=self.chunk_size)
|
file.download(to_path=temp_dir, chunk_size=self.chunk_size)
|
||||||
metadata_dict[file.name] = {
|
metadata_dict[file.name] = {
|
||||||
"source": file.web_url,
|
"source": source,
|
||||||
"mime_type": file.mime_type,
|
"mime_type": file.mime_type,
|
||||||
"created": file.created,
|
"created": file.created,
|
||||||
"modified": file.modified,
|
"modified": file.modified,
|
||||||
|
Loading…
Reference in New Issue
Block a user