community: [bugfix] fix source path for office files in O365 (#28260)

# What problem are we fixing?

Currently, documents loaded using `O365BaseLoader` fetch their source from
`file.web_url` (where `file` is `<class 'O365.drive.File'>`). This works
well for `.pdf` documents. Unfortunately, Office documents (`.xlsx`,
`.docx`, ...) pass their `web_url` in the following format:

`https://sharepoint_address/sites/path/to/library/root/Doc.aspx?sourcedoc=%XXXXXXXX-1111-1111-XXXX-XXXXXXXXXX%7D&file=filename.xlsx&action=default&mobileredirect=true`

This obfuscates the path to the file. This PR utilizes the parent
folder's path and the file name to reconstruct the actual location of the
file. Knowing the file's location can be crucial for some RAG
applications (the path to the file can carry information we don't want to
lose).

@vbarda Could you please look at this one? I'm @-mentioning you since
we've already closed some PRs together :-)

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Martin Triska 2024-12-09 21:34:59 +01:00 committed by GitHub
parent 534b8f4364
commit 75bc6bb191
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -5,7 +5,9 @@ from __future__ import annotations
import logging
import mimetypes
import os
import re
import tempfile
import urllib
from abc import abstractmethod
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
@ -186,9 +188,18 @@ class O365BaseLoader(BaseLoader, BaseModel):
for file in items:
if file.is_file:
if file.mime_type in list(file_mime_types.values()):
source = file.web_url
if re.search(
r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
):
source = (
file._parent.web_url
+ "/"
+ urllib.parse.quote(file.name)
)
file.download(to_path=temp_dir, chunk_size=self.chunk_size)
metadata_dict[file.name] = {
"source": file.web_url,
"source": source,
"mime_type": file.mime_type,
"created": str(file.created),
"modified": str(file.modified),
@ -241,9 +252,18 @@ class O365BaseLoader(BaseLoader, BaseModel):
continue
if file.is_file:
if file.mime_type in list(file_mime_types.values()):
source = file.web_url
if re.search(
r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
):
source = (
file._parent.web_url
+ "/"
+ urllib.parse.quote(file.name)
)
file.download(to_path=temp_dir, chunk_size=self.chunk_size)
metadata_dict[file.name] = {
"source": file.web_url,
"source": source,
"mime_type": file.mime_type,
"created": file.created,
"modified": file.modified,