community: [bugfix] fix source path for office files in O365 (#28260)

# What problem are we fixing? Currently, documents loaded using `O365BaseLoader` fetch their source from `file.web_url` (where `file` is `<class 'O365.drive.File'>`). This works well for `.pdf` documents. Unfortunately, Office documents (`.xlsx`, `.docx`, ...) pass their `web_url` in the following format: `https://sharepoint_address/sites/path/to/library/root/Doc.aspx?sourcedoc=%XXXXXXXX-1111-1111-XXXX-XXXXXXXXXX%7D&file=filename.xlsx&action=default&mobileredirect=true` This obfuscates the path to the file. This PR uses the parent folder's path and the file name to reconstruct the actual location of the file. Knowing the file's location can be crucial for some RAG applications (the path to the file can carry information we don't want to lose). @vbarda Could you please look at this one? I'm @-mentioning you since we've already closed some PRs together :-) Co-authored-by: Erick Friis <erick@langchain.dev>
2025-09-26 13:59:49 +00:00 · 2024-12-09 21:34:59 +01:00
parent 534b8f4364
commit 75bc6bb191
1 changed files with 22 additions and 2 deletions
--- a/libs/community/langchain_community/document_loaders/base_o365.py
+++ b/libs/community/langchain_community/document_loaders/base_o365.py
@@ -5,7 +5,9 @@ from __future__ import annotations
 import logging
 import mimetypes
 import os
+import re
 import tempfile
+import urllib
 from abc import abstractmethod
 from pathlib import Path, PurePath
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
@@ -186,9 +188,18 @@ class O365BaseLoader(BaseLoader, BaseModel):
            for file in items:
                if file.is_file:
                    if file.mime_type in list(file_mime_types.values()):
+                        source = file.web_url
+                        if re.search(
+                            r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
+                        ):
+                            source = (
+                                file._parent.web_url
+                                + "/"
+                                + urllib.parse.quote(file.name)
+                            )
                        file.download(to_path=temp_dir, chunk_size=self.chunk_size)
                        metadata_dict[file.name] = {
-                            "source": file.web_url,
+                            "source": source,
                            "mime_type": file.mime_type,
                            "created": str(file.created),
                            "modified": str(file.modified),
@@ -241,9 +252,18 @@ class O365BaseLoader(BaseLoader, BaseModel):
                    continue
                if file.is_file:
                    if file.mime_type in list(file_mime_types.values()):
+                        source = file.web_url
+                        if re.search(
+                            r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
+                        ):
+                            source = (
+                                file._parent.web_url
+                                + "/"
+                                + urllib.parse.quote(file.name)
+                            )
                        file.download(to_path=temp_dir, chunk_size=self.chunk_size)
                        metadata_dict[file.name] = {
-                            "source": file.web_url,
+                            "source": source,
                            "mime_type": file.mime_type,
                            "created": file.created,
                            "modified": file.modified,