From 2df8ac402abab8087ef8f514c02a3a1fc82b8114 Mon Sep 17 00:00:00 2001
From: Martin Triska
Date: Thu, 23 May 2024 17:42:19 +0200
Subject: [PATCH] community[minor]: Added propagation of document metadata from
 O365BaseLoader (#20663)

**Description:**
- Added propagation of document metadata from `O365BaseLoader` to the blobs it
  yields (`O365BaseLoader` uses `FileSystemBlobLoader` under the hood).
- This is done by building a dictionary `metadata_dict` while downloading
  (key = filename, value = a dictionary containing the document's metadata).
- As `FileSystemBlobLoader` yields each blob, `_load_from_folder` looks the
  blob up in `metadata_dict` (including its `mime_type`, if available) and
  merges the matching metadata into `blob.metadata`.

**Issue:**
- `O365BaseLoader` downloads documents to a temp folder under the hood and
  then runs `FileSystemBlobLoader` on it.
- However, metadata about the original document is lost in this process. In
  particular:
  - `mime_type`: `FileSystemBlobLoader` guesses the `mime_type` from the file
    extension, but that does not work 100% of the time.
  - `web_url`: this is useful to keep around, since in a RAG application we
    might want to provide a link to the source document. To work well with the
    document parsers, we pass the `web_url` as `source` (`web_url` is ignored
    by parsers, while `source` is preserved).
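
**Example:**
A minimal usage sketch of the new behavior (the library ID is a placeholder,
and this assumes `O365_CLIENT_ID`/`O365_CLIENT_SECRET` credentials are
configured as for any O365-based loader):

```python
from langchain_community.document_loaders.sharepoint import SharePointLoader

# "YOUR_LIBRARY_ID" is a placeholder for a real SharePoint document library id.
loader = SharePointLoader(document_library_id="YOUR_LIBRARY_ID")

for doc in loader.lazy_load():
    # "source" now carries the document's web_url rather than the temp-file
    # path, alongside the other propagated O365 metadata.
    print(doc.metadata.get("source"), doc.metadata.get("mime_type"))
```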
**Dependencies:** None

**Twitter handle:** @martintriska1

Please review @baskaryan

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev
---
 .../document_loaders/base_o365.py             | 24 ++++++++++++++++---
 .../document_loaders/sharepoint.py            |  5 +++-
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py
index ddf95bdc764..33a7c5a818a 100644
--- a/libs/community/langchain_community/document_loaders/base_o365.py
+++ b/libs/community/langchain_community/document_loaders/base_o365.py
@@ -1,4 +1,5 @@
 """Base class for all loaders that uses O365 Package"""
+
 from __future__ import annotations
 
 import logging
@@ -6,8 +7,8 @@ import os
 import tempfile
 from abc import abstractmethod
 from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union
+from pathlib import Path, PurePath
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Sequence, Union
 
 from langchain_core.pydantic_v1 import (
     BaseModel,
@@ -108,14 +109,31 @@ class O365BaseLoader(BaseLoader, BaseModel):
         """
         file_mime_types = self._fetch_mime_types
         items = folder.get_items()
+        metadata_dict: Dict[str, Dict[str, Any]] = {}
         with tempfile.TemporaryDirectory() as temp_dir:
             os.makedirs(os.path.dirname(temp_dir), exist_ok=True)
             for file in items:
                 if file.is_file:
                     if file.mime_type in list(file_mime_types.values()):
                         file.download(to_path=temp_dir, chunk_size=self.chunk_size)
+                        metadata_dict[file.name] = {
+                            "source": file.web_url,
+                            "mime_type": file.mime_type,
+                            "created": file.created,
+                            "modified": file.modified,
+                            "created_by": str(file.created_by),
+                            "modified_by": str(file.modified_by),
+                            "description": file.description,
+                        }
+
             loader = FileSystemBlobLoader(path=temp_dir)
-            yield from loader.yield_blobs()
+            for blob in loader.yield_blobs():
+                if not isinstance(blob.path, PurePath):
+                    raise NotImplementedError("Expected blob path to be a PurePath")
+                if blob.path:
+                    file_metadata_ = metadata_dict.get(str(blob.path), {})
+                    blob.metadata.update(file_metadata_)
+                yield blob
         if self.recursive:
             for subfolder in folder.get_child_folders():
                 yield from self._load_from_folder(subfolder)
diff --git a/libs/community/langchain_community/document_loaders/sharepoint.py b/libs/community/langchain_community/document_loaders/sharepoint.py
index bfcc47fba1b..5eb02df867a 100644
--- a/libs/community/langchain_community/document_loaders/sharepoint.py
+++ b/libs/community/langchain_community/document_loaders/sharepoint.py
@@ -1,4 +1,5 @@
 """Loader that loads data from Sharepoint Document Library"""
+
 from __future__ import annotations
 
 import json
@@ -82,7 +83,9 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
         if not isinstance(target_folder, Folder):
             raise ValueError("Unable to fetch root folder")
         for blob in self._load_from_folder(target_folder):
-            yield from blob_parser.lazy_parse(blob)
+            for blob_part in blob_parser.lazy_parse(blob):
+                blob_part.metadata.update(blob.metadata)
+                yield blob_part
 
     def authorized_identities(self) -> List:
         data = self._fetch_access_token()