From 05ebe1e66b0ec998e91dfbdd8e7afb024d091ed6 Mon Sep 17 00:00:00 2001 From: Martin Triska Date: Fri, 13 Dec 2024 18:30:17 +0100 Subject: [PATCH] Community: add `modified_since` argument to `O365BaseLoader` (#28708) ## What are we doing in this PR We're adding an optional `modified_since` argument to `O365BaseLoader`. When set, the O365 loader will only load documents newer than the `modified_since` datetime. ## Why? OneDrives / Sharepoints can contain a large number of documents. The current approach is to download and parse all files and let the indexer deal with duplicates. This can be prohibitively time-consuming, especially when using an OCR-based parser like [zerox](https://github.com/langchain-ai/langchain/blob/fa0618883493cf6a1447a73b66cd10c0f028e09b/libs/community/langchain_community/document_loaders/pdf.py#L948). This argument allows skipping documents that are older than the known time of indexing. _Q: What if a file was modified during the last indexing process? A: Users can set `modified_since` conservatively and the indexer will still take care of duplicates._ If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. 
--------- Co-authored-by: Erick Friis --- .../document_loaders/base_o365.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py index 981a637cbb3..4cd341fadde 100644 --- a/libs/community/langchain_community/document_loaders/base_o365.py +++ b/libs/community/langchain_community/document_loaders/base_o365.py @@ -9,6 +9,7 @@ import re import tempfile import urllib from abc import abstractmethod +from datetime import datetime from pathlib import Path, PurePath from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union @@ -86,6 +87,9 @@ class O365BaseLoader(BaseLoader, BaseModel): """Number of bytes to retrieve from each api call to the server. int or 'auto'.""" recursive: bool = False """Should the loader recursively load subfolders?""" + modified_since: Optional[datetime] = None + """Only fetch documents modified since given datetime. The datetime object + must be timezone aware.""" handlers: Optional[Dict[str, Any]] = {} """ Provide custom handlers for MimeTypeBasedParser. 
@@ -188,26 +192,29 @@ class O365BaseLoader(BaseLoader, BaseModel): for file in items: if file.is_file: if file.mime_type in list(file_mime_types.values()): - source = file.web_url - if re.search( - r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url + if (not self.modified_since) or ( + file.modified > self.modified_since ): - source = ( - file._parent.web_url - + "/" - + urllib.parse.quote(file.name) - ) - file.download(to_path=temp_dir, chunk_size=self.chunk_size) - metadata_dict[file.name] = { - "source": source, - "mime_type": file.mime_type, - "created": str(file.created), - "modified": str(file.modified), - "created_by": str(file.created_by), - "modified_by": str(file.modified_by), - "description": file.description, - "id": str(file.object_id), - } + source = file.web_url + if re.search( + r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url + ): + source = ( + file._parent.web_url + + "/" + + urllib.parse.quote(file.name) + ) + file.download(to_path=temp_dir, chunk_size=self.chunk_size) + metadata_dict[file.name] = { + "source": source, + "mime_type": file.mime_type, + "created": str(file.created), + "modified": str(file.modified), + "created_by": str(file.created_by), + "modified_by": str(file.modified_by), + "description": file.description, + "id": str(file.object_id), + } loader = FileSystemBlobLoader(path=temp_dir) for blob in loader.yield_blobs():