From 05ebe1e66b0ec998e91dfbdd8e7afb024d091ed6 Mon Sep 17 00:00:00 2001 From: Martin Triska Date: Fri, 13 Dec 2024 18:30:17 +0100 Subject: [PATCH] Community: add `modified_since` argument to `O365BaseLoader` (#28708) ## What are we doing in this PR We're adding an optional `modified_since` argument to `O365BaseLoader`. When set, the O365 loader will only load documents newer than the `modified_since` datetime. ## Why? OneDrives / Sharepoints can contain a large number of documents. The current approach is to download and parse all files and let the indexer deal with duplicates. This can be prohibitively time-consuming, especially when using an OCR-based parser like [zerox](https://github.com/langchain-ai/langchain/blob/fa0618883493cf6a1447a73b66cd10c0f028e09b/libs/community/langchain_community/document_loaders/pdf.py#L948). This argument allows skipping documents that are older than the known time of indexing. _Q: What if a file was modified during the last indexing process? A: Users can set `modified_since` conservatively and the indexer will still take care of duplicates._ If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. 
--------- Co-authored-by: Erick Friis --- .../document_loaders/base_o365.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py index 981a637cbb3..4cd341fadde 100644 --- a/libs/community/langchain_community/document_loaders/base_o365.py +++ b/libs/community/langchain_community/document_loaders/base_o365.py @@ -9,6 +9,7 @@ import re import tempfile import urllib from abc import abstractmethod +from datetime import datetime from pathlib import Path, PurePath from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union @@ -86,6 +87,9 @@ class O365BaseLoader(BaseLoader, BaseModel): """Number of bytes to retrieve from each api call to the server. int or 'auto'.""" recursive: bool = False """Should the loader recursively load subfolders?""" + modified_since: Optional[datetime] = None + """Only fetch documents modified since given datetime. The datetime object + must be timezone aware.""" handlers: Optional[Dict[str, Any]] = {} """ Provide custom handlers for MimeTypeBasedParser. 
@@ -188,26 +192,29 @@ class O365BaseLoader(BaseLoader, BaseModel): for file in items: if file.is_file: if file.mime_type in list(file_mime_types.values()): - source = file.web_url - if re.search( - r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url + if (not self.modified_since) or ( + file.modified > self.modified_since ): - source = ( - file._parent.web_url - + "/" - + urllib.parse.quote(file.name) - ) - file.download(to_path=temp_dir, chunk_size=self.chunk_size) - metadata_dict[file.name] = { - "source": source, - "mime_type": file.mime_type, - "created": str(file.created), - "modified": str(file.modified), - "created_by": str(file.created_by), - "modified_by": str(file.modified_by), - "description": file.description, - "id": str(file.object_id), - } + source = file.web_url + if re.search( + r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url + ): + source = ( + file._parent.web_url + + "/" + + urllib.parse.quote(file.name) + ) + file.download(to_path=temp_dir, chunk_size=self.chunk_size) + metadata_dict[file.name] = { + "source": source, + "mime_type": file.mime_type, + "created": str(file.created), + "modified": str(file.modified), + "created_by": str(file.created_by), + "modified_by": str(file.modified_by), + "description": file.description, + "id": str(file.object_id), + } loader = FileSystemBlobLoader(path=temp_dir) for blob in loader.yield_blobs():