Community: add modified_since argument to O365BaseLoader (#28708)

## What are we doing in this PR
We're adding an optional `modified_since` argument to `O365BaseLoader`.
When set, the O365 loader will only load documents modified after the
`modified_since` datetime.

## Why?
OneDrive and SharePoint sites can contain a large number of documents. The
current approach is to download and parse all files and let the indexer deal
with duplicates. This can be prohibitively time-consuming, especially when
using an OCR-based parser like
[zerox](fa06188834/libs/community/langchain_community/document_loaders/pdf.py (L948)).
This argument allows skipping documents that are older than the known time
of indexing.

_Q: What if a file was modified during the last indexing process?
A: Users can set `modified_since` conservatively, and the indexer will
still take care of duplicates._




If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Martin Triska 2024-12-13 18:30:17 +01:00 committed by GitHub
parent c855d434c5
commit 05ebe1e66b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -9,6 +9,7 @@ import re
import tempfile import tempfile
import urllib import urllib
from abc import abstractmethod from abc import abstractmethod
from datetime import datetime
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
@ -86,6 +87,9 @@ class O365BaseLoader(BaseLoader, BaseModel):
"""Number of bytes to retrieve from each api call to the server. int or 'auto'.""" """Number of bytes to retrieve from each api call to the server. int or 'auto'."""
recursive: bool = False recursive: bool = False
"""Should the loader recursively load subfolders?""" """Should the loader recursively load subfolders?"""
modified_since: Optional[datetime] = None
"""Only fetch documents modified since given datetime. The datetime object
must be timezone aware."""
handlers: Optional[Dict[str, Any]] = {} handlers: Optional[Dict[str, Any]] = {}
""" """
Provide custom handlers for MimeTypeBasedParser. Provide custom handlers for MimeTypeBasedParser.
@ -188,6 +192,9 @@ class O365BaseLoader(BaseLoader, BaseModel):
for file in items: for file in items:
if file.is_file: if file.is_file:
if file.mime_type in list(file_mime_types.values()): if file.mime_type in list(file_mime_types.values()):
if (not self.modified_since) or (
file.modified > self.modified_since
):
source = file.web_url source = file.web_url
if re.search( if re.search(
r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url