mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 05:13:46 +00:00
Community: add modified_since
argument to O365BaseLoader
(#28708)
## What are we doing in this PR
We're adding an optional `modified_since` argument to `O365BaseLoader`.
When set, the O365 loader will only load documents modified after the
`modified_since` datetime.
## Why?
OneDrives / SharePoints can contain a large number of documents. The current
approach is to download and parse all files and let the indexer deal with
duplicates. This can be prohibitively time-consuming, especially when
using an OCR-based parser like
[zerox](fa06188834/libs/community/langchain_community/document_loaders/pdf.py#L948).
This argument allows skipping documents that are older than the known time of
the last indexing.
_Q: What if a file was modified during the last indexing process?
A: Users can set `modified_since` conservatively and the indexer will
still take care of duplicates._
If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
---------
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
c855d434c5
commit
05ebe1e66b
@ -9,6 +9,7 @@ import re
|
|||||||
import tempfile
|
import tempfile
|
||||||
import urllib
|
import urllib
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
|
from datetime import datetime
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
|
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
|
||||||
|
|
||||||
@ -86,6 +87,9 @@ class O365BaseLoader(BaseLoader, BaseModel):
|
|||||||
"""Number of bytes to retrieve from each api call to the server. int or 'auto'."""
|
"""Number of bytes to retrieve from each api call to the server. int or 'auto'."""
|
||||||
recursive: bool = False
|
recursive: bool = False
|
||||||
"""Should the loader recursively load subfolders?"""
|
"""Should the loader recursively load subfolders?"""
|
||||||
|
modified_since: Optional[datetime] = None
|
||||||
|
"""Only fetch documents modified since given datetime. The datetime object
|
||||||
|
must be timezone aware."""
|
||||||
handlers: Optional[Dict[str, Any]] = {}
|
handlers: Optional[Dict[str, Any]] = {}
|
||||||
"""
|
"""
|
||||||
Provide custom handlers for MimeTypeBasedParser.
|
Provide custom handlers for MimeTypeBasedParser.
|
||||||
@ -188,6 +192,9 @@ class O365BaseLoader(BaseLoader, BaseModel):
|
|||||||
for file in items:
|
for file in items:
|
||||||
if file.is_file:
|
if file.is_file:
|
||||||
if file.mime_type in list(file_mime_types.values()):
|
if file.mime_type in list(file_mime_types.values()):
|
||||||
|
if (not self.modified_since) or (
|
||||||
|
file.modified > self.modified_since
|
||||||
|
):
|
||||||
source = file.web_url
|
source = file.web_url
|
||||||
if re.search(
|
if re.search(
|
||||||
r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
|
r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
|
||||||
|
Loading…
Reference in New Issue
Block a user