mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-20 14:31:44 +00:00
Confluence DL retry/backoff (#3168)
Implemented retry/backoff logic in response to #2473.

Co-authored-by: Justin Flick <jflick@homesite.com>
This commit is contained in:
parent
c03a65c6dc
commit
8faef1a91a
@ -1,9 +1,19 @@
|
|||||||
"""Load Data from a Confluence Space"""
|
"""Load Data from a Confluence Space"""
|
||||||
|
import logging
|
||||||
from typing import Any, Callable, List, Optional, Union
|
from typing import Any, Callable, List, Optional, Union
|
||||||
|
|
||||||
|
from tenacity import (
|
||||||
|
before_sleep_log,
|
||||||
|
retry,
|
||||||
|
stop_after_attempt,
|
||||||
|
wait_exponential,
|
||||||
|
)
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class ConfluenceLoader(BaseLoader):
|
class ConfluenceLoader(BaseLoader):
|
||||||
"""
|
"""
|
||||||
@ -44,8 +54,14 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
:type oauth2: dict, optional
|
:type oauth2: dict, optional
|
||||||
:param cloud: _description_, defaults to True
|
:param cloud: _description_, defaults to True
|
||||||
:type cloud: bool, optional
|
:type cloud: bool, optional
|
||||||
:raises ValueError: _description_
|
:param number_of_retries: How many times to retry, defaults to 3
|
||||||
:raises ImportError: _description_
|
:type number_of_retries: Optional[int], optional
|
||||||
|
:param min_retry_seconds: defaults to 2
|
||||||
|
:type min_retry_seconds: Optional[int], optional
|
||||||
|
:param max_retry_seconds: defaults to 10
|
||||||
|
:type max_retry_seconds: Optional[int], optional
|
||||||
|
:raises ValueError: Errors while validating input
|
||||||
|
:raises ImportError: Required dependencies not installed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -54,13 +70,19 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
api_key: Optional[str] = None,
|
api_key: Optional[str] = None,
|
||||||
username: Optional[str] = None,
|
username: Optional[str] = None,
|
||||||
oauth2: Optional[dict] = None,
|
oauth2: Optional[dict] = None,
|
||||||
cloud: bool = True,
|
cloud: Optional[bool] = True,
|
||||||
|
number_of_retries: Optional[int] = 3,
|
||||||
|
min_retry_seconds: Optional[int] = 2,
|
||||||
|
max_retry_seconds: Optional[int] = 10,
|
||||||
):
|
):
|
||||||
errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
|
errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
|
||||||
if errors:
|
if errors:
|
||||||
raise ValueError(f"Error(s) while validating input: {errors}")
|
raise ValueError(f"Error(s) while validating input: {errors}")
|
||||||
|
|
||||||
self.base_url = url
|
self.base_url = url
|
||||||
|
self.number_of_retries = number_of_retries
|
||||||
|
self.min_retry_seconds = min_retry_seconds
|
||||||
|
self.max_retry_seconds = max_retry_seconds
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from atlassian import Confluence # noqa: F401
|
from atlassian import Confluence # noqa: F401
|
||||||
@ -196,9 +218,19 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
|
|
||||||
if page_ids:
|
if page_ids:
|
||||||
for page_id in page_ids:
|
for page_id in page_ids:
|
||||||
page = self.confluence.get_page_by_id(
|
get_page = retry(
|
||||||
page_id=page_id, expand="body.storage.value"
|
reraise=True,
|
||||||
)
|
stop=stop_after_attempt(
|
||||||
|
self.number_of_retries # type: ignore[arg-type]
|
||||||
|
),
|
||||||
|
wait=wait_exponential(
|
||||||
|
multiplier=1, # type: ignore[arg-type]
|
||||||
|
min=self.min_retry_seconds, # type: ignore[arg-type]
|
||||||
|
max=self.max_retry_seconds, # type: ignore[arg-type]
|
||||||
|
),
|
||||||
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||||
|
)(self.confluence.get_page_by_id)
|
||||||
|
page = get_page(page_id=page_id, expand="body.storage.value")
|
||||||
doc = self.process_page(page, include_attachments, text_maker)
|
doc = self.process_page(page, include_attachments, text_maker)
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
|
|
||||||
@ -227,7 +259,19 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
page = 0
|
page = 0
|
||||||
docs = []
|
docs = []
|
||||||
while page < limit:
|
while page < limit:
|
||||||
batch = retrieval_method(**kwargs, start=page)
|
get_pages = retry(
|
||||||
|
reraise=True,
|
||||||
|
stop=stop_after_attempt(
|
||||||
|
self.number_of_retries # type: ignore[arg-type]
|
||||||
|
),
|
||||||
|
wait=wait_exponential(
|
||||||
|
multiplier=1,
|
||||||
|
min=self.min_retry_seconds, # type: ignore[arg-type]
|
||||||
|
max=self.max_retry_seconds, # type: ignore[arg-type]
|
||||||
|
),
|
||||||
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||||
|
)(retrieval_method)
|
||||||
|
batch = get_pages(**kwargs, start=page)
|
||||||
if len(batch) < limit:
|
if len(batch) < limit:
|
||||||
page = limit
|
page = limit
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user