diff --git a/libs/langchain/langchain/document_loaders/blackboard.py b/libs/langchain/langchain/document_loaders/blackboard.py index 0ab6ca11be8..b21db2dfd1b 100644 --- a/libs/langchain/langchain/document_loaders/blackboard.py +++ b/libs/langchain/langchain/document_loaders/blackboard.py @@ -31,7 +31,7 @@ class BlackboardLoader(WebBaseLoader): ) documents = loader.load() - """ + """ # noqa: E501 base_url: str """Base url of the blackboard course.""" @@ -47,6 +47,7 @@ class BlackboardLoader(WebBaseLoader): load_all_recursively: bool = True, basic_auth: Optional[Tuple[str, str]] = None, cookies: Optional[dict] = None, + continue_on_failure: Optional[bool] = False, ): """Initialize with blackboard course url. @@ -58,6 +59,10 @@ class BlackboardLoader(WebBaseLoader): load_all_recursively: If True, load all documents recursively. basic_auth: Basic auth credentials. cookies: Cookies. + continue_on_failure: whether to continue loading the course if an error + occurs loading a url, emitting a warning instead of raising an + exception. Setting this to True makes the loader more robust, but also + may result in missing data. Default: False Raises: ValueError: If blackboard course url is invalid. 
@@ -80,6 +85,7 @@ class BlackboardLoader(WebBaseLoader): cookies.update({"BbRouter": bbrouter}) self.session.cookies.update(cookies) self.load_all_recursively = load_all_recursively + self.continue_on_failure = continue_on_failure self.check_bs4() def check_bs4(self) -> None: diff --git a/libs/langchain/langchain/document_loaders/gitbook.py b/libs/langchain/langchain/document_loaders/gitbook.py index e293c70c0ad..1fcec22922d 100644 --- a/libs/langchain/langchain/document_loaders/gitbook.py +++ b/libs/langchain/langchain/document_loaders/gitbook.py @@ -19,6 +19,7 @@ class GitbookLoader(WebBaseLoader): load_all_paths: bool = False, base_url: Optional[str] = None, content_selector: str = "main", + continue_on_failure: Optional[bool] = False, ): """Initialize with web page and whether to load all paths. @@ -31,6 +32,10 @@ class GitbookLoader(WebBaseLoader): appended to this base url. Defaults to `web_page`. content_selector: The CSS selector for the content to load. Defaults to "main". + continue_on_failure: whether to continue loading the site if an error + occurs loading a url, emitting a warning instead of raising an + exception. Setting this to True makes the loader more robust, but also + may result in missing data. 
Default: False """ self.base_url = base_url or web_page if self.base_url.endswith("/"): @@ -43,6 +48,7 @@ class GitbookLoader(WebBaseLoader): super().__init__(web_paths) self.load_all_paths = load_all_paths self.content_selector = content_selector + self.continue_on_failure = continue_on_failure def load(self) -> List[Document]: """Fetch text from one single GitBook page.""" diff --git a/libs/langchain/langchain/document_loaders/sitemap.py b/libs/langchain/langchain/document_loaders/sitemap.py index 68fe88eefbc..158f19e0d2d 100644 --- a/libs/langchain/langchain/document_loaders/sitemap.py +++ b/libs/langchain/langchain/document_loaders/sitemap.py @@ -33,6 +33,7 @@ class SitemapLoader(WebBaseLoader): blocknum: int = 0, meta_function: Optional[Callable] = None, is_local: bool = False, + continue_on_failure: bool = False, ): """Initialize with webpage path and optional filter URLs. @@ -48,6 +49,10 @@ class SitemapLoader(WebBaseLoader): remember when setting this method to also copy metadata["loc"] to metadata["source"] if you are using this field is_local: whether the sitemap is a local file. Default: False + continue_on_failure: whether to continue loading the sitemap if an error + occurs loading a url, emitting a warning instead of raising an + exception. Setting this to True makes the loader more robust, but also + may result in missing data. Default: False """ if blocksize is not None and blocksize < 1: @@ -71,6 +76,7 @@ class SitemapLoader(WebBaseLoader): self.blocksize = blocksize self.blocknum = blocknum self.is_local = is_local + self.continue_on_failure = continue_on_failure def parse_sitemap(self, soup: Any) -> List[dict]: """Parse sitemap xml and load into a list of dicts. 
diff --git a/libs/langchain/langchain/document_loaders/web_base.py b/libs/langchain/langchain/document_loaders/web_base.py index ed684cd9ee1..5ae9482bce9 100644 --- a/libs/langchain/langchain/document_loaders/web_base.py +++ b/libs/langchain/langchain/document_loaders/web_base.py @@ -62,6 +62,7 @@ class WebBaseLoader(BaseLoader): header_template: Optional[dict] = None, verify_ssl: Optional[bool] = True, proxies: Optional[dict] = None, + continue_on_failure: Optional[bool] = False, ): """Initialize with webpage path.""" @@ -96,6 +97,7 @@ class WebBaseLoader(BaseLoader): self.session = requests.Session() self.session.headers = dict(headers) self.session.verify = verify_ssl + self.continue_on_failure = continue_on_failure if proxies: self.session.proxies.update(proxies) @@ -133,7 +135,20 @@ class WebBaseLoader(BaseLoader): self, url: str, semaphore: asyncio.Semaphore ) -> str: async with semaphore: - return await self._fetch(url) + try: + return await self._fetch(url) + except Exception as e: + if self.continue_on_failure: + logger.warning( + f"Error fetching {url}, skipping due to" + f" continue_on_failure=True" + ) + return "" + logger.exception( + f"Error fetching {url} and aborting, use continue_on_failure=True " + "to continue loading urls after encountering an error." + ) + raise e async def fetch_all(self, urls: List[str]) -> Any: """Fetch all urls concurrently with rate limiting."""