From 24b5c27bb167042ef08ea56a1dc3ce9cb474b103 Mon Sep 17 00:00:00 2001 From: Kefan You Date: Wed, 22 May 2024 07:51:03 +0800 Subject: [PATCH] community[patch]: raise_for_status logic missing in async _fetch of WebBaseLoader (#21948) ## The `raise_for_status` parameter of WebBaseLoader works in sync load but not in async load. In WebBaseLoader: The sync load calls `_scrape`, which handles `raise_for_status` properly. ``` def _scrape( self, url: str, parser: Union[str, None] = None, bs_kwargs: Optional[dict] = None, ) -> Any: from bs4 import BeautifulSoup if parser is None: if url.endswith(".xml"): parser = "xml" else: parser = self.default_parser self._check_parser(parser) html_doc = self.session.get(url, **self.requests_kwargs) if self.raise_for_status: html_doc.raise_for_status() if self.encoding is not None: html_doc.encoding = self.encoding elif self.autoset_encoding: html_doc.encoding = html_doc.apparent_encoding return BeautifulSoup(html_doc.text, parser, **(bs_kwargs or {})) ``` The async load calls `_fetch`, which is missing the `raise_for_status` logic. 
``` async def _fetch( self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5 ) -> str: async with aiohttp.ClientSession() as session: for i in range(retries): try: async with session.get( url, headers=self.session.headers, ssl=None if self.session.verify else False, cookies=self.session.cookies.get_dict(), ) as response: return await response.text() ``` Co-authored-by: kefan.you --- libs/community/langchain_community/document_loaders/web_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/community/langchain_community/document_loaders/web_base.py b/libs/community/langchain_community/document_loaders/web_base.py index b07f904c5fc..b925f792e57 100644 --- a/libs/community/langchain_community/document_loaders/web_base.py +++ b/libs/community/langchain_community/document_loaders/web_base.py @@ -134,6 +134,8 @@ class WebBaseLoader(BaseLoader): ssl=None if self.session.verify else False, cookies=self.session.cookies.get_dict(), ) as response: + if self.raise_for_status: + response.raise_for_status() return await response.text() except aiohttp.ClientConnectionError as e: if i == retries - 1: