Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-22 06:39:52 +00:00
When encountering an error during fetch, return "" in web_base.py (#8753)
When downloading a sitemap that contains a malformed URL (e.g. "ttp://example.com/index.html", with the leading "h" omitted), this change ensures that the sitemap download does not crash but merely emits a warning. (This could perhaps be made optional with e.g. a `skip_faulty_urls: bool = True` parameter, but this was the most straightforward fix.) @rlancemartin, @eyurtsev

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent bbd22b9b76
commit cff52638b2
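For illustration, a minimal sketch of the new flag in use (assuming the `langchain.document_loaders` import path of this release; the sitemap URL is a placeholder, and the malformed "ttp://" entry mirrors the example above):

    from langchain.document_loaders import SitemapLoader

    # With continue_on_failure=True, a sitemap entry such as
    # "ttp://example.com/index.html" is logged as a warning and skipped
    # instead of crashing the whole load.
    loader = SitemapLoader(
        "https://example.com/sitemap.xml",
        continue_on_failure=True,
    )
    docs = loader.load()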
langchain/document_loaders/blackboard.py

@@ -31,7 +31,7 @@ class BlackboardLoader(WebBaseLoader):
             )
             documents = loader.load()
 
-    """
+    """  # noqa: E501
 
     base_url: str
     """Base url of the blackboard course."""
@@ -47,6 +47,7 @@ class BlackboardLoader(WebBaseLoader):
         load_all_recursively: bool = True,
         basic_auth: Optional[Tuple[str, str]] = None,
         cookies: Optional[dict] = None,
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with blackboard course url.
 
@@ -58,6 +59,10 @@ class BlackboardLoader(WebBaseLoader):
             load_all_recursively: If True, load all documents recursively.
             basic_auth: Basic auth credentials.
             cookies: Cookies.
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
 
         Raises:
             ValueError: If blackboard course url is invalid.
@@ -80,6 +85,7 @@ class BlackboardLoader(WebBaseLoader):
             cookies.update({"BbRouter": bbrouter})
         self.session.cookies.update(cookies)
         self.load_all_recursively = load_all_recursively
+        self.continue_on_failure = continue_on_failure
         self.check_bs4()
 
     def check_bs4(self) -> None:
langchain/document_loaders/gitbook.py

@@ -19,6 +19,7 @@ class GitbookLoader(WebBaseLoader):
         load_all_paths: bool = False,
         base_url: Optional[str] = None,
         content_selector: str = "main",
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with web page and whether to load all paths.
 
@@ -31,6 +32,10 @@ class GitbookLoader(WebBaseLoader):
                 appended to this base url. Defaults to `web_page`.
             content_selector: The CSS selector for the content to load.
                 Defaults to "main".
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
         """
         self.base_url = base_url or web_page
         if self.base_url.endswith("/"):
@@ -43,6 +48,7 @@ class GitbookLoader(WebBaseLoader):
         super().__init__(web_paths)
         self.load_all_paths = load_all_paths
         self.content_selector = content_selector
+        self.continue_on_failure = continue_on_failure
 
     def load(self) -> List[Document]:
         """Fetch text from one single GitBook page."""
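As the hunks above show, GitbookLoader accepts the same flag; a hedged usage sketch (the docs URL is a placeholder):

    from langchain.document_loaders import GitbookLoader

    # Pages that fail to download are skipped with a warning rather than
    # aborting the crawl of all paths.
    loader = GitbookLoader(
        "https://docs.gitbook.com",
        load_all_paths=True,
        continue_on_failure=True,
    )
    pages = loader.load()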
langchain/document_loaders/sitemap.py

@@ -33,6 +33,7 @@ class SitemapLoader(WebBaseLoader):
         blocknum: int = 0,
         meta_function: Optional[Callable] = None,
         is_local: bool = False,
+        continue_on_failure: bool = False,
     ):
         """Initialize with webpage path and optional filter URLs.
 
@@ -48,6 +49,10 @@ class SitemapLoader(WebBaseLoader):
                 remember when setting this method to also copy metadata["loc"]
                 to metadata["source"] if you are using this field
             is_local: whether the sitemap is a local file. Default: False
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
         """
 
         if blocksize is not None and blocksize < 1:
@@ -71,6 +76,7 @@ class SitemapLoader(WebBaseLoader):
         self.blocksize = blocksize
         self.blocknum = blocknum
         self.is_local = is_local
+        self.continue_on_failure = continue_on_failure
 
     def parse_sitemap(self, soup: Any) -> List[dict]:
         """Parse sitemap xml and load into a list of dicts.
langchain/document_loaders/web_base.py

@@ -62,6 +62,7 @@ class WebBaseLoader(BaseLoader):
         header_template: Optional[dict] = None,
         verify_ssl: Optional[bool] = True,
         proxies: Optional[dict] = None,
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with webpage path."""
 
@@ -96,6 +97,7 @@ class WebBaseLoader(BaseLoader):
         self.session = requests.Session()
         self.session.headers = dict(headers)
         self.session.verify = verify_ssl
+        self.continue_on_failure = continue_on_failure
 
         if proxies:
             self.session.proxies.update(proxies)
@@ -133,7 +135,20 @@ class WebBaseLoader(BaseLoader):
         self, url: str, semaphore: asyncio.Semaphore
     ) -> str:
         async with semaphore:
-            return await self._fetch(url)
+            try:
+                return await self._fetch(url)
+            except Exception as e:
+                if self.continue_on_failure:
+                    logger.warning(
+                        f"Error fetching {url}, skipping due to"
+                        f" continue_on_failure=True"
+                    )
+                    return ""
+                logger.exception(
+                    f"Error fetching {url} and aborting, use continue_on_failure=True "
+                    "to continue loading urls after encountering an error."
+                )
+                raise e
 
     async def fetch_all(self, urls: List[str]) -> Any:
         """Fetch all urls concurrently with rate limiting."""