Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-20 13:54:48 +00:00
When encountering an error during fetch, return "" in web_base.py (#8753)

When downloading a sitemap that contains a malformed URL (e.g. "ttp://example.com/index.html", with the leading "h" omitted), this change ensures that the sitemap download does not crash but instead emits a warning. (This could arguably be made optional via e.g. a `skip_faulty_urls: bool = True` parameter, but this was the most straightforward fix.) @rlancemartin, @eyurtsev

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent bbd22b9b76
commit cff52638b2
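Usage, as a minimal sketch (the sitemap URL below is a placeholder, and the import path assumes the langchain layout at the time of this commit):

from langchain.document_loaders.sitemap import SitemapLoader

# With continue_on_failure=True, a malformed sitemap entry such as
# "ttp://example.com/index.html" is logged as a warning and fetched as "",
# instead of raising and aborting the whole load.
loader = SitemapLoader(
    web_path="https://example.com/sitemap.xml",  # placeholder URL
    continue_on_failure=True,
)
documents = loader.load()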
@@ -31,7 +31,7 @@ class BlackboardLoader(WebBaseLoader):
             )
             documents = loader.load()
 
-    """
+    """  # noqa: E501
 
     base_url: str
     """Base url of the blackboard course."""
@@ -47,6 +47,7 @@ class BlackboardLoader(WebBaseLoader):
         load_all_recursively: bool = True,
         basic_auth: Optional[Tuple[str, str]] = None,
         cookies: Optional[dict] = None,
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with blackboard course url.
 
@@ -58,6 +59,10 @@ class BlackboardLoader(WebBaseLoader):
             load_all_recursively: If True, load all documents recursively.
             basic_auth: Basic auth credentials.
             cookies: Cookies.
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
 
         Raises:
             ValueError: If blackboard course url is invalid.
@@ -80,6 +85,7 @@ class BlackboardLoader(WebBaseLoader):
             cookies.update({"BbRouter": bbrouter})
         self.session.cookies.update(cookies)
         self.load_all_recursively = load_all_recursively
+        self.continue_on_failure = continue_on_failure
         self.check_bs4()
 
     def check_bs4(self) -> None:
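A hedged sketch of the new parameter on BlackboardLoader (the course URL and BbRouter cookie are placeholders):

from langchain.document_loaders import BlackboardLoader

loader = BlackboardLoader(
    blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1",  # placeholder
    bbrouter="expires:12345...",  # placeholder cookie value
    continue_on_failure=True,  # warn and skip a failed fetch instead of raising
)
documents = loader.load()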
@@ -19,6 +19,7 @@ class GitbookLoader(WebBaseLoader):
         load_all_paths: bool = False,
         base_url: Optional[str] = None,
         content_selector: str = "main",
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with web page and whether to load all paths.
 
@@ -31,6 +32,10 @@ class GitbookLoader(WebBaseLoader):
                 appended to this base url. Defaults to `web_page`.
             content_selector: The CSS selector for the content to load.
                 Defaults to "main".
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
         """
         self.base_url = base_url or web_page
         if self.base_url.endswith("/"):
@@ -43,6 +48,7 @@ class GitbookLoader(WebBaseLoader):
         super().__init__(web_paths)
         self.load_all_paths = load_all_paths
         self.content_selector = content_selector
+        self.continue_on_failure = continue_on_failure
 
     def load(self) -> List[Document]:
         """Fetch text from one single GitBook page."""
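The same flag on GitbookLoader, again as a sketch with a placeholder URL:

from langchain.document_loaders import GitbookLoader

loader = GitbookLoader(
    "https://docs.example.com",  # placeholder GitBook site
    load_all_paths=True,  # crawl all paths listed in the site's sitemap
    continue_on_failure=True,  # pages that fail to fetch are skipped with a warning
)
documents = loader.load()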
@@ -33,6 +33,7 @@ class SitemapLoader(WebBaseLoader):
         blocknum: int = 0,
         meta_function: Optional[Callable] = None,
         is_local: bool = False,
+        continue_on_failure: bool = False,
     ):
         """Initialize with webpage path and optional filter URLs.
 
@@ -48,6 +49,10 @@ class SitemapLoader(WebBaseLoader):
                 remember when setting this method to also copy metadata["loc"]
                 to metadata["source"] if you are using this field
             is_local: whether the sitemap is a local file. Default: False
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
         """
 
         if blocksize is not None and blocksize < 1:
@@ -71,6 +76,7 @@ class SitemapLoader(WebBaseLoader):
         self.blocksize = blocksize
         self.blocknum = blocknum
         self.is_local = is_local
+        self.continue_on_failure = continue_on_failure
 
     def parse_sitemap(self, soup: Any) -> List[dict]:
         """Parse sitemap xml and load into a list of dicts.
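And on SitemapLoader combined with a local sitemap file, as a sketch (the file path is a placeholder):

from langchain.document_loaders.sitemap import SitemapLoader

loader = SitemapLoader(
    web_path="/tmp/sitemap.xml",  # placeholder local file
    is_local=True,
    continue_on_failure=True,  # faulty entries emit a warning instead of an exception
)
documents = loader.load()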
@@ -62,6 +62,7 @@ class WebBaseLoader(BaseLoader):
         header_template: Optional[dict] = None,
         verify_ssl: Optional[bool] = True,
         proxies: Optional[dict] = None,
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with webpage path."""
 
@@ -96,6 +97,7 @@ class WebBaseLoader(BaseLoader):
         self.session = requests.Session()
         self.session.headers = dict(headers)
         self.session.verify = verify_ssl
+        self.continue_on_failure = continue_on_failure
 
         if proxies:
             self.session.proxies.update(proxies)
@@ -133,7 +135,20 @@ class WebBaseLoader(BaseLoader):
         self, url: str, semaphore: asyncio.Semaphore
     ) -> str:
         async with semaphore:
-            return await self._fetch(url)
+            try:
+                return await self._fetch(url)
+            except Exception as e:
+                if self.continue_on_failure:
+                    logger.warning(
+                        f"Error fetching {url}, skipping due to"
+                        f" continue_on_failure=True"
+                    )
+                    return ""
+                logger.exception(
+                    f"Error fetching {url} and aborting, use continue_on_failure=True "
+                    "to continue loading urls after encountering an error."
+                )
+                raise e
 
     async def fetch_all(self, urls: List[str]) -> Any:
         """Fetch all urls concurrently with rate limiting."""
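At the WebBaseLoader level, the effect of the try/except above can be sketched like this (both URLs are placeholders, the second intentionally malformed; this assumes web_path accepts a list of URLs and that the synchronous aload() helper is available, as in this version of the file):

from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader(
    ["https://example.com", "ttp://example.com/index.html"],  # second URL is malformed
    continue_on_failure=True,
)
# aload() fetches all URLs concurrently via fetch_all(); the malformed URL is
# logged as a warning and contributes "" rather than raising.
documents = loader.aload()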