IMPROVEMENT: WebResearchRetriever error handling for URLs with connection errors (#13401)
- **Description:** Adds an `ignore_load_errors` flag to `AsyncHtmlLoader`, together with a `_fetch_valid_connection_docs` helper that tests the connection for every URL in `new_urls` and skips, with a warning, any that raise a `ConnectionError`. `WebResearchRetriever` now instantiates the loader with `ignore_load_errors=True`, so unreachable URLs no longer abort retrieval.
- **Issue:** [Previous PR](https://github.com/langchain-ai/langchain/pull/13353)
- **Dependencies:** None
- **Tag maintainer:** @efriis
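
For context, a minimal usage sketch of the new flag (the import path reflects the package layout at the time of this commit; the URLs are illustrative):

```python
from langchain.document_loaders import AsyncHtmlLoader

# One reachable URL and one illustrative dead link.
urls = [
    "https://example.com",
    "https://no-such-host.invalid",
]

# With ignore_load_errors=True the failing URL is skipped with a warning
# instead of raising and aborting the whole load.
loader = AsyncHtmlLoader(urls, ignore_load_errors=True)
docs = loader.load()  # Documents only for the URLs that responded
```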
This commit is contained in: commit 0fb5f857f9 (parent d2335d0114)
```diff
@@ -51,6 +51,7 @@ class AsyncHtmlLoader(BaseLoader):
         requests_per_second: int = 2,
         requests_kwargs: Optional[Dict[str, Any]] = None,
         raise_for_status: bool = False,
+        ignore_load_errors: bool = False,
     ):
         """Initialize with a webpage path."""
 
@@ -88,6 +89,17 @@ class AsyncHtmlLoader(BaseLoader):
         self.raise_for_status = raise_for_status
         self.autoset_encoding = autoset_encoding
         self.encoding = encoding
+        self.ignore_load_errors = ignore_load_errors
+
+    def _fetch_valid_connection_docs(self, url: str) -> Any:
+        if self.ignore_load_errors:
+            try:
+                return self.session.get(url, **self.requests_kwargs)
+            except Exception as e:
+                warnings.warn(str(e))
+                return None
+
+        return self.session.get(url, **self.requests_kwargs)
 
     @staticmethod
     def _check_parser(parser: str) -> None:
@@ -114,7 +126,10 @@ class AsyncHtmlLoader(BaseLoader):
 
         self._check_parser(parser)
 
-        html_doc = self.session.get(url, **self.requests_kwargs)
+        html_doc = self._fetch_valid_connection_docs(url)
+        if not getattr(html_doc, "ok", False):
+            return None
+
         if self.raise_for_status:
             html_doc.raise_for_status()
 
@@ -142,7 +157,10 @@ class AsyncHtmlLoader(BaseLoader):
                             text = ""
                         return text
                 except aiohttp.ClientConnectionError as e:
-                    if i == retries - 1:
+                    if i == retries - 1 and self.ignore_load_errors:
+                        logger.warning(f"Error fetching {url} after {retries} retries.")
+                        return ""
+                    elif i == retries - 1:
                         raise
                     else:
                         logger.warning(
@@ -196,6 +214,8 @@ class AsyncHtmlLoader(BaseLoader):
         docs = []
         for i, text in enumerate(cast(List[str], results)):
             soup = self._scrape(self.web_paths[i])
+            if not soup:
+                continue
             metadata = _build_metadata(soup, self.web_paths[i])
             docs.append(Document(page_content=text, metadata=metadata))
```
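
One detail worth noting in the hunk above: `_fetch_valid_connection_docs` returns either a response object or `None`, so `_scrape` checks `getattr(html_doc, "ok", False)` rather than plain truthiness. A standalone sketch of the idiom (the `fetch` helper here is a hypothetical stand-in for `_fetch_valid_connection_docs`; `requests` semantics assumed):

```python
from typing import Optional

import requests


def fetch(url: str) -> Optional[requests.Response]:
    # Hypothetical stand-in for _fetch_valid_connection_docs: returns None
    # when the connection fails and load errors are being ignored.
    try:
        return requests.get(url, timeout=5)
    except requests.ConnectionError:
        return None


html_doc = fetch("https://example.com")
# getattr(html_doc, "ok", False) is False both when html_doc is None
# (connection failed) and when the server answered 4xx/5xx, because
# requests.Response.ok is True only for status codes below 400.
if not getattr(html_doc, "ok", False):
    print("skipping this URL")
```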
```diff
@@ -198,7 +198,7 @@ class WebResearchRetriever(BaseRetriever):
         logger.info(f"New URLs to load: {new_urls}")
         # Load, split, and add new urls to vectorstore
         if new_urls:
-            loader = AsyncHtmlLoader(new_urls)
+            loader = AsyncHtmlLoader(new_urls, ignore_load_errors=True)
             html2text = Html2TextTransformer()
             logger.info("Indexing new urls...")
             docs = loader.load()
```
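
At the retriever level the change is transparent to callers. A hedged end-to-end sketch (assuming the `from_llm` constructor and a `vectorstore`, `llm`, and `search` wrapper configured elsewhere; none of those are part of this diff):

```python
from langchain.retrievers.web_research import WebResearchRetriever

# vectorstore, llm, and search are assumed to be configured elsewhere.
retriever = WebResearchRetriever.from_llm(
    vectorstore=vectorstore,
    llm=llm,
    search=search,
)

# Dead links among the search results are now logged and skipped rather
# than raising a ConnectionError that aborts retrieval.
docs = retriever.get_relevant_documents("some query")
```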