Mirror of https://github.com/hwchase17/langchain.git, synced 2025-08-20 09:57:32 +00:00
IMPROVEMENT: WebResearchRetriever error handling for URLs with connection errors (#13401)
- **Description:** Adds an `ignore_load_errors` flag to `AsyncHtmlLoader`, plus a `_fetch_valid_connection_docs` helper that tests the connection for each URL and skips, with a warning, any that raise a connection error. `WebResearchRetriever` now constructs its loader with `ignore_load_errors=True`, so a single unreachable URL no longer aborts indexing.
- **Issue:** [Previous PR](https://github.com/langchain-ai/langchain/pull/13353)
- **Dependencies:** None
- **Tag maintainer:** @efriis
parent d2335d0114
commit 0fb5f857f9
In `AsyncHtmlLoader`:

```diff
@@ -51,6 +51,7 @@ class AsyncHtmlLoader(BaseLoader):
         requests_per_second: int = 2,
         requests_kwargs: Optional[Dict[str, Any]] = None,
         raise_for_status: bool = False,
+        ignore_load_errors: bool = False,
     ):
         """Initialize with a webpage path."""

@@ -88,6 +89,17 @@ class AsyncHtmlLoader(BaseLoader):
         self.raise_for_status = raise_for_status
         self.autoset_encoding = autoset_encoding
         self.encoding = encoding
+        self.ignore_load_errors = ignore_load_errors
+
+    def _fetch_valid_connection_docs(self, url: str) -> Any:
+        if self.ignore_load_errors:
+            try:
+                return self.session.get(url, **self.requests_kwargs)
+            except Exception as e:
+                warnings.warn(str(e))
+                return None
+
+        return self.session.get(url, **self.requests_kwargs)

     @staticmethod
     def _check_parser(parser: str) -> None:
```
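The synchronous path funnels every GET through this helper. A minimal standalone sketch of the same pattern (the `fetch_or_none` name and the bare `requests.Session` are illustrative, not part of the diff):

```python
import warnings
from typing import Any, Optional

import requests


def fetch_or_none(
    session: requests.Session, url: str, ignore_load_errors: bool = False, **kwargs: Any
) -> Optional[requests.Response]:
    """Sketch of the _fetch_valid_connection_docs pattern above."""
    if ignore_load_errors:
        try:
            return session.get(url, **kwargs)
        except Exception as e:
            # A failed GET becomes a warning plus None instead of an exception.
            warnings.warn(str(e))
            return None
    # Default behavior is unchanged: connection errors propagate to the caller.
    return session.get(url, **kwargs)
```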
```diff
@@ -114,7 +126,10 @@ class AsyncHtmlLoader(BaseLoader):
         self._check_parser(parser)

-        html_doc = self.session.get(url, **self.requests_kwargs)
+        html_doc = self._fetch_valid_connection_docs(url)
+        if not getattr(html_doc, "ok", False):
+            return None
+
         if self.raise_for_status:
             html_doc.raise_for_status()

@@ -142,7 +157,10 @@ class AsyncHtmlLoader(BaseLoader):
                             text = ""
                         return text
                 except aiohttp.ClientConnectionError as e:
-                    if i == retries - 1:
+                    if i == retries - 1 and self.ignore_load_errors:
+                        logger.warning(f"Error fetching {url} after {retries} retries.")
+                        return ""
+                    elif i == retries - 1:
                         raise
                     else:
                         logger.warning(
```
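The async path gets an equivalent guard on the final retry. A self-contained sketch of the modified retry loop (the session setup, retry count, and log messages are assumptions for illustration, not the loader's exact code):

```python
import logging

import aiohttp

logger = logging.getLogger(__name__)


async def fetch_with_retries(
    url: str, retries: int = 3, ignore_load_errors: bool = False
) -> str:
    """Sketch of the retry behavior in the hunk above."""
    async with aiohttp.ClientSession() as session:
        for i in range(retries):
            try:
                async with session.get(url) as response:
                    return await response.text()
            except aiohttp.ClientConnectionError as e:
                if i == retries - 1 and ignore_load_errors:
                    # Last attempt with the flag: swallow the error, return empty text.
                    logger.warning(f"Error fetching {url} after {retries} retries.")
                    return ""
                elif i == retries - 1:
                    # Last attempt without the flag: re-raise as before.
                    raise
                else:
                    logger.warning(f"Retrying {url} after error: {e}")
    return ""
```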
```diff
@@ -196,6 +214,8 @@ class AsyncHtmlLoader(BaseLoader):
         docs = []
         for i, text in enumerate(cast(List[str], results)):
             soup = self._scrape(self.web_paths[i])
+            if not soup:
+                continue
             metadata = _build_metadata(soup, self.web_paths[i])
             docs.append(Document(page_content=text, metadata=metadata))

```
In `WebResearchRetriever`:

```diff
@@ -198,7 +198,7 @@ class WebResearchRetriever(BaseRetriever):
         logger.info(f"New URLs to load: {new_urls}")
         # Load, split, and add new urls to vectorstore
         if new_urls:
-            loader = AsyncHtmlLoader(new_urls)
+            loader = AsyncHtmlLoader(new_urls, ignore_load_errors=True)
             html2text = Html2TextTransformer()
             logger.info("Indexing new urls...")
             docs = loader.load()
```
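Net effect for callers, as a usage sketch (the URLs are placeholders; the import path follows the repo layout at the time of this commit):

```python
from langchain.document_loaders import AsyncHtmlLoader

# With ignore_load_errors=True, an unreachable URL is skipped with a
# warning instead of aborting the whole load.
loader = AsyncHtmlLoader(
    ["https://example.com", "https://unreachable.invalid"],
    ignore_load_errors=True,
)
docs = loader.load()  # Documents for the reachable URLs only
```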