mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-11 15:35:09 +00:00
recursive loader add status check (#10891)
This commit is contained in:
parent
6e02c45ca4
commit
c1f9cc0bc5
@ -63,6 +63,7 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
prevent_outside: Optional[bool] = True,
|
prevent_outside: Optional[bool] = True,
|
||||||
link_regex: Union[str, re.Pattern, None] = None,
|
link_regex: Union[str, re.Pattern, None] = None,
|
||||||
headers: Optional[dict] = None,
|
headers: Optional[dict] = None,
|
||||||
|
check_response_status: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize with URL to crawl and any subdirectories to exclude.
|
"""Initialize with URL to crawl and any subdirectories to exclude.
|
||||||
Args:
|
Args:
|
||||||
@ -84,6 +85,8 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
prevent_outside: If True, prevent loading from urls which are not children
|
prevent_outside: If True, prevent loading from urls which are not children
|
||||||
of the root url.
|
of the root url.
|
||||||
link_regex: Regex for extracting sub-links from the raw html of a web page.
|
link_regex: Regex for extracting sub-links from the raw html of a web page.
|
||||||
|
check_response_status: If True, check HTTP response status and skip
|
||||||
|
URLs with error responses (400-599).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self.url = url
|
self.url = url
|
||||||
@ -101,6 +104,7 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
self.link_regex = link_regex
|
self.link_regex = link_regex
|
||||||
self._lock = asyncio.Lock() if self.use_async else None
|
self._lock = asyncio.Lock() if self.use_async else None
|
||||||
self.headers = headers
|
self.headers = headers
|
||||||
|
self.check_response_status = check_response_status
|
||||||
|
|
||||||
def _get_child_links_recursive(
|
def _get_child_links_recursive(
|
||||||
self, url: str, visited: Set[str], *, depth: int = 0
|
self, url: str, visited: Set[str], *, depth: int = 0
|
||||||
@ -123,8 +127,13 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
visited.add(url)
|
visited.add(url)
|
||||||
try:
|
try:
|
||||||
response = requests.get(url, timeout=self.timeout, headers=self.headers)
|
response = requests.get(url, timeout=self.timeout, headers=self.headers)
|
||||||
except Exception:
|
if self.check_response_status and 400 <= response.status_code <= 599:
|
||||||
logger.warning(f"Unable to load from {url}")
|
raise ValueError(f"Received HTTP status {response.status_code}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Unable to load from {url}. Received error {e} of type "
|
||||||
|
f"{e.__class__.__name__}"
|
||||||
|
)
|
||||||
return
|
return
|
||||||
content = self.extractor(response.text)
|
content = self.extractor(response.text)
|
||||||
if content:
|
if content:
|
||||||
@ -193,6 +202,8 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
try:
|
try:
|
||||||
async with session.get(url) as response:
|
async with session.get(url) as response:
|
||||||
text = await response.text()
|
text = await response.text()
|
||||||
|
if self.check_response_status and 400 <= response.status <= 599:
|
||||||
|
raise ValueError(f"Received HTTP status {response.status}")
|
||||||
except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
|
except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Unable to load {url}. Received error {e} of type "
|
f"Unable to load {url}. Received error {e} of type "
|
||||||
|
@ -12,9 +12,10 @@ def test_async_recursive_url_loader() -> None:
|
|||||||
use_async=True,
|
use_async=True,
|
||||||
max_depth=3,
|
max_depth=3,
|
||||||
timeout=None,
|
timeout=None,
|
||||||
|
check_response_status=True,
|
||||||
)
|
)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
assert len(docs) == 890
|
assert len(docs) == 513
|
||||||
assert docs[0].page_content == "placeholder"
|
assert docs[0].page_content == "placeholder"
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user