mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-14 08:56:27 +00:00
fix recursive loader (#10856)
This commit is contained in:
parent
de0a02f507
commit
b05a74b106
@ -120,6 +120,7 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
return
|
||||
|
||||
# Get all links that can be accessed from the current URL
|
||||
visited.add(url)
|
||||
try:
|
||||
response = requests.get(url, timeout=self.timeout, headers=self.headers)
|
||||
except Exception:
|
||||
@ -131,7 +132,6 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
page_content=content,
|
||||
metadata=self.metadata_extractor(response.text, url),
|
||||
)
|
||||
visited.add(url)
|
||||
|
||||
# Store the visited links and recursively visit the children
|
||||
sub_links = extract_sub_links(
|
||||
@ -184,11 +184,11 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
timeout=aiohttp.ClientTimeout(total=self.timeout),
|
||||
headers=self.headers,
|
||||
)
|
||||
async with self._lock: # type: ignore
|
||||
visited.add(url)
|
||||
try:
|
||||
async with session.get(url) as response:
|
||||
text = await response.text()
|
||||
async with self._lock: # type: ignore
|
||||
visited.add(url)
|
||||
except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
|
||||
logger.warning(
|
||||
f"Unable to load {url}. Received error {e} of type "
|
||||
|
@ -14,7 +14,7 @@ def test_async_recursive_url_loader() -> None:
|
||||
timeout=None,
|
||||
)
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1024
|
||||
assert len(docs) == 890
|
||||
assert docs[0].page_content == "placeholder"
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ def test_sync_recursive_url_loader() -> None:
|
||||
url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
|
||||
)
|
||||
docs = loader.load()
|
||||
assert len(docs) == 27
|
||||
assert len(docs) == 25
|
||||
assert docs[0].page_content == "placeholder"
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user