fix recursive loader (#10856)

This commit is contained in:
Bagatur 2023-09-20 13:55:47 -07:00 committed by GitHub
parent de0a02f507
commit b05a74b106
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 5 deletions

View File

@@ -120,6 +120,7 @@ class RecursiveUrlLoader(BaseLoader):
             return
         # Get all links that can be accessed from the current URL
+        visited.add(url)
         try:
             response = requests.get(url, timeout=self.timeout, headers=self.headers)
         except Exception:
@@ -131,7 +132,6 @@ class RecursiveUrlLoader(BaseLoader):
                 page_content=content,
                 metadata=self.metadata_extractor(response.text, url),
             )
-        visited.add(url)

        # Store the visited links and recursively visit the children
        sub_links = extract_sub_links(
@@ -184,11 +184,11 @@ class RecursiveUrlLoader(BaseLoader):
             timeout=aiohttp.ClientTimeout(total=self.timeout),
             headers=self.headers,
         )
+        async with self._lock:  # type: ignore
+            visited.add(url)
         try:
             async with session.get(url) as response:
                 text = await response.text()
-                async with self._lock:  # type: ignore
-                    visited.add(url)
         except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
             logger.warning(
                 f"Unable to load {url}. Received error {e} of type "

View File

@@ -14,7 +14,7 @@ def test_async_recursive_url_loader() -> None:
         timeout=None,
     )
     docs = loader.load()
-    assert len(docs) == 1024
+    assert len(docs) == 890
     assert docs[0].page_content == "placeholder"
@@ -38,7 +38,7 @@ def test_sync_recursive_url_loader() -> None:
         url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
     )
     docs = loader.load()
-    assert len(docs) == 27
+    assert len(docs) == 25
    assert docs[0].page_content == "placeholder"