diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py index 6a73e515a82..45f27a8d78d 100644 --- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py +++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py @@ -120,6 +120,7 @@ class RecursiveUrlLoader(BaseLoader): return # Get all links that can be accessed from the current URL + visited.add(url) try: response = requests.get(url, timeout=self.timeout, headers=self.headers) except Exception: @@ -131,7 +132,6 @@ class RecursiveUrlLoader(BaseLoader): page_content=content, metadata=self.metadata_extractor(response.text, url), ) - visited.add(url) # Store the visited links and recursively visit the children sub_links = extract_sub_links( @@ -184,11 +184,11 @@ class RecursiveUrlLoader(BaseLoader): timeout=aiohttp.ClientTimeout(total=self.timeout), headers=self.headers, ) + async with self._lock: # type: ignore + visited.add(url) try: async with session.get(url) as response: text = await response.text() - async with self._lock: # type: ignore - visited.add(url) except (aiohttp.client_exceptions.InvalidURL, Exception) as e: logger.warning( f"Unable to load {url}. Received error {e} of type " diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_recursive_url_loader.py b/libs/langchain/tests/integration_tests/document_loaders/test_recursive_url_loader.py index b2080921dd8..82f96f044fe 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_recursive_url_loader.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_recursive_url_loader.py @@ -14,7 +14,7 @@ def test_async_recursive_url_loader() -> None: timeout=None, ) docs = loader.load() - assert len(docs) == 1024 + assert len(docs) == 890 assert docs[0].page_content == "placeholder" @@ -38,7 +38,7 @@ def test_sync_recursive_url_loader() -> None: url, extractor=lambda _: "placeholder", use_async=False, max_depth=2 ) docs = loader.load() - assert len(docs) == 27 + assert len(docs) == 25 assert docs[0].page_content == "placeholder"