Patch summary: extends the RecursiveUrlLoader integration tests with
lazy_load(), aload() and alazy_load() variants of the existing load() tests,
folds multi-line constructor arguments onto fewer lines, and changes
async_loader in test_sync_async_equivalent and
test_sync_async_metadata_necessary_properties from use_async=False to
use_async=True so the two loaders being compared actually differ.

NOTE(review): the new "async def" tests only execute if the test run uses an
asyncio-aware pytest plugin or mode - confirm against the project config.
NOTE(review): the asserted document counts (512 and 24) are tied to whatever
https://docs.python.org/3.9/ serves at run time.

diff --git a/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py b/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py
index 92c274f33ed..533f7fa038b 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py
@@ -1,33 +1,78 @@
 from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
-
+import asyncio
 
 def test_async_recursive_url_loader() -> None:
     url = "https://docs.python.org/3.9/"
     loader = RecursiveUrlLoader(
-        url,
-        extractor=lambda _: "placeholder",
-        use_async=True,
-        max_depth=3,
-        timeout=None,
-        check_response_status=True,
+        url, extractor=lambda _: "placeholder", use_async=True, max_depth=3,
+        timeout=None, check_response_status=True,
     )
     docs = loader.load()
     assert len(docs) == 512
     assert docs[0].page_content == "placeholder"
 
-
 def test_async_recursive_url_loader_deterministic() -> None:
     url = "https://docs.python.org/3.9/"
     loader = RecursiveUrlLoader(
-        url,
-        use_async=True,
-        max_depth=3,
-        timeout=None,
+        url, use_async=True, max_depth=3, timeout=None,
     )
     docs = sorted(loader.load(), key=lambda d: d.metadata["source"])
     docs_2 = sorted(loader.load(), key=lambda d: d.metadata["source"])
     assert docs == docs_2
 
+def test_async_recursive_url_lazy_loader() -> None:
+    url = "https://docs.python.org/3.9/"
+    loader = RecursiveUrlLoader(
+        url, extractor=lambda _: "placeholder", use_async=True, max_depth=3,
+        timeout=None, check_response_status=True,
+    )
+    docs = [doc for doc in loader.lazy_load()]
+    assert len(docs) == 512
+    assert docs[0].page_content == "placeholder"
+
+def test_async_recursive_url_lazy_loader_deterministic() -> None:
+    url = "https://docs.python.org/3.9/"
+    loader = RecursiveUrlLoader(
+        url, use_async=True, max_depth=3, timeout=None,
+    )
+    docs = sorted([doc for doc in loader.lazy_load()], key=lambda d: d.metadata["source"])
+    docs_2 = sorted([doc for doc in loader.lazy_load()], key=lambda d: d.metadata["source"])
+    assert docs == docs_2
+
+
+async def test_async_recursive_url_alazy_loader() -> None:
+    url = "https://docs.python.org/3.9/"
+    loader = RecursiveUrlLoader(
+        url, extractor=lambda _: "placeholder", use_async=True, max_depth=3,
+        timeout=None, check_response_status=True,
+    )
+    docs = [doc async for doc in loader.alazy_load()]
+    assert len(docs) == 512
+    assert docs[0].page_content == "placeholder"
+
+def test_async_recursive_url_aloader() -> None:
+    url = "https://docs.python.org/3.9/"
+    loader = RecursiveUrlLoader(
+        url, extractor=lambda _: "placeholder", use_async=True, max_depth=3,
+        timeout=None, check_response_status=True,
+    )
+    docs = asyncio.run(loader.aload())
+    assert len(docs) == 512
+    assert docs[0].page_content == "placeholder"
+
+def test_async_recursive_url_aloader_deterministic() -> None:
+    url = "https://docs.python.org/3.9/"
+    loader = RecursiveUrlLoader(url, use_async=True, max_depth=3, timeout=None,)
+    docs = sorted(asyncio.run(loader.aload()), key=lambda d: d.metadata["source"])
+    docs_2 = sorted(asyncio.run(loader.aload()), key=lambda d: d.metadata["source"])
+    assert docs == docs_2
+
+async def test_async_recursive_url_alazy_loader_deterministic() -> None:
+    url = "https://docs.python.org/3.9/"
+    loader = RecursiveUrlLoader(url, use_async=True, max_depth=3, timeout=None,)
+    docs = sorted([doc async for doc in loader.alazy_load()], key=lambda d: d.metadata["source"])
+    docs_2 = sorted([doc async for doc in loader.alazy_load()], key=lambda d: d.metadata["source"])
+    assert docs == docs_2
 
 def test_sync_recursive_url_loader() -> None:
     url = "https://docs.python.org/3.9/"
@@ -38,16 +83,41 @@ def test_sync_recursive_url_loader() -> None:
     assert len(docs) == 24
     assert docs[0].page_content == "placeholder"
 
+def test_sync_recursive_url_lazy_loader() -> None:
+    url = "https://docs.python.org/3.9/"
+    loader = RecursiveUrlLoader(
+        url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
+    )
+    docs = [doc for doc in loader.lazy_load()]
+    assert len(docs) == 24
+    assert docs[0].page_content == "placeholder"
+
+def test_sync_recursive_url_aloader() -> None:
+    url = "https://docs.python.org/3.9/"
+    loader = RecursiveUrlLoader(
+        url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
+    )
+    docs = asyncio.run(loader.aload())
+    assert len(docs) == 24
+    assert docs[0].page_content == "placeholder"
+
+async def test_sync_recursive_url_alazy_loader() -> None:
+    url = "https://docs.python.org/3.9/"
+    loader = RecursiveUrlLoader(
+        url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
+    )
+    docs = [doc async for doc in loader.alazy_load()]
+    assert len(docs) == 24
+    assert docs[0].page_content == "placeholder"
 
 def test_sync_async_equivalent() -> None:
     url = "https://docs.python.org/3.9/"
     loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
-    async_loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
+    async_loader = RecursiveUrlLoader(url, use_async=True, max_depth=2)
     docs = sorted(loader.load(), key=lambda d: d.metadata["source"])
     async_docs = sorted(async_loader.load(), key=lambda d: d.metadata["source"])
     assert docs == async_docs
 
-
 def test_loading_invalid_url() -> None:
     url = "https://this.url.is.invalid/this/is/a/test"
     loader = RecursiveUrlLoader(
@@ -56,11 +126,10 @@ def test_loading_invalid_url() -> None:
     docs = loader.load()
     assert len(docs) == 0
 
-
 def test_sync_async_metadata_necessary_properties() -> None:
     url = "https://docs.python.org/3.9/"
     loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
-    async_loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
+    async_loader = RecursiveUrlLoader(url, use_async=True, max_depth=2)
     docs = loader.load()
     async_docs = async_loader.load()
     for doc in docs: