# libs/community/tests/unit_tests/document_loaders/test_recursive_url_loader.py
"""Unit tests for ``RecursiveUrlLoader`` against a small mocked website.

The mocked site has four pages: the root links to ``/one`` and ``/two``, and
both of those link to ``/three``.  The same pages are served to the
synchronous code path (through ``requests_mock``) and to the asynchronous
code path (through ``MockGet``, patched over ``aiohttp.ClientSession.get``).
"""

import aiohttp
import requests_mock

from langchain_community.document_loaders.recursive_url_loader import (
    RecursiveUrlLoader,
)

BASE_URL = "http://test.com"

# NOTE(review): the anchor markup below was reconstructed so that the link
# graph matches what the tests assert (4 distinct pages reachable from the
# root).  Only the visible text ("hullo", "buhbye", "the end") is known for
# certain -- confirm element and attribute details against the real fixture.
_ROOT_HTML = (
    '<div><a href="/one">hullo</a></div>'
    '<div><a href="/two">buhbye</a></div>'
)
_ONE_HTML = '<div><a href="/three">buhbye</a></div>'
_TWO_HTML = '<div><a href="/three">buhbye</a></div>'
_THREE_HTML = "<div><p>the end</p></div>"

# (url path segment, page body) pairs, in the order MockGet checks them.
_ROUTES = (
    ("one", _ONE_HTML),
    ("two", _TWO_HTML),
    ("three", _THREE_HTML),
)


def mock_requests(loader):
    """Run ``loader.load()`` with every page of the mocked site registered.

    Args:
        loader: The loader under test; only its ``load()`` method is called.

    Returns:
        Whatever ``loader.load()`` returns while the mock is active.
    """
    with requests_mock.Mocker() as m:
        m.get(BASE_URL, text=_ROOT_HTML)
        for segment, html in _ROUTES:
            m.get(f"{BASE_URL}/{segment}", text=html)
        return loader.load()


class MockGet:
    """Stand-in for the response context manager of ``ClientSession.get``.

    The tests patch this class over ``aiohttp.ClientSession.get``, so calling
    ``session.get(url)`` constructs a ``MockGet``.  It supports ``async with``
    and exposes the ``text()`` coroutine and ``headers`` attribute.
    """

    def __init__(self, url, *args, **kwargs):
        """Pick the page body for ``url``.

        The first route whose segment occurs anywhere in the URL wins; a URL
        matching no route gets the root page.  Extra positional and keyword
        arguments are accepted and ignored so the mock keeps working if the
        caller passes request options to ``get``.
        """
        target = str(url)
        self._text = _ROOT_HTML
        for segment, html in _ROUTES:
            if segment in target:
                self._text = html
                break
        self.headers = {}

    async def text(self):
        """Return the page body selected at construction time."""
        return self._text

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        # Nothing to release; returning None lets exceptions propagate.
        pass


def test_sync__init__():
    """With ``max_depth=1`` the synchronous load yields exactly one document."""
    loader = RecursiveUrlLoader(BASE_URL, max_depth=1)
    docs = mock_requests(loader)
    assert len(docs) == 1


def test_async__init__(mocker):
    """With ``max_depth=1`` the async load yields exactly one document."""
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    loader = RecursiveUrlLoader(BASE_URL, max_depth=1, use_async=True)
    docs = loader.load()
    assert len(docs) == 1


def test_sync_deduplication():
    """``/three`` is linked from two pages but must be loaded only once."""
    loader = RecursiveUrlLoader(BASE_URL, max_depth=3)
    docs = mock_requests(loader)
    assert len(docs) == 4


def test_async_deduplication(mocker):
    """Async variant: ``/three`` is linked twice but loaded only once."""
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    loader = RecursiveUrlLoader(BASE_URL, max_depth=3, use_async=True)
    docs = loader.load()
    assert len(docs) == 4