"""Unit tests for ``SitemapLoader`` built around one canned four-URL sitemap.

Path (from the patch header):
libs/community/tests/unit_tests/document_loaders/test_sitemap.py
"""

import aiohttp  # noqa: F401  # kept from the original import block; unused here
import requests
import requests_mock.mocker

from langchain_community.document_loaders import SitemapLoader

# Location every test pretends the sitemap is served from.
SITEMAP_URL = "http://test.com/sitemap.xml"

# Canned sitemap: one example.com entry, two test.com entries and one
# important.website.org entry. Element names follow the sitemaps.org protocol.
# NOTE(review): the xmlns value is the standard sitemap namespace -- confirm it
# matches the fixture originally committed.
xml_ = """
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
<url>
<loc>https://test.com/production</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
<url>
<loc>https://test.com/next</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
<url>
<loc>https://important.website.org</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
</urlset>"""


class CustomSession(requests.Session):
    """``requests.Session`` stub whose ``get`` never touches the network.

    Every call returns a synthetic ``200`` response whose body is looked up by
    exact URL; unknown URLs receive a fixed fallback body.
    """

    # Exact-match URL -> response body.
    # The sitemap entry lets a loader that fetches its sitemap through this
    # session receive parseable XML instead of the fallback body (assumes the
    # loader calls ``session.get`` for the sitemap; harmless otherwise).
    _PAGES = {
        SITEMAP_URL: xml_,
        "https://example.com": "HERE",
        "https://test.com/production": "NOT HERE",
        "https://test.com/next": "ALSO NOT HERE",
    }
    _FALLBACK_PAGE = "MAYBE HERE"

    def get(self, url, **kwargs) -> requests.Response:
        """Return the canned response for ``url``; ``kwargs`` are ignored."""
        return self._mock_response(self._PAGES.get(url, self._FALLBACK_PAGE))

    def _mock_response(self, text: str) -> requests.Response:
        """Build a ``200`` response whose body is ``text`` encoded as UTF-8."""
        response = requests.Response()
        response.status_code = 200
        response._content = text.encode("utf-8")
        # Declare the encoding so ``response.text`` does not depend on charset
        # detection.
        response.encoding = "utf-8"
        return response


async def test__init__() -> None:
    """A loader built with a custom ``session`` is expected to load 4 docs."""
    session = CustomSession()
    # Check the stub directly rather than through a real HTTP client, so the
    # test never needs network access.
    assert session.get("https://example.com").text == "HERE"

    loader = SitemapLoader(
        SITEMAP_URL, restrict_to_same_domain=False, session=session
    )
    with requests_mock.mocker.Mocker() as m:
        m.get(SITEMAP_URL, text=xml_)
        docs = loader.load()

    assert len(docs) == 4


def test_lazy_load() -> None:
    """``lazy_load`` is expected to yield one document per sitemap entry."""
    loader = SitemapLoader(SITEMAP_URL, restrict_to_same_domain=False)

    with requests_mock.mocker.Mocker() as m:
        m.get(SITEMAP_URL, text=xml_)
        docs = list(loader.lazy_load())

    assert len(docs) == 4


async def test_alazy_load() -> None:
    """``alazy_load`` is expected to yield one document per sitemap entry."""
    loader = SitemapLoader(SITEMAP_URL, restrict_to_same_domain=False)

    with requests_mock.mocker.Mocker() as m:
        m.get(SITEMAP_URL, text=xml_)
        docs = [doc async for doc in loader.alazy_load()]

    assert len(docs) == 4


async def test_same_domain() -> None:
    """With ``restrict_to_same_domain=True`` two documents are expected.

    The canned sitemap lists exactly two ``test.com`` URLs.
    """
    loader = SitemapLoader(SITEMAP_URL, restrict_to_same_domain=True)

    with requests_mock.mocker.Mocker() as m:
        m.get(SITEMAP_URL, text=xml_)
        docs = [doc async for doc in loader.alazy_load()]

    assert len(docs) == 2


def test_regex_filter() -> None:
    """``filter_urls`` is expected to keep only the ``example.com`` entry."""
    loader = SitemapLoader(
        SITEMAP_URL,
        restrict_to_same_domain=False,
        filter_urls=[r".*example.*"],
    )

    with requests_mock.mocker.Mocker() as m:
        m.get(SITEMAP_URL, text=xml_)
        m.get("https://example.com", text="HERE")
        docs = loader.load()

    # Check the count first so an empty result fails here, not on ``docs[0]``.
    assert len(docs) == 1
    assert docs[0].page_content == "HERE"


def test_blocks() -> None:
    """``blocksize=2, blocknum=1`` is expected to load the second URL pair."""
    loader = SitemapLoader(
        SITEMAP_URL,
        restrict_to_same_domain=False,
        blocksize=2,
        blocknum=1,
    )

    with requests_mock.mocker.Mocker() as m:
        m.get(SITEMAP_URL, text=xml_)
        docs = loader.load()

    # Check the count first so an empty result fails here, not on ``docs[0]``.
    assert len(docs) == 2
    assert docs[0].metadata["loc"] == "https://test.com/next"