diff --git a/libs/community/tests/unit_tests/document_loaders/test_web_base.py b/libs/community/tests/unit_tests/document_loaders/test_web_base.py index ea789009806..529c19b4c1f 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_web_base.py +++ b/libs/community/tests/unit_tests/document_loaders/test_web_base.py @@ -1,3 +1,7 @@ +from textwrap import dedent +from typing import Any +from unittest.mock import MagicMock, patch + import pytest as pytest from langchain_community.document_loaders.web_base import WebBaseLoader @@ -19,3 +23,62 @@ class TestWebBaseLoader: assert web_base_loader.web_paths == ["https://www.example.com"] web_base_loader = WebBaseLoader(web_path="https://www.example.com") assert web_base_loader.web_paths == ["https://www.example.com"] + + +@pytest.mark.requires("bs4") +@patch("langchain_community.document_loaders.web_base.requests.Session.get") +def test_lazy_load(mock_get: Any) -> None: + import bs4 + + mock_response = MagicMock() + mock_response.text = "
Test content
" + mock_get.return_value = mock_response + + loader = WebBaseLoader(web_paths=["https://www.example.com"]) + results = list(loader.lazy_load()) + mock_get.assert_called_with("https://www.example.com") + assert len(results) == 1 + assert results[0].page_content == "Test content" + + # Test bs4 kwargs + mock_html = dedent(""" + + +Test content
+Test content
" + + mock_response = MagicMock() + mock_response.text = mock_text + mock_get.return_value.__aenter__.return_value = mock_response + + loader = WebBaseLoader( + web_paths=["https://www.example.com"], + header_template={"User-Agent": "test-user-agent"}, + ) + results = loader.aload() + assert len(results) == 1 + assert results[0].page_content == "Test content" + mock_get.assert_called_with( + "https://www.example.com", headers={"User-Agent": "test-user-agent"}, cookies={} + )