"""Unit tests for ``GitbookLoader`` sitemap handling and page extraction."""

from typing import Any, Tuple
from unittest.mock import MagicMock, patch

import pytest
from bs4 import BeautifulSoup

from langchain_community.document_loaders.gitbook import GitbookLoader


@pytest.fixture
def mock_soups() -> Tuple[BeautifulSoup, BeautifulSoup]:
    """Return a ``(sitemap_soup, page_soup)`` pair of parsed mock documents.

    The sitemap lists three pages through ``<loc>`` elements; the page wraps
    a heading and a paragraph in ``<main>``.
    """
    # Create mock soup with loc elements for sitemap testing
    sitemap_content = """
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://example.com/page1</loc></url>
        <url><loc>https://example.com/page2</loc></url>
        <url><loc>https://example.com/page3</loc></url>
    </urlset>
    """
    mock_sitemap_soup = BeautifulSoup(sitemap_content, "html.parser")

    # Create mock soup for page content
    page_content = """
    <html>
        <body>
            <main>
                <h1>Test Page</h1>
                <p>This is test content.</p>
            </main>
        </body>
    </html>
    """
    mock_page_soup = BeautifulSoup(page_content, "html.parser")
    return mock_sitemap_soup, mock_page_soup


@patch("langchain_community.document_loaders.web_base.requests.get")
def test_init_with_default_sitemap(mock_get: MagicMock) -> None:
    """The default ``/sitemap.xml`` URL is used when ``load_all_paths=True``."""
    loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)

    # Check that the web_path was set to the default sitemap URL
    assert loader.web_paths[0] == "https://example.com/sitemap.xml"


@patch("langchain_community.document_loaders.web_base.requests.get")
def test_init_with_custom_sitemap(mock_get: MagicMock) -> None:
    """An explicitly provided ``sitemap_url`` replaces the default one."""
    custom_sitemap = "https://example.com/sitemap-pages.xml"
    loader = GitbookLoader(
        web_page="https://example.com",
        load_all_paths=True,
        sitemap_url=custom_sitemap,
    )

    # Check that the web_path was set to the custom sitemap URL
    assert loader.web_paths[0] == custom_sitemap


@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all")
def test_lazy_load_with_custom_sitemap(
    mock_scrape_all: MagicMock,
    mock_scrape: MagicMock,
    mock_soups: Tuple[BeautifulSoup, BeautifulSoup],
) -> None:
    """``lazy_load`` yields one document per path listed in the sitemap."""
    # Setup the mocks: ``scrape`` serves the sitemap, ``scrape_all`` the pages.
    mock_sitemap_soup, mock_page_soup = mock_soups
    mock_scrape.return_value = mock_sitemap_soup
    mock_scrape_all.return_value = [
        mock_page_soup,
        mock_page_soup,
        mock_page_soup,
    ]

    # Create loader with custom sitemap URL
    loader = GitbookLoader(
        web_page="https://example.com",
        load_all_paths=True,
        sitemap_url="https://example.com/sitemap-pages.xml",
    )

    # Get the documents
    docs = list(loader.lazy_load())

    # Check that we got docs for each path in the sitemap
    assert len(docs) == 3
    for doc in docs:
        assert doc.metadata["title"] == "Test Page"
        assert "This is test content." in doc.page_content


@patch("langchain_community.document_loaders.web_base.requests.get")
def test_with_single_page(mock_get: MagicMock) -> None:
    """With ``load_all_paths=False`` the page URL is used unchanged."""
    loader = GitbookLoader(web_page="https://example.com/page", load_all_paths=False)

    # Check that sitemap URL logic was not applied
    assert loader.web_paths[0] == "https://example.com/page"


@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
def test_get_paths_extraction(
    mock_scrape: MagicMock, mock_soups: Tuple[BeautifulSoup, BeautifulSoup]
) -> None:
    """``_get_paths`` extracts the URL path of every sitemap entry."""
    mock_sitemap_soup, _ = mock_soups
    mock_scrape.return_value = mock_sitemap_soup

    loader = GitbookLoader(web_page="https://example.com", load_all_paths=True)
    soup_info = loader.scrape()
    paths = loader._get_paths(soup_info)

    # Check that paths were extracted correctly
    assert len(paths) == 3
    assert paths == ["/page1", "/page2", "/page3"]


@patch("requests.get")
def test_integration_with_different_sitemaps(mock_get: MagicMock) -> None:
    """Simulate the reported issue with different sitemap formats.

    The default sitemap has no entries and must produce no documents, while
    the custom sitemap lists two pages and must produce two documents.
    """
    # Mock response for default sitemap (no entries)
    empty_resp = MagicMock()
    empty_resp.text = (
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>'
    )
    empty_resp.status_code = 200

    # Mock response for custom sitemap (with content)
    custom_resp = MagicMock()
    custom_resp.text = """
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://docs.gitbook.com/page1</loc></url>
        <url><loc>https://docs.gitbook.com/page2</loc></url>
    </urlset>
    """
    custom_resp.status_code = 200

    # Mock response for the actual pages
    page_resp = MagicMock()
    page_resp.text = """
    <html>
        <body>
            <main>
                <h1>Page</h1>
                <p>Content</p>
            </main>
        </body>
    </html>
    """
    page_resp.status_code = 200

    # Define side effect to return different responses based on URL
    def side_effect(url: str, *args: Any, **kwargs: Any) -> MagicMock:
        if url == "https://docs.gitbook.com/sitemap.xml":
            return empty_resp
        elif url == "https://docs.gitbook.com/sitemap-pages.xml":
            return custom_resp
        else:
            return page_resp

    mock_get.side_effect = side_effect

    # Test with default sitemap (should result in no docs)
    with patch(
        "langchain_community.document_loaders.web_base.requests.get",
        side_effect=side_effect,
    ):
        with patch(
            "langchain_community.document_loaders.gitbook.GitbookLoader.scrape"
        ) as mock_scrape:
            with patch(
                "langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all"
            ) as mock_scrape_all:
                mock_scrape.return_value = BeautifulSoup(
                    empty_resp.text, "html.parser"
                )
                mock_scrape_all.return_value = []

                loader1 = GitbookLoader(
                    web_page="https://docs.gitbook.com/", load_all_paths=True
                )
                docs1 = list(loader1.lazy_load())
                assert len(docs1) == 0

    # Test with custom sitemap (should result in docs)
    with patch(
        "langchain_community.document_loaders.web_base.requests.get",
        side_effect=side_effect,
    ):
        with patch(
            "langchain_community.document_loaders.gitbook.GitbookLoader.scrape"
        ) as mock_scrape:
            with patch(
                "langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all"
            ) as mock_scrape_all:
                mock_scrape.return_value = BeautifulSoup(
                    custom_resp.text, "html.parser"
                )
                mock_scrape_all.return_value = [
                    BeautifulSoup(page_resp.text, "html.parser"),
                    BeautifulSoup(page_resp.text, "html.parser"),
                ]

                loader2 = GitbookLoader(
                    web_page="https://docs.gitbook.com/",
                    load_all_paths=True,
                    sitemap_url="https://docs.gitbook.com/sitemap-pages.xml",
                )
                docs2 = list(loader2.lazy_load())
                assert len(docs2) == 2