community[minor]: Add custom sitemap URL parameter to GitbookLoader (#30549)

## Description
This PR adds a new `sitemap_url` parameter to the `GitbookLoader` class
that allows users to specify a custom sitemap URL when loading content
from a GitBook site. This is particularly useful for GitBook sites that
use non-standard sitemap file names like `sitemap-pages.xml` instead of
the default `sitemap.xml`.
The standard `GitbookLoader` assumes that the sitemap is located at
`/sitemap.xml`, but some GitBook instances (including GitBook's own
documentation) use different paths for their sitemaps. This parameter
makes the loader more flexible and helps users extract content from a
wider range of GitBook sites.
## Issue
Fixes bug
[30473](https://github.com/langchain-ai/langchain/issues/30473) where
the `GitbookLoader` would fail to find pages on GitBook sites that use
custom sitemap URLs.
## Dependencies
No new dependencies required.
*I've added*:
* Unit tests to verify the parameter works correctly
* Integration tests to confirm the parameter is properly used with real
GitBook sites
* Docstring updates documenting the new parameter
The changes are fully backward compatible, as the parameter is optional
with a sensible default.

---------

Co-authored-by: andrasfe <andrasf94@gmail.com>
Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
This commit is contained in:
Andras L Ferenczi 2025-04-01 12:17:21 -04:00 committed by GitHub
parent fdda1aaea1
commit 64df60e690
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 235 additions and 1 deletions

View File

@ -21,6 +21,8 @@ class GitbookLoader(WebBaseLoader):
content_selector: str = "main",
continue_on_failure: bool = False,
show_progress: bool = True,
*,
sitemap_url: Optional[str] = None,
):
"""Initialize with web page and whether to load all paths.
@ -38,13 +40,20 @@ class GitbookLoader(WebBaseLoader):
exception. Setting this to True makes the loader more robust, but also
may result in missing data. Default: False
show_progress: whether to show a progress bar while loading. Default: True
sitemap_url: Custom sitemap URL to use when load_all_paths is True.
Defaults to "{base_url}/sitemap.xml".
"""
self.base_url = base_url or web_page
if self.base_url.endswith("/"):
self.base_url = self.base_url[:-1]
if load_all_paths:
# set web_path to the sitemap if we want to crawl all paths
web_page = f"{self.base_url}/sitemap.xml"
if sitemap_url:
web_page = sitemap_url
else:
web_page = f"{self.base_url}/sitemap.xml"
super().__init__(
web_paths=(web_page,),
continue_on_failure=continue_on_failure,

View File

@ -54,3 +54,24 @@ class TestGitbookLoader:
result = loader.load()
print(len(result)) # noqa: T201
assert len(result) > 10
@pytest.mark.parametrize(
    "web_page, sitemap_url, expected_web_path",
    [
        (
            "https://example.com/",
            "https://example.com/custom-sitemap.xml",
            "https://example.com/custom-sitemap.xml",
        ),
    ],
)
def test_init_with_custom_sitemap(
    self,
    web_page: str,
    sitemap_url: str,
    expected_web_path: str,
) -> None:
    """Verify that a caller-supplied sitemap URL becomes the loader's web path.

    Args:
        web_page: Base GitBook URL handed to the loader.
        sitemap_url: Custom sitemap location passed via ``sitemap_url``.
        expected_web_path: The value ``loader.web_path`` is expected to hold.
    """
    gitbook_loader = GitbookLoader(
        web_page,
        load_all_paths=True,
        sitemap_url=sitemap_url,
    )

    # The custom sitemap must win over the default one.
    assert gitbook_loader.web_path == expected_web_path
    assert gitbook_loader.load_all_paths

View File

@ -0,0 +1,204 @@
from typing import Any, Tuple
from unittest.mock import MagicMock, patch
import pytest
from bs4 import BeautifulSoup
from langchain_community.document_loaders.gitbook import GitbookLoader
@pytest.fixture
def mock_soups() -> Tuple[BeautifulSoup, BeautifulSoup]:
    """Provide parsed sitemap and page documents for the loader tests.

    Returns:
        A ``(sitemap_soup, page_soup)`` tuple. The sitemap lists three
        ``<loc>`` entries under ``https://example.com``; the page contains a
        ``<main>`` element with an ``<h1>`` title and one paragraph.
    """
    # Sitemap markup with three <loc> entries.
    sitemap_markup = """
<urlset>
<url><loc>https://example.com/page1</loc></url>
<url><loc>https://example.com/page2</loc></url>
<url><loc>https://example.com/page3</loc></url>
</urlset>
"""
    sitemap_soup = BeautifulSoup(sitemap_markup, "html.parser")

    # Minimal content page with a <main> section.
    page_markup = """
<html>
<body>
<main>
<h1>Test Page</h1>
<p>This is test content.</p>
</main>
</body>
</html>
"""
    page_soup = BeautifulSoup(page_markup, "html.parser")

    return (sitemap_soup, page_soup)
@patch("langchain_community.document_loaders.web_base.requests.get")
def test_init_with_default_sitemap(mock_get: MagicMock) -> None:
    """Check the default sitemap URL is used when none is supplied.

    With ``load_all_paths=True`` and no ``sitemap_url``, the first web path
    is expected to be ``<base_url>/sitemap.xml``.
    """
    gitbook_loader = GitbookLoader(
        web_page="https://example.com",
        load_all_paths=True,
    )

    first_web_path = gitbook_loader.web_paths[0]
    assert first_web_path == "https://example.com/sitemap.xml"
@patch("langchain_community.document_loaders.web_base.requests.get")
def test_init_with_custom_sitemap(mock_get: MagicMock) -> None:
    """Check that an explicit ``sitemap_url`` replaces the default sitemap."""
    custom_sitemap = "https://example.com/sitemap-pages.xml"
    loader_kwargs = {
        "web_page": "https://example.com",
        "load_all_paths": True,
        "sitemap_url": custom_sitemap,
    }

    gitbook_loader = GitbookLoader(**loader_kwargs)

    # The first (and intended) web path must be the custom sitemap itself.
    first_web_path = gitbook_loader.web_paths[0]
    assert first_web_path == custom_sitemap
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all")
def test_lazy_load_with_custom_sitemap(
    mock_scrape_all: MagicMock,
    mock_scrape: MagicMock,
    mock_soups: Tuple[BeautifulSoup, BeautifulSoup],
) -> None:
    """Check ``lazy_load`` yields one document per sitemap entry.

    ``scrape`` is stubbed to return the sitemap soup and ``scrape_all`` to
    return three copies of the page soup, so no network access is needed.
    """
    sitemap_soup, page_soup = mock_soups

    # Stub out both scraping entry points.
    mock_scrape.return_value = sitemap_soup
    mock_scrape_all.return_value = [page_soup] * 3

    gitbook_loader = GitbookLoader(
        web_page="https://example.com",
        load_all_paths=True,
        sitemap_url="https://example.com/sitemap-pages.xml",
    )

    documents = list(gitbook_loader.lazy_load())

    # One document is expected for each of the three sitemap paths.
    assert len(documents) == 3
    for document in documents:
        assert document.metadata["title"] == "Test Page"
        assert "This is test content." in document.page_content
@patch("langchain_community.document_loaders.web_base.requests.get")
def test_with_single_page(mock_get: MagicMock) -> None:
    """Check that ``load_all_paths=False`` leaves the page URL untouched."""
    single_page_loader = GitbookLoader(
        web_page="https://example.com/page",
        load_all_paths=False,
    )

    # No sitemap substitution is expected for a single-page load.
    first_web_path = single_page_loader.web_paths[0]
    assert first_web_path == "https://example.com/page"
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
def test_get_paths_extraction(
    mock_scrape: MagicMock, mock_soups: Tuple[BeautifulSoup, BeautifulSoup]
) -> None:
    """Check ``_get_paths`` pulls the expected paths out of a sitemap soup.

    ``scrape`` is stubbed to return the fixture sitemap, whose three
    ``<loc>`` URLs are expected to come back as ``/page1`` .. ``/page3``.
    """
    sitemap_soup = mock_soups[0]
    mock_scrape.return_value = sitemap_soup

    gitbook_loader = GitbookLoader(
        web_page="https://example.com",
        load_all_paths=True,
    )
    scraped_sitemap = gitbook_loader.scrape()
    extracted_paths = gitbook_loader._get_paths(scraped_sitemap)

    # Both the count and the exact ordered values are checked.
    assert len(extracted_paths) == 3
    assert extracted_paths == ["/page1", "/page2", "/page3"]
@patch("requests.get")
def test_integration_with_different_sitemaps(mock_get: MagicMock) -> None:
    """Simulate the reported issue: empty default sitemap vs. populated custom one.

    Two scenarios are run back to back:

    1. Default sitemap: ``scrape`` returns an empty ``<urlset>`` and
       ``scrape_all`` returns nothing, so no documents are expected.
    2. Custom sitemap: ``scrape`` returns a sitemap with two ``<loc>``
       entries and ``scrape_all`` returns two pages, so two documents are
       expected.

    NOTE(review): ``scrape`` and ``scrape_all`` are patched in both
    scenarios, so the ``requests.get`` routing below is presumably never
    exercised -- confirm before relying on it.
    """
    default_sitemap_url = "https://docs.gitbook.com/sitemap.xml"
    custom_sitemap_url = "https://docs.gitbook.com/sitemap-pages.xml"

    # Canned HTTP responses: empty default sitemap, populated custom sitemap,
    # and a generic content page for every other URL.
    empty_resp = MagicMock(text="<urlset></urlset>", status_code=200)
    custom_resp = MagicMock(
        text="""
<urlset>
<url><loc>https://docs.gitbook.com/page1</loc></url>
<url><loc>https://docs.gitbook.com/page2</loc></url>
</urlset>
""",
        status_code=200,
    )
    page_resp = MagicMock(
        text="""
<html><body><main><h1>Page</h1><p>Content</p></main></body></html>
""",
        status_code=200,
    )

    responses_by_url = {
        default_sitemap_url: empty_resp,
        custom_sitemap_url: custom_resp,
    }

    def route_request(url: str, *args: Any, **kwargs: Any) -> MagicMock:
        # Any URL that is not one of the two sitemaps is treated as a page.
        return responses_by_url.get(url, page_resp)

    mock_get.side_effect = route_request

    # Patch targets, spelled once so both scenarios share them.
    loader_cls_path = "langchain_community.document_loaders.gitbook.GitbookLoader"
    requests_get_path = "langchain_community.document_loaders.web_base.requests.get"
    scrape_path = f"{loader_cls_path}.scrape"
    scrape_all_path = f"{loader_cls_path}.scrape_all"

    # Scenario 1: default sitemap (should result in no docs).
    with patch(requests_get_path, side_effect=route_request), patch(
        scrape_path
    ) as mock_scrape, patch(scrape_all_path) as mock_scrape_all:
        mock_scrape.return_value = BeautifulSoup("<urlset></urlset>", "html.parser")
        mock_scrape_all.return_value = []

        default_loader = GitbookLoader(
            web_page="https://docs.gitbook.com/", load_all_paths=True
        )
        default_docs = list(default_loader.lazy_load())

        assert len(default_docs) == 0

    # Scenario 2: custom sitemap (should result in docs).
    with patch(requests_get_path, side_effect=route_request), patch(
        scrape_path
    ) as mock_scrape, patch(scrape_all_path) as mock_scrape_all:
        mock_scrape.return_value = BeautifulSoup(custom_resp.text, "html.parser")
        mock_scrape_all.return_value = [
            BeautifulSoup(page_resp.text, "html.parser") for _ in range(2)
        ]

        custom_loader = GitbookLoader(
            web_page="https://docs.gitbook.com/",
            load_all_paths=True,
            sitemap_url="https://docs.gitbook.com/sitemap-pages.xml",
        )
        custom_docs = list(custom_loader.lazy_load())

        assert len(custom_docs) == 2