mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-28 11:55:21 +00:00
community[minor]: Add custom sitemap URL parameter to GitbookLoader (#30549)
## Description

This PR adds a new `sitemap_url` parameter to the `GitbookLoader` class that allows users to specify a custom sitemap URL when loading content from a GitBook site. This is particularly useful for GitBook sites that use non-standard sitemap file names like `sitemap-pages.xml` instead of the default `sitemap.xml`.

The standard `GitbookLoader` assumes that the sitemap is located at `/sitemap.xml`, but some GitBook instances (including GitBook's own documentation) use different paths for their sitemaps. This parameter makes the loader more flexible and helps users extract content from a wider range of GitBook sites.

## Issue

Fixes bug [30473](https://github.com/langchain-ai/langchain/issues/30473) where the `GitbookLoader` would fail to find pages on GitBook sites that use custom sitemap URLs.

## Dependencies

No new dependencies required.

*I've added*:

* Unit tests to verify the parameter works correctly
* Integration tests to confirm the parameter is properly used with real GitBook sites
* Updated docstrings with parameter documentation

The changes are fully backward compatible, as the parameter is optional with a sensible default.

---------

Co-authored-by: andrasfe <andrasf94@gmail.com>
Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
This commit is contained in:
parent
fdda1aaea1
commit
64df60e690
@ -21,6 +21,8 @@ class GitbookLoader(WebBaseLoader):
|
||||
content_selector: str = "main",
|
||||
continue_on_failure: bool = False,
|
||||
show_progress: bool = True,
|
||||
*,
|
||||
sitemap_url: Optional[str] = None,
|
||||
):
|
||||
"""Initialize with web page and whether to load all paths.
|
||||
|
||||
@ -38,13 +40,20 @@ class GitbookLoader(WebBaseLoader):
|
||||
exception. Setting this to True makes the loader more robust, but also
|
||||
may result in missing data. Default: False
|
||||
show_progress: whether to show a progress bar while loading. Default: True
|
||||
sitemap_url: Custom sitemap URL to use when load_all_paths is True.
|
||||
Defaults to "{base_url}/sitemap.xml".
|
||||
"""
|
||||
self.base_url = base_url or web_page
|
||||
if self.base_url.endswith("/"):
|
||||
self.base_url = self.base_url[:-1]
|
||||
|
||||
if load_all_paths:
|
||||
# set web_path to the sitemap if we want to crawl all paths
|
||||
if sitemap_url:
|
||||
web_page = sitemap_url
|
||||
else:
|
||||
web_page = f"{self.base_url}/sitemap.xml"
|
||||
|
||||
super().__init__(
|
||||
web_paths=(web_page,),
|
||||
continue_on_failure=continue_on_failure,
|
||||
|
@ -54,3 +54,24 @@ class TestGitbookLoader:
|
||||
result = loader.load()
|
||||
print(len(result)) # noqa: T201
|
||||
assert len(result) > 10
|
||||
|
||||
@pytest.mark.parametrize(
    ("web_page", "sitemap_url", "expected_web_path"),
    [
        (
            "https://example.com/",
            "https://example.com/custom-sitemap.xml",
            "https://example.com/custom-sitemap.xml",
        ),
    ],
)
def test_init_with_custom_sitemap(
    self,
    web_page: str,
    sitemap_url: str,
    expected_web_path: str,
) -> None:
    """Check that an explicit ``sitemap_url`` becomes the loader's web path.

    Args:
        web_page: Site URL handed to ``GitbookLoader``.
        sitemap_url: Custom sitemap location passed as a keyword argument.
        expected_web_path: Value ``loader.web_path`` must equal afterwards.
    """
    gitbook_loader = GitbookLoader(
        web_page=web_page,
        load_all_paths=True,
        sitemap_url=sitemap_url,
    )

    # The flag must be kept and the custom sitemap must win over the default.
    assert gitbook_loader.load_all_paths
    assert gitbook_loader.web_path == expected_web_path
|
||||
|
204
libs/community/tests/unit_tests/document_loaders/test_gitbook.py
Normal file
204
libs/community/tests/unit_tests/document_loaders/test_gitbook.py
Normal file
@ -0,0 +1,204 @@
|
||||
from typing import Any, Tuple
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from langchain_community.document_loaders.gitbook import GitbookLoader
|
||||
|
||||
|
||||
@pytest.fixture
def mock_soups() -> Tuple[BeautifulSoup, BeautifulSoup]:
    """Provide a parsed sitemap and a parsed content page for the loader tests.

    Returns:
        A ``(sitemap_soup, page_soup)`` pair. The sitemap lists three ``<loc>``
        entries under ``https://example.com``; the page has a ``<main>``
        element holding an ``<h1>`` title and a single paragraph.
    """
    # Sitemap listing the three pages the loader is expected to discover.
    sitemap_markup = """
    <urlset>
        <url><loc>https://example.com/page1</loc></url>
        <url><loc>https://example.com/page2</loc></url>
        <url><loc>https://example.com/page3</loc></url>
    </urlset>
    """

    # Minimal page whose <main> element carries the title and body text.
    page_markup = """
    <html>
        <body>
            <main>
                <h1>Test Page</h1>
                <p>This is test content.</p>
            </main>
        </body>
    </html>
    """

    return (
        BeautifulSoup(sitemap_markup, "html.parser"),
        BeautifulSoup(page_markup, "html.parser"),
    )
|
||||
|
||||
|
||||
@patch("langchain_community.document_loaders.web_base.requests.get")
def test_init_with_default_sitemap(mock_get: MagicMock) -> None:
    """With ``load_all_paths=True`` and no ``sitemap_url``, use ``/sitemap.xml``.

    ``mock_get`` is not used in the body; ``requests.get`` is patched
    presumably to guard against real network access.
    """
    gitbook_loader = GitbookLoader(
        web_page="https://example.com",
        load_all_paths=True,
    )

    first_web_path = gitbook_loader.web_paths[0]
    assert first_web_path == "https://example.com/sitemap.xml"
|
||||
|
||||
|
||||
@patch("langchain_community.document_loaders.web_base.requests.get")
def test_init_with_custom_sitemap(mock_get: MagicMock) -> None:
    """A caller-supplied ``sitemap_url`` replaces the default sitemap location.

    ``mock_get`` is not used in the body; ``requests.get`` is patched
    presumably to guard against real network access.
    """
    sitemap_location = "https://example.com/sitemap-pages.xml"

    gitbook_loader = GitbookLoader(
        web_page="https://example.com",
        load_all_paths=True,
        sitemap_url=sitemap_location,
    )

    first_web_path = gitbook_loader.web_paths[0]
    assert first_web_path == sitemap_location
|
||||
|
||||
|
||||
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape_all")
def test_lazy_load_with_custom_sitemap(
    mock_scrape_all: MagicMock,
    mock_scrape: MagicMock,
    mock_soups: Tuple[BeautifulSoup, BeautifulSoup],
) -> None:
    """``lazy_load`` yields one document per sitemap entry with a custom sitemap.

    Args:
        mock_scrape_all: Patched ``GitbookLoader.scrape_all``; returns the
            page soups.
        mock_scrape: Patched ``GitbookLoader.scrape``; returns the sitemap soup.
        mock_soups: ``(sitemap_soup, page_soup)`` pair from the fixture.
    """
    sitemap_soup, page_soup = mock_soups

    # The sitemap is served by scrape(); every listed page gets the same soup.
    mock_scrape.return_value = sitemap_soup
    mock_scrape_all.return_value = [page_soup] * 3

    gitbook_loader = GitbookLoader(
        web_page="https://example.com",
        load_all_paths=True,
        sitemap_url="https://example.com/sitemap-pages.xml",
    )
    loaded = list(gitbook_loader.lazy_load())

    # One document per <loc> entry, each built from the page's <main> content.
    assert len(loaded) == 3
    for document in loaded:
        assert document.metadata["title"] == "Test Page"
        assert "This is test content." in document.page_content
|
||||
|
||||
|
||||
@patch("langchain_community.document_loaders.web_base.requests.get")
def test_with_single_page(mock_get: MagicMock) -> None:
    """With ``load_all_paths=False`` the page URL itself is the web path.

    ``mock_get`` is not used in the body; ``requests.get`` is patched
    presumably to guard against real network access.
    """
    page_url = "https://example.com/page"

    gitbook_loader = GitbookLoader(web_page=page_url, load_all_paths=False)

    # No sitemap rewriting may happen in single-page mode.
    first_web_path = gitbook_loader.web_paths[0]
    assert first_web_path == "https://example.com/page"
|
||||
|
||||
|
||||
@patch("langchain_community.document_loaders.gitbook.GitbookLoader.scrape")
def test_get_paths_extraction(
    mock_scrape: MagicMock, mock_soups: Tuple[BeautifulSoup, BeautifulSoup]
) -> None:
    """``_get_paths`` turns the sitemap's ``<loc>`` URLs into URL paths.

    Args:
        mock_scrape: Patched ``GitbookLoader.scrape``; returns the sitemap soup.
        mock_soups: ``(sitemap_soup, page_soup)`` pair from the fixture; only
            the sitemap is used here.
    """
    sitemap_soup = mock_soups[0]
    mock_scrape.return_value = sitemap_soup

    gitbook_loader = GitbookLoader(
        web_page="https://example.com",
        load_all_paths=True,
    )

    # scrape() is mocked, so this hands back the fixture's sitemap soup.
    extracted = gitbook_loader._get_paths(gitbook_loader.scrape())

    assert len(extracted) == 3
    assert extracted == ["/page1", "/page2", "/page3"]
|
||||
|
||||
|
||||
@patch("requests.get")
def test_integration_with_different_sitemaps(mock_get: MagicMock) -> None:
    """Simulate the reported issue: empty default sitemap vs. populated custom one.

    ``GitbookLoader.scrape`` and ``GitbookLoader.scrape_all`` are patched for
    each scenario, so the loader works on pre-parsed soups. The canned HTTP
    responses and URL router the test used to build for ``requests.get`` were
    never consulted for that reason and have been removed. ``requests.get``
    stays patched by the decorator only as a guard against real network access.

    Args:
        mock_get: Patched ``requests.get``; not used directly.
    """
    base_page = "https://docs.gitbook.com/"
    populated_sitemap = """
    <urlset>
        <url><loc>https://docs.gitbook.com/page1</loc></url>
        <url><loc>https://docs.gitbook.com/page2</loc></url>
    </urlset>
    """
    page_html = """
    <html><body><main><h1>Page</h1><p>Content</p></main></body></html>
    """

    def count_loaded_docs(
        sitemap_markup: str, page_count: int, **loader_kwargs: Any
    ) -> int:
        """Load through a loader fed ``sitemap_markup`` and count the documents.

        ``scrape`` returns the parsed sitemap and ``scrape_all`` returns
        ``page_count`` parsed copies of ``page_html``.
        """
        sitemap_soup = BeautifulSoup(sitemap_markup, "html.parser")
        page_soups = [
            BeautifulSoup(page_html, "html.parser") for _ in range(page_count)
        ]
        with patch.object(
            GitbookLoader, "scrape", return_value=sitemap_soup
        ), patch.object(GitbookLoader, "scrape_all", return_value=page_soups):
            loader = GitbookLoader(
                web_page=base_page, load_all_paths=True, **loader_kwargs
            )
            return len(list(loader.lazy_load()))

    # Default sitemap location: the sitemap is empty, so nothing is loaded.
    assert count_loaded_docs("<urlset></urlset>", 0) == 0

    # Custom sitemap location: both listed pages are turned into documents.
    assert (
        count_loaded_docs(
            populated_sitemap,
            2,
            sitemap_url="https://docs.gitbook.com/sitemap-pages.xml",
        )
        == 2
    )
|
Loading…
Reference in New Issue
Block a user