mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-18 12:58:59 +00:00
Fix: Sitemap Document Loader Tests and Documentation (#11866)
**Description:** While working on the Docusaurus site loader #9138, I noticed some outdated docs and tests for the Sitemap Loader. **Issue:** This is tangentially related to #6691 in reference to doc links. I plan on digging into a few of these issues when I next find time.
This commit is contained in:
parent
8bb8c56f74
commit
e669f9d731
File diff suppressed because one or more lines are too long
@ -11,7 +11,7 @@ def test_sitemap() -> None:
|
|||||||
loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml")
|
loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml")
|
||||||
documents = loader.load()
|
documents = loader.load()
|
||||||
assert len(documents) > 1
|
assert len(documents) > 1
|
||||||
assert "🦜🔗" in documents[0].page_content
|
assert "LangChain Python API" in documents[0].page_content
|
||||||
|
|
||||||
|
|
||||||
def test_sitemap_block() -> None:
|
def test_sitemap_block() -> None:
|
||||||
@ -21,7 +21,7 @@ def test_sitemap_block() -> None:
|
|||||||
)
|
)
|
||||||
documents = loader.load()
|
documents = loader.load()
|
||||||
assert len(documents) == 1
|
assert len(documents) == 1
|
||||||
assert "🦜🔗" in documents[0].page_content
|
assert "LangChain Python API" in documents[0].page_content
|
||||||
|
|
||||||
|
|
||||||
def test_sitemap_block_only_one() -> None:
|
def test_sitemap_block_only_one() -> None:
|
||||||
@ -31,7 +31,7 @@ def test_sitemap_block_only_one() -> None:
|
|||||||
)
|
)
|
||||||
documents = loader.load()
|
documents = loader.load()
|
||||||
assert len(documents) > 1
|
assert len(documents) > 1
|
||||||
assert "🦜🔗" in documents[0].page_content
|
assert "LangChain Python API" in documents[0].page_content
|
||||||
|
|
||||||
|
|
||||||
def test_sitemap_block_blocknum_default() -> None:
|
def test_sitemap_block_blocknum_default() -> None:
|
||||||
@ -41,7 +41,7 @@ def test_sitemap_block_blocknum_default() -> None:
|
|||||||
)
|
)
|
||||||
documents = loader.load()
|
documents = loader.load()
|
||||||
assert len(documents) > 1
|
assert len(documents) > 1
|
||||||
assert "🦜🔗" in documents[0].page_content
|
assert "LangChain Python API" in documents[0].page_content
|
||||||
|
|
||||||
|
|
||||||
def test_sitemap_block_size_to_small() -> None:
|
def test_sitemap_block_size_to_small() -> None:
|
||||||
@ -76,11 +76,11 @@ def test_filter_sitemap() -> None:
|
|||||||
"""Test sitemap loader."""
|
"""Test sitemap loader."""
|
||||||
loader = SitemapLoader(
|
loader = SitemapLoader(
|
||||||
"https://langchain.readthedocs.io/sitemap.xml",
|
"https://langchain.readthedocs.io/sitemap.xml",
|
||||||
filter_urls=["https://python.langchain.com/en/stable/"],
|
filter_urls=["https://api.python.langchain.com/en/stable/"],
|
||||||
)
|
)
|
||||||
documents = loader.load()
|
documents = loader.load()
|
||||||
assert len(documents) == 1
|
assert len(documents) == 1
|
||||||
assert "🦜🔗" in documents[0].page_content
|
assert "LangChain Python API" in documents[0].page_content
|
||||||
|
|
||||||
|
|
||||||
def test_sitemap_metadata() -> None:
|
def test_sitemap_metadata() -> None:
|
||||||
@ -128,7 +128,7 @@ def test_sitemap_metadata_default() -> None:
|
|||||||
def test_local_sitemap() -> None:
|
def test_local_sitemap() -> None:
|
||||||
"""Test sitemap loader."""
|
"""Test sitemap loader."""
|
||||||
file_path = Path(__file__).parent.parent / "examples/sitemap.xml"
|
file_path = Path(__file__).parent.parent / "examples/sitemap.xml"
|
||||||
loader = SitemapLoader(str(file_path))
|
loader = SitemapLoader(str(file_path), is_local=True)
|
||||||
documents = loader.load()
|
documents = loader.load()
|
||||||
assert len(documents) > 1
|
assert len(documents) > 1
|
||||||
assert "🦜🔗" in documents[0].page_content
|
assert "🦜️🔗" in documents[0].page_content
|
||||||
|
@ -1,35 +1,35 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
|
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
|
||||||
xmlns:xhtml="http://www.w3.org/1999/xhtml">
|
xmlns:xhtml="http://www.w3.org/1999/xhtml">
|
||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://python.langchain.com/en/stable/</loc>
|
<loc>https://python.langchain.com/en/stable/</loc>
|
||||||
|
|
||||||
|
|
||||||
<lastmod>2023-05-04T16:15:31.377584+00:00</lastmod>
|
<lastmod>2023-05-04T16:15:31.377584+00:00</lastmod>
|
||||||
|
|
||||||
<changefreq>weekly</changefreq>
|
<changefreq>weekly</changefreq>
|
||||||
<priority>1</priority>
|
<priority>1</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://python.langchain.com/en/latest/</loc>
|
<loc>https://python.langchain.com/en/latest/</loc>
|
||||||
|
|
||||||
|
|
||||||
<lastmod>2023-05-05T07:52:19.633878+00:00</lastmod>
|
<lastmod>2023-05-05T07:52:19.633878+00:00</lastmod>
|
||||||
|
|
||||||
<changefreq>daily</changefreq>
|
<changefreq>daily</changefreq>
|
||||||
<priority>0.9</priority>
|
<priority>0.9</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://python.langchain.com/en/harrison-docs-refactor-3-24/</loc>
|
<loc>https://python.langchain.com/en/harrison-docs-refactor-3-24/</loc>
|
||||||
|
|
||||||
|
|
||||||
<lastmod>2023-03-27T02:32:55.132916+00:00</lastmod>
|
<lastmod>2023-03-27T02:32:55.132916+00:00</lastmod>
|
||||||
|
|
||||||
<changefreq>monthly</changefreq>
|
<changefreq>monthly</changefreq>
|
||||||
<priority>0.8</priority>
|
<priority>0.8</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
</urlset>
|
</urlset>
|
Loading…
Reference in New Issue
Block a user