docs: Updated docs for sitemap loader to use correct URL (#17395)

- **Description:** 
Updated URL for sitemap loader from
"https://langchain.readthedocs.io/sitemap.xml" to
"https://api.python.langchain.com/sitemap.xml"
  - **Issue:** Fixes #17236
This commit is contained in:
Pennlaine 2024-02-13 08:20:32 +08:00 committed by GitHub
parent bd0ad6637a
commit e1bc623f8f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -13,27 +13,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nest_asyncio in /Users/tasp/Code/projects/langchain/.venv/lib/python3.10/site-packages (1.5.6)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [ "source": [
"%pip install --upgrade --quiet nest_asyncio" "%pip install --upgrade --quiet nest_asyncio"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -54,11 +43,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"sitemap_loader = SitemapLoader(web_path=\"https://langchain.readthedocs.io/sitemap.xml\")\n", "sitemap_loader = SitemapLoader(web_path=\"https://api.python.langchain.com/sitemap.xml\")\n",
"\n", "\n",
"docs = sitemap_loader.load()" "docs = sitemap_loader.load()"
] ]
@ -90,7 +79,7 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/stable/', 'loc': 'https://api.python.langchain.com/en/stable/', 'lastmod': '2023-10-13T18:13:26.966937+00:00', 'changefreq': 'weekly', 'priority': '1'})" "Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/stable/', 'loc': 'https://api.python.langchain.com/en/stable/', 'lastmod': '2024-02-09T01:10:49.422114+00:00', 'changefreq': 'weekly', 'priority': '1'})"
] ]
}, },
"execution_count": 6, "execution_count": 6,
@ -113,20 +102,12 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching pages: 100%|##########| 1/1 [00:00<00:00, 16.39it/s]\n"
]
}
],
"source": [ "source": [
"loader = SitemapLoader(\n", "loader = SitemapLoader(\n",
" web_path=\"https://langchain.readthedocs.io/sitemap.xml\",\n", " web_path=\" https://api.python.langchain.com/sitemap.xml\",\n",
" filter_urls=[\"https://api.python.langchain.com/en/latest\"],\n", " filter_urls=[\"https://api.python.langchain.com/en/latest\"],\n",
")\n", ")\n",
"documents = loader.load()" "documents = loader.load()"
@ -134,7 +115,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 28, "execution_count": 8,
"metadata": { "metadata": {
"scrolled": true "scrolled": true
}, },
@ -142,10 +123,10 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/latest/', 'loc': 'https://api.python.langchain.com/en/latest/', 'lastmod': '2023-10-13T18:09:58.478681+00:00', 'changefreq': 'daily', 'priority': '0.9'})" "Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/latest/', 'loc': 'https://api.python.langchain.com/en/latest/', 'lastmod': '2024-02-12T05:26:10.971077+00:00', 'changefreq': 'daily', 'priority': '0.9'})"
] ]
}, },
"execution_count": 28, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -183,7 +164,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 30, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -211,12 +192,12 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 31, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"loader = SitemapLoader(\n", "loader = SitemapLoader(\n",
" \"https://langchain.readthedocs.io/sitemap.xml\",\n", " \"https://api.python.langchain.com/sitemap.xml\",\n",
" filter_urls=[\"https://api.python.langchain.com/en/latest/\"],\n", " filter_urls=[\"https://api.python.langchain.com/en/latest/\"],\n",
" parsing_function=remove_nav_and_header_elements,\n", " parsing_function=remove_nav_and_header_elements,\n",
")" ")"
@ -233,17 +214,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching pages: 100%|##########| 3/3 [00:00<00:00, 12.46it/s]\n"
]
}
],
"source": [ "source": [
"sitemap_loader = SitemapLoader(web_path=\"example_data/sitemap.xml\", is_local=True)\n", "sitemap_loader = SitemapLoader(web_path=\"example_data/sitemap.xml\", is_local=True)\n",
"\n", "\n",