From e1bc623f8f3408edb07b47e614b2d1f0423c1e18 Mon Sep 17 00:00:00 2001 From: Pennlaine <123853388+PennlaineChu@users.noreply.github.com> Date: Tue, 13 Feb 2024 08:20:32 +0800 Subject: [PATCH] docs: Updated docs for sitemap loader to use correct URL (#17395) - **Description:** Updated URL for sitemap loader from "https://langchain.readthedocs.io/sitemap.xml" to "https://api.python.langchain.com/sitemap.xml" - **Issue:** Fixes #17236 --- .../document_loaders/sitemap.ipynb | 61 ++++++------------- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/docs/docs/integrations/document_loaders/sitemap.ipynb b/docs/docs/integrations/document_loaders/sitemap.ipynb index 9fe9d8e8e6e..d813f6e52ab 100644 --- a/docs/docs/integrations/document_loaders/sitemap.ipynb +++ b/docs/docs/integrations/document_loaders/sitemap.ipynb @@ -13,27 +13,16 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: nest_asyncio in /Users/tasp/Code/projects/langchain/.venv/lib/python3.10/site-packages (1.5.6)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], + "outputs": [], "source": [ "%pip install --upgrade --quiet nest_asyncio" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -54,11 +43,11 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "sitemap_loader = SitemapLoader(web_path=\"https://langchain.readthedocs.io/sitemap.xml\")\n", + 
"sitemap_loader = SitemapLoader(web_path=\"https://api.python.langchain.com/sitemap.xml\")\n", "\n", "docs = sitemap_loader.load()" ] @@ -90,7 +79,7 @@ { "data": { "text/plain": [ - "Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/stable/', 'loc': 'https://api.python.langchain.com/en/stable/', 'lastmod': '2023-10-13T18:13:26.966937+00:00', 'changefreq': 'weekly', 'priority': '1'})" + "Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/stable/', 'loc': 'https://api.python.langchain.com/en/stable/', 'lastmod': '2024-02-09T01:10:49.422114+00:00', 'changefreq': 'weekly', 'priority': '1'})" ] }, "execution_count": 6, @@ -113,20 +102,12 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Fetching pages: 100%|##########| 1/1 [00:00<00:00, 16.39it/s]\n" - ] - } - ], + "outputs": [], "source": [ "loader = SitemapLoader(\n", - " web_path=\"https://langchain.readthedocs.io/sitemap.xml\",\n", + " web_path=\"https://api.python.langchain.com/sitemap.xml\",\n", " filter_urls=[\"https://api.python.langchain.com/en/latest\"],\n", ")\n", "documents = loader.load()" @@ -134,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 8, "metadata": { "scrolled": true }, @@ -142,10 +123,10 @@ { "data": { "text/plain": [ - "Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', 
metadata={'source': 'https://api.python.langchain.com/en/latest/', 'loc': 'https://api.python.langchain.com/en/latest/', 'lastmod': '2023-10-13T18:09:58.478681+00:00', 'changefreq': 'daily', 'priority': '0.9'})" + "Document(page_content='\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLangChain Python API Reference Documentation.\\n\\n\\nYou will be automatically redirected to the new location of this page.\\n\\n', metadata={'source': 'https://api.python.langchain.com/en/latest/', 'loc': 'https://api.python.langchain.com/en/latest/', 'lastmod': '2024-02-12T05:26:10.971077+00:00', 'changefreq': 'daily', 'priority': '0.9'})" ] }, - "execution_count": 28, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -183,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -211,12 +192,12 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "loader = SitemapLoader(\n", - " \"https://langchain.readthedocs.io/sitemap.xml\",\n", + " \"https://api.python.langchain.com/sitemap.xml\",\n", " filter_urls=[\"https://api.python.langchain.com/en/latest/\"],\n", " parsing_function=remove_nav_and_header_elements,\n", ")" @@ -233,17 +214,9 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Fetching pages: 100%|##########| 3/3 [00:00<00:00, 12.46it/s]\n" - ] - } - ], + "outputs": [], "source": [ "sitemap_loader = SitemapLoader(web_path=\"example_data/sitemap.xml\", is_local=True)\n", "\n",