From 034257e9bf425fbbceb166222efbf0260030dea1 Mon Sep 17 00:00:00 2001 From: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Date: Thu, 13 Jun 2024 11:09:35 -0700 Subject: [PATCH] docs: improved recursive url loader docs (#22648) Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> --- .../document_loaders/recursive_url.ipynb | 343 +++++++++++++----- .../document_loaders/recursive_url_loader.py | 211 ++++++++++- 2 files changed, 446 insertions(+), 108 deletions(-) diff --git a/docs/docs/integrations/document_loaders/recursive_url.ipynb b/docs/docs/integrations/document_loaders/recursive_url.ipynb index 3573869cec5..fd86ba68326 100644 --- a/docs/docs/integrations/document_loaders/recursive_url.ipynb +++ b/docs/docs/integrations/document_loaders/recursive_url.ipynb @@ -7,140 +7,99 @@ "source": [ "# Recursive URL\n", "\n", - "We may want to process load all URLs under a root directory.\n", - "\n", - "For example, let's look at the [Python 3.9 Document](https://docs.python.org/3.9/).\n", - "\n", - "This has many interesting child pages that we may want to read in bulk.\n", - "\n", - "Of course, the `WebBaseLoader` can load a list of pages. \n", - "\n", - "But, the challenge is traversing the tree of child pages and actually assembling that list!\n", - " \n", - "We do this using the `RecursiveUrlLoader`.\n", - "\n", - "This also gives us the flexibility to exclude some children, customize the extractor, and more." + "The `RecursiveUrlLoader` lets you recursively scrape all child links from a root URL and parse them into Documents." ] }, { "cell_type": "markdown", - "id": "1be8094f", + "id": "947d29e7-3679-483d-973f-79ea3403a370", "metadata": {}, "source": [ - "# Parameters\n", - "- url: str, the target url to crawl.\n", - "- exclude_dirs: Optional[str], webpage directories to exclude.\n", - "- use_async: Optional[bool], wether to use async requests, using async requests is usually faster in large tasks. However, async will disable the lazy loading feature(the function still works, but it is not lazy). By default, it is set to False.\n", - "- extractor: Optional[Callable[[str], str]], a function to extract the text of the document from the webpage, by default it returns the page as it is. It is recommended to use tools like goose3 and beautifulsoup to extract the text. By default, it just returns the page as it is.\n", - "- max_depth: Optional[int] = None, the maximum depth to crawl. By default, it is set to 2. If you need to crawl the whole website, set it to a number that is large enough would simply do the job.\n", - "- timeout: Optional[int] = None, the timeout for each request, in the unit of seconds. By default, it is set to 10.\n", - "- prevent_outside: Optional[bool] = None, whether to prevent crawling outside the root url. By default, it is set to True." + "## Setup\n", + "\n", + "The `RecursiveUrlLoader` lives in the `langchain-community` package. There are no other required packages, though you will get richer default Document metadata if you have `beautifulsoup4` installed as well." ] }, { "cell_type": "code", "execution_count": null, - "id": "23c18539", + "id": "23359ab0-8056-4dee-8bff-c38dc079f17f", "metadata": {}, "outputs": [], "source": [ - "from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader" + "%pip install -qU langchain-community beautifulsoup4" ] }, { "cell_type": "markdown", - "id": "6384c057", + "id": "07985766-e4e9-4ea1-8a18-924fa4f294e5", "metadata": {}, "source": [ - "Let's try a simple example."
+ "## Instantiation\n", + "\n", + "Now we can instantiate our document loader object and load Documents:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "55394afe", + "execution_count": 1, + "id": "cb208dcf-9ce9-4197-bc44-b80d20aa4e50", "metadata": {}, "outputs": [], "source": [ - "from bs4 import BeautifulSoup as Soup\n", + "from langchain_community.document_loaders import RecursiveUrlLoader\n", "\n", - "url = \"https://docs.python.org/3.9/\"\n", "loader = RecursiveUrlLoader(\n", - " url=url, max_depth=2, extractor=lambda x: Soup(x, \"html.parser\").text\n", - ")\n", - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "084fb2ce", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n\\n\\n\\n\\nPython Frequently Asked Questions — Python 3.'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs[0].page_content[:50]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "13bd7e16", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'source': 'https://docs.python.org/3.9/library/index.html',\n", - " 'title': 'The Python Standard Library — Python 3.9.17 documentation',\n", - " 'language': None}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs[-1].metadata" + " \"https://docs.python.org/3.9/\",\n", + " # max_depth=2,\n", + " # use_async=False,\n", + " # extractor=None,\n", + " # metadata_extractor=None,\n", + " # exclude_dirs=(),\n", + " # timeout=10,\n", + " # check_response_status=True,\n", + " # continue_on_failure=True,\n", + " # prevent_outside=True,\n", + " # base_url=None,\n", + " # ...\n", + ")" ] }, { "cell_type": "markdown", - "id": "5866e5a6", + "id": "0fac4425-735f-487d-a12b-c8ed2a209039", "metadata": {}, "source": [ - "However, since it's hard to perform a perfect filter, you may still see some irrelevant results in the results. You can perform a filter on the returned documents by yourself, if it's needed. Most of the time, the returned results are good enough." - ] - }, - { - "cell_type": "markdown", - "id": "4ec8ecef", - "metadata": {}, - "source": [ - "Testing on LangChain docs." + "## Load\n", + "\n", + "Use ``.load()`` to synchronously load into memory all Documents, with one\n", + "Document per visited URL. Starting from the initial URL, we recurse through\n", + "all linked URLs up to the specified max_depth.\n", + "\n", + "Let's run through a basic example of how to use the `RecursiveUrlLoader` on the [Python 3.9 Documentation](https://docs.python.org/3.9/)." ] }, { "cell_type": "code", "execution_count": 2, - "id": "349b5598", + "id": "a30843c8-4a59-43dc-bf60-f26532f0f8e1", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/bagatur/.pyenv/versions/3.9.1/lib/python3.9/html/parser.py:170: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. 
To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n", " k = self.parse_starttag(i)\n" ] }, { "data": { "text/plain": [ - "8" + "{'source': 'https://docs.python.org/3.9/',\n", + " 'content_type': 'text/html',\n", + " 'title': '3.9.19 Documentation',\n", + " 'language': None}" ] }, "execution_count": 2, @@ -149,10 +108,208 @@ } ], "source": [ - "url = \"https://js.langchain.com/docs/modules/memory/integrations/\"\n", - "loader = RecursiveUrlLoader(url=url)\n", "docs = loader.load()\n", - "len(docs)" + "docs[0].metadata" ] }, + { + "cell_type": "markdown", + "id": "211856ed-6dd7-46c6-859e-11aaea9093db", + "metadata": {}, + "source": [ + "Great! The first document looks like the root page we started from. Let's look at the metadata of the next document." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2d842c03-fab8-4097-9f4f-809b2e71c0ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'source': 'https://docs.python.org/3.9/using/index.html',\n", + " 'content_type': 'text/html',\n", + " 'title': 'Python Setup and Usage — Python 3.9.19 documentation',\n", + " 'language': None}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[1].metadata" + ] + }, + { + "cell_type": "markdown", + "id": "f5714ace-7cc5-4c5c-9426-f68342880da0", + "metadata": {}, + "source": [ + "That URL looks like a child of our root page, which is great! Now let's look at the content of our documents. By default the loader sets the raw HTML of each page as the Document page content; to parse it into a more human/LLM-friendly format we can pass in a custom `extractor`:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "51dc6c67-6857-4298-9472-08b147f3a631", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.9.19 Documentation\n", + "\n", + "Download\n", + "Download these documents\n", + "Docs by version\n" + ] + } + ], + "source": [ + "import re\n", + "\n", + "from bs4 import BeautifulSoup\n", + "\n", + "\n", + "def bs4_extractor(html: str) -> str:\n", + "    soup = BeautifulSoup(html, \"lxml\")\n", + "    return re.sub(r\"\\n\\n+\", \"\\n\\n\", soup.text).strip()\n", + "\n", + "\n", + "loader = RecursiveUrlLoader(\"https://docs.python.org/3.9/\", extractor=bs4_extractor)\n", + "docs = loader.load()\n", + "print(docs[0].page_content[:200])" ] }, { "cell_type": "markdown", "id": "c8e8a826", "metadata": {}, "source": [ "This looks much nicer than the raw HTML!\n", "\n", "You can similarly pass in a `metadata_extractor` to customize how Document metadata is extracted from the HTTP response. See the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html) for more on this." ] }, { "cell_type": "markdown", "id": "1dddbc94", "metadata": {}, "source": [ "## Lazy loading\n", "\n", "If we're loading a large number of Documents and our downstream operations can be done over subsets of all loaded Documents, we can lazily load our Documents one at a time to minimize our memory footprint:" ] }, { "cell_type": "code", "execution_count": 15, "id": "7d0114fc", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/4j/2rz3865x6qg07tx43146py8h0000gn/T/ipykernel_73962/2110507528.py:6: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable.
To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n", " soup = BeautifulSoup(html, \"lxml\")\n" ] } ], "source": [ "page = []\n", "for doc in loader.lazy_load():\n", " page.append(doc)\n", " if len(page) >= 10:\n", " # do some paged operation, e.g.\n", " # index.upsert(page)\n", "\n", " page = []" ] }, { "cell_type": "markdown", "id": "f88a7c2f-35df-4c3a-b238-f91be2674b96", "metadata": {}, "source": [ "In this example we never have more than 10 Documents loaded into memory at a time." ] }, { "cell_type": "markdown", "id": "3e4d1c8f", "metadata": {}, "source": [ "## API reference\n", "\n", "These examples show just a few of the ways in which you can modify the default `RecursiveUrlLoader`; there are many more modifications that can be made to best fit your use case. The parameters `link_regex` and `exclude_dirs` can help you filter out unwanted URLs, `aload()` and `alazy_load()` can be used for asynchronous loading, and more.\n", "\n", "For detailed information on configuring and calling the `RecursiveUrlLoader`, please see the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html." ] } ], @@ -172,7 +329,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py index 83e62e1d896..62e7352e3d5 100644 --- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py +++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py @@ -37,7 +37,7 @@ def _metadata_extractor( except ImportError: logger.warning( "The bs4 package is required for default metadata extraction. " - "Please install it with `pip install bs4`." + "Please install it with `pip install -U beautifulsoup4`." ) return metadata soup = BeautifulSoup(raw_html, "html.parser") @@ -51,7 +51,7 @@ class RecursiveUrlLoader(BaseLoader): - """Load all child links from a URL page. + """Recursively load all child links from a root URL. **Security Note**: This loader is a crawler that will start crawling at a given URL and then expand to crawl child links recursively. @@ -79,8 +79,196 @@ class RecursiveUrlLoader(BaseLoader): GET request to an endpoint on Bob's site. Both sites are hosted on the same host, so such a request would not be prevented by default. - See https://python.langchain.com/docs/security - """ + See https://python.langchain.com/v0.2/docs/security/ + + Setup: + + This class has no required additional dependencies. You can optionally install + ``beautifulsoup4`` for richer default metadata extraction: + + .. code-block:: bash + + pip install -U beautifulsoup4 + + Instantiate: + .. code-block:: python + + from langchain_community.document_loaders import RecursiveUrlLoader + + loader = RecursiveUrlLoader( + "https://docs.python.org/3.9/", + # max_depth=2, + # use_async=False, + # extractor=None, + # metadata_extractor=None, + # exclude_dirs=(), + # timeout=10, + # check_response_status=True, + # continue_on_failure=True, + # prevent_outside=True, + # base_url=None, + # ...
+ ) + + Load: + Use ``.load()`` to synchronously load into memory all Documents, with one + Document per visited URL. Starting from the initial URL, we recurse through + all linked URLs up to the specified max_depth. + + .. code-block:: python + + docs = loader.load() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + + + + + < + {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None} + + Async load: + .. code-block:: python + + docs = await loader.aload() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + + + + + < + {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None} + + Lazy load: + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + # async variant: + # docs_lazy = await loader.alazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + + + + + < + {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None} + + Content parsing / extraction: + By default the loader sets the raw HTML from each link as the Document page + content. To parse this HTML into a more human/LLM-friendly format you can pass + in a custom ``extractor`` method: + + .. code-block:: python + + # This example uses `beautifulsoup4` and `lxml` + import re + from bs4 import BeautifulSoup + + def bs4_extractor(html: str) -> str: + soup = BeautifulSoup(html, "lxml") + return re.sub(r"\n\n+", "\n\n", soup.text).strip() + + loader = RecursiveUrlLoader( + "https://docs.python.org/3.9/", + extractor=bs4_extractor, + ) + print(loader.load()[0].page_content[:200]) + + + .. code-block:: python + + 3.9.19 Documentation + + Download + Download these documents + Docs by version + + Python 3.13 (in development) + Python 3.12 (stable) + Python 3.11 (security-fixes) + Python 3.10 (security-fixes) + Python 3.9 (securit + + Metadata extraction: + Similarly to content extraction, you can specify a metadata extraction function + to customize how Document metadata is extracted from the HTTP response. + + .. code-block:: python + + import aiohttp + import requests + from typing import Union + + def simple_metadata_extractor( + raw_html: str, url: str, response: Union[requests.Response, aiohttp.ClientResponse] + ) -> dict: + content_type = getattr(response, "headers").get("Content-Type", "") + return {"source": url, "content_type": content_type} + + loader = RecursiveUrlLoader( + "https://docs.python.org/3.9/", + metadata_extractor=simple_metadata_extractor, + ) + loader.load()[0].metadata + + .. code-block:: python + + {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html'} + + Filtering URLs: + You may not always want to pull every URL from a website. There are four parameters + that allow us to control what URLs we pull recursively. First, we can set the + ``prevent_outside`` parameter to prevent URLs outside of the ``base_url`` from + being pulled. Note that the ``base_url`` does not need to be the same as the URL we + pass in, as shown below. We can also use ``link_regex`` and ``exclude_dirs`` to be + more specific with the URLs that we select. In this example, we only pull websites + from the python docs, which contain the string "index" somewhere and are not + located in the FAQ section of the website. + + .. 
code-block:: python + + loader = RecursiveUrlLoader( + "https://docs.python.org/3.9/", + prevent_outside=True, + base_url="https://docs.python.org", + link_regex=r'<a\s+(?:[^>]*?\s+)?href="([^"]*(?=index)[^"]*)"', + exclude_dirs=['https://docs.python.org/3.9/faq'] + ) + docs = loader.load() + print([doc.metadata["source"] for doc in docs]) + + .. code-block:: python + + ['https://docs.python.org/3.9/', + 'https://docs.python.org/3.9/py-modindex.html', + 'https://docs.python.org/3.9/genindex.html', + 'https://docs.python.org/3.9/tutorial/index.html', + 'https://docs.python.org/3.9/using/index.html', + 'https://docs.python.org/3.9/extending/index.html', + 'https://docs.python.org/3.9/installing/index.html', + 'https://docs.python.org/3.9/library/index.html', + 'https://docs.python.org/3.9/c-api/index.html', + 'https://docs.python.org/3.9/howto/index.html', + 'https://docs.python.org/3.9/distributing/index.html', + 'https://docs.python.org/3.9/reference/index.html', + 'https://docs.python.org/3.9/whatsnew/index.html'] + + """ # noqa: E501 def __init__( self, @@ -107,12 +295,12 @@ class RecursiveUrlLoader(BaseLoader): url: The URL to crawl. max_depth: The max depth of the recursive loading. use_async: Whether to use asynchronous loading. - If True, this function will not be lazy, but it will still work in the + If True, the lazy_load function will not be lazy, but it will still work in the expected way, just not lazy. - extractor: A function to extract document contents from raw html. + extractor: A function to extract document contents from raw HTML. When extract function returns an empty string, the document is - ignored. - metadata_extractor: A function to extract metadata from args: raw html, the + ignored. Default returns the raw HTML. + metadata_extractor: A function to extract metadata from args: raw HTML, the source url, and the requests.Response/aiohttp.ClientResponse object (args in that order). Default extractor will attempt to use BeautifulSoup4 to extract the @@ -254,13 +442,6 @@ class RecursiveUrlLoader(BaseLoader): "Async functions forbidden when not initialized with `use_async`" ) - try: - import aiohttp - except ImportError: - raise ImportError( - "The aiohttp package is required for the RecursiveUrlLoader. " - "Please install it with `pip install aiohttp`." - ) if depth >= self.max_depth: return []
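
The docstring added by this patch mentions ``aload()`` and ``alazy_load()`` for asynchronous loading but only sketches them in comments. Here is a minimal, self-contained sketch of that async path, assuming ``langchain-community`` is installed; the ``main`` wrapper, the ``max_depth=2`` choice, and the printed fields are illustrative only, not part of the patch.

.. code-block:: python

    # Illustrative sketch of async loading with RecursiveUrlLoader (not from the patch).
    import asyncio

    from langchain_community.document_loaders import RecursiveUrlLoader


    async def main() -> None:
        loader = RecursiveUrlLoader("https://docs.python.org/3.9/", max_depth=2)

        # aload() gathers a Document for every visited URL into memory at once.
        docs = await loader.aload()
        print(len(docs), docs[0].metadata["source"])

        # alazy_load() is an async generator that yields Documents one at a time,
        # which keeps the memory footprint small for large crawls.
        async for doc in loader.alazy_load():
            print(doc.metadata["source"])
            break


    asyncio.run(main())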