docs: improved recursive url loader docs (#22648)

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Isaac Francisco
2024-06-13 11:09:35 -07:00
committed by GitHub
parent e832bbb486
commit 034257e9bf
2 changed files with 446 additions and 108 deletions


@@ -37,7 +37,7 @@ def _metadata_extractor(
except ImportError:
logger.warning(
"The bs4 package is required for default metadata extraction. "
"Please install it with `pip install bs4`."
"Please install it with `pip install -U beautifulsoup4`."
)
return metadata
soup = BeautifulSoup(raw_html, "html.parser")
@@ -51,7 +51,7 @@ def _metadata_extractor(
class RecursiveUrlLoader(BaseLoader):
"""Load all child links from a URL page.
"""Recursively load all child links from a root URL.
**Security Note**: This loader is a crawler that will start crawling
at a given URL and then expand to crawl child links recursively.
@@ -79,8 +79,196 @@ class RecursiveUrlLoader(BaseLoader):
GET request to an endpoint on Bob's site. Both sites are hosted on the
same host, so such a request would not be prevented by default.
See https://python.langchain.com/docs/security
"""
See https://python.langchain.com/v0.2/docs/security/
Setup:
This class has no required additional dependencies. You can optionally install
``beautifulsoup4`` for richer default metadata extraction:
.. code-block:: bash
pip install -U beautifulsoup4
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import RecursiveUrlLoader
loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    # max_depth=2,
    # use_async=False,
    # extractor=None,
    # metadata_extractor=None,
    # exclude_dirs=(),
    # timeout=10,
    # check_response_status=True,
    # continue_on_failure=True,
    # prevent_outside=True,
    # base_url=None,
    # ...
)
Load:
Use ``.load()`` to synchronously load all Documents into memory, with one
Document per visited URL. Starting from the initial URL, the loader recurses
through all linked URLs up to the specified ``max_depth``.
.. code-block:: python
docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8" /><
{'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8" /><
{'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None}
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
    docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8" /><
{'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None}
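Because ``lazy_load`` yields one Document at a time, pages can be processed incrementally instead of holding the entire crawl in memory. A minimal sketch of that pattern (``index_batch`` is a hypothetical downstream function, not part of the loader API):

.. code-block:: python

    loader = RecursiveUrlLoader("https://docs.python.org/3.9/", max_depth=2)

    batch = []
    for doc in loader.lazy_load():
        batch.append(doc)
        if len(batch) == 10:
            # e.g. embed or index these 10 pages, then release them
            index_batch(batch)  # hypothetical downstream function
            batch = []
    if batch:
        index_batch(batch)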
Content parsing / extraction:
By default the loader sets the raw HTML from each link as the Document page
content. To parse this HTML into a more human/LLM-friendly format you can pass
in a custom ``extractor`` method:
.. code-block:: python
# This example uses `beautifulsoup4` and `lxml`
import re

from bs4 import BeautifulSoup

def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    extractor=bs4_extractor,
)
print(loader.load()[0].page_content[:200])
.. code-block:: python
3.9.19 Documentation
Download
Download these documents
Docs by version
Python 3.13 (in development)
Python 3.12 (stable)
Python 3.11 (security-fixes)
Python 3.10 (security-fixes)
Python 3.9 (securit
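If ``soup.text`` over the whole page is too noisy, a slightly stricter extractor can drop non-content tags before pulling out the text. A sketch along the same lines (assumes ``beautifulsoup4`` is installed; recall that returning an empty string causes a page to be ignored):

.. code-block:: python

    from bs4 import BeautifulSoup

    def text_only_extractor(html: str) -> str:
        soup = BeautifulSoup(html, "html.parser")
        # Drop tags whose contents are not useful page text
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        # Returning "" would make the loader ignore the page entirely
        return soup.get_text(separator="\n", strip=True)

    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        extractor=text_only_extractor,
    )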
Metadata extraction:
Similarly to content extraction, you can specify a metadata extraction function
to customize how Document metadata is extracted from the HTTP response.
.. code-block:: python
import aiohttp
import requests
from typing import Union

def simple_metadata_extractor(
    raw_html: str, url: str, response: Union[requests.Response, aiohttp.ClientResponse]
) -> dict:
    content_type = response.headers.get("Content-Type", "")
    return {"source": url, "content_type": content_type}

loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    metadata_extractor=simple_metadata_extractor,
)
loader.load()[0].metadata
.. code-block:: python
{'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html'}
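The ``response`` argument may be either a ``requests.Response`` (sync) or an ``aiohttp.ClientResponse`` (async), so any extra fields should be read defensively. A sketch that also records the HTTP status, whichever client produced the response (the attribute names come from those two client libraries, not from the loader itself):

.. code-block:: python

    import aiohttp
    import requests
    from typing import Union

    def rich_metadata_extractor(
        raw_html: str,
        url: str,
        response: Union[requests.Response, aiohttp.ClientResponse],
    ) -> dict:
        # requests exposes `status_code`, aiohttp exposes `status`
        status = getattr(response, "status_code", None) or getattr(response, "status", None)
        return {
            "source": url,
            "content_type": response.headers.get("Content-Type", ""),
            "status": status,
        }

    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        metadata_extractor=rich_metadata_extractor,
    )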
Filtering URLs:
You may not always want to pull every URL from a website. There are four parameters
that control which URLs are pulled recursively: ``prevent_outside``, ``base_url``,
``link_regex``, and ``exclude_dirs``. Setting ``prevent_outside=True`` prevents URLs
outside of the ``base_url`` from being pulled; note that the ``base_url`` does not
need to be the same as the URL we pass in, as shown below. We can use ``link_regex``
and ``exclude_dirs`` to be more specific about the URLs that we select. In this
example, we only pull pages from the Python docs whose URLs contain the string
"index" and that are not located in the FAQ section of the site.
.. code-block:: python
loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    prevent_outside=True,
    base_url="https://docs.python.org",
    link_regex=r'<a\s+(?:[^>]*?\s+)?href="([^"]*(?=index)[^"]*)"',
    exclude_dirs=['https://docs.python.org/3.9/faq']
)
docs = loader.load()
[doc.metadata["source"] for doc in docs]
.. code-block:: python
['https://docs.python.org/3.9/',
'https://docs.python.org/3.9/py-modindex.html',
'https://docs.python.org/3.9/genindex.html',
'https://docs.python.org/3.9/tutorial/index.html',
'https://docs.python.org/3.9/using/index.html',
'https://docs.python.org/3.9/extending/index.html',
'https://docs.python.org/3.9/installing/index.html',
'https://docs.python.org/3.9/library/index.html',
'https://docs.python.org/3.9/c-api/index.html',
'https://docs.python.org/3.9/howto/index.html',
'https://docs.python.org/3.9/distributing/index.html',
'https://docs.python.org/3.9/reference/index.html',
'https://docs.python.org/3.9/whatsnew/index.html']
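The ``link_regex`` above is applied to the raw HTML of each fetched page, and its capture group must yield the href value of an anchor tag. A pattern can be sanity-checked against a small HTML snippet before handing it to the loader, e.g.:

.. code-block:: python

    import re

    link_regex = r'<a\s+(?:[^>]*?\s+)?href="([^"]*(?=index)[^"]*)"'
    sample_html = (
        '<a class="ref" href="https://docs.python.org/3.9/library/index.html">Library</a>'
        '<a href="https://docs.python.org/3.9/glossary.html">Glossary</a>'
    )
    # Only hrefs containing "index" are captured
    re.findall(link_regex, sample_html)
    # -> ['https://docs.python.org/3.9/library/index.html']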
""" # noqa: E501
def __init__(
self,
@@ -107,12 +295,12 @@ class RecursiveUrlLoader(BaseLoader):
url: The URL to crawl.
max_depth: The max depth of the recursive loading.
use_async: Whether to use asynchronous loading.
If True, this function will not be lazy, but it will still work in the
If True, lazy_load function will not be lazy, but it will still work in the
expected way, just not lazy.
extractor: A function to extract document contents from raw html.
extractor: A function to extract document contents from raw HTML.
When extract function returns an empty string, the document is
ignored.
metadata_extractor: A function to extract metadata from args: raw html, the
ignored. Default returns the raw HTML.
metadata_extractor: A function to extract metadata from args: raw HTML, the
source url, and the requests.Response/aiohttp.ClientResponse object
(args in that order).
Default extractor will attempt to use BeautifulSoup4 to extract the
@@ -254,13 +442,6 @@ class RecursiveUrlLoader(BaseLoader):
"Async functions forbidden when not initialized with `use_async`"
)
try:
import aiohttp
except ImportError:
raise ImportError(
"The aiohttp package is required for the RecursiveUrlLoader. "
"Please install it with `pip install aiohttp`."
)
if depth >= self.max_depth:
return []