Repository mirror: https://github.com/hwchase17/langchain.git
fix recursive loader (#10752)
Maintain the same base URL throughout the recursion, yield the initial page, and fix recursion depth tracking.
Commit: 96a9c27116 (parent: 276125a33b)
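For orientation, a minimal usage sketch of the loader after this change (not part of the commit; the URL is the one used in the tests below, and the comments restate the new depth semantics, so treat this as a sketch rather than the canonical example):

from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

# With the fix, max_depth=2 means: yield the root page (depth 0) plus the pages
# linked directly from it (depth 1); recursion stops once depth >= max_depth.
loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    max_depth=2,
    extractor=lambda html: html,  # default behaviour: keep the raw HTML as page_content
)
docs = loader.load()
print(len(docs), docs[0].metadata["source"])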
libs/langchain/langchain/document_loaders/recursive_url_loader.py (path inferred from the import in the tests below). Post-commit contents of the changed hunks; the previous _get_sub_links and _gen_metadata helpers are removed in favor of langchain.utils.html.extract_sub_links and a module-level _metadata_extractor.

@@ -1,120 +1,43 @@
from __future__ import annotations

import asyncio
import logging
import re
from typing import (
    TYPE_CHECKING,
    Callable,
    Iterator,
    List,
    Optional,
    Sequence,
    Set,
    Union,
)

import requests

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.utils.html import extract_sub_links

if TYPE_CHECKING:
    import aiohttp

logger = logging.getLogger(__name__)


def _metadata_extractor(raw_html: str, url: str) -> dict:
    """Extract metadata from raw html using BeautifulSoup."""
    metadata = {"source": url}

    try:
        from bs4 import BeautifulSoup
    except ImportError:
        logger.warning(
            "The bs4 package is required for default metadata extraction. "
            "Please install it with `pip install bs4`."
        )
        return metadata
    soup = BeautifulSoup(raw_html, "html.parser")
    if title := soup.find("title"):
        metadata["title"] = title.get_text()

@@ -124,64 +47,113 @@ class RecursiveUrlLoader(BaseLoader):
        metadata["language"] = html.get("lang", None)
    return metadata


class RecursiveUrlLoader(BaseLoader):
    """Load all child links from a URL page."""

    def __init__(
        self,
        url: str,
        max_depth: Optional[int] = 2,
        use_async: Optional[bool] = None,
        extractor: Optional[Callable[[str], str]] = None,
        metadata_extractor: Optional[Callable[[str, str], str]] = None,
        exclude_dirs: Optional[Sequence[str]] = (),
        timeout: Optional[int] = 10,
        prevent_outside: Optional[bool] = True,
        link_regex: Union[str, re.Pattern, None] = None,
        headers: Optional[dict] = None,
    ) -> None:
        """Initialize with URL to crawl and any subdirectories to exclude.
        Args:
            url: The URL to crawl.
            max_depth: The max depth of the recursive loading.
            use_async: Whether to use asynchronous loading.
                If True, this function will not be lazy, but it will still work in the
                expected way, just not lazy.
            extractor: A function to extract document contents from raw html.
                When extract function returns an empty string, the document is
                ignored.
            metadata_extractor: A function to extract metadata from raw html and the
                source url (args in that order). Default extractor will attempt
                to use BeautifulSoup4 to extract the title, description and language
                of the page.
            exclude_dirs: A list of subdirectories to exclude.
            timeout: The timeout for the requests, in the unit of seconds. If None then
                connection will not timeout.
            prevent_outside: If True, prevent loading from urls which are not children
                of the root url.
            link_regex: Regex for extracting sub-links from the raw html of a web page.
        """

        self.url = url
        self.max_depth = max_depth if max_depth is not None else 2
        self.use_async = use_async if use_async is not None else False
        self.extractor = extractor if extractor is not None else lambda x: x
        self.metadata_extractor = (
            metadata_extractor
            if metadata_extractor is not None
            else _metadata_extractor
        )
        self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
        self.timeout = timeout
        self.prevent_outside = prevent_outside if prevent_outside is not None else True
        self.link_regex = link_regex
        self._lock = asyncio.Lock() if self.use_async else None
        self.headers = headers

    def _get_child_links_recursive(
        self, url: str, visited: Set[str], *, depth: int = 0
    ) -> Iterator[Document]:
        """Recursively get all child links starting with the path of the input URL.

        Args:
            url: The URL to crawl.
            visited: A set of visited URLs.
            depth: Current depth of recursion. Stop when depth >= max_depth.
        """
        if depth >= self.max_depth:
            return

        # Exclude the links that start with any of the excluded directories
        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
            return

        # Get all links that can be accessed from the current URL
        try:
            response = requests.get(url, timeout=self.timeout, headers=self.headers)
        except Exception:
            logger.warning(f"Unable to load from {url}")
            return
        content = self.extractor(response.text)
        if content:
            yield Document(
                page_content=content,
                metadata=self.metadata_extractor(response.text, url),
            )
        visited.add(url)

        # Store the visited links and recursively visit the children
        sub_links = extract_sub_links(
            response.text,
            self.url,
            pattern=self.link_regex,
            prevent_outside=self.prevent_outside,
        )
        for link in sub_links:
            # Check all unvisited links
            if link not in visited:
                yield from self._get_child_links_recursive(
                    link, visited, depth=depth + 1
                )

    async def _async_get_child_links_recursive(
        self,
        url: str,
        visited: Set[str],
        *,
        session: Optional[aiohttp.ClientSession] = None,
        depth: int = 0,
    ) -> List[Document]:
        """Recursively get all child links starting with the path of the input URL.

@@ -193,117 +165,87 @@ class RecursiveUrlLoader(BaseLoader):
        try:
            import aiohttp
        except ImportError:
            raise ImportError(
                "The aiohttp package is required for the RecursiveUrlLoader. "
                "Please install it with `pip install aiohttp`."
            )
        if depth >= self.max_depth:
            return []

        # Exclude the links that start with any of the excluded directories
        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
            return []
        # Disable SSL verification because websites may have invalid SSL certificates,
        # but won't cause any security issues for us.
        close_session = session is None
        session = session or aiohttp.ClientSession(
            connector=aiohttp.TCPConnector(ssl=False),
            timeout=aiohttp.ClientTimeout(total=self.timeout),
            headers=self.headers,
        )
        try:
            async with session.get(url) as response:
                text = await response.text()
            async with self._lock:  # type: ignore
                visited.add(url)
        except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
            logger.warning(
                f"Unable to load {url}. Received error {e} of type "
                f"{e.__class__.__name__}"
            )
            return []
        results = []
        content = self.extractor(text)
        if content:
            results.append(
                Document(
                    page_content=content,
                    metadata=self.metadata_extractor(text, url),
                )
            )
        if depth < self.max_depth - 1:
            sub_links = extract_sub_links(
                text,
                self.url,
                pattern=self.link_regex,
                prevent_outside=self.prevent_outside,
            )

            # Recursively call the function to get the children of the children
            sub_tasks = []
            async with self._lock:  # type: ignore
                to_visit = set(sub_links).difference(visited)
                for link in to_visit:
                    sub_tasks.append(
                        self._async_get_child_links_recursive(
                            link, visited, session=session, depth=depth + 1
                        )
                    )
            next_results = await asyncio.gather(*sub_tasks)
            for sub_result in next_results:
                if isinstance(sub_result, Exception) or sub_result is None:
                    # We don't want to stop the whole process, so just ignore it
                    # Not standard html format or invalid url or 404 may cause this.
                    continue
                # locking not fully working, temporary hack to ensure deduplication
                results += [r for r in sub_result if r not in results]
        if close_session:
            await session.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load web pages.
        When use_async is True, this function will not be lazy,
        but it will still work in the expected way, just not lazy."""
        visited: Set[str] = set()
        if self.use_async:
            results = asyncio.run(
                self._async_get_child_links_recursive(self.url, visited)
            )
            return iter(results or [])
        else:
            return self._get_child_links_recursive(self.url, visited)

    def load(self) -> List[Document]:
        """Load web pages."""
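As a usage note, the extractor hook accepts any callable from raw HTML to text. A hedged sketch follows (not part of the commit; assumes the optional bs4 package is installed, and text_extractor plus the headers value are illustrative names):

from bs4 import BeautifulSoup
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

def text_extractor(raw_html: str) -> str:
    # Return plain text for page_content; returning "" makes the loader skip the page.
    return BeautifulSoup(raw_html, "html.parser").get_text()

loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    max_depth=2,
    extractor=text_extractor,
    headers={"User-Agent": "recursive-url-loader-example"},  # illustrative headers
)
docs = loader.load()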
libs/langchain/langchain/utils/html.py (new file, 69 lines)

@@ -0,0 +1,69 @@
import re
from typing import List, Union
from urllib.parse import urljoin, urlparse

PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
SUFFIXES_TO_IGNORE = (
    ".css",
    ".js",
    ".ico",
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".svg",
    ".csv",
    ".bz2",
    ".zip",
    ".epub",
)
SUFFIXES_TO_IGNORE_REGEX = (
    "(?!" + "|".join([re.escape(s) + "[\#'\"]" for s in SUFFIXES_TO_IGNORE]) + ")"
)
PREFIXES_TO_IGNORE_REGEX = (
    "(?!" + "|".join([re.escape(s) for s in PREFIXES_TO_IGNORE]) + ")"
)
DEFAULT_LINK_REGEX = (
    f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)[\#'\"]"
)


def find_all_links(
    raw_html: str, *, pattern: Union[str, re.Pattern, None] = None
) -> List[str]:
    pattern = pattern or DEFAULT_LINK_REGEX
    return list(set(re.findall(pattern, raw_html)))


def extract_sub_links(
    raw_html: str,
    base_url: str,
    *,
    pattern: Union[str, re.Pattern, None] = None,
    prevent_outside: bool = True,
) -> List[str]:
    """Extract all links from a raw html string and convert into absolute paths.

    Args:
        raw_html: original html
        base_url: the base url of the html
        pattern: Regex to use for extracting links from raw html.
        prevent_outside: If True, ignore external links which are not children
            of the base url.

    Returns:
        List[str]: sub links
    """
    all_links = find_all_links(raw_html, pattern=pattern)
    absolute_paths = set()
    for link in all_links:
        # Some may be absolute links like https://to/path
        if link.startswith("http"):
            if not prevent_outside or link.startswith(base_url):
                absolute_paths.add(link)
        # Some may have omitted the protocol like //to/path
        elif link.startswith("//"):
            absolute_paths.add(f"{urlparse(base_url).scheme}:{link}")
        else:
            absolute_paths.add(urljoin(base_url, link))
    return list(absolute_paths)
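A quick sketch of the new helpers in use (not part of the commit; the HTML snippet and example.com base URL are illustrative, and the outputs are sorted because the helpers de-duplicate through a set):

from langchain.utils.html import extract_sub_links, find_all_links

html = '<a href="/docs/intro">intro</a><a href="https://other.site/x">external</a>'
print(sorted(find_all_links(html)))
# ['/docs/intro', 'https://other.site/x']
print(sorted(extract_sub_links(html, "https://example.com")))
# ['https://example.com/docs/intro']  -- the outside link is dropped by default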
Tests for RecursiveUrlLoader (integration test module; file path not shown in this capture). Post-commit contents; the expected document counts change now that the initial page is yielded and depth is counted from the root.

@@ -1,30 +1,61 @@
import pytest as pytest

from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader


@pytest.mark.asyncio
def test_async_recursive_url_loader() -> None:
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url,
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
    )
    docs = loader.load()
    assert len(docs) == 1024
    assert docs[0].page_content == "placeholder"


@pytest.mark.asyncio
def test_async_recursive_url_loader_deterministic() -> None:
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url,
        use_async=True,
        max_depth=3,
        timeout=None,
    )
    docs = sorted(loader.load(), key=lambda d: d.metadata["source"])
    docs_2 = sorted(loader.load(), key=lambda d: d.metadata["source"])
    assert docs == docs_2


def test_sync_recursive_url_loader() -> None:
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
    )
    docs = loader.load()
    assert len(docs) == 27
    assert docs[0].page_content == "placeholder"


@pytest.mark.asyncio
def test_sync_async_equivalent() -> None:
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
    async_loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
    docs = sorted(loader.load(), key=lambda d: d.metadata["source"])
    async_docs = sorted(async_loader.load(), key=lambda d: d.metadata["source"])
    assert docs == async_docs


def test_loading_invalid_url() -> None:
    url = "https://this.url.is.invalid/this/is/a/test"
    loader = RecursiveUrlLoader(
        url, max_depth=1, extractor=lambda _: "placeholder", use_async=False
    )
    docs = loader.load()
    assert len(docs) == 0
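For reference, a hedged reading of the depth semantics these expectations rely on (not part of the commit; live-site document counts can drift, so only the max_depth=1 case is sketched):

from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

# max_depth=1: only the root page is fetched, so a reachable root yields exactly one
# document with the default extractor, and an unreachable root yields zero (as in
# test_loading_invalid_url above).
loader = RecursiveUrlLoader("https://docs.python.org/3.9/", max_depth=1)
assert len(loader.load()) == 1  # assuming the site is reachable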
109
libs/langchain/tests/unit_tests/utils/test_html.py
Normal file
109
libs/langchain/tests/unit_tests/utils/test_html.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
from langchain.utils.html import (
|
||||||
|
PREFIXES_TO_IGNORE,
|
||||||
|
SUFFIXES_TO_IGNORE,
|
||||||
|
extract_sub_links,
|
||||||
|
find_all_links,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_find_all_links_none() -> None:
|
||||||
|
html = "<span>Hello world</span>"
|
||||||
|
actual = find_all_links(html)
|
||||||
|
assert actual == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_find_all_links_single() -> None:
|
||||||
|
htmls = [
|
||||||
|
"href='foobar.com'",
|
||||||
|
'href="foobar.com"',
|
||||||
|
'<div><a class="blah" href="foobar.com">hullo</a></div>',
|
||||||
|
]
|
||||||
|
actual = [find_all_links(html) for html in htmls]
|
||||||
|
assert actual == [["foobar.com"]] * 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_find_all_links_multiple() -> None:
|
||||||
|
html = (
|
||||||
|
'<div><a class="blah" href="https://foobar.com">hullo</a></div>'
|
||||||
|
'<div><a class="bleh" href="/baz/cool">buhbye</a></div>'
|
||||||
|
)
|
||||||
|
actual = find_all_links(html)
|
||||||
|
assert sorted(actual) == [
|
||||||
|
"/baz/cool",
|
||||||
|
"https://foobar.com",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_find_all_links_ignore_suffix() -> None:
|
||||||
|
html = 'href="foobar{suffix}"'
|
||||||
|
for suffix in SUFFIXES_TO_IGNORE:
|
||||||
|
actual = find_all_links(html.format(suffix=suffix))
|
||||||
|
assert actual == []
|
||||||
|
|
||||||
|
# Don't ignore if pattern doesn't occur at end of link.
|
||||||
|
html = 'href="foobar{suffix}more"'
|
||||||
|
for suffix in SUFFIXES_TO_IGNORE:
|
||||||
|
actual = find_all_links(html.format(suffix=suffix))
|
||||||
|
assert actual == [f"foobar{suffix}more"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_find_all_links_ignore_prefix() -> None:
|
||||||
|
html = 'href="{prefix}foobar"'
|
||||||
|
for prefix in PREFIXES_TO_IGNORE:
|
||||||
|
actual = find_all_links(html.format(prefix=prefix))
|
||||||
|
assert actual == []
|
||||||
|
|
||||||
|
# Don't ignore if pattern doesn't occur at beginning of link.
|
||||||
|
html = 'href="foobar{prefix}more"'
|
||||||
|
for prefix in PREFIXES_TO_IGNORE:
|
||||||
|
# Pound signs are split on when not prefixes.
|
||||||
|
if prefix == "#":
|
||||||
|
continue
|
||||||
|
actual = find_all_links(html.format(prefix=prefix))
|
||||||
|
assert actual == [f"foobar{prefix}more"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_find_all_links_drop_fragment() -> None:
|
||||||
|
html = 'href="foobar.com/woah#section_one"'
|
||||||
|
actual = find_all_links(html)
|
||||||
|
assert actual == ["foobar.com/woah"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_sub_links() -> None:
|
||||||
|
html = (
|
||||||
|
'<a href="https://foobar.com">one</a>'
|
||||||
|
'<a href="http://baz.net">two</a>'
|
||||||
|
'<a href="//foobar.com/hello">three</a>'
|
||||||
|
'<a href="/how/are/you/doing">four</a>'
|
||||||
|
)
|
||||||
|
expected = sorted(
|
||||||
|
[
|
||||||
|
"https://foobar.com",
|
||||||
|
"https://foobar.com/hello",
|
||||||
|
"https://foobar.com/how/are/you/doing",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
actual = sorted(extract_sub_links(html, "https://foobar.com"))
|
||||||
|
assert actual == expected
|
||||||
|
|
||||||
|
actual = sorted(extract_sub_links(html, "https://foobar.com/hello"))
|
||||||
|
expected = sorted(
|
||||||
|
[
|
||||||
|
"https://foobar.com/hello",
|
||||||
|
"https://foobar.com/how/are/you/doing",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
assert actual == expected
|
||||||
|
|
||||||
|
actual = sorted(
|
||||||
|
extract_sub_links(html, "https://foobar.com/hello", prevent_outside=False)
|
||||||
|
)
|
||||||
|
expected = sorted(
|
||||||
|
[
|
||||||
|
"https://foobar.com",
|
||||||
|
"http://baz.net",
|
||||||
|
"https://foobar.com/hello",
|
||||||
|
"https://foobar.com/how/are/you/doing",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
assert actual == expected
|
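Finally, a hedged sketch of passing a custom pattern to the helpers, mirroring the single capture group used by DEFAULT_LINK_REGEX (not part of the commit; the pattern, HTML, and base URL are illustrative):

import re

from langchain.utils.html import extract_sub_links

html = '<a href="/guide/a.html">a</a><a href="/api/b.html">b</a>'
guide_only = re.compile(r"href=[\"'](/guide/[^\"'#]*)[\"'#]")  # keep only /guide/ links
print(extract_sub_links(html, "https://example.com", pattern=guide_only))
# ['https://example.com/guide/a.html']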