core[patch], community[patch]: link extraction continue on failure (#17200)

Bagatur, 2024-02-07 14:15:30 -08:00, committed by GitHub
commit af74301ab9 (parent 2281f00198)
3 changed files with 48 additions and 23 deletions


@@ -93,6 +93,7 @@ class RecursiveUrlLoader(BaseLoader):
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
@@ -117,6 +118,8 @@ class RecursiveUrlLoader(BaseLoader):
             link_regex: Regex for extracting sub-links from the raw html of a web page.
             check_response_status: If True, check HTTP response status and skip
                 URLs with error responses (400-599).
+            continue_on_failure: If True, continue if getting or parsing a link raises
+                an exception. Otherwise, raise the exception.
         """
         self.url = url
@@ -142,6 +145,7 @@ class RecursiveUrlLoader(BaseLoader):
         self._lock = asyncio.Lock() if self.use_async else None
         self.headers = headers
         self.check_response_status = check_response_status
+        self.continue_on_failure = continue_on_failure

     def _get_child_links_recursive(
         self, url: str, visited: Set[str], *, depth: int = 0
@@ -164,11 +168,14 @@ class RecursiveUrlLoader(BaseLoader):
             if self.check_response_status and 400 <= response.status_code <= 599:
                 raise ValueError(f"Received HTTP status {response.status_code}")
         except Exception as e:
-            logger.warning(
-                f"Unable to load from {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            return
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load from {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return
+            else:
+                raise e
         content = self.extractor(response.text)
         if content:
             yield Document(
@@ -184,6 +191,7 @@ class RecursiveUrlLoader(BaseLoader):
                 pattern=self.link_regex,
                 prevent_outside=self.prevent_outside,
                 exclude_prefixes=self.exclude_dirs,
+                continue_on_failure=self.continue_on_failure,
             )
             for link in sub_links:
                 # Check all unvisited links
@@ -237,13 +245,16 @@ class RecursiveUrlLoader(BaseLoader):
                 if self.check_response_status and 400 <= response.status <= 599:
                     raise ValueError(f"Received HTTP status {response.status}")
         except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
-            logger.warning(
-                f"Unable to load {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            if close_session:
-                await session.close()
-            return []
+            if close_session:
+                await session.close()
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return []
+            else:
+                raise e
         results = []
         content = self.extractor(text)
         if content:
@@ -261,6 +272,7 @@ class RecursiveUrlLoader(BaseLoader):
                 pattern=self.link_regex,
                 prevent_outside=self.prevent_outside,
                 exclude_prefixes=self.exclude_dirs,
+                continue_on_failure=self.continue_on_failure,
             )
             # Recursively call the function to get the children of the children
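
As a usage sketch (the URL and max_depth values below are illustrative, not part of this change): with the new continue_on_failure flag left at its default of True, the loader logs and skips pages that fail to fetch or parse instead of aborting the crawl; setting it to False makes the loader re-raise the error.

from langchain_community.document_loaders import RecursiveUrlLoader

# Illustrative values; only `continue_on_failure` is introduced by this change.
loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    max_depth=2,
    continue_on_failure=True,  # log and skip pages that fail to fetch or parse
)
docs = loader.load()
# With continue_on_failure=False, the first fetch/parse error is re-raised instead.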


@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 class KDBAI(VectorStore):
-    """`KDB.AI` vector store [https://kdb.ai](https://kdb.ai)
+    """`KDB.AI` vector store.

     To use, you should have the `kdbai_client` python package installed.
@@ -25,7 +25,7 @@ class KDBAI(VectorStore):
         distance_strategy: One option from DistanceStrategy.EUCLIDEAN_DISTANCE,
             DistanceStrategy.DOT_PRODUCT or DistanceStrategy.COSINE.
-    See the example [notebook](https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb).
+    See the example https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb.
     """

     def __init__(


@@ -1,7 +1,10 @@
+import logging
 import re
 from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse

+logger = logging.getLogger(__name__)
+
 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
 SUFFIXES_TO_IGNORE = (
     ".css",
@@ -52,6 +55,7 @@ def extract_sub_links(
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
     exclude_prefixes: Sequence[str] = (),
+    continue_on_failure: bool = False,
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.
@@ -63,25 +67,34 @@ def extract_sub_links(
         prevent_outside: If True, ignore external links which are not children
             of the base url.
         exclude_prefixes: Exclude any URLs that start with one of these prefixes.
+        continue_on_failure: If True, continue if parsing a specific link raises an
+            exception. Otherwise, raise the exception.

     Returns:
         List[str]: sub links
     """
     base_url_to_use = base_url if base_url is not None else url
     parsed_base_url = urlparse(base_url_to_use)
+    parsed_url = urlparse(url)
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
+        try:
             parsed_link = urlparse(link)
             # Some may be absolute links like https://to/path
             if parsed_link.scheme == "http" or parsed_link.scheme == "https":
                 absolute_path = link
             # Some may have omitted the protocol like //to/path
             elif link.startswith("//"):
-                absolute_path = f"{urlparse(url).scheme}:{link}"
+                absolute_path = f"{parsed_url.scheme}:{link}"
             else:
                 absolute_path = urljoin(url, parsed_link.path)
             absolute_paths.add(absolute_path)
+        except Exception as e:
+            if continue_on_failure:
+                logger.warning(f"Unable to load link {link}. Raised exception:\n\n{e}")
+                continue
+            else:
+                raise e

     results = []
     for path in absolute_paths:
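
At the utility level, a minimal sketch (the HTML string and example.com URL are made up for illustration; note that extract_sub_links defaults continue_on_failure to False, unlike the loader): an href that urlparse cannot handle, such as an unclosed IPv6 bracket, is logged and dropped while the valid links are still returned.

from langchain_core.utils.html import extract_sub_links

# Made-up HTML; urlparse raises ValueError ("Invalid IPv6 URL") on "http://[bad".
raw_html = '<a href="/docs/intro">intro</a><a href="http://[bad">broken</a>'

links = extract_sub_links(
    raw_html,
    "https://example.com/docs/",
    continue_on_failure=True,  # warn and skip the malformed link instead of raising
)
print(links)  # expected: ['https://example.com/docs/intro']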