core[patch], community[patch]: link extraction continue on failure (#17200)

Bagatur, 2024-02-07 14:15:30 -08:00, committed by GitHub
commit af74301ab9 (parent 2281f00198)
3 changed files with 48 additions and 23 deletions


@@ -93,6 +93,7 @@ class RecursiveUrlLoader(BaseLoader):
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
@@ -117,6 +118,8 @@ class RecursiveUrlLoader(BaseLoader):
             link_regex: Regex for extracting sub-links from the raw html of a web page.
             check_response_status: If True, check HTTP response status and skip
                 URLs with error responses (400-599).
+            continue_on_failure: If True, continue if getting or parsing a link raises
+                an exception. Otherwise, raise the exception.
         """
         self.url = url
@@ -142,6 +145,7 @@ class RecursiveUrlLoader(BaseLoader):
         self._lock = asyncio.Lock() if self.use_async else None
         self.headers = headers
         self.check_response_status = check_response_status
+        self.continue_on_failure = continue_on_failure

     def _get_child_links_recursive(
         self, url: str, visited: Set[str], *, depth: int = 0
@@ -164,11 +168,14 @@ class RecursiveUrlLoader(BaseLoader):
             if self.check_response_status and 400 <= response.status_code <= 599:
                 raise ValueError(f"Received HTTP status {response.status_code}")
         except Exception as e:
-            logger.warning(
-                f"Unable to load from {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            return
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load from {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return
+            else:
+                raise e
         content = self.extractor(response.text)
         if content:
             yield Document(
@@ -184,6 +191,7 @@ class RecursiveUrlLoader(BaseLoader):
                 pattern=self.link_regex,
                 prevent_outside=self.prevent_outside,
                 exclude_prefixes=self.exclude_dirs,
+                continue_on_failure=self.continue_on_failure,
             )
             for link in sub_links:
                 # Check all unvisited links
@@ -237,13 +245,16 @@ class RecursiveUrlLoader(BaseLoader):
                 if self.check_response_status and 400 <= response.status <= 599:
                     raise ValueError(f"Received HTTP status {response.status}")
         except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
-            logger.warning(
-                f"Unable to load {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            if close_session:
-                await session.close()
-            return []
+            if close_session:
+                await session.close()
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return []
+            else:
+                raise e
         results = []
         content = self.extractor(text)
         if content:
@@ -261,6 +272,7 @@ class RecursiveUrlLoader(BaseLoader):
                 pattern=self.link_regex,
                 prevent_outside=self.prevent_outside,
                 exclude_prefixes=self.exclude_dirs,
+                continue_on_failure=self.continue_on_failure,
             )
             # Recursively call the function to get the children of the children
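
As a usage sketch (the URL and max_depth values below are illustrative, not part of this change): with the new continue_on_failure flag left at its default of True, the loader logs and skips pages that fail to fetch or parse instead of aborting the crawl; setting it to False makes the loader re-raise the error.

from langchain_community.document_loaders import RecursiveUrlLoader

# Illustrative values; only `continue_on_failure` is introduced by this change.
loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    max_depth=2,
    continue_on_failure=True,  # log and skip pages that fail to fetch or parse
)
docs = loader.load()
# With continue_on_failure=False, the first fetch/parse error is re-raised instead.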


@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 class KDBAI(VectorStore):
-    """`KDB.AI` vector store [https://kdb.ai](https://kdb.ai)
+    """`KDB.AI` vector store.

     To use, you should have the `kdbai_client` python package installed.
@@ -25,7 +25,7 @@ class KDBAI(VectorStore):
         distance_strategy: One option from DistanceStrategy.EUCLIDEAN_DISTANCE,
             DistanceStrategy.DOT_PRODUCT or DistanceStrategy.COSINE.
-    See the example [notebook](https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb).
+    See the example https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb.
     """

     def __init__(


@@ -1,7 +1,10 @@
+import logging
 import re
 from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse

+logger = logging.getLogger(__name__)
+
 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
 SUFFIXES_TO_IGNORE = (
     ".css",
@@ -52,6 +55,7 @@ def extract_sub_links(
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
     exclude_prefixes: Sequence[str] = (),
+    continue_on_failure: bool = False,
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.
@@ -63,25 +67,34 @@ def extract_sub_links(
         prevent_outside: If True, ignore external links which are not children
             of the base url.
         exclude_prefixes: Exclude any URLs that start with one of these prefixes.
+        continue_on_failure: If True, continue if parsing a specific link raises an
+            exception. Otherwise, raise the exception.

     Returns:
         List[str]: sub links
     """
     base_url_to_use = base_url if base_url is not None else url
     parsed_base_url = urlparse(base_url_to_use)
+    parsed_url = urlparse(url)
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
+        try:
             parsed_link = urlparse(link)
             # Some may be absolute links like https://to/path
             if parsed_link.scheme == "http" or parsed_link.scheme == "https":
                 absolute_path = link
             # Some may have omitted the protocol like //to/path
             elif link.startswith("//"):
-                absolute_path = f"{urlparse(url).scheme}:{link}"
+                absolute_path = f"{parsed_url.scheme}:{link}"
             else:
                 absolute_path = urljoin(url, parsed_link.path)
             absolute_paths.add(absolute_path)
+        except Exception as e:
+            if continue_on_failure:
+                logger.warning(f"Unable to load link {link}. Raised exception:\n\n{e}")
+                continue
+            else:
+                raise e

     results = []
     for path in absolute_paths:
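
At the utility level, a minimal sketch (the HTML string and example.com URL are made up for illustration; note that extract_sub_links defaults continue_on_failure to False, unlike the loader): an href that urlparse cannot handle, such as an unclosed IPv6 bracket, is logged and dropped while the valid links are still returned.

from langchain_core.utils.html import extract_sub_links

# Made-up HTML; urlparse raises ValueError ("Invalid IPv6 URL") on "http://[bad".
raw_html = '<a href="/docs/intro">intro</a><a href="http://[bad">broken</a>'

links = extract_sub_links(
    raw_html,
    "https://example.com/docs/",
    continue_on_failure=True,  # warn and skip the malformed link instead of raising
)
print(links)  # expected: ['https://example.com/docs/intro']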