core[patch], community[patch]: link extraction continue on failure (#17200)
Commit: af74301ab9 (parent: 2281f00198)
@@ -93,6 +93,7 @@ class RecursiveUrlLoader(BaseLoader):
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
 
@@ -117,6 +118,8 @@ class RecursiveUrlLoader(BaseLoader):
             link_regex: Regex for extracting sub-links from the raw html of a web page.
             check_response_status: If True, check HTTP response status and skip
                 URLs with error responses (400-599).
+            continue_on_failure: If True, continue if getting or parsing a link raises
+                an exception. Otherwise, raise the exception.
         """
 
         self.url = url
@@ -142,6 +145,7 @@ class RecursiveUrlLoader(BaseLoader):
         self._lock = asyncio.Lock() if self.use_async else None
         self.headers = headers
         self.check_response_status = check_response_status
+        self.continue_on_failure = continue_on_failure
 
     def _get_child_links_recursive(
         self, url: str, visited: Set[str], *, depth: int = 0
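For orientation, a minimal usage sketch of the new flag (not part of the diff; the import path and crawl URL are illustrative): with continue_on_failure=True, the default, pages that fail to fetch or parse are logged and skipped rather than aborting the crawl.

    # Hypothetical usage, not from this commit.
    from langchain_community.document_loaders import RecursiveUrlLoader

    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        max_depth=2,
        continue_on_failure=True,  # log and skip pages that fail to fetch or parse
    )
    docs = loader.load()
    print(len(docs))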
@@ -164,11 +168,14 @@ class RecursiveUrlLoader(BaseLoader):
             if self.check_response_status and 400 <= response.status_code <= 599:
                 raise ValueError(f"Received HTTP status {response.status_code}")
         except Exception as e:
-            logger.warning(
-                f"Unable to load from {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            return
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load from {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return
+            else:
+                raise e
         content = self.extractor(response.text)
         if content:
             yield Document(
@@ -184,6 +191,7 @@ class RecursiveUrlLoader(BaseLoader):
             pattern=self.link_regex,
             prevent_outside=self.prevent_outside,
             exclude_prefixes=self.exclude_dirs,
+            continue_on_failure=self.continue_on_failure,
         )
         for link in sub_links:
             # Check all unvisited links
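A companion sketch (again illustrative, not from this commit) of the strict mode: combining check_response_status=True with continue_on_failure=False means the ValueError raised above for a 400-599 response propagates out of load() instead of being logged.

    # Hypothetical usage, not from this commit: an HTTP 400-599 during the crawl
    # raises ValueError, and continue_on_failure=False lets it propagate.
    from langchain_community.document_loaders import RecursiveUrlLoader

    strict_loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        max_depth=2,
        check_response_status=True,
        continue_on_failure=False,
    )
    try:
        docs = strict_loader.load()
    except ValueError as err:
        print(f"Crawl aborted: {err}")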
@@ -237,13 +245,16 @@ class RecursiveUrlLoader(BaseLoader):
                 if self.check_response_status and 400 <= response.status <= 599:
                     raise ValueError(f"Received HTTP status {response.status}")
         except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
-            logger.warning(
-                f"Unable to load {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            if close_session:
-                await session.close()
-            return []
+            if close_session:
+                await session.close()
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return []
+            else:
+                raise e
         results = []
         content = self.extractor(text)
         if content:
@@ -261,6 +272,7 @@ class RecursiveUrlLoader(BaseLoader):
                 pattern=self.link_regex,
                 prevent_outside=self.prevent_outside,
                 exclude_prefixes=self.exclude_dirs,
+                continue_on_failure=self.continue_on_failure,
             )
 
             # Recursively call the function to get the children of the children
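The async crawler gains the same behaviour. A sketch of driving it, assuming use_async=True routes load() through the aiohttp-based method patched above (names and URL illustrative):

    # Hypothetical usage, not from this commit.
    from langchain_community.document_loaders import RecursiveUrlLoader

    async_loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        max_depth=2,
        use_async=True,
        continue_on_failure=True,  # failed pages are logged and contribute no documents
    )
    docs = async_loader.load()
    print(len(docs))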
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 
 
 class KDBAI(VectorStore):
-    """`KDB.AI` vector store [https://kdb.ai](https://kdb.ai)
+    """`KDB.AI` vector store.
 
     To use, you should have the `kdbai_client` python package installed.
 
@@ -25,7 +25,7 @@ class KDBAI(VectorStore):
         distance_strategy: One option from DistanceStrategy.EUCLIDEAN_DISTANCE,
             DistanceStrategy.DOT_PRODUCT or DistanceStrategy.COSINE.
 
-    See the example [notebook](https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb).
+    See the example https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb.
     """
 
     def __init__(
@@ -1,7 +1,10 @@
+import logging
 import re
 from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse
 
+logger = logging.getLogger(__name__)
+
 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
 SUFFIXES_TO_IGNORE = (
     ".css",
@@ -52,6 +55,7 @@ def extract_sub_links(
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
     exclude_prefixes: Sequence[str] = (),
+    continue_on_failure: bool = False,
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.
 
@@ -63,25 +67,34 @@ def extract_sub_links(
         prevent_outside: If True, ignore external links which are not children
             of the base url.
         exclude_prefixes: Exclude any URLs that start with one of these prefixes.
-
+        continue_on_failure: If True, continue if parsing a specific link raises an
+            exception. Otherwise, raise the exception.
     Returns:
         List[str]: sub links
     """
     base_url_to_use = base_url if base_url is not None else url
     parsed_base_url = urlparse(base_url_to_use)
+    parsed_url = urlparse(url)
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
-        parsed_link = urlparse(link)
-        # Some may be absolute links like https://to/path
-        if parsed_link.scheme == "http" or parsed_link.scheme == "https":
-            absolute_path = link
-        # Some may have omitted the protocol like //to/path
-        elif link.startswith("//"):
-            absolute_path = f"{urlparse(url).scheme}:{link}"
-        else:
-            absolute_path = urljoin(url, parsed_link.path)
-        absolute_paths.add(absolute_path)
+        try:
+            parsed_link = urlparse(link)
+            # Some may be absolute links like https://to/path
+            if parsed_link.scheme == "http" or parsed_link.scheme == "https":
+                absolute_path = link
+            # Some may have omitted the protocol like //to/path
+            elif link.startswith("//"):
+                absolute_path = f"{parsed_url.scheme}:{link}"
+            else:
+                absolute_path = urljoin(url, parsed_link.path)
+            absolute_paths.add(absolute_path)
+        except Exception as e:
+            if continue_on_failure:
+                logger.warning(f"Unable to load link {link}. Raised exception:\n\n{e}")
+                continue
+            else:
+                raise e
 
     results = []
     for path in absolute_paths:
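Finally, a sketch of calling the core helper directly with the new keyword (illustrative values; assumes the helper is importable from langchain_core.utils.html): a link that fails to parse is warned about and skipped instead of aborting extraction.

    # Hypothetical usage, not from this commit.
    from langchain_core.utils.html import extract_sub_links

    raw_html = '<a href="/docs/intro">intro</a> <a href="https://example.com/docs/api">api</a>'
    links = extract_sub_links(
        raw_html,
        "https://example.com/docs/",
        prevent_outside=True,
        continue_on_failure=True,  # warn and skip links that fail to parse
    )
    print(links)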