mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-13 16:36:06 +00:00
skip excluded sublinks before recursion (#11036)
This commit is contained in:
parent
9c5eca92e4
commit
a2f7246f0e
@ -99,6 +99,13 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
else _metadata_extractor
|
||||
)
|
||||
self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
|
||||
|
||||
if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
|
||||
raise ValueError(
|
||||
f"Base url is included in exclude_dirs. Received base_url: {url} and "
|
||||
f"exclude_dirs: {self.exclude_dirs}"
|
||||
)
|
||||
|
||||
self.timeout = timeout
|
||||
self.prevent_outside = prevent_outside if prevent_outside is not None else True
|
||||
self.link_regex = link_regex
|
||||
@ -149,6 +156,7 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
base_url=self.url,
|
||||
pattern=self.link_regex,
|
||||
prevent_outside=self.prevent_outside,
|
||||
exclude_prefixes=self.exclude_dirs,
|
||||
)
|
||||
for link in sub_links:
|
||||
# Check all unvisited links
|
||||
@ -182,10 +190,6 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
if depth >= self.max_depth:
|
||||
return []
|
||||
|
||||
- # Exclude the root and parent from a list
|
||||
- # Exclude the links that start with any of the excluded directories
|
||||
- if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
|
||||
- return []
|
||||
# Disable SSL verification because websites may have invalid SSL certificates,
|
||||
# but won't cause any security issues for us.
|
||||
close_session = session is None
|
||||
|
@ -1,5 +1,5 @@
|
||||
import re
|
||||
- from typing import List, Optional, Union
|
||||
+ from typing import List, Optional, Sequence, Union
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
|
||||
@ -42,6 +42,7 @@ def extract_sub_links(
|
||||
base_url: Optional[str] = None,
|
||||
pattern: Union[str, re.Pattern, None] = None,
|
||||
prevent_outside: bool = True,
|
||||
exclude_prefixes: Sequence[str] = (),
|
||||
) -> List[str]:
|
||||
"""Extract all links from a raw html string and convert into absolute paths.
|
||||
|
||||
@ -52,6 +53,7 @@ def extract_sub_links(
|
||||
pattern: Regex to use for extracting links from raw html.
|
||||
prevent_outside: If True, ignore external links which are not children
|
||||
of the base url.
|
||||
exclude_prefixes: Exclude any URLs that start with one of these prefixes.
|
||||
|
||||
Returns:
|
||||
List[str]: sub links
|
||||
@ -60,8 +62,10 @@ def extract_sub_links(
|
||||
all_links = find_all_links(raw_html, pattern=pattern)
|
||||
absolute_paths = set()
|
||||
for link in all_links:
|
||||
if any(link.startswith(exclude) for exclude in exclude_prefixes):
|
||||
continue
|
||||
# Some may be absolute links like https://to/path
|
||||
- if link.startswith("http"):
|
||||
+ elif link.startswith("http"):
|
||||
absolute_paths.add(link)
|
||||
# Some may have omitted the protocol like //to/path
|
||||
elif link.startswith("//"):
|
||||
|
Loading…
Reference in New Issue
Block a user