Compare commits

...

1 Commits

Author SHA1 Message Date
William Fu-Hinthorn
d17b70eda9 Prevent Outside update 2023-12-06 19:03:23 -08:00
2 changed files with 10 additions and 4 deletions

View File

@@ -89,7 +89,7 @@ class RecursiveUrlLoader(BaseLoader):
metadata_extractor: Optional[Callable[[str, str], str]] = None,
exclude_dirs: Optional[Sequence[str]] = (),
timeout: Optional[int] = 10,
-prevent_outside: bool = True,
+prevent_outside: Union[bool, str] = True,
link_regex: Union[str, re.Pattern, None] = None,
headers: Optional[dict] = None,
check_response_status: bool = False,
@@ -113,7 +113,7 @@ class RecursiveUrlLoader(BaseLoader):
timeout: The timeout for the requests, in the unit of seconds. If None then
connection will not timeout.
prevent_outside: If True, prevent loading from urls which are not children
-of the root url.
+of the root url. If a str, treated as a parent url to check against.
link_regex: Regex for extracting sub-links from the raw html of a web page.
check_response_status: If True, check HTTP response status and skip
URLs with error responses (400-599).

View File

@@ -50,7 +50,7 @@ def extract_sub_links(
*,
base_url: Optional[str] = None,
pattern: Union[str, re.Pattern, None] = None,
-prevent_outside: bool = True,
+prevent_outside: Union[bool, str] = True,
exclude_prefixes: Sequence[str] = (),
) -> List[str]:
"""Extract all links from a raw html string and convert into absolute paths.
@@ -67,7 +67,13 @@ def extract_sub_links(
Returns:
List[str]: sub links
"""
-base_url = base_url if base_url is not None else url
+base_url = (
+prevent_outside
+if isinstance(prevent_outside, str)
+else base_url
+if base_url is not None
+else url
+)
all_links = find_all_links(raw_html, pattern=pattern)
absolute_paths = set()
for link in all_links: