mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 18:08:36 +00:00
core[patch]: Further restrict recursive URL loader (#15559)
Includes code from this PR, with additional fixes: https://github.com/langchain-ai/langchain/compare/HEAD...m0kr4n3:security/fix_ssrf
Unit tests cover the new test cases.
This commit is contained in:
parent
817b84de9e
commit
bf0b3cc0b5
@ -67,23 +67,37 @@ def extract_sub_links(
|
|||||||
Returns:
|
Returns:
|
||||||
List[str]: sub links
|
List[str]: sub links
|
||||||
"""
|
"""
|
||||||
base_url = base_url if base_url is not None else url
|
base_url_to_use = base_url if base_url is not None else url
|
||||||
|
parsed_base_url = urlparse(base_url_to_use)
|
||||||
all_links = find_all_links(raw_html, pattern=pattern)
|
all_links = find_all_links(raw_html, pattern=pattern)
|
||||||
absolute_paths = set()
|
absolute_paths = set()
|
||||||
for link in all_links:
|
for link in all_links:
|
||||||
|
parsed_link = urlparse(link)
|
||||||
# Some may be absolute links like https://to/path
|
# Some may be absolute links like https://to/path
|
||||||
if link.startswith("http"):
|
if parsed_link.scheme == "http" or parsed_link.scheme == "https":
|
||||||
absolute_paths.add(link)
|
absolute_path = link
|
||||||
# Some may have omitted the protocol like //to/path
|
# Some may have omitted the protocol like //to/path
|
||||||
elif link.startswith("//"):
|
elif link.startswith("//"):
|
||||||
absolute_paths.add(f"{urlparse(url).scheme}:{link}")
|
absolute_path = f"{urlparse(url).scheme}:{link}"
|
||||||
else:
|
else:
|
||||||
absolute_paths.add(urljoin(url, link))
|
absolute_path = urljoin(url, parsed_link.path)
|
||||||
res = []
|
absolute_paths.add(absolute_path)
|
||||||
|
|
||||||
|
results = []
|
||||||
for path in absolute_paths:
|
for path in absolute_paths:
|
||||||
if any(path.startswith(exclude) for exclude in exclude_prefixes):
|
if any(path.startswith(exclude_prefix) for exclude_prefix in exclude_prefixes):
|
||||||
continue
|
continue
|
||||||
if prevent_outside and not path.startswith(base_url):
|
|
||||||
|
if prevent_outside:
|
||||||
|
parsed_path = urlparse(path)
|
||||||
|
|
||||||
|
if parsed_base_url.netloc != parsed_path.netloc:
|
||||||
continue
|
continue
|
||||||
res.append(path)
|
|
||||||
return res
|
# Will take care of verifying rest of path after netloc
|
||||||
|
# if it's more specific
|
||||||
|
if not path.startswith(base_url_to_use):
|
||||||
|
continue
|
||||||
|
|
||||||
|
results.append(path)
|
||||||
|
return results
|
||||||
|
@ -156,3 +156,30 @@ def test_extract_sub_links_exclude() -> None:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
assert actual == expected
|
assert actual == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_prevent_outside() -> None:
|
||||||
|
"""Test that prevent outside compares against full base URL."""
|
||||||
|
html = (
|
||||||
|
'<a href="https://foobar.comic.com">BAD</a>'
|
||||||
|
'<a href="https://foobar.comic:9999">BAD</a>'
|
||||||
|
'<a href="https://foobar.com:9999">BAD</a>'
|
||||||
|
'<a href="http://foobar.com:9999/">BAD</a>'
|
||||||
|
'<a href="https://foobar.com/OK">OK</a>'
|
||||||
|
'<a href="http://foobar.com/BAD">BAD</a>' # Change in scheme is not OK here
|
||||||
|
)
|
||||||
|
|
||||||
|
expected = sorted(
|
||||||
|
[
|
||||||
|
"https://foobar.com/OK",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
actual = sorted(
|
||||||
|
extract_sub_links(
|
||||||
|
html,
|
||||||
|
"https://foobar.com/hello/bill.html",
|
||||||
|
base_url="https://foobar.com",
|
||||||
|
prevent_outside=True,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assert actual == expected
|
||||||
|
Loading…
Reference in New Issue
Block a user