diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py
index 8839b4a943c..ebdd7b86bae 100644
--- a/libs/langchain/langchain/utils/html.py
+++ b/libs/langchain/langchain/utils/html.py
@@ -59,11 +59,12 @@ def extract_sub_links(
for link in all_links:
# Some may be absolute links like https://to/path
if link.startswith("http"):
- if not prevent_outside or link.startswith(base_url):
- absolute_paths.add(link)
+ absolute_paths.add(link)
# Some may have omitted the protocol like //to/path
elif link.startswith("//"):
absolute_paths.add(f"{urlparse(base_url).scheme}:{link}")
else:
absolute_paths.add(urljoin(base_url, link))
+ if prevent_outside:
+ return [p for p in absolute_paths if p.startswith(base_url)]
return list(absolute_paths)
diff --git a/libs/langchain/tests/unit_tests/utils/test_html.py b/libs/langchain/tests/unit_tests/utils/test_html.py
index a5c42b6a34f..eaaa3544e86 100644
--- a/libs/langchain/tests/unit_tests/utils/test_html.py
+++ b/libs/langchain/tests/unit_tests/utils/test_html.py
@@ -86,13 +86,8 @@ def test_extract_sub_links() -> None:
actual = sorted(extract_sub_links(html, "https://foobar.com"))
assert actual == expected
- actual = sorted(extract_sub_links(html, "https://foobar.com/hello"))
- expected = sorted(
- [
- "https://foobar.com/hello",
- "https://foobar.com/how/are/you/doing",
- ]
- )
+ actual = extract_sub_links(html, "https://foobar.com/hello")
+ expected = ["https://foobar.com/hello"]
assert actual == expected
actual = sorted(