diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
index a3fdbbcfcc2..60ee27013e9 100644
--- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py
+++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -126,9 +126,6 @@ class RecursiveUrlLoader(BaseLoader):
if depth >= self.max_depth:
return
- # Exclude the links that start with any of the excluded directories
- if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
- return
# Get all links that can be accessed from the current URL
visited.add(url)
diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py
index d981b1dc7aa..09a76876d1c 100644
--- a/libs/langchain/langchain/utils/html.py
+++ b/libs/langchain/langchain/utils/html.py
@@ -62,16 +62,19 @@ def extract_sub_links(
all_links = find_all_links(raw_html, pattern=pattern)
absolute_paths = set()
for link in all_links:
- if any(link.startswith(exclude) for exclude in exclude_prefixes):
- continue
# Some may be absolute links like https://to/path
- elif link.startswith("http"):
+ if link.startswith("http"):
absolute_paths.add(link)
# Some may have omitted the protocol like //to/path
elif link.startswith("//"):
absolute_paths.add(f"{urlparse(url).scheme}:{link}")
else:
absolute_paths.add(urljoin(url, link))
- if prevent_outside:
- return [p for p in absolute_paths if p.startswith(base_url)]
- return list(absolute_paths)
+ res = []
+ for path in absolute_paths:
+ if any(path.startswith(exclude) for exclude in exclude_prefixes):
+ continue
+ if prevent_outside and not path.startswith(base_url):
+ continue
+ res.append(path)
+ return res
diff --git a/libs/langchain/tests/unit_tests/utils/test_html.py b/libs/langchain/tests/unit_tests/utils/test_html.py
index b961f966d93..692eae5865c 100644
--- a/libs/langchain/tests/unit_tests/utils/test_html.py
+++ b/libs/langchain/tests/unit_tests/utils/test_html.py
@@ -127,3 +127,32 @@ def test_extract_sub_links_base() -> None:
)
)
assert actual == expected
+
+
+def test_extract_sub_links_exclude() -> None:
+ html = (
+        '<a href="https://foobar.com">one</a>'
+        '<a href="http://baz.net">two</a>'
+        '<a href="https://foobar.com/hello">three</a>'
+        '<a href="https://foobar.com/how/to">four</a>'
+        '<a href="alexis.html"></a>'
+ )
+
+ expected = sorted(
+ [
+ "http://baz.net",
+ "https://foobar.com",
+ "https://foobar.com/hello",
+ "https://foobar.com/hello/alexis.html",
+ ]
+ )
+ actual = sorted(
+ extract_sub_links(
+ html,
+ "https://foobar.com/hello/bill.html",
+ base_url="https://foobar.com",
+ prevent_outside=False,
+ exclude_prefixes=("https://foobar.com/how", "http://baz.org"),
+ )
+ )
+ assert actual == expected