Compare commits

...

3 Commits

Author               SHA1        Message                   Date
William Fu-Hinthorn  0b7376a44d  lint                      2023-08-04 16:36:48 -07:00
William Fu-Hinthorn  8f0cda2ca2  update                    2023-08-04 16:03:02 -07:00
William Fu-Hinthorn  52aa7a93c9  Fix Recursive URL Loader  2023-08-04 15:47:48 -07:00


@@ -1,5 +1,5 @@
 from typing import Iterator, List, Optional, Set
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urldefrag, urljoin, urlparse
 
 import requests
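
The newly imported urldefrag strips the "#fragment" part of a URL, so anchor links like page.html#install collapse to the page they point at instead of being treated as distinct pages. A quick standard-library illustration:

    from urllib.parse import urldefrag

    # urldefrag returns the URL without its fragment, plus the fragment itself
    url, fragment = urldefrag("https://example.com/docs/page.html#install")
    print(url)       # https://example.com/docs/page.html
    print(fragment)  # install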
@@ -14,16 +14,20 @@ class RecursiveUrlLoader(BaseLoader):
         self,
         url: str,
         exclude_dirs: Optional[str] = None,
+        crawl_siblings: bool = False,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
 
         Args:
             url: The URL to crawl.
             exclude_dirs: A list of subdirectories to exclude.
+            crawl_siblings: Whether to crawl to sibling directories.
+                Useful if the main index page is in a subdirectory.
         """
 
         self.url = url
         self.exclude_dirs = exclude_dirs
+        self.crawl_siblings = crawl_siblings
 
     def get_child_links_recursive(
         self, url: str, visited: Optional[Set[str]] = None
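
A minimal usage sketch of the new flag (the import path and example URL are assumptions for illustration, not part of the diff):

    from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

    # The index page lives in a subdirectory; with crawl_siblings=True the
    # loader may also follow links into sibling directories under the parent.
    loader = RecursiveUrlLoader(
        url="https://docs.python.org/3.9/",
        crawl_siblings=True,
    )
    docs = loader.load()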
@@ -69,19 +73,30 @@ class RecursiveUrlLoader(BaseLoader):
         response = requests.get(url)
         soup = BeautifulSoup(response.text, "html.parser")
         all_links = [link.get("href") for link in soup.find_all("a")]
-        # Extract only the links that are children of the current URL
-        child_links = list(
-            {
-                link
-                for link in all_links
-                if link and link.startswith(current_path) and link != current_path
-            }
-        )
+        child_links = set()
+        current_root = parent_url if self.crawl_siblings else current_path
+        for link in all_links:
+            link, _ = urldefrag(link)
+            if link:
+                parsed_link = urlparse(link)
+                # Relative Links
+                if not parsed_link.scheme and not parsed_link.netloc:
+                    if parsed_link.path.startswith("/"):
+                        if link.startswith(current_root):
+                            child_links.add(link)
+                    else:
+                        joined_path = urljoin(current_path, link)
+                        if joined_path.startswith(current_root):
+                            child_links.add(joined_path)
+                # Absolute Links
+                elif (
+                    parsed_link.netloc == parsed_url.netloc
+                    and link != current_path
+                    and link.startswith(current_root)
+                ):
+                    child_links.add(link)
         # Get absolute path for all root relative links listed
         absolute_paths = [urljoin(base_url, link) for link in child_links]
         # Store the visited links and recursively visit the children
         for link in absolute_paths:
             # Check all unvisited links
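
The scheme/netloc checks in the loop above are what separate relative links from absolute ones. A standalone sketch of that classification using only the standard library (the example URLs and the simplified is_child test are illustrative, not taken from the diff):

    from urllib.parse import urljoin, urlparse

    current_path = "/docs/"  # path of the page being crawled
    page_netloc = "example.com"

    for link in ["guide.html", "/docs/api.html",
                 "https://example.com/docs/faq.html", "https://other.org/x"]:
        parsed = urlparse(link)
        if not parsed.scheme and not parsed.netloc:
            # Relative link: root-relative paths already start with "/";
            # page-relative ones are resolved against the current path.
            path = link if parsed.path.startswith("/") else urljoin(current_path, link)
            is_child = path.startswith(current_path)
        else:
            # Absolute link: only a child if it stays on the same host
            # and under the current path.
            is_child = parsed.netloc == page_netloc and link.startswith(
                f"https://{page_netloc}{current_path}"
            )
        print(f"{link} -> {is_child}")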