Compare commits

...

5 Commits

Author               SHA1        Message                                   Date
William Fu-Hinthorn  22e0b5a45f  update                                    2023-09-01 16:14:56 -07:00
William Fu-Hinthorn  648a590b79  Merge branch 'master' into wfh/redirects  2023-09-01 16:11:49 -07:00
William Fu-Hinthorn  18df1be6d3  Update url loader                         2023-09-01 16:10:17 -07:00
William Fu-Hinthorn  cbbe3bd713  Update                                    2023-09-01 15:49:57 -07:00
William Fu-Hinthorn  6e26df32ba  Update redirects meta tags                2023-09-01 15:02:53 -07:00


@@ -1,7 +1,7 @@
 import asyncio
 import re
 from typing import Callable, Iterator, List, Optional, Set, Union
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin, urlparse, urlsplit
 import requests
@@ -43,6 +43,29 @@ class RecursiveUrlLoader(BaseLoader):
         self.timeout = timeout if timeout is not None else 10
         self.prevent_outside = prevent_outside if prevent_outside is not None else True
 
+    @staticmethod
+    def get_directory_url(url: str) -> str:
+        """Get the parent directory URL from a given URL.
+
+        Args:
+            url (str): The URL to extract the parent directory from.
+
+        Returns:
+            str: The parent directory URL.
+        """
+        parsed_url = urlsplit(url)
+        path = parsed_url.path
+        if path.endswith("/"):
+            directory_url = url
+        else:
+            suffix = path.rsplit(".", 1)[-1]
+            if suffix in {"html", "htm", "xml", "php", "aspx", "jsp", "asp"}:
+                directory_url = url.rsplit("/", 1)[0] + "/"
+            else:
+                directory_url = url + "/"
+        return directory_url
+
     def _get_sub_links(self, raw_html: str, base_url: str) -> List[str]:
         """This function extracts all the links from the raw html,
         and convert them into absolute paths.
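As a reference for the hunk above, here is a minimal standalone sketch of the same directory-derivation idea: a URL that already ends in "/" is kept, a URL whose path ends in a known page extension is cut back to its parent directory, and anything else is treated as a directory. The helper name directory_of and the example URLs are illustrative, not part of the change.

from urllib.parse import urlsplit

def directory_of(url: str) -> str:
    # Illustrative re-implementation of the get_directory_url logic above.
    path = urlsplit(url).path
    if path.endswith("/"):
        return url  # already a directory-style URL
    if path.rsplit(".", 1)[-1] in {"html", "htm", "xml", "php", "aspx", "jsp", "asp"}:
        return url.rsplit("/", 1)[0] + "/"  # drop the page name, keep its directory
    return url + "/"  # extensionless path: treat it as a directory

print(directory_of("https://example.com/docs/"))            # https://example.com/docs/
print(directory_of("https://example.com/docs/index.html"))  # https://example.com/docs/
print(directory_of("https://example.com/docs/guide"))       # https://example.com/docs/guide/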
@@ -93,13 +116,14 @@ class RecursiveUrlLoader(BaseLoader):
                 continue
         # Remove duplicates
         # also do another filter to prevent outside links
+        directory_url = self.get_directory_url(base_url)
         absolute_paths = list(
             set(
                 [
                     path
                     for path in absolute_paths
                     if not self.prevent_outside
-                    or path.startswith(base_url)
+                    or path.startswith(directory_url)
                     and path != base_url
                 ]
             )
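To make the filter above concrete, the sketch below traces the comprehension with hypothetical values (prevent_outside, base_url, and the candidate paths are made up for illustration). Because `and` binds tighter than `or` in Python, a path is kept when outside links are allowed, or when it sits under directory_url and is not the base page itself.

prevent_outside = True
base_url = "https://example.com/docs/index.html"
directory_url = "https://example.com/docs/"  # what get_directory_url(base_url) would produce

absolute_paths = [
    "https://example.com/docs/intro.html",    # kept: same directory
    "https://example.com/docs/api/ref.html",  # kept: nested under the directory
    "https://example.com/docs/index.html",    # dropped: equals base_url
    "https://example.com/pricing.html",       # dropped: outside the directory
]

kept = sorted(
    {
        path
        for path in absolute_paths
        if not prevent_outside
        or path.startswith(directory_url)
        and path != base_url
    }
)
print(kept)  # ['https://example.com/docs/api/ref.html', 'https://example.com/docs/intro.html']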
@@ -138,9 +162,17 @@ class RecursiveUrlLoader(BaseLoader):
             return []
         # Add a trailing slash if not present
-        if not url.endswith("/"):
+        suffix = url.rsplit(".", 1)[-1]
+        if not url.endswith("/") and suffix not in {
+            "html",
+            "htm",
+            "xml",
+            "php",
+            "aspx",
+            "jsp",
+            "asp",
+        }:
             url += "/"
         # Exclude the root and parent from a list
         visited = set() if visited is None else visited
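The last hunk changes how the starting URL is normalized: a trailing slash is appended only when the URL does not already end in "/" and does not look like a page with one of the listed extensions. A small standalone trace, with an illustrative helper name and example URLs:

def normalize_start_url(url: str) -> str:
    # Mirror of the trailing-slash rule above; note the suffix check runs on the
    # whole URL, so a dot in the domain is harmless (the result is not in the set).
    suffix = url.rsplit(".", 1)[-1]
    if not url.endswith("/") and suffix not in {
        "html", "htm", "xml", "php", "aspx", "jsp", "asp",
    }:
        url += "/"
    return url

print(normalize_start_url("https://example.com/docs"))             # https://example.com/docs/
print(normalize_start_url("https://example.com/docs/"))            # unchanged
print(normalize_start_url("https://example.com/docs/index.html"))  # unchanged: looks like a page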