Compare commits

...

5 Commits

Author               SHA1        Message                                   Date
William Fu-Hinthorn  22e0b5a45f  update                                    2023-09-01 16:14:56 -07:00
William Fu-Hinthorn  648a590b79  Merge branch 'master' into wfh/redirects  2023-09-01 16:11:49 -07:00
William Fu-Hinthorn  18df1be6d3  Update url loader                         2023-09-01 16:10:17 -07:00
William Fu-Hinthorn  cbbe3bd713  Update                                    2023-09-01 15:49:57 -07:00
William Fu-Hinthorn  6e26df32ba  Update redirects meta tags                2023-09-01 15:02:53 -07:00


@@ -1,7 +1,7 @@
 import asyncio
 import re
 from typing import Callable, Iterator, List, Optional, Set, Union
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin, urlparse, urlsplit
 import requests
@@ -43,6 +43,29 @@ class RecursiveUrlLoader(BaseLoader):
         self.timeout = timeout if timeout is not None else 10
         self.prevent_outside = prevent_outside if prevent_outside is not None else True
 
+    @staticmethod
+    def get_directory_url(url: str) -> str:
+        """Get the parent directory URL from a given URL.
+
+        Args:
+            url (str): The URL to extract the parent directory from.
+
+        Returns:
+            str: The parent directory URL.
+        """
+        parsed_url = urlsplit(url)
+        path = parsed_url.path
+        if path.endswith("/"):
+            directory_url = url
+        else:
+            suffix = path.rsplit(".", 1)[-1]
+            if suffix in {"html", "htm", "xml", "php", "aspx", "jsp", "asp"}:
+                directory_url = url.rsplit("/", 1)[0] + "/"
+            else:
+                directory_url = url + "/"
+        return directory_url
+
     def _get_sub_links(self, raw_html: str, base_url: str) -> List[str]:
         """This function extracts all the links from the raw html,
         and convert them into absolute paths.
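As a reference for the hunk above, here is a minimal standalone sketch of the same directory-derivation idea: a URL that already ends in "/" is kept, a URL whose path ends in a known page extension is cut back to its parent directory, and anything else is treated as a directory. The helper name directory_of and the example URLs are illustrative, not part of the change.

from urllib.parse import urlsplit

def directory_of(url: str) -> str:
    # Illustrative re-implementation of the get_directory_url logic above.
    path = urlsplit(url).path
    if path.endswith("/"):
        return url  # already a directory-style URL
    if path.rsplit(".", 1)[-1] in {"html", "htm", "xml", "php", "aspx", "jsp", "asp"}:
        return url.rsplit("/", 1)[0] + "/"  # drop the page name, keep its directory
    return url + "/"  # extensionless path: treat it as a directory

print(directory_of("https://example.com/docs/"))            # https://example.com/docs/
print(directory_of("https://example.com/docs/index.html"))  # https://example.com/docs/
print(directory_of("https://example.com/docs/guide"))       # https://example.com/docs/guide/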
@@ -93,13 +116,14 @@ class RecursiveUrlLoader(BaseLoader):
                 continue
         # Remove duplicates
         # also do another filter to prevent outside links
+        directory_url = self.get_directory_url(base_url)
         absolute_paths = list(
             set(
                 [
                     path
                     for path in absolute_paths
                     if not self.prevent_outside
-                    or path.startswith(base_url)
+                    or path.startswith(directory_url)
                     and path != base_url
                 ]
             )
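To make the filter above concrete, the sketch below traces the comprehension with hypothetical values (prevent_outside, base_url, and the candidate paths are made up for illustration). Because `and` binds tighter than `or` in Python, a path is kept when outside links are allowed, or when it sits under directory_url and is not the base page itself.

prevent_outside = True
base_url = "https://example.com/docs/index.html"
directory_url = "https://example.com/docs/"  # what get_directory_url(base_url) would produce

absolute_paths = [
    "https://example.com/docs/intro.html",    # kept: same directory
    "https://example.com/docs/api/ref.html",  # kept: nested under the directory
    "https://example.com/docs/index.html",    # dropped: equals base_url
    "https://example.com/pricing.html",       # dropped: outside the directory
]

kept = sorted(
    {
        path
        for path in absolute_paths
        if not prevent_outside
        or path.startswith(directory_url)
        and path != base_url
    }
)
print(kept)  # ['https://example.com/docs/api/ref.html', 'https://example.com/docs/intro.html']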
@@ -138,9 +162,17 @@ class RecursiveUrlLoader(BaseLoader):
             return []
         # Add a trailing slash if not present
-        if not url.endswith("/"):
+        suffix = url.rsplit(".", 1)[-1]
+        if not url.endswith("/") and suffix not in {
+            "html",
+            "htm",
+            "xml",
+            "php",
+            "aspx",
+            "jsp",
+            "asp",
+        }:
             url += "/"
         # Exclude the root and parent from a list
         visited = set() if visited is None else visited
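The last hunk changes how the starting URL is normalized: a trailing slash is appended only when the URL does not already end in "/" and does not look like a page with one of the listed extensions. A small standalone trace, with an illustrative helper name and example URLs:

def normalize_start_url(url: str) -> str:
    # Mirror of the trailing-slash rule above; note the suffix check runs on the
    # whole URL, so a dot in the domain is harmless (the result is not in the set).
    suffix = url.rsplit(".", 1)[-1]
    if not url.endswith("/") and suffix not in {
        "html", "htm", "xml", "php", "aspx", "jsp", "asp",
    }:
        url += "/"
    return url

print(normalize_start_url("https://example.com/docs"))             # https://example.com/docs/
print(normalize_start_url("https://example.com/docs/"))            # unchanged
print(normalize_start_url("https://example.com/docs/index.html"))  # unchanged: looks like a page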