From 8616e1c44aae11071cd324b3df8755d4a1cc0bcf Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 6 Jun 2024 14:45:05 -0700 Subject: [PATCH] wip --- .../document_loaders/recursive_url_loader.py | 3 ++- .../document_loaders/test_recursive_url_loader.py | 2 +- libs/core/langchain_core/utils/html.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py index 60f3bb70267..d872c7f8afa 100644 --- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py +++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py @@ -199,7 +199,8 @@ class RecursiveUrlLoader(BaseLoader): if depth + 1 < self.max_depth: for link in self._extract_sub_links(text, url): if link not in visited: - yield from self._lazy_load_recursive(link, visited, depth=depth + 1) + for doc in self._lazy_load_recursive(link, visited, depth=depth + 1): + yield doc if link not in visited: raise ValueError diff --git a/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py b/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py index c0cec92db2b..5ed846b3c32 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py +++ b/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py @@ -32,7 +32,7 @@ def test_async_recursive_url_loader_deterministic() -> None: def test_sync_recursive_url_loader() -> None: - url = "https://docs.python.org/3.9/" + url = "https://python.langchain.com/" loader = RecursiveUrlLoader( url, extractor=lambda _: "placeholder", diff --git a/libs/core/langchain_core/utils/html.py b/libs/core/langchain_core/utils/html.py index 2f9663aac97..4a6a46d979d 100644 --- a/libs/core/langchain_core/utils/html.py +++ b/libs/core/langchain_core/utils/html.py @@ -121,4 +121,4 @@ def extract_sub_links( continue results.append(path) - return results + return sorted(results)