# Provenance (taken from the original patch header):
#   Commit:  6221eb5974d48f405c4369c80367bb40c9b546eb
#   From:    Zend <108248080+proximal-phalanx@users.noreply.github.com>
#   Date:    Fri, 11 Aug 2023 05:50:31 +0800
#   Subject: [PATCH] Recursive url loader w/ test (#8813)
#
#   Description: Due to some issue on the test, this is a separate PR with
#   the test for #8502
#   Tag maintainer: @rlancemartin
#   Co-authored-by: Lance Martin
#   Co-authored-by: Bagatur
#
#   Target file:
#   libs/langchain/tests/integration_tests/document_loaders/test_recursive_url_loader.py
"""Integration tests for ``RecursiveUrlLoader``.

Each test builds a loader with ``max_depth=1`` and an extractor that ignores
its input and returns a fixed string, then checks how many documents
``load()`` returns.

NOTE(review): these tests appear to hit live URLs; the expected document
count of 24 presumably reflects the state of docs.python.org when the test
was written -- confirm before relying on it.
"""
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

# URL crawled by the sync and async tests.
PYTHON_DOCS_URL = "https://docs.python.org/3.9/"
# URL used to exercise the "nothing could be loaded" path.
INVALID_URL = "https://this.url.is.invalid/this/is/a/test"
# Content every document is expected to carry, since the extractor is stubbed.
PLACEHOLDER = "placeholder"
# Number of documents the tests expect from PYTHON_DOCS_URL.
EXPECTED_DOC_COUNT = 24


def _placeholder_extractor(_: str) -> str:
    """Ignore the raw page text and return the fixed placeholder string."""
    return PLACEHOLDER


def _make_loader(url: str, use_async: bool) -> RecursiveUrlLoader:
    """Build a depth-1 loader for *url* with the placeholder extractor."""
    return RecursiveUrlLoader(
        url=url,
        extractor=_placeholder_extractor,
        use_async=use_async,
        max_depth=1,
    )


def test_async_recursive_url_loader() -> None:
    """Async mode returns the expected number of placeholder documents."""
    loaded = _make_loader(PYTHON_DOCS_URL, use_async=True).load()
    assert len(loaded) == EXPECTED_DOC_COUNT
    assert loaded[0].page_content == PLACEHOLDER


def test_sync_recursive_url_loader() -> None:
    """Sync mode returns the expected number of placeholder documents."""
    loaded = _make_loader(PYTHON_DOCS_URL, use_async=False).load()
    assert len(loaded) == EXPECTED_DOC_COUNT
    assert loaded[0].page_content == PLACEHOLDER


def test_loading_invalid_url() -> None:
    """Loading from an invalid URL yields no documents."""
    loaded = _make_loader(INVALID_URL, use_async=False).load()
    assert len(loaded) == 0