Compare commits

...

6 Commits

Author SHA1 Message Date
Bagatur
255fb6c6d6 fmt 2024-09-02 15:31:39 -07:00
Bagatur
8c49af3e9c fmt 2024-09-02 15:30:51 -07:00
Bagatur
71661fd16b fmt 2024-09-02 15:29:51 -07:00
isaac hershenson
6d3568d992 deafult depth tests 2024-06-04 20:42:51 -07:00
isaac hershenson
09db121339 unit tests recursiveurlloader 2024-06-04 20:40:46 -07:00
isaac hershenson
0a18a48171 integration tests changed 2024-06-04 10:00:36 -07:00
2 changed files with 190 additions and 3 deletions

View File

@@ -1,3 +1,5 @@
import asyncio
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
@@ -29,6 +31,98 @@ def test_async_recursive_url_loader_deterministic() -> None:
assert docs == docs_2 assert docs == docs_2
def test_async_recursive_url_lazy_loader() -> None:
    """lazy_load() with use_async=True should yield the whole crawl lazily."""
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url,
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    # list(...) instead of a no-op comprehension (ruff C416 / PERF402).
    docs = list(loader.lazy_load())
    assert len(docs) == 512
    assert docs[0].page_content == "placeholder"
def test_async_recursive_url_lazy_loader_deterministic() -> None:
    """Two successive lazy_load() crawls should produce identical documents."""
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url,
        use_async=True,
        max_depth=3,
        timeout=None,
    )
    # sorted() accepts any iterable; materializing a list first is
    # redundant (ruff C414).
    docs = sorted(loader.lazy_load(), key=lambda d: d.metadata["source"])
    docs_2 = sorted(loader.lazy_load(), key=lambda d: d.metadata["source"])
    assert docs == docs_2
async def test_async_recursive_url_alazy_loader() -> None:
    """alazy_load() should asynchronously yield every crawled page."""
    base_url = "https://docs.python.org/3.9/"
    crawler = RecursiveUrlLoader(
        base_url,
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    collected = [page async for page in crawler.alazy_load()]
    assert len(collected) == 512
    assert collected[0].page_content == "placeholder"
def test_async_recursive_url_aloader() -> None:
    """aload() driven via asyncio.run should return the complete crawl."""
    start_url = "https://docs.python.org/3.9/"
    async_loader = RecursiveUrlLoader(
        start_url,
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    results = asyncio.run(async_loader.aload())
    assert len(results) == 512
    assert results[0].page_content == "placeholder"
def test_async_recursive_url_aloader_deterministic() -> None:
    """Running aload() twice should give the same documents both times."""
    start_url = "https://docs.python.org/3.9/"
    crawler = RecursiveUrlLoader(
        start_url,
        use_async=True,
        max_depth=3,
        timeout=None,
    )

    def by_source(doc):  # sort key: the page each document came from
        return doc.metadata["source"]

    first_run = sorted(asyncio.run(crawler.aload()), key=by_source)
    second_run = sorted(asyncio.run(crawler.aload()), key=by_source)
    assert first_run == second_run
async def test_async_recursive_url_alazy_loader_deterministic() -> None:
    """Two alazy_load() crawls should yield identical documents."""
    start_url = "https://docs.python.org/3.9/"
    crawler = RecursiveUrlLoader(
        start_url,
        use_async=True,
        max_depth=3,
        timeout=None,
    )

    def by_source(doc):  # sort key: originating page URL
        return doc.metadata["source"]

    first_run = sorted([d async for d in crawler.alazy_load()], key=by_source)
    second_run = sorted([d async for d in crawler.alazy_load()], key=by_source)
    assert first_run == second_run
def test_sync_recursive_url_loader() -> None: def test_sync_recursive_url_loader() -> None:
url = "https://docs.python.org/3.9/" url = "https://docs.python.org/3.9/"
loader = RecursiveUrlLoader( loader = RecursiveUrlLoader(
@@ -39,10 +133,40 @@ def test_sync_recursive_url_loader() -> None:
assert docs[0].page_content == "placeholder" assert docs[0].page_content == "placeholder"
def test_sync_recursive_url_lazy_loader() -> None:
    """Synchronous lazy_load() should yield all pages up to max_depth=2."""
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
    )
    # list(...) instead of a no-op comprehension (ruff C416 / PERF402).
    docs = list(loader.lazy_load())
    assert len(docs) == 24
    assert docs[0].page_content == "placeholder"
def test_sync_recursive_url_aloader() -> None:
    """aload() should work even when the loader is configured synchronously."""
    page_url = "https://docs.python.org/3.9/"
    sync_loader = RecursiveUrlLoader(
        page_url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
    )
    loaded = asyncio.run(sync_loader.aload())
    assert len(loaded) == 24
    assert loaded[0].page_content == "placeholder"
async def test_sync_recursive_url_alazy_loader() -> None:
    """alazy_load() should also work with a use_async=False configuration."""
    page_url = "https://docs.python.org/3.9/"
    sync_loader = RecursiveUrlLoader(
        page_url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
    )
    loaded = [d async for d in sync_loader.alazy_load()]
    assert len(loaded) == 24
    assert loaded[0].page_content == "placeholder"
def test_sync_async_equivalent() -> None: def test_sync_async_equivalent() -> None:
url = "https://docs.python.org/3.9/" url = "https://docs.python.org/3.9/"
loader = RecursiveUrlLoader(url, use_async=False, max_depth=2) loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
async_loader = RecursiveUrlLoader(url, use_async=False, max_depth=2) async_loader = RecursiveUrlLoader(url, use_async=True, max_depth=2)
docs = sorted(loader.load(), key=lambda d: d.metadata["source"]) docs = sorted(loader.load(), key=lambda d: d.metadata["source"])
async_docs = sorted(async_loader.load(), key=lambda d: d.metadata["source"]) async_docs = sorted(async_loader.load(), key=lambda d: d.metadata["source"])
assert docs == async_docs assert docs == async_docs
@@ -60,7 +184,7 @@ def test_loading_invalid_url() -> None:
def test_sync_async_metadata_necessary_properties() -> None: def test_sync_async_metadata_necessary_properties() -> None:
url = "https://docs.python.org/3.9/" url = "https://docs.python.org/3.9/"
loader = RecursiveUrlLoader(url, use_async=False, max_depth=2) loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
async_loader = RecursiveUrlLoader(url, use_async=False, max_depth=2) async_loader = RecursiveUrlLoader(url, use_async=True, max_depth=2)
docs = loader.load() docs = loader.load()
async_docs = async_loader.load() async_docs = async_loader.load()
for doc in docs: for doc in docs:

View File

@@ -3,11 +3,12 @@ from __future__ import annotations
import inspect import inspect
import uuid import uuid
from types import TracebackType from types import TracebackType
from typing import Any, Type from typing import Any, List, Type
import aiohttp import aiohttp
import pytest import pytest
import requests_mock import requests_mock
from langchain_core.documents import Document
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
@@ -97,3 +98,65 @@ def test_no_runtime_args(method: str) -> None:
method_attr = getattr(RecursiveUrlLoader, method) method_attr = getattr(RecursiveUrlLoader, method)
args = list(inspect.signature(method_attr).parameters) args = list(inspect.signature(method_attr).parameters)
assert args == ["self"] assert args == ["self"]
def mock_requests(loader: RecursiveUrlLoader) -> List[Document]:
    """Run ``loader.load()`` against a small mocked site and return the docs.

    The mocked site has four pages: the root links to ``/one`` and ``/two``,
    both of which link to ``../three`` (so de-duplication is exercised), and
    ``/three`` is a leaf with no outgoing links.
    """
    root_html = (
        '<div><a class="blah" href="/one">hullo</a></div>'
        '<div><a class="bleh" href="/two">buhbye</a></div>'
    )
    one_html = '<div><a class="first" href="../three">buhbye</a></div>'
    two_html = '<div><a class="second" href="../three">buhbye</a></div>'
    three_html = "<p>the end<p>"
    # snake_case for a local variable (UPPER_SNAKE_CASE is reserved for
    # module-level constants per PEP 8).
    mocked_pages = [
        ("http://test.com", root_html),
        ("http://test.com/one", one_html),
        ("http://test.com/two", two_html),
        ("http://test.com/three", three_html),
    ]
    with requests_mock.Mocker() as m:
        for url, html in mocked_pages:
            m.get(url, text=html)
        return loader.load()
def test_sync__init__() -> None:
    """With max_depth=1 only the root page itself is loaded."""
    root_only = RecursiveUrlLoader("http://test.com", max_depth=1)
    loaded = mock_requests(root_only)
    assert len(loaded) == 1
def test_async__init__(mocker: Any) -> None:
    """Async crawl with max_depth=1 also returns just the root page."""
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    root_only = RecursiveUrlLoader("http://test.com", max_depth=1, use_async=True)
    loaded = root_only.load()
    assert len(loaded) == 1
def test_sync_default_depth() -> None:
    """The default depth reaches the root plus its direct children (3 pages)."""
    default_loader = RecursiveUrlLoader("http://test.com")
    loaded = mock_requests(default_loader)
    assert len(loaded) == 3
def test_async_default_depth(mocker: Any) -> None:
    """Async crawl at the default depth matches the sync result (3 pages)."""
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    default_loader = RecursiveUrlLoader("http://test.com", use_async=True)
    loaded = default_loader.load()
    assert len(loaded) == 3
def test_sync_deduplication() -> None:
    """A page linked from multiple parents is loaded only once (4 unique)."""
    deep_loader = RecursiveUrlLoader("http://test.com", max_depth=3)
    loaded = mock_requests(deep_loader)
    assert len(loaded) == 4
def test_async_deduplication(mocker: Any) -> None:
    """Async crawl also de-duplicates pages reachable via multiple links."""
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    deep_loader = RecursiveUrlLoader("http://test.com", max_depth=3, use_async=True)
    loaded = deep_loader.load()
    assert len(loaded) == 4