Compare commits

...

6 Commits

Author SHA1 Message Date
Bagatur
255fb6c6d6 fmt 2024-09-02 15:31:39 -07:00
Bagatur
8c49af3e9c fmt 2024-09-02 15:30:51 -07:00
Bagatur
71661fd16b fmt 2024-09-02 15:29:51 -07:00
isaac hershenson
6d3568d992 default depth tests 2024-06-04 20:42:51 -07:00
isaac hershenson
09db121339 unit tests recursiveurlloader 2024-06-04 20:40:46 -07:00
isaac hershenson
0a18a48171 integration tests changed 2024-06-04 10:00:36 -07:00
2 changed files with 190 additions and 3 deletions

View File

@@ -1,3 +1,5 @@
import asyncio
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
@@ -29,6 +31,98 @@ def test_async_recursive_url_loader_deterministic() -> None:
assert docs == docs_2
def test_async_recursive_url_lazy_loader() -> None:
    """Lazily crawl docs.python.org to depth 3 and check count and extraction."""
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url,
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    # list() consumes the generator directly; the [doc for doc in ...] copy
    # was a redundant identity comprehension (ruff PERF402).
    docs = list(loader.lazy_load())
    assert len(docs) == 512
    assert docs[0].page_content == "placeholder"
def test_async_recursive_url_lazy_loader_deterministic() -> None:
    """Two lazy crawls of the same site must yield identical documents."""
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url,
        use_async=True,
        max_depth=3,
        timeout=None,
    )
    # sorted() accepts any iterable; materializing the generator into an
    # intermediate list first was redundant (sorted(list(...)) smell, C413).
    docs = sorted(loader.lazy_load(), key=lambda d: d.metadata["source"])
    docs_2 = sorted(loader.lazy_load(), key=lambda d: d.metadata["source"])
    assert docs == docs_2
async def test_async_recursive_url_alazy_loader() -> None:
    """Async-lazily crawl docs.python.org to depth 3 and verify the results."""
    start_url = "https://docs.python.org/3.9/"
    crawler = RecursiveUrlLoader(
        start_url,
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    # An async comprehension is required here: alazy_load() yields asynchronously.
    pages = [page async for page in crawler.alazy_load()]
    assert len(pages) == 512
    assert pages[0].page_content == "placeholder"
def test_async_recursive_url_aloader() -> None:
    """Drive the async aload() entry point from synchronous test code."""
    target = "https://docs.python.org/3.9/"
    crawler = RecursiveUrlLoader(
        target,
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    fetched = asyncio.run(crawler.aload())
    assert len(fetched) == 512
    assert fetched[0].page_content == "placeholder"
def test_async_recursive_url_aloader_deterministic() -> None:
    """aload() must be deterministic across two runs of the same crawl."""
    target = "https://docs.python.org/3.9/"
    crawler = RecursiveUrlLoader(
        target,
        use_async=True,
        max_depth=3,
        timeout=None,
    )

    def by_source(doc: Any) -> Any:
        # Sort key: the document's source URL from its metadata.
        return doc.metadata["source"]

    first_run = sorted(asyncio.run(crawler.aload()), key=by_source)
    second_run = sorted(asyncio.run(crawler.aload()), key=by_source)
    assert first_run == second_run
async def test_async_recursive_url_alazy_loader_deterministic() -> None:
    """alazy_load() must yield the same documents on repeated crawls."""
    start_url = "https://docs.python.org/3.9/"
    crawler = RecursiveUrlLoader(
        start_url,
        use_async=True,
        max_depth=3,
        timeout=None,
    )

    async def crawl_sorted() -> list:
        # Collect every asynchronously-yielded page, then order by source URL
        # so the comparison is independent of crawl scheduling.
        pages = [page async for page in crawler.alazy_load()]
        pages.sort(key=lambda p: p.metadata["source"])
        return pages

    assert await crawl_sorted() == await crawl_sorted()
def test_sync_recursive_url_loader() -> None:
url = "https://docs.python.org/3.9/"
loader = RecursiveUrlLoader(
@@ -39,10 +133,40 @@ def test_sync_recursive_url_loader() -> None:
assert docs[0].page_content == "placeholder"
def test_sync_recursive_url_lazy_loader() -> None:
    """Synchronous lazy crawl to depth 2 returns 24 extracted documents."""
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
    )
    # list() consumes the generator directly; the identity comprehension
    # copy was redundant (ruff PERF402).
    docs = list(loader.lazy_load())
    assert len(docs) == 24
    assert docs[0].page_content == "placeholder"
def test_sync_recursive_url_aloader() -> None:
    """aload() must also work when the loader was built with use_async=False."""
    crawler = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        extractor=lambda _: "placeholder",
        use_async=False,
        max_depth=2,
    )
    fetched = asyncio.run(crawler.aload())
    assert len(fetched) == 24
    assert fetched[0].page_content == "placeholder"
async def test_sync_recursive_url_alazy_loader() -> None:
    """alazy_load() must also work when the loader was built with use_async=False."""
    crawler = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        extractor=lambda _: "placeholder",
        use_async=False,
        max_depth=2,
    )
    collected = [page async for page in crawler.alazy_load()]
    assert len(collected) == 24
    assert collected[0].page_content == "placeholder"
def test_sync_async_equivalent() -> None:
    """Sync and async crawls of the same site must produce identical documents."""
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
    # Fix: async_loader was first constructed with use_async=False and then
    # immediately rebuilt — keep only the use_async=True construction, which
    # is the whole point of the equivalence check.
    async_loader = RecursiveUrlLoader(url, use_async=True, max_depth=2)
    # Sort both result sets by source URL so ordering differences between the
    # sync and async crawl strategies don't cause spurious failures.
    docs = sorted(loader.load(), key=lambda d: d.metadata["source"])
    async_docs = sorted(async_loader.load(), key=lambda d: d.metadata["source"])
    assert docs == async_docs
@@ -60,7 +184,7 @@ def test_loading_invalid_url() -> None:
def test_sync_async_metadata_necessary_properties() -> None:
url = "https://docs.python.org/3.9/"
loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
async_loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
async_loader = RecursiveUrlLoader(url, use_async=True, max_depth=2)
docs = loader.load()
async_docs = async_loader.load()
for doc in docs:

View File

@@ -3,11 +3,12 @@ from __future__ import annotations
import inspect
import uuid
from types import TracebackType
from typing import Any, Type
from typing import Any, List, Type
import aiohttp
import pytest
import requests_mock
from langchain_core.documents import Document
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
@@ -97,3 +98,65 @@ def test_no_runtime_args(method: str) -> None:
method_attr = getattr(RecursiveUrlLoader, method)
args = list(inspect.signature(method_attr).parameters)
assert args == ["self"]
def mock_requests(loader: RecursiveUrlLoader) -> List[Document]:
    """Run ``loader.load()`` against a small mocked four-page site.

    Site layout: the root links to ``/one`` and ``/two``, both of which link
    onward to ``/three``, a leaf page with no outgoing links.
    """
    pages = {
        "http://test.com": (
            '<div><a class="blah" href="/one">hullo</a></div>'
            '<div><a class="bleh" href="/two">buhbye</a></div>'
        ),
        "http://test.com/one": '<div><a class="first" href="../three">buhbye</a></div>',
        "http://test.com/two": '<div><a class="second" href="../three">buhbye</a></div>',
        "http://test.com/three": "<p>the end<p>",
    }
    with requests_mock.Mocker() as mock:
        for page_url, page_html in pages.items():
            mock.get(page_url, text=page_html)
        return loader.load()
def test_sync__init__() -> None:
    """A depth-1 sync crawl of the mocked site yields only the root page."""
    crawled = mock_requests(RecursiveUrlLoader("http://test.com", max_depth=1))
    assert len(crawled) == 1
def test_async__init__(mocker: Any) -> None:
    """A depth-1 async crawl of the mocked site yields only the root page."""
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    crawler = RecursiveUrlLoader("http://test.com", max_depth=1, use_async=True)
    assert len(crawler.load()) == 1
def test_sync_default_depth() -> None:
    """With no max_depth, the sync crawl returns the root plus its two children."""
    crawled = mock_requests(RecursiveUrlLoader("http://test.com"))
    # 3 docs = root + /one + /two; /three is one level deeper than the
    # default depth reaches (presumably default max_depth is 2 — per the
    # depth-3 dedup test below).
    assert len(crawled) == 3
def test_async_default_depth(mocker: Any) -> None:
    """With no max_depth, the async crawl matches the sync default behavior."""
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    crawler = RecursiveUrlLoader("http://test.com", use_async=True)
    assert len(crawler.load()) == 3
def test_sync_deduplication() -> None:
    """/three is linked from both /one and /two but must be loaded only once."""
    crawled = mock_requests(RecursiveUrlLoader("http://test.com", max_depth=3))
    # 4 unique pages, not 5: the doubly-linked /three appears a single time.
    assert len(crawled) == 4
def test_async_deduplication(mocker: Any) -> None:
    """The async crawl must also deduplicate the doubly-linked /three page."""
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    crawler = RecursiveUrlLoader("http://test.com", max_depth=3, use_async=True)
    assert len(crawler.load()) == 4