mirror of
https://github.com/hwchase17/langchain.git
synced 2026-02-21 06:33:41 +00:00
Compare commits
6 Commits
mdrxy/vers
...
isaac-recu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
255fb6c6d6 | ||
|
|
8c49af3e9c | ||
|
|
71661fd16b | ||
|
|
6d3568d992 | ||
|
|
09db121339 | ||
|
|
0a18a48171 |
@@ -1,3 +1,5 @@
|
|||||||
|
import asyncio
|
||||||
|
|
||||||
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
|
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
|
||||||
|
|
||||||
|
|
||||||
@@ -29,6 +31,98 @@ def test_async_recursive_url_loader_deterministic() -> None:
|
|||||||
assert docs == docs_2
|
assert docs == docs_2
|
||||||
|
|
||||||
|
|
||||||
|
def test_async_recursive_url_lazy_loader() -> None:
    """Integration test: lazily load docs.python.org/3.9 with the async crawler.

    Hits the live site up to depth 3 and checks the expected page count and
    that the stub extractor's output is propagated into ``page_content``.
    """
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url,
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    # list(...) instead of a copying comprehension (ruff PERF402/C416).
    docs = list(loader.lazy_load())
    # NOTE(review): 512 is the page count observed for this site/depth — may
    # drift if the live site changes.
    assert len(docs) == 512
    assert docs[0].page_content == "placeholder"
|
||||||
|
|
||||||
|
|
||||||
|
def test_async_recursive_url_lazy_loader_deterministic() -> None:
    """Integration test: two async lazy-load runs yield the same documents.

    Sorts both result sets by source URL so ordering differences between
    crawls do not cause spurious failures.
    """
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url,
        use_async=True,
        max_depth=3,
        timeout=None,
    )
    # Pass the generator straight to sorted() — no intermediate list needed.
    docs = sorted(loader.lazy_load(), key=lambda d: d.metadata["source"])
    docs_2 = sorted(loader.lazy_load(), key=lambda d: d.metadata["source"])
    assert docs == docs_2
|
||||||
|
|
||||||
|
|
||||||
|
async def test_async_recursive_url_alazy_loader() -> None:
    """Integration test: ``alazy_load`` streams the expected documents."""
    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    collected = [doc async for doc in loader.alazy_load()]
    assert len(collected) == 512
    assert collected[0].page_content == "placeholder"
|
||||||
|
|
||||||
|
|
||||||
|
def test_async_recursive_url_aloader() -> None:
    """Integration test: ``aload`` driven via ``asyncio.run`` from sync code."""
    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    loaded = asyncio.run(loader.aload())
    assert len(loaded) == 512
    assert loaded[0].page_content == "placeholder"
|
||||||
|
|
||||||
|
|
||||||
|
def test_async_recursive_url_aloader_deterministic() -> None:
    """Integration test: two ``aload`` runs return identical document sets."""
    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        use_async=True,
        max_depth=3,
        timeout=None,
    )

    def by_source(d):  # sort key: the document's source URL
        return d.metadata["source"]

    first_run = sorted(asyncio.run(loader.aload()), key=by_source)
    second_run = sorted(asyncio.run(loader.aload()), key=by_source)
    assert first_run == second_run
|
||||||
|
|
||||||
|
|
||||||
|
async def test_async_recursive_url_alazy_loader_deterministic() -> None:
    """Integration test: two ``alazy_load`` runs yield identical documents."""
    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        use_async=True,
        max_depth=3,
        timeout=None,
    )

    def by_source(d):  # sort key: the document's source URL
        return d.metadata["source"]

    first_run = sorted(
        [doc async for doc in loader.alazy_load()], key=by_source
    )
    second_run = sorted(
        [doc async for doc in loader.alazy_load()], key=by_source
    )
    assert first_run == second_run
|
||||||
|
|
||||||
|
|
||||||
def test_sync_recursive_url_loader() -> None:
|
def test_sync_recursive_url_loader() -> None:
|
||||||
url = "https://docs.python.org/3.9/"
|
url = "https://docs.python.org/3.9/"
|
||||||
loader = RecursiveUrlLoader(
|
loader = RecursiveUrlLoader(
|
||||||
@@ -39,10 +133,40 @@ def test_sync_recursive_url_loader() -> None:
|
|||||||
assert docs[0].page_content == "placeholder"
|
assert docs[0].page_content == "placeholder"
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_recursive_url_lazy_loader() -> None:
    """Integration test: synchronous ``lazy_load`` at depth 2.

    Verifies the expected page count for docs.python.org/3.9 and that the
    stub extractor's output lands in ``page_content``.
    """
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(
        url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
    )
    # list(...) instead of a copying comprehension (ruff PERF402/C416).
    docs = list(loader.lazy_load())
    # NOTE(review): 24 is the observed page count at depth 2 — may drift if
    # the live site changes.
    assert len(docs) == 24
    assert docs[0].page_content == "placeholder"
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_recursive_url_aloader() -> None:
    """Integration test: ``aload`` on a loader configured for sync crawling."""
    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        extractor=lambda _: "placeholder",
        use_async=False,
        max_depth=2,
    )
    loaded = asyncio.run(loader.aload())
    assert len(loaded) == 24
    assert loaded[0].page_content == "placeholder"
|
||||||
|
|
||||||
|
|
||||||
|
async def test_sync_recursive_url_alazy_loader() -> None:
    """Integration test: ``alazy_load`` on a loader configured for sync crawling."""
    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        extractor=lambda _: "placeholder",
        use_async=False,
        max_depth=2,
    )
    collected = [doc async for doc in loader.alazy_load()]
    assert len(collected) == 24
    assert collected[0].page_content == "placeholder"
|
||||||
|
|
||||||
|
|
||||||
def test_sync_async_equivalent() -> None:
    """Integration test: sync and async crawls produce the same documents.

    The async loader MUST be constructed with ``use_async=True`` — with
    ``use_async=False`` both loaders would take the synchronous path and the
    test would be vacuous (this was the bug the collapsed diff fixed).
    """
    url = "https://docs.python.org/3.9/"
    loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
    async_loader = RecursiveUrlLoader(url, use_async=True, max_depth=2)
    # Sort by source URL: crawl order is not guaranteed to match.
    docs = sorted(loader.load(), key=lambda d: d.metadata["source"])
    async_docs = sorted(async_loader.load(), key=lambda d: d.metadata["source"])
    assert docs == async_docs
|
||||||
@@ -60,7 +184,7 @@ def test_loading_invalid_url() -> None:
|
|||||||
def test_sync_async_metadata_necessary_properties() -> None:
|
def test_sync_async_metadata_necessary_properties() -> None:
|
||||||
url = "https://docs.python.org/3.9/"
|
url = "https://docs.python.org/3.9/"
|
||||||
loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
|
loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
|
||||||
async_loader = RecursiveUrlLoader(url, use_async=False, max_depth=2)
|
async_loader = RecursiveUrlLoader(url, use_async=True, max_depth=2)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
async_docs = async_loader.load()
|
async_docs = async_loader.load()
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
|
|||||||
@@ -3,11 +3,12 @@ from __future__ import annotations
|
|||||||
import inspect
|
import inspect
|
||||||
import uuid
|
import uuid
|
||||||
from types import TracebackType
|
from types import TracebackType
|
||||||
from typing import Any, Type
|
from typing import Any, List, Type
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import pytest
|
import pytest
|
||||||
import requests_mock
|
import requests_mock
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
|
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
|
||||||
|
|
||||||
@@ -97,3 +98,65 @@ def test_no_runtime_args(method: str) -> None:
|
|||||||
method_attr = getattr(RecursiveUrlLoader, method)
|
method_attr = getattr(RecursiveUrlLoader, method)
|
||||||
args = list(inspect.signature(method_attr).parameters)
|
args = list(inspect.signature(method_attr).parameters)
|
||||||
assert args == ["self"]
|
assert args == ["self"]
|
||||||
|
|
||||||
|
|
||||||
|
def mock_requests(loader: RecursiveUrlLoader) -> List[Document]:
    """Run ``loader.load()`` against a fixed four-page mocked site.

    The pages form a small link graph rooted at ``http://test.com`` (two
    links from the root, both inner pages pointing at a shared third page),
    letting callers exercise depth limits and deduplication without the
    network.
    """
    pages = {
        "http://test.com": (
            '<div><a class="blah" href="/one">hullo</a></div>'
            '<div><a class="bleh" href="/two">buhbye</a></div>'
        ),
        "http://test.com/one": '<div><a class="first" href="../three">buhbye</a></div>',
        "http://test.com/two": '<div><a class="second" href="../three">buhbye</a></div>',
        "http://test.com/three": "<p>the end<p>",
    }

    with requests_mock.Mocker() as mocked:
        for url, html in pages.items():
            mocked.get(url, text=html)
        return loader.load()
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync__init__() -> None:
    """At ``max_depth=1`` the sync loader returns only the root page."""
    root_only = mock_requests(RecursiveUrlLoader("http://test.com", max_depth=1))
    assert len(root_only) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_async__init__(mocker: Any) -> None:
    """At ``max_depth=1`` the async loader returns only the root page."""
    # Replace aiohttp's session.get with the file-local MockGet stub.
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    root_only = RecursiveUrlLoader(
        "http://test.com", max_depth=1, use_async=True
    ).load()
    assert len(root_only) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_default_depth() -> None:
    """With the default depth, the sync loader reaches three mocked pages."""
    docs = mock_requests(RecursiveUrlLoader("http://test.com"))
    assert len(docs) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_async_default_depth(mocker: Any) -> None:
    """With the default depth, the async loader reaches three mocked pages."""
    # Replace aiohttp's session.get with the file-local MockGet stub.
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    docs = RecursiveUrlLoader("http://test.com", use_async=True).load()
    assert len(docs) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_deduplication() -> None:
    """At depth 3 the shared page is loaded once: four docs, not five."""
    docs = mock_requests(RecursiveUrlLoader("http://test.com", max_depth=3))
    assert len(docs) == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_async_deduplication(mocker: Any) -> None:
    """At depth 3 the async loader also deduplicates the shared page."""
    # Replace aiohttp's session.get with the file-local MockGet stub.
    mocker.patch.object(aiohttp.ClientSession, "get", new=MockGet)
    docs = RecursiveUrlLoader(
        "http://test.com", max_depth=3, use_async=True
    ).load()
    assert len(docs) == 4
|
||||||
|
|||||||
Reference in New Issue
Block a user