community: Corrected aload to be asynchronous in WebBaseLoader (#28337)

- **Description:** The `aload` function, contrary to its name, was not
an asynchronous function, so it could not run concurrently with other
asynchronous code. This PR deprecates it and implements a native async
`alazy_load` (see the sketch after the checklist below).

- **Issue:** #28336 

- **Test:** Done

- **Docs:**
[here](e0a95e5646/docs/docs/integrations/document_loaders/web_base.ipynb (L201))

- **Lint:** All checks passed
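For illustration, a minimal sketch of the concurrency the native async path enables (the URLs are hypothetical placeholders; assumes `langchain-community` and `beautifulsoup4` are installed):

```python
import asyncio

from langchain_community.document_loaders import WebBaseLoader


async def main() -> None:
    # Two loaders pointed at hypothetical URLs; with a coroutine-based
    # alazy_load, their fetches can overlap instead of running back to back.
    loader_a = WebBaseLoader(web_paths=["https://example.com/a"])
    loader_b = WebBaseLoader(web_paths=["https://example.com/b"])

    async def collect(loader: WebBaseLoader) -> list:
        return [doc async for doc in loader.alazy_load()]

    docs_a, docs_b = await asyncio.gather(collect(loader_a), collect(loader_b))
    print(len(docs_a), len(docs_b))


asyncio.run(main())
```

With the old synchronous `aload`, the second loader could not start fetching until the first had finished.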

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
commit f38fc89f35 (parent a08c76a6b2)
yeounhak 2024-12-21 04:42:52 +09:00, committed by GitHub
4 changed files with 213 additions and 93 deletions

File diff suppressed because one or more lines are too long

libs/community/langchain_community/document_loaders/web_base.py

@@ -3,10 +3,11 @@
 import asyncio
 import logging
 import warnings
-from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Sequence, Union
 
 import aiohttp
 import requests
+from langchain_core._api import deprecated
 from langchain_core.documents import Document
 
 from langchain_community.document_loaders.base import BaseLoader
@@ -78,12 +79,7 @@ class WebBaseLoader(BaseLoader):
         .. code-block:: python
 
             docs = []
-            docs_lazy = loader.lazy_load()
-
-            # async variant:
-            # docs_lazy = await loader.alazy_load()
-
-            for doc in docs_lazy:
+            for doc in loader.lazy_load():
                 docs.append(doc)
             print(docs[0].page_content[:100])
             print(docs[0].metadata)
@@ -98,7 +94,9 @@ class WebBaseLoader(BaseLoader):
     Async load:
         .. code-block:: python
 
-            docs = await loader.aload()
+            docs = []
+            async for doc in loader.alazy_load():
+                docs.append(doc)
             print(docs[0].page_content[:100])
             print(docs[0].metadata)
 
@@ -108,6 +106,37 @@ class WebBaseLoader(BaseLoader):
 
             {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
 
+    .. versionchanged:: 0.3.14
+
+        Deprecated ``aload`` (which was not async) and implemented a native async
+        ``alazy_load``. Expand below for more details.
+
+        .. dropdown:: How to update ``aload``
+
+            Instead of using ``aload``, you can use ``load`` for synchronous loading or
+            ``alazy_load`` for asynchronous lazy loading.
+
+            Example using ``load`` (synchronous):
+
+            .. code-block:: python
+
+                docs: List[Document] = loader.load()
+
+            Example using ``alazy_load`` (asynchronous):
+
+            .. code-block:: python
+
+                docs: List[Document] = []
+                async for doc in loader.alazy_load():
+                    docs.append(doc)
+
+            This is in preparation for accommodating an asynchronous ``aload`` in the
+            future:
+
+            .. code-block:: python
+
+                docs: List[Document] = await loader.aload()
+
     """  # noqa: E501
 
     def __init__(
@@ -279,11 +308,12 @@ class WebBaseLoader(BaseLoader):
                 "`parser` must be one of " + ", ".join(valid_parsers) + "."
             )
 
-    def scrape_all(self, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
-        """Fetch all urls, then return soups for all results."""
+    def _unpack_fetch_results(
+        self, results: Any, urls: List[str], parser: Union[str, None] = None
+    ) -> List[Any]:
+        """Unpack fetch results into BeautifulSoup objects."""
         from bs4 import BeautifulSoup
 
-        results = asyncio.run(self.fetch_all(urls))
         final_results = []
         for i, result in enumerate(results):
             url = urls[i]
@@ -294,9 +324,20 @@ class WebBaseLoader(BaseLoader):
                     parser = self.default_parser
                 self._check_parser(parser)
 
             final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))
         return final_results
 
+    def scrape_all(self, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
+        """Fetch all urls, then return soups for all results."""
+        results = asyncio.run(self.fetch_all(urls))
+        return self._unpack_fetch_results(results, urls, parser=parser)
+
+    async def ascrape_all(
+        self, urls: List[str], parser: Union[str, None] = None
+    ) -> List[Any]:
+        """Async fetch all urls, then return soups for all results."""
+        results = await self.fetch_all(urls)
+        return self._unpack_fetch_results(results, urls, parser=parser)
 
     def _scrape(
         self,
         url: str,
@@ -339,6 +380,22 @@ class WebBaseLoader(BaseLoader):
             metadata = _build_metadata(soup, path)
             yield Document(page_content=text, metadata=metadata)
 
+    async def alazy_load(self) -> AsyncIterator[Document]:
+        """Async lazy load text from the url(s) in web_path."""
+        results = await self.ascrape_all(self.web_paths)
+        for path, soup in zip(self.web_paths, results):
+            text = soup.get_text(**self.bs_get_text_kwargs)
+            metadata = _build_metadata(soup, path)
+            yield Document(page_content=text, metadata=metadata)
+
+    @deprecated(
+        since="0.3.14",
+        removal="1.0",
+        message=(
+            "See API reference for updated usage: "
+            "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"  # noqa: E501
+        ),
+    )
     def aload(self) -> List[Document]:  # type: ignore
         """Load text from the urls in web_path async into Documents."""

libs/community/tests/integration_tests/vectorstores/test_falkordb_vector.py

@@ -102,8 +102,8 @@ def test_falkordbvector() -> None:
         pre_delete_collection=True,
     )
     output = docsearch.similarity_search("foo", k=1)
-    assert type(output) is list
-    assert type(output[0]) is Document
+    assert isinstance(output, list)
+    assert isinstance(output[0], Document)
     assert output[0].page_content == "foo"
     drop_vector_indexes(docsearch)
@@ -121,8 +121,8 @@ def test_falkordbvector_embeddings() -> None:
         pre_delete_collection=True,
     )
     output = docsearch.similarity_search("foo", k=1)
-    assert type(output) is list
-    assert type(output[0]) is Document
+    assert isinstance(output, list)
+    assert isinstance(output[0], Document)
     assert output[0].page_content == "foo"
     drop_vector_indexes(docsearch)
@@ -168,8 +168,8 @@ def test_falkordbvector_with_metadatas() -> None:
         pre_delete_collection=True,
     )
     output = docsearch.similarity_search("foo", k=1)
-    assert type(output) is list
-    assert type(output[0]) is Document
+    assert isinstance(output, list)
+    assert isinstance(output[0], Document)
     assert output[0].metadata.get("page") == "0"
     drop_vector_indexes(docsearch)
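The switch from `type(...) is ...` to `isinstance(...)` is mildly behavioral as well as stylistic: `isinstance` accepts subclasses, while an exact-type check rejects them. A minimal illustration with a hypothetical subclass:

```python
from langchain_core.documents import Document


class TracedDocument(Document):
    """Hypothetical Document subclass, for illustration only."""


doc = TracedDocument(page_content="foo")
assert isinstance(doc, Document)  # passes: subclasses satisfy isinstance
assert type(doc) is not Document  # an exact-type check would have failed
```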

libs/community/tests/unit_tests/document_loaders/test_web_base.py

@@ -62,6 +62,52 @@ def test_lazy_load(mock_get: Any) -> None:
     assert results[0].page_content == "This is a div with a special class"
 
 
+@pytest.mark.requires("bs4")
+@patch("aiohttp.ClientSession.get")
+async def test_alazy_load(mock_get: Any) -> None:
+    async def mock_text() -> str:
+        return "<html><body><p>Test content</p></body></html>"
+
+    import bs4
+
+    mock_response = MagicMock()
+    mock_response.text = mock_text
+    mock_get.return_value.__aenter__.return_value = mock_response
+    loader = WebBaseLoader(web_paths=["https://www.example.com"])
+    results = []
+    async for result in loader.alazy_load():
+        results.append(result)
+    # mock_get.assert_called_with("https://www.example.com")
+    assert len(results) == 1
+    assert results[0].page_content == "Test content"
+
+    # Test bs4 kwargs
+    async def mock_text_bs4() -> str:
+        return dedent("""
+            <html>
+            <body>
+            <p>Test content</p>
+            <div class="special-class">This is a div with a special class</div>
+            </body>
+            </html>
+        """)
+
+    mock_response = MagicMock()
+    mock_response.text = mock_text_bs4
+    mock_get.return_value.__aenter__.return_value = mock_response
+    loader = WebBaseLoader(
+        web_paths=["https://www.example.com"],
+        bs_kwargs={"parse_only": bs4.SoupStrainer(class_="special-class")},
+    )
+    results = []
+    async for result in loader.alazy_load():
+        results.append(result)
+    assert len(results) == 1
+    assert results[0].page_content == "This is a div with a special class"
+
+
 @pytest.mark.requires("bs4")
 @patch("aiohttp.ClientSession.get")
 def test_aload(mock_get: Any) -> None:
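For context on the mocking in these tests: `aiohttp.ClientSession.get` is used as an async context manager, which is why the mocks assign the fake response through `__aenter__`. A standalone sketch of the same pattern (assumes Python 3.8+, where `MagicMock` auto-configures async magic methods):

```python
import asyncio
from unittest.mock import MagicMock


async def fetch(session: MagicMock, url: str) -> str:
    # Mirrors how WebBaseLoader consumes aiohttp: enter the response as an
    # async context manager, then await its text() coroutine.
    async with session.get(url) as response:
        return await response.text()


async def main() -> None:
    async def mock_text() -> str:
        return "stub body"

    mock_response = MagicMock()
    mock_response.text = mock_text
    session = MagicMock()
    session.get.return_value.__aenter__.return_value = mock_response
    assert await fetch(session, "https://www.example.com") == "stub body"


asyncio.run(main())
```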