Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-21 14:18:52 +00:00
community: Corrected aload func to be asynchronous in WebBaseLoader (#28337)
- **Description:** The `aload` function, contrary to its name, is not an
asynchronous function, so it cannot run concurrently with other
asynchronous functions (a usage sketch follows this list).
- **Issue:** #28336
- **Test:** Done
- **Docs:**
[here](e0a95e5646/docs/docs/integrations/document_loaders/web_base.ipynb (L201))
- **Lint:** All checks passed
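For illustration, a minimal sketch of the behavior this change enables (the script and URLs below are hypothetical and not part of the PR): with a genuinely asynchronous `alazy_load`, several loads can be awaited concurrently on one event loop instead of each one blocking it.

```python
import asyncio
from typing import List

from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document


async def load_url(url: str) -> List[Document]:
    # Collect documents through the native-async iterator added in this PR.
    loader = WebBaseLoader(web_paths=[url])
    return [doc async for doc in loader.alazy_load()]


async def main() -> None:
    # Both fetches are scheduled concurrently; the old non-async aload()
    # could not be awaited alongside other coroutines like this.
    espn_docs, example_docs = await asyncio.gather(
        load_url("https://www.espn.com/"),
        load_url("https://www.example.com"),
    )
    print(len(espn_docs), len(example_docs))


asyncio.run(main())
```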
If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
---------
Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in: parent a08c76a6b2, commit f38fc89f35
File diff suppressed because one or more lines are too long
@@ -3,10 +3,11 @@
 import asyncio
 import logging
 import warnings
-from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Sequence, Union

 import aiohttp
 import requests
+from langchain_core._api import deprecated
 from langchain_core.documents import Document

 from langchain_community.document_loaders.base import BaseLoader
@@ -78,12 +79,7 @@ class WebBaseLoader(BaseLoader):
         .. code-block:: python

             docs = []
-            docs_lazy = loader.lazy_load()
-
-            # async variant:
-            # docs_lazy = await loader.alazy_load()
-
-            for doc in docs_lazy:
+            for doc in loader.lazy_load():
                 docs.append(doc)
             print(docs[0].page_content[:100])
             print(docs[0].metadata)
@@ -98,7 +94,9 @@ class WebBaseLoader(BaseLoader):
         Async load:
         .. code-block:: python

-            docs = await loader.aload()
+            docs = []
+            async for doc in loader.alazy_load():
+                docs.append(doc)
             print(docs[0].page_content[:100])
             print(docs[0].metadata)

@@ -108,6 +106,37 @@ class WebBaseLoader(BaseLoader):

         {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}

+    .. versionchanged:: 0.3.14
+
+        Deprecated ``aload`` (which was not async) and implemented a native async
+        ``alazy_load``. Expand below for more details.
+
+        .. dropdown:: How to update ``aload``
+
+            Instead of using ``aload``, you can use ``load`` for synchronous loading or
+            ``alazy_load`` for asynchronous lazy loading.
+
+            Example using ``load`` (synchronous):
+
+            .. code-block:: python
+
+                docs: List[Document] = loader.load()
+
+            Example using ``alazy_load`` (asynchronous):
+
+            .. code-block:: python
+
+                docs: List[Document] = []
+                async for doc in loader.alazy_load():
+                    docs.append(doc)
+
+            This is in preparation for accommodating an asynchronous ``aload`` in the
+            future:
+
+            .. code-block:: python
+
+                docs: List[Document] = await loader.aload()
+
     """  # noqa: E501

     def __init__(
@@ -279,11 +308,12 @@ class WebBaseLoader(BaseLoader):
                 "`parser` must be one of " + ", ".join(valid_parsers) + "."
             )

-    def scrape_all(self, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
-        """Fetch all urls, then return soups for all results."""
+    def _unpack_fetch_results(
+        self, results: Any, urls: List[str], parser: Union[str, None] = None
+    ) -> List[Any]:
+        """Unpack fetch results into BeautifulSoup objects."""
         from bs4 import BeautifulSoup

-        results = asyncio.run(self.fetch_all(urls))
         final_results = []
         for i, result in enumerate(results):
             url = urls[i]
@@ -294,9 +324,20 @@ class WebBaseLoader(BaseLoader):
                 parser = self.default_parser
             self._check_parser(parser)
             final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))

         return final_results

+    def scrape_all(self, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
+        """Fetch all urls, then return soups for all results."""
+        results = asyncio.run(self.fetch_all(urls))
+        return self._unpack_fetch_results(results, urls, parser=parser)
+
+    async def ascrape_all(
+        self, urls: List[str], parser: Union[str, None] = None
+    ) -> List[Any]:
+        """Async fetch all urls, then return soups for all results."""
+        results = await self.fetch_all(urls)
+        return self._unpack_fetch_results(results, urls, parser=parser)
+
     def _scrape(
         self,
         url: str,
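As a reading aid (not part of the diff), a small sketch of how the refactored methods relate: both paths fetch first and hand the raw responses to the shared `_unpack_fetch_results` helper, so the only difference is whether `fetch_all` is driven by `asyncio.run` or awaited on the caller's event loop. The URL below is a placeholder.

```python
import asyncio

from langchain_community.document_loaders import WebBaseLoader

urls = ["https://www.example.com"]  # placeholder URL
loader = WebBaseLoader(web_paths=urls)

# Sync path: scrape_all() drives fetch_all() via asyncio.run(), then parses.
soups_sync = loader.scrape_all(urls)


async def main() -> None:
    # Async path: ascrape_all() awaits fetch_all() directly, then parses
    # with the same _unpack_fetch_results helper.
    soups_async = await loader.ascrape_all(urls)
    print(len(soups_sync), len(soups_async))


asyncio.run(main())
```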
@@ -339,6 +380,22 @@ class WebBaseLoader(BaseLoader):
             metadata = _build_metadata(soup, path)
             yield Document(page_content=text, metadata=metadata)

+    async def alazy_load(self) -> AsyncIterator[Document]:
+        """Async lazy load text from the url(s) in web_path."""
+        results = await self.ascrape_all(self.web_paths)
+        for path, soup in zip(self.web_paths, results):
+            text = soup.get_text(**self.bs_get_text_kwargs)
+            metadata = _build_metadata(soup, path)
+            yield Document(page_content=text, metadata=metadata)
+
+    @deprecated(
+        since="0.3.14",
+        removal="1.0",
+        message=(
+            "See API reference for updated usage: "
+            "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"  # noqa: E501
+        ),
+    )
     def aload(self) -> List[Document]:  # type: ignore
         """Load text from the urls in web_path async into Documents."""

@@ -102,8 +102,8 @@ def test_falkordbvector() -> None:
         pre_delete_collection=True,
     )
     output = docsearch.similarity_search("foo", k=1)
-    assert type(output) is list
-    assert type(output[0]) is Document
+    assert isinstance(output, list)
+    assert isinstance(output[0], Document)
     assert output[0].page_content == "foo"

     drop_vector_indexes(docsearch)
@@ -121,8 +121,8 @@ def test_falkordbvector_embeddings() -> None:
         pre_delete_collection=True,
     )
     output = docsearch.similarity_search("foo", k=1)
-    assert type(output) is list
-    assert type(output[0]) is Document
+    assert isinstance(output, list)
+    assert isinstance(output[0], Document)
     assert output[0].page_content == "foo"

     drop_vector_indexes(docsearch)
@@ -168,8 +168,8 @@ def test_falkordbvector_with_metadatas() -> None:
         pre_delete_collection=True,
    )
     output = docsearch.similarity_search("foo", k=1)
-    assert type(output) is list
-    assert type(output[0]) is Document
+    assert isinstance(output, list)
+    assert isinstance(output[0], Document)
     assert output[0].metadata.get("page") == "0"

     drop_vector_indexes(docsearch)
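Aside on the test change above (general Python behavior, not specific to this diff): `isinstance` accepts subclasses, while `type(x) is T` demands the exact class, so the rewritten assertions stay valid even if a vector store returns a `Document` subclass. A tiny illustration with a hypothetical subclass:

```python
from langchain_core.documents import Document


class TaggedDocument(Document):  # hypothetical subclass, for illustration only
    pass


doc = TaggedDocument(page_content="foo")
assert isinstance(doc, Document)   # passes for the subclass
assert type(doc) is not Document   # the exact-type check would have failed
```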
@@ -62,6 +62,52 @@ def test_lazy_load(mock_get: Any) -> None:
     assert results[0].page_content == "This is a div with a special class"


+@pytest.mark.requires("bs4")
+@patch("aiohttp.ClientSession.get")
+async def test_alazy_load(mock_get: Any) -> None:
+    async def mock_text() -> str:
+        return "<html><body><p>Test content</p></body></html>"
+
+    import bs4
+
+    mock_response = MagicMock()
+    mock_response.text = mock_text
+    mock_get.return_value.__aenter__.return_value = mock_response
+
+    loader = WebBaseLoader(web_paths=["https://www.example.com"])
+    results = []
+    async for result in loader.alazy_load():
+        results.append(result)
+    # mock_get.assert_called_with("https://www.example.com")
+    assert len(results) == 1
+    assert results[0].page_content == "Test content"
+
+    # Test bs4 kwargs
+    async def mock_text_bs4() -> str:
+        return dedent("""
+            <html>
+            <body>
+            <p>Test content</p>
+            <div class="special-class">This is a div with a special class</div>
+            </body>
+            </html>
+        """)
+
+    mock_response = MagicMock()
+    mock_response.text = mock_text_bs4
+    mock_get.return_value.__aenter__.return_value = mock_response
+
+    loader = WebBaseLoader(
+        web_paths=["https://www.example.com"],
+        bs_kwargs={"parse_only": bs4.SoupStrainer(class_="special-class")},
+    )
+    results = []
+    async for result in loader.alazy_load():
+        results.append(result)
+    assert len(results) == 1
+    assert results[0].page_content == "This is a div with a special class"
+
+
 @pytest.mark.requires("bs4")
 @patch("aiohttp.ClientSession.get")
 def test_aload(mock_get: Any) -> None:
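The mock setup in the test above can look opaque if you have not stubbed an async context manager before. Here is a minimal standalone sketch of the same pattern (the `fetch` helper and URL are illustrative, not LangChain code): patching `aiohttp.ClientSession.get` and assigning `__aenter__.return_value` yields a fake response whose `text()` coroutine can be awaited.

```python
import asyncio
from unittest.mock import MagicMock, patch

import aiohttp


async def fetch(url: str) -> str:
    # Same shape of aiohttp usage that WebBaseLoader's fetch path relies on.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


async def main() -> None:
    async def fake_text() -> str:
        return "mocked body"

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = MagicMock()
        mock_response.text = fake_text
        # session.get(...) is used as "async with", so __aenter__ must
        # resolve to the fake response object.
        mock_get.return_value.__aenter__.return_value = mock_response
        assert await fetch("https://www.example.com") == "mocked body"


asyncio.run(main())
```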