Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-21 14:18:52 +00:00
community: Corrected aload func to be asynchronous in WebBaseLoader (#28337)
- **Description:** The `aload` function, contrary to its name, is not an
asynchronous function, so it cannot run concurrently with other
asynchronous functions (a usage sketch follows this list).
- **Issue:** #28336
- **Test:** Done
- **Docs:**
[here](e0a95e5646/docs/docs/integrations/document_loaders/web_base.ipynb (L201))
- **Lint:** All checks passed
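For illustration, a minimal sketch of the behavior this change enables (the script and URLs below are hypothetical and not part of the PR): with a genuinely asynchronous `alazy_load`, several loads can be awaited concurrently on one event loop instead of each one blocking it.

```python
import asyncio
from typing import List

from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document


async def load_url(url: str) -> List[Document]:
    # Collect documents through the native-async iterator added in this PR.
    loader = WebBaseLoader(web_paths=[url])
    return [doc async for doc in loader.alazy_load()]


async def main() -> None:
    # Both fetches are scheduled concurrently; the old non-async aload()
    # could not be awaited alongside other coroutines like this.
    espn_docs, example_docs = await asyncio.gather(
        load_url("https://www.espn.com/"),
        load_url("https://www.example.com"),
    )
    print(len(espn_docs), len(example_docs))


asyncio.run(main())
```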
If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
---------
Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in: parent a08c76a6b2, commit f38fc89f35
File diff suppressed because one or more lines are too long
@@ -3,10 +3,11 @@
 import asyncio
 import logging
 import warnings
-from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Sequence, Union

 import aiohttp
 import requests
+from langchain_core._api import deprecated
 from langchain_core.documents import Document

 from langchain_community.document_loaders.base import BaseLoader
@@ -78,12 +79,7 @@ class WebBaseLoader(BaseLoader):
         .. code-block:: python

             docs = []
-            docs_lazy = loader.lazy_load()
-
-            # async variant:
-            # docs_lazy = await loader.alazy_load()
-
-            for doc in docs_lazy:
+            for doc in loader.lazy_load():
                 docs.append(doc)
             print(docs[0].page_content[:100])
             print(docs[0].metadata)
@@ -98,7 +94,9 @@ class WebBaseLoader(BaseLoader):
         Async load:
         .. code-block:: python

-            docs = await loader.aload()
+            docs = []
+            async for doc in loader.alazy_load():
+                docs.append(doc)
             print(docs[0].page_content[:100])
             print(docs[0].metadata)

@@ -108,6 +106,37 @@ class WebBaseLoader(BaseLoader):

         {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}

+    .. versionchanged:: 0.3.14
+
+        Deprecated ``aload`` (which was not async) and implemented a native async
+        ``alazy_load``. Expand below for more details.
+
+        .. dropdown:: How to update ``aload``
+
+            Instead of using ``aload``, you can use ``load`` for synchronous loading or
+            ``alazy_load`` for asynchronous lazy loading.
+
+            Example using ``load`` (synchronous):
+
+            .. code-block:: python
+
+                docs: List[Document] = loader.load()
+
+            Example using ``alazy_load`` (asynchronous):
+
+            .. code-block:: python
+
+                docs: List[Document] = []
+                async for doc in loader.alazy_load():
+                    docs.append(doc)
+
+            This is in preparation for accommodating an asynchronous ``aload`` in the
+            future:
+
+            .. code-block:: python
+
+                docs: List[Document] = await loader.aload()
+
     """  # noqa: E501

     def __init__(
@@ -279,11 +308,12 @@ class WebBaseLoader(BaseLoader):
                 "`parser` must be one of " + ", ".join(valid_parsers) + "."
             )

-    def scrape_all(self, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
-        """Fetch all urls, then return soups for all results."""
+    def _unpack_fetch_results(
+        self, results: Any, urls: List[str], parser: Union[str, None] = None
+    ) -> List[Any]:
+        """Unpack fetch results into BeautifulSoup objects."""
         from bs4 import BeautifulSoup

-        results = asyncio.run(self.fetch_all(urls))
         final_results = []
         for i, result in enumerate(results):
             url = urls[i]
@@ -294,9 +324,20 @@ class WebBaseLoader(BaseLoader):
                 parser = self.default_parser
             self._check_parser(parser)
             final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))

         return final_results

+    def scrape_all(self, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
+        """Fetch all urls, then return soups for all results."""
+        results = asyncio.run(self.fetch_all(urls))
+        return self._unpack_fetch_results(results, urls, parser=parser)
+
+    async def ascrape_all(
+        self, urls: List[str], parser: Union[str, None] = None
+    ) -> List[Any]:
+        """Async fetch all urls, then return soups for all results."""
+        results = await self.fetch_all(urls)
+        return self._unpack_fetch_results(results, urls, parser=parser)
+
     def _scrape(
         self,
         url: str,
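As a reading aid (not part of the diff), a small sketch of how the refactored methods relate: both paths fetch first and hand the raw responses to the shared `_unpack_fetch_results` helper, so the only difference is whether `fetch_all` is driven by `asyncio.run` or awaited on the caller's event loop. The URL below is a placeholder.

```python
import asyncio

from langchain_community.document_loaders import WebBaseLoader

urls = ["https://www.example.com"]  # placeholder URL
loader = WebBaseLoader(web_paths=urls)

# Sync path: scrape_all() drives fetch_all() via asyncio.run(), then parses.
soups_sync = loader.scrape_all(urls)


async def main() -> None:
    # Async path: ascrape_all() awaits fetch_all() directly, then parses
    # with the same _unpack_fetch_results helper.
    soups_async = await loader.ascrape_all(urls)
    print(len(soups_sync), len(soups_async))


asyncio.run(main())
```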
@@ -339,6 +380,22 @@ class WebBaseLoader(BaseLoader):
             metadata = _build_metadata(soup, path)
             yield Document(page_content=text, metadata=metadata)

+    async def alazy_load(self) -> AsyncIterator[Document]:
+        """Async lazy load text from the url(s) in web_path."""
+        results = await self.ascrape_all(self.web_paths)
+        for path, soup in zip(self.web_paths, results):
+            text = soup.get_text(**self.bs_get_text_kwargs)
+            metadata = _build_metadata(soup, path)
+            yield Document(page_content=text, metadata=metadata)
+
+    @deprecated(
+        since="0.3.14",
+        removal="1.0",
+        message=(
+            "See API reference for updated usage: "
+            "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"  # noqa: E501
+        ),
+    )
     def aload(self) -> List[Document]:  # type: ignore
         """Load text from the urls in web_path async into Documents."""

@@ -102,8 +102,8 @@ def test_falkordbvector() -> None:
         pre_delete_collection=True,
     )
     output = docsearch.similarity_search("foo", k=1)
-    assert type(output) is list
-    assert type(output[0]) is Document
+    assert isinstance(output, list)
+    assert isinstance(output[0], Document)
     assert output[0].page_content == "foo"

     drop_vector_indexes(docsearch)
@@ -121,8 +121,8 @@ def test_falkordbvector_embeddings() -> None:
         pre_delete_collection=True,
     )
     output = docsearch.similarity_search("foo", k=1)
-    assert type(output) is list
-    assert type(output[0]) is Document
+    assert isinstance(output, list)
+    assert isinstance(output[0], Document)
     assert output[0].page_content == "foo"

     drop_vector_indexes(docsearch)
@@ -168,8 +168,8 @@ def test_falkordbvector_with_metadatas() -> None:
         pre_delete_collection=True,
    )
     output = docsearch.similarity_search("foo", k=1)
-    assert type(output) is list
-    assert type(output[0]) is Document
+    assert isinstance(output, list)
+    assert isinstance(output[0], Document)
     assert output[0].metadata.get("page") == "0"

     drop_vector_indexes(docsearch)
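Aside on the test change above (general Python behavior, not specific to this diff): `isinstance` accepts subclasses, while `type(x) is T` demands the exact class, so the rewritten assertions stay valid even if a vector store returns a `Document` subclass. A tiny illustration with a hypothetical subclass:

```python
from langchain_core.documents import Document


class TaggedDocument(Document):  # hypothetical subclass, for illustration only
    pass


doc = TaggedDocument(page_content="foo")
assert isinstance(doc, Document)   # passes for the subclass
assert type(doc) is not Document   # the exact-type check would have failed
```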
@@ -62,6 +62,52 @@ def test_lazy_load(mock_get: Any) -> None:
     assert results[0].page_content == "This is a div with a special class"


+@pytest.mark.requires("bs4")
+@patch("aiohttp.ClientSession.get")
+async def test_alazy_load(mock_get: Any) -> None:
+    async def mock_text() -> str:
+        return "<html><body><p>Test content</p></body></html>"
+
+    import bs4
+
+    mock_response = MagicMock()
+    mock_response.text = mock_text
+    mock_get.return_value.__aenter__.return_value = mock_response
+
+    loader = WebBaseLoader(web_paths=["https://www.example.com"])
+    results = []
+    async for result in loader.alazy_load():
+        results.append(result)
+    # mock_get.assert_called_with("https://www.example.com")
+    assert len(results) == 1
+    assert results[0].page_content == "Test content"
+
+    # Test bs4 kwargs
+    async def mock_text_bs4() -> str:
+        return dedent("""
+            <html>
+            <body>
+            <p>Test content</p>
+            <div class="special-class">This is a div with a special class</div>
+            </body>
+            </html>
+        """)
+
+    mock_response = MagicMock()
+    mock_response.text = mock_text_bs4
+    mock_get.return_value.__aenter__.return_value = mock_response
+
+    loader = WebBaseLoader(
+        web_paths=["https://www.example.com"],
+        bs_kwargs={"parse_only": bs4.SoupStrainer(class_="special-class")},
+    )
+    results = []
+    async for result in loader.alazy_load():
+        results.append(result)
+    assert len(results) == 1
+    assert results[0].page_content == "This is a div with a special class"
+
+
 @pytest.mark.requires("bs4")
 @patch("aiohttp.ClientSession.get")
 def test_aload(mock_get: Any) -> None:
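The mock setup in the test above can look opaque if you have not stubbed an async context manager before. Here is a minimal standalone sketch of the same pattern (the `fetch` helper and URL are illustrative, not LangChain code): patching `aiohttp.ClientSession.get` and assigning `__aenter__.return_value` yields a fake response whose `text()` coroutine can be awaited.

```python
import asyncio
from unittest.mock import MagicMock, patch

import aiohttp


async def fetch(url: str) -> str:
    # Same shape of aiohttp usage that WebBaseLoader's fetch path relies on.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


async def main() -> None:
    async def fake_text() -> str:
        return "mocked body"

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = MagicMock()
        mock_response.text = fake_text
        # session.get(...) is used as "async with", so __aenter__ must
        # resolve to the fake response object.
        mock_get.return_value.__aenter__.return_value = mock_response
        assert await fetch("https://www.example.com") == "mocked body"


asyncio.run(main())
```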