diff --git a/libs/community/langchain_community/document_loaders/chromium.py b/libs/community/langchain_community/document_loaders/chromium.py index 8c71e895965..e15e78b12ce 100644 --- a/libs/community/langchain_community/document_loaders/chromium.py +++ b/libs/community/langchain_community/document_loaders/chromium.py @@ -1,6 +1,6 @@ import asyncio import logging -from typing import Iterator, List +from typing import AsyncIterator, Iterator, List from langchain_core.documents import Document @@ -13,14 +13,8 @@ class AsyncChromiumLoader(BaseLoader): """Scrape HTML pages from URLs using a headless instance of the Chromium.""" - def __init__( - self, - urls: List[str], - *, - headless: bool = True, - ): - """ - Initialize the loader with a list of URL paths. + def __init__(self, urls: List[str], *, headless: bool = True): + """Initialize the loader with a list of URL paths. Args: urls: A list of URLs to scrape content from. @@ -82,3 +76,22 @@ class AsyncChromiumLoader(BaseLoader): html_content = asyncio.run(self.ascrape_playwright(url)) metadata = {"source": url} yield Document(page_content=html_content, metadata=metadata) + + async def alazy_load(self) -> AsyncIterator[Document]: + """ + Asynchronously load text content from the provided URLs. + + This method leverages asyncio to initiate the scraping of all provided URLs + simultaneously. It improves performance by utilizing concurrent asynchronous + requests. Documents are yielded in URL order once every scrape has finished, + each encapsulating the scraped content. + + Yields: + Document: A Document object containing the scraped content, along with its + source URL as metadata. + """ + tasks = [self.ascrape_playwright(url) for url in self.urls] + results = await asyncio.gather(*tasks) + for url, content in zip(self.urls, results): + metadata = {"source": url} + yield Document(page_content=content, metadata=metadata)