From 45ddf4d26feefb3308e9f0191fdacc4df0cb9056 Mon Sep 17 00:00:00 2001 From: Noah <77106242+Noah0115@users.noreply.github.com> Date: Wed, 1 May 2024 13:20:57 +0800 Subject: [PATCH] community[patch]: Update comments for lazy_load method (#21063) - [ ] **PR message**: - **Description:** Added an asynchronous `alazy_load` method that initiates scraping of all provided URLs concurrently using asyncio.gather, improving data-fetching efficiency. Each Document object is yielded once its content becomes available, streamlining the entire process. The existing synchronous `lazy_load` method is unchanged. - **Issue:** N/A - **Dependencies:** Requires the asyncio library for handling asynchronous tasks, which should already be part of standard Python libraries in Python 3.7 and above. - **Email:** [r73327118@gmail.com](mailto:r73327118@gmail.com) --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur --- .../document_loaders/chromium.py | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/chromium.py b/libs/community/langchain_community/document_loaders/chromium.py index 8c71e895965..e15e78b12ce 100644 --- a/libs/community/langchain_community/document_loaders/chromium.py +++ b/libs/community/langchain_community/document_loaders/chromium.py @@ -1,6 +1,6 @@ import asyncio import logging -from typing import Iterator, List +from typing import AsyncIterator, Iterator, List from langchain_core.documents import Document @@ -13,14 +13,8 @@ class AsyncChromiumLoader(BaseLoader): """Scrape HTML pages from URLs using a headless instance of the Chromium.""" - def __init__( - self, - urls: List[str], - *, - headless: bool = True, - ): - """ - Initialize the loader with a list of URL paths. + def __init__(self, urls: List[str], *, headless: bool = True): + """Initialize the loader with a list of URL paths. 
Args: urls: A list of URLs to scrape content from. @@ -82,3 +76,22 @@ class AsyncChromiumLoader(BaseLoader): html_content = asyncio.run(self.ascrape_playwright(url)) metadata = {"source": url} yield Document(page_content=html_content, metadata=metadata) + + async def alazy_load(self) -> AsyncIterator[Document]: + """ + Asynchronously load text content from the provided URLs. + + This method leverages asyncio to initiate the scraping of all provided URLs + simultaneously. It improves performance by utilizing concurrent asynchronous + requests. Each Document is yielded as soon as its content is available, + encapsulating the scraped content. + + Yields: + Document: A Document object containing the scraped content, along with its + source URL as metadata. + """ + tasks = [self.ascrape_playwright(url) for url in self.urls] + results = await asyncio.gather(*tasks) + for url, content in zip(self.urls, results): + metadata = {"source": url} + yield Document(page_content=content, metadata=metadata)