From db8db6faae10e308d3227a085d19a69f0e22efb4 Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Wed, 6 Mar 2024 22:52:13 +0100 Subject: [PATCH] community: Implement lazy_load() for PlaywrightURLLoader (#18676) Integration tests: `tests/integration_tests/document_loaders/test_url_playwright.py` --- .../document_loaders/url_playwright.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/url_playwright.py b/libs/community/langchain_community/document_loaders/url_playwright.py index 106f15cee6f..016626086cc 100644 --- a/libs/community/langchain_community/document_loaders/url_playwright.py +++ b/libs/community/langchain_community/document_loaders/url_playwright.py @@ -2,7 +2,7 @@ """ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING, AsyncIterator, Dict, Iterator, List, Optional from langchain_core.documents import Document @@ -160,16 +160,14 @@ class PlaywrightURLLoader(BaseLoader): # Use the provided evaluator, if any, otherwise, use the default. self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors) - def load(self) -> List[Document]: + def lazy_load(self) -> Iterator[Document]: """Load the specified URLs using Playwright and create Document instances. Returns: - List[Document]: A list of Document instances with loaded content. + An iterator over Document instances with loaded content.
""" from playwright.sync_api import sync_playwright - docs: List[Document] = list() - with sync_playwright() as p: browser = p.chromium.launch(headless=self.headless, proxy=self.proxy) for url in self.urls: @@ -181,7 +179,7 @@ class PlaywrightURLLoader(BaseLoader): text = self.evaluator.evaluate(page, browser, response) metadata = {"source": url} - docs.append(Document(page_content=text, metadata=metadata)) + yield Document(page_content=text, metadata=metadata) except Exception as e: if self.continue_on_failure: logger.error( @@ -190,19 +188,25 @@ class PlaywrightURLLoader(BaseLoader): else: raise e browser.close() - return docs async def aload(self) -> List[Document]: """Load the specified URLs with Playwright and create Documents asynchronously. Use this function when in a jupyter notebook environment. Returns: - List[Document]: A list of Document instances with loaded content. + A list of Document instances with loaded content. + """ + return [doc async for doc in self.alazy_load()] + + async def alazy_load(self) -> AsyncIterator[Document]: + """Load the specified URLs with Playwright and create Documents asynchronously. + Use this function when in a jupyter notebook environment. + + Returns: + An async iterator over Document instances with loaded content. """ from playwright.async_api import async_playwright - docs: List[Document] = list() - async with async_playwright() as p: browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy) for url in self.urls: @@ -214,7 +218,7 @@ class PlaywrightURLLoader(BaseLoader): text = await self.evaluator.evaluate_async(page, browser, response) metadata = {"source": url} - docs.append(Document(page_content=text, metadata=metadata)) + yield Document(page_content=text, metadata=metadata) except Exception as e: if self.continue_on_failure: logger.error( @@ -223,4 +227,3 @@ class PlaywrightURLLoader(BaseLoader): else: raise e await browser.close() - return docs