mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 05:43:55 +00:00
community: Implement lazy_load() for PlaywrightURLLoader (#18676)
Integration tests: `tests/integration_tests/document_loaders/test_url_playwright.py`
This commit is contained in:
parent
c092db862e
commit
db8db6faae
@ -2,7 +2,7 @@
|
||||
"""
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional
|
||||
from typing import TYPE_CHECKING, AsyncIterator, Dict, Iterator, List, Optional
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -160,16 +160,14 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
# Use the provided evaluator, if any, otherwise, use the default.
|
||||
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def lazy_load(self) -> Iterator[Document]:
|
||||
"""Load the specified URLs using Playwright and create Document instances.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of Document instances with loaded content.
|
||||
A list of Document instances with loaded content.
|
||||
"""
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
docs: List[Document] = list()
|
||||
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
||||
for url in self.urls:
|
||||
@ -181,7 +179,7 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
|
||||
text = self.evaluator.evaluate(page, browser, response)
|
||||
metadata = {"source": url}
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
except Exception as e:
|
||||
if self.continue_on_failure:
|
||||
logger.error(
|
||||
@ -190,19 +188,25 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
else:
|
||||
raise e
|
||||
browser.close()
|
||||
return docs
|
||||
|
||||
async def aload(self) -> List[Document]:
|
||||
"""Load the specified URLs with Playwright and create Documents asynchronously.
|
||||
Use this function when in a jupyter notebook environment.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of Document instances with loaded content.
|
||||
A list of Document instances with loaded content.
|
||||
"""
|
||||
return [doc async for doc in self.alazy_load()]
|
||||
|
||||
async def alazy_load(self) -> AsyncIterator[Document]:
|
||||
"""Load the specified URLs with Playwright and create Documents asynchronously.
|
||||
Use this function when in a jupyter notebook environment.
|
||||
|
||||
Returns:
|
||||
A list of Document instances with loaded content.
|
||||
"""
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
docs: List[Document] = list()
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
||||
for url in self.urls:
|
||||
@ -214,7 +218,7 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
|
||||
text = await self.evaluator.evaluate_async(page, browser, response)
|
||||
metadata = {"source": url}
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
except Exception as e:
|
||||
if self.continue_on_failure:
|
||||
logger.error(
|
||||
@ -223,4 +227,3 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
else:
|
||||
raise e
|
||||
await browser.close()
|
||||
return docs
|
||||
|
Loading…
Reference in New Issue
Block a user