mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-16 17:53:37 +00:00
Added matching async load func to PlaywrightURLLoader (#5938)
Fixes # (issue) The existing PlaywrightURLLoader load() function uses a synchronous browser, which is not compatible with Jupyter. This PR adds a sister function, aload(), which can be run inside a notebook. --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
ae7714f1ba
commit
c087ce74f7
@ -86,3 +86,43 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
raise e
|
||||
browser.close()
|
||||
return docs
|
||||
|
||||
async def aload(self) -> List[Document]:
    """Load the specified URLs with Playwright and create Documents asynchronously.

    Use this function when in a Jupyter notebook environment, where the
    synchronous browser used by ``load()`` is not compatible.

    For every URL in ``self.urls`` a new page is opened in a Chromium
    browser, visible elements matching ``self.remove_selectors`` are removed
    from the DOM, and the resulting HTML is partitioned with
    ``unstructured``. The partitioned elements are joined with blank lines
    into the content of one ``Document`` whose metadata is
    ``{"source": url}``.

    Returns:
        List[Document]: A list of Document instances with loaded content.

    Raises:
        Exception: Any error raised while fetching or processing a URL is
            re-raised unless ``self.continue_on_failure`` is truthy, in
            which case it is logged and the URL is skipped.
    """
    from playwright.async_api import async_playwright
    from unstructured.partition.html import partition_html

    docs: List[Document] = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=self.headless)
        try:
            for url in self.urls:
                # Tracked outside the inner try so the page can be released
                # even when navigation or parsing fails part-way through.
                page = None
                try:
                    page = await browser.new_page()
                    await page.goto(url)

                    for selector in self.remove_selectors or []:
                        for element in await page.locator(selector).all():
                            if await element.is_visible():
                                await element.evaluate(
                                    "element => element.remove()"
                                )

                    page_source = await page.content()
                    elements = partition_html(text=page_source)
                    text = "\n\n".join(str(el) for el in elements)
                    metadata = {"source": url}
                    docs.append(Document(page_content=text, metadata=metadata))
                except Exception as e:
                    if self.continue_on_failure:
                        logger.error(
                            f"Error fetching or processing {url}, exception: {e}"
                        )
                    else:
                        # Bare raise keeps the original traceback intact.
                        raise
                finally:
                    # Close each page as soon as it is done so pages do not
                    # pile up in the browser for long URL lists.
                    if page is not None:
                        try:
                            await page.close()
                        except Exception as close_error:
                            # Cleanup is best-effort: never let it mask the
                            # real fetch/processing outcome.
                            logger.warning(
                                "Error closing page for %s: %s", url, close_error
                            )
        finally:
            # Shut the browser down on both the success and the error path.
            await browser.close()
    return docs
|
||||
|
@ -1,4 +1,5 @@
|
||||
"""Tests for the Playwright URL loader"""
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders import PlaywrightURLLoader
|
||||
|
||||
@ -19,3 +20,22 @@ def test_playwright_url_loader() -> None:
|
||||
)
|
||||
docs = loader.load()
|
||||
assert len(docs) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_playwright_async_url_loader() -> None:
    """Check that the async Playwright loader returns at least one document."""
    # Live pages fetched by the loader; any failure propagates because
    # continue_on_failure is disabled below.
    target_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
        "https://techmeme.com",
        "https://techcrunch.com",
    ]
    selectors_to_strip = ["header", "footer"]

    url_loader = PlaywrightURLLoader(
        urls=target_urls,
        remove_selectors=selectors_to_strip,
        continue_on_failure=False,
        headless=True,
    )

    loaded_docs = await url_loader.aload()
    assert len(loaded_docs) > 0
|
||||
|
Loading…
Reference in New Issue
Block a user