Compare commits

5 Commits

Author                SHA1        Message                                                                  Date
William FH            29f6769c21  Merge branch 'master' into wfh/async_chromium                            2024-07-16 14:50:11 -07:00
Eugene Yurtsev        7e7247537b  Update libs/community/langchain_community/document_loaders/chromium.py  2024-04-03 11:10:45 -04:00
                                  Co-authored-by: Christophe Bornet <cbornet@hotmail.com>
William Fu-Hinthorn   94cc313886  fixup                                                                    2024-04-02 18:40:11 -07:00
William Fu-Hinthorn   8b34a49b40  beh                                                                      2024-04-02 18:36:45 -07:00
William Fu-Hinthorn   608a8705c3  Fix async chromium alazy_load                                            2024-04-02 18:35:32 -07:00

libs/community/langchain_community/document_loaders/chromium.py

@@ -1,4 +1,3 @@
-import asyncio
 import logging
 from typing import AsyncIterator, Iterator, List, Optional
@@ -70,6 +69,33 @@ class AsyncChromiumLoader(BaseLoader):
             await browser.close()
         return results

+    def scrape_playwright(self, url: str) -> str:
+        """
+        Synchronously scrape the content of a given URL using Playwright's sync API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
+        """
+        from playwright.sync_api import sync_playwright
+
+        logger.info("Starting scraping...")
+        results = ""
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            try:
+                page = browser.new_page()
+                page.goto(url)
+                results = page.content()
+                logger.info("Content scraped")
+            except Exception as e:
+                results = f"Error: {e}"
+            browser.close()
+        return results
+
     def lazy_load(self) -> Iterator[Document]:
         """
         Lazily load text content from the provided URLs.
@@ -82,7 +108,13 @@ class AsyncChromiumLoader(BaseLoader):
         """
         for url in self.urls:
-            html_content = asyncio.run(self.ascrape_playwright(url))
+            html_content = self.scrape_playwright(url)
             metadata = {"source": url}
             yield Document(page_content=html_content, metadata=metadata)
+
+    async def alazy_load(self) -> AsyncIterator[Document]:
+        for url in self.urls:
+            html_content = await self.ascrape_playwright(url)
+            metadata = {"source": url}
+            yield Document(page_content=html_content, metadata=metadata)
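For context, a minimal usage sketch of AsyncChromiumLoader after this change, assuming Playwright and its Chromium build are installed (pip install playwright, then playwright install chromium). The URL and the printed fields are illustrative and not part of the diff.

import asyncio

from langchain_community.document_loaders import AsyncChromiumLoader

# Illustrative URL; any reachable page works.
loader = AsyncChromiumLoader(["https://example.com"])

# Sync path: per this diff, lazy_load() scrapes with Playwright's sync API
# via the new scrape_playwright helper.
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))

# Async path: alazy_load() awaits ascrape_playwright for each URL.
async def main() -> None:
    async for doc in loader.alazy_load():
        print(doc.metadata["source"], len(doc.page_content))

asyncio.run(main())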