Compare commits

5 Commits

Author                SHA1        Message                                                                  Date
William FH            29f6769c21  Merge branch 'master' into wfh/async_chromium                            2024-07-16 14:50:11 -07:00
Eugene Yurtsev        7e7247537b  Update libs/community/langchain_community/document_loaders/chromium.py  2024-04-03 11:10:45 -04:00
                                  Co-authored-by: Christophe Bornet <cbornet@hotmail.com>
William Fu-Hinthorn   94cc313886  fixup                                                                    2024-04-02 18:40:11 -07:00
William Fu-Hinthorn   8b34a49b40  beh                                                                      2024-04-02 18:36:45 -07:00
William Fu-Hinthorn   608a8705c3  Fix async chromium alazy_load                                            2024-04-02 18:35:32 -07:00

libs/community/langchain_community/document_loaders/chromium.py

@@ -1,4 +1,3 @@
-import asyncio
 import logging
 from typing import AsyncIterator, Iterator, List, Optional
@@ -70,6 +69,33 @@ class AsyncChromiumLoader(BaseLoader):
             await browser.close()
         return results

+    def scrape_playwright(self, url: str) -> str:
+        """
+        Synchronously scrape the content of a given URL using Playwright's sync API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
+        """
+        from playwright.sync_api import sync_playwright
+
+        logger.info("Starting scraping...")
+        results = ""
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            try:
+                page = browser.new_page()
+                page.goto(url)
+                results = page.content()
+                logger.info("Content scraped")
+            except Exception as e:
+                results = f"Error: {e}"
+            browser.close()
+        return results
+
     def lazy_load(self) -> Iterator[Document]:
         """
         Lazily load text content from the provided URLs.
@@ -82,7 +108,13 @@ class AsyncChromiumLoader(BaseLoader):
         """
         for url in self.urls:
-            html_content = asyncio.run(self.ascrape_playwright(url))
+            html_content = self.scrape_playwright(url)
             metadata = {"source": url}
             yield Document(page_content=html_content, metadata=metadata)
+
+    async def alazy_load(self) -> AsyncIterator[Document]:
+        for url in self.urls:
+            html_content = await self.ascrape_playwright(url)
+            metadata = {"source": url}
+            yield Document(page_content=html_content, metadata=metadata)
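For context, a minimal usage sketch of AsyncChromiumLoader after this change, assuming Playwright and its Chromium build are installed (pip install playwright, then playwright install chromium). The URL and the printed fields are illustrative and not part of the diff.

import asyncio

from langchain_community.document_loaders import AsyncChromiumLoader

# Illustrative URL; any reachable page works.
loader = AsyncChromiumLoader(["https://example.com"])

# Sync path: per this diff, lazy_load() scrapes with Playwright's sync API
# via the new scrape_playwright helper.
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))

# Async path: alazy_load() awaits ascrape_playwright for each URL.
async def main() -> None:
    async for doc in loader.alazy_load():
        print(doc.metadata["source"], len(doc.page_content))

asyncio.run(main())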