Introduces SeleniumURLLoader for JavaScript-Dependent Web Page Data Retrieval (#2291)

### Summary

This PR introduces a `SeleniumURLLoader` which, similar to `UnstructuredURLLoader`, loads data from URLs. However, it utilizes `selenium` to fetch page content, enabling it to work with JavaScript-rendered pages. The `unstructured` library is also employed for loading the HTML content.

### Testing

```bash
pip install selenium
pip install unstructured
```

```python
from langchain.document_loaders import SeleniumURLLoader

urls = [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://goo.gl/maps/NDSHwePEyaHMFGwh8"
]

loader = SeleniumURLLoader(urls=urls)
data = loader.load()
```
2025-09-06 05:25:04 +00:00 · 2023-04-02 17:05:00 -04:00
parent 00d3ec5ed8
commit e4cfaa5680
3 changed files with 180 additions and 0 deletions
--- a/docs/modules/indexes/document_loaders/examples/url.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/url.ipynb
@@ -52,6 +52,66 @@
   "source": [
    "data = loader.load()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f3afa135",
   "metadata": {},
   "source": [
    "# Selenium URL Loader\n",
    "\n",
    "This covers how to load HTML documents from a list of URLs using the `SeleniumURLLoader`.\n",
    "\n",
    "Using selenium allows us to load pages that require JavaScript to render.\n",
    "\n",
    "## Setup\n",
    "\n",
    "To use the `SeleniumURLLoader`, you will need to install `selenium` and `unstructured`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fc50835",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import SeleniumURLLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24e896ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "urls = [\n",
    "    \"https://www.youtube.com/watch?v=dQw4w9WgXcQ\",\n",
    "    \"https://goo.gl/maps/NDSHwePEyaHMFGwh8\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60a29397",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = SeleniumURLLoader(urls=urls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0090cd57",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loader.load()"
   ]
  }
 ],
 "metadata": {
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -58,6 +58,7 @@ from langchain.document_loaders.unstructured import (
    UnstructuredFileLoader,
 )
 from langchain.document_loaders.url import UnstructuredURLLoader
 from langchain.document_loaders.url_selenium import SeleniumURLLoader
 from langchain.document_loaders.web_base import WebBaseLoader
 from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
 from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
@@ -74,6 +75,7 @@ __all__ = [
    "UnstructuredFileLoader",
    "UnstructuredFileIOLoader",
    "UnstructuredURLLoader",
    "SeleniumURLLoader",
    "DirectoryLoader",
    "NotionDirectoryLoader",
    "NotionDBLoader",
--- a/langchain/document_loaders/url_selenium.py
+++ b/langchain/document_loaders/url_selenium.py
@@ -0,0 +1,118 @@
 """Loader that uses Selenium to load a page, then uses unstructured to load the html.
 """
 import logging
 from typing import TYPE_CHECKING, List, Literal, Optional, Union
 if TYPE_CHECKING:
    from selenium.webdriver import Chrome, Firefox
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 # Name the logger after the module's dotted import path (not its filesystem
 # path) so it takes part in the package's logger hierarchy and can be
 # configured through the parent package logger.
 logger = logging.getLogger(__name__)
 class SeleniumURLLoader(BaseLoader):
    """Loader that uses Selenium to load a page and unstructured to parse the HTML.

    This is useful for loading pages that require JavaScript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, log the error and continue with the
            remaining URLs when one fails; if False, re-raise the error.
        browser (str): The browser to use, either 'chrome' or 'firefox'.
        executable_path (Optional[str]): Value forwarded to the WebDriver
            constructor as ``executable_path``; omitted from the call when None.
        headless (bool): If True, the browser will run in headless mode.
    """

    def __init__(
        self,
        urls: List[str],
        continue_on_failure: bool = True,
        browser: Literal["chrome", "firefox"] = "chrome",
        executable_path: Optional[str] = None,
        headless: bool = True,
    ):
        """Load a list of URLs using Selenium and unstructured.

        Raises:
            ValueError: If the `selenium` or `unstructured` package is not
                installed.
        """
        # Both packages are optional dependencies: probe for them up front so a
        # missing install fails at construction time rather than inside load().
        try:
            import selenium  # noqa:F401
        except ImportError as exc:
            raise ValueError(
                "selenium package not found, please install it with "
                "`pip install selenium`"
            ) from exc

        try:
            import unstructured  # noqa:F401
        except ImportError as exc:
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from exc

        self.urls = urls
        self.continue_on_failure = continue_on_failure
        self.browser = browser
        self.executable_path = executable_path
        self.headless = headless

    def _get_driver(self) -> Union["Chrome", "Firefox"]:
        """Create and return a WebDriver instance based on the specified browser.

        Raises:
            ValueError: If an invalid browser is specified.

        Returns:
            Union[Chrome, Firefox]: A WebDriver instance for the specified browser.
        """
        # NOTE(review): newer Selenium 4 releases deprecate (and later remove) the
        # `executable_path` keyword in favour of a Service object -- confirm
        # against the Selenium version this project supports.
        browser = self.browser.lower()

        if browser == "chrome":
            from selenium.webdriver import Chrome
            from selenium.webdriver.chrome.options import Options as ChromeOptions

            chrome_options = ChromeOptions()
            if self.headless:
                chrome_options.add_argument("--headless")
            if self.executable_path is None:
                return Chrome(options=chrome_options)
            return Chrome(executable_path=self.executable_path, options=chrome_options)

        if browser == "firefox":
            from selenium.webdriver import Firefox
            from selenium.webdriver.firefox.options import Options as FirefoxOptions

            firefox_options = FirefoxOptions()
            if self.headless:
                firefox_options.add_argument("--headless")
            if self.executable_path is None:
                return Firefox(options=firefox_options)
            return Firefox(
                executable_path=self.executable_path, options=firefox_options
            )

        raise ValueError("Invalid browser specified. Use 'chrome' or 'firefox'.")

    def load(self) -> List[Document]:
        """Load the specified URLs using Selenium and create Document instances.

        Each URL is opened in the browser, the rendered page source is
        partitioned with unstructured, and the resulting elements are joined
        with blank lines into one Document whose metadata records the source URL.

        Raises:
            Exception: Whatever fetching or parsing raised, when
                ``continue_on_failure`` is False.

        Returns:
            List[Document]: A list of Document instances with loaded content.
        """
        from unstructured.partition.html import partition_html

        docs: List[Document] = []
        driver = self._get_driver()
        # The driver owns a browser process; always shut it down, including
        # when an error propagates because continue_on_failure is False.
        try:
            for url in self.urls:
                try:
                    driver.get(url)
                    page_content = driver.page_source
                    elements = partition_html(text=page_content)
                    text = "\n\n".join(str(el) for el in elements)
                    metadata = {"source": url}
                    docs.append(Document(page_content=text, metadata=metadata))
                except Exception as e:
                    if not self.continue_on_failure:
                        # Bare raise keeps the original traceback intact.
                        raise
                    logger.error(
                        "Error fetching or processing %s, exception: %s", url, e
                    )
        finally:
            driver.quit()

        return docs