[docs/community]: langchain docs + browserbaseloader fix (#30973)

Thank you for contributing to LangChain!

- [ ] **PR title**: "package: description"
  - Where "package" is whichever of langchain, community, core, etc. is being modified. Use "docs: ..." for purely docs changes, "infra: ..." for CI changes.
  - Example: "community: add foobar LLM"

community: fix browserbase integration
docs: update docs

- [ ] **PR message**: ***Delete this entire checklist*** and replace with
  - **Description:** Updated BrowserbaseLoader to use the new python sdk.
  - **Issue:** update browserbase integration with langchain
  - **Dependencies:** n/a
  - **Twitter handle:** @kylejeong21

- [ ] **Add tests and docs**: If you're adding a new integration, please include
  1. a test for the integration, preferably unit tests that do not rely on network access,
  2. an example notebook showing its use. It lives in `docs/docs/integrations` directory.

- [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/
2026-04-25 01:16:55 +00:00 · 2025-04-24 10:38:49 -07:00
parent 403fae8eec
commit d0f0d1f966
3 changed files with 67 additions and 61 deletions
--- a/docs/docs/integrations/document_loaders/browserbase.ipynb
+++ b/docs/docs/integrations/document_loaders/browserbase.ipynb
@@ -49,7 +49,14 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from langchain_community.document_loaders import BrowserbaseLoader"
+    "import os\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "from langchain_community.document_loaders import BrowserbaseLoader\n",
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "BROWSERBASE_API_KEY = os.getenv(\"BROWSERBASE_API_KEY\")\n",
+    "BROWSERBASE_PROJECT_ID = os.getenv(\"BROWSERBASE_PROJECT_ID\")"
   ]
  },
  {
@@ -59,6 +66,8 @@
   "outputs": [],
   "source": [
    "loader = BrowserbaseLoader(\n",
+    "    api_key=BROWSERBASE_API_KEY,\n",
+    "    project_id=BROWSERBASE_PROJECT_ID,\n",
    "    urls=[\n",
    "        \"https://example.com\",\n",
    "    ],\n",
@@ -78,52 +87,11 @@
    "\n",
    "- `urls` Required. A list of URLs to fetch.\n",
    "- `text_content` Retrieve only text content. Default is `False`.\n",
-    "- `api_key` Optional. Browserbase API key. Default is `BROWSERBASE_API_KEY` env variable.\n",
-    "- `project_id` Optional. Browserbase Project ID. Default is `BROWSERBASE_PROJECT_ID` env variable.\n",
+    "- `api_key` Browserbase API key. Default is `BROWSERBASE_API_KEY` env variable.\n",
+    "- `project_id` Browserbase Project ID. Required unless an existing `session_id` is provided; pass it explicitly (e.g. read from the `BROWSERBASE_PROJECT_ID` env variable).\n",
    "- `session_id` Optional. Provide an existing Session ID.\n",
    "- `proxy` Optional. Enable/Disable Proxies."
   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Loading images\n",
-    "\n",
-    "You can also load screenshots of webpages (as bytes) for multi-modal models.\n",
-    "\n",
-    "Full example using GPT-4V:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from browserbase import Browserbase\n",
-    "from browserbase.helpers.gpt4 import GPT4VImage, GPT4VImageDetail\n",
-    "from langchain_core.messages import HumanMessage\n",
-    "from langchain_openai import ChatOpenAI\n",
-    "\n",
-    "chat = ChatOpenAI(model=\"gpt-4-vision-preview\", max_tokens=256)\n",
-    "browser = Browserbase()\n",
-    "\n",
-    "screenshot = browser.screenshot(\"https://browserbase.com\")\n",
-    "\n",
-    "result = chat.invoke(\n",
-    "    [\n",
-    "        HumanMessage(\n",
-    "            content=[\n",
-    "                {\"type\": \"text\", \"text\": \"What color is the logo?\"},\n",
-    "                GPT4VImage(screenshot, GPT4VImageDetail.auto),\n",
-    "            ]\n",
-    "        )\n",
-    "    ]\n",
-    ")\n",
-    "\n",
-    "print(result.content)"
-   ]
  }
 ],
 "metadata": {
--- a/libs/community/langchain_community/document_loaders/browserbase.py
+++ b/libs/community/langchain_community/document_loaders/browserbase.py
@@ -1,4 +1,4 @@
-from typing import Iterator, Optional, Sequence
+from typing import Any, Dict, Iterator, Optional, Sequence

 from langchain_core.documents import Document

@@ -8,7 +8,7 @@ from langchain_community.document_loaders.base import BaseLoader
 class BrowserbaseLoader(BaseLoader):
    """Load pre-rendered web pages using a headless browser hosted on Browserbase.

-    Depends on `browserbase` package.
+    Depends on `browserbase` and `playwright` packages.
    Get your API key from https://browserbase.com
    """

@@ -24,6 +24,7 @@ class BrowserbaseLoader(BaseLoader):
        self.urls = urls
        self.text_content = text_content
        self.session_id = session_id
+        self.project_id = project_id
        self.proxy = proxy

        try:
@@ -32,22 +33,57 @@ class BrowserbaseLoader(BaseLoader):
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
-                "browserbase` "
+                "browserbase playwright` "
                "to use the Browserbase loader."
            )

-        self.browserbase = Browserbase(api_key, project_id)
+        self.browserbase = Browserbase(api_key=api_key)

    def lazy_load(self) -> Iterator[Document]:
        """Load pages from URLs"""
-        pages = self.browserbase.load_urls(
-            self.urls, self.text_content, self.session_id, self.proxy
-        )
-
-        for i, page in enumerate(pages):
-            yield Document(
-                page_content=page,
-                metadata={
-                    "url": self.urls[i],
-                },
+        try:
+            from playwright.sync_api import sync_playwright
+        except ImportError:
+            raise ImportError(
+                "playwright is required for BrowserbaseLoader. "
+                "Please run `pip install --upgrade playwright`."
            )
+
+        for url in self.urls:
+            with sync_playwright() as playwright:
+                # Create or use existing session
+                if self.session_id:
+                    session = self.browserbase.sessions.retrieve(id=self.session_id)
+                else:
+                    if not self.project_id:
+                        raise ValueError("project_id is required to create a session")
+                    session_params: Dict[str, Any] = {"project_id": self.project_id}
+                    if self.proxy is not None:
+                        session_params["proxy"] = bool(self.proxy)
+                    session = self.browserbase.sessions.create(**session_params)
+
+                # Connect to the remote session
+                browser = playwright.chromium.connect_over_cdp(session.connect_url)
+                context = browser.contexts[0]
+                page = context.pages[0]
+
+                # Navigate to URL and get content
+                page.goto(url)
+                # Get content based on the text_content flag
+                if self.text_content:
+                    page_text = page.inner_text("body")
+                    content = str(page_text)
+                else:
+                    page_html = page.content()
+                    content = str(page_html)
+
+                # Close browser
+                page.close()
+                browser.close()
+
+                yield Document(
+                    page_content=content,
+                    metadata={
+                        "url": url,
+                    },
+                )
--- a/uv.lock
+++ b/uv.lock
@@ -1,4 +1,5 @@
 version = 1
+revision = 1
 requires-python = ">=3.9, <4.0"
 resolution-markers = [
    "python_full_version >= '3.13' and platform_python_implementation == 'PyPy'",
@@ -2178,7 +2179,7 @@ wheels = [

 [[package]]
 name = "langchain"
-version = "0.3.23"
+version = "0.3.24"
 source = { editable = "libs/langchain" }
 dependencies = [
    { name = "async-timeout", marker = "python_full_version < '3.11'" },
@@ -2219,6 +2220,7 @@ requires-dist = [
    { name = "requests", specifier = ">=2,<3" },
    { name = "sqlalchemy", specifier = ">=1.4,<3" },
 ]
+provides-extras = ["community", "anthropic", "openai", "azure-ai", "cohere", "google-vertexai", "google-genai", "fireworks", "ollama", "together", "mistralai", "huggingface", "groq", "aws", "deepseek", "xai", "perplexity"]

 [package.metadata.requires-dev]
 codespell = [{ name = "codespell", specifier = ">=2.2.0,<3.0.0" }]
@@ -2393,7 +2395,7 @@ typing = [

 [[package]]
 name = "langchain-community"
-version = "0.3.21"
+version = "0.3.22"
 source = { editable = "libs/community" }
 dependencies = [
    { name = "aiohttp" },
@@ -2484,7 +2486,7 @@ typing = [

 [[package]]
 name = "langchain-core"
-version = "0.3.54"
+version = "0.3.55"
 source = { editable = "libs/core" }
 dependencies = [
    { name = "jsonpatch" },