From d0f0d1f96694f329939602297a1a2b593cf11485 Mon Sep 17 00:00:00 2001
From: Kyle Jeong <77771518+Kylejeong2@users.noreply.github.com>
Date: Thu, 24 Apr 2025 10:38:49 -0700
Subject: [PATCH] [docs/community]: langchain docs + browserbaseloader fix (#30973)

Thank you for contributing to LangChain!

- [ ] **PR title**: "package: description"
  - Where "package" is whichever of langchain, community, core, etc. is being modified. Use "docs: ..." for purely docs changes, "infra: ..." for CI changes.
  - Example: "community: add foobar LLM"

community: fix browserbase integration
docs: update docs

- [ ] **PR message**: ***Delete this entire checklist*** and replace with
    - **Description:** Updated BrowserbaseLoader to use the new python sdk.
    - **Issue:** update browserbase integration with langchain
    - **Dependencies:** n/a
    - **Twitter handle:** @kylejeong21

- [ ] **Add tests and docs**: If you're adding a new integration, please include
  1. a test for the integration, preferably unit tests that do not rely on network access,
  2. an example notebook showing its use. It lives in `docs/docs/integrations` directory.

- [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified.
See contribution guidelines for more: https://python.langchain.com/docs/contributing/ --- .../document_loaders/browserbase.ipynb | 56 ++++------------ .../document_loaders/browserbase.py | 64 +++++++++++++++---- uv.lock | 8 ++- 3 files changed, 67 insertions(+), 61 deletions(-) diff --git a/docs/docs/integrations/document_loaders/browserbase.ipynb b/docs/docs/integrations/document_loaders/browserbase.ipynb index 149fef73861..d41d4912f37 100644 --- a/docs/docs/integrations/document_loaders/browserbase.ipynb +++ b/docs/docs/integrations/document_loaders/browserbase.ipynb @@ -49,7 +49,14 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_community.document_loaders import BrowserbaseLoader" + "import os\n", + "\n", + "from langchain_community.document_loaders import BrowserbaseLoader\n", + "\n", + "load_dotenv()\n", + "\n", + "BROWSERBASE_API_KEY = os.getenv(\"BROWSERBASE_API_KEY\")\n", + "BROWSERBASE_PROJECT_ID = os.getenv(\"BROWSERBASE_PROJECT_ID\")" ] }, { @@ -59,6 +66,8 @@ "outputs": [], "source": [ "loader = BrowserbaseLoader(\n", + " api_key=BROWSERBASE_API_KEY,\n", + " project_id=BROWSERBASE_PROJECT_ID,\n", " urls=[\n", " \"https://example.com\",\n", " ],\n", @@ -78,52 +87,11 @@ "\n", "- `urls` Required. A list of URLs to fetch.\n", "- `text_content` Retrieve only text content. Default is `False`.\n", - "- `api_key` Optional. Browserbase API key. Default is `BROWSERBASE_API_KEY` env variable.\n", - "- `project_id` Optional. Browserbase Project ID. Default is `BROWSERBASE_PROJECT_ID` env variable.\n", + "- `api_key` Browserbase API key. Default is `BROWSERBASE_API_KEY` env variable.\n", + "- `project_id` Browserbase Project ID. Default is `BROWSERBASE_PROJECT_ID` env variable.\n", "- `session_id` Optional. Provide an existing Session ID.\n", "- `proxy` Optional. Enable/Disable Proxies." 
] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading images\n", - "\n", - "You can also load screenshots of webpages (as bytes) for multi-modal models.\n", - "\n", - "Full example using GPT-4V:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from browserbase import Browserbase\n", - "from browserbase.helpers.gpt4 import GPT4VImage, GPT4VImageDetail\n", - "from langchain_core.messages import HumanMessage\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "chat = ChatOpenAI(model=\"gpt-4-vision-preview\", max_tokens=256)\n", - "browser = Browserbase()\n", - "\n", - "screenshot = browser.screenshot(\"https://browserbase.com\")\n", - "\n", - "result = chat.invoke(\n", - " [\n", - " HumanMessage(\n", - " content=[\n", - " {\"type\": \"text\", \"text\": \"What color is the logo?\"},\n", - " GPT4VImage(screenshot, GPT4VImageDetail.auto),\n", - " ]\n", - " )\n", - " ]\n", - ")\n", - "\n", - "print(result.content)" - ] } ], "metadata": { diff --git a/libs/community/langchain_community/document_loaders/browserbase.py b/libs/community/langchain_community/document_loaders/browserbase.py index 0027fa1eeb6..2bda0c17a66 100644 --- a/libs/community/langchain_community/document_loaders/browserbase.py +++ b/libs/community/langchain_community/document_loaders/browserbase.py @@ -1,4 +1,4 @@ -from typing import Iterator, Optional, Sequence +from typing import Any, Dict, Iterator, Optional, Sequence from langchain_core.documents import Document @@ -8,7 +8,7 @@ from langchain_community.document_loaders.base import BaseLoader class BrowserbaseLoader(BaseLoader): """Load pre-rendered web pages using a headless browser hosted on Browserbase. - Depends on `browserbase` package. + Depends on `browserbase` and `playwright` packages. 
Get your API key from https://browserbase.com """ @@ -24,6 +24,7 @@ class BrowserbaseLoader(BaseLoader): self.urls = urls self.text_content = text_content self.session_id = session_id + self.project_id = project_id self.proxy = proxy try: @@ -32,22 +33,57 @@ class BrowserbaseLoader(BaseLoader): raise ImportError( "You must run " "`pip install --upgrade " - "browserbase` " + "browserbase playwright` " "to use the Browserbase loader." ) - self.browserbase = Browserbase(api_key, project_id) + self.browserbase = Browserbase(api_key=api_key) def lazy_load(self) -> Iterator[Document]: """Load pages from URLs""" - pages = self.browserbase.load_urls( - self.urls, self.text_content, self.session_id, self.proxy - ) - - for i, page in enumerate(pages): - yield Document( - page_content=page, - metadata={ - "url": self.urls[i], - }, + try: + from playwright.sync_api import sync_playwright + except ImportError: + raise ImportError( + "playwright is required for BrowserbaseLoader. " + "Please run `pip install --upgrade playwright`." 
) + + for url in self.urls: + with sync_playwright() as playwright: + # Create or use existing session + if self.session_id: + session = self.browserbase.sessions.retrieve(id=self.session_id) + else: + if not self.project_id: + raise ValueError("project_id is required to create a session") + session_params: Dict[str, Any] = {"project_id": self.project_id} + if self.proxy is not None: + session_params["proxy"] = bool(self.proxy) + session = self.browserbase.sessions.create(**session_params) + + # Connect to the remote session + browser = playwright.chromium.connect_over_cdp(session.connect_url) + context = browser.contexts[0] + page = context.pages[0] + + # Navigate to URL and get content + page.goto(url) + # Get content based on the text_content flag + if self.text_content: + page_text = page.inner_text("body") + content = str(page_text) + else: + page_html = page.content() + content = str(page_html) + + # Close browser + page.close() + browser.close() + + yield Document( + page_content=content, + metadata={ + "url": url, + }, + ) diff --git a/uv.lock b/uv.lock index 5787e45572f..fa83f255c25 100644 --- a/uv.lock +++ b/uv.lock @@ -1,4 +1,5 @@ version = 1 +revision = 1 requires-python = ">=3.9, <4.0" resolution-markers = [ "python_full_version >= '3.13' and platform_python_implementation == 'PyPy'", @@ -2178,7 +2179,7 @@ wheels = [ [[package]] name = "langchain" -version = "0.3.23" +version = "0.3.24" source = { editable = "libs/langchain" } dependencies = [ { name = "async-timeout", marker = "python_full_version < '3.11'" }, @@ -2219,6 +2220,7 @@ requires-dist = [ { name = "requests", specifier = ">=2,<3" }, { name = "sqlalchemy", specifier = ">=1.4,<3" }, ] +provides-extras = ["community", "anthropic", "openai", "azure-ai", "cohere", "google-vertexai", "google-genai", "fireworks", "ollama", "together", "mistralai", "huggingface", "groq", "aws", "deepseek", "xai", "perplexity"] [package.metadata.requires-dev] codespell = [{ name = "codespell", specifier = 
">=2.2.0,<3.0.0" }] @@ -2393,7 +2395,7 @@ typing = [ [[package]] name = "langchain-community" -version = "0.3.21" +version = "0.3.22" source = { editable = "libs/community" } dependencies = [ { name = "aiohttp" }, @@ -2484,7 +2486,7 @@ typing = [ [[package]] name = "langchain-core" -version = "0.3.54" +version = "0.3.55" source = { editable = "libs/core" } dependencies = [ { name = "jsonpatch" },