mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-01 02:50:47 +00:00
[docs/community]: langchain docs + browserbaseloader fix (#30973)
Thank you for contributing to LangChain!

- [ ] **PR title**: "package: description"
  - Where "package" is whichever of langchain, community, core, etc. is being modified. Use "docs: ..." for purely docs changes, "infra: ..." for CI changes.
  - Example: "community: add foobar LLM"

  community: fix browserbase integration
  docs: update docs

- [ ] **PR message**: ***Delete this entire checklist*** and replace with
  - **Description:** Updated BrowserbaseLoader to use the new Python SDK.
  - **Issue:** update browserbase integration with langchain
  - **Dependencies:** n/a
  - **Twitter handle:** @kylejeong21

- [ ] **Add tests and docs**: If you're adding a new integration, please include
  1. a test for the integration, preferably unit tests that do not rely on network access,
  2. an example notebook showing its use. It lives in `docs/docs/integrations` directory.

- [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/
This commit is contained in:
@@ -49,7 +49,14 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain_community.document_loaders import BrowserbaseLoader"
|
"import os\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain_community.document_loaders import BrowserbaseLoader\n",
|
||||||
|
"\n",
|
||||||
|
"load_dotenv()\n",
|
||||||
|
"\n",
|
||||||
|
"BROWSERBASE_API_KEY = os.getenv(\"BROWSERBASE_API_KEY\")\n",
|
||||||
|
"BROWSERBASE_PROJECT_ID = os.getenv(\"BROWSERBASE_PROJECT_ID\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -59,6 +66,8 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"loader = BrowserbaseLoader(\n",
|
"loader = BrowserbaseLoader(\n",
|
||||||
|
" api_key=BROWSERBASE_API_KEY,\n",
|
||||||
|
" project_id=BROWSERBASE_PROJECT_ID,\n",
|
||||||
" urls=[\n",
|
" urls=[\n",
|
||||||
" \"https://example.com\",\n",
|
" \"https://example.com\",\n",
|
||||||
" ],\n",
|
" ],\n",
|
||||||
@@ -78,52 +87,11 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"- `urls` Required. A list of URLs to fetch.\n",
|
"- `urls` Required. A list of URLs to fetch.\n",
|
||||||
"- `text_content` Retrieve only text content. Default is `False`.\n",
|
"- `text_content` Retrieve only text content. Default is `False`.\n",
|
||||||
"- `api_key` Optional. Browserbase API key. Default is `BROWSERBASE_API_KEY` env variable.\n",
|
"- `api_key` Browserbase API key. Default is `BROWSERBASE_API_KEY` env variable.\n",
|
||||||
"- `project_id` Optional. Browserbase Project ID. Default is `BROWSERBASE_PROJECT_ID` env variable.\n",
|
"- `project_id` Browserbase Project ID. Default is `BROWSERBASE_PROJECT_ID` env variable.\n",
|
||||||
"- `session_id` Optional. Provide an existing Session ID.\n",
|
"- `session_id` Optional. Provide an existing Session ID.\n",
|
||||||
"- `proxy` Optional. Enable/Disable Proxies."
|
"- `proxy` Optional. Enable/Disable Proxies."
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Loading images\n",
|
|
||||||
"\n",
|
|
||||||
"You can also load screenshots of webpages (as bytes) for multi-modal models.\n",
|
|
||||||
"\n",
|
|
||||||
"Full example using GPT-4V:"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from browserbase import Browserbase\n",
|
|
||||||
"from browserbase.helpers.gpt4 import GPT4VImage, GPT4VImageDetail\n",
|
|
||||||
"from langchain_core.messages import HumanMessage\n",
|
|
||||||
"from langchain_openai import ChatOpenAI\n",
|
|
||||||
"\n",
|
|
||||||
"chat = ChatOpenAI(model=\"gpt-4-vision-preview\", max_tokens=256)\n",
|
|
||||||
"browser = Browserbase()\n",
|
|
||||||
"\n",
|
|
||||||
"screenshot = browser.screenshot(\"https://browserbase.com\")\n",
|
|
||||||
"\n",
|
|
||||||
"result = chat.invoke(\n",
|
|
||||||
" [\n",
|
|
||||||
" HumanMessage(\n",
|
|
||||||
" content=[\n",
|
|
||||||
" {\"type\": \"text\", \"text\": \"What color is the logo?\"},\n",
|
|
||||||
" GPT4VImage(screenshot, GPT4VImageDetail.auto),\n",
|
|
||||||
" ]\n",
|
|
||||||
" )\n",
|
|
||||||
" ]\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"print(result.content)"
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
from typing import Iterator, Optional, Sequence
|
from typing import Any, Dict, Iterator, Optional, Sequence
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@@ -8,7 +8,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
|||||||
class BrowserbaseLoader(BaseLoader):
|
class BrowserbaseLoader(BaseLoader):
|
||||||
"""Load pre-rendered web pages using a headless browser hosted on Browserbase.
|
"""Load pre-rendered web pages using a headless browser hosted on Browserbase.
|
||||||
|
|
||||||
Depends on `browserbase` package.
|
Depends on `browserbase` and `playwright` packages.
|
||||||
Get your API key from https://browserbase.com
|
Get your API key from https://browserbase.com
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -24,6 +24,7 @@ class BrowserbaseLoader(BaseLoader):
|
|||||||
self.urls = urls
|
self.urls = urls
|
||||||
self.text_content = text_content
|
self.text_content = text_content
|
||||||
self.session_id = session_id
|
self.session_id = session_id
|
||||||
|
self.project_id = project_id
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -32,22 +33,57 @@ class BrowserbaseLoader(BaseLoader):
|
|||||||
raise ImportError(
|
raise ImportError(
|
||||||
"You must run "
|
"You must run "
|
||||||
"`pip install --upgrade "
|
"`pip install --upgrade "
|
||||||
"browserbase` "
|
"browserbase playwright` "
|
||||||
"to use the Browserbase loader."
|
"to use the Browserbase loader."
|
||||||
)
|
)
|
||||||
|
|
||||||
self.browserbase = Browserbase(api_key, project_id)
|
self.browserbase = Browserbase(api_key=api_key)
|
||||||
|
|
||||||
def lazy_load(self) -> Iterator[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Load pages from URLs"""
|
"""Load pages from URLs"""
|
||||||
pages = self.browserbase.load_urls(
|
try:
|
||||||
self.urls, self.text_content, self.session_id, self.proxy
|
from playwright.sync_api import sync_playwright
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"playwright is required for BrowserbaseLoader. "
|
||||||
|
"Please run `pip install --upgrade playwright`."
|
||||||
)
|
)
|
||||||
|
|
||||||
for i, page in enumerate(pages):
|
for url in self.urls:
|
||||||
|
with sync_playwright() as playwright:
|
||||||
|
# Create or use existing session
|
||||||
|
if self.session_id:
|
||||||
|
session = self.browserbase.sessions.retrieve(id=self.session_id)
|
||||||
|
else:
|
||||||
|
if not self.project_id:
|
||||||
|
raise ValueError("project_id is required to create a session")
|
||||||
|
session_params: Dict[str, Any] = {"project_id": self.project_id}
|
||||||
|
if self.proxy is not None:
|
||||||
|
session_params["proxy"] = bool(self.proxy)
|
||||||
|
session = self.browserbase.sessions.create(**session_params)
|
||||||
|
|
||||||
|
# Connect to the remote session
|
||||||
|
browser = playwright.chromium.connect_over_cdp(session.connect_url)
|
||||||
|
context = browser.contexts[0]
|
||||||
|
page = context.pages[0]
|
||||||
|
|
||||||
|
# Navigate to URL and get content
|
||||||
|
page.goto(url)
|
||||||
|
# Get content based on the text_content flag
|
||||||
|
if self.text_content:
|
||||||
|
page_text = page.inner_text("body")
|
||||||
|
content = str(page_text)
|
||||||
|
else:
|
||||||
|
page_html = page.content()
|
||||||
|
content = str(page_html)
|
||||||
|
|
||||||
|
# Close browser
|
||||||
|
page.close()
|
||||||
|
browser.close()
|
||||||
|
|
||||||
yield Document(
|
yield Document(
|
||||||
page_content=page,
|
page_content=content,
|
||||||
metadata={
|
metadata={
|
||||||
"url": self.urls[i],
|
"url": url,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
8
uv.lock
generated
8
uv.lock
generated
@@ -1,4 +1,5 @@
|
|||||||
version = 1
|
version = 1
|
||||||
|
revision = 1
|
||||||
requires-python = ">=3.9, <4.0"
|
requires-python = ">=3.9, <4.0"
|
||||||
resolution-markers = [
|
resolution-markers = [
|
||||||
"python_full_version >= '3.13' and platform_python_implementation == 'PyPy'",
|
"python_full_version >= '3.13' and platform_python_implementation == 'PyPy'",
|
||||||
@@ -2178,7 +2179,7 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langchain"
|
name = "langchain"
|
||||||
version = "0.3.23"
|
version = "0.3.24"
|
||||||
source = { editable = "libs/langchain" }
|
source = { editable = "libs/langchain" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "async-timeout", marker = "python_full_version < '3.11'" },
|
{ name = "async-timeout", marker = "python_full_version < '3.11'" },
|
||||||
@@ -2219,6 +2220,7 @@ requires-dist = [
|
|||||||
{ name = "requests", specifier = ">=2,<3" },
|
{ name = "requests", specifier = ">=2,<3" },
|
||||||
{ name = "sqlalchemy", specifier = ">=1.4,<3" },
|
{ name = "sqlalchemy", specifier = ">=1.4,<3" },
|
||||||
]
|
]
|
||||||
|
provides-extras = ["community", "anthropic", "openai", "azure-ai", "cohere", "google-vertexai", "google-genai", "fireworks", "ollama", "together", "mistralai", "huggingface", "groq", "aws", "deepseek", "xai", "perplexity"]
|
||||||
|
|
||||||
[package.metadata.requires-dev]
|
[package.metadata.requires-dev]
|
||||||
codespell = [{ name = "codespell", specifier = ">=2.2.0,<3.0.0" }]
|
codespell = [{ name = "codespell", specifier = ">=2.2.0,<3.0.0" }]
|
||||||
@@ -2393,7 +2395,7 @@ typing = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langchain-community"
|
name = "langchain-community"
|
||||||
version = "0.3.21"
|
version = "0.3.22"
|
||||||
source = { editable = "libs/community" }
|
source = { editable = "libs/community" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "aiohttp" },
|
{ name = "aiohttp" },
|
||||||
@@ -2484,7 +2486,7 @@ typing = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langchain-core"
|
name = "langchain-core"
|
||||||
version = "0.3.54"
|
version = "0.3.55"
|
||||||
source = { editable = "libs/core" }
|
source = { editable = "libs/core" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "jsonpatch" },
|
{ name = "jsonpatch" },
|
||||||
|
Reference in New Issue
Block a user