mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 06:53:16 +00:00
[docs/community]: langchain docs + browserbaseloader fix (#30973)
Thank you for contributing to LangChain!

- [ ] **PR title**: "package: description"
  - Where "package" is whichever of langchain, community, core, etc. is being modified. Use "docs: ..." for purely docs changes, "infra: ..." for CI changes.
  - Example: "community: add foobar LLM"

community: fix browserbase integration
docs: update docs

- [ ] **PR message**: ***Delete this entire checklist*** and replace with
  - **Description:** Updated BrowserbaseLoader to use the new Python SDK.
  - **Issue:** Update the Browserbase integration with LangChain.
  - **Dependencies:** n/a
  - **Twitter handle:** @kylejeong21

- [ ] **Add tests and docs**: If you're adding a new integration, please include
  1. a test for the integration, preferably unit tests that do not rely on network access,
  2. an example notebook showing its use. It lives in `docs/docs/integrations` directory.

- [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified.

See contribution guidelines for more: https://python.langchain.com/docs/contributing/
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
from typing import Iterator, Optional, Sequence
|
||||
from typing import Any, Dict, Iterator, Optional, Sequence
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@@ -8,7 +8,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
||||
class BrowserbaseLoader(BaseLoader):
|
||||
"""Load pre-rendered web pages using a headless browser hosted on Browserbase.
|
||||
|
||||
Depends on `browserbase` package.
|
||||
Depends on `browserbase` and `playwright` packages.
|
||||
Get your API key from https://browserbase.com
|
||||
"""
|
||||
|
||||
@@ -24,6 +24,7 @@ class BrowserbaseLoader(BaseLoader):
|
||||
self.urls = urls
|
||||
self.text_content = text_content
|
||||
self.session_id = session_id
|
||||
self.project_id = project_id
|
||||
self.proxy = proxy
|
||||
|
||||
try:
|
||||
@@ -32,22 +33,57 @@ class BrowserbaseLoader(BaseLoader):
|
||||
raise ImportError(
|
||||
"You must run "
|
||||
"`pip install --upgrade "
|
||||
"browserbase` "
|
||||
"browserbase playwright` "
|
||||
"to use the Browserbase loader."
|
||||
)
|
||||
|
||||
self.browserbase = Browserbase(api_key, project_id)
|
||||
self.browserbase = Browserbase(api_key=api_key)
|
||||
|
||||
def lazy_load(self) -> Iterator[Document]:
    """Load each URL in a remote Browserbase browser and yield it as a Document.

    For every entry in ``self.urls`` a Playwright driver is started, a
    Browserbase session is obtained (the existing one when ``self.session_id``
    is set, otherwise a new one created for ``self.project_id``), and the
    page is fetched over a CDP connection to that session.

    Yields:
        One ``Document`` per URL, in the order of ``self.urls``. Its
        ``page_content`` is the inner text of ``<body>`` when
        ``self.text_content`` is truthy, otherwise the page's HTML; its
        metadata is ``{"url": url}``.

    Raises:
        ImportError: If ``playwright`` is not installed.
        ValueError: If no ``session_id`` was given and ``project_id`` is
            missing, so a session cannot be created.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as err:
        raise ImportError(
            "playwright is required for BrowserbaseLoader. "
            "Please run `pip install --upgrade playwright`."
        ) from err

    for url in self.urls:
        with sync_playwright() as playwright:
            # Create or use existing session
            if self.session_id:
                session = self.browserbase.sessions.retrieve(id=self.session_id)
            else:
                if not self.project_id:
                    raise ValueError("project_id is required to create a session")
                session_params: Dict[str, Any] = {"project_id": self.project_id}
                if self.proxy is not None:
                    # NOTE(review): the Browserbase SDK's sessions.create()
                    # names this option `proxies`; a `proxy` keyword is not
                    # accepted — confirm against the installed SDK version.
                    session_params["proxies"] = bool(self.proxy)
                session = self.browserbase.sessions.create(**session_params)

            # Connect to the remote session
            browser = playwright.chromium.connect_over_cdp(session.connect_url)
            try:
                context = browser.contexts[0]
                page = context.pages[0]

                # Navigate to URL and get content
                page.goto(url)
                if self.text_content:
                    content = str(page.inner_text("body"))
                else:
                    content = str(page.content())

                page.close()
            finally:
                # Always release the remote browser connection, even when
                # navigation or content extraction fails.
                browser.close()

        yield Document(
            page_content=content,
            metadata={
                "url": url,
            },
        )
|
||||
|
Reference in New Issue
Block a user