mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 13:23:35 +00:00
community[minor]: added Browserbase loader (#20478)
This commit is contained in:
parent
9e694963a4
commit
6ccecf2363
122
docs/docs/integrations/document_loaders/browserbase.ipynb
Normal file
122
docs/docs/integrations/document_loaders/browserbase.ipynb
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Browserbase\n",
|
||||||
|
"\n",
|
||||||
|
"[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers. It offers advanced debugging, session recordings, stealth mode, integrated proxies, and captcha solving.\n",
|
||||||
|
"\n",
|
||||||
|
"## Installation\n",
|
||||||
|
"\n",
|
||||||
|
"- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).\n",
|
||||||
|
"- Install the [Browserbase SDK](https://github.com/browserbase/python-sdk):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install browserbase"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Loading documents"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"You can load webpages into LangChain using `BrowserbaseLoader`. Optionally, you can set the `text_content` parameter to convert the pages to a text-only representation."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain_community.document_loaders import BrowserbaseLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = BrowserbaseLoader(\n",
|
||||||
|
" urls=[\n",
|
||||||
|
" \"https://example.com\",\n",
|
||||||
|
" ],\n",
|
||||||
|
" # Text mode\n",
|
||||||
|
" text_content=False,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"docs = loader.load()\n",
|
||||||
|
"print(docs[0].page_content[:61])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Loading images\n",
|
||||||
|
"\n",
|
||||||
|
"You can also load screenshots of webpages (as bytes) for multi-modal models.\n",
|
||||||
|
"\n",
|
||||||
|
"Full example using GPT-4V:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from browserbase import Browserbase\n",
|
||||||
|
"from browserbase.helpers.gpt4 import GPT4VImage, GPT4VImageDetail\n",
|
||||||
|
"from langchain_core.messages import HumanMessage\n",
|
||||||
|
"from langchain_openai import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"chat = ChatOpenAI(model=\"gpt-4-vision-preview\", max_tokens=256)\n",
|
||||||
|
"browser = Browserbase()\n",
|
||||||
|
"\n",
|
||||||
|
"screenshot = browser.screenshot(\"https://browserbase.com\")\n",
|
||||||
|
"\n",
|
||||||
|
"result = chat.invoke(\n",
|
||||||
|
" [\n",
|
||||||
|
" HumanMessage(\n",
|
||||||
|
" content=[\n",
|
||||||
|
" {\"type\": \"text\", \"text\": \"What color is the logo?\"},\n",
|
||||||
|
" GPT4VImage(screenshot, GPT4VImageDetail.auto),\n",
|
||||||
|
" ]\n",
|
||||||
|
" )\n",
|
||||||
|
" ]\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(result.content)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python",
|
||||||
|
"version": "3.9.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
28
docs/docs/integrations/providers/browserbase.mdx
Normal file
28
docs/docs/integrations/providers/browserbase.mdx
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
# Browserbase
|
||||||
|
|
||||||
|
>[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers. It offers advanced debugging, session recordings, stealth mode, integrated proxies, and captcha solving.
|
||||||
|
|
||||||
|
## Installation and Setup
|
||||||
|
|
||||||
|
- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).
|
||||||
|
- Install the [Browserbase SDK](https://github.com/browserbase/python-sdk):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install browserbase
|
||||||
|
```
|
||||||
|
|
||||||
|
## Document loader
|
||||||
|
|
||||||
|
See a [usage example](/docs/integrations/document_loaders/browserbase).
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain_community.document_loaders import BrowserbaseLoader
|
||||||
|
```
|
||||||
|
|
||||||
|
## Multi-Modal
|
||||||
|
|
||||||
|
See a [usage example](/docs/integrations/document_loaders/browserbase).
|
||||||
|
|
||||||
|
```python
|
||||||
|
from browserbase.helpers.gpt4 import GPT4VImage, GPT4VImageDetail
|
||||||
|
```
|
@ -95,6 +95,9 @@ if TYPE_CHECKING:
|
|||||||
from langchain_community.document_loaders.brave_search import (
|
from langchain_community.document_loaders.brave_search import (
|
||||||
BraveSearchLoader, # noqa: F401
|
BraveSearchLoader, # noqa: F401
|
||||||
)
|
)
|
||||||
|
from langchain_community.document_loaders.browserbase import (
|
||||||
|
BrowserbaseLoader, # noqa: F401
|
||||||
|
)
|
||||||
from langchain_community.document_loaders.browserless import (
|
from langchain_community.document_loaders.browserless import (
|
||||||
BrowserlessLoader, # noqa: F401
|
BrowserlessLoader, # noqa: F401
|
||||||
)
|
)
|
||||||
@ -541,6 +544,7 @@ __all__ = [
|
|||||||
"BlobLoader",
|
"BlobLoader",
|
||||||
"BlockchainDocumentLoader",
|
"BlockchainDocumentLoader",
|
||||||
"BraveSearchLoader",
|
"BraveSearchLoader",
|
||||||
|
"BrowserbaseLoader",
|
||||||
"BrowserlessLoader",
|
"BrowserlessLoader",
|
||||||
"CSVLoader",
|
"CSVLoader",
|
||||||
"CassandraLoader",
|
"CassandraLoader",
|
||||||
@ -727,6 +731,7 @@ _module_lookup = {
|
|||||||
"BlobLoader": "langchain_community.document_loaders.blob_loaders",
|
"BlobLoader": "langchain_community.document_loaders.blob_loaders",
|
||||||
"BlockchainDocumentLoader": "langchain_community.document_loaders.blockchain",
|
"BlockchainDocumentLoader": "langchain_community.document_loaders.blockchain",
|
||||||
"BraveSearchLoader": "langchain_community.document_loaders.brave_search",
|
"BraveSearchLoader": "langchain_community.document_loaders.brave_search",
|
||||||
|
"BrowserbaseLoader": "langchain_community.document_loaders.browserbase",
|
||||||
"BrowserlessLoader": "langchain_community.document_loaders.browserless",
|
"BrowserlessLoader": "langchain_community.document_loaders.browserless",
|
||||||
"CSVLoader": "langchain_community.document_loaders.csv_loader",
|
"CSVLoader": "langchain_community.document_loaders.csv_loader",
|
||||||
"CassandraLoader": "langchain_community.document_loaders.cassandra",
|
"CassandraLoader": "langchain_community.document_loaders.cassandra",
|
||||||
|
@ -0,0 +1,47 @@
|
|||||||
|
from typing import Iterator, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
from langchain_community.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class BrowserbaseLoader(BaseLoader):
    """Load pre-rendered web pages using a headless browser hosted on Browserbase.

    Depends on `browserbase` package.
    Get your API key from https://browserbase.com
    """

    def __init__(
        self,
        urls: Union[List[str], Tuple[str, ...]],
        *,
        api_key: Optional[str] = None,
        text_content: bool = False,
    ):
        """Store the loader settings and create the Browserbase client.

        Args:
            urls: URLs of the pages to load.
            api_key: Forwarded unchanged to ``Browserbase(api_key=...)``.
                NOTE(review): when left as ``None`` the SDK presumably falls
                back to the ``BROWSERBASE_API_KEY`` environment variable --
                confirm against the Browserbase SDK.
            text_content: Forwarded unchanged as the second argument of
                ``Browserbase.load_urls``; selects the text-only
                representation of the loaded pages.

        Raises:
            ImportError: If the ``browserbase`` package is not installed.
        """
        self.urls = urls
        self.text_content = text_content

        # Imported lazily so that `browserbase` stays an optional dependency.
        try:
            from browserbase import Browserbase
        except ImportError as e:
            # Chain the original error so the underlying import failure
            # remains visible in the traceback.
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "browserbase` "
                "to use the Browserbase loader."
            ) from e

        self.browserbase = Browserbase(api_key=api_key)

    def lazy_load(self) -> Iterator[Document]:
        """Load pages from URLs.

        Yields:
            One ``Document`` per loaded page, with the page as
            ``page_content`` and the matching entry of ``self.urls`` stored
            under the ``"url"`` metadata key.
        """
        pages = self.browserbase.load_urls(self.urls, self.text_content)

        # Pages are paired with URLs by position; this assumes `load_urls`
        # returns pages in the same order as the URLs it was given.
        for url, page in zip(self.urls, pages):
            yield Document(
                page_content=page,
                metadata={
                    "url": url,
                },
            )
|
@ -38,6 +38,7 @@ EXPECTED_ALL = [
|
|||||||
"BlobLoader",
|
"BlobLoader",
|
||||||
"BlockchainDocumentLoader",
|
"BlockchainDocumentLoader",
|
||||||
"BraveSearchLoader",
|
"BraveSearchLoader",
|
||||||
|
"BrowserbaseLoader",
|
||||||
"BrowserlessLoader",
|
"BrowserlessLoader",
|
||||||
"CassandraLoader",
|
"CassandraLoader",
|
||||||
"CSVLoader",
|
"CSVLoader",
|
||||||
|
Loading…
Reference in New Issue
Block a user