diff --git a/docs/docs/integrations/document_loaders/browserbase.ipynb b/docs/docs/integrations/document_loaders/browserbase.ipynb index 8ed52fd8ded..ae497ba6dad 100644 --- a/docs/docs/integrations/document_loaders/browserbase.ipynb +++ b/docs/docs/integrations/document_loaders/browserbase.ipynb @@ -6,11 +6,17 @@ "source": [ "# Browserbase\n", "\n", - "[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers, it offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving.\n", + "[Browserbase](https://browserbase.com) is a developer platform to reliably run, manage, and monitor headless browsers.\n", "\n", - "## Installation\n", + "Power your AI data retrievals with:\n", + "- [Serverless Infrastructure](https://docs.browserbase.com/under-the-hood) providing reliable browsers to extract data from complex UIs\n", + "- [Stealth Mode](https://docs.browserbase.com/features/stealth-mode) with included fingerprinting tactics and automatic captcha solving\n", + "- [Session Debugger](https://docs.browserbase.com/features/sessions) to inspect your Browser Session with network timeline and logs\n", + "- [Live Debug](https://docs.browserbase.com/guides/session-debug-connection/browser-remote-control) to quickly debug your automation\n", "\n", - "- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).\n", + "## Installation and Setup\n", + "\n", + "- Get an API key and Project ID from [browserbase.com](https://browserbase.com) and set them as environment variables (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`).\n", "- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):" ] }, @@ -64,6 +70,20 @@ "print(docs[0].page_content[:61])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loader Options\n", + "\n", + "- `urls` Required. 
A list of URLs to fetch.\n", + "- `text_content` Optional. Retrieve only text content. Default is `False`.\n", + "- `api_key` Optional. Browserbase API key. Default is `BROWSERBASE_API_KEY` env variable.\n", + "- `project_id` Optional. Browserbase Project ID. Default is `BROWSERBASE_PROJECT_ID` env variable.\n", + "- `session_id` Optional. Provide an existing Session ID.\n", + "- `proxy` Optional. Enable/Disable Proxies." + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/docs/docs/integrations/providers/browserbase.mdx b/docs/docs/integrations/providers/browserbase.mdx index d5ec545a3f1..0bd939ffbfc 100644 --- a/docs/docs/integrations/providers/browserbase.mdx +++ b/docs/docs/integrations/providers/browserbase.mdx @@ -1,10 +1,16 @@ # Browserbase ->[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers, it offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving. +[Browserbase](https://browserbase.com) is a developer platform to reliably run, manage, and monitor headless browsers. + +Power your AI data retrievals with: +- [Serverless Infrastructure](https://docs.browserbase.com/under-the-hood) providing reliable browsers to extract data from complex UIs +- [Stealth Mode](https://docs.browserbase.com/features/stealth-mode) with included fingerprinting tactics and automatic captcha solving +- [Session Debugger](https://docs.browserbase.com/features/sessions) to inspect your Browser Session with network timeline and logs +- [Live Debug](https://docs.browserbase.com/guides/session-debug-connection/browser-remote-control) to quickly debug your automation ## Installation and Setup -- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`). +- Get an API key and Project ID from [browserbase.com](https://browserbase.com) and set them as environment variables (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`). 
- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk): ```python diff --git a/libs/community/langchain_community/document_loaders/browserbase.py b/libs/community/langchain_community/document_loaders/browserbase.py index 888a89107ab..0027fa1eeb6 100644 --- a/libs/community/langchain_community/document_loaders/browserbase.py +++ b/libs/community/langchain_community/document_loaders/browserbase.py @@ -1,4 +1,4 @@ -from typing import Iterator, List, Optional, Tuple, Union +from typing import Iterator, Optional, Sequence from langchain_core.documents import Document @@ -14,13 +14,17 @@ class BrowserbaseLoader(BaseLoader): def __init__( self, - urls: Union[List[str], Tuple[str, ...]], - *, - api_key: Optional[str] = None, + urls: Sequence[str], text_content: bool = False, + api_key: Optional[str] = None, + project_id: Optional[str] = None, + session_id: Optional[str] = None, + proxy: Optional[bool] = None, ): self.urls = urls self.text_content = text_content + self.session_id = session_id + self.proxy = proxy try: from browserbase import Browserbase @@ -32,11 +36,13 @@ class BrowserbaseLoader(BaseLoader): "to use the Browserbase loader." ) - self.browserbase = Browserbase(api_key=api_key) + self.browserbase = Browserbase(api_key, project_id) def lazy_load(self) -> Iterator[Document]: """Load pages from URLs""" - pages = self.browserbase.load_urls(self.urls, self.text_content) + pages = self.browserbase.load_urls( + self.urls, self.text_content, self.session_id, self.proxy + ) for i, page in enumerate(pages): yield Document(