community[minor]: added Browserbase loader (#20478)

This commit is contained in:
Mish Ushakov
2024-04-25 03:11:03 +02:00
committed by GitHub
parent 9e694963a4
commit 6ccecf2363
5 changed files with 203 additions and 0 deletions

View File

@@ -95,6 +95,9 @@ if TYPE_CHECKING:
from langchain_community.document_loaders.brave_search import (
BraveSearchLoader, # noqa: F401
)
from langchain_community.document_loaders.browserbase import (
BrowserbaseLoader, # noqa: F401
)
from langchain_community.document_loaders.browserless import (
BrowserlessLoader, # noqa: F401
)
@@ -541,6 +544,7 @@ __all__ = [
"BlobLoader",
"BlockchainDocumentLoader",
"BraveSearchLoader",
"BrowserbaseLoader",
"BrowserlessLoader",
"CSVLoader",
"CassandraLoader",
@@ -727,6 +731,7 @@ _module_lookup = {
"BlobLoader": "langchain_community.document_loaders.blob_loaders",
"BlockchainDocumentLoader": "langchain_community.document_loaders.blockchain",
"BraveSearchLoader": "langchain_community.document_loaders.brave_search",
"BrowserbaseLoader": "langchain_community.document_loaders.browserbase",
"BrowserlessLoader": "langchain_community.document_loaders.browserless",
"CSVLoader": "langchain_community.document_loaders.csv_loader",
"CassandraLoader": "langchain_community.document_loaders.cassandra",

View File

@@ -0,0 +1,47 @@
from typing import Iterator, List, Optional, Tuple, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
class BrowserbaseLoader(BaseLoader):
    """Load pre-rendered web pages using a headless browser hosted on Browserbase.

    Depends on `browserbase` package.
    Get your API key from https://browserbase.com

    Each URL is handed to the Browserbase client, and every page it returns
    becomes one ``Document`` whose metadata records the source URL.
    """

    def __init__(
        self,
        urls: Union[List[str], Tuple[str, ...]],
        *,
        api_key: Optional[str] = None,
        text_content: bool = False,
    ):
        """Store the load settings and create the Browserbase client.

        Args:
            urls: URLs of the pages to load, in the order the resulting
                documents should be produced.
            api_key: Passed through to ``Browserbase(api_key=...)``. How the
                client resolves ``None`` is up to the ``browserbase`` package
                (presumably an environment variable -- confirm in its docs).
            text_content: Forwarded unchanged as the second argument of
                ``Browserbase.load_urls``; presumably selects text-only
                content instead of the raw page -- confirm in the
                ``browserbase`` docs.

        Raises:
            ImportError: If the ``browserbase`` package is not installed.
        """
        self.urls = urls
        self.text_content = text_content

        # Imported lazily so that `browserbase` stays an optional dependency:
        # it is only required once a loader is actually constructed.
        try:
            from browserbase import Browserbase
        except ImportError as exc:
            # Chain the original error so the root cause stays visible.
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "browserbase` "
                "to use the Browserbase loader."
            ) from exc

        self.browserbase = Browserbase(api_key=api_key)

    def lazy_load(self) -> Iterator[Document]:
        """Load pages from URLs, yielding one ``Document`` per page.

        Yields:
            Document: ``page_content`` is the page returned by Browserbase and
            ``metadata["url"]`` is the URL at the same position in ``urls``.
        """
        pages = self.browserbase.load_urls(self.urls, self.text_content)
        # Pages are matched to URLs by position; zip walks both in lockstep
        # without index bookkeeping.
        for url, page in zip(self.urls, pages):
            yield Document(
                page_content=page,
                metadata={
                    "url": url,
                },
            )

View File

@@ -38,6 +38,7 @@ EXPECTED_ALL = [
"BlobLoader",
"BlockchainDocumentLoader",
"BraveSearchLoader",
"BrowserbaseLoader",
"BrowserlessLoader",
"CassandraLoader",
"CSVLoader",