mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-12 21:11:43 +00:00
community[minor]: added Browserbase loader (#20478)
This commit is contained in:
@@ -95,6 +95,9 @@ if TYPE_CHECKING:
|
||||
from langchain_community.document_loaders.brave_search import (
|
||||
BraveSearchLoader, # noqa: F401
|
||||
)
|
||||
from langchain_community.document_loaders.browserbase import (
|
||||
BrowserbaseLoader, # noqa: F401
|
||||
)
|
||||
from langchain_community.document_loaders.browserless import (
|
||||
BrowserlessLoader, # noqa: F401
|
||||
)
|
||||
@@ -541,6 +544,7 @@ __all__ = [
|
||||
"BlobLoader",
|
||||
"BlockchainDocumentLoader",
|
||||
"BraveSearchLoader",
|
||||
"BrowserbaseLoader",
|
||||
"BrowserlessLoader",
|
||||
"CSVLoader",
|
||||
"CassandraLoader",
|
||||
@@ -727,6 +731,7 @@ _module_lookup = {
|
||||
"BlobLoader": "langchain_community.document_loaders.blob_loaders",
|
||||
"BlockchainDocumentLoader": "langchain_community.document_loaders.blockchain",
|
||||
"BraveSearchLoader": "langchain_community.document_loaders.brave_search",
|
||||
"BrowserbaseLoader": "langchain_community.document_loaders.browserbase",
|
||||
"BrowserlessLoader": "langchain_community.document_loaders.browserless",
|
||||
"CSVLoader": "langchain_community.document_loaders.csv_loader",
|
||||
"CassandraLoader": "langchain_community.document_loaders.cassandra",
|
||||
|
@@ -0,0 +1,47 @@
|
||||
from typing import Iterator, List, Optional, Tuple, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class BrowserbaseLoader(BaseLoader):
    """Load pre-rendered web pages using a headless browser hosted on Browserbase.

    Depends on `browserbase` package.
    Get your API key from https://browserbase.com

    Attributes:
        urls: The URLs handed to the Browserbase client when loading.
        text_content: Flag forwarded unchanged to ``Browserbase.load_urls``.
        browserbase: The ``browserbase.Browserbase`` client built in
            ``__init__``.
    """

    def __init__(
        self,
        urls: Union[List[str], Tuple[str, ...]],
        *,
        api_key: Optional[str] = None,
        text_content: bool = False,
    ) -> None:
        """Store the loader configuration and create the Browserbase client.

        Args:
            urls: Indexable sequence of URLs to load. ``lazy_load`` looks
                each URL up by position, so order matters.
            api_key: Passed straight to ``Browserbase(api_key=...)``.
                NOTE(review): the SDK presumably falls back to an
                environment variable when this is ``None`` -- confirm
                against the browserbase package.
            text_content: Passed as the second positional argument to
                ``Browserbase.load_urls``.
                NOTE(review): presumably selects text-only output instead
                of raw HTML -- confirm against the browserbase package.

        Raises:
            ImportError: If the ``browserbase`` package is not installed.
        """
        self.urls = urls
        self.text_content = text_content

        # Imported lazily so that `browserbase` is only required when this
        # loader is actually instantiated.
        try:
            from browserbase import Browserbase
        except ImportError as err:
            # Chain the original failure so the root cause (for example a
            # broken transitive dependency) stays visible in the traceback.
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "browserbase` "
                "to use the Browserbase loader."
            ) from err

        self.browserbase = Browserbase(api_key=api_key)

    def lazy_load(self) -> Iterator[Document]:
        """Load pages from URLs.

        Calls ``self.browserbase.load_urls(self.urls, self.text_content)``
        and wraps every returned page in a ``Document``.

        Yields:
            One ``Document`` per page returned by the client, with the page
            as ``page_content`` and ``{"url": <url>}`` as metadata, where
            ``<url>`` is the entry of ``self.urls`` at the same position.
        """
        pages = self.browserbase.load_urls(self.urls, self.text_content)

        # Pages are matched to URLs purely by position; this assumes the
        # client returns results in input order -- TODO confirm.
        for i, page in enumerate(pages):
            yield Document(
                page_content=page,
                metadata={
                    "url": self.urls[i],
                },
            )
|
@@ -38,6 +38,7 @@ EXPECTED_ALL = [
|
||||
"BlobLoader",
|
||||
"BlockchainDocumentLoader",
|
||||
"BraveSearchLoader",
|
||||
"BrowserbaseLoader",
|
||||
"BrowserlessLoader",
|
||||
"CassandraLoader",
|
||||
"CSVLoader",
|
||||
|
Reference in New Issue
Block a user