mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-09 06:53:59 +00:00
add browserless loader (#7562)
# Browserless Added support for Browserless' `/content` endpoint as a document loader. ### About Browserless Browserless is a cloud service that provides access to headless Chrome browsers via a REST API. It allows developers to automate Chromium in a serverless fashion without having to configure and maintain their own Chrome infrastructure. --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev>
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Browserless"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import BrowserlessLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"BROWSERLESS_API_TOKEN = \"YOUR_API_TOKEN\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<!DOCTYPE html><html class=\"client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled\" lang=\"en\" dir=\"ltr\"><head>\n",
|
||||
"<meta charset=\"UTF-8\">\n",
|
||||
"<title>Document classification - Wikipedia</title>\n",
|
||||
"<script>document.documentElement.className=\"client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled\";(function(){var cookie=document.cookie.match(/(?:^|; )enwikimwclien\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = BrowserlessLoader(\n",
|
||||
" api_token=BROWSERLESS_API_TOKEN,\n",
|
||||
" urls=[\n",
|
||||
" \"https://en.wikipedia.org/wiki/Document_classification\",\n",
|
||||
" ],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"documents = loader.load()\n",
|
||||
"\n",
|
||||
"print(documents[0].page_content[:1000])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.1"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Reference in New Issue
Block a user