From 5b4d53e8efca88ccc89ee10af3fdc0066efe1afb Mon Sep 17 00:00:00 2001 From: Jasper <37783831+jagilley@users.noreply.github.com> Date: Mon, 17 Jul 2023 17:02:19 -0700 Subject: [PATCH] Add text_content kwarg to BrowserlessLoader (#7856) Added keyword argument to toggle between getting the text content of a site versus its HTML when using the `BrowserlessLoader` --- .../integrations/browserless.ipynb | 43 ++++++++++---- langchain/document_loaders/browserless.py | 58 ++++++++++++++----- 2 files changed, 75 insertions(+), 26 deletions(-) diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/browserless.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/browserless.ipynb index dc90b998a42..382a60533cd 100644 --- a/docs/extras/modules/data_connection/document_loaders/integrations/browserless.ipynb +++ b/docs/extras/modules/data_connection/document_loaders/integrations/browserless.ipynb @@ -5,12 +5,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Browserless" + "# Browserless\n", + "\n", + "Browserless is a service that allows you to run headless Chrome instances in the cloud. It's a great way to run browser-based automation at scale without having to worry about managing your own infrastructure.\n", + "\n", + "To use Browserless as a document loader, initialize a `BrowserlessLoader` instance as shown in this notebook. Note that by default, `BrowserlessLoader` returns the `innerText` of the page's `body` element. To disable this and get the raw HTML, set `text_content` to `False`." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -19,26 +23,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "BROWSERLESS_API_TOKEN = \"YOUR_API_TOKEN\"" + "BROWSERLESS_API_TOKEN = \"YOUR_BROWSERLESS_API_TOKEN\"" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "
\n", - "\n", - "