mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-08 06:23:20 +00:00
Harrison/playwright (#2871)
Co-authored-by: Manuel Saelices <msaelices@gmail.com>
This commit is contained in:
@@ -112,6 +112,79 @@
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "a2c1c79f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Playwright URL Loader\n",
|
||||
"\n",
|
||||
"This covers how to load HTML documents from a list of URLs using the `PlaywrightURLLoader`.\n",
|
||||
"\n",
|
||||
"As in the Selenium case, Playwright allows us to load pages that need JavaScript to render.\n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To use the `PlaywrightURLLoader`, you will need to install `playwright` and `unstructured`. Additionally, you will need to install the Playwright Chromium browser:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "53158417",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Install playwright\n",
|
||||
"!pip install \"playwright\"\n",
|
||||
"!pip install \"unstructured\"\n",
|
||||
"!playwright install"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0ab4e115",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PlaywrightURLLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ce5a9a0a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"urls = [\n",
|
||||
" \"https://www.youtube.com/watch?v=dQw4w9WgXcQ\",\n",
|
||||
" \"https://goo.gl/maps/NDSHwePEyaHMFGwh8\"\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2dc3e0bc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = PlaywrightURLLoader(urls=urls, remove_selectors=[\"header\", \"footer\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "10b79f80",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -130,7 +203,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.13"
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
Reference in New Issue
Block a user