mirror of
https://github.com/hwchase17/langchain.git
synced 2025-10-08 05:41:56 +00:00
Vwp/docs improved document loaders (#4006)
Huge thanks to @leo-gan for improving the document loaders notebooks --------- Co-authored-by: Leonid Ganeline <leo.gan.57@gmail.com>
This commit is contained in:
@@ -7,7 +7,7 @@
|
||||
"source": [
|
||||
"# HTML\n",
|
||||
"\n",
|
||||
"This covers how to load HTML documents into a document format that we can use downstream."
|
||||
"This covers how to load `HTML` documents into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -48,7 +48,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]"
|
||||
"text/plain": [
|
||||
"[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
@@ -61,20 +63,21 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "00337aae",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Loading HTML with BeautifulSoup4\n",
|
||||
"\n",
|
||||
"We can also use BeautifulSoup4 to load HTML documents using the `BSHTMLLoader`. This will extract the text from the html into `page_content`, and the page title as `title` into `metadata`."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
"We can also use `BeautifulSoup4` to load HTML documents using the `BSHTMLLoader`. This will extract the text from the HTML into `page_content`, and the page title as `title` into `metadata`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 1,
|
||||
"id": "79b1bce4",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import BSHTMLLoader"
|
||||
@@ -82,13 +85,23 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 2,
|
||||
"id": "4be99e6c",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "[Document(page_content='\\n\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n', lookup_str='', metadata={'source': 'example_data/fake-content.html', 'title': 'Test Title'}, lookup_index=0)]"
|
||||
"text/plain": [
|
||||
"[Document(page_content='\\n\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n', metadata={'source': 'example_data/fake-content.html', 'title': 'Test Title'})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -97,19 +110,7 @@
|
||||
"loader = BSHTMLLoader(\"example_data/fake-content.html\")\n",
|
||||
"data = loader.load()\n",
|
||||
"data"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -128,7 +129,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
Reference in New Issue
Block a user