diff --git a/docs/modules/document_loaders/examples/example_data/fake-content.html b/docs/modules/document_loaders/examples/example_data/fake-content.html
index 9ad19d308e9..acba76025e4 100644
--- a/docs/modules/document_loaders/examples/example_data/fake-content.html
+++ b/docs/modules/document_loaders/examples/example_data/fake-content.html
@@ -1,5 +1,8 @@
+
+ Test Title
+
My First Heading
diff --git a/docs/modules/document_loaders/examples/html.ipynb b/docs/modules/document_loaders/examples/html.ipynb
index 2a4988284d3..91ff32e08b3 100644
--- a/docs/modules/document_loaders/examples/html.ipynb
+++ b/docs/modules/document_loaders/examples/html.ipynb
@@ -48,9 +48,7 @@
"outputs": [
{
"data": {
- "text/plain": [
- "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]"
- ]
+ "text/plain": "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]"
},
"execution_count": 4,
"metadata": {},
@@ -61,13 +59,57 @@
"data"
]
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Loading HTML with BeautifulSoup4\n",
+ "\n",
+ "We can also use BeautifulSoup4 to load HTML documents using the `BSHTMLLoader`. This extracts the text from the HTML into `page_content` and stores the page title under the `title` key in `metadata`."
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"id": "79b1bce4",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "from langchain.document_loaders import BSHTMLLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "[Document(page_content='\\n\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n', lookup_str='', metadata={'source': 'example_data/fake-content.html', 'title': 'Test Title'}, lookup_index=0)]"
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "loader = BSHTMLLoader(\"example_data/fake-content.html\")\n",
+ "data = loader.load()\n",
+ "data"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "outputs": [],
+ "source": [],
+ "metadata": {
+ "collapsed": false
+ }
}
],
"metadata": {
diff --git a/langchain/document_loaders/html_bs.py b/langchain/document_loaders/html_bs.py
index 92802ccb5dd..568d805458f 100644
--- a/langchain/document_loaders/html_bs.py
+++ b/langchain/document_loaders/html_bs.py
@@ -3,8 +3,6 @@
import logging
from typing import Dict, List, Union
-from bs4 import BeautifulSoup
-
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
@@ -15,9 +13,18 @@ class BSHTMLLoader(BaseLoader):
"""Loader that uses beautiful soup to parse HTML files."""
def __init__(self, file_path: str) -> None:
+ """Initialize the loader with the path of the HTML file to parse.
+
+ Raises ValueError if the optional `bs4` package is not installed.
+ """
+ try:
+ # Imported only to fail fast when bs4 is missing; load() does its own
+ # `from bs4 import BeautifulSoup` when it actually parses the file.
+ import bs4 # noqa:F401
+ except ImportError as err:
+ # Keep ValueError (what callers already see) but chain the cause.
+ raise ValueError(
+ "bs4 package not found, please install it with `pip install bs4`"
+ ) from err
+
self.file_path = file_path
def load(self) -> List[Document]:
+ from bs4 import BeautifulSoup
+
"""Load HTML document into document objects."""
with open(self.file_path, "r") as f:
soup = BeautifulSoup(f, features="lxml")