diff --git a/langchain/document_loaders/html_bs.py b/langchain/document_loaders/html_bs.py
index 4d58780c45d..4a73187ad3e 100644
--- a/langchain/document_loaders/html_bs.py
+++ b/langchain/document_loaders/html_bs.py
@@ -6,7 +6,6 @@ from typing import Dict, List, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
-
logger = logging.getLogger(__name__)
diff --git a/langchain/document_loaders/parsers/__init__.py b/langchain/document_loaders/parsers/__init__.py
index e00e9bf798b..94ac136d475 100644
--- a/langchain/document_loaders/parsers/__init__.py
+++ b/langchain/document_loaders/parsers/__init__.py
@@ -1,3 +1,4 @@
+from langchain.document_loaders.parsers.html import BS4HTMLParser
from langchain.document_loaders.parsers.pdf import (
PDFMinerParser,
PDFPlumberParser,
@@ -5,7 +6,6 @@ from langchain.document_loaders.parsers.pdf import (
PyPDFium2Parser,
PyPDFParser,
)
-from langchain.document_loaders.parsers.html import BS4HTMLParser
__all__ = [
"BS4HTMLParser",
diff --git a/langchain/document_loaders/parsers/html/__init__.py b/langchain/document_loaders/parsers/html/__init__.py
index 4568004de8d..7542ae2dc6c 100644
--- a/langchain/document_loaders/parsers/html/__init__.py
+++ b/langchain/document_loaders/parsers/html/__init__.py
@@ -1,4 +1,4 @@
-from langchain.document_loaders.parsers.html.markdownify import MarkdownifyHTMLParser
from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
+from langchain.document_loaders.parsers.html.markdownify import MarkdownifyHTMLParser
__all__ = ["MarkdownifyHTMLParser", "BS4HTMLParser"]
diff --git a/langchain/document_loaders/parsers/html/bs4.py b/langchain/document_loaders/parsers/html/bs4.py
index 194514df1ae..c7db1863fa8 100644
--- a/langchain/document_loaders/parsers/html/bs4.py
+++ b/langchain/document_loaders/parsers/html/bs4.py
@@ -1,7 +1,7 @@
"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""
import logging
-from typing import Dict, Union, Iterator, Optional, Mapping, Any
+from typing import Any, Dict, Iterator, Mapping, Optional, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
class BS4HTMLParser(BaseBlobParser):
- """Loader that uses beautiful soup to parse HTML files."""
+ """Parser that uses beautiful soup to parse HTML files."""
def __init__(
self,
@@ -19,8 +19,7 @@ class BS4HTMLParser(BaseBlobParser):
bs_kwargs: Optional[Mapping[str, Any]] = None,
get_text_separator: str = "",
) -> None:
- """Initialise with path, and optionally, file encoding to use, and any kwargs
- to pass to the BeautifulSoup object."""
+ """Initialize a bs4 based HTML parser."""
try:
import bs4 # noqa:F401
except ImportError:
@@ -29,7 +28,7 @@ class BS4HTMLParser(BaseBlobParser):
"`pip install beautifulsoup4`"
)
- if "features" in bs_kwargs:
+ if bs_kwargs and "features" in bs_kwargs:
raise ValueError("features cannot be set in bs_kwargs")
_bs_kwargs = bs_kwargs or {}
@@ -54,4 +53,4 @@ class BS4HTMLParser(BaseBlobParser):
"source": blob.source,
"title": title,
}
- return [Document(page_content=text, metadata=metadata)]
+ yield Document(page_content=text, metadata=metadata)
diff --git a/langchain/document_loaders/parsers/html/markdownify.py b/langchain/document_loaders/parsers/html/markdownify.py
index 953c79f3608..503cf5a8228 100644
--- a/langchain/document_loaders/parsers/html/markdownify.py
+++ b/langchain/document_loaders/parsers/html/markdownify.py
@@ -1,8 +1,9 @@
"""Load and chunk HTMLs with potential pre-processing to clean the html."""
import re
+from typing import Iterator, Tuple
+
from bs4 import BeautifulSoup
-from typing import Tuple, Iterator
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
diff --git a/tests/integration_tests/document_loaders/parsers/test_html_parsers.py b/tests/integration_tests/document_loaders/parsers/test_html_parsers.py
deleted file mode 100644
index e3943a553d7..00000000000
--- a/tests/integration_tests/document_loaders/parsers/test_html_parsers.py
+++ /dev/null
@@ -1,14 +0,0 @@
-"""Tests for the HTML parsers."""
-from pathlib import Path
-from typing import Iterator
-
-from langchain.document_loaders.base import BaseBlobParser
-from langchain.document_loaders.blob_loaders import Blob
-from langchain.document_loaders.parsers.html import BS4HTMLParser
-
-# PDFs to test parsers on.
-HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
-
-LAYOUT_PARSER_PAPER_PDF = (
- Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
-)
diff --git a/tests/unit_tests/document_loaders/parsers/test_html_parsers.py b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
new file mode 100644
index 00000000000..39564b98108
--- /dev/null
+++ b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
@@ -0,0 +1,27 @@
+"""Tests for the HTML parsers."""
+import pytest
+from pathlib import Path
+
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.document_loaders.parsers.html import BS4HTMLParser
+
+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
+
+
+@pytest.mark.requires("bs4")
+def test_bs_html_loader() -> None:
+ """Test that BS4HTMLParser extracts the page text, title and source."""
+ file_path = EXAMPLES / "example.html"
+ blob = Blob.from_path(file_path)
+ parser = BS4HTMLParser(get_text_separator="|")
+ docs = list(parser.lazy_parse(blob))
+ assert isinstance(docs, list)
+ assert len(docs) == 1
+
+ metadata = docs[0].metadata
+ content = docs[0].page_content
+
+ assert metadata["title"] == "Chew dad's slippers"
+ assert metadata["source"] == str(file_path)
+ assert content[:2] == "\n|"