diff --git a/langchain/document_loaders/html_bs.py b/langchain/document_loaders/html_bs.py index 4d58780c45d..4a73187ad3e 100644 --- a/langchain/document_loaders/html_bs.py +++ b/langchain/document_loaders/html_bs.py @@ -6,7 +6,6 @@ from typing import Dict, List, Union from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader - logger = logging.getLogger(__name__) diff --git a/langchain/document_loaders/parsers/__init__.py b/langchain/document_loaders/parsers/__init__.py index e00e9bf798b..94ac136d475 100644 --- a/langchain/document_loaders/parsers/__init__.py +++ b/langchain/document_loaders/parsers/__init__.py @@ -1,3 +1,4 @@ +from langchain.document_loaders.parsers.html import BS4HTMLParser from langchain.document_loaders.parsers.pdf import ( PDFMinerParser, PDFPlumberParser, @@ -5,7 +6,6 @@ from langchain.document_loaders.parsers.pdf import ( PyPDFium2Parser, PyPDFParser, ) -from langchain.document_loaders.parsers.html import BS4HTMLParser __all__ = [ "BS4HTMLParser", diff --git a/langchain/document_loaders/parsers/html/__init__.py b/langchain/document_loaders/parsers/html/__init__.py index 4568004de8d..7542ae2dc6c 100644 --- a/langchain/document_loaders/parsers/html/__init__.py +++ b/langchain/document_loaders/parsers/html/__init__.py @@ -1,4 +1,4 @@ -from langchain.document_loaders.parsers.html.markdownify import MarkdownifyHTMLParser from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser +from langchain.document_loaders.parsers.html.markdownify import MarkdownifyHTMLParser __all__ = ["MarkdownifyHTMLParser", "BS4HTMLParser"] diff --git a/langchain/document_loaders/parsers/html/bs4.py b/langchain/document_loaders/parsers/html/bs4.py index 194514df1ae..c7db1863fa8 100644 --- a/langchain/document_loaders/parsers/html/bs4.py +++ b/langchain/document_loaders/parsers/html/bs4.py @@ -1,7 +1,7 @@ """Loader that uses bs4 to load HTML files, enriching metadata with page title.""" import logging -from typing import Dict, Union, Iterator, Optional, Mapping, Any +from typing import Any, Dict, Iterator, Mapping, Optional, Union from langchain.docstore.document import Document from langchain.document_loaders.base import BaseBlobParser @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) class BS4HTMLParser(BaseBlobParser): - """Loader that uses beautiful soup to parse HTML files.""" + """Parser that uses beautiful soup to parse HTML files.""" def __init__( self, @@ -19,8 +19,7 @@ class BS4HTMLParser(BaseBlobParser): bs_kwargs: Optional[Mapping[str, Any]] = None, get_text_separator: str = "", ) -> None: - """Initialise with path, and optionally, file encoding to use, and any kwargs - to pass to the BeautifulSoup object.""" + """Initialize a bs4 based HTML parser.""" try: import bs4 # noqa:F401 except ImportError: @@ -29,7 +28,7 @@ class BS4HTMLParser(BaseBlobParser): "`pip install beautifulsoup4`" ) - if "features" in bs_kwargs: + if bs_kwargs and "features" in bs_kwargs: raise ValueError("features cannot be set in bs_kwargs") _bs_kwargs = bs_kwargs or {} @@ -54,4 +53,4 @@ class BS4HTMLParser(BaseBlobParser): "source": blob.source, "title": title, } - return [Document(page_content=text, metadata=metadata)] + yield Document(page_content=text, metadata=metadata) diff --git a/langchain/document_loaders/parsers/html/markdownify.py b/langchain/document_loaders/parsers/html/markdownify.py index 953c79f3608..503cf5a8228 100644 --- a/langchain/document_loaders/parsers/html/markdownify.py +++ b/langchain/document_loaders/parsers/html/markdownify.py @@ -1,8 +1,9 @@ """Load and chunk HTMLs with potential pre-processing to clean the html.""" import re +from typing import Iterator, Tuple + from bs4 import BeautifulSoup -from typing import Tuple, Iterator from langchain.document_loaders.base import BaseBlobParser from langchain.document_loaders.blob_loaders import Blob diff --git a/tests/integration_tests/document_loaders/parsers/test_html_parsers.py b/tests/integration_tests/document_loaders/parsers/test_html_parsers.py deleted file mode 100644 index e3943a553d7..00000000000 --- a/tests/integration_tests/document_loaders/parsers/test_html_parsers.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Tests for the HTML parsers.""" -from pathlib import Path -from typing import Iterator - -from langchain.document_loaders.base import BaseBlobParser -from langchain.document_loaders.blob_loaders import Blob -from langchain.document_loaders.parsers.html import BS4HTMLParser - -# PDFs to test parsers on. -HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf" - -LAYOUT_PARSER_PAPER_PDF = ( - Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf" -) diff --git a/tests/unit_tests/document_loaders/parsers/test_html_parsers.py b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py new file mode 100644 index 00000000000..39564b98108 --- /dev/null +++ b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py @@ -0,0 +1,27 @@ +"""Tests for the HTML parsers.""" +import pytest +from pathlib import Path + +from langchain.document_loaders.blob_loaders import Blob +from langchain.document_loaders.parsers.html import BS4HTMLParser + +HERE = Path(__file__).parent +EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples" + + +@pytest.mark.requires("bs4") +def test_bs_html_loader() -> None: + """Test unstructured loader.""" + file_path = EXAMPLES / "example.html" + blob = Blob.from_path(file_path) + parser = BS4HTMLParser(get_text_separator="|") + docs = list(parser.lazy_parse(blob)) + assert isinstance(docs, list) + assert len(docs) == 1 + + metadata = docs[0].metadata + content = docs[0].page_content + + assert metadata["title"] == "Chew dad's slippers" + assert metadata["source"] == str(file_path) + assert content[:2] == "\n|"