diff --git a/libs/langchain/tests/unit_tests/test_text_splitter.py b/libs/langchain/tests/unit_tests/test_text_splitter.py
index 7bb1d97fba5..f09366f1539 100644
--- a/libs/langchain/tests/unit_tests/test_text_splitter.py
+++ b/libs/langchain/tests/unit_tests/test_text_splitter.py
@@ -1,5 +1,6 @@
 """Test text splitting functionality."""
 import re
+from pathlib import Path
 from typing import List
 
 import pytest
@@ -7,6 +8,7 @@ from langchain_core.documents import Document
 
 from langchain.text_splitter import (
     CharacterTextSplitter,
+    HTMLHeaderTextSplitter,
     Language,
     MarkdownHeaderTextSplitter,
     PythonCodeTextSplitter,
@@ -1128,3 +1130,52 @@ def test_solidity_code_splitter() -> None:
         "+ b;",
         "}\n  }",
     ]
+
+
+@pytest.mark.requires("lxml")
+def test_html_header_text_splitter(tmp_path: Path) -> None:
+    """Split an HTML fixture on h1/h2 and compare string and file input."""
+    splitter = HTMLHeaderTextSplitter(
+        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
+    )
+
+    content = """
+<h1>Sample Document</h1>
+    <h2>Section</h2>
+        <p id="1234">Reference content.</p>
+
+    <h2>Lists</h2>
+        <ul>
+            <li>Item 1</li>
+            <li>Item 2</li>
+            <li>Item 3</li>
+        </ul>
+
+        <h3>A block</h3>
+            <div class="amazing">
+                <p>Some text</p>
+                <p>Some more text</p>
+            </div>
+    """
+
+    # Only h1/h2 are configured as split points; the expected documents below
+    # carry no "A block" text or metadata for the h3 heading.
+    docs = splitter.split_text(content)
+    expected = [
+        Document(
+            page_content="Reference content.",
+            metadata={"Header 1": "Sample Document", "Header 2": "Section"},
+        ),
+        Document(
+            page_content="Item 1 Item 2 Item 3 \nSome text \nSome more text",
+            metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
+        ),
+    ]
+    assert docs == expected
+
+    # Round-trip the same markup through a file and expect identical output.
+    with open(tmp_path / "doc.html", "w") as tmp:
+        tmp.write(content)
+    docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
+
+    assert docs_from_file == expected