diff --git a/libs/langchain/tests/unit_tests/test_text_splitter.py b/libs/langchain/tests/unit_tests/test_text_splitter.py index 7bb1d97fba5..f09366f1539 100644 --- a/libs/langchain/tests/unit_tests/test_text_splitter.py +++ b/libs/langchain/tests/unit_tests/test_text_splitter.py @@ -1,5 +1,6 @@ """Test text splitting functionality.""" import re +from pathlib import Path from typing import List import pytest @@ -7,6 +8,7 @@ from langchain_core.documents import Document from langchain.text_splitter import ( CharacterTextSplitter, + HTMLHeaderTextSplitter, Language, MarkdownHeaderTextSplitter, PythonCodeTextSplitter, @@ -1128,3 +1130,48 @@ def test_solidity_code_splitter() -> None: "+ b;", "}\n }", ] + + +@pytest.mark.requires("lxml") +def test_html_header_text_splitter(tmp_path: Path) -> None: + splitter = HTMLHeaderTextSplitter( + headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")] + ) + + content = """ +
Reference content.
+ +Some text
+Some more text
+