mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-06 07:04:01 +00:00
docs[patch]: add text_splitter.py test (#14025)
Description: Add HTMLHeaderTextSplitter unit test. Dependencies: none.
This commit is contained in:
parent
7d5341dbd3
commit
62a3473ac0
@ -1,5 +1,6 @@
|
|||||||
"""Test text splitting functionality."""
|
"""Test text splitting functionality."""
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@ -7,6 +8,7 @@ from langchain_core.documents import Document
|
|||||||
|
|
||||||
from langchain.text_splitter import (
|
from langchain.text_splitter import (
|
||||||
CharacterTextSplitter,
|
CharacterTextSplitter,
|
||||||
|
HTMLHeaderTextSplitter,
|
||||||
Language,
|
Language,
|
||||||
MarkdownHeaderTextSplitter,
|
MarkdownHeaderTextSplitter,
|
||||||
PythonCodeTextSplitter,
|
PythonCodeTextSplitter,
|
||||||
@ -1128,3 +1130,48 @@ def test_solidity_code_splitter() -> None:
|
|||||||
"+ b;",
|
"+ b;",
|
||||||
"}\n }",
|
"}\n }",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("lxml")
def test_html_header_text_splitter(tmp_path: Path) -> None:
    """Check HTMLHeaderTextSplitter on a string and on the same HTML in a file.

    The splitter is configured to split on ``h1`` and ``h2`` only. The same
    HTML is fed once through ``split_text`` and once, after being written to
    ``tmp_path / "doc.html"``, through ``split_text_from_file``; both results
    must equal the same list of expected documents.
    """
    html_splitter = HTMLHeaderTextSplitter(
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
    )

    content = """
<h1>Sample Document</h1>
<h2>Section</h2>
<p id="1234">Reference content.</p>

<h2>Lists</h2>
<ul>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ul>

<h3>A block</h3>
<div class="amazing">
<p>Some text</p>
<p>Some more text</p>
</div>
"""

    # One document per h2 section. The text of the <div> that follows the
    # <h3> is expected inside the "Lists" document, and no metadata entry is
    # expected for the <h3> itself (it is not in headers_to_split_on).
    section_doc = Document(
        page_content="Reference content.",
        metadata={"Header 1": "Sample Document", "Header 2": "Section"},
    )
    lists_doc = Document(
        page_content="Item 1 Item 2 Item 3 \nSome text \nSome more text",
        metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
    )
    expected = [section_doc, lists_doc]

    # In-memory path.
    docs_from_text = html_splitter.split_text(content)
    assert docs_from_text == expected

    # File path: persist the same HTML and split it from disk.
    html_file = tmp_path / "doc.html"
    html_file.write_text(content)
    docs_from_file = html_splitter.split_text_from_file(html_file)

    assert docs_from_file == expected
|
||||||
|
Loading…
Reference in New Issue
Block a user