mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-03 12:07:36 +00:00
text-splitters[patch]: fix HTMLSectionSplitter parsing of xslt paths (#22176)
## Description

This PR allows paths to XSLT files to be passed to `HTMLSectionSplitter`. It does so by fixing two trivial bugs in how passed paths were being handled. It also changes the default value of the `xslt_path` parameter to `None`, so that the special case where the file is part of the langchain package can be handled.

## Issue

#22175
This commit is contained in:
9
libs/text-splitters/tests/test_data/test_splitter.xslt
Normal file
9
libs/text-splitters/tests/test_data/test_splitter.xslt
Normal file
@@ -0,0 +1,9 @@
|
||||
<?xml version="1.0"?>
<!--
  Test fixture stylesheet: a single template that matches every attribute and
  node, copies it, and recursively applies templates to its attributes and
  children. The transform therefore reproduces its input unchanged.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
    <xsl:template match="@*|node()">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()" />
        </xsl:copy>
    </xsl:template>
</xsl:stylesheet>
|
@@ -1619,6 +1619,37 @@ def test_happy_path_splitting_based_on_header_with_whitespace_chars() -> None:
|
||||
assert docs[2].metadata["Header 2"] == "Baz"
|
||||
|
||||
|
||||
@pytest.mark.requires("lxml")
@pytest.mark.requires("bs4")
def test_section_splitter_accepts_a_relative_path() -> None:
    """Check that ``HTMLSectionSplitter`` can be given a relative XSLT path.

    The fixture path is relative, so it is looked up from the current working
    directory. The test passes as long as building the splitter and splitting
    a minimal document do not raise; the resulting documents are not inspected.
    """
    xslt_file = Path("tests/test_data/test_splitter.xslt")
    # Guard against a missing fixture so a failure here is not mistaken for a
    # path-handling bug in the splitter.
    assert xslt_file.is_file()

    splitter = HTMLSectionSplitter(
        xslt_path=xslt_file.as_posix(),
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
    )
    splitter.split_text("""<html><body><p>Foo</p></body></html>""")
|
||||
|
||||
|
||||
@pytest.mark.requires("lxml")
@pytest.mark.requires("bs4")
def test_section_splitter_accepts_an_absolute_path() -> None:
    """Check that ``HTMLSectionSplitter`` can be given an absolute XSLT path.

    The relative fixture path is made absolute before being handed to the
    splitter. The test passes as long as building the splitter and splitting
    a minimal document do not raise; the resulting documents are not inspected.
    """
    xslt_file = Path("tests/test_data/test_splitter.xslt").absolute()
    # Confirm the precondition under test (an absolute path) and that the
    # fixture exists before exercising the splitter.
    assert xslt_file.is_absolute()
    assert xslt_file.is_file()

    splitter = HTMLSectionSplitter(
        xslt_path=xslt_file.as_posix(),
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
    )
    splitter.split_text("""<html><body><p>Foo</p></body></html>""")
|
||||
|
||||
|
||||
def test_split_json() -> None:
|
||||
"""Test json text splitter"""
|
||||
max_chunk = 800
|
||||
|
Reference in New Issue
Block a user