mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-25 21:37:20 +00:00
Attribute support for html tags (#5782)
# What does this PR do?

Change how HTML tags are matched so that a tag with attributes can be found.

## Before submitting

- [x] Tests added
- [x] CI/CD validated

### Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
This commit is contained in:
@@ -576,3 +576,39 @@ This is a code block
|
||||
"block",
|
||||
"```",
|
||||
]
|
||||
|
||||
|
||||
def test_html_code_splitter() -> None:
    """Split a small HTML document and compare the chunks with a fixed list.

    The sample document includes tags that carry attributes
    (``<p id="1234">`` and ``<div class="amazing">``); the expected list
    below contains each of them at the start of its own chunk.
    """
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.HTML, chunk_size=60, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal appears to have
    # been stripped when the file was extracted -- the expected chunks below
    # contain "\n " before nested tags, while the lines here carry no
    # indentation. Confirm the original indentation before relying on the
    # assertion.
    code = """
<h1>Sample Document</h1>
<h2>Section</h2>
<p id="1234">Reference content.</p>

<h2>Lists</h2>
<ul>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ul>

<h3>A block</h3>
<div class="amazing">
<p>Some text</p>
<p>Some more text</p>
</div>
"""
    chunks = splitter.split_text(code)
    assert chunks == [
        "<h1>Sample Document</h1>\n <h2>Section</h2>",
        '<p id="1234">Reference content.</p>',
        "<h2>Lists</h2>\n <ul>",
        "<li>Item 1</li>\n <li>Item 2</li>",
        "<li>Item 3</li>\n </ul>",
        "<h3>A block</h3>",
        '<div class="amazing">',
        "<p>Some text</p>",
        "<p>Some more text</p>\n </div>",
    ]
|
||||
|
Reference in New Issue
Block a user