mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-28 17:38:36 +00:00
Attribute support for HTML tags (#5782)
# What does this PR do?

Change the HTML separators from complete tags (e.g. `<div>`) to tag prefixes (e.g. `<div`) so that a tag with attributes, such as `<div class="amazing">`, can also be found.

## Before submitting

- [x] Tests added
- [x] CI/CD validated

### Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
This commit is contained in:
parent
0cfaa76e45
commit
65111eb2b3
@@ -740,33 +740,33 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
elif language == Language.HTML:
|
||||
return [
|
||||
# First, try to split along HTML tags
|
||||
"<body>",
|
||||
"<div>",
|
||||
"<p>",
|
||||
"<br>",
|
||||
"<li>",
|
||||
"<h1>",
|
||||
"<h2>",
|
||||
"<h3>",
|
||||
"<h4>",
|
||||
"<h5>",
|
||||
"<h6>",
|
||||
"<span>",
|
||||
"<table>",
|
||||
"<tr>",
|
||||
"<td>",
|
||||
"<th>",
|
||||
"<ul>",
|
||||
"<ol>",
|
||||
"<header>",
|
||||
"<footer>",
|
||||
"<nav>",
|
||||
"<body",
|
||||
"<div",
|
||||
"<p",
|
||||
"<br",
|
||||
"<li",
|
||||
"<h1",
|
||||
"<h2",
|
||||
"<h3",
|
||||
"<h4",
|
||||
"<h5",
|
||||
"<h6",
|
||||
"<span",
|
||||
"<table",
|
||||
"<tr",
|
||||
"<td",
|
||||
"<th",
|
||||
"<ul",
|
||||
"<ol",
|
||||
"<header",
|
||||
"<footer",
|
||||
"<nav",
|
||||
# Head
|
||||
"<head>",
|
||||
"<style>",
|
||||
"<script>",
|
||||
"<meta>",
|
||||
"<title>",
|
||||
"<head",
|
||||
"<style",
|
||||
"<script",
|
||||
"<meta",
|
||||
"<title",
|
||||
"",
|
||||
]
|
||||
else:
|
||||
|
@@ -576,3 +576,39 @@ This is a code block
|
||||
"block",
|
||||
"```",
|
||||
]
|
||||
|
||||
|
||||
def test_html_code_splitter() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.HTML, chunk_size=60, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
<h1>Sample Document</h1>
|
||||
<h2>Section</h2>
|
||||
<p id="1234">Reference content.</p>
|
||||
|
||||
<h2>Lists</h2>
|
||||
<ul>
|
||||
<li>Item 1</li>
|
||||
<li>Item 2</li>
|
||||
<li>Item 3</li>
|
||||
</ul>
|
||||
|
||||
<h3>A block</h3>
|
||||
<div class="amazing">
|
||||
<p>Some text</p>
|
||||
<p>Some more text</p>
|
||||
</div>
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"<h1>Sample Document</h1>\n <h2>Section</h2>",
|
||||
'<p id="1234">Reference content.</p>',
|
||||
"<h2>Lists</h2>\n <ul>",
|
||||
"<li>Item 1</li>\n <li>Item 2</li>",
|
||||
"<li>Item 3</li>\n </ul>",
|
||||
"<h3>A block</h3>",
|
||||
'<div class="amazing">',
|
||||
"<p>Some text</p>",
|
||||
"<p>Some more text</p>\n </div>",
|
||||
]
|
||||
|
Loading…
Reference in New Issue
Block a user