mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-01 10:54:15 +00:00
Attribute support for html tags (#5782)
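# What does this PR do?

Changes the HTML separators so that a tag with attributes can still be found: each separator is now the opening prefix of the tag (for example `<p`) instead of the complete bare tag (`<p>`), so an element such as `<p id="1234">` is matched as well.

## Before submitting

- [x] Tests added
- [x] CI/CD validated

### Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.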
This commit is contained in:
parent
0cfaa76e45
commit
65111eb2b3
@ -740,33 +740,33 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
elif language == Language.HTML:
|
elif language == Language.HTML:
|
||||||
return [
|
return [
|
||||||
# First, try to split along HTML tags
|
# First, try to split along HTML tags
|
||||||
"<body>",
|
"<body",
|
||||||
"<div>",
|
"<div",
|
||||||
"<p>",
|
"<p",
|
||||||
"<br>",
|
"<br",
|
||||||
"<li>",
|
"<li",
|
||||||
"<h1>",
|
"<h1",
|
||||||
"<h2>",
|
"<h2",
|
||||||
"<h3>",
|
"<h3",
|
||||||
"<h4>",
|
"<h4",
|
||||||
"<h5>",
|
"<h5",
|
||||||
"<h6>",
|
"<h6",
|
||||||
"<span>",
|
"<span",
|
||||||
"<table>",
|
"<table",
|
||||||
"<tr>",
|
"<tr",
|
||||||
"<td>",
|
"<td",
|
||||||
"<th>",
|
"<th",
|
||||||
"<ul>",
|
"<ul",
|
||||||
"<ol>",
|
"<ol",
|
||||||
"<header>",
|
"<header",
|
||||||
"<footer>",
|
"<footer",
|
||||||
"<nav>",
|
"<nav",
|
||||||
# Head
|
# Head
|
||||||
"<head>",
|
"<head",
|
||||||
"<style>",
|
"<style",
|
||||||
"<script>",
|
"<script",
|
||||||
"<meta>",
|
"<meta",
|
||||||
"<title>",
|
"<title",
|
||||||
"",
|
"",
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
|
@ -576,3 +576,39 @@ This is a code block
|
|||||||
"block",
|
"block",
|
||||||
"```",
|
"```",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_html_code_splitter() -> None:
|
||||||
|
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||||
|
Language.HTML, chunk_size=60, chunk_overlap=0
|
||||||
|
)
|
||||||
|
code = """
|
||||||
|
<h1>Sample Document</h1>
|
||||||
|
<h2>Section</h2>
|
||||||
|
<p id="1234">Reference content.</p>
|
||||||
|
|
||||||
|
<h2>Lists</h2>
|
||||||
|
<ul>
|
||||||
|
<li>Item 1</li>
|
||||||
|
<li>Item 2</li>
|
||||||
|
<li>Item 3</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h3>A block</h3>
|
||||||
|
<div class="amazing">
|
||||||
|
<p>Some text</p>
|
||||||
|
<p>Some more text</p>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
chunks = splitter.split_text(code)
|
||||||
|
assert chunks == [
|
||||||
|
"<h1>Sample Document</h1>\n <h2>Section</h2>",
|
||||||
|
'<p id="1234">Reference content.</p>',
|
||||||
|
"<h2>Lists</h2>\n <ul>",
|
||||||
|
"<li>Item 1</li>\n <li>Item 2</li>",
|
||||||
|
"<li>Item 3</li>\n </ul>",
|
||||||
|
"<h3>A block</h3>",
|
||||||
|
'<div class="amazing">',
|
||||||
|
"<p>Some text</p>",
|
||||||
|
"<p>Some more text</p>\n </div>",
|
||||||
|
]
|
||||||
|
Loading…
Reference in New Issue
Block a user