fix markdown text splitter horizontal lines (#5625)

Fixes #5614 

#### Issue

The `***` combination produces an exception when used as a seperator in
`re.split`. Instead `\*\*\*` should be used for regex exprations.

#### Who can review?

@eyurtsev
This commit is contained in:
Ilya
2023-06-06 02:40:26 +03:00
committed by GitHub
parent 25487fa5ee
commit d5b1608216
2 changed files with 83 additions and 18 deletions

View File

@@ -275,6 +275,12 @@ Lists
- Item 1
- Item 2
- Item 3
Comment
*******
Not a comment
.. This is a comment
"""
chunks = splitter.split_text(code)
assert chunks == [
@@ -285,10 +291,16 @@ Lists
"This is the",
"content of the",
"section.",
"Lists\n-----",
"Lists",
"-----",
"- Item 1",
"- Item 2",
"- Item 3",
"Comment",
"*******",
"Not a comment",
".. This is a",
"comment",
]
@@ -509,3 +521,58 @@ fn main() {
"""
chunks = splitter.split_text(code)
assert chunks == ["fn main() {", 'println!("Hello', ",", 'World!");', "}"]
def test_markdown_code_splitter() -> None:
splitter = RecursiveCharacterTextSplitter.from_language(
Language.MARKDOWN, chunk_size=CHUNK_SIZE, chunk_overlap=0
)
code = """
# Sample Document
## Section
This is the content of the section.
## Lists
- Item 1
- Item 2
- Item 3
### Horizontal lines
***********
____________
-------------------
#### Code blocks
```
This is a code block
```
"""
chunks = splitter.split_text(code)
assert chunks == [
"# Sample",
"Document",
"## Section",
"This is the",
"content of the",
"section.",
"## Lists",
"- Item 1",
"- Item 2",
"- Item 3",
"### Horizontal",
"lines",
"***********",
"____________",
"---------------",
"----",
"#### Code",
"blocks",
"```",
"This is a code",
"block",
"```",
]