mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-02 19:47:13 +00:00
text-splitters[patch]: Extend TextSplitter:keep_separator functionality (#21130)
**Description:** Added extra functionality to the `CharacterTextSplitter` and `TextSplitter` classes. The user can now choose whether the separator is appended to the end of the previous chunk with `keep_separator='end'` or prepended to the start of the next chunk with `keep_separator='start'`. The previous functionality prepended the separator to the next chunk by default. **Issue:** Fixes #20908 --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
b859765752
commit
c3bcfad66d
@@ -112,6 +112,50 @@ def test_character_text_splitter_keep_separator_regex(
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "separator, is_separator_regex", [(re.escape("."), True), (".", False)]
)
def test_character_text_splitter_keep_separator_regex_start(
    separator: str, is_separator_regex: bool
) -> None:
    """Check that a kept separator is attached to the start of each chunk.

    The separator is ".", a regex special character. It is supplied either
    as an escaped regex pattern or as a plain literal, and both forms must
    yield the same chunks.
    """
    # Every chunk after the first is expected to begin with the separator.
    expected_chunks = ["foo", ".bar", ".baz", ".123"]

    chunks = CharacterTextSplitter(
        separator=separator,
        chunk_size=1,
        chunk_overlap=0,
        keep_separator="start",
        is_separator_regex=is_separator_regex,
    ).split_text("foo.bar.baz.123")

    assert chunks == expected_chunks
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "separator, is_separator_regex", [(re.escape("."), True), (".", False)]
)
def test_character_text_splitter_keep_separator_regex_end(
    separator: str, is_separator_regex: bool
) -> None:
    """Check that a kept separator is attached to the end of each chunk.

    The separator is ".", a regex special character. It is supplied either
    as an escaped regex pattern or as a plain literal, and both forms must
    yield the same chunks.
    """
    # Every chunk except the last is expected to end with the separator.
    expected_chunks = ["foo.", "bar.", "baz.", "123"]

    chunks = CharacterTextSplitter(
        separator=separator,
        chunk_size=1,
        chunk_overlap=0,
        keep_separator="end",
        is_separator_regex=is_separator_regex,
    ).split_text("foo.bar.baz.123")

    assert chunks == expected_chunks
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"separator, is_separator_regex", [(re.escape("."), True), (".", False)]
|
||||
)
|
||||
|
Reference in New Issue
Block a user