mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 17:08:47 +00:00
Fix inconsistent behavior of CharacterTextSplitter
when changing keep_separator
(#7263)
- Description: When `keep_separator` is `True`, the `_split_text_with_regex()` helper in `text_splitter` splits with a regular expression, but when `keep_separator` is `False` it uses `str.split()`. The two modes therefore interpret the same separator differently, which causes problems when the separator contains a regex special character such as `.` or `*`. This PR fixes that by using `re.split()` in both cases.
- Issue: #7262
- Tag maintainer: @baskaryan
This commit is contained in:
parent
b151d4257a
commit
0c7a5cb206
@ -47,7 +47,7 @@ def _split_text_with_regex(
|
|||||||
splits += _splits[-1:]
|
splits += _splits[-1:]
|
||||||
splits = [_splits[0]] + splits
|
splits = [_splits[0]] + splits
|
||||||
else:
|
else:
|
||||||
splits = text.split(separator)
|
splits = re.split(separator, text)
|
||||||
else:
|
else:
|
||||||
splits = list(text)
|
splits = list(text)
|
||||||
return [s for s in splits if s != ""]
|
return [s for s in splits if s != ""]
|
||||||
|
@ -80,6 +80,31 @@ def test_character_text_splitter_longer_words() -> None:
|
|||||||
assert output == expected_output
|
assert output == expected_output
|
||||||
|
|
||||||
|
|
||||||
|
def test_character_text_splitter_keep_separator_regex() -> None:
    """Split on an escaped-dot regex separator while keeping the separator.

    With ``keep_separator=True`` every piece after the first is expected to
    start with the ``.`` that preceded it in the input.
    """
    dotted = "foo.bar.baz.123"
    pieces_with_separator = ["foo", ".bar", ".baz", ".123"]

    keeping_splitter = CharacterTextSplitter(
        separator=r"\.",
        chunk_size=1,
        chunk_overlap=0,
        keep_separator=True,
    )

    assert keeping_splitter.split_text(dotted) == pieces_with_separator
|
||||||
|
|
||||||
|
|
||||||
|
def test_character_text_splitter_discard_separator_regex() -> None:
    """Split on an escaped-dot regex separator while dropping the separator.

    With ``keep_separator=False`` none of the resulting pieces is expected to
    contain the ``.`` that separated them in the input.
    """
    dotted = "foo.bar.baz.123"
    pieces_without_separator = ["foo", "bar", "baz", "123"]

    discarding_splitter = CharacterTextSplitter(
        separator=r"\.",
        chunk_size=1,
        chunk_overlap=0,
        keep_separator=False,
    )

    assert discarding_splitter.split_text(dotted) == pieces_without_separator
|
||||||
|
|
||||||
|
|
||||||
def test_character_text_splitting_args() -> None:
|
def test_character_text_splitting_args() -> None:
|
||||||
"""Test invalid arguments."""
|
"""Test invalid arguments."""
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
|
Loading…
Reference in New Issue
Block a user