mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-24 15:43:54 +00:00
text-splitters: Fix regex separator merge bug in CharacterTextSplitter (#31137)
**Description:** Fix the merge logic in `CharacterTextSplitter.split_text` so that when using a regex lookahead separator (`is_separator_regex=True`) with `keep_separator=False`, the raw pattern is not re-inserted between chunks. **Issue:** Fixes #31136 **Dependencies:** None **Twitter handle:** None Since this is my first open-source PR, please feel free to point out any mistakes, and I'll be eager to make corrections.
This commit is contained in:
parent
0ef4ac75b7
commit
683da2c9e9
@ -18,14 +18,30 @@ class CharacterTextSplitter(TextSplitter):
|
||||
self._is_separator_regex = is_separator_regex
|
||||
|
||||
def split_text(self, text: str) -> List[str]:
    """Split ``text`` on the configured separator and merge the pieces into chunks.

    When the separator is a regex that begins with a lookaround group
    (``(?=``, ``(?!``, ``(?<=`` or ``(?<!``), or when ``keep_separator`` is
    truthy, an empty string is used as the merge separator so the raw
    pattern is never re-inserted between merged pieces.

    Args:
        text: The string to split.

    Returns:
        The chunks returned by ``self._merge_splits``.
    """
    # Use the separator verbatim when it is already a regex; otherwise
    # escape it so that it is matched as a literal string.
    if self._is_separator_regex:
        sep_pattern = self._separator
    else:
        sep_pattern = re.escape(self._separator)

    # Initial split; the separator is kept in the pieces only if requested.
    splits = _split_text_with_regex(text, sep_pattern, self._keep_separator)

    # ``str.startswith`` accepts a tuple of prefixes, so a single call
    # covers every lookaround opener.
    lookaround_prefixes = ("(?=", "(?<!", "(?<=", "(?!")
    is_lookaround = self._is_separator_regex and self._separator.startswith(
        lookaround_prefixes
    )

    # Only re-insert the separator when it was dropped by the split and is
    # not a lookaround pattern.
    # NOTE(review): for any other regex separator the value passed on is the
    # raw pattern text, not the text that was matched - confirm this is the
    # intended behaviour.
    if self._keep_separator or is_lookaround:
        merge_sep = ""
    else:
        merge_sep = self._separator

    return self._merge_splits(splits, merge_sep)
|
||||
|
||||
|
||||
def _split_text_with_regex(
|
||||
|
@ -3373,3 +3373,51 @@ def test_html_splitter_with_media_preservation() -> None:
|
||||
]
|
||||
|
||||
assert documents == expected
|
||||
|
||||
|
||||
def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
    """Verify a regex lookahead separator is not re-inserted when chunks merge."""
    source = "SCE191 First chunk. SCE103 Second chunk."

    # chunk_size is larger than the whole input, so the result is expected to
    # be the original string with no raw pattern text between the pieces.
    chunks = CharacterTextSplitter(
        separator=r"(?=SCE\d{3})",
        is_separator_regex=True,
        chunk_size=200,
        chunk_overlap=0,
        keep_separator=False,
    ).split_text(source)

    assert chunks == [source]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "separator,is_regex,text,chunk_size,expected",
    [
        # Lookbehind regex with chunk_size=5: the pieces "abcmid" and "def"
        # are expected to stay separate.
        (r"(?<=mid)", True, "abcmiddef", 5, ["abcmid", "def"]),
        # Lookbehind regex with chunk_size=100: the pieces are expected to
        # merge back into the original string.
        (r"(?<=mid)", True, "abcmiddef", 100, ["abcmiddef"]),
        # Literal "mid" with chunk_size=3: "abc" and "def" are expected to
        # stay separate.
        ("mid", False, "abcmiddef", 3, ["abc", "def"]),
        # Literal "mid" with chunk_size=100: the merged chunk is expected to
        # equal the original string.
        ("mid", False, "abcmiddef", 100, ["abcmiddef"]),
    ],
)
def test_character_text_splitter_chunk_size_effect(
    separator: str,
    is_regex: bool,
    text: str,
    chunk_size: int,
    expected: List[str],
) -> None:
    """Check split/merge output for regex and literal separators at several chunk sizes."""
    options = {
        "separator": separator,
        "is_separator_regex": is_regex,
        "chunk_size": chunk_size,
        "chunk_overlap": 0,
        "keep_separator": False,
    }
    actual = CharacterTextSplitter(**options).split_text(text)
    assert actual == expected
|
||||
|
Loading…
Reference in New Issue
Block a user