From 683da2c9e9f5fbedce4a42b0e90a24316af8bde8 Mon Sep 17 00:00:00 2001
From: Sumin Shin <89023026+suminnnnn@users.noreply.github.com>
Date: Sun, 11 May 2025 04:42:03 +0900
Subject: [PATCH] text-splitters: Fix regex separator merge bug in CharacterTextSplitter (#31137)

**Description:**
Fix the merge logic in `CharacterTextSplitter.split_text` so that when
using a regex lookahead separator (`is_separator_regex=True`) with
`keep_separator=False`, the raw pattern is not re-inserted between
chunks.

**Issue:**
Fixes #31136

**Dependencies:**
None

**Twitter handle:**
None

Since this is my first open-source PR, please feel free to point out any
mistakes, and I'll be eager to make corrections.
---
 .../langchain_text_splitters/character.py     | 28 ++++++++---
 .../tests/unit_tests/test_text_splitters.py   | 48 +++++++++++++++++++
 2 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/libs/text-splitters/langchain_text_splitters/character.py b/libs/text-splitters/langchain_text_splitters/character.py
index a2918bd27f0..f3c25b89e73 100644
--- a/libs/text-splitters/langchain_text_splitters/character.py
+++ b/libs/text-splitters/langchain_text_splitters/character.py
@@ -18,14 +18,30 @@ class CharacterTextSplitter(TextSplitter):
         self._is_separator_regex = is_separator_regex
 
     def split_text(self, text: str) -> List[str]:
-        """Split incoming text and return chunks."""
-        # First we naively split the large input into a bunch of smaller ones.
-        separator = (
+        """Split into chunks without re-inserting lookaround separators."""
+        # 1. Determine split pattern: raw regex or escaped literal
+        sep_pattern = (
             self._separator if self._is_separator_regex else re.escape(self._separator)
         )
-        splits = _split_text_with_regex(text, separator, self._keep_separator)
-        _separator = "" if self._keep_separator else self._separator
-        return self._merge_splits(splits, _separator)
+
+        # 2. Initial split (keep separator if requested)
+        splits = _split_text_with_regex(text, sep_pattern, self._keep_separator)
+
+        # 3. Detect zero-width lookaround so we never re-insert it
+        lookaround_prefixes = ("(?=", "(? None:
     ]
     assert documents == expected
+
+
+def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
+    """Test that regex lookahead separator is not re-inserted when merging."""
+    text = "SCE191 First chunk. SCE103 Second chunk."
+    splitter = CharacterTextSplitter(
+        separator=r"(?=SCE\d{3})",
+        is_separator_regex=True,
+        chunk_size=200,
+        chunk_overlap=0,
+        keep_separator=False,
+    )
+    output = splitter.split_text(text)
+    assert output == ["SCE191 First chunk. SCE103 Second chunk."]
+
+
+@pytest.mark.parametrize(
+    "separator,is_regex,text,chunk_size,expected",
+    [
+        # 1) regex lookaround & split happens
+        # "abcmiddef" split by "(?<=mid)" → ["abcmid","def"], chunk_size=5 keeps both
+        (r"(?<=mid)", True, "abcmiddef", 5, ["abcmid", "def"]),
+        # 2) regex lookaround & no split
+        # chunk_size=100 merges back into ["abcmiddef"]
+        (r"(?<=mid)", True, "abcmiddef", 100, ["abcmiddef"]),
+        # 3) literal separator & split happens
+        # split on "mid" → ["abc","def"], chunk_size=3 keeps both
+        ("mid", False, "abcmiddef", 3, ["abc", "def"]),
+        # 4) literal separator & no split
+        # chunk_size=100 merges back into ["abcmiddef"]
+        ("mid", False, "abcmiddef", 100, ["abcmiddef"]),
+    ],
+)
+def test_character_text_splitter_chunk_size_effect(
+    separator: str,
+    is_regex: bool,
+    text: str,
+    chunk_size: int,
+    expected: List[str],
+) -> None:
+    splitter = CharacterTextSplitter(
+        separator=separator,
+        is_separator_regex=is_regex,
+        chunk_size=chunk_size,
+        chunk_overlap=0,
+        keep_separator=False,
+    )
+    assert splitter.split_text(text) == expected