From 3b9750f0a4640fd54e7dd2714c997b62ef8df33c Mon Sep 17 00:00:00 2001 From: Dayna Blackwell Date: Mon, 27 Apr 2026 10:48:19 -0700 Subject: [PATCH] fix(text-splitters): remove incorrect C# and Elixir separator keywords (#37037) ## Summary Removes two incorrect separators from `get_separators_for_language()` in `RecursiveCharacterTextSplitter`: - **C#**: `"\nimplements "` targets `implements`, which is a Java keyword, not a C# one. C# uses `:` for interface implementation. This separator never matches valid C# source code. - **Elixir**: `"\nwhile "` targets a `while` loop, which Elixir does not have. The language uses recursion and `Enum.reduce_while/3` instead of while loops. Both are dead separators that silently degrade chunking quality by occupying positions in the separator priority list without contributing useful split points. ## Tests Added two targeted tests: - `test_csharp_separators_no_java_keywords`: verifies `"\nimplements "` is not in the C# separator list - `test_elixir_separators_no_while`: verifies `"\nwhile "` is not in the Elixir separator list Existing `test_csharp_code_splitter` continues to pass (no change to expected output since `implements` never matched valid C# code). Full suite: 129 passed, 0 failed.
Fixes #37030 --- .../langchain_text_splitters/character.py | 2 -- .../tests/unit_tests/test_text_splitters.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/character.py b/libs/text-splitters/langchain_text_splitters/character.py index 469afdb9132..dd2378a7876 100644 --- a/libs/text-splitters/langchain_text_splitters/character.py +++ b/libs/text-splitters/langchain_text_splitters/character.py @@ -440,7 +440,6 @@ class RecursiveCharacterTextSplitter(TextSplitter): # Split along control flow statements "\nif ", "\nunless ", - "\nwhile ", "\ncase ", "\ncond ", "\nwith ", @@ -593,7 +592,6 @@ class RecursiveCharacterTextSplitter(TextSplitter): return [ "\ninterface ", "\nenum ", - "\nimplements ", "\ndelegate ", "\nevent ", # Split along class definitions diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index c7d7e2c480d..cae9f16d103 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -1011,6 +1011,23 @@ class Program ] +def test_csharp_separators_no_java_keywords() -> None: + """C# separators should not contain Java-only keywords.""" + splitter = RecursiveCharacterTextSplitter.from_language( + Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0 + ) + # "implements" is a Java keyword; C# uses ":" for interface implementation + assert "\nimplements " not in splitter._separators + + +def test_elixir_separators_no_while() -> None: + """Elixir has no while loop; the separator should not be present.""" + splitter = RecursiveCharacterTextSplitter.from_language( + Language.ELIXIR, chunk_size=CHUNK_SIZE, chunk_overlap=0 + ) + assert "\nwhile " not in splitter._separators + + def test_cpp_code_splitter() -> None: splitter = RecursiveCharacterTextSplitter.from_language( Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0