fix(text-splitters): prevent JSFrameworkTextSplitter from mutating self._separators on each split_text() call (#35316)

This commit is contained in:
Maxime Grenu
2026-02-18 23:51:42 +01:00
committed by GitHub
parent 8323f556d9
commit 8951c01fe8
2 changed files with 34 additions and 2 deletions

View File

@@ -92,11 +92,15 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
"\ndefault ",
" default ",
]
# Build the effective separator list for this call only.
# Do NOT assign back to self._separators: doing so would permanently
# append js_separators + component_separators on every invocation,
# causing the list to grow unboundedly when split_text() is called
# multiple times on the same instance.
separators = (
self._separators
+ js_separators
+ component_separators
+ ["<>", "\n\n", "&&\n", "||\n"]
)
self._separators = separators
return super().split_text(text)
return self._split_text(text, separators)

View File

@@ -624,6 +624,34 @@ def test_svelte_text_splitter() -> None:
assert [s.strip() for s in splits] == [s.strip() for s in expected_splits]
def test_jsx_splitter_separator_not_mutated_across_calls() -> None:
    """Regression test: repeated split_text() calls must not mutate separators.

    Calling split_text() multiple times on the same JSFrameworkTextSplitter
    instance must leave the instance's separator list unchanged between calls.

    Before the fix, self._separators was overwritten with the full expanded list
    on every invocation, so a second call would start with the already-expanded
    list and append even more separators.
    """
    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)

    # Snapshot a *copy* of the separators right after construction. Comparing
    # the full contents (rather than only the length) also catches mutations
    # that replace or reorder entries without changing how many there are.
    initial_separators = list(splitter._separators)

    # Call split_text twice; the results should be identical for identical input.
    splits_first = splitter.split_text(FAKE_JSX_TEXT)
    splits_second = splitter.split_text(FAKE_JSX_TEXT)

    assert splits_first == splits_second, (
        "split_text() must return identical results on repeated calls with the "
        "same input"
    )
    assert splitter._separators == initial_separators, (
        "split_text() must not mutate self._separators between calls"
    )
CHUNK_SIZE = 16