mirror of
https://github.com/hwchase17/langchain.git
synced 2026-02-21 06:33:41 +00:00
fix(text-splitters): prevent JSFrameworkTextSplitter from mutating self._separators on each split_text() call (#35316)
This commit is contained in:
@@ -92,11 +92,15 @@ class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
|
||||
"\ndefault ",
|
||||
" default ",
|
||||
]
|
||||
# Build the effective separator list for this call only.
|
||||
# Do NOT assign back to self._separators: doing so would permanently
|
||||
# append js_separators + component_separators on every invocation,
|
||||
# causing the list to grow unboundedly when split_text() is called
|
||||
# multiple times on the same instance.
|
||||
separators = (
|
||||
self._separators
|
||||
+ js_separators
|
||||
+ component_separators
|
||||
+ ["<>", "\n\n", "&&\n", "||\n"]
|
||||
)
|
||||
self._separators = separators
|
||||
return super().split_text(text)
|
||||
return self._split_text(text, separators)
|
||||
|
||||
@@ -624,6 +624,34 @@ def test_svelte_text_splitter() -> None:
|
||||
assert [s.strip() for s in splits] == [s.strip() for s in expected_splits]
|
||||
|
||||
|
||||
def test_jsx_splitter_separator_not_mutated_across_calls() -> None:
    """Regression test: repeated split_text() calls must not mutate separators.

    Calling split_text() multiple times on the same JSFrameworkTextSplitter
    instance must leave the instance's separator list unchanged and must
    return identical output for identical input.

    Before the fix, self._separators was overwritten with the full expanded list
    on every invocation, so a second call would start with the already-expanded
    list and append even more separators.
    """
    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)

    # Snapshot a copy of the separators as configured at construction time.
    # Comparing contents (not just the length) also catches a mutation that
    # replaces or reorders entries while leaving the count unchanged.
    initial_separators = list(splitter._separators)

    # Call split_text twice; the results should be identical for identical input
    splits_first = splitter.split_text(FAKE_JSX_TEXT)
    splits_second = splitter.split_text(FAKE_JSX_TEXT)

    assert splits_first == splits_second, (
        "split_text() must return identical results on repeated calls with the "
        "same input"
    )
    assert splitter._separators == initial_separators, (
        "split_text() must not mutate self._separators between calls"
    )
|
||||
|
||||
|
||||
CHUNK_SIZE = 16
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user