mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 13:23:35 +00:00
text-splitters: Fix/recursive json splitter data persistence issue (#21529)
Thank you for contributing to LangChain!

**Description:** I noticed that calling `RecursiveJsonSplitter().split_json()` multiple times gave weird results. The cause is the `chunks` list in the `_json_split` method: if `chunks` is not provided when `_json_split` is called (which is the case when `split_json` calls `_json_split`), then the same list is reused for subsequent calls to `_json_split`. You can see this in the test case I also added to this commit.

Output should be:

```
[{'a': 1, 'b': 2}]
[{'c': 3, 'd': 4}]
```

Instead you get:

```
[{'a': 1, 'b': 2}]
[{'a': 1, 'b': 2, 'c': 3, 'd': 4}]
```

---------

Co-authored-by: Nuno Campos <nuno@langchain.dev>
Co-authored-by: isaac hershenson <ihershenson@hmc.edu>
Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
This commit is contained in:
parent
9ab7a6df39
commit
3d54784e6d
@ -55,7 +55,7 @@ class RecursiveJsonSplitter:
|
|||||||
Split json into maximum size dictionaries while preserving structure.
|
Split json into maximum size dictionaries while preserving structure.
|
||||||
"""
|
"""
|
||||||
current_path = current_path or []
|
current_path = current_path or []
|
||||||
chunks = chunks or [{}]
|
chunks = chunks if chunks is not None else [{}]
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
new_path = current_path + [key]
|
new_path = current_path + [key]
|
||||||
|
@ -1953,3 +1953,24 @@ def test_split_json_with_lists() -> None:
|
|||||||
texts_list = splitter.split_text(json_data=test_data_list, convert_lists=True)
|
texts_list = splitter.split_text(json_data=test_data_list, convert_lists=True)
|
||||||
|
|
||||||
assert len(texts_list) >= len(texts)
|
assert len(texts_list) >= len(texts)
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_json_many_calls() -> None:
|
||||||
|
x = {"a": 1, "b": 2}
|
||||||
|
y = {"c": 3, "d": 4}
|
||||||
|
|
||||||
|
splitter = RecursiveJsonSplitter()
|
||||||
|
chunk0 = splitter.split_json(x)
|
||||||
|
assert chunk0 == [{"a": 1, "b": 2}]
|
||||||
|
|
||||||
|
chunk1 = splitter.split_json(y)
|
||||||
|
assert chunk1 == [{"c": 3, "d": 4}]
|
||||||
|
|
||||||
|
# chunk0 is now altered by creating chunk1
|
||||||
|
assert chunk0 == [{"a": 1, "b": 2}]
|
||||||
|
|
||||||
|
chunk0_output = [{"a": 1, "b": 2}]
|
||||||
|
chunk1_output = [{"c": 3, "d": 4}]
|
||||||
|
|
||||||
|
assert chunk0 == chunk0_output
|
||||||
|
assert chunk1 == chunk1_output
|
||||||
|
Loading…
Reference in New Issue
Block a user